From 1913a5b1053c66cee3be51322fc9f354d00f93cd Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 11 Dec 2019 15:04:52 +0800 Subject: [PATCH 001/578] create v2.0.0-release branch --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 479682c5..9b7f49d3 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.0.0" +#define TBASE_VERSION_STR "TBase_V2.0.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From ea325d947fcd5a4037fc2d41d45ffc5d95585599 Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 22 Mar 2021 15:37:37 +0800 Subject: [PATCH 002/578] Revoke pgxc_node public accessed priviledge when init. --- src/bin/initdb/initdb.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index e7186d78..1e644e09 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -275,6 +275,7 @@ static void test_config_settings(void); static void setup_config(void); static void bootstrap_template1(void); static void setup_auth(FILE *cmdfd); +static void setup_pgxc_node(FILE *cmdfd); static void get_su_pwd(void); static void setup_depend(FILE *cmdfd); static void setup_sysviews(FILE *cmdfd); @@ -1518,6 +1519,30 @@ setup_auth(FILE *cmdfd) username, escape_quotes(superuser_password)); } +/* + * set up the pgxc_node table + */ +static void +setup_pgxc_node(FILE *cmdfd) +{ + const char *const *line; + static const char *const pgxc_node_setup[] = { + /* + * Grant all priviledge except node_host and node_port + */ + "REVOKE ALL on pgxc_node FROM public;\n\n", + + "GRANT ALL (xmin_gts, xmax_gts, shardid, xc_node_id , " + " tableoid, cmax, xmax, cmin, xmin, oid, ctid, node_name," + " node_type, nodeis_primary, nodeis_preferred, node_id, " + " node_cluster_name) ON pgxc_node TO public;\n\n", + NULL + }; + + for (line = pgxc_node_setup; *line != NULL; line++) + PG_CMD_PUTS(*line); +} + /* * get the superuser password if required */ @@ -3122,6 +3147,8 @@ initialize_data_directory(void) setup_auth(cmdfd); + setup_pgxc_node(cmdfd); + setup_depend(cmdfd); /* From f059816cf80be8f8879368430fb125856c64dec9 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 23 Jun 2020 22:54:19 +0800 Subject: [PATCH 003/578] fix regress tests for join and some others --- src/test/regress/expected/join_3.out | 509 +++++++++++--------- src/test/regress/expected/mls_check.out | 6 +- src/test/regress/expected/rowsecurity_1.out | 4 +- src/test/regress/expected/tsrf_1.out | 4 +- src/test/regress/sql/join.sql | 20 + src/test/regress/sql/mls_check.sql | 2 +- src/test/regress/sql/rowsecurity.sql | 2 +- src/test/regress/sql/tsrf.sql | 4 +- 8 files changed, 309 insertions(+), 242 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 040c4e20..8e9360aa 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2223,6 +2223,7 @@ select aa, bb, unique1, unique1 -- -- regression test: check handling of empty-FROM subquery underneath outer join -- +set enable_nestloop to off; explain (costs off) select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 @@ -2256,6 +2257,7 @@ order by 1, 2; 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 123 (5 rows) +reset enable_nestloop; -- -- regression 
test: check a case where join_clause_is_movable_into() gives -- an imprecise result, causing an assertion failure @@ -3336,8 +3338,8 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 @@ -3348,23 +3350,25 @@ select b.unique1 from -> Materialize -> Remote Subquery Scan on all Distribute results by H: tenthous - -> Nested Loop - Join Filter: (b.thousand = i1.f1) - -> Nested Loop Left Join - Join Filter: (b.unique1 = 42) + -> Nested Loop Left Join + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: unique1 -> Nested Loop -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Scan using tenk1_thous_tenthous on tenk1 b - Index Cond: (i2.f1 = tenthous) + -> Nested Loop + -> Seq Scan on int4_tbl i1 + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: ((thousand = i1.f1) AND (i2.f1 = tenthous)) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) - -> Materialize - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) - -> Seq Scan on int4_tbl i1 -(26 rows) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + Index Cond: (thousand = a.thousand) +(28 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3573,6 +3577,9 @@ using (join_key); -- -- test successful handling of nested outer joins with degenerate join quals -- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (verbose, costs off) select t1.* from text_tbl t1 @@ -3584,13 +3591,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3598,23 +3605,23 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Left Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Hash - Output: i8b2.q1, (NULL::integer) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, NULL::integer - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer + -> 
Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3647,13 +3654,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3661,27 +3668,27 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Right Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8b2.q1 = i8.q1) - -> Nested Loop - Output: i8b2.q1, NULL::integer - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize - -> Seq Scan on public.int4_tbl i4b2 - -> Hash - Output: i8.q1, i8.q2 + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 + -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3715,13 +3722,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3729,30 +3736,30 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Right Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8b2.q1 = i8.q1) - -> Hash Join - Output: i8b2.q1, NULL::integer - Hash Cond: (i8b2.q1 = i4b2.f1) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Hash - Output: i4b2.f1 - -> Seq Scan on public.int4_tbl i4b2 - Output: i4b2.f1 - -> Hash - Output: i8.q1, i8.q2 + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 + Output: 
i4b2.f1 + -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3825,6 +3832,9 @@ select * from doh! | 123 | 456 | doh! | (2 rows) +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; -- -- test for appropriate join order in the presence of lateral references -- @@ -3835,8 +3845,8 @@ select * from on i8.q2 = 123, lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss where t1.f1 = ss.f1; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 Join Filter: (t1.f1 = t2.f1) @@ -3851,15 +3861,17 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit + -> Materialize Output: (i8.q1), t2.f1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i8.q1, t2.f1 - -> Limit - Output: (i8.q1), t2.f1 - -> Seq Scan on public.text_tbl t2 - Output: i8.q1, t2.f1 -(22 rows) + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 +(24 rows) select * from text_tbl t1 @@ -3880,8 +3892,8 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) @@ -3898,23 +3910,27 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit + -> Materialize Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i8.q1, t2.f1 + Output: (i8.q1), t2.f1 -> Limit - Output: (i8.q1), t2.f1 - -> Seq Scan on public.text_tbl t2 - Output: i8.q1, t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 - Output: (i8.q1), t2.f1 -(32 rows) + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(36 rows) select * from text_tbl t1 @@ -3965,18 +3981,20 @@ where tt1.f1 = ss1.c0; -> Seq Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Subquery Scan on ss1 + -> Materialize Output: ss1.c0 - Filter: (ss1.c0 = 'foo'::text) - -> Limit - Output: (tt4.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: tt4.f1 - -> Limit - Output: (tt4.f1) - -> Seq Scan on public.text_tbl tt5 - Output: tt4.f1 -(38 rows) + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: tt4.f1 + -> Limit + Output: (tt4.f1) + -> Seq Scan on public.text_tbl tt5 + Output: tt4.f1 +(40 rows) select 1 from text_tbl as tt1 @@ -4002,8 +4020,8 @@ select ss2.* from 
on i41.f1 = ss1.c1, lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 where ss1.c2 = 0; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------ Nested Loop Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1) @@ -4027,15 +4045,17 @@ where ss1.c2 = 0; Output: i42.f1 -> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Limit + -> Materialize Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) - -> Limit - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) - -> Seq Scan on public.text_tbl - Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) -(31 rows) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Seq Scan on public.text_tbl + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) +(33 rows) select ss2.* from int4_tbl i41 @@ -4059,18 +4079,19 @@ select * from left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) on (xx.id = coalesce(yy.id)); - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Hash Full Join - Hash Cond: (a1.unique1 = (1)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 a1 - -> Hash - -> Result -(9 rows) + -> Materialize + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a1 + -> Hash + -> Result +(10 rows) select * from (select 1 as id) as xx @@ -4699,8 +4720,8 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate @@ -4710,7 +4731,7 @@ explain (num_nodes off, nodes off, costs off) -> Hash -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -4724,8 +4745,8 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) @@ -4735,7 +4756,7 @@ explain (num_nodes off, nodes off, costs off) -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -4746,6 +4767,8 @@ select count(*) from tenk1 a, (1 row) -- lateral injecting a strange outer 
join condition +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, costs off) select * from int8_tbl a, int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) @@ -4758,11 +4781,11 @@ explain (num_nodes off, nodes off, costs off) Sort Key: a.q1, a.q2, x.q1, x.q2, (a.q1) -> Nested Loop -> Seq Scan on int8_tbl a - -> Hash Right Join - Hash Cond: ((a.q1) = x.q2) - -> Seq Scan on int4_tbl y - -> Hash - -> Seq Scan on int8_tbl x + -> Nested Loop Left Join + Join Filter: (x.q2 = (a.q1)) + -> Seq Scan on int8_tbl x + -> Materialize + -> Seq Scan on int4_tbl y (10 rows) select * from int8_tbl a, @@ -4830,27 +4853,29 @@ select * from int8_tbl a, 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 (57 rows) +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference to a join alias variable select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, - lateral (select x) ss2(y); + lateral (select x) ss2(y) order by 1,2,3; x | f1 | y ---+----+--- 0 | 0 | 0 (1 row) select * from (select f1 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, - lateral (values(x)) ss2(y); + lateral (values(x)) ss2(y) order by 1,2,3; x | f1 | y -------------+-------------+------------- + -2147483647 | -2147483647 | -2147483647 + -123456 | -123456 | -123456 0 | 0 | 0 123456 | 123456 | 123456 - -123456 | -123456 | -123456 2147483647 | 2147483647 | 2147483647 - -2147483647 | -2147483647 | -2147483647 (5 rows) select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) j, - lateral (select x) ss2(y); + lateral (select x) ss2(y) order by 1,2,3; x | f1 | y ---+----+--- 0 | 0 | 0 @@ -4858,7 +4883,7 @@ select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) -- lateral references requiring pullup select * from (values(1)) x(lb), - lateral generate_series(lb,4) x4; + lateral generate_series(lb,4) x4 order by 1,2; lb | x4 ----+---- 1 | 1 @@ -4868,38 +4893,38 @@ select * from (values(1)) x(lb), (4 rows) select * from (select f1/1000000000 from int4_tbl) x(lb), - lateral generate_series(lb,4) x4; + lateral generate_series(lb,4) x4 order by 1,2; lb | x4 ----+---- + -2 | -2 + -2 | -1 + -2 | 0 + -2 | 1 + -2 | 2 + -2 | 3 + -2 | 4 0 | 0 - 0 | 1 - 0 | 2 - 0 | 3 - 0 | 4 0 | 0 - 0 | 1 - 0 | 2 - 0 | 3 - 0 | 4 0 | 0 0 | 1 + 0 | 1 + 0 | 1 + 0 | 2 0 | 2 + 0 | 2 + 0 | 3 0 | 3 + 0 | 3 + 0 | 4 + 0 | 4 0 | 4 2 | 2 2 | 3 2 | 4 - -2 | -2 - -2 | -1 - -2 | 0 - -2 | 1 - -2 | 2 - -2 | 3 - -2 | 4 (25 rows) select * from (values(1)) x(lb), - lateral (values(lb)) y(lbcopy); + lateral (values(lb)) y(lbcopy) order by 1,2; lb | lbcopy ----+-------- 1 | 1 @@ -5176,6 +5201,8 @@ select * from int4_tbl i left join 2147483647 | (5 rows) +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl a, lateral ( @@ -5189,12 +5216,12 @@ select * from int4_tbl a, Output: a.f1, b.f1, c.q1, c.q2 -> Seq Scan on public.int4_tbl a Output: a.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: b.f1, c.q1, c.q2 - Hash Cond: (b.f1 = c.q1) + Join Filter: (b.f1 = c.q1) -> Seq Scan on public.int4_tbl b Output: b.f1 - -> Hash + -> Materialize Output: c.q1, c.q2 -> Seq Scan on public.int8_tbl c Output: c.q1, c.q2 @@ -5204,36 +5231,38 @@ select * from int4_tbl a, select * from int4_tbl a, lateral ( select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) - ) ss; + ) ss order by 1,2,3,4; f1 | f1 | q1 | q2 
-------------+-------------+----+---- + -2147483647 | -2147483647 | | + -2147483647 | -123456 | | + -2147483647 | 0 | | + -2147483647 | 123456 | | + -2147483647 | 2147483647 | | + -123456 | -2147483647 | | + -123456 | -123456 | | + -123456 | 0 | | + -123456 | 123456 | | + -123456 | 2147483647 | | + 0 | -2147483647 | | + 0 | -123456 | | 0 | 0 | | 0 | 123456 | | - 0 | -123456 | | 0 | 2147483647 | | - 0 | -2147483647 | | + 123456 | -2147483647 | | + 123456 | -123456 | | 123456 | 0 | | 123456 | 123456 | | - 123456 | -123456 | | 123456 | 2147483647 | | - 123456 | -2147483647 | | - -123456 | 0 | | - -123456 | 123456 | | - -123456 | -123456 | | - -123456 | 2147483647 | | - -123456 | -2147483647 | | + 2147483647 | -2147483647 | | + 2147483647 | -123456 | | 2147483647 | 0 | | 2147483647 | 123456 | | - 2147483647 | -123456 | | 2147483647 | 2147483647 | | - 2147483647 | -2147483647 | | - -2147483647 | 0 | | - -2147483647 | 123456 | | - -2147483647 | -123456 | | - -2147483647 | 2147483647 | | - -2147483647 | -2147483647 | | (25 rows) +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference in a PlaceHolderVar evaluated at join level explain (num_nodes off, nodes off, verbose, costs off) select * from @@ -5476,44 +5505,46 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 -> Remote Subquery Scan on all (datanode_1) Output: t1.q1, t1.q2 -> Seq Scan on public.int8_tbl t1 Output: t1.q1, t1.q2 - -> Nested Loop + -> Materialize Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) + -> Nested Loop + Output: "*VALUES*".column1, ss2.q1, ss2.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize Output: ss2.q1, ss2.q2 - -> Subquery Scan on ss2 + -> Remote Subquery Scan on all (datanode_1) Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) - Output: t3.q2 - -> Result + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) -(35 rows) + -> Result + Output: t3.q2 + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(37 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, @@ -6064,34 +6095,48 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop Output: t1.unique1, t2.hundred -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t3.tenthous + Distribute results by H: tenthous -> Hash Join Output: t1.unique1, t3.tenthous Hash Cond: (t3.thousand = t1.unique1) -> HashAggregate Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous - -> Seq Scan on public.tenk1 t3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Distribute results by H: thousand + -> HashAggregate + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Group Key: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1 t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1 - Sort Key: t1.unique1 - -> Index Only Scan using onek_unique1 on public.onek t1 + -> Bitmap Heap Scan on public.onek t1 Output: t1.unique1 - Index Cond: (t1.unique1 < 1) - -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Recheck Cond: (t1.unique1 < 1) + -> Bitmap Index Scan on onek_unique1 + Index Cond: (t1.unique1 < 1) + -> Materialize Output: t2.hundred - Index Cond: (t2.hundred = t3.tenthous) -(25 rows) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by H: hundred + Sort Key: t2.hundred + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = t3.tenthous) +(39 rows) -- ... 
unless it actually is unique create table j3 as select unique1, tenthous from onek; @@ -6103,8 +6148,8 @@ from onek t1, tenk1 t2 where exists (select 1 from j3 where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6113,15 +6158,17 @@ where exists (select 1 from j3 Output: t1.unique1, j3.tenthous -> Nested Loop Output: t1.unique1, j3.tenthous - -> Index Only Scan using onek_unique1 on public.onek t1 - Output: t1.unique1 - Index Cond: (t1.unique1 < 1) + -> Bitmap Heap Scan on public.onek t1 + Output: t1.unique1, t1.unique2, t1.two, t1.four, t1.ten, t1.twenty, t1.hundred, t1.thousand, t1.twothousand, t1.fivethous, t1.tenthous, t1.odd, t1.even, t1.stringu1, t1.stringu2, t1.string4 + Recheck Cond: (t1.unique1 < 1) + -> Bitmap Index Scan on onek_unique1 + Index Cond: (t1.unique1 < 1) -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 Output: j3.unique1, j3.tenthous Index Cond: (j3.unique1 = t1.unique1) -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = j3.tenthous) -(17 rows) +(19 rows) drop table j3; diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index cd1b996d..0e5b955d 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -4557,14 +4557,14 @@ select * from xixi where i = 11; --case: insert into select from join \c - godlike insert into lala2 select a.i,a.j,b._cls from lala a, lala3 b where a.i = b.i; -select * from lala2; +select * from lala2 order by i; i | j | _cls ----+----+--------- + 11 | 11 | 99:1026 12 | 12 | 99:1026 13 | 13 | 99:1026 - 15 | 15 | 99:1026 - 11 | 11 | 99:1026 14 | 14 | 99:1026 + 15 | 15 | 99:1026 (5 rows) --ROUND1. end diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index eb1a95cb..fb6327f3 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -737,13 +737,13 @@ EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE f_leak(b) FOR SHARE; (9 rows) -- union all query -SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; +SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3 order by oid; a | b | oid ---+-----+----- 1 | abc | 201 + 3 | cde | 203 1 | xxx | 301 2 | yyy | 302 - 3 | cde | 203 3 | zzz | 303 (5 rows) diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out index e312b9dc..1831d4d6 100644 --- a/src/test/regress/expected/tsrf_1.out +++ b/src/test/regress/expected/tsrf_1.out @@ -233,7 +233,7 @@ LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; ^ HINT: You might be able to move the set-returning function into a LATERAL FROM item. 
-- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; id | lag | count | generate_series ----+-----+-------+----------------- 1 | | 3 | 1 @@ -561,7 +561,7 @@ ERROR: set-returning functions are not allowed in LIMIT LINE 1: SELECT 1 LIMIT generate_series(1,3); ^ -- tSRF in correlated subquery, referencing table outside -SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few order by 1; generate_series ----------------- 2 diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index c30b6703..e6c695c9 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -404,6 +404,8 @@ select aa, bb, unique1, unique1 -- -- regression test: check handling of empty-FROM subquery underneath outer join -- +set enable_nestloop to off; + explain (costs off) select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 @@ -413,6 +415,8 @@ select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 order by 1, 2; +reset enable_nestloop; + -- -- regression test: check a case where join_clause_is_movable_into() gives -- an imprecise result, causing an assertion failure @@ -1097,6 +1101,9 @@ using (join_key); -- -- test successful handling of nested outer joins with degenerate join quals -- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (verbose, costs off) select t1.* from @@ -1188,6 +1195,9 @@ select * from left join int4_tbl i4 on i8.q1 = i4.f1; +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; -- -- test for appropriate join order in the presence of lateral references -- @@ -1576,6 +1586,9 @@ select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; -- lateral injecting a strange outer join condition +set enable_hashjoin to off; +set enable_mergejoin to off; + explain (num_nodes off, nodes off, costs off) select * from int8_tbl a, int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) @@ -1586,6 +1599,9 @@ select * from int8_tbl a, on x.q2 = ss.z order by a.q1, a.q2, x.q1, x.q2, ss.z; +reset enable_hashjoin; +reset enable_mergejoin; + -- lateral reference to a join alias variable select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, lateral (select x) ss2(y) order by 1,2,3; @@ -1658,6 +1674,8 @@ select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl a, lateral ( @@ -1667,6 +1685,8 @@ select * from int4_tbl a, lateral ( select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) ) ss order by 1,2,3,4; +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference in a PlaceHolderVar evaluated at join level explain (num_nodes off, nodes off, verbose, costs off) diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 532c7894..208fd38b 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ 
-1732,7 +1732,7 @@ select * from xixi where i = 11; --case: insert into select from join \c - godlike insert into lala2 select a.i,a.j,b._cls from lala a, lala3 b where a.i = b.i; -select * from lala2; +select * from lala2 order by i; --ROUND1. end truncate table xixi; diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index baf951ef..a010dc72 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -299,7 +299,7 @@ SELECT * FROM t1 WHERE f_leak(b) ORDER BY a FOR SHARE; EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE f_leak(b) FOR SHARE; -- union all query -SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; +SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3 order by oid; EXPLAIN (COSTS OFF) SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; -- superuser is allowed to bypass RLS checks diff --git a/src/test/regress/sql/tsrf.sql b/src/test/regress/sql/tsrf.sql index a6e58c5b..65f5ab61 100644 --- a/src/test/regress/sql/tsrf.sql +++ b/src/test/regress/sql/tsrf.sql @@ -71,7 +71,7 @@ SELECT sum((3 = ANY(SELECT lag(x) over(order by x) SELECT min(generate_series(1, 3)) OVER() FROM few; -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; -- unless referencing SRFs SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; @@ -142,7 +142,7 @@ SELECT a, generate_series(1,2) FROM (VALUES(1),(2),(3)) r(a) LIMIT 2 OFFSET 2; SELECT 1 LIMIT generate_series(1,3); -- tSRF in correlated subquery, referencing table outside -SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few order by 1; -- tSRF in correlated subquery, referencing SRF outside SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET g.i) FROM generate_series(0,3) g(i); From 960f9c967042ebf18a760f9b7adb10a576cd201c Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 24 Jun 2020 16:33:32 +0800 Subject: [PATCH 004/578] fix regress test unstable cases --- src/test/regress/expected/join_3.out | 47 ++++++++++++---------------- src/test/regress/expected/tsrf_1.out | 2 +- src/test/regress/sql/join.sql | 6 ++++ src/test/regress/sql/tsrf.sql | 2 +- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 8e9360aa..f45e67d5 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -5139,25 +5139,28 @@ select * from -- lateral can result in join conditions appearing below their -- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------------------- Remote Subquery Scan on all Output: i.f1, j.f1 - -> Hash Right Join + -> Nested Loop Left Join Output: i.f1, j.f1 - Hash Cond: (j.f1 = i.f1) - -> Seq Scan on public.int2_tbl j - Output: j.f1 - -> Hash + Join Filter: (i.f1 = j.f1) + -> Remote Subquery Scan on all Output: i.f1 - -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on 
public.int4_tbl i Output: i.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 + -> Materialize + Output: j.f1 + -> Seq Scan on public.int2_tbl j + Output: j.f1 (14 rows) select * from int4_tbl i left join @@ -5171,25 +5174,15 @@ select * from int4_tbl i left join 2147483647 | (5 rows) +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; - QUERY PLAN -------------------------------------------------- - Remote Subquery Scan on all - Output: i.f1, COALESCE(i.*) - -> Nested Loop Left Join - Output: i.f1, (COALESCE(i.*)) - -> Remote Subquery Scan on all - Output: i.f1, i.* - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1, i.* - -> Seq Scan on public.int2_tbl j - Output: j.f1, COALESCE(i.*) - Filter: (i.f1 = j.f1) -(12 rows) - +ERROR: syntax error at or near "explain" +LINE 2: explain (num_nodes off, nodes off, verbose, costs off) + ^ select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; f1 | coalesce diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out index 1831d4d6..a2bb9fa6 100644 --- a/src/test/regress/expected/tsrf_1.out +++ b/src/test/regress/expected/tsrf_1.out @@ -233,7 +233,7 @@ LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; ^ HINT: You might be able to move the set-returning function into a LATERAL FROM item. -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 2, 4; id | lag | count | generate_series ----+-----+-------+----------------- 1 | | 3 | 1 diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index e6c695c9..dceca27f 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1664,11 +1664,17 @@ select * from -- lateral can result in join conditions appearing below their -- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true order by 1; +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; diff --git a/src/test/regress/sql/tsrf.sql b/src/test/regress/sql/tsrf.sql index 65f5ab61..0833b221 100644 --- a/src/test/regress/sql/tsrf.sql +++ b/src/test/regress/sql/tsrf.sql @@ -71,7 +71,7 @@ SELECT sum((3 = ANY(SELECT lag(x) over(order by x) SELECT min(generate_series(1, 3)) OVER() FROM few; -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 2, 4; -- unless referencing SRFs SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; From cccb332c80cad1a1176f313e2a5caf7be846119b Mon Sep 
17 00:00:00 2001 From: ericxwu Date: Wed, 24 Jun 2020 16:39:47 +0800 Subject: [PATCH 005/578] parallel hashagg support group by column with all types of expression http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131080395083 --- src/backend/executor/nodeAgg.c | 78 ++++++++++--------- .../regress/expected/select_parallel_4.out | 24 ++++++ src/test/regress/sql/select_parallel.sql | 6 ++ 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 3d203081..d0610f2a 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -3394,65 +3394,71 @@ agg_retrieve_direct(AggState *aggstate) */ static void agg_fill_hash_table(AggState *aggstate) -{// #lizard forgives +{ TupleTableSlot *outerslot; ExprContext *tmpcontext = aggstate->tmpcontext; #ifdef __TBASE__ - AttrNumber varattno = 0; - Oid dataType = 0; - aggstate->tmpcxt = NULL; - + AttrNumber varattno = InvalidAttrNumber; + Oid dataType = InvalidOid; + + aggstate->tmpcxt = NULL; + + /* get the redistribution hashfunc for parallel execution */ if (IsParallelWorker() && aggstate->state) { - AttrNumber group_col = 0; - TargetEntry *en = NULL; + AttrNumber group_col = InvalidAttrNumber; + TargetEntry *tle = NULL; if (aggstate->aggstrategy != AGG_HASHED || list_length(aggstate->all_grouped_cols) == 0) { - elog(ERROR, "mismatch plan while ReDistribute-Data."); + elog(ERROR, "plan mismatched while redistributing data across " + "parallel workers."); } - /* get first groupby column in targetlist */ + /* + * all_grouped_cols was sorted by AttributeNum in descending order, get + * first group-by column in targetlist . + * + * TODO: choose column with better distribution to avoid data skew + * within parallel workers + */ group_col = llast_int(aggstate->all_grouped_cols); if (group_col < 1) { - elog(ERROR, "group column AttrNumber is smaller than 1."); + elog(ERROR, "invalid group by AttrNumber %d found while " + "redistributing data across parallel workers.", group_col); } - /* get the groupby column's datatype and AttrNumber of input from outer plan */ - en = (TargetEntry *)lfirst(list_nth_cell(aggstate->ss.ps.plan->lefttree->targetlist, group_col - 1)); - - if (IsA(en->expr, Var)) - { - Var *var = (Var *)en->expr; + /* + * get DataType and AttrNumber of the redistribution group-by column + * from outer plan + */ + tle = (TargetEntry *) lfirst(list_nth_cell( + aggstate->ss.ps.plan->lefttree->targetlist, group_col - 1)); - dataType = var->vartype; - varattno = group_col; + dataType = exprType((Node*) tle->expr); + varattno = group_col; - aggstate->hashfunc = hash_func_ptr(dataType); - aggstate->dataType = dataType; + aggstate->hashfunc = hash_func_ptr(dataType); + aggstate->dataType = dataType; - /* could not find hash function for given data type */ - if (!aggstate->hashfunc) - { - elog(ERROR, "could not find hash function for given data type:%u", dataType); - } - } - else - { - elog(ERROR, "could not get AttrNumber and data type of group column."); - } + /* could not find hash function for given data type */ + if (!aggstate->hashfunc) + { + elog(ERROR, "could not find hash function for given data type:%u", + dataType); + } - /* initialize resources */ - InitializeReDistribute(aggstate->state, &aggstate->file); + /* initialize resources */ + InitializeReDistribute(aggstate->state, &aggstate->file); - aggstate->tmpcxt = AllocSetContextCreate(CurrentMemoryContext, - "ExecAgg temp memoryContext", - ALLOCSET_DEFAULT_SIZES); + aggstate->tmpcxt = 
AllocSetContextCreate(CurrentMemoryContext, + "ExecAgg temp memoryContext", + ALLOCSET_DEFAULT_SIZES); - elog(LOG, "worker:%d redistributed in HashAgg.", ParallelWorkerNumber); + elog(LOG, "worker:%d redistributed in HashAgg.", ParallelWorkerNumber); } #endif diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index fd99f499..3ae6bc47 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -95,6 +95,30 @@ explain (costs off) -> Parallel Seq Scan on tenk1 (10 rows) +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: (count(stringu1)) + -> Finalize HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Parallel Seq Scan on tenk1 +(10 rows) + +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + num | islong +-------+-------- + 10000 | LONG +(1 row) + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. explain (costs off) diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 461abaeb..70d0f0fb 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -34,6 +34,12 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. 
explain (costs off) From a4ed26a29463fe0fc93dd2d3989b8d770913d8ac Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 26 Jun 2020 22:18:05 +0800 Subject: [PATCH 006/578] recalculate nestloop/hash/merge join cost caused by redistribution --- src/backend/optimizer/util/pathnode.c | 80 +++- src/backend/optimizer/util/pgxcship.c | 4 + src/test/regress/expected/equivclass.out | 8 +- src/test/regress/expected/join_3.out | 346 +++++++++--------- src/test/regress/expected/privileges.out | 18 +- src/test/regress/expected/stats_ext_2.out | 18 +- src/test/regress/expected/stats_ext_3.out | 92 +++-- src/test/regress/expected/subselect_1.out | 25 +- src/test/regress/expected/xc_FQS_join_1.out | 91 ++--- src/test/regress/expected/xc_for_update_1.out | 64 ++-- src/test/regress/expected/xc_groupby_1.out | 80 ++-- src/test/regress/expected/xc_having_1.out | 22 +- src/test/regress/expected/xl_join.out | 24 +- 13 files changed, 456 insertions(+), 416 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ee237fa7..586a595c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -5088,6 +5088,19 @@ create_nestloop_path(PlannerInfo *root, alternate = set_joinpath_distribution(root, pathnode); #endif + +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_nestloop() needs to be + * recalculated. + */ + initial_cost_nestloop(root, workspace, jointype, + pathnode->outerjoinpath, + pathnode->innerjoinpath, + extra); +#endif + final_cost_nestloop(root, pathnode, workspace, extra); #ifdef XCP @@ -5097,6 +5110,17 @@ create_nestloop_path(PlannerInfo *root, foreach(lc, alternate) { NestPath *altpath = (NestPath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_nestloop(root, workspace, jointype, + altpath->outerjoinpath, + altpath->innerjoinpath, + extra); +#endif + final_cost_nestloop(root, altpath, workspace, extra); if (altpath->path.total_cost < pathnode->path.total_cost) pathnode = altpath; @@ -5180,6 +5204,19 @@ create_mergejoin_path(PlannerInfo *root, /* pathnode->skip_mark_restore will be set by final_cost_mergejoin */ /* pathnode->materialize_inner will be set by final_cost_mergejoin */ +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_mergejoin() needs to be + * recalculated. 
+ */ + initial_cost_mergejoin(root, workspace, jointype, mergeclauses, + pathnode->jpath.outerjoinpath, + pathnode->jpath.innerjoinpath, + outersortkeys, innersortkeys, + extra); +#endif + final_cost_mergejoin(root, pathnode, workspace, extra); #ifdef XCP @@ -5189,6 +5226,18 @@ create_mergejoin_path(PlannerInfo *root, foreach(lc, alternate) { MergePath *altpath = (MergePath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_mergejoin(root, workspace, jointype, mergeclauses, + altpath->jpath.outerjoinpath, + altpath->jpath.innerjoinpath, + outersortkeys, innersortkeys, + extra); +#endif + final_cost_mergejoin(root, altpath, workspace, extra); if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) pathnode = altpath; @@ -5277,8 +5326,23 @@ create_hashjoin_path(PlannerInfo *root, #ifdef XCP alternate = set_joinpath_distribution(root, (JoinPath *) pathnode); #endif - /* final_cost_hashjoin will fill in pathnode->num_batches */ +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_hashjoin() needs to be + * recalculated. + */ + initial_cost_hashjoin(root, + workspace, + jointype, + hashclauses, + pathnode->jpath.outerjoinpath, + pathnode->jpath.innerjoinpath, + extra); +#endif + + /* final_cost_hashjoin will fill in pathnode->num_batches */ final_cost_hashjoin(root, pathnode, workspace, extra); #ifdef XCP @@ -5288,6 +5352,20 @@ create_hashjoin_path(PlannerInfo *root, foreach(lc, alternate) { HashPath *altpath = (HashPath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_hashjoin(root, + workspace, + jointype, + hashclauses, + altpath->jpath.outerjoinpath, + altpath->jpath.innerjoinpath, + extra); +#endif + final_cost_hashjoin(root, altpath, workspace, extra); if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) pathnode = altpath; diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 8bf2e0e2..f79eb3bd 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1189,6 +1189,10 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) */ if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1) { + /* + * Try to shipping insert with multiple rtables. Skip FQS if it + * contains subquery. 
+ */ if(query->commandType == CMD_INSERT && query->onConflict) { ListCell *cell; diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index cfa96c42..d5868e69 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -207,13 +207,13 @@ explain (costs off) QUERY PLAN ----------------------------------------------------------------- Nested Loop + Join Filter: (ec1.ff = ec2.x1) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on ec2 - Filter: (x1 = '42'::int8alias2) + -> Seq Scan on ec1 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using ec1_pkey on ec1 - Index Cond: (ff = ec2.x1) + -> Seq Scan on ec2 + Filter: (x1 = '42'::int8alias2) (8 rows) create unique index ec1_expr1 on ec1((ff + 1)); diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index f45e67d5..9d08f4b2 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2478,8 +2478,8 @@ where not exists ( ) a1 on t3.c2 = a1.c1 where t1.c1 = t2.c2 ); - QUERY PLAN ------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Anti Join Hash Cond: (t1.c1 = t2.c2) @@ -2487,36 +2487,33 @@ where not exists ( -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c2 - -> Merge Left Join - Merge Cond: (t3.c2 = t5.c1) + -> Merge Right Join + Merge Cond: (t5.c1 = t3.c2) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c2 - -> Sort - Sort Key: t3.c2 - -> Merge Left Join - Merge Cond: (t2.c3 = t3.c1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c3 - -> Sort - Sort Key: t2.c3 - -> Seq Scan on tt4x t2 + Distribute results by H: c1 + -> Merge Join + Merge Cond: (t4.c2 = t5.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 -> Sort - Sort Key: t3.c1 - -> Seq Scan on tt4x t3 + Sort Key: t4.c2 + -> Seq Scan on tt4x t4 + -> Sort + Sort Key: t5.c1 + -> Seq Scan on tt4x t5 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c1 - -> Merge Join - Merge Cond: (t4.c2 = t5.c1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c2 - -> Sort - Sort Key: t4.c2 - -> Seq Scan on tt4x t4 - -> Sort - Sort Key: t5.c1 - -> Seq Scan on tt4x t5 -(36 rows) + Distribute results by H: c2 + -> Sort + Sort Key: t3.c2 + -> Hash Left Join + Hash Cond: (t2.c3 = t3.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c3 + -> Seq Scan on tt4x t2 + -> Hash + -> Seq Scan on tt4x t3 +(33 rows) -- -- regression test for problems of the sort depicted in bug #3494 @@ -3084,24 +3081,24 @@ select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from left join tenk1 t2 on (subq1.y1 = t2.unique1) where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Nested Loop Join Filter: (t1.stringu1 > t2.stringu2) -> Nested Loop - Join Filter: ((0) = i1.f1) -> Nested Loop - -> Nested Loop - Join Filter: ((1) = (1)) - -> Result - 
-> Result - -> Materialize + Join Filter: ((1) = (1)) + -> Hash Join + Hash Cond: (i1.f1 = (0)) -> Remote Subquery Scan on all - -> Index Scan using tenk1_unique2 on tenk1 t1 - Index Cond: ((unique2 = (11)) AND (unique2 < 42)) + -> Seq Scan on int4_tbl i1 + -> Hash + -> Result + -> Result -> Materialize -> Remote Subquery Scan on all - -> Seq Scan on int4_tbl i1 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: ((unique2 = (11)) AND (unique2 < 42)) -> Materialize -> Remote Subquery Scan on all -> Index Scan using tenk1_unique1 on tenk1 t2 @@ -3244,19 +3241,19 @@ where t1.unique1 = 1; -> Materialize -> Remote Subquery Scan on all Distribute results by H: hundred - -> Nested Loop + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) Join Filter: (t1.ten = t3.ten) -> Remote Subquery Scan on all - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 t3 - Index Cond: (unique2 = t2.thousand) + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) (22 rows) explain (num_nodes off, nodes off, costs off) @@ -3275,19 +3272,19 @@ where t1.unique1 = 1; -> Materialize -> Remote Subquery Scan on all Distribute results by H: hundred - -> Nested Loop + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) Join Filter: ((t1.ten + t2.ten) = t3.ten) -> Remote Subquery Scan on all - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 t3 - Index Cond: (unique2 = t2.thousand) + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) (22 rows) explain (num_nodes off, nodes off, costs off) @@ -3295,31 +3292,31 @@ select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand join int4_tbl on b.thousand = f1; - QUERY PLAN -------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) Join Filter: (a.unique2 = b.unique1) -> Remote Subquery Scan on all Distribute results by H: thousand - -> Nested Loop - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Nested Loop - -> Seq Scan on int4_tbl - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = int4_tbl.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = int4_tbl.f1) - -> Index Scan using tenk1_unique1 on tenk1 a - Index Cond: (unique1 = b.unique2) - -> Materialize + -> Seq Scan on tenk1 c + -> Hash -> Remote Subquery Scan 
on all Distribute results by H: thousand - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = int4_tbl.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) (23 rows) select count(*) from @@ -3338,20 +3335,22 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.tenthous = i2.f1) -> Remote Subquery Scan on all - Distribute results by H: f1 - -> Seq Scan on int4_tbl i2 - -> Materialize - -> Remote Subquery Scan on all - Distribute results by H: tenthous - -> Nested Loop Left Join - Join Filter: (b.unique1 = 42) + Distribute results by H: tenthous + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Seq Scan on tenk1 c + -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 -> Nested Loop @@ -3359,16 +3358,17 @@ select b.unique1 from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl i1 - -> Index Scan using tenk1_thous_tenthous on tenk1 b - Index Cond: ((thousand = i1.f1) AND (i2.f1 = tenthous)) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = i1.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = i1.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) - -> Materialize - -> Remote Subquery Scan on all - Distribute results by H: 42 - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) -(28 rows) + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl i2 +(31 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3487,22 +3487,20 @@ select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) --------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Left Join - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.thousand = a.unique1) Filter: (COALESCE(b.twothousand, a.twothousand) = 44) - -> Index Scan using tenk1_unique2 on tenk1 a - Index Cond: (unique2 < 10) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = a.unique1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = a.unique1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: thousand + -> Seq Scan on tenk1 b + -> Hash + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 < 10) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Index Scan using tenk1_unique2 on tenk1 c Index Cond: 
((unique2 = COALESCE(b.twothousand, a.twothousand)) AND (unique2 = 44)) -(17 rows) +(15 rows) select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) @@ -3527,33 +3525,33 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------------------- - Nested Loop Left Join + QUERY PLAN +-------------------------------------------------------------------------------------- + Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) - Join Filter: ("*VALUES*".column1 = i1.f1) - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: i1.f1, (666) - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Nested Loop Left Join Output: i1.f1, 666 - -> Nested Loop Left Join - Output: i1.f1, 666 - -> Remote Subquery Scan on all (datanode_1) + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 - -> Materialize + -> Materialize + Output: i2.unique2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: unique2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 Output: i2.unique2 - Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 - Output: i2.unique2 - Index Cond: (i2.unique2 = i1.f1) + Index Cond: (i2.unique2 = i1.f1) + -> Hash + Output: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 (25 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from @@ -4720,18 +4718,18 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (a.unique1 = b.unique2) - -> Seq Scan on tenk1 a + Hash Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 b -> Hash - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b + -> Seq Scan on tenk1 a (10 rows) select count(*) from tenk1 a, @@ -5462,20 +5460,18 @@ select * from Output: int4_tbl.f1 -> Remote Subquery Scan on all Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Nested Loop Output: int4_tbl.f1 Join Filter: (int4_tbl.f1 = tenk1.unique1) - -> Remote Subquery Scan on all - Output: int4_tbl.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 - -> Materialize + -> HashAggregate Output: tenk1.unique1 + Group Key: tenk1.unique1 -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 Index Cond: (tenk1.unique2 = "*VALUES*".column2) -(21 rows) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(19 rows) select * from (values (0,9998), 
(1,1000)) v(id,x), @@ -5498,46 +5494,42 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) - Output: t1.q1, t1.q2 - -> Seq Scan on public.int8_tbl t1 - Output: t1.q1, t1.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 -> Materialize - Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Nested Loop - Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Nested Loop + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 + -> Subquery Scan on ss2 Output: ss2.q1, ss2.q2 - -> Subquery Scan on ss2 - Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result Output: t3.q2 - -> Result - Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) -(37 rows) + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(33 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 090e4122..85aea9c7 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -244,18 +244,18 @@ ERROR: permission denied for relation atest12 -- This plan should use hashjoin, as it will expect many rows to be selected. 
SET random_page_cost = 8.5; EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (atest12.a = atest12_1.b) - -> Seq Scan on atest12 - Filter: (b <<< 5) + Hash Cond: (atest12_1.b = atest12.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on atest12 atest12_1 + Filter: (b <<< 5) -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on atest12 atest12_1 - Filter: (b <<< 5) + -> Seq Scan on atest12 + Filter: (b <<< 5) (10 rows) RESET random_page_cost; diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 175fe9ba..3581037d 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -79,6 +79,7 @@ ALTER TABLE ab1 ALTER a SET STATISTICS 0; INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a; CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1; ANALYZE ab1; +WARNING: statistics object "public.ab1_a_b_stats" could not be computed for relation "public.ab1" SELECT (stxndistinct IS NOT NULL) AS ndistinct, (stxdependencies IS NOT NULL) AS dependencies @@ -91,6 +92,7 @@ FROM pg_statistic_ext WHERE stxname = 'ab1_a_b_stats'; ALTER TABLE ab1 ALTER a SET STATISTICS -1; -- partial analyze doesn't build stats either ANALYZE ab1 (a); +WARNING: statistics object "public.ab1_a_b_stats" could not be computed for relation "public.ab1" SELECT (stxndistinct IS NOT NULL) AS ndistinct, (stxdependencies IS NOT NULL) AS dependencies @@ -226,7 +228,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: a -> Partial GroupAggregate Group Key: a, b, c, d -> Sort @@ -242,7 +244,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: b -> Partial GroupAggregate Group Key: b, c, d -> Sort @@ -257,7 +259,7 @@ SELECT stxkind, stxndistinct FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; stxkind | stxndistinct ---------+--------------------------------------------------------- - {d,f} | {"3, 4": 161, "3, 6": 161, "4, 6": 161, "3, 4, 6": 161} + {d,f} | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301} (1 row) -- Hash Aggregate, thanks to estimates improved by the statistic @@ -313,7 +315,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: a -> Partial GroupAggregate Group Key: a, b, c, d -> Sort @@ -329,7 +331,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: b -> Partial GroupAggregate Group Key: b, c, d -> Sort @@ -346,9 +348,9 @@ INSERT INTO ndistinct (a, b, c, filler1) ANALYZE ndistinct; SELECT stxkind, stxndistinct FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; - stxkind | stxndistinct ----------+------------------------------------------------------------ - {d,f} | {"3, 4": 2378, "3, 6": 800, "4, 6": 1632, "3, 4, 
6": 6060} + stxkind | stxndistinct +---------+------------------------------------------------------------- + {d,f} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000} (1 row) -- plans using Group Aggregate, thanks to using correct esimates diff --git a/src/test/regress/expected/stats_ext_3.out b/src/test/regress/expected/stats_ext_3.out index 3581037d..e69852b6 100644 --- a/src/test/regress/expected/stats_ext_3.out +++ b/src/test/regress/expected/stats_ext_3.out @@ -206,19 +206,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: a, b, c - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Partial GroupAggregate - Group Key: a, b, c - -> Sort - Sort Key: a, b, c - -> Seq Scan on ndistinct -(10 rows) + -> Sort + Sort Key: a, b, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Partial GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -238,19 +240,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: b, c, d - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Partial GroupAggregate - Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Sort + Sort Key: b, c, d + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Partial GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(12 rows) -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; @@ -455,17 +459,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: a, b, c - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Partial HashAggregate - Group Key: a, b, c - -> Seq Scan on ndistinct -(8 rows) + -> Sort + Sort Key: a, b, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Partial GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -483,17 +491,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize 
GroupAggregate Group Key: b, c, d - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d - -> Partial HashAggregate - Group Key: b, c, d - -> Seq Scan on ndistinct -(8 rows) + -> Sort + Sort Key: b, c, d + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: d + -> Partial GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index a63fb3c4..50633a31 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -853,11 +853,11 @@ explain (verbose, costs off) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in (select ten from tenk1 b); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Hash Join Output: int4_tbl.f1 Join Filter: ((CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END) = b.ten) -> Remote Subquery Scan on all (datanode_1) @@ -872,12 +872,23 @@ select * from int4_tbl where Output: a.unique1 -> Materialize Output: b.ten - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> HashAggregate Output: b.ten - Distribute results by H: ten - -> Seq Scan on public.tenk1 b + Group Key: b.ten + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: b.ten -(22 rows) + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Seq Scan on public.tenk1 a + Output: a.unique1 +(26 rows) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 4cc96cde..c80fb0f2 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -313,30 +313,24 @@ select * from tab3_rep natural join tab4_rep explain (num_nodes on, nodes off, costs off, verbose on) select * from tab3_rep natural join tab4_rep where tab3_rep.val > 2 and tab4_rep.val < 5; - QUERY PLAN ------------------------------------------------------------------------------------ - Merge Join + QUERY PLAN +---------------------------------------------------------------------------------- + Hash Join Output: tab3_rep.val, tab3_rep.val2 - Merge Cond: ((tab3_rep.val = tab4_rep.val) AND (tab3_rep.val2 = tab4_rep.val2)) + Hash Cond: ((tab3_rep.val = tab4_rep.val) AND (tab3_rep.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab3_rep.val, tab3_rep.val2 - -> Sort + -> Seq Scan on public.tab3_rep Output: tab3_rep.val, tab3_rep.val2 - Sort Key: tab3_rep.val, tab3_rep.val2 - -> Seq Scan on public.tab3_rep - Output: tab3_rep.val, tab3_rep.val2 - Filter: (tab3_rep.val > 2) - -> Materialize + Filter: (tab3_rep.val > 2) + -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 - -> Sort + -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 
- Sort Key: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 5) -(21 rows) + Filter: (tab4_rep.val < 5) +(15 rows) -- Join involving one distributed and one replicated table, with replicated -- table existing on all nodes where distributed table exists. should be @@ -392,31 +386,24 @@ select * from tab1_mod natural join tab4_rep explain (verbose on, nodes off, costs off) select * from tab1_mod natural join tab4_rep where tab1_mod.val > 2 and tab4_rep.val < 4; - QUERY PLAN ------------------------------------------------------------------------------------ - Merge Join + QUERY PLAN +---------------------------------------------------------------------------------- + Hash Join Output: tab1_mod.val, tab1_mod.val2 - Merge Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) + Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Sort + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Materialize + Filter: (tab1_mod.val > 2) + -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 - -> Sort + -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 - Sort Key: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 4) -(22 rows) + Filter: (tab4_rep.val < 4) +(15 rows) -- Join involving two distributed tables, never shipped select * from tab1_mod natural join tab2_mod @@ -432,31 +419,25 @@ select * from tab1_mod natural join tab2_mod explain (verbose on, nodes off, costs off) select * from tab1_mod natural join tab2_mod where tab1_mod.val > 2 and tab2_mod.val < 4; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 - -> Merge Join + -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Sort + Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Materialize + Filter: (tab1_mod.val > 2) + -> Hash Output: tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.val, tab2_mod.val2 Distribute results by M: val - -> Sort + -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val < 4) -(22 rows) + Filter: (tab2_mod.val < 4) +(16 rows) -- Join involving a distributed table and two replicated tables, such that the -- distributed table exists only on nodes common to replicated tables, try few @@ -605,15 +586,15 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Sort Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Sort Key: tab1_mod.val2 - -> Hash Join + -> 
Nested Loop Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 - Hash Cond: (tab1_mod.val2 = tab4_rep.val2) + Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val = 1) - -> Hash + -> Materialize Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 @@ -641,15 +622,15 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod ------------------------------------------------------------------------- Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val - -> Hash Join + -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val - Hash Cond: (tab1_mod.val2 = tab2_mod.val2) + Join Filter: (tab1_mod.val2 = tab2_mod.val2) -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) - -> Hash + -> Materialize Output: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index 8f2b3800..69bd0130 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -209,47 +209,47 @@ ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1, t2, t3 for share of t1,t2 nowait; ERROR: FOR SHARE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2); - QUERY PLAN ---------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------- Remote Subquery Scan on all Output: t1.val, t1.val2, t2.val, t2.val2, t3.val, t3.val2 -> Merge Join Output: t1.val, t1.val2, t2.val, t2.val2, t3.val, t3.val2 - Merge Cond: (t3.val2 = t1.val2) - -> Remote Subquery Scan on all - Output: t3.val, t3.val2 - Distribute results by H: val2 - Sort Key: t3.val2 - -> Sort - Output: t3.val, t3.val2 - Sort Key: t3.val2 - -> Seq Scan on public.t3 - Output: t3.val, t3.val2 - -> Materialize + Merge Cond: (t1.val2 = t3.val2) + -> Merge Join Output: t1.val, t1.val2, t2.val, t2.val2 - -> Merge Join - Output: t1.val, t1.val2, t2.val, t2.val2 - Merge Cond: (t1.val2 = t2.val2) - -> Remote Subquery Scan on all + Merge Cond: (t1.val2 = t2.val2) + -> Remote Subquery Scan on all + Output: t1.val, t1.val2 + Distribute results by H: val2 + Sort Key: t1.val2 + -> Sort Output: t1.val, t1.val2 - Distribute results by H: val2 Sort Key: t1.val2 - -> Sort + -> Seq Scan on public.t1 Output: t1.val, t1.val2 - Sort Key: t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 - -> Materialize + -> Materialize + Output: t2.val, t2.val2 + -> Remote Subquery Scan on all Output: t2.val, t2.val2 - -> Remote Subquery Scan on all + Distribute results by H: val2 + Sort Key: t2.val2 + -> Sort Output: t2.val, t2.val2 - Distribute results by H: val2 Sort Key: t2.val2 - -> Sort + -> Seq Scan on public.t2 Output: t2.val, t2.val2 - Sort Key: t2.val2 - -> Seq Scan on public.t2 - Output: t2.val, t2.val2 + -> Materialize + Output: t3.val, t3.val2 + -> Remote Subquery Scan on all + Output: t3.val, t3.val2 + Distribute results by H: val2 + Sort Key: t3.val2 + -> Sort + 
Output: t3.val, t3.val2 + Sort Key: t3.val2 + -> Seq Scan on public.t3 + Output: t3.val, t3.val2 (39 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2) for update; @@ -262,12 +262,12 @@ select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2); val | val2 | val | val2 | val | val2 -----+------+-----+------+-----+------ 1 | 11 | 3 | 11 | 5 | 11 - 1 | 11 | 4 | 11 | 5 | 11 - 2 | 11 | 3 | 11 | 5 | 11 - 2 | 11 | 4 | 11 | 5 | 11 1 | 11 | 3 | 11 | 6 | 11 + 1 | 11 | 4 | 11 | 5 | 11 1 | 11 | 4 | 11 | 6 | 11 + 2 | 11 | 3 | 11 | 5 | 11 2 | 11 | 3 | 11 | 6 | 11 + 2 | 11 | 4 | 11 | 5 | 11 2 | 11 | 4 | 11 | 6 | 11 (8 rows) diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index cbc7d0c4..8db42b7f 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -85,30 +85,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Partial HashAggregate Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(34 rows) +(26 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; @@ -2125,30 +2117,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Sort Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: 
xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(40 rows) +(32 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; @@ -3736,30 +3720,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Partial HashAggregate Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(34 rows) +(26 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; @@ -5824,30 +5800,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Sort Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(40 rows) +(32 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by 
x; diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index f12d97f9..93469960 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -151,34 +151,26 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 - -> GroupAggregate + -> HashAggregate Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 Group Key: xc_having_tab1.val2, xc_having_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val - Merge Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) + Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) -> Remote Subquery Scan on all Output: xc_having_tab1.val, xc_having_tab1.val2 Distribute results by H: val2 - Sort Key: xc_having_tab1.val2 - -> Sort + -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 - Sort Key: xc_having_tab1.val2 - -> Seq Scan on public.xc_having_tab1 - Output: xc_having_tab1.val, xc_having_tab1.val2 - -> Materialize + -> Hash Output: xc_having_tab2.val, xc_having_tab2.val2 -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 Distribute results by H: val2 - Sort Key: xc_having_tab2.val2 - -> Sort + -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 - Sort Key: xc_having_tab2.val2 - -> Seq Scan on public.xc_having_tab2 - Output: xc_having_tab2.val, xc_having_tab2.val2 -(29 rows) +(21 rows) -- group by and having, without aggregate in the target list select val2 from xc_having_tab1 group by val2 having sum(val) > 8; diff --git a/src/test/regress/expected/xl_join.out b/src/test/regress/expected/xl_join.out index 463e1baa..6369183d 100644 --- a/src/test/regress/expected/xl_join.out +++ b/src/test/regress/expected/xl_join.out @@ -8,25 +8,25 @@ EXPLAIN (COSTS OFF) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 INNER JOIN xl_join_t3 ON xl_join_t1.val1 = xl_join_t3.val1; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join - Merge Cond: (xl_join_t3.val1 = xl_join_t1.val1) - -> Sort - Sort Key: xl_join_t3.val1 - -> Seq Scan on xl_join_t3 + Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: val2 + -> Sort + Sort Key: xl_join_t2.val2 + -> Seq Scan on xl_join_t2 -> Materialize -> Merge Join - Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) - -> Remote Subquery Scan on all 
(datanode_1,datanode_2) - Distribute results by H: val2 - -> Sort - Sort Key: xl_join_t2.val2 - -> Seq Scan on xl_join_t2 + Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) -> Sort Sort Key: xl_join_t1.val1 -> Seq Scan on xl_join_t1 + -> Sort + Sort Key: xl_join_t3.val1 + -> Seq Scan on xl_join_t3 (17 rows) SELECT * FROM xl_join_t1 From 84c9dda7b6de41ff2e2e28acac00222c91af2c39 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 29 Jun 2020 11:09:35 +0800 Subject: [PATCH 007/578] adjust set_join_distribution to include replicate small rel cost --- src/backend/optimizer/util/pathnode.c | 36 +++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 586a595c..c1c50654 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1422,6 +1422,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, Distribution *distribution = NULL; RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; +#ifdef __TBASE__ + int num_replication; + + /* IsLocatorNone() also indicates we are replicating through input nodes */ + num_replication = (IsLocatorReplicated(distributionType) || + IsLocatorNone(distributionType)) ? bms_num_members(nodes) : 1; +#endif if (distributionType != LOCATOR_TYPE_NONE) { @@ -1467,8 +1474,12 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, /* (re)calculate costs */ cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, +#ifdef __TBASE__ + num_replication); +#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); +#endif mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, pathnode->path.startup_cost, @@ -1532,8 +1543,12 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, cost_remote_subplan((Path *) pathnode, input_startup_cost, input_total_cost, subpath->rows, rel->reltarget->width, +#ifdef __TBASE__ + num_replication); +#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); +#endif return (Path *) pathnode; } } @@ -2256,6 +2271,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ + /* check if we can distribute by shard */ if (OidIsValid(group)) { int node_index; @@ -2275,9 +2291,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } /* - * if one of both is smaller enough, - * replicate the small one instead of redistribute both - */ + * if any side is smaller enough, replicate the smaller one + * instead of redistribute both of them. + */ if(inner_size * outer_nodes < inner_size + outer_size && (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && @@ -2373,7 +2389,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (new_inner_key) { #ifdef __TBASE__ - /* replicate outer rel */ + /* + * replicate outer rel, just set LOCATOR_TYPE_NONE to remove + * the path distribution. 
+ */ if(replicate_outer) { pathnode->outerjoinpath = redistribute_path( @@ -2382,7 +2401,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) outerpathkeys, LOCATOR_TYPE_NONE, NULL, - NULL, + innerd->nodes, NULL); if (IsA(pathnode, MergePath)) @@ -2414,7 +2433,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (new_outer_key) { #ifdef __TBASE__ - /* replicate inner rel */ + /* + * replicate inner rel, just set LOCATOR_TYPE_NONE to remove + * the path distribution. + */ if(replicate_inner) { pathnode->innerjoinpath = redistribute_path( @@ -2423,7 +2445,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) innerpathkeys, LOCATOR_TYPE_NONE, NULL, - NULL, + outerd->nodes, NULL); if (IsA(pathnode, MergePath)) From 4360b39b684291a520ca358761a1e3480e9d6577 Mon Sep 17 00:00:00 2001 From: branwu Date: Mon, 29 Jun 2020 14:49:03 +0800 Subject: [PATCH 008/578] fix bugs on tbase online upgrade.ID80501281 --- .../tbase_upgrade_spec/2.15.12_after_start.sh | 39 ++++++++++++++----- .../tbase_upgrade_spec/2.15.12_before_stop.sh | 36 +++++++++++++---- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh b/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh index e4aaa208..1e7d4c45 100644 --- a/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh +++ b/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh @@ -13,18 +13,39 @@ execSql() { local sql="$1" local db=$2 - export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} && export PATH=${bin_dir}/bin:${PATH} && $bin_dir/bin/psql -h $host -p $port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' + local node_host="$3" + local node_port=$4 + export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} + export PATH=${bin_dir}/bin:${PATH} + $bin_dir/bin/psql -h $node_host -p $node_port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' } -dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname) -for db in ${dbs} -do - echo $db - execSql "create extension pg_stat_log" $db +getSeg() +{ + line="$1" + segNum="$2" + + seg=$(echo "$line" | awk -F'|' '{print $segNum}' "segNum=$segNum") + seg=$(echo $seg) + echo $seg +} + +nodeinfo=$(execSql "select node_host, node_port from pgxc_node where node_type='C' order by node_port asc limit 1" $dbname $host $port) + +num=$(echo "$nodeinfo" | wc -l) +for ((i=1; i<=num; ++i)); do + node_host=$(getSeg "$nodeinfo" 1) + node_port=$(getSeg "$nodeinfo" 2) + dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname $node_host $node_port) + for db in ${dbs} + do + execSql "create extension if not exists pg_stat_log" $db $node_host $node_port if [ $? 
-eq 0 ] then - echo "create pg_stat_log on $host:$port:$db success" + echo "create pg_stat_log on $node_host:$node_port:$db success" else - echo "create pg_stat_log on $host:$port:$db failed" + echo "create pg_stat_log on $node_host:$node_port:$db failed" fi -done \ No newline at end of file + done +done + diff --git a/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh b/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh index a110a5a3..58e6b62c 100644 --- a/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh +++ b/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh @@ -13,17 +13,39 @@ execSql() { local sql="$1" local db=$2 - export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} && export PATH=${bin_dir}/bin:${PATH} && $bin_dir/bin/psql -h $host -p $port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' + local node_host="$3" + local node_port=$4 + export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} + export PATH=${bin_dir}/bin:${PATH} + $bin_dir/bin/psql -h $node_host -p $node_port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' } -dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname) -for db in ${dbs} -do - execSql "drop extension pg_stat_log" $db +getSeg() +{ + line="$1" + segNum="$2" + + seg=$(echo "$line" | awk -F'|' '{print $segNum}' "segNum=$segNum") + seg=$(echo $seg) + echo $seg +} + +nodeinfo=$(execSql "select node_host, node_port from pgxc_node where node_type='C' order by node_port asc limit 1" $dbname $host $port) + +num=$(echo "$nodeinfo" | wc -l) +for ((i=1; i<=num; ++i)); do + node_host=$(getSeg "$nodeinfo" 1) + node_port=$(getSeg "$nodeinfo" 2) + dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname $node_host $node_port) + for db in ${dbs} + do + execSql "drop extension if exists pg_stat_log" $db $node_host $node_port if [ $? 
-eq 0 ] then - echo "drop pg_stat_log on $host:$port:$db success" + echo "drop pg_stat_log on $node_host:$node_port:$db success" else - echo "drop pg_stat_log on $host:$port:$db failed" + echo "drop pg_stat_log on $node_host:$node_port:$db failed" fi + done done + From 52a3acd2afcbc5897a01c023013116df836cb4c7 Mon Sep 17 00:00:00 2001 From: aidenma Date: Mon, 29 Jun 2020 21:24:02 +0800 Subject: [PATCH 009/578] Fallback slave phyiscal replication fail problem tapd:http://tapd.tencent.com/pgxz/bugtrace/bugs/view?bug_id=1110092131078795210 --- src/bin/pg_basebackup/pg_basebackup.c | 167 +++----------------------- 1 file changed, 18 insertions(+), 149 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index fa586130..2f5717e2 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1751,25 +1751,21 @@ WriteRecoveryConf(void) static void BaseBackup(void) -{// #lizard forgives - PGresult *res; - char *sysidentifier; - TimeLineID latesttli; - TimeLineID starttli; - char *basebkp; - char escaped_label[MAXPGPATH]; - char *maxrate_clause = NULL; - int i; - char xlogstart[64]; - char xlogend[64]; - int minServerMajor, - maxServerMajor; - int serverVersion, - serverMajor; - PGconn *connDev = NULL; - PGresult *resDev = NULL; - char connInfo[MAXPGPATH]; - char *default_dbname = " dbname=postgres"; +{ + PGresult *res; + char *sysidentifier; + TimeLineID latesttli; + TimeLineID starttli; + char *basebkp; + char escaped_label[MAXPGPATH]; + char *maxrate_clause = NULL; + int i; + char xlogstart[64]; + char xlogend[64]; + int minServerMajor, + maxServerMajor; + int serverVersion, + serverMajor; Assert(conn != NULL); @@ -1867,136 +1863,9 @@ BaseBackup(void) disconnect_and_exit(1); } - /* - * found connstr is contain 'dbname' in pg_basebackup use -d parmas - */ - memset(connInfo, '\0', sizeof(connInfo)); - if (NULL != connection_string) - { - if (NULL != strstr(connection_string,"dbname")) - { - snprintf(connInfo, sizeof(connInfo), "%s",connection_string); - - } - else - { - snprintf(connInfo, sizeof(connInfo), "%s %s",connection_string, default_dbname); - } - } - /* - * found connstr is contain 'dbname' in pg_basebackup not use -d parmas| use -U -h -p parmas - */ - else if ((NULL != dbname) || (NULL != dbport) || (NULL != dbuser)) - { - if(NULL == dbname) - { - snprintf(connInfo, sizeof(connInfo), "host=%s port=%s user=%s %s", dbhost, dbport, dbuser, default_dbname); - } - else - { - snprintf(connInfo, sizeof(connInfo), "host=%s port=%s user=%s dbname=%s", dbhost, dbport, dbuser, dbname); - } - } - connDev = PQconnectdb(connInfo); - - if (PQstatus(connDev) != CONNECTION_OK) - { - fprintf(stderr, "Connection to database failed: %s\n", - PQerrorMessage(connDev)); - disconnect_and_exit(1); - } - resDev = PQexec(connDev, "select restart_lsn from pg_replication_slots order by restart_lsn asc limit 1"); - memset(xlogstart, '\0', sizeof(xlogstart)); - if (PQntuples(resDev) == 0) - { - strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); - } - else - { - strlcpy(xlogstart, PQgetvalue(resDev, 0, 0), sizeof(xlogstart)); - fprintf(stderr, _("%s: In pg_replication_slots restartlsn exchange write-ahead log start point: %s\n"), - progname, xlogstart); - } - - if (NULL != resDev) - { - PQclear(resDev); - } - - if (NULL != connDev) - { - PQfinish(connDev); - } - - - if (verbose) - fprintf(stderr, _("%s: checkpoint completed\n"), progname); - - /* - * 9.3 and later sends the TLI of the starting point. 
With older servers, - * assume it's the same as the latest timeline reported by - * IDENTIFY_SYSTEM. - */ - if (PQnfields(res) >= 2) - starttli = atoi(PQgetvalue(res, 0, 1)); - else - starttli = latesttli; - PQclear(res); - MemSet(xlogend, 0, sizeof(xlogend)); - - if (verbose && includewal != NO_WAL) - fprintf(stderr, _("%s: write-ahead log start point: %s on timeline %u\n"), - progname, xlogstart, starttli); - - /* - * Get the header - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, _("%s: could not get backup header: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) < 1) - { - fprintf(stderr, _("%s: no data returned from server\n"), progname); - disconnect_and_exit(1); - } - - /* - * Sum up the total size, for progress reporting - */ - totalsize = totaldone = 0; - tablespacecount = PQntuples(res); - for (i = 0; i < PQntuples(res); i++) - { - totalsize += atol(PQgetvalue(res, i, 2)); - - /* - * Verify tablespace directories are empty. Don't bother with the - * first once since it can be relocated, and it will be checked before - * we do anything anyway. - */ - if (format == 'p' && !PQgetisnull(res, i, 1)) - { - char *path = (char *) get_tablespace_mapping(PQgetvalue(res, i, 1)); - - verify_dir_is_empty_or_create(path, &made_tablespace_dirs, &found_tablespace_dirs); - } - } - - /* - * When writing to stdout, require a single tablespace - */ - if (format == 't' && strcmp(basedir, "-") == 0 && PQntuples(res) > 1) - { - fprintf(stderr, - _("%s: can only write single tablespace to stdout, database has %d\n"), - progname, PQntuples(res)); - disconnect_and_exit(1); - } - + /* start_point: get last checkpoint point position from master */ + strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); + /* * If we're streaming WAL, start the streaming session before we start * receiving the actual data chunks. From 467725d7a84d1f533cc80702585936945d3add7b Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 29 Jun 2020 21:54:07 +0800 Subject: [PATCH 010/578] consider nestloop inner plan materialization cost Nestloop join will add material plannode atop of inner subplan. This is checked and added during create plan phase. But we did not consider the cost in cost modle. I think the code was originally copied from merge join, but forgot to copy the cost modle change in final_cost_mergejoin. --- src/backend/optimizer/path/costsize.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 64872e08..7de5eaa4 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2345,6 +2345,25 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, startup_cost += path->path.pathtarget->cost.startup; run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; +#ifdef __TBASE__ + /* + * While NestLoop is executed it rescans inner plan. We do not want to + * rescan RemoteSubplan and do not support it. So if inner_plan is a + * RemoteSubplan, materialize it. + * + * We add materialize plannode during the create plan phase to avoid + * other optimizer side affect. But we still need to add the cost here + * just like mergejoin did when considering materialize_inner flag. + * During join reordering phase, there should be no other node between + * current nestloop and RemoteSubPath. Thus we do not need to traverse + * the whole subpath to find RemoteSubPath. 
+ */ + if (IsA(inner_path, RemoteSubPath)) + { + run_cost += cpu_operator_cost * inner_path_rows; + } +#endif + path->path.startup_cost = startup_cost; path->path.total_cost = startup_cost + run_cost; } From aadbcd3569cc9410bd7dadc25d24ced0e2bf30a7 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 30 Jun 2020 15:07:48 +0800 Subject: [PATCH 011/578] Fix ReinitializeParallelDSM to tolerate finding no error queues.(Merge Postgres) Commit d4663350646ca0c069a36d906155a0f7e3372eb7 changed things so that shm_toc_lookup would fail with an error rather than silently returning NULL in the hope that such failures would be reported in a useful way rather than via a system crash. However, it overlooked the fact that the lookup of PARALLEL_KEY_ERROR_QUEUE in ReinitializeParallelDSM is expected to fail when no DSM segment was created in the first place; in that case, we end up with a backend-private memory segment that still contains an entry for PARALLEL_KEY_FIXED but no others. Consequently a benign failure to initialize parallelism can escalate into an elog(ERROR); repair. --- src/backend/access/transam/parallel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 8cccbfb0..4cffc98a 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -420,9 +420,10 @@ ReinitializeParallelDSM(ParallelContext *pcxt) fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); fps->last_xlog_end = 0; - /* Recreate error queues. */ + /* Recreate error queues (if they exist). */ error_queue_space = - shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, true); + Assert(pcxt->nworkers == 0 || error_queue_space != NULL); for (i = 0; i < pcxt->nworkers; ++i) { char *start; From 0368d60dc3ce55c32c508a9a2aeeb94599abc605 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 30 Jun 2020 15:12:19 +0800 Subject: [PATCH 012/578] Be more wary about shm_toc_lookup failure.(Merge Postgres) Commit 445dbd82a basically missed the point of commit d46633506, which was that we shouldn't allow shm_toc_lookup() failure to lead to a core dump or assertion crash, because the odds of such a failure should never be considered negligible. It's correct that we can't expect the PARALLEL_KEY_ERROR_QUEUE TOC entry to be there if we have no workers. But if we have no workers, we're not going to do anything in this function with the lookup result anyway, so let's just skip it. That lets the code use the easy-to-prove-safe noError=false case, rather than anything requiring effort to review. Back-patch to v10, like the previous commit. Discussion: https://postgr.es/m/3647.1517601675@sss.pgh.pa.us --- src/backend/access/transam/parallel.c | 29 +++++++++++++++------------ 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 4cffc98a..84256fe1 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -405,8 +405,6 @@ void ReinitializeParallelDSM(ParallelContext *pcxt) { FixedParallelState *fps; - char *error_queue_space; - int i; /* Wait for any old workers to exit. */ if (pcxt->nworkers_launched > 0) @@ -421,18 +419,23 @@ ReinitializeParallelDSM(ParallelContext *pcxt) fps->last_xlog_end = 0; /* Recreate error queues (if they exist). 
*/ - error_queue_space = - shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, true); - Assert(pcxt->nworkers == 0 || error_queue_space != NULL); - for (i = 0; i < pcxt->nworkers; ++i) + if (pcxt->nworkers > 0) { - char *start; - shm_mq *mq; - - start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; - mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); - shm_mq_set_receiver(mq, MyProc); - pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + char *error_queue_space; + int i; + + error_queue_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } } } From 28ae14ba9851b828f8d5ca56f6af28050ace68c5 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 13 Jul 2020 20:21:02 +0800 Subject: [PATCH 013/578] refactor distributed transaction related functions 1. extract SetPlpgsqlTransactionBegin() 2. extract tuple visibility debugging functions 3. mask debugging functions in tuple visibility core path 4. other code format refactor --- src/backend/access/transam/gtm.c | 4 +- src/backend/access/transam/twophase.c | 25 +- src/backend/access/transam/xact.c | 43 +- src/backend/pgxc/pool/execRemote.c | 956 +++++----- src/backend/pgxc/pool/pgxcnode.c | 7 +- src/backend/storage/lmgr/nodelock.c | 8 +- src/backend/tcop/postgres.c | 98 +- src/backend/utils/cache/syscache.c | 2 +- src/backend/utils/time/tqual.c | 2463 +++++++++++-------------- src/include/utils/snapshot.h | 17 +- 10 files changed, 1668 insertions(+), 1955 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 47b698d4..4545592b 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -200,7 +200,9 @@ void RegisterRenameSequence(char *new, char *old) rename_info = (RenameInfo *) lfirst(cell); if (0 == strncmp(rename_info->new, old, GTM_NAME_LEN)) { - elog(LOG, "Combine requence seq:%s ->:%s, %s->%s to old:%s latest new:%s", rename_info->new, rename_info->old, new, old, rename_info->old, new); + elog(LOG, "Combine requence seq:%s ->:%s, %s->%s to old:%s latest " + "new:%s", rename_info->new, rename_info->old, new, old, + rename_info->old, new); snprintf(rename_info->new, GTM_NAME_LEN, "%s", new); return; } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b2e35941..204e9edd 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -406,7 +406,8 @@ PostPrepare_Twophase(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static void RecoverEndGlobalPrepare(GlobalTransaction gxact) +static void +RecoverEndGlobalPrepare(GlobalTransaction gxact) { volatile PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; @@ -416,17 +417,17 @@ static void RecoverEndGlobalPrepare(GlobalTransaction gxact) } - -void EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit) -{ - volatile PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - - pg_atomic_write_u64(&pgxact->prepare_timestamp, GetGlobalPrepareTimestamp()); - if(enable_distri_print) - { - elog(LOG, "proc no %d prepare timestamp " INT64_FORMAT " xid %d.", gxact->pgprocno, - GetGlobalPrepareTimestamp(), pgxact->xid); - } +void +EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit) +{ + volatile PGXACT *pgxact = 
&ProcGlobal->allPgXact[gxact->pgprocno]; + + pg_atomic_write_u64(&pgxact->prepare_timestamp, GetGlobalPrepareTimestamp()); + if(enable_distri_print) + { + elog(LOG, "proc no %d prepare timestamp " INT64_FORMAT " xid %d.", gxact->pgprocno, + GetGlobalPrepareTimestamp(), pgxact->xid); + } if(isImplicit && !GlobalTimestampIsValid(pg_atomic_read_u64(&pgxact->prepare_timestamp))) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 993d0c52..5369958d 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -353,6 +353,14 @@ static TimestampTz xactStartTimestamp; static TimestampTz stmtStartTimestamp; static TimestampTz xactStopTimestamp; + +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ +static GlobalTimestamp XactGlobalCommitTimestamp = 0; +static GlobalTimestamp XactGlobalPrepareTimestamp = 0; +static GlobalTimestamp XactLocalCommitTimestamp = 0; +static GlobalTimestamp XactLocalPrepareTimestamp = 0; +#endif + /* * PGXC receives from GTM a timestamp value at the same time as a GXID * This one is set as GTMxactStartTimestamp and is a return value of now(), current_transaction(). @@ -361,14 +369,6 @@ static TimestampTz xactStopTimestamp; * during a transaction. Delta can have a different value through the nodes of the cluster * but its uniqueness in the cluster is maintained thanks to the global value GTMxactStartTimestamp. */ -#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static GlobalTimestamp XactGlobalCommitTimestamp = 0; -static GlobalTimestamp XactGlobalPrepareTimestamp = 0; -static GlobalTimestamp XactLocalCommitTimestamp = 0; -static GlobalTimestamp XactLocalPrepareTimestamp = 0; - -#endif - #ifdef PGXC static TimestampTz GTMxactStartTimestamp = 0; static TimestampTz GTMdeltaTimestamp = 0; @@ -7969,7 +7969,6 @@ NeedBeginTxn(void) return ret; } - bool NeedBeginSubTxn(void) { @@ -7987,19 +7986,21 @@ NeedBeginSubTxn(void) void SetNodeBeginTxn(Oid nodeoid) { - TransactionState s = &TopTransactionStateData; - MemoryContext oldcontext = NULL; - - if (!InPlpgsqlFunc() || s->nestingLevel != 1) - { - elog(PANIC,"SetNodeBeginTxn should only called in plpgsql exec env and TopmostTxn"); - } + TransactionState s = &TopTransactionStateData; + MemoryContext oldcontext = NULL; + + if (!InPlpgsqlFunc() || s->nestingLevel != 1) + { + elog(PANIC,"SetNodeBeginTxn should only called in plpgsql exec env and " + "TopmostTxn"); + } - oldcontext = MemoryContextSwitchTo(TopTransactionContext); - - s->node_has_begin_txn_list = list_append_unique_oid(s->node_has_begin_txn_list, nodeoid); - - MemoryContextSwitchTo(oldcontext); + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + s->node_has_begin_txn_list = + list_append_unique_oid(s->node_has_begin_txn_list, nodeoid); + + MemoryContextSwitchTo(oldcontext); } void diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 4183be8b..72aa55f1 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3425,11 +3425,13 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, } /* Send timestamp and check for errors */ - if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) - { - elog(WARNING, "pgxc_node_begin sending timestamp fails: local start timestamp" INT64_FORMAT, timestamp); - return EOF; - } + if (GlobalTimestampIsValid(timestamp) && + pgxc_node_send_timestamp(connections[i], timestamp)) + { + elog(WARNING, "pgxc_node_begin sending timestamp fails: local start" + " 
timestamp" INT64_FORMAT, timestamp); + return EOF; + } if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid)) { need_tran_block = true; @@ -3445,64 +3447,76 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, need_send_begin = true; } - if (connections[i]->plpgsql_need_begin_txn && connections[i]->plpgsql_need_begin_sub_txn && 'I' == connections[i]->transaction_status) - { - need_send_begin = true; - cmd = begin_both_cmd; - connections[i]->plpgsql_need_begin_txn = false; - connections[i]->plpgsql_need_begin_sub_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn was true, and conn->plpgsql_need_begin_sub_txn was true. in_plpgsql_exec_fun:%d", - cmd, g_in_plpgsql_exec_fun); - } - } - else if (connections[i]->plpgsql_need_begin_txn && 'I' == connections[i]->transaction_status) - { - need_send_begin = true; - connections[i]->plpgsql_need_begin_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_sub_txn:%d", - cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_sub_txn); - } - } - else if (connections[i]->plpgsql_need_begin_sub_txn) - { - need_send_begin = true; - cmd = begin_subtxn_cmd; - connections[i]->plpgsql_need_begin_sub_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_sub_txn was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_txn:%d", - cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_txn); - } - if ('T' != connections[i]->transaction_status) - { - elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong transaction_status"); - } - } - + if (connections[i]->plpgsql_need_begin_txn && + connections[i]->plpgsql_need_begin_sub_txn && + 'I' == connections[i]->transaction_status) + { + need_send_begin = true; + cmd = begin_both_cmd; + connections[i]->plpgsql_need_begin_txn = false; + connections[i]->plpgsql_need_begin_sub_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn " + "was true, and conn->plpgsql_need_begin_sub_txn was true. 
" + "in_plpgsql_exec_fun:%d", cmd, g_in_plpgsql_exec_fun); + } + } + else if (connections[i]->plpgsql_need_begin_txn && + 'I' == connections[i]->transaction_status) + { + need_send_begin = true; + connections[i]->plpgsql_need_begin_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn " + "was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_sub_txn:%d", + cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_sub_txn); + } + } + else if (connections[i]->plpgsql_need_begin_sub_txn) + { + need_send_begin = true; + cmd = begin_subtxn_cmd; + connections[i]->plpgsql_need_begin_sub_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_sub_txn was" + " true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_txn:%d", + cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_txn); + } + if ('T' != connections[i]->transaction_status) + { + elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong" + "transaction_status"); + } + } - /* If exec savepoint command, we make sure begin should send(NB:can be sent only once) before send savepoint */ - if ('I' == connections[i]->transaction_status && SavepointDefined()) - { - need_send_begin = true; - } + /* + * If exec savepoint command, we make sure begin should send(NB:can be + * sent only once) before send savepoint + */ + if ('I' == connections[i]->transaction_status && SavepointDefined()) + { + need_send_begin = true; + } /* - * Send the Coordinator info down to the PGXC node at the beginning of transaction, - * In this way, Datanode can print this Coordinator info into logfile, - * and those infos can be found in Datanode logifile if needed during debugging + * Send the Coordinator info down to the PGXC node at the beginning of + * transaction, In this way, Datanode can print this Coordinator info + * into logfile, and those infos can be found in Datanode logifile if + * needed during debugging */ if (need_send_begin && IS_PGXC_COORDINATOR) { pgxc_node_send_coord_info(connections[i], MyProcPid, MyProc->lxid); } -#endif +#endif - elog(DEBUG5, "[PLPGSQL] pgxc_node_begin need_tran_block %d, connections[%d]->transaction_status %c need_send_begin:%d", - need_tran_block, i, connections[i]->transaction_status, need_send_begin); + elog(DEBUG5, "[PLPGSQL] pgxc_node_begin need_tran_block %d," + "connections[%d]->transaction_status %c need_send_begin:%d", + need_tran_block, i, connections[i]->transaction_status, + need_send_begin); /* Send BEGIN if not already in transaction */ if (need_send_begin) @@ -3513,8 +3527,9 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, return EOF; } - elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, connections[i]->nodename, connections[i]->backend_pid); - new_connections[new_count++] = connections[i]; + elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, + connections[i]->nodename, connections[i]->backend_pid); + new_connections[new_count++] = connections[i]; } #if 0 @@ -4877,6 +4892,40 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } +/* + * Set the node begein transaction in plpgsql function + */ +static void +SetPlpgsqlTransactionBegin(PGXCNodeHandle *conn) +{ + Oid nodeOid = conn->nodeoid; + + if (NeedBeginTxn() && !NodeHasBeginTxn(nodeOid)) + { + conn->plpgsql_need_begin_txn = true; + SetNodeBeginTxn(nodeOid); + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s " + 
"backendpid:%d sock:%d nodeoid:%u need_begin_txn", + conn->nodename, conn->backend_pid, conn->sock, + conn->nodeoid); + } + } + if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeOid)) + { + conn->plpgsql_need_begin_sub_txn = true; + SetNodeBeginSubTxn(nodeOid); + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s " + "backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", + conn->nodename, conn->backend_pid, conn->sock, + conn->nodeoid); + } + } +} + #ifdef __TWO_PHASE_TRANS__ void InitLocalTwoPhaseState(void) { @@ -6496,7 +6545,7 @@ ExecRemoteUtility(RemoteQuery *node) {// #lizard forgives RemoteQueryState *remotestate; ResponseCombiner *combiner; - bool force_autocommit = node->force_autocommit; + bool force_autocommit = node->force_autocommit; RemoteQueryExecType exec_type = node->exec_type; GlobalTransactionId gxid = InvalidGlobalTransactionId; Snapshot snapshot = NULL; @@ -6524,6 +6573,7 @@ ExecRemoteUtility(RemoteQuery *node) dn_conn_count = pgxc_connections->dn_conn_count; co_conn_count = pgxc_connections->co_conn_count; + /* exit right away if no nodes to run command on */ if (dn_conn_count == 0 && co_conn_count == 0) { @@ -6536,16 +6586,20 @@ ExecRemoteUtility(RemoteQuery *node) else need_tran_block = true; - /* Commands launched through EXECUTE DIRECT do not need start a transaction */ + /* + * Commands launched through EXECUTE DIRECT do not need start a + * transaction + */ if (exec_direct_type == EXEC_DIRECT_UTILITY) { need_tran_block = false; /* This check is not done when analyzing to limit dependencies */ if (IsTransactionBlock()) - ereport(ERROR, - (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), - errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block"))); + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot run EXECUTE DIRECT with utility inside a " + "transaction block"))); } #ifdef __TBASE__ @@ -6558,150 +6612,89 @@ ExecRemoteUtility(RemoteQuery *node) if (ActiveSnapshotSet()) snapshot = GetActiveSnapshot(); + #ifdef __TBASE__ if (!ExecDDLWithoutAcquireXid(node->parsetree)) #endif { if (!GlobalTransactionIdIsValid(gxid)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to get next transaction ID"))); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if(!IS_PGXC_LOCAL_COORDINATOR) { - /* - * Global xid is not needed to send to remote nodes - * for connections from coord and datanode as - * normal DDLs except for set_config_option are all single level - * connections from Coords executing distributed DDLs. - */ + /* + * Distributed DDLs only dispatch from the requested coordinator, thus + * we skip sending gxid to avoid cycling. + * + * Note: except for 'set_config_option'. 
+ */ gxid = InvalidTransactionId; } #endif #ifdef __TBASE__ + /* Set node begin transaction in plpgsql function for CN/DN */ + for (i = 0; i < dn_conn_count; i++) { - Oid nodeoid = InvalidOid; - PGXCNodeHandle *conn = NULL; - for (i = 0; i < dn_conn_count; i++) - { - conn = pgxc_connections->datanode_handles[i]; - nodeoid = conn->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - conn->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - conn->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - } - - for (i = 0; i < co_conn_count; i++) - { - conn = pgxc_connections->coord_handles[i]; - nodeoid = conn->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - conn->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - conn->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - } - } + SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); + } + + for (i = 0; i < co_conn_count; i++) + { + SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); + } #endif + /* + * DDL will firstly be executed on coordinators then datanodes + * which will avoid deadlocks in cluster. + * Let us assume that user sql and ddl hold conflict locks, + * then there will be two situations: + * 1. The coordinator is not locked, user sql will see datanodes with no lock. + * 2. The coordinator is locked, user sql will wait for ddl to complete. 
+ * + * Send BEGIN control command to all coordinator nodes + */ + if (pgxc_node_begin(co_conn_count, + pgxc_connections->coord_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_COORDINATOR)) { - if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles, - gxid, need_tran_block, false, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on Datanodes"))); - for (i = 0; i < dn_conn_count; i++) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - - if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send snapshot to Datanodes"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to Datanodes"))); - } - } + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on coordinators"))); } + /* Send other txn related messages to coordinator nodes */ + for (i = 0; i < co_conn_count; i++) { - if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles, - gxid, need_tran_block, false, PGXC_NODE_COORDINATOR)) + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on coordinators"))); - /* Now send it to Coordinators if necessary */ - for (i = 0; i < co_conn_count; i++) + errmsg("Failed to send command to coordinators"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) { - if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } - if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); } } @@ -6709,125 +6702,188 @@ ExecRemoteUtility(RemoteQuery *node) * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations */ + while (co_conn_count > 0) { - while (dn_conn_count > 0) - { - int i = 0; + int i = 0; - if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) - break; + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(co_conn_count, + pgxc_connections->coord_handles, + NULL)) + { /* - * Handle input from the Datanodes. - * We do not expect Datanodes returning tuples when running utility - * command. - * If we got EOF, move to the next connection, will receive more - * data on the next iteration. 
+ * Got error + * TODO(Tbase): How do we check the error here? */ - while (i < dn_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - int res = handle_response(conn, combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --dn_conn_count) - pgxc_connections->datanode_handles[i] = - pgxc_connections->datanode_handles[dn_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - } + break; } - } - /* Make the same for Coordinators */ - { - while (co_conn_count > 0) + while (i < co_conn_count) { - int i = 0; - - if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL)) - break; + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + int res = handle_response(conn, combiner); - while (i < co_conn_count) + if (res == RESPONSE_EOF) { - int res = handle_response(pgxc_connections->coord_handles[i], combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (pgxc_connections->coord_handles[i]->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); + errmsg("Unexpected FATAL ERROR on Connection to " + "Coordinator %s pid %d", + pgxc_connections->coord_handles[i]->nodename, + pgxc_connections->coord_handles[i]->backend_pid))); } } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --co_conn_count) + pgxc_connections->coord_handles[i] = + pgxc_connections->coord_handles[co_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } } } - /* - * We have processed all responses from nodes and if we have - * error message pending we can report it. 
All connections should be in - * consistent state now and so they can be released to the pool after ROLLBACK. - */ - pfree_pgxc_all_handles(pgxc_connections); - pgxc_node_report_error(combiner); + /* + * Send BEGIN control command to all data nodes + */ + if (pgxc_node_begin(dn_conn_count, + pgxc_connections->datanode_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_DATANODE)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on Datanodes"))); + } + + /* Send other txn related messages to data nodes */ + for (i = 0; i < dn_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + + if (conn->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(conn); + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to Datanodes"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to Datanodes"))); + } + } + + + /* Make the same for data nodes */ + while (dn_conn_count > 0) + { + int i = 0; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(dn_conn_count, + pgxc_connections->datanode_handles, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? + */ + break; + } + + /* + * Handle input from the data nodes. We do not expect data nodes + * returning tuples when running utility command. If we got EOF, move + * to the next connection, will receive more data on the next + * iteration. + */ + while (i < dn_conn_count) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + int res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --dn_conn_count) + pgxc_connections->datanode_handles[i] = + pgxc_connections->datanode_handles[dn_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + } + } + + /* + * We have processed all responses from nodes and if we have error message + * pending we can report it. All connections should be in consistent state + * now and so they can be released to the pool after ROLLBACK. 
+ */ + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); } @@ -8583,25 +8639,24 @@ ExecRemoteQuery(PlanState *pstate) connections = pgxc_connections->datanode_handles; total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; #ifdef __TBASE__ - if (regular_conn_count > 1) - { - need_global_snapshot = true; - } - else if (regular_conn_count == 1 && !need_global_snapshot) - { - MemoryContext old; - - int nodeid = PGXCNodeGetNodeId(connections[0]->nodeoid, NULL); - - old = MemoryContextSwitchTo(TopTransactionContext); - executed_node_list = list_append_unique_int(executed_node_list, nodeid); - MemoryContextSwitchTo(old); - - if (list_length(executed_node_list) > 1) - { - need_global_snapshot = true; - } - } + if (regular_conn_count > 1) + { + need_global_snapshot = true; + } + else if (regular_conn_count == 1 && !need_global_snapshot) + { + int nodeid = PGXCNodeGetNodeId(connections[0]->nodeoid, NULL); + MemoryContext old = MemoryContextSwitchTo(TopTransactionContext); + + executed_node_list = list_append_unique_int(executed_node_list, nodeid); + + MemoryContextSwitchTo(old); + + if (list_length(executed_node_list) > 1) + { + need_global_snapshot = true; + } + } #endif } else if (step->exec_type == EXEC_ON_COORDS) @@ -8655,58 +8710,16 @@ ExecRemoteQuery(PlanState *pstate) (TransactionBlockStatusCode() == 'T'); #ifdef __TBASE__ -{ - Oid nodeoid = InvalidOid; - if (primaryconnection) - { - nodeoid = primaryconnection->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - primaryconnection->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid ); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - primaryconnection->nodename, primaryconnection->backend_pid, primaryconnection->sock, primaryconnection->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - primaryconnection->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - primaryconnection->nodename, primaryconnection->backend_pid, primaryconnection->sock, primaryconnection->nodeoid); - } - } - } + /* Set plpgsql transaction begin for all connections */ + if (primaryconnection) + { + SetPlpgsqlTransactionBegin(primaryconnection); + } - for (i = 0; i < regular_conn_count; i++) - { - nodeoid = connections[i]->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - connections[i]->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - connections[i]->nodename, connections[i]->backend_pid, connections[i]->sock, connections[i]->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - connections[i]->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - connections[i]->nodename, connections[i]->backend_pid, connections[i]->sock, connections[i]->nodeoid); - } - } - } -} + for (i = 0; i < regular_conn_count; i++) + { + SetPlpgsqlTransactionBegin(connections[i]); + } #endif stat_statement(); stat_transaction(total_conn_count); @@ -8718,59 +8731,65 @@ ExecRemoteQuery(PlanState *pstate) //elog(LOG, 
"[PLPGSQL]ExecRemoteQuery has primaryconnection"); //primaryconnection->read_only = true; #ifdef __TBASE__ - combiner->connections = &primaryconnection; - combiner->conn_count = 1; - combiner->current_conn = 0; -#endif - if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", primaryconnection->nodename))); - - /* If explicit transaction is needed gxid is already sent */ - if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot)) - { - pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); - pfree_pgxc_all_handles(pgxc_connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - Assert(combiner->combine_type == COMBINE_TYPE_SAME); - - pgxc_node_receive(1, &primaryconnection, NULL); - /* Make sure the command is completed on the primary node */ - while (true) - { - int res = handle_response(primaryconnection, combiner); - if (res == RESPONSE_READY) - break; - else if (res == RESPONSE_EOF) - pgxc_node_receive(1, &primaryconnection, NULL); - else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR) - { - if (res == RESPONSE_COMPLETE && primaryconnection->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - primaryconnection->nodename, primaryconnection->backend_pid))); - - } - /* Get ReadyForQuery */ - continue; - } - else if (res == RESPONSE_ASSIGN_GXID) - continue; - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from data node"))); - } - if (combiner->errorMessage) - pgxc_node_report_error(combiner); - } + combiner->connections = &primaryconnection; + combiner->conn_count = 1; + combiner->current_conn = 0; +#endif + if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + primaryconnection->nodename))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(primaryconnection, + node, + snapshot)) + { + pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); + pfree_pgxc_all_handles(pgxc_connections); + + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + Assert(combiner->combine_type == COMBINE_TYPE_SAME); + + pgxc_node_receive(1, &primaryconnection, NULL); + /* Make sure the command is completed on the primary node */ + while (true) + { + int res = handle_response(primaryconnection, combiner); + if (res == RESPONSE_READY) + break; + else if (res == RESPONSE_EOF) + pgxc_node_receive(1, &primaryconnection, NULL); + else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR) + { + if (res == RESPONSE_COMPLETE && + primaryconnection->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", + primaryconnection->nodename, + primaryconnection->backend_pid))); + + } + /* Get ReadyForQuery */ + continue; + } + else if (res == RESPONSE_ASSIGN_GXID) + continue; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from data node"))); + } + if (combiner->errorMessage) + pgxc_node_report_error(combiner); 
+ } #ifdef __TBASE__ if (regular_conn_count > 0) @@ -8784,33 +8803,35 @@ ExecRemoteQuery(PlanState *pstate) { //connections[i]->read_only = true; #ifdef __TBASE__ - connections[i]->recv_datarows = 0; -#endif - if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", connections[i]->nodename))); - - /* If explicit transaction is needed gxid is already sent */ - if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) - { - pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); - pfree_pgxc_all_handles(pgxc_connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - connections[i]->combiner = combiner; - } + connections[i]->recv_datarows = 0; +#endif + if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + connections[i]->nodename))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) + { + pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); + pfree_pgxc_all_handles(pgxc_connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + connections[i]->combiner = combiner; + } - if (step->cursor) - { - combiner->cursor = step->cursor; - combiner->cursor_count = regular_conn_count; - combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *)); - memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *)); - } + if (step->cursor) + { + int conn_size = regular_conn_count * sizeof(PGXCNodeHandle *); + combiner->cursor = step->cursor; + combiner->cursor_count = regular_conn_count; + combiner->cursor_connections = (PGXCNodeHandle **)palloc(conn_size); + memcpy(combiner->cursor_connections, connections, conn_size); + } combiner->connections = connections; combiner->conn_count = regular_conn_count; @@ -10170,37 +10191,13 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) !IsA(outerPlan(plan), ModifyTable); #ifdef __TBASE__ -{ - for (i = 0; i < combiner->conn_count; i++) - { - PGXCNodeHandle *connection_tmp = combiner->connections[i]; - Oid nodeoid = connection_tmp->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - connection_tmp->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid ); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecFinishInitRemoteSubplan conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - connection_tmp->nodename, connection_tmp->backend_pid, connection_tmp->sock, connection_tmp->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - connection_tmp->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecFinishInitRemoteSubplan conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - connection_tmp->nodename, connection_tmp->backend_pid, connection_tmp->sock, connection_tmp->nodeoid); - } - } - } -} + /* Set plpgsql transaction begin for all connections */ + for (i = 0; i < combiner->conn_count; i++) + { + SetPlpgsqlTransactionBegin(combiner->connections[i]); + } #endif - - #if 0 for (i = 0; i < combiner->conn_count; 
i++) { @@ -10235,11 +10232,12 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) { PGXCNodeHandle *connection = combiner->connections[i]; - if (pgxc_node_begin(1, &connection, gxid, true, - is_read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", connection->nodename))); + if (pgxc_node_begin(1, &connection, gxid, true, + is_read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + connection->nodename))); if (pgxc_node_send_timestamp(connection, timestamp)) { @@ -10268,21 +10266,23 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) pgxc_node_send_plan(connection, cursor, "Remote Subplan", node->subplanstr, node->nParamRemote, paramtypes); - if (enable_statistic) - { - elog(LOG, "Plan Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", - MyProcPid, connection->backend_pid, connection->nodehost, connection->nodeport, connection->sock, cursor); - } - - if (pgxc_node_flush(connection)) - { - combiner->conn_count = 0; - pfree(combiner->connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send subplan to data nodes"))); - } - } + if (enable_statistic) + { + elog(LOG, "Plan Message:pid:%d,remote_pid:%d,remote_ip:%s," + "remote_port:%d,fd:%d,cursor:%s", + MyProcPid, connection->backend_pid, connection->nodehost, + connection->nodeport, connection->sock, cursor); + } + + if (pgxc_node_flush(connection)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send subplan to data nodes"))); + } + } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7ca99de4..4279a325 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -708,10 +708,15 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) #endif } - /* * Wait while at least one of specified connections has data available and read * the data into the buffer + * + * Returning state code + * DNStatus_OK = 0, + * DNStatus_ERR = 1, + * DNStatus_EXPIRED = 2, + * DNStatus_BUTTY */ #ifdef __TBASE__ int diff --git a/src/backend/storage/lmgr/nodelock.c b/src/backend/storage/lmgr/nodelock.c index 25e44b5f..060e0a0b 100644 --- a/src/backend/storage/lmgr/nodelock.c +++ b/src/backend/storage/lmgr/nodelock.c @@ -905,10 +905,10 @@ bool NodeLock(char *lockActions, char objectType, char *param1, char *param2, in } /* check to see whether running transactions exist or not. - * if checkTimes is given, we will wait for checkTimes seconds at most. - * before time's up, if no running transactions, keep going; else fail to - * lock node - */ + * if checkTimes is given, we will wait for checkTimes seconds at most. 
+ * before time's up, if no running transactions, keep going; else fail to + * lock node + */ if (ret) { RunningTransactions running = NULL; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 0af8acfa..ce95d95d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5743,56 +5743,56 @@ PostgresMain(int argc, char *argv[], SetGlobalTimestamp(gts, SNAPSHOT_COORDINATOR); break; #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - case 'Z': /* global prepare timestamp */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - - /* - * Set Xact global prepare timestamp - */ - if(enable_distri_print) - { - elog(LOG, "set global prepare gts " INT64_FORMAT, timestamp); - } - SetGlobalPrepareTimestamp(timestamp); - - break; - - - case 'T': /* global timestamp */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - - /* - * Set Xact global commit timestamp - */ - if(enable_distri_print) - { - elog(LOG, "set global commit gts " INT64_FORMAT, timestamp); - } - SetGlobalCommitTimestamp(timestamp); - break; + case 'Z': /* global prepare timestamp */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + + /* + * Set Xact global prepare timestamp + */ + if(enable_distri_print) + { + elog(LOG, "set global prepare gts " INT64_FORMAT, timestamp); + } + SetGlobalPrepareTimestamp(timestamp); + + break; - case 'G': /* Explicit prepared gid */ - { - const char *gid; - gid = pq_getmsgstring(&input_message); - pq_getmsgend(&input_message); - remotePrepareGID = MemoryContextStrdup(TopMemoryContext, gid); - elog(DEBUG8, "receive remote prepare gid %s", remotePrepareGID); - } - break; - case 'W': /* Prefinish phase */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - elog(DEBUG8, "get prefinish timestamp " INT64_FORMAT "for gid %s", timestamp, remotePrepareGID); - SetGlobalPrepareTimestamp(timestamp); - EndExplicitGlobalPrepare(remotePrepareGID); - pfree(remotePrepareGID); - remotePrepareGID = NULL; - ReadyForCommit(whereToSendOutput); - - break; + case 'T': /* global timestamp */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + + /* + * Set Xact global commit timestamp + */ + if(enable_distri_print) + { + elog(LOG, "set global commit gts " INT64_FORMAT, timestamp); + } + SetGlobalCommitTimestamp(timestamp); + break; + + case 'G': /* Explicit prepared gid */ + { + const char *gid; + gid = pq_getmsgstring(&input_message); + pq_getmsgend(&input_message); + remotePrepareGID = MemoryContextStrdup(TopMemoryContext, gid); + elog(DEBUG8, "receive remote prepare gid %s", remotePrepareGID); + } + break; + + case 'W': /* Prefinish phase */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + elog(DEBUG8, "get prefinish timestamp " INT64_FORMAT "for gid %s", timestamp, remotePrepareGID); + SetGlobalPrepareTimestamp(timestamp); + EndExplicitGlobalPrepare(remotePrepareGID); + pfree(remotePrepareGID); + remotePrepareGID = NULL; + ReadyForCommit(whereToSendOutput); + + break; #endif case 't': /* timestamp */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 4c12eb81..dbe9b893 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -1948,4 +1948,4 @@ void GetSysCacheInfo(int32 cacheid, *nkeys = cacheinfo[cacheid].nkeys; } } -#endif \ No newline at 
end of file +#endif diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index c95b3fa4..505028eb 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -86,18 +86,70 @@ SnapshotData SnapshotSelfData = {HeapTupleSatisfiesSelf}; SnapshotData SnapshotAnyData = {HeapTupleSatisfiesAny}; #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static bool XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, Snapshot snapshot, Buffer buffer, bool *need_retry, uint16 infomask); -static bool -XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); -#ifdef __SNAPSHOT_CHECK__ -static bool SnapshotCheck(TransactionId xid, Snapshot snapshot, int target_res, GlobalTimestamp target_committs); +static bool XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, + Snapshot snapshot, Buffer buffer, + bool *need_retry, uint16 infomask); +static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); + +/* Debugging.... */ + +#ifdef DIST_TXN_DEBUG +#define DEBUG_MVCC_XMIN(state, msg) \ + if(enable_distri_visibility_print && \ + TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) \ + { \ + elog(LOG, "MVCC ts " INT64_FORMAT " %s xmin %d %s.", \ + state? "true":"false", snapshot->start_ts, \ + HeapTupleHeaderGetRawXmin(tuple), msg); \ + } #else +#define DEBUG_MVCC_XMIN(state, msg) \ + ((void) 0) +#endif -#define SnapshotCheck(xid, snapshot, target_res, target_committs) +#ifdef DIST_TXN_DEBUG +#define DEBUG_MVCC_XMINXMAX(state, xmax, msg) \ + if(enable_distri_visibility_print && \ + TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) \ + { \ + elog(LOG, "MVCC ts " INT64_FORMAT " %s xmin %d xmax %d %s.", \ + state? "true":"false", snapshot->start_ts, \ + HeapTupleHeaderGetRawXmin(tuple), xmax, msg); \ + } +#else +#define DEBUG_MVCC_XMINXMAX(state, xmax, msg) \ + ((void) 0) +#endif +#ifdef DIST_TXN_DEBUG +#define DEBUG_SNAPSHOT(A) \ + do { \ + int _debug_snapshot_save_errno = errno; \ + if (enable_distri_visibility_print) \ + { \ + A; \ + } \ + errno = _debug_snapshot_save_errno; \ + } while (0) +#else +#define DEBUG_SNAPSHOT(A) \ + ((void) 0) #endif +#define DEBUG_INCREASE_VISIBLE_TUPLE \ + if(enable_distri_debug) \ + { \ + snapshot->number_visible_tuples++; \ + } + +#ifdef __SNAPSHOT_CHECK__ +static bool SnapshotCheck(TransactionId xid, Snapshot snapshot, int target_res, GlobalTimestamp target_committs); +#else +#define SnapshotCheck(xid, snapshot, target_res, target_committs) #endif + +#endif // __SUPPORT_DISTRIBUTED_TRANSACTION__ + /* #ifdef _MIGRATE_ SnapshotData SnapshotNowData = {HeapTupleSatisfiesNow}; @@ -1089,196 +1141,186 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static bool -XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer, - bool *need_retry) -{// #lizard forgives - - - GlobalTimestamp global_committs; - TransactionId xid = HeapTupleHeaderGetRawXmin(tuple); - bool res; - - global_committs = HeapTupleHderGetXminTimestapAtomic(tuple); - - if(!GlobalTimestampIsValid(global_committs)) - { - elog(DEBUG12, "invalid time xmin snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, need_retry, HEAP_XMIN_COMMITTED); - } - else if (snapshot->local || CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print) - { - elog(DEBUG12, "xmin local snapshot ts " 
INT64_FORMAT " res %d xid %d committs " INT64_FORMAT, - snapshot->start_ts, res, xid, global_committs); - } - return res; - } - else - { - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - *need_retry = false; - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " - INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - elog(DEBUG12, "outer xmin snapshot ts " INT64_FORMAT " global committs " INT64_FORMAT " xid %d.", - snapshot->start_ts, global_committs, xid); - - if(enable_distri_visibility_print) - { - if(!TransactionIdDidCommit(xid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("xmin transaction %d should commit but not. snapshot start ts " INT64_FORMAT - " commit %d abort %d in-progress %d active %d recentxmin %d" - " start ts " INT64_FORMAT " committs " INT64_FORMAT, - xid, - snapshot->start_ts, - TransactionIdDidCommit(xid), - TransactionIdDidAbort(xid), - TransactionIdIsInProgress(xid), - TransactionIdIsActive(xid), - RecentXmin, - snapshot->start_ts, - global_committs))); - } - } - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 21.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 22.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - return true; - } - } - - +XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, + Buffer buffer, bool *need_retry) +{ + GlobalTimestamp global_committs; + TransactionId xid = HeapTupleHeaderGetRawXmin(tuple); + bool res; + + global_committs = HeapTupleHderGetXminTimestapAtomic(tuple); + + if(!GlobalTimestampIsValid(global_committs)) + { + DEBUG_SNAPSHOT(elog(LOG, "invalid time xmin snapshot ts " INT64_FORMAT + " xid %d.", snapshot->start_ts, xid)); + return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, + need_retry, HEAP_XMIN_COMMITTED); + } + else if (snapshot->local || CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(LOG, "xmin local snapshot ts " INT64_FORMAT + " res %d xid %d committs " INT64_FORMAT, snapshot->start_ts, + res, xid, global_committs)); + return res; + } + else + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + *need_retry = false; + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum %d" + " in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + DEBUG_SNAPSHOT(elog(LOG, "outer xmin snapshot ts " INT64_FORMAT " global" + " committs " INT64_FORMAT " xid %d.", snapshot->start_ts, + global_committs, xid)); + DEBUG_SNAPSHOT( + if(!TransactionIdDidCommit(xid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("xmin transaction %d should commit but not. 
" + "snapshot start ts " INT64_FORMAT " commit %d " + "abort %d in-progress %d active %d recentxmin %d " + "start ts " INT64_FORMAT " committs " INT64_FORMAT, + xid, + snapshot->start_ts, + TransactionIdDidCommit(xid), + TransactionIdDidAbort(xid), + TransactionIdIsInProgress(xid), + TransactionIdIsActive(xid), + RecentXmin, + snapshot->start_ts, + global_committs))); + }); + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d " + "committs " INT64_FORMAT " 21.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" + " committs " INT64_FORMAT " 22.", snapshot->start_ts, xid, + global_committs)); + return true; + } + } } - static bool -XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer, - bool *need_retry) -{// #lizard forgives - - - GlobalTimestamp global_committs; - TransactionId xid = HeapTupleHeaderGetRawXmax(tuple); - bool res; - - global_committs = HeapTupleHderGetXmaxTimestapAtomic(tuple); - - if(!GlobalTimestampIsValid(global_committs)) - { - elog(DEBUG12, "invalid time xmax snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, need_retry, HEAP_XMAX_COMMITTED); - } - else if (snapshot->local || CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print) - { - elog(DEBUG12, "xmax local snapshot ts " INT64_FORMAT " res %d xid %d.", snapshot->start_ts, res, xid); - } - return res; - } - else - { - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - *need_retry = false; - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " - INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - elog(DEBUG12, "outer xmax snapshot ts " INT64_FORMAT " global committs " INT64_FORMAT " xid %d.", - snapshot->start_ts, global_committs, xid); - - - if(enable_distri_visibility_print) - { - if(!TransactionIdDidCommit(xid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("xmax transaction %d should commit but not. 
snapshot start ts " INT64_FORMAT - " commit %d abort %d in-progress %d active %d recentxmin %d" - " start ts " INT64_FORMAT " committs " INT64_FORMAT, - xid, - snapshot->start_ts, - TransactionIdDidCommit(xid), - TransactionIdDidAbort(xid), - TransactionIdIsInProgress(xid), - TransactionIdIsActive(xid), - RecentXmin, - snapshot->start_ts, - global_committs))); - } - } - - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 11.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 12.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - return true; - } - } - - +XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, + Buffer buffer, bool *need_retry) +{ + GlobalTimestamp global_committs; + TransactionId xid = HeapTupleHeaderGetRawXmax(tuple); + bool res; + + global_committs = HeapTupleHderGetXmaxTimestapAtomic(tuple); + + if(!GlobalTimestampIsValid(global_committs)) + { + DEBUG_SNAPSHOT(elog(LOG, "invalid time xmax snapshot ts " INT64_FORMAT + " xid %d.", snapshot->start_ts, xid)); + return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, + need_retry, HEAP_XMAX_COMMITTED); + } + else if (snapshot->local || CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(LOG, "xmax local snapshot ts " INT64_FORMAT " res " + "%d xid %d.", snapshot->start_ts, res, xid)); + return res; + } + else + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + *need_retry = false; + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum %d " + "in recovery %d", + xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + DEBUG_SNAPSHOT(elog(LOG, "outer xmax snapshot ts " INT64_FORMAT "global" + " committs " INT64_FORMAT "xid %d.", snapshot->start_ts, + global_committs, xid)); + DEBUG_SNAPSHOT( + if(!TransactionIdDidCommit(xid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("xmax transaction %d should commit but not. 
" + "snapshot start ts " INT64_FORMAT " commit %d " + "abort %d in-progress %d active %d recentxmin %d " + "start ts " INT64_FORMAT " committs " INT64_FORMAT, + xid, + snapshot->start_ts, + TransactionIdDidCommit(xid), + TransactionIdDidAbort(xid), + TransactionIdIsInProgress(xid), + TransactionIdIsActive(xid), + RecentXmin, + snapshot->start_ts, + global_committs))); + } + ); + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" + " committs" INT64_FORMAT "11.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " + "committs " INT64_FORMAT "12.", snapshot->start_ts, xid, + global_committs)); + return true; + } + } } - /* * HeapTupleSatisfiesMVCC * True iff heap tuple is valid for the given MVCC snapshot. @@ -1312,976 +1354,651 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer b bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{// #lizard forgives - HeapTupleHeader tuple = htup->t_data; - bool need_retry; - -retry: - need_retry = false; - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - -#ifdef _MIGRATE_ - if(IS_PGXC_DATANODE && ShardIDIsValid(tuple->t_shardid) && SnapshotGetShardTable(snapshot)) - { - bool shard_is_visible = bms_is_member(tuple->t_shardid/snapshot->groupsize, - SnapshotGetShardTable(snapshot)); - - if(!IsConnFromApp()) - { - if(!shard_is_visible) - return false; - } - else if(g_ShardVisibleMode != SHARD_VISIBLE_MODE_ALL) - { - if((!shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_VISIBLE) - || (shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_HIDDEN)) - { - return false; - } - } - } -#endif - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - { - //elog(DEBUG11, "heap invalid xmin"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin invalid.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - { - //elog(DEBUG11, "heap moved off current transaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved off"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - if(need_retry) - { - goto retry; - } - - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = 
HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - //elog(DEBUG11, "heap moved in in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved in"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - - if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* inserted after scan started */ - } - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 2.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 4.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* updated after scan started */ - } - else - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 5.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* updated before scan started */ - } - } - - if 
(!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap deleting subtransaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 6.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmin deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 7.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmin deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 8.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - //elog(DEBUG11, "heap xmin in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmin aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin abort.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - else - { - /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmin not committed according to snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d according to snapshot.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* treat as still in progress */ - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - { - //elog(DEBUG11, "heap invalid xmax"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ - //elog(DEBUG11, "heap xmax locked"); - if(enable_distri_visibility_print && 
TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap multi xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap multi xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d deleted after scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* deleted before scan started */ - } - } - if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap multi xmax in snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (TransactionIdDidCommit(xmax)) - { - //elog(DEBUG11, "heap multi xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed .", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* updating transaction committed */ - } - /* it must have aborted or crashed */ - //elog(DEBUG11, "heap multi xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax deleted before scan.", - 
snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - } - else - { - /* xmax is committed, but maybe not according to our snapshot */ - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmax not committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* treat as still in progress */ - } - } - - /* xmax transaction committed */ - //elog(DEBUG11, "heap xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed last.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), HeapTupleHeaderGetRawXmax(tuple)); - } - return false; -} - -#ifdef __STORAGE_SCALABLE__ -bool -HeapTupleSatisfiesUnshard(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{// #lizard forgives - HeapTupleHeader tuple = htup->t_data; - bool need_retry; - -retry: - need_retry = false; - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if(IS_PGXC_DATANODE && tuple->t_shardid < 0) - return false; - - if(IS_PGXC_DATANODE && tuple->t_shardid >= 0) - { - if(g_DatanodeShardgroupBitmap == NULL) - { - elog(ERROR, "shard map in share memory has not been initialized yet."); - } - LWLockAcquire(ShardMapLock, LW_SHARED); - if(bms_is_member(tuple->t_shardid, g_DatanodeShardgroupBitmap)) - { - LWLockRelease(ShardMapLock); - return false; - } - LWLockRelease(ShardMapLock); - } - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - { - //elog(DEBUG11, "heap invalid xmin"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin invalid.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - { - //elog(DEBUG11, "heap moved off current transaction"); - 
if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved off"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - if(need_retry) - { - goto retry; - } - - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - //elog(DEBUG11, "heap moved in in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved in"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - - if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* inserted after scan started */ - } - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 2.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - { - if(enable_distri_visibility_print && 
TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 4.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* updated after scan started */ - } - else - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 5.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* updated before scan started */ - } - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap deleting subtransaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 6.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmin deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 7.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmin deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 8.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - //elog(DEBUG11, "heap xmin in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmin aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin abort.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - else - { - /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto 
retry; - } - //elog(DEBUG11, "heap xmin not committed according to snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d according to snapshot.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* treat as still in progress */ - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - { - //elog(DEBUG11, "heap invalid xmax"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ - //elog(DEBUG11, "heap xmax locked"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); +{// #lizard forgives + HeapTupleHeader tuple = htup->t_data; + bool need_retry; - xmax = HeapTupleGetUpdateXid(tuple); +retry: + need_retry = false; + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); +#ifdef _MIGRATE_ + if(IS_PGXC_DATANODE && ShardIDIsValid(tuple->t_shardid) && SnapshotGetShardTable(snapshot)) + { + bool shard_is_visible = bms_is_member(tuple->t_shardid/snapshot->groupsize, + SnapshotGetShardTable(snapshot)); - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap multi xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap multi xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d deleted after scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* deleted before scan started */ - } - } - if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + if(!IsConnFromApp()) { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap multi xmax in snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; + if(!shard_is_visible) + return false; } - if (TransactionIdDidCommit(xmax)) + else if(g_ShardVisibleMode != SHARD_VISIBLE_MODE_ALL) { - //elog(DEBUG11, "heap multi xmax committed"); - 
if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) + if((!shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_VISIBLE) + || (shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_HIDDEN)) { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed .", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); + return false; } - return false; /* updating transaction committed */ } - /* it must have aborted or crashed */ - //elog(DEBUG11, "heap multi xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; } +#endif + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + { + DEBUG_MVCC_XMIN(false, "xmin invalid"); + return false; + } + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + { + DEBUG_MVCC_XMIN(false, "move off"); + return false; + } + if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move off 1"); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + if(need_retry) + { + goto retry; + } + + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, " move in"); + return false; + } + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(false, "current 1"); + return false; /* inserted after scan started */ + } + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + { + DEBUG_MVCC_XMIN(true, "current 2"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 4"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* updated after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 5"); + return false; /* updated before scan 
started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "current 6"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 7"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 8"); + return false; /* deleted before scan started */ + } + } + else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + //elog(DEBUG11, "heap xmin in snapshot"); + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, ""); + return false; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "xmin abort"); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "according to snapshot"); + return false; /* treat as still in progress */ + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + { + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ + DEBUG_MVCC_XMIN(true, "xmax locked"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "heap multi xmax deleted after scan"); + DEBUG_INCREASE_VISIBLE_TUPLE + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMINXMAX(true, xmax, "deleted after scan"); + return false; /* deleted before scan started */ + } + } + if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE + return true; + } + if (TransactionIdDidCommit(xmax)) + { + DEBUG_MVCC_XMINXMAX(false, xmax, "committed"); + return false; /* updating transaction committed */ + } + /* it must have aborted or crashed */ + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "xmax deleted before scan"); + return false; /* deleted before scan started */ + } + } + + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + 
if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "heap xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "heap xmax not committed"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* treat as still in progress */ + } + } + + /* xmax transaction committed */ + DEBUG_MVCC_XMINXMAX(true, HeapTupleHeaderGetRawXmax(tuple), "committed last"); + return false; +} - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax deleted before scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } +#ifdef __STORAGE_SCALABLE__ +bool +HeapTupleSatisfiesUnshard(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{// #lizard forgives + HeapTupleHeader tuple = htup->t_data; + bool need_retry; + +retry: + need_retry = false; + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if(IS_PGXC_DATANODE && tuple->t_shardid < 0) + return false; - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + if(IS_PGXC_DATANODE && tuple->t_shardid >= 0) + { + if(g_DatanodeShardgroupBitmap == NULL) { - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; + elog(ERROR, "shard map in share memory has not been initialized yet."); } - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + LWLockAcquire(ShardMapLock, LW_SHARED); + if(bms_is_member(tuple->t_shardid, g_DatanodeShardgroupBitmap)) { - /* it must have aborted or crashed */ - - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - /* xmax 
transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - } - else - { - /* xmax is committed, but maybe not according to our snapshot */ - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmax not committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* treat as still in progress */ + LWLockRelease(ShardMapLock); + return false; } - } - - /* xmax transaction committed */ - //elog(DEBUG11, "heap xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed last.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), HeapTupleHeaderGetRawXmax(tuple)); - } - return false; + LWLockRelease(ShardMapLock); + } + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + { + DEBUG_MVCC_XMIN(false, "xmin invalid"); + return false; + } + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + { + DEBUG_MVCC_XMIN(false, "move off"); + return false; + } + if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move off 1"); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + if(need_retry) + { + goto retry; + } + + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(false, "current 1"); + return false; /* inserted after scan started */ + } + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + { + DEBUG_MVCC_XMIN(true, "current 2"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return 
true; + } + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 4"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* updated after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 5"); + return false; /* updated before scan started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "current 6"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 7"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 8"); + return false; /* deleted before scan started */ + } + } + else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "xmin in snapshot"); + return false; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "xmin aborted"); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, " xmin not committed according to snapshot"); + return false; /* treat as still in progress */ + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + { + DEBUG_MVCC_XMIN(true, "invalid xmax"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ + DEBUG_MVCC_XMIN(true, "xmax locked"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "multi xmax deleted after scan"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMINXMAX(false, xmax, "deleted before scan"); + return false; /* deleted before scan started */ + } + } + if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "multi xmax in snapshot"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (TransactionIdDidCommit(xmax)) + { + DEBUG_MVCC_XMINXMAX(false, xmax, "committed"); + return false; /* updating transaction committed */ + } + /* it must have aborted or crashed */ + DEBUG_MVCC_XMIN(true, "xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= 
snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "xmax deleted after scan started"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "xmax deleted before scan started"); + return false; /* deleted before scan started */ + } + } + + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "xmax in mvcc snapshot"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "xmax not committed"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* treat as still in progress */ + } + } + + /* xmax transaction committed */ + DEBUG_MVCC_XMINXMAX(true, HeapTupleHeaderGetRawXmax(tuple), "xmax committed"); + return false; } #endif @@ -3685,243 +3402,229 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } - #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static bool -XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, Snapshot snapshot, Buffer buffer, bool *need_retry, - uint16 infomask) -{// #lizard forgives - int res = false; - GlobalTimestamp prepare_ts; - GlobalTimestamp global_committs = 0; - - *need_retry = false; - /* - * For Tbase, we propose a concurrency control mechanism - * based on global timestamp to maintain distributed transaction consistency. - * - * Rule: T2 can see T1's modification only if T2.start > T1.commit. - * For read-committed isolation, T2.start is the executing statement's start timestmap. - * - */ - - - if (snapshot->local || !TransactionIdIsNormal(xid)) - { - - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print && snapshot->local) - { - elog(DEBUG12, "local: snapshot ts " INT64_FORMAT " xid %d res %d.", snapshot->start_ts, xid, res); - } - return res; - } - - if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) - { - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. 
snapshot start ts " INT64_FORMAT - ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - Assert(GlobalTimestampIsValid(snapshot->start_ts)); - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - if(CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - elog(DEBUG12, "local snapshot ts " INT64_FORMAT " res %d xid %d after wait.", snapshot->start_ts, res, xid); - return res; - } - - - if(snapshot->start_ts > global_committs) - { - SnapshotCheck(xid, snapshot, false, global_committs); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 1.", - snapshot->start_ts, xid, global_committs); - } - return false; - } - else - { - SnapshotCheck(xid, snapshot, true, global_committs); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 2.", - snapshot->start_ts, xid, global_committs); - } - SetTimestamp(tuple, xid, buffer, infomask); - return true; - } - } - - prepare_ts = InvalidGlobalTimestamp; - /* - * If xid has passed the prepare phase, - * we should wait for it to complete. - */ - if(XidIsPrepared(xid, snapshot, &prepare_ts)) - { - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_prepare++; - } - - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - - if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && - (snapshot->start_ts < prepare_ts)) - { - SnapshotCheck(xid, snapshot, true, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d prep "INT64_FORMAT".", snapshot->start_ts, xid, prepare_ts); - } - elog(DEBUG12, "xid %d, start_ts " INT64_FORMAT ", prepare " INT64_FORMAT " after wait true.", xid, snapshot->start_ts, prepare_ts); - return true; - } - - if(GlobalTimestampIsValid(prepare_ts)) - { - BufferDesc *buf; - int lock_type = -1; - - buf = GetBufferDescriptor(buffer - 1); - - if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), - LW_EXCLUSIVE)) - { - lock_type = BUFFER_LOCK_EXCLUSIVE; - } - else if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), - LW_SHARED)) - { - lock_type = BUFFER_LOCK_SHARE; - } - - XactLockTableWait(xid, NULL, NULL, XLTW_None); - if(lock_type != -1) - { - /* Avoid deadlock */ - if(TransactionIdDidAbort(xid)) - { - if(enable_distri_visibility_print) - { - elog(LOG, "abort snapshot ts " INT64_FORMAT " false xid %d .", snapshot->start_ts, xid); - } - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_abort++; - } - - *need_retry = false; - return false; - } - else - { - *need_retry = true; - return true; - } - } - } - - - - if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) - { - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - if(CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - elog(DEBUG12, "local snapshot ts " INT64_FORMAT " res %d xid %d after wait.", snapshot->start_ts, res, xid); - return res; - } - elog(DEBUG12, "snapshot ts " INT64_FORMAT " global 
committs " INT64_FORMAT " xid %d after wait.", snapshot->start_ts, global_committs, xid); - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 3.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 4.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - return true; - - } - } - else - {/* Abort or crashed */ - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_abort++; - } - elog(DEBUG12, "abort: snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - - SnapshotCheck(xid, snapshot, false, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "abort snapshot ts " INT64_FORMAT " false xid %d .", snapshot->start_ts, xid); - } - return false; - } - } - - if(enable_distri_debug) - { - snapshot->scanned_tuples_before_prepare++; - } - /* - * For non-prepared transaction, its commit timestamp must be larger than - * the current running transaction/statement's start timestamp. - * This is because that as T1's commit timestamp has not yet been aquired on CN, - * T2.start < T1.commit is always being held. - */ - SnapshotCheck(xid, snapshot, true, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", snapshot->start_ts, xid); - } - return true; - +XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, + Snapshot snapshot, Buffer buffer, + bool *need_retry, uint16 infomask) +{ + int res = false; + GlobalTimestamp prepare_ts; + GlobalTimestamp global_committs = 0; + + *need_retry = false; + /* + * For Tbase, we propose a concurrency control mechanism based on global + * timestamp to maintain distributed transaction consistency. + * + * Rule: T2 can see T1's modification only if T2.start > T1.commit. + * For read-committed isolation, T2.start is the executing statement's + * start timestmap. + */ + if (snapshot->local || !TransactionIdIsNormal(xid)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local: snapshot ts " INT64_FORMAT "xid %d" + " res %d.", snapshot->start_ts, xid, res)); + return res; + } + + if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) + { + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp." 
+ "snapshot start ts " INT64_FORMAT ", autovacuum" + " %d in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + Assert(GlobalTimestampIsValid(snapshot->start_ts)); + + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + if(CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res" + " %d xid %d after wait.", snapshot->start_ts, res, xid)); + return res; + } + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" + " committs " INT64_FORMAT "1.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " + "committs " INT64_FORMAT "2.", + snapshot->start_ts, xid, global_committs)); + return true; + } + } + + prepare_ts = InvalidGlobalTimestamp; + /* + * If xid has passed the prepare phase, we should wait for it to complete. + */ + if(XidIsPrepared(xid, snapshot, &prepare_ts)) + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_prepare++; + } + + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum" + " %d in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && + (snapshot->start_ts < prepare_ts)) + { + SnapshotCheck(xid, snapshot, true, 0); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" + " prep " INT64_FORMAT, snapshot->start_ts, xid, prepare_ts)); + return true; + } + + if(GlobalTimestampIsValid(prepare_ts)) + { + BufferDesc *buf; + int lock_type = -1; + + buf = GetBufferDescriptor(buffer - 1); + + if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE)) + { + lock_type = BUFFER_LOCK_EXCLUSIVE; + } + else if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + lock_type = BUFFER_LOCK_SHARE; + } + + XactLockTableWait(xid, NULL, NULL, XLTW_None); + if(lock_type != -1) + { + /* Avoid deadlock */ + if(TransactionIdDidAbort(xid)) + { + DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT + "false xid %d .", snapshot->start_ts, xid)); + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_abort++; + } + + *need_retry = false; + return false; + } + else + { + *need_retry = true; + return true; + } + } + } + + if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + if(CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT + "res %d xid %d after wait.", + snapshot->start_ts, res,xid)); + return res; + } + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false " + "xid %d commit_ts " INT64_FORMAT " 3.", + snapshot->start_ts, 
xid, global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid" + " %d committs" INT64_FORMAT " 4.", snapshot->start_ts, + xid, global_committs)); + return true; + } + } + else + {/* Abort or crashed */ + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_abort++; + } + SnapshotCheck(xid, snapshot, false, 0); + + DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT " false" + " xid %d .", snapshot->start_ts, xid)); + return false; + } + } + + if(enable_distri_debug) + { + snapshot->scanned_tuples_before_prepare++; + } + + /* + * For non-prepared transaction, its commit timestamp must be larger than + * the current running transaction/statement's start timestamp. This is + * because that as T1's commit timestamp has not yet been aquired on CN, + * T2.start < T1.commit is always being held. + */ + SnapshotCheck(xid, snapshot, true, 0); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", + snapshot->start_ts, xid)); + return true; } #endif diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 94e82799..4ba6f96b 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -146,19 +146,20 @@ typedef struct SnapshotData #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - GlobalTimestamp start_ts; /* global timestamp at which the statement/transaction starts */ + /* + * global timestamp at which the statement/transaction starts + */ + GlobalTimestamp start_ts; - bool local; /* local snapshot */ + bool local; /* local snapshot */ - TransactionId *prepare_xip; - - GlobalTimestamp *prepare_xip_ts; + TransactionId *prepare_xip; + GlobalTimestamp *prepare_xip_ts; uint32 prepare_xcnt; - TransactionId *prepare_subxip; - - GlobalTimestamp *prepare_subxip_ts; + TransactionId *prepare_subxip; + GlobalTimestamp *prepare_subxip_ts; uint32 prepare_subxcnt; From 684e779135021ffbc89def45e1a16849ebbec3ef Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 14 Jul 2020 20:00:08 +0800 Subject: [PATCH 014/578] print debugging info before SnapshotCheck --- src/backend/utils/time/tqual.c | 54 ++++++++++++++++------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 505028eb..6b01aa3e 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -1160,11 +1160,11 @@ XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, else if (snapshot->local || CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(LOG, "xmin local snapshot ts " INT64_FORMAT " res %d xid %d committs " INT64_FORMAT, snapshot->start_ts, res, xid, global_committs)); + SnapshotCheck(xid, snapshot, res, 0); return res; } else @@ -1212,20 +1212,19 @@ XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d " "committs " INT64_FORMAT " 21.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else - { - SnapshotCheck(xid, snapshot, true, global_committs); - + { DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" " committs " INT64_FORMAT " 
22.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, true, global_committs); return true; } } @@ -1251,10 +1250,11 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, else if (snapshot->local || CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(LOG, "xmax local snapshot ts " INT64_FORMAT " res " "%d xid %d.", snapshot->start_ts, res, xid)); + + SnapshotCheck(xid, snapshot, res, 0); return res; } else @@ -1302,20 +1302,20 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" " committs" INT64_FORMAT "11.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " "committs " INT64_FORMAT "12.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, true, global_committs); return true; } } @@ -3424,10 +3424,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if (snapshot->local || !TransactionIdIsNormal(xid)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local: snapshot ts " INT64_FORMAT "xid %d" " res %d.", snapshot->start_ts, xid, res)); + SnapshotCheck(xid, snapshot, res, 0); return res; } @@ -3454,30 +3454,28 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res" " %d xid %d after wait.", snapshot->start_ts, res, xid)); + SnapshotCheck(xid, snapshot, res, 0); return res; } if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" " committs " INT64_FORMAT "1.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " "committs " INT64_FORMAT "2.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); return true; } } @@ -3508,10 +3506,9 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && (snapshot->start_ts < prepare_ts)) { - SnapshotCheck(xid, snapshot, true, 0); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" " prep " INT64_FORMAT, snapshot->start_ts, xid, prepare_ts)); + SnapshotCheck(xid, snapshot, true, 0); return true; } @@ -3567,31 +3564,30 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res %d xid %d after wait.", snapshot->start_ts, res,xid)); + SnapshotCheck(xid, snapshot, res, 0); return res; 
} if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false " "xid %d commit_ts " INT64_FORMAT " 3.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid" " %d committs" INT64_FORMAT " 4.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, true, global_committs); + + SetTimestamp(tuple, xid, buffer, infomask); return true; } } @@ -3601,10 +3597,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, { snapshot->scanned_tuples_after_abort++; } - SnapshotCheck(xid, snapshot, false, 0); DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT " false" " xid %d .", snapshot->start_ts, xid)); + SnapshotCheck(xid, snapshot, false, 0); return false; } } @@ -3620,10 +3616,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, * because that as T1's commit timestamp has not yet been aquired on CN, * T2.start < T1.commit is always being held. */ - SnapshotCheck(xid, snapshot, true, 0); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", snapshot->start_ts, xid)); + SnapshotCheck(xid, snapshot, true, 0); + return true; } From 97226a8dfc427a01fb7b52ef7ea6f3d3036710d9 Mon Sep 17 00:00:00 2001 From: aidenma Date: Tue, 28 Jul 2020 11:13:02 +0800 Subject: [PATCH 015/578] fix warning pg_basebackup --- src/bin/pg_basebackup/pg_basebackup.c | 601 ++++++++++++++------------ 1 file changed, 314 insertions(+), 287 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 2f5717e2..91af5b93 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1766,302 +1766,329 @@ BaseBackup(void) maxServerMajor; int serverVersion, serverMajor; - - Assert(conn != NULL); - - /* - * Check server version. BASE_BACKUP command was introduced in 9.1, so we - * can't work with servers older than 9.1. - */ - minServerMajor = 901; - maxServerMajor = PG_VERSION_NUM / 100; - serverVersion = PQserverVersion(conn); - serverMajor = serverVersion / 100; - if (serverMajor < minServerMajor || serverMajor > maxServerMajor) - { - const char *serverver = PQparameterStatus(conn, "server_version"); - - fprintf(stderr, _("%s: incompatible server version %s\n"), - progname, serverver ? serverver : "'unknown'"); - disconnect_and_exit(1); - } - - /* - * If WAL streaming was requested, also check that the server is new - * enough for that. - */ - if (includewal == STREAM_WAL && !CheckServerVersionForStreaming(conn)) - { - /* - * Error message already written in CheckServerVersionForStreaming(), - * but add a hint about using -X none. 
- */ - fprintf(stderr, _("HINT: use -X none or -X fetch to disable log streaming\n")); - disconnect_and_exit(1); - } - - /* - * Build contents of recovery.conf if requested - */ - if (writerecoveryconf) - GenerateRecoveryConf(conn); - - /* - * Run IDENTIFY_SYSTEM so we can get the timeline - */ - if (!RunIdentifySystem(conn, &sysidentifier, &latesttli, NULL, NULL)) - disconnect_and_exit(1); - - /* - * Start the actual backup - */ - PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i); - - if (maxrate > 0) - maxrate_clause = psprintf("MAX_RATE %u", maxrate); - - if (verbose) - fprintf(stderr, - _("%s: initiating base backup, waiting for checkpoint to complete\n"), - progname); - - if (showprogress && !verbose) - fprintf(stderr, "waiting for checkpoint\r"); - - basebkp = - psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s", - escaped_label, - showprogress ? "PROGRESS" : "", - includewal == FETCH_WAL ? "WAL" : "", - fastcheckpoint ? "FAST" : "", - includewal == NO_WAL ? "" : "NOWAIT", - maxrate_clause ? maxrate_clause : "", - format == 't' ? "TABLESPACE_MAP" : ""); - - if (PQsendQuery(conn, basebkp) == 0) - { - fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), - progname, "BASE_BACKUP", PQerrorMessage(conn)); - disconnect_and_exit(1); - } - - /* - * Get the starting WAL location - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, _("%s: could not initiate base backup: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) != 1) - { - fprintf(stderr, - _("%s: server returned unexpected response to BASE_BACKUP command; got %d rows and %d fields, expected %d rows and %d fields\n"), - progname, PQntuples(res), PQnfields(res), 1, 2); - disconnect_and_exit(1); - } - - /* start_point: get last checkpoint point position from master */ - strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); - - /* - * If we're streaming WAL, start the streaming session before we start - * receiving the actual data chunks. - */ - if (includewal == STREAM_WAL) - { - if (verbose) - fprintf(stderr, _("%s: starting background WAL receiver\n"), - progname); - StartLogStreamer(xlogstart, starttli, sysidentifier); - } - - /* - * Start receiving chunks - */ - for (i = 0; i < PQntuples(res); i++) - { - if (format == 't') - ReceiveTarFile(conn, res, i); - else - ReceiveAndUnpackTarFile(conn, res, i); - } /* Loop over all tablespaces */ - - if (showprogress) - { - progress_report(PQntuples(res), NULL, true); - fprintf(stderr, "\n"); /* Need to move to next line */ - } - - PQclear(res); - - /* - * Get the stop position - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, - _("%s: could not get write-ahead log end position from server: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) != 1) - { - fprintf(stderr, - _("%s: no write-ahead log end position returned from server\n"), - progname); - disconnect_and_exit(1); - } - strlcpy(xlogend, PQgetvalue(res, 0, 0), sizeof(xlogend)); - if (verbose && includewal != NO_WAL) - fprintf(stderr, _("%s: write-ahead log end point: %s\n"), progname, xlogend); - PQclear(res); - - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - { - fprintf(stderr, _("%s: final receive failed: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - - if (bgchild > 0) - { + Assert(conn != NULL); + /* + * Check server version. 
BASE_BACKUP command was introduced in 9.1, so we + * can't work with servers older than 9.1. + */ + minServerMajor = 901; + maxServerMajor = PG_VERSION_NUM / 100; + serverVersion = PQserverVersion(conn); + serverMajor = serverVersion / 100; + if (serverMajor < minServerMajor || serverMajor > maxServerMajor) + { + const char *serverver = PQparameterStatus(conn, "server_version"); + fprintf(stderr, _("%s: incompatible server version %s\n"), + progname, serverver ? serverver : "'unknown'"); + disconnect_and_exit(1); + } + /* + * If WAL streaming was requested, also check that the server is new + * enough for that. + */ + if (includewal == STREAM_WAL && !CheckServerVersionForStreaming(conn)) + { + /* + * Error message already written in CheckServerVersionForStreaming(), + * but add a hint about using -X none. + */ + fprintf(stderr, _("HINT: use -X none or -X fetch to disable log streaming\n")); + disconnect_and_exit(1); + } + /* + * Build contents of recovery.conf if requested + */ + if (writerecoveryconf) + GenerateRecoveryConf(conn); + /* + * Run IDENTIFY_SYSTEM so we can get the timeline + */ + if (!RunIdentifySystem(conn, &sysidentifier, &latesttli, NULL, NULL)) + disconnect_and_exit(1); + /* + * Start the actual backup + */ + PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i); + if (maxrate > 0) + maxrate_clause = psprintf("MAX_RATE %u", maxrate); + if (verbose) + fprintf(stderr, + _("%s: initiating base backup, waiting for checkpoint to complete\n"), + progname); + if (showprogress && !verbose) + fprintf(stderr, "waiting for checkpoint\r"); + basebkp = + psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s", + escaped_label, + showprogress ? "PROGRESS" : "", + includewal == FETCH_WAL ? "WAL" : "", + fastcheckpoint ? "FAST" : "", + includewal == NO_WAL ? "" : "NOWAIT", + maxrate_clause ? maxrate_clause : "", + format == 't' ? "TABLESPACE_MAP" : ""); + if (PQsendQuery(conn, basebkp) == 0) + { + fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), + progname, "BASE_BACKUP", PQerrorMessage(conn)); + disconnect_and_exit(1); + } + /* + * Get the starting WAL location + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, _("%s: could not initiate base backup: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) != 1) + { + fprintf(stderr, + _("%s: server returned unexpected response to BASE_BACKUP command; got %d rows and %d fields, expected %d rows and %d fields\n"), + progname, PQntuples(res), PQnfields(res), 1, 2); + disconnect_and_exit(1); + } + strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); + if (verbose) + fprintf(stderr, _("%s: checkpoint completed\n"), progname); + /* + * 9.3 and later sends the TLI of the starting point. With older servers, + * assume it's the same as the latest timeline reported by + * IDENTIFY_SYSTEM. 
+ */ + if (PQnfields(res) >= 2) + starttli = atoi(PQgetvalue(res, 0, 1)); + else + starttli = latesttli; + PQclear(res); + MemSet(xlogend, 0, sizeof(xlogend)); + if (verbose && includewal != NO_WAL) + fprintf(stderr, _("%s: write-ahead log start point: %s on timeline %u\n"), + progname, xlogstart, starttli); + /* + * Get the header + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, _("%s: could not get backup header: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) < 1) + { + fprintf(stderr, _("%s: no data returned from server\n"), progname); + disconnect_and_exit(1); + } + /* + * Sum up the total size, for progress reporting + */ + totalsize = totaldone = 0; + tablespacecount = PQntuples(res); + for (i = 0; i < PQntuples(res); i++) + { + totalsize += atol(PQgetvalue(res, i, 2)); + /* + * Verify tablespace directories are empty. Don't bother with the + * first once since it can be relocated, and it will be checked before + * we do anything anyway. + */ + if (format == 'p' && !PQgetisnull(res, i, 1)) + { + char *path = (char *) get_tablespace_mapping(PQgetvalue(res, i, 1)); + verify_dir_is_empty_or_create(path, &made_tablespace_dirs, &found_tablespace_dirs); + } + } + /* + * When writing to stdout, require a single tablespace + */ + if (format == 't' && strcmp(basedir, "-") == 0 && PQntuples(res) > 1) + { + fprintf(stderr, + _("%s: can only write single tablespace to stdout, database has %d\n"), + progname, PQntuples(res)); + disconnect_and_exit(1); + } + /* + * If we're streaming WAL, start the streaming session before we start + * receiving the actual data chunks. + */ + if (includewal == STREAM_WAL) + { + if (verbose) + fprintf(stderr, _("%s: starting background WAL receiver\n"), + progname); + StartLogStreamer(xlogstart, starttli, sysidentifier); + } + /* + * Start receiving chunks + */ + for (i = 0; i < PQntuples(res); i++) + { + if (format == 't') + ReceiveTarFile(conn, res, i); + else + ReceiveAndUnpackTarFile(conn, res, i); + } /* Loop over all tablespaces */ + if (showprogress) + { + progress_report(PQntuples(res), NULL, true); + fprintf(stderr, "\n"); /* Need to move to next line */ + } + PQclear(res); + /* + * Get the stop position + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, + _("%s: could not get write-ahead log end position from server: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) != 1) + { + fprintf(stderr, + _("%s: no write-ahead log end position returned from server\n"), + progname); + disconnect_and_exit(1); + } + strlcpy(xlogend, PQgetvalue(res, 0, 0), sizeof(xlogend)); + if (verbose && includewal != NO_WAL) + fprintf(stderr, _("%s: write-ahead log end point: %s\n"), progname, xlogend); + PQclear(res); + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, _("%s: final receive failed: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (bgchild > 0) + { #ifndef WIN32 int status; int r; #else - DWORD status; - - /* - * get a pointer sized version of bgchild to avoid warnings about - * casting to a different size on WIN64. - */ - intptr_t bgchild_handle = bgchild; - uint32 hi, - lo; + DWORD status; + /* + * get a pointer sized version of bgchild to avoid warnings about + * casting to a different size on WIN64. 
+ */ + intptr_t bgchild_handle = bgchild; + uint32 hi, + lo; #endif - - if (verbose) - fprintf(stderr, - _("%s: waiting for background process to finish streaming ...\n"), progname); - + if (verbose) + fprintf(stderr, + _("%s: waiting for background process to finish streaming ...\n"), progname); #ifndef WIN32 - if (write(bgpipe[1], xlogend, strlen(xlogend)) != strlen(xlogend)) - { - fprintf(stderr, - _("%s: could not send command to background pipe: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - - /* Just wait for the background process to exit */ - r = waitpid(bgchild, &status, 0); - if (r == -1) - { - fprintf(stderr, _("%s: could not wait for child process: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (r != bgchild) - { - fprintf(stderr, _("%s: child %d died, expected %d\n"), - progname, r, (int) bgchild); - disconnect_and_exit(1); - } - if (!WIFEXITED(status)) - { - fprintf(stderr, _("%s: child process did not exit normally\n"), - progname); - disconnect_and_exit(1); - } - if (WEXITSTATUS(status) != 0) - { - fprintf(stderr, _("%s: child process exited with error %d\n"), - progname, WEXITSTATUS(status)); - disconnect_and_exit(1); - } - /* Exited normally, we're happy! */ -#else /* WIN32 */ - - /* - * On Windows, since we are in the same process, we can just store the - * value directly in the variable, and then set the flag that says - * it's there. - */ - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) - { - fprintf(stderr, - _("%s: could not parse write-ahead log location \"%s\"\n"), - progname, xlogend); - disconnect_and_exit(1); - } - xlogendptr = ((uint64) hi) << 32 | lo; - InterlockedIncrement(&has_xlogendptr); - - /* First wait for the thread to exit */ - if (WaitForSingleObjectEx((HANDLE) bgchild_handle, INFINITE, FALSE) != - WAIT_OBJECT_0) - { - _dosmaperr(GetLastError()); - fprintf(stderr, _("%s: could not wait for child thread: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (GetExitCodeThread((HANDLE) bgchild_handle, &status) == 0) - { - _dosmaperr(GetLastError()); - fprintf(stderr, _("%s: could not get child thread exit status: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (status != 0) - { - fprintf(stderr, _("%s: child thread exited with error %u\n"), - progname, (unsigned int) status); - disconnect_and_exit(1); - } - /* Exited normally, we're happy */ + if (write(bgpipe[1], xlogend, strlen(xlogend)) != strlen(xlogend)) + { + fprintf(stderr, + _("%s: could not send command to background pipe: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + /* Just wait for the background process to exit */ + r = waitpid(bgchild, &status, 0); + if (r == -1) + { + fprintf(stderr, _("%s: could not wait for child process: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (r != bgchild) + { + fprintf(stderr, _("%s: child %d died, expected %d\n"), + progname, r, (int) bgchild); + disconnect_and_exit(1); + } + if (!WIFEXITED(status)) + { + fprintf(stderr, _("%s: child process did not exit normally\n"), + progname); + disconnect_and_exit(1); + } + if (WEXITSTATUS(status) != 0) + { + fprintf(stderr, _("%s: child process exited with error %d\n"), + progname, WEXITSTATUS(status)); + disconnect_and_exit(1); + } + /* Exited normally, we're happy! */ +#else /* WIN32 */ + /* + * On Windows, since we are in the same process, we can just store the + * value directly in the variable, and then set the flag that says + * it's there. 
+ */ + if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + { + fprintf(stderr, + _("%s: could not parse write-ahead log location \"%s\"\n"), + progname, xlogend); + disconnect_and_exit(1); + } + xlogendptr = ((uint64) hi) << 32 | lo; + InterlockedIncrement(&has_xlogendptr); + /* First wait for the thread to exit */ + if (WaitForSingleObjectEx((HANDLE) bgchild_handle, INFINITE, FALSE) != + WAIT_OBJECT_0) + { + _dosmaperr(GetLastError()); + fprintf(stderr, _("%s: could not wait for child thread: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (GetExitCodeThread((HANDLE) bgchild_handle, &status) == 0) + { + _dosmaperr(GetLastError()); + fprintf(stderr, _("%s: could not get child thread exit status: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (status != 0) + { + fprintf(stderr, _("%s: child thread exited with error %u\n"), + progname, (unsigned int) status); + disconnect_and_exit(1); + } + /* Exited normally, we're happy */ #endif - } - - /* Free the recovery.conf contents */ - destroyPQExpBuffer(recoveryconfcontents); - - /* - * End of copy data. Final result is already checked inside the loop. - */ - PQclear(res); - PQfinish(conn); - - /* - * Make data persistent on disk once backup is completed. For tar format - * once syncing the parent directory is fine, each tar file created per - * tablespace has been already synced. In plain format, all the data of - * the base directory is synced, taking into account all the tablespaces. - * Errors are not considered fatal. - */ - if (do_sync) - { - if (format == 't') - { - if (strcmp(basedir, "-") != 0) - (void) fsync_fname(basedir, true, progname); - } - else - { - (void) fsync_pgdata(basedir, progname, serverVersion); - } - } - - if (verbose) - fprintf(stderr, _("%s: base backup completed\n"), progname); + } + /* Free the recovery.conf contents */ + destroyPQExpBuffer(recoveryconfcontents); + /* + * End of copy data. Final result is already checked inside the loop. + */ + PQclear(res); + PQfinish(conn); + /* + * Make data persistent on disk once backup is completed. For tar format + * once syncing the parent directory is fine, each tar file created per + * tablespace has been already synced. In plain format, all the data of + * the base directory is synced, taking into account all the tablespaces. + * Errors are not considered fatal. + */ + if (do_sync) + { + if (format == 't') + { + if (strcmp(basedir, "-") != 0) + (void) fsync_fname(basedir, true, progname); + } + else + { + (void) fsync_pgdata(basedir, progname, serverVersion); + } + } + if (verbose) + fprintf(stderr, _("%s: base backup completed\n"), progname); } - int main(int argc, char **argv) {// #lizard forgives From 44bb70bfd302ececd9f07c907f1c233c973b06a8 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 28 Jul 2020 12:40:20 +0800 Subject: [PATCH 016/578] fix coordinator gets error under readonly plane with coldhot seperation. 
--- src/backend/pgxc/nodemgr/groupmgr.c | 78 +++++++++++++++-------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c index b1e4a339..be034bed 100644 --- a/src/backend/pgxc/nodemgr/groupmgr.c +++ b/src/backend/pgxc/nodemgr/groupmgr.c @@ -538,44 +538,46 @@ Oid RemoveNodeFromGroup(Oid nodeoid) Oid GetGroupOidByNode(Oid nodeoid) { - Relation relation; - SysScanDesc scan; - HeapTuple tup; - Form_pgxc_group group; - int i; - Oid groupoid = InvalidOid; - - relation = heap_open(PgxcGroupRelationId, AccessShareLock); - - scan = systable_beginscan(relation, InvalidOid, false, NULL, 0, NULL); - - tup = systable_getnext(scan); - - while(HeapTupleIsValid(tup)) - { - group = (Form_pgxc_group)GETSTRUCT(tup); - - for (i = 0; i < group->group_members.dim1; i++) - { - if (group->group_members.values[i] == nodeoid) - { - groupoid = HeapTupleGetOid(tup); - break; - } - } - - if (OidIsValid(groupoid)) - { - break; - } - - tup = systable_getnext(scan); - } - - systable_endscan(scan); - heap_close(relation, AccessShareLock); - - return groupoid; + Relation relation; + SysScanDesc scan; + HeapTuple tup; + Form_pgxc_group group; + int i; + Oid groupoid = InvalidOid; + + nodeoid = PGXCGetMainNodeOid(nodeoid); + + relation = heap_open(PgxcGroupRelationId, AccessShareLock); + + scan = systable_beginscan(relation, InvalidOid, false, NULL, 0, NULL); + + tup = systable_getnext(scan); + + while(HeapTupleIsValid(tup)) + { + group = (Form_pgxc_group)GETSTRUCT(tup); + + for (i = 0; i < group->group_members.dim1; i++) + { + if (group->group_members.values[i] == nodeoid) + { + groupoid = HeapTupleGetOid(tup); + break; + } + } + + if (OidIsValid(groupoid)) + { + break; + } + + tup = systable_getnext(scan); + } + + systable_endscan(scan); + heap_close(relation, AccessShareLock); + + return groupoid; } List * From 34ef3de8a9b94f5c883bbb1397365052ab10d311 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 12:42:39 +0800 Subject: [PATCH 017/578] Support complex UDPATE/DELETE when distribution key not matching Previously we just throw error if we failed the distribution check in group_planner. To support such cases, we need to improve the set_joinpath_distribution, make it aware of result relation location. Which means, if we know one side of the join path contains result relation, then we need to keep it not redistributed. 
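As a hypothetical illustration only (the table names and the DISTRIBUTE BY
syntax below are assumed for the sketch, not taken from this patch or its
regression tests), this is the class of statement the change is meant to
plan: the join clause does not touch the result relation's distribution
key, so the planner has to keep the result relation where it is and move
the other side, instead of redistributing both and failing the check:

    -- sketch: two tables sharded on different columns
    CREATE TABLE orders    (order_id int, customer_id int, status text)
        DISTRIBUTE BY SHARD (order_id);
    CREATE TABLE blacklist (customer_id int)
        DISTRIBUTE BY SHARD (customer_id);

    -- complex UPDATE: the join key (customer_id) is not orders'
    -- distribution key, so blacklist must be moved, not orders
    UPDATE orders o
       SET status = 'cancelled'
      FROM blacklist b
     WHERE o.customer_id = b.customer_id;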
TAPD: http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696859222691 http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696859323617 --- src/backend/nodes/outfuncs.c | 9 +- src/backend/optimizer/plan/planner.c | 59 +- src/backend/optimizer/util/Makefile | 3 +- src/backend/optimizer/util/distribution.c | 125 +++ src/backend/optimizer/util/pathnode.c | 856 +++++++++++------- src/backend/optimizer/util/relnode.c | 358 ++++---- src/include/nodes/relation.h | 23 +- src/include/optimizer/distribution.h | 28 + src/test/regress/expected/foreign_key_2.out | 23 +- src/test/regress/expected/join_3.out | 13 +- src/test/regress/expected/matview_1.out | 4 +- src/test/regress/expected/returning.out | 4 +- src/test/regress/expected/rowsecurity_1.out | 150 ++- src/test/regress/expected/rowtypes_1.out | 10 +- src/test/regress/expected/rules.out | 132 ++- src/test/regress/expected/subselect_1.out | 8 +- .../regress/expected/xl_limitations_1.out | 24 +- src/test/regress/output/misc.source | 4 - 18 files changed, 1151 insertions(+), 682 deletions(-) create mode 100644 src/backend/optimizer/util/distribution.c create mode 100644 src/include/optimizer/distribution.h diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index dee8b834..fb063aa2 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -3604,10 +3604,11 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); #ifdef __TBASE__ - WRITE_BOOL_FIELD(intervalparent); - WRITE_BOOL_FIELD(isdefault); - WRITE_BITMAPSET_FIELD(childs); - WRITE_INT_FIELD(estimate_partidx); + WRITE_BOOL_FIELD(intervalparent); + WRITE_BOOL_FIELD(isdefault); + WRITE_BITMAPSET_FIELD(childs); + WRITE_INT_FIELD(estimate_partidx); + WRITE_ENUM_FIELD(resultRelLoc, ResultRelLocation); #endif } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f41ef5a1..4495ef70 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,7 +63,9 @@ #include "utils/selfuncs.h" #include "utils/lsyscache.h" #include "utils/syscache.h" - +#ifdef __TBASE__ +#include "optimizer/distribution.h" +#endif /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -189,10 +191,8 @@ static PathTarget *make_window_input_target(PlannerInfo *root, static List *make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, List *tlist); static PathTarget *make_sort_input_target(PlannerInfo *root, - PathTarget *final_target, - bool *have_postponed_srfs); -static bool equal_distributions(PlannerInfo *root, Distribution *dst1, - Distribution *dst2); + PathTarget *final_target, + bool *have_postponed_srfs); static bool grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, List *clauses); static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, @@ -7856,55 +7856,6 @@ groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) return false; } -/* - * equal_distributions - * Check that two distributions are equal. - * - * Distributions are considered equal if they are of the same type, on the - * same set of nodes, and if the distribution expressions are known to be equal - * (either the same expressions or members of the same equivalence class). 
- */ -static bool -equal_distributions(PlannerInfo *root, Distribution *dst1, - Distribution *dst2) -{// #lizard forgives - /* fast path */ - if (dst1 == dst2) - return true; - - if (dst1 == NULL || dst2 == NULL) - return false; - - /* conditions easier to check go first */ - if (dst1->distributionType != dst2->distributionType) - return false; - - if (!bms_equal(dst1->nodes, dst2->nodes)) - return false; - - if (equal(dst1->distributionExpr, dst2->distributionExpr)) - return true; - - /* - * For more thorough expression check we need to ensure they both are - * defined - */ - if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL) - return false; - - /* - * More thorough check, but allows some important cases, like if - * distribution column is not updated (implicit set distcol=distcol) or - * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many - * applications. - */ - if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr)) - return true; - - /* The restrictNodes field does not matter for distribution equality */ - return false; -} - /* * adjust_path_distribution * Adjust distribution of the path to match what's expected by ModifyTable. diff --git a/src/backend/optimizer/util/Makefile b/src/backend/optimizer/util/Makefile index 2455d933..e625e51b 100644 --- a/src/backend/optimizer/util/Makefile +++ b/src/backend/optimizer/util/Makefile @@ -13,6 +13,7 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = clauses.o joininfo.o orclauses.o pathnode.o placeholder.o \ - plancat.o predtest.o relnode.o restrictinfo.o tlist.o var.o pgxcship.o + plancat.o predtest.o relnode.o restrictinfo.o tlist.o var.o pgxcship.o \ + distribution.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/util/distribution.c b/src/backend/optimizer/util/distribution.c new file mode 100644 index 00000000..3746b2c8 --- /dev/null +++ b/src/backend/optimizer/util/distribution.c @@ -0,0 +1,125 @@ +/*------------------------------------------------------------------------- + * + * distribution.c + * Routines related to adjust path distribution + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/backend/optimizer/util/distribution.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "nodes/bitmapset.h" +#include "nodes/nodes.h" +#include "optimizer/distribution.h" +#include "optimizer/paths.h" + +/* + * equal_distributions + * Check that two distributions are equal. + * + * Distributions are considered equal if they are of the same type, on the + * same set of nodes, and if the distribution expressions are known to be equal + * (either the same expressions or members of the same equivalence class). 
+ */ +bool +equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2) +{ + /* fast path */ + if (dst1 == dst2) + return true; + + if (dst1 == NULL || dst2 == NULL) + return false; + + /* conditions easier to check go first */ + if (dst1->distributionType != dst2->distributionType) + return false; + + if (!bms_equal(dst1->nodes, dst2->nodes)) + return false; + + if (equal(dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* + * For more thorough expression check we need to ensure they both are + * defined + */ + if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL) + return false; + + /* + * More thorough check, but allows some important cases, like if + * distribution column is not updated (implicit set distcol=distcol) or + * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many + * applications. + */ + if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* The restrictNodes field does not matter for distribution equality */ + return false; +} + +/* + * Get the location of DML result relation if it appears in either subpath + */ +ResultRelLocation +getResultRelLocation(int resultRel, Relids inner, Relids outer) +{ + ResultRelLocation location = RESULT_REL_NONE; + + if (bms_is_member(resultRel, inner)) + { + location = RESULT_REL_INNER; + } + else if (bms_is_member(resultRel, outer)) + { + location = RESULT_REL_OUTER; + } + + return location; +} + +/* + * Check if the path distribution satisfy the result relation distribution. + */ +bool +SatisfyResultRelDist(PlannerInfo *root, Path *path) +{ + PlannerInfo *top_root = root; + bool equal = false; + + /* Get top root */ + while(top_root->parent_root) + { + top_root = top_root->parent_root; + } + + /* + * Check the UPDATE/DELETE command, make sure the path distribution equals the + * result relation distribution. + * We only invalidate the check if the result relation appears in one of + * the left/right subpath. + */ + if ((top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) && + path->parent->resultRelLoc != RESULT_REL_NONE) + { + equal = equal_distributions(top_root, + top_root->distribution, + path->distribution); + + if (!equal) + return false; + } + + return true; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c1c50654..9bbf6040 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -37,13 +37,14 @@ #include "pgxc/nodemgr.h" #include "utils/rel.h" #ifdef __TBASE__ +#include "catalog/pgxc_key_values.h" +#include "executor/nodeAgg.h" +#include "optimizer/distribution.h" #include "optimizer/tlist.h" #include "optimizer/planner.h" +#include "optimizer/pgxcship.h" #include "pgxc/groupmgr.h" -#include "catalog/pgxc_key_values.h" #include "pgxc/pgxcnode.h" -#include "optimizer/pgxcship.h" -#include "executor/nodeAgg.h" #endif #ifdef _MIGRATE_ @@ -287,6 +288,32 @@ set_cheapest(RelOptInfo *parent_rel) Assert(IsA(parent_rel, RelOptInfo)); +#ifdef __TBASE__ + /* + * When set_joinpath_distribution() adjusted the strategy for complex + * UPDATE/DELETE, the original paths could be give up caused by no proper + * distribution found. Which lead to an early error pop up here, thus + * we need to provide more accurate error message here. (Before the + * complex delete enhancement, this will pop up in group_planner at + * final stage.) 
+ */ + if (parent_rel->pathlist == NIL && + parent_rel->resultRelLoc != RESULT_REL_NONE) + { +#ifdef _PG_REGRESS_ + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed UPDATE/DELETE"), + errdetail("correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL."))); +#else + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed UPDATE/DELETE"), + errdetail("correlated or complex UPDATE/DELETE is currently not supported in TBase."))); +#endif + } +#endif + if (parent_rel->pathlist == NIL) elog(ERROR, "could not devise a query plan for the given query"); @@ -470,6 +497,15 @@ add_path(RelOptInfo *parent_rel, Path *new_path) */ CHECK_FOR_INTERRUPTS(); +#ifdef __TBASE__ + /* + * In case we skipped the join paths caused by invalid result rel + * distribution. + */ + if (!new_path) + return; +#endif + /* Pretend parameterized paths have no pathkeys, per comment above */ new_path_pathkeys = new_path->param_info ? NIL : new_path->pathkeys; @@ -1572,18 +1608,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) List *innerpathkeys = pathnode->innerjoinpath->pathkeys; List *outerpathkeys = pathnode->outerjoinpath->pathkeys; #ifdef __TBASE__ - bool dml = false; - PlannerInfo *top_root = root; + bool dml = false; + bool keepResultRelLoc = false; + PlannerInfo *top_root = root; + ResultRelLocation resultRelLoc = RESULT_REL_NONE; - while(top_root->parent_root) - { - top_root = top_root->parent_root; - } + while(top_root->parent_root) + { + top_root = top_root->parent_root; + } - if (top_root->parse->commandType == CMD_UPDATE || - top_root->parse->commandType == CMD_DELETE) - dml = true; - + if (top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) + { + dml = true; + } + + /* + * Only top root will consider more restrict rules to make sure + * UPDATE/DELETE result relation does not redistributed. + */ + if (top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) + { + /* Set the result relation location */ + resultRelLoc = getResultRelLocation(top_root->parse->resultRelation, + pathnode->innerjoinpath->parent->relids, + pathnode->outerjoinpath->parent->relids); + + pathnode->path.parent->resultRelLoc = resultRelLoc; + + if (resultRelLoc != RESULT_REL_NONE) + { + keepResultRelLoc = true; + } + } #endif @@ -1604,9 +1663,17 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return NIL; #ifdef __TBASE__ /* - * dml may need to push down to datanodes, such as, - * 'delete from geocode_settings as gc using geocode_settings_default as gf where gf.name = gc.name and gf.setting = gc.setting;' - * prefer_olap means pulling query up to coordinator node, in case data re-distribute in TPC-C test case. + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. 
*/ if(!prefer_olap && false == dml) { @@ -1712,22 +1779,18 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) restrictClauses = list_concat(restrictClauses, pathnode->movedrestrictinfo); - /* - * This join is still allowed if inner and outer paths have - * equivalent distribution and joined along the distribution keys. - */ - if (innerd && outerd && - innerd->distributionType == outerd->distributionType && - innerd->distributionExpr && - outerd->distributionExpr && - bms_equal(innerd->nodes, outerd->nodes)) - { - ListCell *lc; - - /* - * Make sure distribution functions are the same, for now they depend - * on data type - */ + /* + * This join is still allowed if inner and outer paths have equivalent + * distribution and joined along the distribution keys. Make sure + * distribution functions are the same, for now they depend on data type. + */ + if (innerd && outerd && + innerd->distributionType == outerd->distributionType && + innerd->distributionExpr && + outerd->distributionExpr && + bms_equal(innerd->nodes, outerd->nodes)) + { + ListCell *lc; /* * Planner already did necessary work and if there is a join @@ -1739,9 +1802,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) */ foreach(lc, restrictClauses) { - RestrictInfo *ri = (RestrictInfo *) lfirst(lc); - ListCell *emc; - bool found_outer, found_inner; + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + ListCell *emc = NULL; + bool found_outer = false; + bool found_inner = false; /* * Restriction operator is not equality operator ? @@ -1762,9 +1826,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (!OidIsValid(ri->hashjoinoperator)) continue; - found_outer = false; - found_inner = false; - /* * If parts belong to the same equivalence member check * if both distribution keys are members of the class. @@ -1773,8 +1834,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) { foreach(emc, ri->left_ec->ec_members) { - EquivalenceMember *em = (EquivalenceMember *) lfirst(emc); - Expr *var = (Expr *)em->em_expr; + EquivalenceMember *em = (EquivalenceMember *) lfirst(emc); + Expr *var = (Expr *)em->em_expr; + if (IsA(var, RelabelType)) var = ((RelabelType *) var)->arg; if (!found_outer) @@ -1813,12 +1875,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (equal(var, emvar)) { targetd->distributionExpr = (Node *) var; +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + continue; + } +#endif return alternate; } } } /* Not found, take any */ targetd->distributionExpr = innerd->distributionExpr; + +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + pfree(targetd); + targetd = NULL; + continue; + } +#endif return alternate; } } @@ -1874,47 +1965,80 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) targetd->restrictNodes = bms_union(outerd->restrictNodes, innerd->restrictNodes); } #endif - /* - * In case of outer join distribution key should not refer - * distribution key of nullable part. 
- */ - if (pathnode->jointype == JOIN_FULL) - /* both parts are nullable */ - targetd->distributionExpr = NULL; - else if (pathnode->jointype == JOIN_RIGHT) - targetd->distributionExpr = innerd->distributionExpr; - else - targetd->distributionExpr = outerd->distributionExpr; + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = innerd->distributionExpr; + else + targetd->distributionExpr = outerd->distributionExpr; - return alternate; - } - } - } +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + pfree(targetd); + targetd = NULL; + continue; + } +#endif + return alternate; + } + } + } #ifndef _PG_REGRESS_ - if (bms_equal(innerd->restrictNodes, outerd->restrictNodes) && - bms_num_members(innerd->restrictNodes) == 1 && restrict_query && - pathnode->jointype != JOIN_FULL) - { - targetd = makeNode(Distribution); - targetd->distributionType = innerd->distributionType; - targetd->nodes = bms_copy(innerd->nodes); - targetd->restrictNodes = bms_copy(innerd->restrictNodes); - pathnode->path.distribution = targetd; - - /* - * In case of outer join distribution key should not refer - * distribution key of nullable part. - */ - if (pathnode->jointype == JOIN_FULL) - /* both parts are nullable */ - targetd->distributionExpr = NULL; - else if (pathnode->jointype == JOIN_RIGHT) - targetd->distributionExpr = innerd->distributionExpr; - else - targetd->distributionExpr = outerd->distributionExpr; + if (bms_equal(innerd->restrictNodes, outerd->restrictNodes) && + bms_num_members(innerd->restrictNodes) == 1 && restrict_query && + pathnode->jointype != JOIN_FULL) + { + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + pathnode->path.distribution = targetd; + + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = innerd->distributionExpr; + else + targetd->distributionExpr = outerd->distributionExpr; - return alternate; - } +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. 
+ */ + if (!keepResultRelLoc || equal_distributions(top_root, + top_root->distribution, + targetd)) + { + return alternate; + } + else + { + pfree(targetd); + targetd = NULL; + } +#else + return alternate; +#endif + } #endif } @@ -2002,7 +2126,11 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) char distType = LOCATOR_TYPE_NONE; ListCell *lc; #ifdef __TBASE__ - Oid group; + Oid group; + int nRemotePlans_outer = 0; + int nRemotePlans_inner = 0; + bool redistribute_outer = false; + bool redistribute_inner = false; #endif /* @@ -2070,108 +2198,121 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } } #endif - /* - * Evaluation cost will be needed to choose preferred - * distribution - */ - cost_qual_eval_node(&cost, (Node *) ri, root); + /* + * Evaluation cost will be needed to choose preferred + * distribution + */ + cost_qual_eval_node(&cost, (Node *) ri, root); - if (outerd->distributionExpr) - { - /* - * If left side is distribution key of outer subquery - * and right expression refers only inner subquery - */ - if (equal(outerd->distributionExpr, left_expr) && - bms_is_subset(ri->right_relids, inner_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = right; - new_outer_key = NULL; /* no need to change */ - distType = outerd->distributionType; - } - continue; - } - /* - * If right side is distribution key of outer subquery - * and left expression refers only inner subquery - */ - if (equal(outerd->distributionExpr, right_expr) && - bms_is_subset(ri->left_relids, inner_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = left; - new_outer_key = NULL; /* no need to change */ - distType = outerd->distributionType; - } - continue; - } - } - if (innerd->distributionExpr) - { - /* - * If left side is distribution key of inner subquery - * and right expression refers only outer subquery - */ - if (equal(innerd->distributionExpr, left_expr) && - bms_is_subset(ri->right_relids, outer_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = NULL; /* no need to change */ - new_outer_key = right; - distType = innerd->distributionType; - } - continue; - } - /* - * If right side is distribution key of inner subquery - * and left expression refers only outer subquery - */ - if (equal(innerd->distributionExpr, right_expr) && - bms_is_subset(ri->left_relids, outer_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = NULL; /* no need 
to change */ - new_outer_key = left; - distType = innerd->distributionType; - } - continue; - } - } - /* - * Current restriction recuire redistribution of both parts. - * If preferred restriction require redistribution of one, - * keep it. - */ - if (preferred && - (new_inner_key == NULL || new_outer_key == NULL)) - continue; - - /* - * Skip this condition if the data type of the expressions - * does not allow either HASH or MODULO distribution. - * HASH distribution is preferrable. - */ + if (outerd->distributionExpr) + { +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure outer rel does not need + * to distribute + */ + if (keepResultRelLoc && resultRelLoc == RESULT_REL_INNER) + continue; +#endif + /* + * If left side is distribution key of outer subquery + * and right expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, left_expr) && + bms_is_subset(ri->right_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = right; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of outer subquery + * and left expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, right_expr) && + bms_is_subset(ri->left_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = left; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + } + if (innerd->distributionExpr) + { +#ifdef __TBASE__ + /* For UPDATE/DELETE, make sure inner rel does not need to distribute */ + if (keepResultRelLoc && resultRelLoc == RESULT_REL_OUTER) + continue; +#endif + /* + * If left side is distribution key of inner subquery + * and right expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, left_expr) && + bms_is_subset(ri->right_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = right; + distType = innerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of inner subquery + * and left expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, right_expr) && + bms_is_subset(ri->left_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = left; + distType = 
innerd->distributionType; + } + continue; + } + } + /* + * Current restriction recuire redistribution of both parts. + * If preferred restriction require redistribution of one, + * keep it. + */ + if (preferred && + (new_inner_key == NULL || new_outer_key == NULL)) + continue; + + /* + * Skip this condition if the data type of the expressions + * does not allow either HASH or MODULO distribution. + * HASH distribution is preferrable. + */ #ifdef __TBASE__ if (groupOids) { @@ -2196,71 +2337,76 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else continue; #ifdef __TBASE__ - } + } + + /* + * Skip redistribute both side, which will redistribute the + * result relation + */ + if (keepResultRelLoc) + continue; #endif - /* - * If this restriction the first or easier to calculate - * then preferred, try to store it as new preferred - * restriction to redistribute along it. - */ - if (preferred == NULL || - (cost.per_tuple < preferred->eval_cost.per_tuple)) - { - /* - * Left expression depends only on outer subpath and - * right expression depends only on inner subpath, so - * we can redistribute both and make left expression the - * distribution key of outer subplan and right - * expression the distribution key of inner subplan - */ - if (bms_is_subset(ri->left_relids, outer_rels) && - bms_is_subset(ri->right_relids, inner_rels)) - { - preferred = ri; - new_outer_key = left; - new_inner_key = right; - } - /* - * Left expression depends only on inner subpath and - * right expression depends only on outer subpath, so - * we can redistribute both and make left expression the - * distribution key of inner subplan and right - * expression the distribution key of outer subplan - */ - if (bms_is_subset(ri->left_relids, inner_rels) && - bms_is_subset(ri->right_relids, outer_rels)) - { - preferred = ri; - new_inner_key = left; - new_outer_key = right; - } - } - } - } - } - /* If we have suitable restriction we can repartition accordingly */ - if (preferred) - { - Bitmapset *nodes = NULL; - Bitmapset *restrictNodes = NULL; + /* + * If this restriction the first or easier to calculate + * then preferred, try to store it as new preferred + * restriction to redistribute along it. 
+ */ + if (preferred == NULL || + (cost.per_tuple < preferred->eval_cost.per_tuple)) + { + /* + * Left expression depends only on outer subpath and + * right expression depends only on inner subpath, so + * we can redistribute both and make left expression the + * distribution key of outer subplan and right + * expression the distribution key of inner subplan + */ + if (bms_is_subset(ri->left_relids, outer_rels) && + bms_is_subset(ri->right_relids, inner_rels)) + { + preferred = ri; + new_outer_key = left; + new_inner_key = right; + } + /* + * Left expression depends only on inner subpath and + * right expression depends only on outer subpath, so + * we can redistribute both and make left expression the + * distribution key of inner subplan and right + * expression the distribution key of outer subplan + */ + if (bms_is_subset(ri->left_relids, inner_rels) && + bms_is_subset(ri->right_relids, outer_rels)) + { + preferred = ri; + new_inner_key = left; + new_outer_key = right; + } + } + } + } + } + #ifdef __TBASE__ - /* consider the outer/inner size when make the redistribute plan */ - bool replicate_inner = false; - bool replicate_outer = false; - RelOptInfo *outer_rel = pathnode->outerjoinpath->parent; - RelOptInfo *inner_rel = pathnode->innerjoinpath->parent; - double outer_size = outer_rel->rows * outer_rel->reltarget->width; - double inner_size = inner_rel->rows * inner_rel->reltarget->width; - int outer_nodes = bms_num_members(outerd->nodes); - int inner_nodes = bms_num_members(innerd->nodes); - - int nRemotePlans_outer = 0; - int nRemotePlans_inner = 0; - bool redistribute_outer = false; - bool redistribute_inner = false; - - contains_remotesubplan(pathnode->outerjoinpath, &nRemotePlans_outer, &redistribute_outer); - contains_remotesubplan(pathnode->innerjoinpath, &nRemotePlans_inner, &redistribute_inner); + contains_remotesubplan(pathnode->outerjoinpath, &nRemotePlans_outer, &redistribute_outer); + contains_remotesubplan(pathnode->innerjoinpath, &nRemotePlans_inner, &redistribute_inner); +#endif + + /* If we have suitable restriction we can repartition accordingly */ + if (preferred) + { + Bitmapset *nodes = NULL; + Bitmapset *restrictNodes = NULL; +#ifdef __TBASE__ + /* consider the outer/inner size when make the redistribute plan */ + bool replicate_inner = false; + bool replicate_outer = false; + RelOptInfo *outer_rel = pathnode->outerjoinpath->parent; + RelOptInfo *inner_rel = pathnode->innerjoinpath->parent; + double outer_size = outer_rel->rows * outer_rel->reltarget->width; + double inner_size = inner_rel->rows * inner_rel->reltarget->width; + int outer_nodes = bms_num_members(outerd->nodes); + int inner_nodes = bms_num_members(innerd->nodes); #endif /* If we redistribute both parts do join on all nodes ... 
*/ @@ -2271,28 +2417,33 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ - /* check if we can distribute by shard */ - if (OidIsValid(group)) - { - int node_index; - int32 dn_num; - int32 *datanodes; - //List *nodelist = GetGroupNodeList(group); - - GetShardNodes(group, &datanodes, &dn_num, NULL); + /* check if we can distribute by shard */ + if (OidIsValid(group)) + { + int node_index; + int32 dn_num; + int32 *datanodes; + + GetShardNodes(group, &datanodes, &dn_num, NULL); + + bms_free(nodes); + nodes = NULL; + + for(node_index = 0; node_index < dn_num; node_index++) + { + nodes = bms_add_member(nodes, datanodes[node_index]); + } + } - bms_free(nodes); - nodes = NULL; - - for(node_index = 0; node_index < dn_num; node_index++) - { - nodes = bms_add_member(nodes, datanodes[node_index]); - } - } + /* + * We should not get both new_inner_key & new_outer_key for + * UPDATE/DELETE + */ + Assert(!keepResultRelLoc); - /* - * if any side is smaller enough, replicate the smaller one - * instead of redistribute both of them. + /* + * if any side is smaller enough, replicate the smaller one + * instead of redistribute both of them. */ if(inner_size * outer_nodes < inner_size + outer_size && (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && @@ -2325,29 +2476,31 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else if (new_inner_key) { #ifdef __TBASE__ - /* - * if inner is smaller than outer, redistribute inner - * if inner is bigger than outer (inner > inner->nodes * outer), - * replicate outer; else redistribute inner - */ - if(inner_size > outer_size * inner_nodes && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && - innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && - get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && + /* + * If inner is smaller than outer, redistribute inner as the + * preferred key we picked. + * If inner is bigger than outer (inner > inner->nodes * outer), + * replicate outer as an optimization to save network costs. 
+ */ + if(inner_size > outer_size * inner_nodes && + (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && + innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && + get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) - { - replicate_outer = true; + { + replicate_outer = true; - /* replicate outer to all inner nodes */ - nodes = bms_copy(innerd->nodes); - restrictNodes = bms_copy(innerd->restrictNodes); - } - else - { + /* replicate outer to all inner nodes */ + nodes = bms_copy(innerd->nodes); + restrictNodes = bms_copy(innerd->restrictNodes); + } + else + { + Assert(!keepResultRelLoc || resultRelLoc != RESULT_REL_INNER); #endif - nodes = bms_copy(outerd->nodes); - restrictNodes = bms_copy(outerd->restrictNodes); + nodes = bms_copy(outerd->nodes); + restrictNodes = bms_copy(outerd->restrictNodes); #ifdef __TBASE__ } #endif @@ -2355,28 +2508,30 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else /*if (new_outer_key)*/ { #ifdef __TBASE__ - /* - * if outer is smaller than inner, redistribute outer - * if outer is bigger than inner (outer > outer->nodes * inner), - * replicate inner; else redistribute outer - */ - if(outer_size > inner_size * outer_nodes && - (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && - outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && - get_num_connections(outer_nodes, nRemotePlans_inner + 1) < MaxConnections * REPLICATION_FACTOR && + /* + * If outer is smaller than inner, redistribute outer as the + * preferred key we picked. + * If outer is bigger than inner (outer > outer->nodes * inner), + * replicate inner as an optimization to save network costs. + */ + if (outer_size > inner_size * outer_nodes && + (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && + outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && + get_num_connections(outer_nodes, nRemotePlans_inner + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_inner < replication_level && !pathnode->inner_unique) - { - replicate_inner = true; + { + replicate_inner = true; - /* replicate inner to all outer nodes */ - nodes = bms_copy(outerd->nodes); - restrictNodes = bms_copy(outerd->restrictNodes); - } - else - { + /* replicate inner to all outer nodes */ + nodes = bms_copy(outerd->nodes); + restrictNodes = bms_copy(outerd->restrictNodes); + } + else + { + Assert(!keepResultRelLoc || resultRelLoc != RESULT_REL_OUTER); #endif - nodes = bms_copy(innerd->nodes); - restrictNodes = bms_copy(innerd->restrictNodes); + nodes = bms_copy(innerd->nodes); + restrictNodes = bms_copy(innerd->restrictNodes); #ifdef __TBASE__ } #endif @@ -2510,9 +2665,82 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) targetd->distributionExpr = pathnode->outerjoinpath->distribution->distributionExpr; - return alternate; - } - } + return alternate; + } + +#ifdef __TBASE__ + if (keepResultRelLoc) + { + /* + * We didn't got the preferred redistribution plan for UPDATE/DELETE. + * Thus, to keeping result relation not redistributed, we replicate + * the other subpath. 
+ */ + if (resultRelLoc == RESULT_REL_INNER && + pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && + nRemotePlans_outer < replication_level && !pathnode->inner_unique) + { + /* Replicate outer */ + pathnode->outerjoinpath = redistribute_path( + root, + pathnode->outerjoinpath, + outerpathkeys, + LOCATOR_TYPE_NONE, + NULL, + innerd->nodes, + NULL); + pathnode->path.distribution = innerd; + + if (IsA(pathnode, MergePath)) + ((MergePath*)pathnode)->outersortkeys = NIL; + } + else if (resultRelLoc == RESULT_REL_OUTER && + pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL && + nRemotePlans_outer < replication_level && !pathnode->inner_unique) + { + /* Replicate inner */ + pathnode->innerjoinpath = redistribute_path( + root, + pathnode->innerjoinpath, + innerpathkeys, + LOCATOR_TYPE_NONE, + NULL, + outerd->nodes, + NULL); + pathnode->path.distribution = outerd; + + if (IsA(pathnode, MergePath)) + ((MergePath*)pathnode)->innersortkeys = NIL; + } + + return alternate; + } + } + + /* + * For DELETE/UPDATE, If the other side already been replicated, we directly + * inherit the resultRelLoc side distribution. + */ + if (keepResultRelLoc) + { + if (innerd &&resultRelLoc == RESULT_REL_INNER && + pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && + !pathnode->inner_unique) + { + pathnode->path.distribution = innerd; + return alternate; + } + else if (outerd && resultRelLoc == RESULT_REL_OUTER && + pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL && + !pathnode->inner_unique) + { + pathnode->path.distribution = outerd; + return alternate; + } +#endif + } /* * Build cartesian product, if no hasheable restrictions is found. 
@@ -5149,7 +5377,11 @@ create_nestloop_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->path)) + return NULL; + + return pathnode; } /* @@ -5266,7 +5498,11 @@ create_mergejoin_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->jpath.path)) + return NULL; + + return pathnode; } /* @@ -5394,7 +5630,11 @@ create_hashjoin_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->jpath.path)) + return NULL; + + return pathnode; } /* diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 5ca3723f..153d3d36 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -32,9 +32,10 @@ #endif #ifdef __TBASE__ #include "access/heapam.h" -#include "utils/rel.h" -#include "utils/lsyscache.h" #include "access/sysattr.h" +#include "optimizer/distribution.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" #endif typedef struct JoinHashEntry { @@ -155,10 +156,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->joininfo = NIL; rel->has_eclass_joins = false; #ifdef __TBASE__ - rel->intervalparent = false; - rel->isdefault = rte->isdefault; - rel->estimate_partidx = -1; - rel->childs = NULL; + rel->intervalparent = false; + rel->isdefault = rte->isdefault; + rel->estimate_partidx = -1; + rel->childs = NULL; + rel->resultRelLoc = RESULT_REL_NONE; #endif /* @@ -497,177 +499,183 @@ add_join_rel(PlannerInfo *root, RelOptInfo *joinrel) */ RelOptInfo * build_join_rel(PlannerInfo *root, - Relids joinrelids, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - SpecialJoinInfo *sjinfo, - List **restrictlist_ptr) -{// #lizard forgives - RelOptInfo *joinrel; - List *restrictlist; - - /* - * See if we already have a joinrel for this set of base rels. - */ - joinrel = find_join_rel(root, joinrelids); - - if (joinrel) - { - /* - * Yes, so we only need to figure the restrictlist for this particular - * pair of component relations. - */ - if (restrictlist_ptr) - *restrictlist_ptr = build_joinrel_restrictlist(root, - joinrel, - outer_rel, - inner_rel); - return joinrel; - } - - /* - * Nope, so make one. 
- */ - joinrel = makeNode(RelOptInfo); - joinrel->reloptkind = RELOPT_JOINREL; - joinrel->relids = bms_copy(joinrelids); - joinrel->rows = 0; - /* cheap startup cost is interesting iff not all tuples to be retrieved */ - joinrel->consider_startup = (root->tuple_fraction > 0); - joinrel->consider_param_startup = false; - joinrel->consider_parallel = false; - joinrel->reltarget = create_empty_pathtarget(); - joinrel->pathlist = NIL; - joinrel->ppilist = NIL; - joinrel->partial_pathlist = NIL; - joinrel->cheapest_startup_path = NULL; - joinrel->cheapest_total_path = NULL; - joinrel->cheapest_unique_path = NULL; - joinrel->cheapest_parameterized_paths = NIL; - /* init direct_lateral_relids from children; we'll finish it up below */ - joinrel->direct_lateral_relids = - bms_union(outer_rel->direct_lateral_relids, - inner_rel->direct_lateral_relids); - joinrel->lateral_relids = min_join_parameterization(root, joinrel->relids, - outer_rel, inner_rel); - joinrel->relid = 0; /* indicates not a baserel */ - joinrel->rtekind = RTE_JOIN; - joinrel->min_attr = 0; - joinrel->max_attr = 0; - joinrel->attr_needed = NULL; - joinrel->attr_widths = NULL; - joinrel->lateral_vars = NIL; - joinrel->lateral_referencers = NULL; - joinrel->indexlist = NIL; - joinrel->statlist = NIL; - joinrel->pages = 0; - joinrel->tuples = 0; - joinrel->allvisfrac = 0; - joinrel->subroot = NULL; - joinrel->subplan_params = NIL; - joinrel->rel_parallel_workers = -1; - joinrel->serverid = InvalidOid; - joinrel->userid = InvalidOid; - joinrel->useridiscurrent = false; - joinrel->fdwroutine = NULL; - joinrel->fdw_private = NULL; - joinrel->unique_for_rels = NIL; - joinrel->non_unique_for_rels = NIL; - joinrel->baserestrictinfo = NIL; - joinrel->baserestrictcost.startup = 0; - joinrel->baserestrictcost.per_tuple = 0; - joinrel->baserestrict_min_security = UINT_MAX; - joinrel->joininfo = NIL; - joinrel->has_eclass_joins = false; - joinrel->top_parent_relids = NULL; - - /* Compute information relevant to the foreign relations. */ - set_foreign_rel_properties(joinrel, outer_rel, inner_rel); - - /* - * Create a new tlist containing just the vars that need to be output from - * this join (ie, are needed for higher joinclauses or final output). - * - * NOTE: the tlist order for a join rel will depend on which pair of outer - * and inner rels we first try to build it from. But the contents should - * be the same regardless. - */ - build_joinrel_tlist(root, joinrel, outer_rel); - build_joinrel_tlist(root, joinrel, inner_rel); - add_placeholders_to_joinrel(root, joinrel, outer_rel, inner_rel); - - /* - * add_placeholders_to_joinrel also took care of adding the ph_lateral - * sets of any PlaceHolderVars computed here to direct_lateral_relids, so - * now we can finish computing that. This is much like the computation of - * the transitively-closed lateral_relids in min_join_parameterization, - * except that here we *do* have to consider the added PHVs. - */ - joinrel->direct_lateral_relids = - bms_del_members(joinrel->direct_lateral_relids, joinrel->relids); - if (bms_is_empty(joinrel->direct_lateral_relids)) - joinrel->direct_lateral_relids = NULL; - - /* - * Construct restrict and join clause lists for the new joinrel. (The - * caller might or might not need the restrictlist, but I need it anyway - * for set_joinrel_size_estimates().) 
- */ - restrictlist = build_joinrel_restrictlist(root, joinrel, - outer_rel, inner_rel); - if (restrictlist_ptr) - *restrictlist_ptr = restrictlist; - build_joinrel_joinlist(joinrel, outer_rel, inner_rel); - - /* - * This is also the right place to check whether the joinrel has any - * pending EquivalenceClass joins. - */ - joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); - - /* - * Set estimates of the joinrel's size. - */ - set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, - sjinfo, restrictlist); - - /* - * Set the consider_parallel flag if this joinrel could potentially be - * scanned within a parallel worker. If this flag is false for either - * inner_rel or outer_rel, then it must be false for the joinrel also. - * Even if both are true, there might be parallel-restricted expressions - * in the targetlist or quals. - * - * Note that if there are more than two rels in this relation, they could - * be divided between inner_rel and outer_rel in any arbitrary way. We - * assume this doesn't matter, because we should hit all the same baserels - * and joinclauses while building up to this joinrel no matter which we - * take; therefore, we should make the same decision here however we get - * here. - */ - if (inner_rel->consider_parallel && outer_rel->consider_parallel && - is_parallel_safe(root, (Node *) restrictlist) && - is_parallel_safe(root, (Node *) joinrel->reltarget->exprs)) - joinrel->consider_parallel = true; - - /* Add the joinrel to the PlannerInfo. */ - add_join_rel(root, joinrel); + Relids joinrelids, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + SpecialJoinInfo *sjinfo, + List **restrictlist_ptr) +{ + RelOptInfo *joinrel; + List *restrictlist; +#ifdef __TBASE__ + PlannerInfo *top_root = root; +#endif - /* - * Also, if dynamic-programming join search is active, add the new joinrel - * to the appropriate sublist. Note: you might think the Assert on number - * of members should be for equality, but some of the level 1 rels might - * have been joinrels already, so we can only assert <=. - */ - if (root->join_rel_level) - { - Assert(root->join_cur_level > 0); - Assert(root->join_cur_level <= bms_num_members(joinrel->relids)); - root->join_rel_level[root->join_cur_level] = - lappend(root->join_rel_level[root->join_cur_level], joinrel); - } + /* + * See if we already have a joinrel for this set of base rels. + */ + joinrel = find_join_rel(root, joinrelids); + + if (joinrel) + { + /* + * Yes, so we only need to figure the restrictlist for this particular + * pair of component relations. + */ + if (restrictlist_ptr) + *restrictlist_ptr = build_joinrel_restrictlist(root, + joinrel, + outer_rel, + inner_rel); + return joinrel; + } + + /* + * Nope, so make one. 
+ */ + joinrel = makeNode(RelOptInfo); + joinrel->reloptkind = RELOPT_JOINREL; + joinrel->relids = bms_copy(joinrelids); + joinrel->rows = 0; + /* cheap startup cost is interesting iff not all tuples to be retrieved */ + joinrel->consider_startup = (root->tuple_fraction > 0); + joinrel->consider_param_startup = false; + joinrel->consider_parallel = false; + joinrel->reltarget = create_empty_pathtarget(); + joinrel->pathlist = NIL; + joinrel->ppilist = NIL; + joinrel->partial_pathlist = NIL; + joinrel->cheapest_startup_path = NULL; + joinrel->cheapest_total_path = NULL; + joinrel->cheapest_unique_path = NULL; + joinrel->cheapest_parameterized_paths = NIL; + /* init direct_lateral_relids from children; we'll finish it up below */ + joinrel->direct_lateral_relids = + bms_union(outer_rel->direct_lateral_relids, + inner_rel->direct_lateral_relids); + joinrel->lateral_relids = min_join_parameterization(root, joinrel->relids, + outer_rel, inner_rel); + joinrel->relid = 0; /* indicates not a baserel */ + joinrel->rtekind = RTE_JOIN; + joinrel->min_attr = 0; + joinrel->max_attr = 0; + joinrel->attr_needed = NULL; + joinrel->attr_widths = NULL; + joinrel->lateral_vars = NIL; + joinrel->lateral_referencers = NULL; + joinrel->indexlist = NIL; + joinrel->statlist = NIL; + joinrel->pages = 0; + joinrel->tuples = 0; + joinrel->allvisfrac = 0; + joinrel->subroot = NULL; + joinrel->subplan_params = NIL; + joinrel->rel_parallel_workers = -1; + joinrel->serverid = InvalidOid; + joinrel->userid = InvalidOid; + joinrel->useridiscurrent = false; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; + joinrel->unique_for_rels = NIL; + joinrel->non_unique_for_rels = NIL; + joinrel->baserestrictinfo = NIL; + joinrel->baserestrictcost.startup = 0; + joinrel->baserestrictcost.per_tuple = 0; + joinrel->baserestrict_min_security = UINT_MAX; + joinrel->joininfo = NIL; + joinrel->has_eclass_joins = false; + joinrel->top_parent_relids = NULL; +#ifdef __TBASE__ + joinrel->resultRelLoc = RESULT_REL_NONE; +#endif - return joinrel; + /* Compute information relevant to the foreign relations. */ + set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + + /* + * Create a new tlist containing just the vars that need to be output from + * this join (ie, are needed for higher joinclauses or final output). + * + * NOTE: the tlist order for a join rel will depend on which pair of outer + * and inner rels we first try to build it from. But the contents should + * be the same regardless. + */ + build_joinrel_tlist(root, joinrel, outer_rel); + build_joinrel_tlist(root, joinrel, inner_rel); + add_placeholders_to_joinrel(root, joinrel, outer_rel, inner_rel); + + /* + * add_placeholders_to_joinrel also took care of adding the ph_lateral + * sets of any PlaceHolderVars computed here to direct_lateral_relids, so + * now we can finish computing that. This is much like the computation of + * the transitively-closed lateral_relids in min_join_parameterization, + * except that here we *do* have to consider the added PHVs. + */ + joinrel->direct_lateral_relids = + bms_del_members(joinrel->direct_lateral_relids, joinrel->relids); + if (bms_is_empty(joinrel->direct_lateral_relids)) + joinrel->direct_lateral_relids = NULL; + + /* + * Construct restrict and join clause lists for the new joinrel. (The + * caller might or might not need the restrictlist, but I need it anyway + * for set_joinrel_size_estimates().) 
+ */ + restrictlist = build_joinrel_restrictlist(root, joinrel, + outer_rel, inner_rel); + if (restrictlist_ptr) + *restrictlist_ptr = restrictlist; + build_joinrel_joinlist(joinrel, outer_rel, inner_rel); + + /* + * This is also the right place to check whether the joinrel has any + * pending EquivalenceClass joins. + */ + joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); + + /* + * Set estimates of the joinrel's size. + */ + set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, + sjinfo, restrictlist); + + /* + * Set the consider_parallel flag if this joinrel could potentially be + * scanned within a parallel worker. If this flag is false for either + * inner_rel or outer_rel, then it must be false for the joinrel also. + * Even if both are true, there might be parallel-restricted expressions + * in the targetlist or quals. + * + * Note that if there are more than two rels in this relation, they could + * be divided between inner_rel and outer_rel in any arbitrary way. We + * assume this doesn't matter, because we should hit all the same baserels + * and joinclauses while building up to this joinrel no matter which we + * take; therefore, we should make the same decision here however we get + * here. + */ + if (inner_rel->consider_parallel && outer_rel->consider_parallel && + is_parallel_safe(root, (Node *) restrictlist) && + is_parallel_safe(root, (Node *) joinrel->reltarget->exprs)) + joinrel->consider_parallel = true; + + /* Add the joinrel to the PlannerInfo. */ + add_join_rel(root, joinrel); + + /* + * Also, if dynamic-programming join search is active, add the new joinrel + * to the appropriate sublist. Note: you might think the Assert on number + * of members should be for equality, but some of the level 1 rels might + * have been joinrels already, so we can only assert <=. 
+ */ + if (root->join_rel_level) + { + Assert(root->join_cur_level > 0); + Assert(root->join_cur_level <= bms_num_members(joinrel->relids)); + root->join_rel_level[root->join_cur_level] = + lappend(root->join_rel_level[root->join_cur_level], joinrel); + } + + return joinrel; } /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index d2d0ec0a..99a6325c 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -100,6 +100,16 @@ typedef struct Distribution } Distribution; #endif +#ifdef __TBASE__ +/* + * The location of DML result relation in JOINREL + */ +typedef enum ResultRelLocation { + RESULT_REL_NONE, /* Not found */ + RESULT_REL_INNER, /* Appears in inner subpath */ + RESULT_REL_OUTER /* Appears in outer subpath */ +} ResultRelLocation; +#endif /* * Relids @@ -697,11 +707,14 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ #ifdef __TBASE__ - /* used for interval partition */ - bool intervalparent; /* is interval partition */ - bool isdefault; /* is default partition table */ - Bitmapset *childs; /* child tables to query */ - int estimate_partidx; /* */ + /* used for interval partition */ + bool intervalparent; /* is interval partition */ + bool isdefault; /* is default partition table */ + Bitmapset *childs; /* child tables to query */ + int estimate_partidx; /* */ + + /* used for complex delete */ + ResultRelLocation resultRelLoc; #endif } RelOptInfo; diff --git a/src/include/optimizer/distribution.h b/src/include/optimizer/distribution.h new file mode 100644 index 00000000..fe4d2aeb --- /dev/null +++ b/src/include/optimizer/distribution.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * distribution.h + * Routines related to adjust distribution + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/optimizer/distribution.h + * + *------------------------------------------------------------------------- + */ +#ifndef DISTRIBUTION_H +#define DISTRIBUTION_H + +#include "postgres.h" + +#include "nodes/relation.h" + +/* TODO(TBase): Move all plan/path distribution routines to this file */ + +extern bool equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2); +extern ResultRelLocation getResultRelLocation(int resultRel, Relids inner, + Relids outer); +extern bool SatisfyResultRelDist(PlannerInfo *root, Path *path); +#endif /* DISTRIBUTION_H */ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 1dfe6663..8b8ac8ac 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,11 +1373,26 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
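The new optimizer/distribution.h interface above is what the join-path changes rely on: getResultRelLocation() reports where the UPDATE/DELETE target relation sits relative to a join, and SatisfyResultRelDist() lets the create_*join_path functions reject paths that would move its rows. The sketch below is only a plausible reading of the declaration and of the ResultRelLocation enum added to relation.h; the real implementation is not shown in these hunks, and the function is renamed here to make that explicit.

    /* assumes nodes/bitmapset.h and nodes/relation.h, as in the real header */
    static ResultRelLocation
    result_rel_location_sketch(int resultRel, Relids inner, Relids outer)
    {
        /* resultRel is the range-table index of the UPDATE/DELETE target */
        if (resultRel > 0 && bms_is_member(resultRel, inner))
            return RESULT_REL_INNER;    /* target under the inner subpath */
        if (resultRel > 0 && bms_is_member(resultRel, outer))
            return RESULT_REL_OUTER;    /* target under the outer subpath */
        return RESULT_REL_NONE;         /* target not part of this join */
    }

With that location in hand, set_joinpath_distribution() can keep the result relation's side un-redistributed (the keepResultRelLoc branches earlier in this patch), and, as the updated expected output shows, correlated DELETEs such as the one routed through rule r1 now get a plan instead of the old error.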
+ QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1) + -> Delete on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: (b = 1) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) + + Remote Fast Query Execution + Node/s: datanode_1 + -> Delete on t1 + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) +(15 rows) + delete from t1 where a = 1; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. drop rule r1 on t1; explain (costs off, nodes off) delete from t1 where a = 1; QUERY PLAN diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 9d08f4b2..f151b912 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2341,15 +2341,12 @@ SELECT * FROM t3 ORDER By x, y; (3 rows) DELETE FROM t3 USING t1 JOIN t2 USING (a) WHERE t3.x > t1.a; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. SELECT * FROM t3 ORDER By x, y; - x | y ------+----- - 6 | 7 - 7 | 8 - 500 | 100 -(3 rows) + x | y +---+--- + 6 | 7 + 7 | 8 +(2 rows) DELETE FROM t3 USING t3 t3_other WHERE t3.x = t3_other.x AND t3.y = t3_other.y; SELECT * FROM t3 ORDER By x, y; diff --git a/src/test/regress/expected/matview_1.out b/src/test/regress/expected/matview_1.out index 4bd7e9dc..ec7d3220 100644 --- a/src/test/regress/expected/matview_1.out +++ b/src/test/regress/expected/matview_1.out @@ -542,8 +542,8 @@ drop materialized view mvtest_error; CREATE TABLE mvtest_v AS SELECT generate_series(1,10) AS a; CREATE MATERIALIZED VIEW mvtest_mv_v AS SELECT a FROM mvtest_v WHERE a <= 5; DELETE FROM mvtest_v WHERE EXISTS ( SELECT * FROM mvtest_mv_v WHERE mvtest_mv_v.a = mvtest_v.a ); -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. +ERROR: materialized view "mvtest_mv_v" has not been populated +HINT: Use the REFRESH MATERIALIZED VIEW command. SELECT * FROM mvtest_v order by 1; a ---- diff --git a/src/test/regress/expected/returning.out b/src/test/regress/expected/returning.out index 0e32cc37..5f667e4f 100644 --- a/src/test/regress/expected/returning.out +++ b/src/test/regress/expected/returning.out @@ -313,8 +313,8 @@ CREATE RULE joinview_u AS ON UPDATE TO joinview DO INSTEAD FROM joinme WHERE f2 = f2j AND f2 = old.f2 RETURNING foo.*, other; UPDATE joinview SET f1 = f1 + 1 WHERE f3 = 57 RETURNING *, other + 1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. 
SELECT * FROM joinview ORDER BY f1; f1 | f2 | f3 | f4 | other ----+------+----+-----+------- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index fb6327f3..670e9a06 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1572,49 +1572,159 @@ UPDATE t1 SET b = b WHERE f_leak(b); -- updates with from clause EXPLAIN (COSTS OFF) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on t3 + Filter: ((a = 2) AND f_leak(b)) +(9 rows) + UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. EXPLAIN (COSTS OFF) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t1 + Update on t1 + Update on t2 t2_1 + Update on t3 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t3 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) +(23 rows) + UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. EXPLAIN (COSTS OFF) UPDATE t2 SET b=t2.b FROM t1 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+ QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) +(14 rows) + UPDATE t2 SET b=t2.b FROM t1 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- updates with from clause self join EXPLAIN (COSTS OFF) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 t2_1 + -> Nested Loop + Join Filter: (t2_1.b = t2_2.b) + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 t2_2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(9 rows) + UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + a | b | c | a | b | c | t2_1 | t2_2 +---+-----+-----+---+-----+-----+-------------+------------- + 3 | cde | 3.3 | 3 | cde | 3.3 | (3,cde,3.3) | (3,cde,3.3) +(1 row) + EXPLAIN (COSTS OFF) UPDATE t1 t1_1 SET b = t1_2.b FROM t1 t1_2 WHERE t1_1.a = 4 AND t1_2.a = t1_1.a AND t1_2.b = t1_1.b AND f_leak(t1_1.b) AND f_leak(t1_2.b) RETURNING *, t1_1, t1_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+ QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t1 t1_1 + Update on t1 t1_1 + Update on t2 t1_1_1 + Update on t3 t1_1_2 + -> Hash Join + Hash Cond: (b = t1_1.b) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Hash + -> Seq Scan on t1 t1_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + Join Filter: (t1_1_1.b = b) + -> Seq Scan on t2 t1_1_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + Join Filter: (t1_1_2.b = b) + -> Seq Scan on t3 t1_1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) +(44 rows) + UPDATE t1 t1_1 SET b = t1_2.b FROM t1 t1_2 WHERE t1_1.a = 4 AND t1_2.a = t1_1.a AND t1_2.b = t1_1.b AND f_leak(t1_1.b) AND f_leak(t1_2.b) RETURNING *, t1_1, t1_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + a | b | a | b | t1_1 | t1_2 +---+-------------+---+-------------+-----------------+----------------- + 4 | daddad_updt | 4 | daddad_updt | (4,daddad_updt) | (4,daddad_updt) + 4 | defdef | 4 | defdef | (4,defdef) | (4,defdef) +(2 rows) + RESET SESSION AUTHORIZATION; SET row_security TO OFF; SELECT * FROM t1 ORDER BY a,b; diff --git a/src/test/regress/expected/rowtypes_1.out b/src/test/regress/expected/rowtypes_1.out index 37ec7dc0..57671100 100644 --- a/src/test/regress/expected/rowtypes_1.out +++ b/src/test/regress/expected/rowtypes_1.out @@ -398,10 +398,14 @@ UPDATE price SET active = true, price = input_prices.price FROM unnest(ARRAY[(10, 123.00), (11, 99.99)]::price_input[]) input_prices WHERE price_key_from_table(price.*) = price_key_from_input(input_prices.*); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
select * from price; -ERROR: current transaction is aborted, commands ignored until end of transaction block + id | active | price +----+--------+-------- + 1 | f | 42 + 10 | t | 123.00 + 11 | t | 99.99 +(3 rows) + rollback; -- -- Test case derived from bug #9085: check * qualification of composite diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index d9f4a310..a169bf4a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -291,8 +291,8 @@ select * from rtest_v1 order by a, b; -- updates in a mergejoin update rtest_v1 set b = rtest_t2.b from rtest_t2 where rtest_v1.a = rtest_t2.a; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. select * from rtest_v1 order by a, b; a | b ---+---- @@ -330,8 +330,8 @@ select * from rtest_v1 order by a, b; (8 rows) update rtest_v1 set a = rtest_t3.a + 20 from rtest_t3 where rtest_v1.b = rtest_t3.b; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. select * from rtest_v1 order by a, b; a | b ----+---- @@ -361,58 +361,49 @@ insert into rtest_admin values ('jw', 'orion'); insert into rtest_admin values ('jw', 'notjw'); insert into rtest_admin values ('bm', 'neptun'); update rtest_system set sysname = 'pluto' where sysname = 'neptun'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from rtest_interface order by sysname, ifname; sysname | ifname ---------+-------- - neptun | eth0 notjw | eth0 orion | eth0 orion | eth1 + pluto | eth0 (4 rows) select * from rtest_admin order by pname, sysname; pname | sysname -------+--------- - bm | neptun + bm | pluto jw | notjw jw | orion (3 rows) update rtest_person set pname = 'jwieck' where pdesc = 'Jan Wieck'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- Note: use ORDER BY here to ensure consistent output across all systems. -- The above UPDATE affects two rows with equal keys, so they could be -- updated in either order depending on the whim of the local qsort(). select * from rtest_admin order by pname, sysname; - pname | sysname --------+--------- - bm | neptun - jw | notjw - jw | orion + pname | sysname +--------+--------- + bm | pluto + jwieck | notjw + jwieck | orion (3 rows) delete from rtest_system where sysname = 'orion'; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
select * from rtest_interface order by sysname, ifname; sysname | ifname ---------+-------- - neptun | eth0 notjw | eth0 - orion | eth0 - orion | eth1 -(4 rows) + pluto | eth0 +(2 rows) select * from rtest_admin order by pname, sysname; - pname | sysname --------+--------- - bm | neptun - jw | notjw - jw | orion -(3 rows) + pname | sysname +--------+--------- + bm | pluto + jwieck | notjw +(2 rows) -- -- Rule qualification test @@ -452,36 +443,41 @@ select ename, who = current_user as "matches user", action, newsal, oldsal from update rtest_empmass set salary = salary + '1000.00'; update rtest_emp set salary = rtest_empmass.salary from rtest_empmass where rtest_emp.ename = rtest_empmass.ename; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select ename, who = current_user as "matches user", action, newsal, oldsal from rtest_emplog order by ename, action, newsal; ename | matches user | action | newsal | oldsal ----------------------+--------------+------------+------------+------------ gates | t | fired | $0.00 | $80,000.00 gates | t | hired | $80,000.00 | $0.00 maier | t | hired | $5,000.00 | $0.00 + maier | t | honored | $6,000.00 | $5,000.00 mayr | t | hired | $6,000.00 | $0.00 + mayr | t | honored | $7,000.00 | $6,000.00 meyer | t | hired | $4,000.00 | $0.00 + meyer | t | honored | $5,000.00 | $4,000.00 wiecc | t | hired | $5,000.00 | $0.00 wieck | t | honored | $6,000.00 | $5,000.00 wieck | t | honored | $7,000.00 | $6,000.00 -(8 rows) +(11 rows) delete from rtest_emp using rtest_empmass where rtest_emp.ename = rtest_empmass.ename; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. select ename, who = current_user as "matches user", action, newsal, oldsal from rtest_emplog order by ename, action, newsal; ename | matches user | action | newsal | oldsal ----------------------+--------------+------------+------------+------------ gates | t | fired | $0.00 | $80,000.00 gates | t | hired | $80,000.00 | $0.00 + maier | t | fired | $0.00 | $6,000.00 maier | t | hired | $5,000.00 | $0.00 + maier | t | honored | $6,000.00 | $5,000.00 + mayr | t | fired | $0.00 | $7,000.00 mayr | t | hired | $6,000.00 | $0.00 + mayr | t | honored | $7,000.00 | $6,000.00 + meyer | t | fired | $0.00 | $5,000.00 meyer | t | hired | $4,000.00 | $0.00 + meyer | t | honored | $5,000.00 | $4,000.00 wiecc | t | hired | $5,000.00 | $0.00 wieck | t | honored | $6,000.00 | $5,000.00 wieck | t | honored | $7,000.00 | $6,000.00 -(8 rows) +(14 rows) -- -- Multiple cascaded qualified instead rule test @@ -1103,26 +1099,27 @@ SELECT * FROM shoelace ORDER BY sl_name; (8 rows) insert into shoelace_ok select * from shoelace_arrive; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
SELECT * FROM shoelace ORDER BY sl_name; sl_name | sl_avail | sl_color | sl_len | sl_unit | sl_len_cm ------------+----------+------------+--------+----------+----------- sl1 | 5 | black | 80 | cm | 80 sl2 | 6 | black | 100 | cm | 100 - sl3 | 0 | black | 35 | inch | 88.9 + sl3 | 10 | black | 35 | inch | 88.9 sl4 | 8 | black | 40 | inch | 101.6 sl5 | 4 | brown | 1 | m | 100 - sl6 | 0 | brown | 0.9 | m | 90 + sl6 | 20 | brown | 0.9 | m | 90 sl7 | 6 | brown | 60 | cm | 60 - sl8 | 1 | brown | 40 | inch | 101.6 + sl8 | 21 | brown | 40 | inch | 101.6 (8 rows) SELECT * FROM shoelace_log ORDER BY sl_name; sl_name | sl_avail | log_who | log_when ------------+----------+----------+-------------------------- + sl3 | 10 | Al Bundy | Thu Jan 01 00:00:00 1970 + sl6 | 20 | Al Bundy | Thu Jan 01 00:00:00 1970 sl7 | 6 | Al Bundy | Thu Jan 01 00:00:00 1970 -(1 row) + sl8 | 21 | Al Bundy | Thu Jan 01 00:00:00 1970 +(4 rows) CREATE VIEW shoelace_obsolete AS SELECT * FROM shoelace WHERE NOT EXISTS @@ -1151,22 +1148,19 @@ SELECT * FROM shoelace_candelete; DELETE FROM shoelace WHERE EXISTS (SELECT * FROM shoelace_candelete WHERE sl_name = shoelace.sl_name); -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. SELECT * FROM shoelace ORDER BY sl_name; sl_name | sl_avail | sl_color | sl_len | sl_unit | sl_len_cm ------------+----------+------------+--------+----------+----------- sl1 | 5 | black | 80 | cm | 80 sl10 | 1000 | magenta | 40 | inch | 101.6 sl2 | 6 | black | 100 | cm | 100 - sl3 | 0 | black | 35 | inch | 88.9 + sl3 | 10 | black | 35 | inch | 88.9 sl4 | 8 | black | 40 | inch | 101.6 sl5 | 4 | brown | 1 | m | 100 - sl6 | 0 | brown | 0.9 | m | 90 + sl6 | 20 | brown | 0.9 | m | 90 sl7 | 6 | brown | 60 | cm | 60 - sl8 | 1 | brown | 40 | inch | 101.6 - sl9 | 0 | pink | 35 | inch | 88.9 -(10 rows) + sl8 | 21 | brown | 40 | inch | 101.6 +(9 rows) SELECT * FROM shoe ORDER BY shoename; shoename | sh_avail | slcolor | slminlen | slminlen_cm | slmaxlen | slmaxlen_cm | slunit @@ -1254,40 +1248,35 @@ select * from vview order by pid; (2 rows) update vview set descrip='test1' where pid=1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 2 | parent2 | (2 rows) update vview set descrip='test2' where pid=2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 - 2 | parent2 | + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 + 2 | parent2 | test2 (2 rows) update vview set descrip='test3' where pid=3; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 - 2 | parent2 | + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 + 2 | parent2 | test2 (2 rows) select * from cchild order by pid; - pid | descrip ------+---------- - 1 | descrip1 -(1 row) + pid | descrip +-----+--------- + 1 | test1 + 2 | test2 +(2 rows) drop rule rrule on vview; drop view vview; @@ -2678,14 +2667,11 @@ select * from id_ordered order by id; (6 rows) update id_ordered set name = 'update 2' where id = 2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 4' where id = 4; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 5' where id = 5; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented select * from id_ordered order by id; id | name ----+-------- diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index 50633a31..e8cd553a 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -530,12 +530,10 @@ update shipped_view set value = 11 from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) where ordnum = a.f1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from shipped_view; - ttype | ordnum | partnum | value --------+--------+---------+--------- - wt | 0 | 1 | 1234.56 + ttype | ordnum | partnum | value +-------+--------+---------+------- + wt | 0 | 1 | 11 (1 row) select f1, ss1 as relabel from diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index e408b62a..c44f0d64 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -552,8 +552,6 @@ where xl_t.no = T1.no1; update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; xl_nodename_from_id | no | name ---------------------+----+------ @@ -566,10 +564,10 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 1 | Z - datanode_1 | 2 | Y - datanode_2 | 3 | X - datanode_2 | 4 | W + datanode_1 | 2 | Y1 + datanode_1 | 1 | Z1 + datanode_2 | 4 | W1 + datanode_2 | 3 | X1 (4 rows) --testing correlated delete: @@ -580,23 +578,21 @@ where xl_t.no in (select no1 from xl_t1 where name1 in ('Z', 'X')) delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; xl_nodename_from_id | no | name ---------------------+----+------ + datanode_1 | 1 | Z datanode_1 | 2 | Y + datanode_2 | 3 | X datanode_2 | 4 | W -(2 rows) +(4 rows) select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 1 | Z - datanode_1 | 2 | Y - datanode_2 | 3 | X - datanode_2 | 4 | W -(4 rows) + datanode_1 | 2 | Y1 + datanode_2 | 4 | W1 +(2 rows) drop table xl_t; drop table xl_t1; diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source index c0658578..2e43a09a 100644 --- a/src/test/regress/output/misc.source +++ b/src/test/regress/output/misc.source @@ -29,15 +29,11 @@ UPDATE tmp FROM onek WHERE onek.stringu1 = 'JBAAAA' and onek.stringu1 = tmp.stringu1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. UPDATE tmp SET stringu1 = reverse_name(onek2.stringu1) FROM onek2 WHERE onek2.stringu1 = 'JCAAAA' and onek2.stringu1 = tmp.stringu1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. DROP TABLE tmp; --UPDATE person* -- SET age = age + 1; From 8a688d1d0d254917a83c9f5159bed930547ab741 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 14:14:20 +0800 Subject: [PATCH 018/578] Remove the duplicate estate free in ExecEndModifyTable http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696081354083 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++-------------- src/backend/optimizer/plan/planner.c | 38 +++++++++++++++++++++++++ src/include/nodes/parsenodes.h | 12 ++++---- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9d6aaae4..659f15b1 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,30 +3204,27 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - EState *state = NULL; - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; + if (IS_PGXC_COORDINATOR) + { + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - - combiner = (ResponseCombiner *) node->mt_remoterels[i]; - state = combiner->ss.ps.state; - ExecEndNode(node->mt_remoterels[i]); + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - FreeExecutorState(state); - } - } + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + + ExecEndNode(node->mt_remoterels[i]); + + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } + } + } #endif /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4495ef70..9fd805a8 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2312,6 +2312,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = adjust_path_distribution(root, 
parse, path); #ifdef __TBASE__ +<<<<<<< HEAD /* * unshippable triggers found on target relation, we have to do DML * on coordinator. @@ -2324,6 +2325,43 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } } #endif +======= + /* + * unshippable triggers found on target relation, we have to do DML + * on coordinator. + */ + if (parse->hasUnshippableTriggers) + { + if (path->distribution) + { + path = adjust_modifytable_subpath(root, parse, path); + } + } +#endif + + path = (Path *) + create_modifytable_path(root, final_rel, + parse->commandType, + parse->canSetTag, + parse->resultRelation, + NIL, + list_make1_int(parse->resultRelation), + list_make1(path), + list_make1(root), + withCheckOptionLists, + returningLists, + rowMarks, + parse->onConflict, + SS_assign_special_param(root)); + } + else + /* Adjust path by injecting a remote subplan, if appropriate. */ + path = adjust_path_distribution(root, parse, path); + + /* And shove it into final_rel */ + add_path(final_rel, path); + } +>>>>>>> d1855902... Remove the duplicate estate free in ExecEndModifyTable path = (Path *) create_modifytable_path(root, final_rel, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 71e853ba..5ab4e1b5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -146,12 +146,12 @@ typedef struct Query bool hasForUpdate; /* FOR [KEY] UPDATE/SHARE was specified */ bool hasRowSecurity; /* rewriter has applied some RLS policy */ #ifdef __TBASE__ - bool isSingleValues; /*for interval partition insert */ - bool isMultiValues; /* is simple insert into values (),(),()...();? */ - bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, - * only used for DML. Will be set at the plan phase - * in shippability check. - */ + bool isSingleValues; /*for interval partition insert */ + bool isMultiValues; /* is simple insert into values (),(),()...();? */ + bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, + * only used for DML. Will be set at the plan phase + * in shippability check. 
+ */ char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif From 7a65ce134c0927ed978f5261624f3ac6a66aeb10 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 2 Jun 2021 11:24:04 +0800 Subject: [PATCH 019/578] revert 8a688d1d0d254917a83c9f5159bed930547ab741 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++++------------ src/backend/optimizer/plan/planner.c | 38 ------------------------- src/include/nodes/parsenodes.h | 12 ++++---- 3 files changed, 27 insertions(+), 62 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 659f15b1..9d6aaae4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,27 +3204,30 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; - - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); + if (IS_PGXC_COORDINATOR) + { + EState *state = NULL; + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - combiner = (ResponseCombiner *) node->mt_remoterels[i]; + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); + + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); + + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + state = combiner->ss.ps.state; + ExecEndNode(node->mt_remoterels[i]); - ExecEndNode(node->mt_remoterels[i]); + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } - } - } + FreeExecutorState(state); + } + } #endif /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 9fd805a8..4495ef70 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2312,7 +2312,6 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = adjust_path_distribution(root, parse, path); #ifdef __TBASE__ -<<<<<<< HEAD /* * unshippable triggers found on target relation, we have to do DML * on coordinator. @@ -2325,43 +2324,6 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } } #endif -======= - /* - * unshippable triggers found on target relation, we have to do DML - * on coordinator. - */ - if (parse->hasUnshippableTriggers) - { - if (path->distribution) - { - path = adjust_modifytable_subpath(root, parse, path); - } - } -#endif - - path = (Path *) - create_modifytable_path(root, final_rel, - parse->commandType, - parse->canSetTag, - parse->resultRelation, - NIL, - list_make1_int(parse->resultRelation), - list_make1(path), - list_make1(root), - withCheckOptionLists, - returningLists, - rowMarks, - parse->onConflict, - SS_assign_special_param(root)); - } - else - /* Adjust path by injecting a remote subplan, if appropriate. */ - path = adjust_path_distribution(root, parse, path); - - /* And shove it into final_rel */ - add_path(final_rel, path); - } ->>>>>>> d1855902... 
Remove the duplicate estate free in ExecEndModifyTable path = (Path *) create_modifytable_path(root, final_rel, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 5ab4e1b5..71e853ba 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -146,12 +146,12 @@ typedef struct Query bool hasForUpdate; /* FOR [KEY] UPDATE/SHARE was specified */ bool hasRowSecurity; /* rewriter has applied some RLS policy */ #ifdef __TBASE__ - bool isSingleValues; /*for interval partition insert */ - bool isMultiValues; /* is simple insert into values (),(),()...();? */ - bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, - * only used for DML. Will be set at the plan phase - * in shippability check. - */ + bool isSingleValues; /*for interval partition insert */ + bool isMultiValues; /* is simple insert into values (),(),()...();? */ + bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, + * only used for DML. Will be set at the plan phase + * in shippability check. + */ char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif From ab1b2607568bfae1ebf99b4aeae59d6f9e904609 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 14:14:20 +0800 Subject: [PATCH 020/578] Remove the duplicate estate free in ExecEndModifyTable http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696081354083 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++------------- src/backend/optimizer/plan/createplan.c | 16 +++++----- src/backend/optimizer/plan/planner.c | 6 ++-- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9d6aaae4..659f15b1 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,30 +3204,27 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - EState *state = NULL; - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; + if (IS_PGXC_COORDINATOR) + { + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - - combiner = (ResponseCombiner *) node->mt_remoterels[i]; - state = combiner->ss.ps.state; - ExecEndNode(node->mt_remoterels[i]); + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - FreeExecutorState(state); - } - } + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + + ExecEndNode(node->mt_remoterels[i]); + + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } + } + } #endif /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index d4e31d51..23d04153 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3026,14 +3026,14 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) copy_generic_path_info(&plan->plan, &best_path->path); #ifdef __TBASE__ - /* - * If we have unshippable triggers, we have to do DML on coordinators, - * generate remote_dml plan now. 
- */ - if (root->parse->hasUnshippableTriggers) - { - create_remotedml_plan(root, (Plan *)plan, plan->operation); - } + /* + * If we have unshippable triggers, we have to do DML on coordinators, + * generate remote_dml plan now. + */ + if (root->parse->hasUnshippableTriggers) + { + create_remotedml_plan(root, (Plan *)plan, plan->operation); + } #endif return plan; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4495ef70..b5e70cc0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2313,9 +2313,9 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, #ifdef __TBASE__ /* - * unshippable triggers found on target relation, we have to do DML - * on coordinator. - */ + * unshippable triggers found on target relation, we have to do DML + * on coordinator. + */ if (parse->hasUnshippableTriggers) { if (path->distribution) From 952a2a4f440a9a823ac2bcd5dc57567d3cb1edb1 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 11 Aug 2020 15:41:34 +0800 Subject: [PATCH 021/578] Split QTW_EXAMINE_RTES flag into QTW_EXAMINE_RTES_BEFORE/_AFTER. This change allows callers of query_tree_walker() to choose whether to visit an RTE before or after visiting the contents of the RTE (i.e., prefix or postfix tree order). All existing users of QTW_EXAMINE_RTES want the QTW_EXAMINE_RTES_BEFORE behavior, but an upcoming patch will want QTW_EXAMINE_RTES_AFTER, and it seems like a potentially useful change on its own. Andreas Karlsson (extracted from CTE inlining patch) Discussion: https://postgr.es/m/8810.1542402910@sss.pgh.pa.us TBASE: Reset the definitions of MLS query_tree_walker flag to avoid conflicts with postgres --- src/backend/nodes/nodeFuncs.c | 112 +++++++++-------- src/backend/optimizer/plan/setrefs.c | 52 ++++---- src/backend/rewrite/rewriteManip.c | 172 +++++++++++++-------------- src/include/nodes/nodeFuncs.h | 21 ++-- 4 files changed, 184 insertions(+), 173 deletions(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 25186782..47ebb30d 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2258,7 +2258,7 @@ expression_tree_walker(Node *node, * Some callers want to suppress visitation of certain items in the sub-Query, * typically because they need to process them specially, or don't actually * want to recurse into subqueries. This is supported by the flags argument, - * which is the bitwise OR of flag values to suppress visitation of + * which is the bitwise OR of flag values to add or suppress visitation of * indicated items. (More flag bits may be added as needed.) 
*/ bool @@ -2320,53 +2320,57 @@ query_tree_walker(Query *query, */ bool range_table_walker(List *rtable, - bool (*walker) (), - void *context, - int flags) -{// #lizard forgives - ListCell *rt; - - foreach(rt, rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt); - - /* For historical reasons, visiting RTEs is not the default */ - if (flags & QTW_EXAMINE_RTES) - if (walker(rte, context)) - return true; - - switch (rte->rtekind) - { - case RTE_RELATION: - if (walker(rte->tablesample, context)) - return true; - break; - case RTE_CTE: - case RTE_NAMEDTUPLESTORE: - /* nothing to do */ - break; - case RTE_SUBQUERY: - if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) - if (walker(rte->subquery, context)) - return true; - break; - case RTE_JOIN: - if (!(flags & QTW_IGNORE_JOINALIASES)) - if (walker(rte->joinaliasvars, context)) - return true; - break; - case RTE_FUNCTION: - if (walker(rte->functions, context)) - return true; - break; - case RTE_TABLEFUNC: - if (walker(rte->tablefunc, context)) - return true; - break; - case RTE_VALUES: - if (walker(rte->values_lists, context)) - return true; - break; + bool (*walker) (), + void *context, + int flags) +{ + ListCell *rt; + + foreach(rt, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt); + + /* + * Walkers might need to examine the RTE node itself either before or + * after visiting its contents (or, conceivably, both). Note that if + * you specify neither flag, the walker won't visit the RTE at all. + */ + if (flags & QTW_EXAMINE_RTES_BEFORE) + if (walker(rte, context)) + return true; + + switch (rte->rtekind) + { + case RTE_RELATION: + if (walker(rte->tablesample, context)) + return true; + break; + case RTE_CTE: + case RTE_NAMEDTUPLESTORE: + /* nothing to do */ + break; + case RTE_SUBQUERY: + if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) + if (walker(rte->subquery, context)) + return true; + break; + case RTE_JOIN: + if (!(flags & QTW_IGNORE_JOINALIASES)) + if (walker(rte->joinaliasvars, context)) + return true; + break; + case RTE_FUNCTION: + if (walker(rte->functions, context)) + return true; + break; + case RTE_TABLEFUNC: + if (walker(rte->tablefunc, context)) + return true; + break; + case RTE_VALUES: + if (walker(rte->values_lists, context)) + return true; + break; #ifdef PGXC case RTE_REMOTE_DUMMY: elog(ERROR, "Invalid RTE found."); @@ -2374,10 +2378,14 @@ range_table_walker(List *rtable, #endif /* PGXC */ } - if (walker(rte->securityQuals, context)) - return true; - } - return false; + if (walker(rte->securityQuals, context)) + return true; + + if (flags & QTW_EXAMINE_RTES_AFTER) + if (walker(rte, context)) + return true; + } + return false; } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 460a458b..175058f9 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -352,37 +352,37 @@ add_rtes_to_flat_rtable(PlannerInfo *root, bool recursing) static void flatten_unplanned_rtes(PlannerGlobal *glob, RangeTblEntry *rte) { - /* Use query_tree_walker to find all RTEs in the parse tree */ - (void) query_tree_walker(rte->subquery, - flatten_rtes_walker, - (void *) glob, - QTW_EXAMINE_RTES); + /* Use query_tree_walker to find all RTEs in the parse tree */ + (void) query_tree_walker(rte->subquery, + flatten_rtes_walker, + (void *) glob, + QTW_EXAMINE_RTES_BEFORE); } static bool flatten_rtes_walker(Node *node, PlannerGlobal *glob) { - if (node == NULL) - return false; - if (IsA(node, RangeTblEntry)) - { - RangeTblEntry *rte = (RangeTblEntry *) node; - - 
/* As above, we need only save relation RTEs */ - if (rte->rtekind == RTE_RELATION) - add_rte_to_flat_rtable(glob, rte); - return false; - } - if (IsA(node, Query)) - { - /* Recurse into subselects */ - return query_tree_walker((Query *) node, - flatten_rtes_walker, - (void *) glob, - QTW_EXAMINE_RTES); - } - return expression_tree_walker(node, flatten_rtes_walker, - (void *) glob); + if (node == NULL) + return false; + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + /* As above, we need only save relation RTEs */ + if (rte->rtekind == RTE_RELATION) + add_rte_to_flat_rtable(glob, rte); + return false; + } + if (IsA(node, Query)) + { + /* Recurse into subselects */ + return query_tree_walker((Query *) node, + flatten_rtes_walker, + (void *) glob, + QTW_EXAMINE_RTES_BEFORE); + } + return expression_tree_walker(node, flatten_rtes_walker, + (void *) glob); } /* diff --git a/src/backend/rewrite/rewriteManip.c b/src/backend/rewrite/rewriteManip.c index 099fe968..09b14d4e 100644 --- a/src/backend/rewrite/rewriteManip.c +++ b/src/backend/rewrite/rewriteManip.c @@ -698,94 +698,94 @@ typedef struct static bool IncrementVarSublevelsUp_walker(Node *node, - IncrementVarSublevelsUp_context *context) -{// #lizard forgives - if (node == NULL) - return false; - if (IsA(node, Var)) - { - Var *var = (Var *) node; - - if (var->varlevelsup >= context->min_sublevels_up) - var->varlevelsup += context->delta_sublevels_up; - return false; /* done here */ - } - if (IsA(node, CurrentOfExpr)) - { - /* this should not happen */ - if (context->min_sublevels_up == 0) - elog(ERROR, "cannot push down CurrentOfExpr"); - return false; - } - if (IsA(node, Aggref)) - { - Aggref *agg = (Aggref *) node; - - if (agg->agglevelsup >= context->min_sublevels_up) - agg->agglevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, GroupingFunc)) - { - GroupingFunc *grp = (GroupingFunc *) node; - - if (grp->agglevelsup >= context->min_sublevels_up) - grp->agglevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, PlaceHolderVar)) - { - PlaceHolderVar *phv = (PlaceHolderVar *) node; - - if (phv->phlevelsup >= context->min_sublevels_up) - phv->phlevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, RangeTblEntry)) - { - RangeTblEntry *rte = (RangeTblEntry *) node; - - if (rte->rtekind == RTE_CTE) - { - if (rte->ctelevelsup >= context->min_sublevels_up) - rte->ctelevelsup += context->delta_sublevels_up; - } - return false; /* allow range_table_walker to continue */ - } - if (IsA(node, Query)) - { - /* Recurse into subselects */ - bool result; - - context->min_sublevels_up++; - result = query_tree_walker((Query *) node, - IncrementVarSublevelsUp_walker, - (void *) context, - QTW_EXAMINE_RTES); - context->min_sublevels_up--; - return result; - } - return expression_tree_walker(node, IncrementVarSublevelsUp_walker, - (void *) context); + IncrementVarSublevelsUp_context *context) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + if (var->varlevelsup >= context->min_sublevels_up) + var->varlevelsup += context->delta_sublevels_up; + return false; /* done here */ + } + if (IsA(node, CurrentOfExpr)) + { + /* this should not happen */ + if (context->min_sublevels_up == 0) + elog(ERROR, "cannot push down CurrentOfExpr"); + return false; + } + if (IsA(node, Aggref)) + { + Aggref *agg = (Aggref *) node; + + if 
(agg->agglevelsup >= context->min_sublevels_up) + agg->agglevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, GroupingFunc)) + { + GroupingFunc *grp = (GroupingFunc *) node; + + if (grp->agglevelsup >= context->min_sublevels_up) + grp->agglevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, PlaceHolderVar)) + { + PlaceHolderVar *phv = (PlaceHolderVar *) node; + + if (phv->phlevelsup >= context->min_sublevels_up) + phv->phlevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + if (rte->rtekind == RTE_CTE) + { + if (rte->ctelevelsup >= context->min_sublevels_up) + rte->ctelevelsup += context->delta_sublevels_up; + } + return false; /* allow range_table_walker to continue */ + } + if (IsA(node, Query)) + { + /* Recurse into subselects */ + bool result; + + context->min_sublevels_up++; + result = query_tree_walker((Query *) node, + IncrementVarSublevelsUp_walker, + (void *) context, + QTW_EXAMINE_RTES_BEFORE); + context->min_sublevels_up--; + return result; + } + return expression_tree_walker(node, IncrementVarSublevelsUp_walker, + (void *) context); } void IncrementVarSublevelsUp(Node *node, int delta_sublevels_up, int min_sublevels_up) { - IncrementVarSublevelsUp_context context; - - context.delta_sublevels_up = delta_sublevels_up; - context.min_sublevels_up = min_sublevels_up; - - /* - * Must be prepared to start with a Query or a bare expression tree; if - * it's a Query, we don't want to increment sublevels_up. - */ - query_or_expression_tree_walker(node, - IncrementVarSublevelsUp_walker, - (void *) &context, - QTW_EXAMINE_RTES); + IncrementVarSublevelsUp_context context; + + context.delta_sublevels_up = delta_sublevels_up; + context.min_sublevels_up = min_sublevels_up; + + /* + * Must be prepared to start with a Query or a bare expression tree; if + * it's a Query, we don't want to increment sublevels_up. 
+ */ + query_or_expression_tree_walker(node, + IncrementVarSublevelsUp_walker, + (void *) &context, + QTW_EXAMINE_RTES_BEFORE); } /* @@ -801,10 +801,10 @@ IncrementVarSublevelsUp_rtable(List *rtable, int delta_sublevels_up, context.delta_sublevels_up = delta_sublevels_up; context.min_sublevels_up = min_sublevels_up; - range_table_walker(rtable, - IncrementVarSublevelsUp_walker, - (void *) &context, - QTW_EXAMINE_RTES); + range_table_walker(rtable, + IncrementVarSublevelsUp_walker, + (void *) &context, + QTW_EXAMINE_RTES_BEFORE); } diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index 8d5c5000..cfb41c3c 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -77,16 +77,19 @@ /* flags bits for query_tree_walker and query_tree_mutator */ -#define QTW_IGNORE_RT_SUBQUERIES 0x01 /* subqueries in rtable */ -#define QTW_IGNORE_CTE_SUBQUERIES 0x02 /* subqueries in cteList */ -#define QTW_IGNORE_RC_SUBQUERIES 0x03 /* both of above */ -#define QTW_IGNORE_JOINALIASES 0x04 /* JOIN alias var lists */ -#define QTW_IGNORE_RANGE_TABLE 0x08 /* skip rangetable entirely */ -#define QTW_EXAMINE_RTES 0x10 /* examine RTEs */ -#define QTW_DONT_COPY_QUERY 0x20 /* do not copy top Query */ +#define QTW_IGNORE_RT_SUBQUERIES 0x01 /* subqueries in rtable */ +#define QTW_IGNORE_CTE_SUBQUERIES 0x02 /* subqueries in cteList */ +#define QTW_IGNORE_RC_SUBQUERIES 0x03 /* both of above */ +#define QTW_IGNORE_JOINALIASES 0x04 /* JOIN alias var lists */ +#define QTW_IGNORE_RANGE_TABLE 0x08 /* skip rangetable entirely */ +#define QTW_EXAMINE_RTES_BEFORE 0x10 /* examine RTE nodes before their + * contents */ +#define QTW_EXAMINE_RTES_AFTER 0x20 /* examine RTE nodes after their + * contents */ +#define QTW_DONT_COPY_QUERY 0x40 /* do not copy top Query */ #ifdef _MLS_ -#define QTW_IGNORE_TARGET_LIST 0x40 /* skip target list */ -#define QTW_IGNORE_RETURNING_LIST 0x80 /* skip returning list */ +#define QTW_IGNORE_TARGET_LIST 0x0100 /* skip target list */ +#define QTW_IGNORE_RETURNING_LIST 0x0200 /* skip returning list */ #endif From 97ba77f9d3185ebd21ce59c6c3283ff591650ea5 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 13 Aug 2020 12:48:14 +0800 Subject: [PATCH 022/578] Allow user control of CTE materialization, and change the default behavior. Historically we've always materialized the full output of a CTE query, treating WITH as an optimization fence (so that, for example, restrictions from the outer query cannot be pushed into it). This is appropriate when the CTE query is INSERT/UPDATE/DELETE, or is recursive; but when the CTE query is non-recursive and side-effect-free, there's no hazard of changing the query results by pushing restrictions down. Another argument for materialization is that it can avoid duplicate computation of an expensive WITH query --- but that only applies if the WITH query is called more than once in the outer query. Even then it could still be a net loss, if each call has restrictions that would allow just a small part of the WITH query to be computed. Hence, let's change the behavior for WITH queries that are non-recursive and side-effect-free. By default, we will inline them into the outer query (removing the optimization fence) if they are called just once. If they are called more than once, we will keep the old behavior by default, but the user can override this and force inlining by specifying NOT MATERIALIZED. 
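
For example, assuming a hypothetical table t with columns key and ref and an
index on key (a sketch of the new syntax only, not taken from the regression
tests in this patch):

    -- referenced once: folded into the outer query by default,
    -- so the index on key can be used for key = 123
    WITH w AS (SELECT * FROM t)
    SELECT * FROM w WHERE key = 123;

    -- referenced more than once: materialized by default, but
    -- NOT MATERIALIZED forces it to be inlined into both references
    WITH w AS NOT MATERIALIZED (SELECT * FROM t)
    SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref
    WHERE w2.key = 123;

In the second query, inlining lets the restriction on w2.key reach the scans
of t, at the cost of computing the WITH query once per reference.
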
Lastly, the user can force the old behavior by specifying MATERIALIZED; this would mainly be useful when the query had deliberately been employing WITH as an optimization fence to prevent a poor choice of plan. Andreas Karlsson, Andrew Gierth, David Fetter Discussion: https://postgr.es/m/87sh48ffhb.fsf@news-spur.riddles.org.uk --- .../postgres_fdw/expected/postgres_fdw.out | 4 +- contrib/postgres_fdw/sql/postgres_fdw.sql | 4 +- doc/src/sgml/queries.sgml | 84 +- doc/src/sgml/ref/select.sgml | 55 +- src/backend/nodes/copyfuncs.c | 21 +- src/backend/nodes/equalfuncs.c | 21 +- src/backend/nodes/outfuncs.c | 21 +- src/backend/nodes/readfuncs.c | 21 +- src/backend/optimizer/plan/planner.c | 314 +- src/backend/optimizer/plan/subselect.c | 293 +- src/backend/parser/gram.y | 14 +- src/backend/utils/adt/ruleutils.c | 25864 ++++++++-------- src/include/nodes/parsenodes.h | 36 +- src/test/regress/expected/foreign_key_2.out | 13 +- src/test/regress/expected/rowsecurity.out | 4 +- src/test/regress/expected/rowsecurity_1.out | 9 +- src/test/regress/expected/rowtypes.out | 4 +- src/test/regress/expected/rowtypes_1.out | 4 +- src/test/regress/expected/rules.out | 5 +- src/test/regress/expected/subselect.out | 270 +- src/test/regress/expected/xc_for_update_1.out | 34 +- src/test/regress/sql/rowsecurity.sql | 9 +- src/test/regress/sql/rowtypes.sql | 4 +- src/test/regress/sql/rules.sql | 5 +- src/test/regress/sql/subselect.sql | 93 + 25 files changed, 13936 insertions(+), 13270 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index c19b3318..77a6e2ce 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -1868,7 +1868,7 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t -- join in CTE EXPLAIN (VERBOSE, COSTS OFF) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------- Limit @@ -1885,7 +1885,7 @@ WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 Output: t.c1_1, t.c2_1, t.c1_3 (12 rows) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; c1_1 | c2_1 ------+------ 101 | 101 diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 5f65d9d9..5048bff6 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -495,8 +495,8 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t1.c1 OFFSET 100 LIMIT 10 FOR SHARE; -- join in CTE EXPLAIN (VERBOSE, COSTS OFF) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 
OFFSET 100 LIMIT 10; -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; -- ctid with whole-row reference EXPLAIN (VERBOSE, COSTS OFF) SELECT t1.ctid, t1, t2, t1.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t1.c1 OFFSET 100 LIMIT 10; diff --git a/doc/src/sgml/queries.sgml b/doc/src/sgml/queries.sgml index 0588da29..0ba7085f 100644 --- a/doc/src/sgml/queries.sgml +++ b/doc/src/sgml/queries.sgml @@ -2195,22 +2195,94 @@ SELECT n FROM t LIMIT 100; - A useful property of WITH queries is that they are evaluated - only once per execution of the parent query, even if they are referred to - more than once by the parent query or sibling WITH queries. + A useful property of WITH queries is that they are + normally evaluated only once per execution of the parent query, even if + they are referred to more than once by the parent query or + sibling WITH queries. Thus, expensive calculations that are needed in multiple places can be placed within a WITH query to avoid redundant work. Another possible application is to prevent unwanted multiple evaluations of functions with side-effects. - However, the other side of this coin is that the optimizer is less able to - push restrictions from the parent query down into a WITH query - than an ordinary subquery. The WITH query will generally be + However, the other side of this coin is that the optimizer is not able to + push restrictions from the parent query down into a multiply-referenced + WITH query, since that might affect all uses of the + WITH query's output when it should affect only one. + The multiply-referenced WITH query will be evaluated as written, without suppression of rows that the parent query might discard afterwards. (But, as mentioned above, evaluation might stop early if the reference(s) to the query demand only a limited number of rows.) + + However, if a WITH query is non-recursive and + side-effect-free (that is, it is a SELECT containing + no volatile functions) then it can be folded into the parent query, + allowing joint optimization of the two query levels. By default, this + happens if the parent query references the WITH query + just once, but not if it references the WITH query + more than once. You can override that decision by + specifying MATERIALIZED to force separate calculation + of the WITH query, or by specifying NOT + MATERIALIZED to force it to be merged into the parent query. + The latter choice risks duplicate computation of + the WITH query, but it can still give a net savings if + each usage of the WITH query needs only a small part + of the WITH query's full output. + + + + A simple example of these rules is + +WITH w AS ( + SELECT * FROM big_table +) +SELECT * FROM w WHERE key = 123; + + This WITH query will be folded, producing the same + execution plan as + +SELECT * FROM big_table WHERE key = 123; + + In particular, if there's an index on key, + it will probably be used to fetch just the rows having key = + 123. 
On the other hand, in + +WITH w AS ( + SELECT * FROM big_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref +WHERE w2.key = 123; + + the WITH query will be materialized, producing a + temporary copy of big_table that is then + joined with itself — without benefit of any index. This query + will be executed much more efficiently if written as + +WITH w AS NOT MATERIALIZED ( + SELECT * FROM big_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref +WHERE w2.key = 123; + + so that the parent query's restrictions can be applied directly + to scans of big_table. + + + + An example where NOT MATERIALIZED could be + undesirable is + +WITH w AS ( + SELECT key, very_expensive_function(val) as f FROM some_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.f = w2.f; + + Here, materialization of the WITH query ensures + that very_expensive_function is evaluated only + once per table row, not twice. + + The examples above only show WITH being used with SELECT, but it can be attached in the same way to diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 57f11e66..17172c05 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -72,7 +72,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressionand with_query is: - with_query_name [ ( column_name [, ...] ) ] AS ( select | values | insert | update | delete ) + with_query_name [ ( column_name [, ...] ) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) TABLE [ ONLY ] table_name [ * ] @@ -94,6 +94,7 @@ TABLE [ ONLY ] table_name [ * ] in the FROM list. A WITH query that is referenced more than once in FROM is computed only once. + unless specified otherwise with NOT MATERIALIZED. (See below.) @@ -272,9 +273,18 @@ TABLE [ ONLY ] table_name [ * ] that are earlier in the WITH list. + + The primary query and the WITH queries are all + (notionally) executed at the same time. This implies that the effects of + a data-modifying statement in WITH cannot be seen from + other parts of the query, other than by reading its RETURNING + output. If two such data-modifying statements attempt to modify the same + row, the results are unspecified. + + A key property of WITH queries is that they - are evaluated only once per execution of the primary query, + are normally evaluated only once per execution of the primary query, even if the primary query refers to them more than once. In particular, data-modifying statements are guaranteed to be executed once and only once, regardless of whether the primary query @@ -282,12 +292,35 @@ TABLE [ ONLY ] table_name [ * ] - The primary query and the WITH queries are all - (notionally) executed at the same time. This implies that the effects of - a data-modifying statement in WITH cannot be seen from - other parts of the query, other than by reading its RETURNING - output. If two such data-modifying statements attempt to modify the same - row, the results are unspecified. + However, a WITH query can be marked + NOT MATERIALIZED to remove this guarantee. In that + case, the WITH query can be folded into the primary + query much as though it were a simple sub-SELECT in + the primary query's FROM clause. This results in + duplicate computations if the primary query refers to + that WITH query more than once; but if each such use + requires only a few rows of the WITH query's total + output, NOT MATERIALIZED can provide a net savings by + allowing the queries to be optimized jointly. 
+ NOT MATERIALIZED is ignored if it is attached to + a WITH query that is recursive or is not + side-effect-free (i.e., is not a plain SELECT + containing no volatile functions). + + + + By default, a side-effect-free WITH query is folded + into the primary query if it is used exactly once in the primary + query's FROM clause. This allows joint optimization + of the two query levels in situations where that should be semantically + invisible. However, such folding can be prevented by marking the + WITH query as MATERIALIZED. + That might be useful, for example, if the WITH query + is being used as an optimization fence to prevent the planner from + choosing a bad plan. + PostgreSQL versions before v12 never did + such folding, so queries written for older versions might rely on + WITH to act as an optimization fence. @@ -2046,6 +2079,12 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; ROWS FROM( ... ) is an extension of the SQL standard. + + + The MATERIALIZED and NOT + MATERIALIZED options of WITH are extensions + of the SQL standard. + diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 06c7bdf4..8a447982 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2747,16 +2747,17 @@ _copyCommonTableExpr(const CommonTableExpr *from) { CommonTableExpr *newnode = makeNode(CommonTableExpr); - COPY_STRING_FIELD(ctename); - COPY_NODE_FIELD(aliascolnames); - COPY_NODE_FIELD(ctequery); - COPY_LOCATION_FIELD(location); - COPY_SCALAR_FIELD(cterecursive); - COPY_SCALAR_FIELD(cterefcount); - COPY_NODE_FIELD(ctecolnames); - COPY_NODE_FIELD(ctecoltypes); - COPY_NODE_FIELD(ctecoltypmods); - COPY_NODE_FIELD(ctecolcollations); + COPY_STRING_FIELD(ctename); + COPY_NODE_FIELD(aliascolnames); + COPY_SCALAR_FIELD(ctematerialized); + COPY_NODE_FIELD(ctequery); + COPY_LOCATION_FIELD(location); + COPY_SCALAR_FIELD(cterecursive); + COPY_SCALAR_FIELD(cterefcount); + COPY_NODE_FIELD(ctecolnames); + COPY_NODE_FIELD(ctecoltypes); + COPY_NODE_FIELD(ctecoltypmods); + COPY_NODE_FIELD(ctecolcollations); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index ccd20de5..3dbcb393 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2850,16 +2850,17 @@ _equalOnConflictClause(const OnConflictClause *a, const OnConflictClause *b) static bool _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) { - COMPARE_STRING_FIELD(ctename); - COMPARE_NODE_FIELD(aliascolnames); - COMPARE_NODE_FIELD(ctequery); - COMPARE_LOCATION_FIELD(location); - COMPARE_SCALAR_FIELD(cterecursive); - COMPARE_SCALAR_FIELD(cterefcount); - COMPARE_NODE_FIELD(ctecolnames); - COMPARE_NODE_FIELD(ctecoltypes); - COMPARE_NODE_FIELD(ctecoltypmods); - COMPARE_NODE_FIELD(ctecolcollations); + COMPARE_STRING_FIELD(ctename); + COMPARE_NODE_FIELD(aliascolnames); + COMPARE_SCALAR_FIELD(ctematerialized); + COMPARE_NODE_FIELD(ctequery); + COMPARE_LOCATION_FIELD(location); + COMPARE_SCALAR_FIELD(cterecursive); + COMPARE_SCALAR_FIELD(cterefcount); + COMPARE_NODE_FIELD(ctecolnames); + COMPARE_NODE_FIELD(ctecoltypes); + COMPARE_NODE_FIELD(ctecoltypmods); + COMPARE_NODE_FIELD(ctecolcollations); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fb063aa2..7d7a9704 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4377,16 +4377,17 @@ _outCommonTableExpr(StringInfo str, const CommonTableExpr *node) { WRITE_NODE_TYPE("COMMONTABLEEXPR"); - 
WRITE_STRING_FIELD(ctename); - WRITE_NODE_FIELD(aliascolnames); - WRITE_NODE_FIELD(ctequery); - WRITE_LOCATION_FIELD(location); - WRITE_BOOL_FIELD(cterecursive); - WRITE_INT_FIELD(cterefcount); - WRITE_NODE_FIELD(ctecolnames); - WRITE_NODE_FIELD(ctecoltypes); - WRITE_NODE_FIELD(ctecoltypmods); - WRITE_NODE_FIELD(ctecolcollations); + WRITE_STRING_FIELD(ctename); + WRITE_NODE_FIELD(aliascolnames); + WRITE_ENUM_FIELD(ctematerialized, CTEMaterialize); + WRITE_NODE_FIELD(ctequery); + WRITE_LOCATION_FIELD(location); + WRITE_BOOL_FIELD(cterecursive); + WRITE_INT_FIELD(cterefcount); + WRITE_NODE_FIELD(ctecolnames); + WRITE_NODE_FIELD(ctecoltypes); + WRITE_NODE_FIELD(ctecoltypmods); + WRITE_NODE_FIELD(ctecolcollations); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 2c886d9f..7207a98c 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -724,16 +724,17 @@ _readCommonTableExpr(void) { READ_LOCALS(CommonTableExpr); - READ_STRING_FIELD(ctename); - READ_NODE_FIELD(aliascolnames); - READ_NODE_FIELD(ctequery); - READ_LOCATION_FIELD(location); - READ_BOOL_FIELD(cterecursive); - READ_INT_FIELD(cterefcount); - READ_NODE_FIELD(ctecolnames); - READ_NODE_FIELD(ctecoltypes); - READ_NODE_FIELD(ctecoltypmods); - READ_NODE_FIELD(ctecolcollations); + READ_STRING_FIELD(ctename); + READ_NODE_FIELD(aliascolnames); + READ_ENUM_FIELD(ctematerialized, CTEMaterialize); + READ_NODE_FIELD(ctequery); + READ_LOCATION_FIELD(location); + READ_BOOL_FIELD(cterecursive); + READ_INT_FIELD(cterefcount); + READ_NODE_FIELD(ctecolnames); + READ_NODE_FIELD(ctecoltypes); + READ_NODE_FIELD(ctecoltypmods); + READ_NODE_FIELD(ctecolcollations); READ_DONE(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b5e70cc0..6ed3f131 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -626,173 +626,173 @@ subquery_planner(PlannerGlobal *glob, Query *parse, #ifdef _MLS_ root->hasClsPolicy = false; #endif - root->hasInheritedTarget = false; - root->hasRecursion = hasRecursion; - if (hasRecursion) - root->wt_param_id = SS_assign_special_param(root); - else - root->wt_param_id = -1; - root->non_recursive_path = NULL; - - /* - * If there is a WITH list, process each WITH query and build an initplan - * SubPlan structure for it. - */ - if (parse->cteList) - SS_process_ctes(root); - - /* - * Look for ANY and EXISTS SubLinks in WHERE and JOIN/ON clauses, and try - * to transform them into joins. Note that this step does not descend - * into subqueries; if we pull up any subqueries below, their SubLinks are - * processed just before pulling them up. - */ - if (parse->hasSubLinks) - pull_up_sublinks(root); - - /* - * Scan the rangetable for set-returning functions, and inline them if - * possible (producing subqueries that might get pulled up next). - * Recursion issues here are handled in the same way as for SubLinks. - */ - inline_set_returning_functions(root); - - /* - * Check to see if any subqueries in the jointree can be merged into this - * query. - */ - pull_up_subqueries(root); - - /* - * If this is a simple UNION ALL query, flatten it into an appendrel. We - * do this now because it requires applying pull_up_subqueries to the leaf - * queries of the UNION ALL, which weren't touched above because they - * weren't referenced by the jointree (they will be after we do this). 
- */ - if (parse->setOperations) - flatten_simple_union_all(root); - - /* - * Detect whether any rangetable entries are RTE_JOIN kind; if not, we can - * avoid the expense of doing flatten_join_alias_vars(). Also check for - * outer joins --- if none, we can skip reduce_outer_joins(). And check - * for LATERAL RTEs, too. This must be done after we have done - * pull_up_subqueries(), of course. - */ - root->hasJoinRTEs = false; - root->hasLateralRTEs = false; - hasOuterJoins = false; - foreach(l, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_JOIN) - { - root->hasJoinRTEs = true; - if (IS_OUTER_JOIN(rte->jointype)) - hasOuterJoins = true; - } - if (rte->lateral) - root->hasLateralRTEs = true; - } - - /* - * Preprocess RowMark information. We need to do this after subquery - * pullup (so that all non-inherited RTEs are present) and before - * inheritance expansion (so that the info is available for - * expand_inherited_tables to examine and modify). - */ - preprocess_rowmarks(root); - - /* - * Expand any rangetable entries that are inheritance sets into "append - * relations". This can add entries to the rangetable, but they must be - * plain base relations not joins, so it's OK (and marginally more - * efficient) to do it after checking for join RTEs. We must do it after - * pulling up subqueries, else we'd fail to handle inherited tables in - * subqueries. - */ - expand_inherited_tables(root); - - /* - * Set hasHavingQual to remember if HAVING clause is present. Needed - * because preprocess_expression will reduce a constant-true condition to - * an empty qual list ... but "HAVING TRUE" is not a semantic no-op. - */ - root->hasHavingQual = (parse->havingQual != NULL); - - /* Clear this flag; might get set in distribute_qual_to_rels */ - root->hasPseudoConstantQuals = false; - - /* - * Do expression preprocessing on targetlist and quals, as well as other - * random expressions in the querytree. Note that we do not need to - * handle sort/group expressions explicitly, because they are actually - * part of the targetlist. - */ - parse->targetList = (List *) - preprocess_expression(root, (Node *) parse->targetList, - EXPRKIND_TARGET); + root->hasInheritedTarget = false; + root->hasRecursion = hasRecursion; + if (hasRecursion) + root->wt_param_id = SS_assign_special_param(root); + else + root->wt_param_id = -1; + root->non_recursive_path = NULL; + + /* + * If there is a WITH list, process each WITH query and either convert it + * to RTE_SUBQUERY RTE(s) or build an initplan SubPlan structure for it. + */ + if (parse->cteList) + SS_process_ctes(root); + + /* + * Look for ANY and EXISTS SubLinks in WHERE and JOIN/ON clauses, and try + * to transform them into joins. Note that this step does not descend + * into subqueries; if we pull up any subqueries below, their SubLinks are + * processed just before pulling them up. + */ + if (parse->hasSubLinks) + pull_up_sublinks(root); + + /* + * Scan the rangetable for set-returning functions, and inline them if + * possible (producing subqueries that might get pulled up next). + * Recursion issues here are handled in the same way as for SubLinks. + */ + inline_set_returning_functions(root); + + /* + * Check to see if any subqueries in the jointree can be merged into this + * query. + */ + pull_up_subqueries(root); + + /* + * If this is a simple UNION ALL query, flatten it into an appendrel. 
We + * do this now because it requires applying pull_up_subqueries to the leaf + * queries of the UNION ALL, which weren't touched above because they + * weren't referenced by the jointree (they will be after we do this). + */ + if (parse->setOperations) + flatten_simple_union_all(root); + + /* + * Detect whether any rangetable entries are RTE_JOIN kind; if not, we can + * avoid the expense of doing flatten_join_alias_vars(). Also check for + * outer joins --- if none, we can skip reduce_outer_joins(). And check + * for LATERAL RTEs, too. This must be done after we have done + * pull_up_subqueries(), of course. + */ + root->hasJoinRTEs = false; + root->hasLateralRTEs = false; + hasOuterJoins = false; + foreach(l, parse->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); - /* Constant-folding might have removed all set-returning functions */ - if (parse->hasTargetSRFs) - parse->hasTargetSRFs = expression_returns_set((Node *) parse->targetList); + if (rte->rtekind == RTE_JOIN) + { + root->hasJoinRTEs = true; + if (IS_OUTER_JOIN(rte->jointype)) + hasOuterJoins = true; + } + if (rte->lateral) + root->hasLateralRTEs = true; + } - newWithCheckOptions = NIL; - foreach(l, parse->withCheckOptions) - { - WithCheckOption *wco = (WithCheckOption *) lfirst(l); + /* + * Preprocess RowMark information. We need to do this after subquery + * pullup (so that all non-inherited RTEs are present) and before + * inheritance expansion (so that the info is available for + * expand_inherited_tables to examine and modify). + */ + preprocess_rowmarks(root); + + /* + * Expand any rangetable entries that are inheritance sets into "append + * relations". This can add entries to the rangetable, but they must be + * plain base relations not joins, so it's OK (and marginally more + * efficient) to do it after checking for join RTEs. We must do it after + * pulling up subqueries, else we'd fail to handle inherited tables in + * subqueries. + */ + expand_inherited_tables(root); + + /* + * Set hasHavingQual to remember if HAVING clause is present. Needed + * because preprocess_expression will reduce a constant-true condition to + * an empty qual list ... but "HAVING TRUE" is not a semantic no-op. + */ + root->hasHavingQual = (parse->havingQual != NULL); + + /* Clear this flag; might get set in distribute_qual_to_rels */ + root->hasPseudoConstantQuals = false; + + /* + * Do expression preprocessing on targetlist and quals, as well as other + * random expressions in the querytree. Note that we do not need to + * handle sort/group expressions explicitly, because they are actually + * part of the targetlist. 
+ */ + parse->targetList = (List *) + preprocess_expression(root, (Node *) parse->targetList, + EXPRKIND_TARGET); + + /* Constant-folding might have removed all set-returning functions */ + if (parse->hasTargetSRFs) + parse->hasTargetSRFs = expression_returns_set((Node *) parse->targetList); + + newWithCheckOptions = NIL; + foreach(l, parse->withCheckOptions) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(l); - wco->qual = preprocess_expression(root, wco->qual, - EXPRKIND_QUAL); - if (wco->qual != NULL) - newWithCheckOptions = lappend(newWithCheckOptions, wco); - } - parse->withCheckOptions = newWithCheckOptions; + wco->qual = preprocess_expression(root, wco->qual, + EXPRKIND_QUAL); + if (wco->qual != NULL) + newWithCheckOptions = lappend(newWithCheckOptions, wco); + } + parse->withCheckOptions = newWithCheckOptions; - parse->returningList = (List *) - preprocess_expression(root, (Node *) parse->returningList, - EXPRKIND_TARGET); + parse->returningList = (List *) + preprocess_expression(root, (Node *) parse->returningList, + EXPRKIND_TARGET); - preprocess_qual_conditions(root, (Node *) parse->jointree); + preprocess_qual_conditions(root, (Node *) parse->jointree); - parse->havingQual = preprocess_expression(root, parse->havingQual, - EXPRKIND_QUAL); + parse->havingQual = preprocess_expression(root, parse->havingQual, + EXPRKIND_QUAL); - foreach(l, parse->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); + foreach(l, parse->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); - /* partitionClause/orderClause are sort/group expressions */ - wc->startOffset = preprocess_expression(root, wc->startOffset, - EXPRKIND_LIMIT); - wc->endOffset = preprocess_expression(root, wc->endOffset, - EXPRKIND_LIMIT); - } + /* partitionClause/orderClause are sort/group expressions */ + wc->startOffset = preprocess_expression(root, wc->startOffset, + EXPRKIND_LIMIT); + wc->endOffset = preprocess_expression(root, wc->endOffset, + EXPRKIND_LIMIT); + } - parse->limitOffset = preprocess_expression(root, parse->limitOffset, - EXPRKIND_LIMIT); - parse->limitCount = preprocess_expression(root, parse->limitCount, - EXPRKIND_LIMIT); + parse->limitOffset = preprocess_expression(root, parse->limitOffset, + EXPRKIND_LIMIT); + parse->limitCount = preprocess_expression(root, parse->limitCount, + EXPRKIND_LIMIT); - if (parse->onConflict) - { - parse->onConflict->arbiterElems = (List *) - preprocess_expression(root, - (Node *) parse->onConflict->arbiterElems, - EXPRKIND_ARBITER_ELEM); - parse->onConflict->arbiterWhere = - preprocess_expression(root, - parse->onConflict->arbiterWhere, - EXPRKIND_QUAL); - parse->onConflict->onConflictSet = (List *) - preprocess_expression(root, - (Node *) parse->onConflict->onConflictSet, - EXPRKIND_TARGET); - parse->onConflict->onConflictWhere = - preprocess_expression(root, - parse->onConflict->onConflictWhere, - EXPRKIND_QUAL); + if (parse->onConflict) + { + parse->onConflict->arbiterElems = (List *) + preprocess_expression(root, + (Node *) parse->onConflict->arbiterElems, + EXPRKIND_ARBITER_ELEM); + parse->onConflict->arbiterWhere = + preprocess_expression(root, + parse->onConflict->arbiterWhere, + EXPRKIND_QUAL); + parse->onConflict->onConflictSet = (List *) + preprocess_expression(root, + (Node *) parse->onConflict->onConflictSet, + EXPRKIND_TARGET); + parse->onConflict->onConflictWhere = + preprocess_expression(root, + parse->onConflict->onConflictWhere, + EXPRKIND_QUAL); #ifdef _MLS_ { int rt_index; diff --git 
a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 4c357f3e..3a7f8ccf 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -126,6 +126,14 @@ typedef struct finalize_primnode_context Bitmapset *paramids; /* Non-local PARAM_EXEC paramids found */ } finalize_primnode_context; +typedef struct inline_cte_walker_context +{ + const char *ctename; /* name and relative level of target CTE */ + int levelsup; + int refcount; /* number of remaining references */ + Query *ctequery; /* query to substitute */ +} inline_cte_walker_context; + static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, @@ -144,6 +152,10 @@ static Node *convert_testexpr_mutator(Node *node, static bool subplan_is_hashable(Plan *plan); static bool testexpr_is_hashable(Node *testexpr); static bool hash_ok_operator(OpExpr *expr); +static bool contain_dml(Node *node); +static bool contain_dml_walker(Node *node, void *context); +static void inline_cte(PlannerInfo *root, CommonTableExpr *cte); +static bool inline_cte_walker(Node *node, inline_cte_walker_context *context); static bool simplify_EXISTS_query(PlannerInfo *root, Query *query); static Query *convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds); @@ -1235,76 +1247,117 @@ hash_ok_operator(OpExpr *expr) /* * SS_process_ctes: process a query's WITH list * - * We plan each interesting WITH item and convert it to an initplan. + * Consider each CTE in the WITH list and either ignore it (if it's an + * unreferenced SELECT), "inline" it to create a regular sub-SELECT-in-FROM, + * or convert it to an initplan. + * * A side effect is to fill in root->cte_plan_ids with a list that * parallels root->parse->cteList and provides the subplan ID for - * each CTE's initplan. + * each CTE's initplan, or a dummy ID (-1) if we didn't make an initplan. */ void SS_process_ctes(PlannerInfo *root) { - ListCell *lc; - - Assert(root->cte_plan_ids == NIL); + ListCell *lc; - foreach(lc, root->parse->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); - CmdType cmdType = ((Query *) cte->ctequery)->commandType; - Query *subquery; - PlannerInfo *subroot; - RelOptInfo *final_rel; - Path *best_path; - Plan *plan; - SubPlan *splan; - int paramid; - - /* - * Ignore SELECT CTEs that are not actually referenced anywhere. - */ - if (cte->cterefcount == 0 && cmdType == CMD_SELECT) - { - /* Make a dummy entry in cte_plan_ids */ - root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); - continue; - } - - /* - * Copy the source Query node. Probably not necessary, but let's keep - * this similar to make_subplan. - */ - subquery = (Query *) copyObject(cte->ctequery); - - /* plan_params should not be in use in current query level */ - Assert(root->plan_params == NIL); - - /* - * Generate Paths for the CTE query. Always plan for full retrieval - * --- we don't have enough info to predict otherwise. - */ - subroot = subquery_planner(root->glob, subquery, - root, - cte->cterecursive, 0.0); - - /* - * Since the current query level doesn't yet contain any RTEs, it - * should not be possible for the CTE to have requested parameters of - * this level. - */ - if (root->plan_params) - elog(ERROR, "unexpected outer reference in CTE query"); + Assert(root->cte_plan_ids == NIL); - /* - * Select best Path and turn it into a Plan. At least for now, there - * seems no reason to postpone doing that. 
- */ - final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); - best_path = final_rel->cheapest_total_path; + foreach(lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + CmdType cmdType = ((Query *) cte->ctequery)->commandType; + Query *subquery; + PlannerInfo *subroot; + RelOptInfo *final_rel; + Path *best_path; + Plan *plan; + SubPlan *splan; + int paramid; + + /* + * Ignore SELECT CTEs that are not actually referenced anywhere. + */ + if (cte->cterefcount == 0 && cmdType == CMD_SELECT) + { + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } - if (!subroot->distribution) - subroot->distribution = best_path->distribution; + /* + * Consider inlining the CTE (creating RTE_SUBQUERY RTE(s)) instead of + * implementing it as a separately-planned CTE. + * + * We cannot inline if any of these conditions hold: + * + * 1. The user said not to (the CTEMaterializeAlways option). + * + * 2. The CTE is recursive. + * + * 3. The CTE has side-effects; this includes either not being a plain + * SELECT, or containing volatile functions. Inlining might change + * the side-effects, which would be bad. + * + * Otherwise, we have an option whether to inline or not. That should + * always be a win if there's just a single reference, but if the CTE + * is multiply-referenced then it's unclear: inlining adds duplicate + * computations, but the ability to absorb restrictions from the outer + * query level could outweigh that. We do not have nearly enough + * information at this point to tell whether that's true, so we let + * the user express a preference. Our default behavior is to inline + * only singly-referenced CTEs, but a CTE marked CTEMaterializeNever + * will be inlined even if multiply referenced. + */ + if ((cte->ctematerialized == CTEMaterializeNever || + (cte->ctematerialized == CTEMaterializeDefault && + cte->cterefcount == 1)) && + !cte->cterecursive && + cmdType == CMD_SELECT && + !contain_dml(cte->ctequery) && + !contain_volatile_functions(cte->ctequery)) + { + inline_cte(root, cte); + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } - plan = create_plan(subroot, best_path); + /* + * Copy the source Query node. Probably not necessary, but let's keep + * this similar to make_subplan. + */ + subquery = (Query *) copyObject(cte->ctequery); + + /* plan_params should not be in use in current query level */ + Assert(root->plan_params == NIL); + + /* + * Generate Paths for the CTE query. Always plan for full retrieval + * --- we don't have enough info to predict otherwise. + */ + subroot = subquery_planner(root->glob, subquery, + root, + cte->cterecursive, 0.0); + + /* + * Since the current query level doesn't yet contain any RTEs, it + * should not be possible for the CTE to have requested parameters of + * this level. + */ + if (root->plan_params) + elog(ERROR, "unexpected outer reference in CTE query"); + + /* + * Select best Path and turn it into a Plan. At least for now, there + * seems no reason to postpone doing that. + */ + final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); + best_path = final_rel->cheapest_total_path; + + if (!subroot->distribution) + subroot->distribution = best_path->distribution; + + plan = create_plan(subroot, best_path); #ifdef XCP /* Add a remote subplan, if redistribution is needed. 
*/ @@ -1651,6 +1704,126 @@ add_vars_to_subquery_targetlist(Node *whereClause, Query *subselect, int rti #endif +/* + * contain_dml: is any subquery not a plain SELECT? + * + * We reject SELECT FOR UPDATE/SHARE as well as INSERT etc. + */ +static bool +contain_dml(Node *node) +{ + return contain_dml_walker(node, NULL); +} + +static bool +contain_dml_walker(Node *node, void *context) +{ + if (node == NULL) + return false; + if (IsA(node, Query)) + { + Query *query = (Query *) node; + + if (query->commandType != CMD_SELECT || + query->rowMarks != NIL) + return true; + + return query_tree_walker(query, contain_dml_walker, context, 0); + } + return expression_tree_walker(node, contain_dml_walker, context); +} + +/* + * inline_cte: convert RTE_CTE references to given CTE into RTE_SUBQUERYs + */ +static void +inline_cte(PlannerInfo *root, CommonTableExpr *cte) +{ + struct inline_cte_walker_context context; + + context.ctename = cte->ctename; + /* Start at levelsup = -1 because we'll immediately increment it */ + context.levelsup = -1; + context.refcount = cte->cterefcount; + context.ctequery = castNode(Query, cte->ctequery); + + (void) inline_cte_walker((Node *) root->parse, &context); + + /* Assert we replaced all references */ + Assert(context.refcount == 0); +} + +static bool +inline_cte_walker(Node *node, inline_cte_walker_context *context) +{ + if (node == NULL) + return false; + if (IsA(node, Query)) + { + Query *query = (Query *) node; + + context->levelsup++; + + /* + * Visit the query's RTE nodes after their contents; otherwise + * query_tree_walker would descend into the newly inlined CTE query, + * which we don't want. + */ + (void) query_tree_walker(query, inline_cte_walker, context, + QTW_EXAMINE_RTES_AFTER); + + context->levelsup--; + + return false; + } + else if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + if (rte->rtekind == RTE_CTE && + strcmp(rte->ctename, context->ctename) == 0 && + rte->ctelevelsup == context->levelsup) + { + /* + * Found a reference to replace. Generate a copy of the CTE query + * with appropriate level adjustment for outer references (e.g., + * to other CTEs). + */ + Query *newquery = copyObject(context->ctequery); + + if (context->levelsup > 0) + IncrementVarSublevelsUp((Node *) newquery, context->levelsup, 1); + + /* + * Convert the RTE_CTE RTE into a RTE_SUBQUERY. + * + * Historically, a FOR UPDATE clause has been treated as extending + * into views and subqueries, but not into CTEs. We preserve this + * distinction by not trying to push rowmarks into the new + * subquery. 
+ */ + rte->rtekind = RTE_SUBQUERY; + rte->subquery = newquery; + rte->security_barrier = false; + + /* Zero out CTE-specific fields */ + rte->ctename = NULL; + rte->ctelevelsup = 0; + rte->self_reference = false; + rte->coltypes = NIL; + rte->coltypmods = NIL; + rte->colcollations = NIL; + + /* Count the number of replacements we've done */ + context->refcount--; + } + + return false; + } + + return expression_tree_walker(node, inline_cte_walker, context); +} + /* * convert_ANY_sublink_to_join: try to convert an ANY SubLink to a join * diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 4c3768f9..ad22456b 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -507,7 +507,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type row explicit_row implicit_row type_list array_expr_list %type case_expr case_arg when_clause case_default %type when_clause_list -%type sub_type +%type sub_type opt_materialized %type NumericOnly %type NumericOnly_list %type alias_clause opt_alias_clause @@ -12750,17 +12750,24 @@ cte_list: | cte_list ',' common_table_expr { $$ = lappend($1, $3); } ; -common_table_expr: name opt_name_list AS '(' PreparableStmt ')' +common_table_expr: name opt_name_list AS opt_materialized '(' PreparableStmt ')' { CommonTableExpr *n = makeNode(CommonTableExpr); n->ctename = $1; n->aliascolnames = $2; - n->ctequery = $5; + n->ctematerialized = $4; + n->ctequery = $6; n->location = @1; $$ = (Node *) n; } ; +opt_materialized: + MATERIALIZED { $$ = CTEMaterializeAlways; } + | NOT MATERIALIZED { $$ = CTEMaterializeNever; } + | /*EMPTY*/ { $$ = CTEMaterializeDefault; } + ; + opt_with_clause: with_clause { $$ = $1; } | /*EMPTY*/ { $$ = NULL; } @@ -17827,6 +17834,7 @@ makeRecursiveViewSelect(char *relname, List *aliases, Node *query) /* create common table expression */ cte->ctename = relname; cte->aliascolnames = aliases; + cte->ctematerialized = CTEMaterializeDefault; cte->ctequery = query; cte->location = -1; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2a661f67..feb22b86 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1,12926 +1,12938 @@ -/*------------------------------------------------------------------------- - * - * ruleutils.c - * Functions to convert stored expressions/querytrees back to - * source text - * - * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
- * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/utils/adt/ruleutils.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include -#include -#include - -#ifdef PGXC -#include "access/reloptions.h" -#endif /* PGXC */ -#include "access/amapi.h" -#include "access/htup_details.h" -#include "access/sysattr.h" -#include "catalog/dependency.h" -#include "catalog/indexing.h" -#include "catalog/partition.h" -#include "catalog/pg_aggregate.h" -#include "catalog/pg_am.h" -#include "catalog/pg_authid.h" -#ifdef PGXC -#include "catalog/pg_aggregate.h" -#endif /* PGXC */ -#include "catalog/pg_collation.h" -#include "catalog/pg_constraint.h" -#include "catalog/pg_depend.h" -#include "catalog/pg_language.h" -#include "catalog/pg_opclass.h" -#include "catalog/pg_operator.h" -#include "catalog/pg_partitioned_table.h" -#include "catalog/pg_proc.h" -#include "catalog/pg_statistic_ext.h" -#include "catalog/pg_trigger.h" -#include "catalog/pg_type.h" -#include "commands/defrem.h" -#include "commands/tablespace.h" -#include "common/keywords.h" -#include "executor/spi.h" -#include "funcapi.h" -#ifdef PGXC -#include "nodes/execnodes.h" -#endif -#include "mb/pg_wchar.h" -#include "miscadmin.h" -#include "nodes/makefuncs.h" -#include "nodes/nodeFuncs.h" -#include "optimizer/tlist.h" -#include "parser/parse_node.h" -#include "parser/parse_agg.h" -#include "parser/parse_func.h" -#include "parser/parse_oper.h" -#include "parser/parse_type.h" -#include "parser/parser.h" -#include "parser/parsetree.h" -#ifdef PGXC -#include "pgxc/pgxc.h" -#include "pgxc/planner.h" -#endif -#include "rewrite/rewriteHandler.h" -#include "rewrite/rewriteManip.h" -#include "rewrite/rewriteSupport.h" -#include "utils/array.h" -#include "utils/builtins.h" -#include "utils/fmgroids.h" -#include "utils/hsearch.h" -#include "utils/lsyscache.h" -#include "utils/rel.h" -#include "utils/ruleutils.h" -#include "utils/snapmgr.h" -#include "utils/syscache.h" -#include "utils/tqual.h" -#include "utils/typcache.h" -#include "utils/varlena.h" -#include "utils/xml.h" -#ifdef __TBASE__ -#include "optimizer/planmain.h" -#endif -#ifdef __COLD_HOT__ -#include "postmaster/postmaster.h" -#endif - -/* ---------- - * Pretty formatting constants - * ---------- - */ - -/* Indent counts */ -#define PRETTYINDENT_STD 8 -#define PRETTYINDENT_JOIN 4 -#define PRETTYINDENT_VAR 4 - -#define PRETTYINDENT_LIMIT 40 /* wrap limit */ - -/* Pretty flags */ -#define PRETTYFLAG_PAREN 1 -#define PRETTYFLAG_INDENT 2 - -/* Default line length for pretty-print wrapping: 0 means wrap always */ -#define WRAP_COLUMN_DEFAULT 0 - -/* macro to test if pretty action needed */ -#define PRETTY_PAREN(context) ((context)->prettyFlags & PRETTYFLAG_PAREN) -#define PRETTY_INDENT(context) ((context)->prettyFlags & PRETTYFLAG_INDENT) - - -#ifdef __TBASE__ -static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; - -static struct pg_tm g_partition_base_time = { 0, - 0, - 0, - 1, - 1, /* origin 0, not 1 */ - 1970, /* relative to 1900 */ - 1, - 1, - 0, - 0, - NULL - }; -#endif - -/* ---------- - * Local data types - * ---------- - */ - -/* Context info needed for invoking a recursive querytree display routine */ -typedef struct -{ - StringInfo buf; /* output buffer to append to */ - List *namespaces; /* List of deparse_namespace nodes */ - List *windowClause; /* Current query level's WINDOW 
clause */ - List *windowTList; /* targetlist for resolving WINDOW clause */ - int prettyFlags; /* enabling of pretty-print functions */ - int wrapColumn; /* max line length, or -1 for no limit */ - int indentLevel; /* current indent level for prettyprint */ - bool varprefix; /* TRUE to print prefixes on Vars */ - ParseExprKind special_exprkind; /* set only for exprkinds needing special - * handling */ -#ifdef PGXC - bool finalise_aggs; /* should Datanode finalise the aggregates? */ - bool sortgroup_colno;/* instead of expression use resno for - * sortgrouprefs. - */ -#endif /* PGXC */ -} deparse_context; - -/* - * Each level of query context around a subtree needs a level of Var namespace. - * A Var having varlevelsup=N refers to the N'th item (counting from 0) in - * the current context's namespaces list. - * - * The rangetable is the list of actual RTEs from the query tree, and the - * cte list is the list of actual CTEs. - * - * rtable_names holds the alias name to be used for each RTE (either a C - * string, or NULL for nameless RTEs such as unnamed joins). - * rtable_columns holds the column alias names to be used for each RTE. - * - * In some cases we need to make names of merged JOIN USING columns unique - * across the whole query, not only per-RTE. If so, unique_using is TRUE - * and using_names is a list of C strings representing names already assigned - * to USING columns. - * - * When deparsing plan trees, there is always just a single item in the - * deparse_namespace list (since a plan tree never contains Vars with - * varlevelsup > 0). We store the PlanState node that is the immediate - * parent of the expression to be deparsed, as well as a list of that - * PlanState's ancestors. In addition, we store its outer and inner subplan - * state nodes, as well as their plan nodes' targetlists, and the index tlist - * if the current plan node might contain INDEX_VAR Vars. (These fields could - * be derived on-the-fly from the current PlanState, but it seems notationally - * clearer to set them up as separate fields.) - */ -typedef struct -{ - List *rtable; /* List of RangeTblEntry nodes */ - List *rtable_names; /* Parallel list of names for RTEs */ - List *rtable_columns; /* Parallel list of deparse_columns structs */ - List *ctes; /* List of CommonTableExpr nodes */ - /* Workspace for column alias assignment: */ - bool unique_using; /* Are we making USING names globally unique */ - List *using_names; /* List of assigned names for USING columns */ - /* Remaining fields are used only when deparsing a Plan tree: */ - PlanState *planstate; /* immediate parent of current expression */ - List *ancestors; /* ancestors of planstate */ - PlanState *outer_planstate; /* outer subplan state, or NULL if none */ - PlanState *inner_planstate; /* inner subplan state, or NULL if none */ - List *outer_tlist; /* referent for OUTER_VAR Vars */ - List *inner_tlist; /* referent for INNER_VAR Vars */ - List *index_tlist; /* referent for INDEX_VAR Vars */ -} deparse_namespace; - -/* - * Per-relation data about column alias names. - * - * Selecting aliases is unreasonably complicated because of the need to dump - * rules/views whose underlying tables may have had columns added, deleted, or - * renamed since the query was parsed. We must nonetheless print the rule/view - * in a form that can be reloaded and will produce the same results as before. - * - * For each RTE used in the query, we must assign column aliases that are - * unique within that RTE. 
SQL does not require this of the original query, - * but due to factors such as *-expansion we need to be able to uniquely - * reference every column in a decompiled query. As long as we qualify all - * column references, per-RTE uniqueness is sufficient for that. - * - * However, we can't ensure per-column name uniqueness for unnamed join RTEs, - * since they just inherit column names from their input RTEs, and we can't - * rename the columns at the join level. Most of the time this isn't an issue - * because we don't need to reference the join's output columns as such; we - * can reference the input columns instead. That approach can fail for merged - * JOIN USING columns, however, so when we have one of those in an unnamed - * join, we have to make that column's alias globally unique across the whole - * query to ensure it can be referenced unambiguously. - * - * Another problem is that a JOIN USING clause requires the columns to be - * merged to have the same aliases in both input RTEs, and that no other - * columns in those RTEs or their children conflict with the USING names. - * To handle that, we do USING-column alias assignment in a recursive - * traversal of the query's jointree. When descending through a JOIN with - * USING, we preassign the USING column names to the child columns, overriding - * other rules for column alias assignment. We also mark each RTE with a list - * of all USING column names selected for joins containing that RTE, so that - * when we assign other columns' aliases later, we can avoid conflicts. - * - * Another problem is that if a JOIN's input tables have had columns added or - * deleted since the query was parsed, we must generate a column alias list - * for the join that matches the current set of input columns --- otherwise, a - * change in the number of columns in the left input would throw off matching - * of aliases to columns of the right input. Thus, positions in the printable - * column alias list are not necessarily one-for-one with varattnos of the - * JOIN, so we need a separate new_colnames[] array for printing purposes. - */ -typedef struct -{ - /* - * colnames is an array containing column aliases to use for columns that - * existed when the query was parsed. Dropped columns have NULL entries. - * This array can be directly indexed by varattno to get a Var's name. - * - * Non-NULL entries are guaranteed unique within the RTE, *except* when - * this is for an unnamed JOIN RTE. In that case we merely copy up names - * from the two input RTEs. - * - * During the recursive descent in set_using_names(), forcible assignment - * of a child RTE's column name is represented by pre-setting that element - * of the child's colnames array. So at that stage, NULL entries in this - * array just mean that no name has been preassigned, not necessarily that - * the column is dropped. - */ - int num_cols; /* length of colnames[] array */ - char **colnames; /* array of C strings and NULLs */ - - /* - * new_colnames is an array containing column aliases to use for columns - * that would exist if the query was re-parsed against the current - * definitions of its base tables. This is what to print as the column - * alias list for the RTE. This array does not include dropped columns, - * but it will include columns added since original parsing. Indexes in - * it therefore have little to do with current varattno values. As above, - * entries are unique unless this is for an unnamed JOIN RTE. 
(In such an - * RTE, we never actually print this array, but we must compute it anyway - * for possible use in computing column names of upper joins.) The - * parallel array is_new_col marks which of these columns are new since - * original parsing. Entries with is_new_col false must match the - * non-NULL colnames entries one-for-one. - */ - int num_new_cols; /* length of new_colnames[] array */ - char **new_colnames; /* array of C strings */ - bool *is_new_col; /* array of bool flags */ - - /* This flag tells whether we should actually print a column alias list */ - bool printaliases; - - /* This list has all names used as USING names in joins above this RTE */ - List *parentUsing; /* names assigned to parent merged columns */ - - /* - * If this struct is for a JOIN RTE, we fill these fields during the - * set_using_names() pass to describe its relationship to its child RTEs. - * - * leftattnos and rightattnos are arrays with one entry per existing - * output column of the join (hence, indexable by join varattno). For a - * simple reference to a column of the left child, leftattnos[i] is the - * child RTE's attno and rightattnos[i] is zero; and conversely for a - * column of the right child. But for merged columns produced by JOIN - * USING/NATURAL JOIN, both leftattnos[i] and rightattnos[i] are nonzero. - * Also, if the column has been dropped, both are zero. - * - * If it's a JOIN USING, usingNames holds the alias names selected for the - * merged columns (these might be different from the original USING list, - * if we had to modify names to achieve uniqueness). - */ - int leftrti; /* rangetable index of left child */ - int rightrti; /* rangetable index of right child */ - int *leftattnos; /* left-child varattnos of join cols, or 0 */ - int *rightattnos; /* right-child varattnos of join cols, or 0 */ - List *usingNames; /* names assigned to merged columns */ -} deparse_columns; - -/* This macro is analogous to rt_fetch(), but for deparse_columns structs */ -#define deparse_columns_fetch(rangetable_index, dpns) \ - ((deparse_columns *) list_nth((dpns)->rtable_columns, (rangetable_index)-1)) - -/* - * Entry in set_rtable_names' hash table - */ -typedef struct -{ - char name[NAMEDATALEN]; /* Hash key --- must be first */ - int counter; /* Largest addition used so far for name */ -} NameHashEntry; - - -/* ---------- - * Global data - * ---------- - */ -static SPIPlanPtr plan_getrulebyoid = NULL; -static const char *query_getrulebyoid = "SELECT * FROM pg_catalog.pg_rewrite WHERE oid = $1"; -static SPIPlanPtr plan_getviewrule = NULL; -static const char *query_getviewrule = "SELECT * FROM pg_catalog.pg_rewrite WHERE ev_class = $1 AND rulename = $2"; - -/* GUC parameters */ -bool quote_all_identifiers = false; - - -/* ---------- - * Local functions - * - * Most of these functions used to use fixed-size buffers to build their - * results. Now, they take an (already initialized) StringInfo object - * as a parameter, and append their text output to its contents. 
- * ---------- - */ -static char *deparse_expression_pretty(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit, - int prettyFlags, int startIndent); -static char *pg_get_viewdef_worker(Oid viewoid, - int prettyFlags, int wrapColumn); -static char *pg_get_triggerdef_worker(Oid trigid, bool pretty); -static void decompile_column_index_array(Datum column_index_array, Oid relId, - StringInfo buf); -static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); -static char *pg_get_indexdef_worker(Oid indexrelid, int colno, - const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, - int prettyFlags, bool missing_ok); -static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); -static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, - bool attrsOnly, bool missing_ok); -static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, - int prettyFlags, bool missing_ok); -static text *pg_get_expr_worker(text *expr, Oid relid, const char *relname, - int prettyFlags); -static int print_function_arguments(StringInfo buf, HeapTuple proctup, - bool print_table_args, bool print_defaults); -static void print_function_rettype(StringInfo buf, HeapTuple proctup); -static void print_function_trftypes(StringInfo buf, HeapTuple proctup); -static void set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, - Bitmapset *rels_used); -static void set_deparse_for_query(deparse_namespace *dpns, Query *query, - List *parent_namespaces); -static void set_simple_column_names(deparse_namespace *dpns); -static bool has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode); -static void set_using_names(deparse_namespace *dpns, Node *jtnode, - List *parentUsing); -static void set_relation_column_names(deparse_namespace *dpns, - RangeTblEntry *rte, - deparse_columns *colinfo); -static void set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo); -static bool colname_is_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo); -static char *make_colname_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo); -static void expand_colnames_array_to(deparse_columns *colinfo, int n); -static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, - deparse_columns *colinfo); -static void flatten_join_using_qual(Node *qual, - List **leftvars, List **rightvars); -static char *get_rtable_name(int rtindex, deparse_context *context); -static void set_deparse_planstate(deparse_namespace *dpns, PlanState *ps); -#ifdef PGXC -static void set_deparse_plan(deparse_namespace *dpns, Plan *plan); -#endif -static void push_child_plan(deparse_namespace *dpns, PlanState *ps, - deparse_namespace *save_dpns); -static void pop_child_plan(deparse_namespace *dpns, - deparse_namespace *save_dpns); -static void push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, - deparse_namespace *save_dpns); -static void pop_ancestor_plan(deparse_namespace *dpns, - deparse_namespace *save_dpns); -static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags); -static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags, int wrapColumn); -static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, - int prettyFlags, int wrapColumn, int startIndent -#ifdef PGXC - , bool finalise_aggregates, bool sortgroup_colno -#endif /* PGXC */ - ); -static void get_values_def(List *values_lists, 
deparse_context *context); -static void get_with_clause(Query *query, deparse_context *context); -static void get_select_query_def(Query *query, deparse_context *context, - TupleDesc resultDesc); -static void get_insert_query_def(Query *query, deparse_context *context); -static void get_update_query_def(Query *query, deparse_context *context); -static void get_update_query_targetlist_def(Query *query, List *targetList, - deparse_context *context, - RangeTblEntry *rte); -static void get_delete_query_def(Query *query, deparse_context *context); -static void get_utility_query_def(Query *query, deparse_context *context); -static void get_basic_select_query(Query *query, deparse_context *context, - TupleDesc resultDesc); -static void get_target_list(List *targetList, deparse_context *context, - TupleDesc resultDesc); -static void get_setop_query(Node *setOp, Query *query, - deparse_context *context, - TupleDesc resultDesc); -static Node *get_rule_sortgroupclause(Index ref, List *tlist, - bool force_colno, - deparse_context *context); -static void get_rule_groupingset(GroupingSet *gset, List *targetlist, - bool omit_parens, deparse_context *context); -static void get_rule_orderby(List *orderList, List *targetList, - bool force_colno, deparse_context *context); -static void get_rule_windowclause(Query *query, deparse_context *context); -static void get_rule_windowspec(WindowClause *wc, List *targetList, - deparse_context *context); -static char *get_variable(Var *var, int levelsup, bool istoplevel, - deparse_context *context); -static void get_special_variable(Node *node, deparse_context *context, - void *private); -static void resolve_special_varno(Node *node, deparse_context *context, - void *private, - void (*callback) (Node *, deparse_context *, void *)); -static Node *find_param_referent(Param *param, deparse_context *context, - deparse_namespace **dpns_p, ListCell **ancestor_cell_p); -static void get_parameter(Param *param, deparse_context *context); -static const char *get_simple_binary_op_name(OpExpr *expr); -static bool isSimpleNode(Node *node, Node *parentNode, int prettyFlags); -static void appendContextKeyword(deparse_context *context, const char *str, - int indentBefore, int indentAfter, int indentPlus); -static void removeStringInfoSpaces(StringInfo str); -static void get_rule_expr(Node *node, deparse_context *context, - bool showimplicit); -static void get_rule_expr_toplevel(Node *node, deparse_context *context, - bool showimplicit); -static void get_rule_expr_funccall(Node *node, deparse_context *context, - bool showimplicit); -static bool looks_like_function(Node *node); -static void get_oper_expr(OpExpr *expr, deparse_context *context); -static void get_func_expr(FuncExpr *expr, deparse_context *context, - bool showimplicit); -static void get_agg_expr(Aggref *aggref, deparse_context *context, - Aggref *original_aggref); -static void get_agg_combine_expr(Node *node, deparse_context *context, - void *private); -static void get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context); -static void get_coercion_expr(Node *arg, deparse_context *context, - Oid resulttype, int32 resulttypmod, - Node *parentNode); -static void get_const_expr(Const *constval, deparse_context *context, - int showtype); -static void get_const_collation(Const *constval, deparse_context *context); -static void simple_quote_literal(StringInfo buf, const char *val); -static void get_sublink_expr(SubLink *sublink, deparse_context *context); -static void get_tablefunc(TableFunc *tf, deparse_context *context, 
- bool showimplicit); -static void get_from_clause(Query *query, const char *prefix, - deparse_context *context); -static void get_from_clause_item(Node *jtnode, Query *query, - deparse_context *context); -static void get_column_alias_list(deparse_columns *colinfo, - deparse_context *context); -static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, - deparse_columns *colinfo, - deparse_context *context); -static void get_tablesample_def(TableSampleClause *tablesample, - deparse_context *context); -static void get_opclass_name(Oid opclass, Oid actual_datatype, - StringInfo buf); -static Node *processIndirection(Node *node, deparse_context *context); -static void printSubscripts(ArrayRef *aref, deparse_context *context); -static char *get_relation_name(Oid relid); -static char *generate_relation_name(Oid relid, List *namespaces); -static char *generate_qualified_relation_name(Oid relid); -static char *generate_function_name(Oid funcid, int nargs, - List *argnames, Oid *argtypes, - bool has_variadic, bool *use_variadic_p, - ParseExprKind special_exprkind); -static char *generate_operator_name(Oid operid, Oid arg1, Oid arg2); -static text *string_to_text(char *str); -static char *flatten_reloptions(Oid relid); - -#ifdef __TBASE__ -static Bitmapset *pruning_walker(Relation rel, Node *expr); -static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); -static Bitmapset *get_full_pruning_result(Relation rel); -static int get_daysofmonth(int startmonth, int startday, - int endmonth, int endday); -#endif -#define only_marker(rte) ((rte)->inh ? "" : "ONLY ") - - -/* ---------- - * get_ruledef - Do it all and return a text - * that could be used as a statement - * to recreate the rule - * ---------- - */ -Datum -pg_get_ruledef(PG_FUNCTION_ARGS) -{ - Oid ruleoid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_ruledef_worker(ruleoid, prettyFlags); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_ruledef_ext(PG_FUNCTION_ARGS) -{ - Oid ruleoid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_ruledef_worker(ruleoid, prettyFlags); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -static char * -pg_get_ruledef_worker(Oid ruleoid, int prettyFlags) -{// #lizard forgives - Datum args[1]; - char nulls[1]; - int spirc; - HeapTuple ruletup; - TupleDesc rulettc; - StringInfoData buf; - - /* - * Do this first so that string is alloc'd in outer context not SPI's. - */ - initStringInfo(&buf); - - /* - * Connect to SPI manager - */ - if (SPI_connect() != SPI_OK_CONNECT) - elog(ERROR, "SPI_connect failed"); - - /* - * On the first call prepare the plan to lookup pg_rewrite. We read - * pg_rewrite over the SPI manager instead of using the syscache to be - * checked for read access on pg_rewrite. 
- */ - if (plan_getrulebyoid == NULL) - { - Oid argtypes[1]; - SPIPlanPtr plan; - - argtypes[0] = OIDOID; - plan = SPI_prepare(query_getrulebyoid, 1, argtypes); - if (plan == NULL) - elog(ERROR, "SPI_prepare failed for \"%s\"", query_getrulebyoid); - SPI_keepplan(plan); - plan_getrulebyoid = plan; - } - - /* - * Get the pg_rewrite tuple for this rule - */ - args[0] = ObjectIdGetDatum(ruleoid); - nulls[0] = ' '; - spirc = SPI_execute_plan(plan_getrulebyoid, args, nulls, true, 0); - if (spirc != SPI_OK_SELECT) - elog(ERROR, "failed to get pg_rewrite tuple for rule %u", ruleoid); - if (SPI_processed != 1) - { - /* - * There is no tuple data available here, just keep the output buffer - * empty. - */ - } - else - { - /* - * Get the rule's definition and put it into executor's memory - */ - ruletup = SPI_tuptable->vals[0]; - rulettc = SPI_tuptable->tupdesc; - make_ruledef(&buf, ruletup, rulettc, prettyFlags); - } - - /* - * Disconnect from SPI manager - */ - if (SPI_finish() != SPI_OK_FINISH) - elog(ERROR, "SPI_finish failed"); - - if (buf.len == 0) - return NULL; - - return buf.data; -} - - -/* ---------- - * get_viewdef - Mainly the same thing, but we - * only return the SELECT part of a view - * ---------- - */ -Datum -pg_get_viewdef(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_viewdef_ext(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_viewdef_wrap(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - int wrap = PG_GETARG_INT32(1); - int prettyFlags; - char *res; - - /* calling this implies we want pretty printing */ - prettyFlags = PRETTYFLAG_PAREN | PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, wrap); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_viewdef_name(PG_FUNCTION_ARGS) -{ - /* By qualified name */ - text *viewname = PG_GETARG_TEXT_PP(0); - int prettyFlags; - RangeVar *viewrel; - Oid viewoid; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - /* Look up view name. Can't lock it - we might not have privileges. */ - viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); - viewoid = RangeVarGetRelid(viewrel, NoLock, false); - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_viewdef_name_ext(PG_FUNCTION_ARGS) -{ - /* By qualified name */ - text *viewname = PG_GETARG_TEXT_PP(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - RangeVar *viewrel; - Oid viewoid; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - /* Look up view name. Can't lock it - we might not have privileges. 
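The by-name lookup above backs the text-argument variant of pg_get_viewdef; both it and the by-OID form are callable from SQL. A hedged example, using a hypothetical view name:

    SELECT pg_get_viewdef('public.my_view', true);       -- by qualified name, pretty-printed
    SELECT pg_get_viewdef('public.my_view'::regclass);   -- by OID, default formatting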
*/ - viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); - viewoid = RangeVarGetRelid(viewrel, NoLock, false); - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Common code for by-OID and by-name variants of pg_get_viewdef - */ -static char * -pg_get_viewdef_worker(Oid viewoid, int prettyFlags, int wrapColumn) -{// #lizard forgives - Datum args[2]; - char nulls[2]; - int spirc; - HeapTuple ruletup; - TupleDesc rulettc; - StringInfoData buf; - - /* - * Do this first so that string is alloc'd in outer context not SPI's. - */ - initStringInfo(&buf); - - /* - * Connect to SPI manager - */ - if (SPI_connect() != SPI_OK_CONNECT) - elog(ERROR, "SPI_connect failed"); - - /* - * On the first call prepare the plan to lookup pg_rewrite. We read - * pg_rewrite over the SPI manager instead of using the syscache to be - * checked for read access on pg_rewrite. - */ - if (plan_getviewrule == NULL) - { - Oid argtypes[2]; - SPIPlanPtr plan; - - argtypes[0] = OIDOID; - argtypes[1] = NAMEOID; - plan = SPI_prepare(query_getviewrule, 2, argtypes); - if (plan == NULL) - elog(ERROR, "SPI_prepare failed for \"%s\"", query_getviewrule); - SPI_keepplan(plan); - plan_getviewrule = plan; - } - - /* - * Get the pg_rewrite tuple for the view's SELECT rule - */ - args[0] = ObjectIdGetDatum(viewoid); - args[1] = DirectFunctionCall1(namein, CStringGetDatum(ViewSelectRuleName)); - nulls[0] = ' '; - nulls[1] = ' '; - spirc = SPI_execute_plan(plan_getviewrule, args, nulls, true, 0); - if (spirc != SPI_OK_SELECT) - elog(ERROR, "failed to get pg_rewrite tuple for view %u", viewoid); - if (SPI_processed != 1) - { - /* - * There is no tuple data available here, just keep the output buffer - * empty. 
- */ - } - else - { - /* - * Get the rule's definition and put it into executor's memory - */ - ruletup = SPI_tuptable->vals[0]; - rulettc = SPI_tuptable->tupdesc; - make_viewdef(&buf, ruletup, rulettc, prettyFlags, wrapColumn); - } - - /* - * Disconnect from SPI manager - */ - if (SPI_finish() != SPI_OK_FINISH) - elog(ERROR, "SPI_finish failed"); - - if (buf.len == 0) - return NULL; - - return buf.data; -} - -/* ---------- - * get_triggerdef - Get the definition of a trigger - * ---------- - */ -Datum -pg_get_triggerdef(PG_FUNCTION_ARGS) -{ - Oid trigid = PG_GETARG_OID(0); - char *res; - - res = pg_get_triggerdef_worker(trigid, false); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_triggerdef_ext(PG_FUNCTION_ARGS) -{ - Oid trigid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - char *res; - - res = pg_get_triggerdef_worker(trigid, pretty); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -static char * -pg_get_triggerdef_worker(Oid trigid, bool pretty) -{// #lizard forgives - HeapTuple ht_trig; - Form_pg_trigger trigrec; - StringInfoData buf; - Relation tgrel; - ScanKeyData skey[1]; - SysScanDesc tgscan; - int findx = 0; - char *tgname; - char *tgoldtable; - char *tgnewtable; - Oid argtypes[1]; /* dummy */ - Datum value; - bool isnull; - - /* - * Fetch the pg_trigger tuple by the Oid of the trigger - */ - tgrel = heap_open(TriggerRelationId, AccessShareLock); - - ScanKeyInit(&skey[0], - ObjectIdAttributeNumber, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(trigid)); - - tgscan = systable_beginscan(tgrel, TriggerOidIndexId, true, - NULL, 1, skey); - - ht_trig = systable_getnext(tgscan); - - if (!HeapTupleIsValid(ht_trig)) - { - systable_endscan(tgscan); - heap_close(tgrel, AccessShareLock); - return NULL; - } - - trigrec = (Form_pg_trigger) GETSTRUCT(ht_trig); - - /* - * Start the trigger definition. Note that the trigger's name should never - * be schema-qualified, but the trigger rel's name may be. - */ - initStringInfo(&buf); - - tgname = NameStr(trigrec->tgname); - appendStringInfo(&buf, "CREATE %sTRIGGER %s ", - OidIsValid(trigrec->tgconstraint) ? 
"CONSTRAINT " : "", - quote_identifier(tgname)); - - if (TRIGGER_FOR_BEFORE(trigrec->tgtype)) - appendStringInfoString(&buf, "BEFORE"); - else if (TRIGGER_FOR_AFTER(trigrec->tgtype)) - appendStringInfoString(&buf, "AFTER"); - else if (TRIGGER_FOR_INSTEAD(trigrec->tgtype)) - appendStringInfoString(&buf, "INSTEAD OF"); - else - elog(ERROR, "unexpected tgtype value: %d", trigrec->tgtype); - - if (TRIGGER_FOR_INSERT(trigrec->tgtype)) - { - appendStringInfoString(&buf, " INSERT"); - findx++; - } - if (TRIGGER_FOR_DELETE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR DELETE"); - else - appendStringInfoString(&buf, " DELETE"); - findx++; - } - if (TRIGGER_FOR_UPDATE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR UPDATE"); - else - appendStringInfoString(&buf, " UPDATE"); - findx++; - /* tgattr is first var-width field, so OK to access directly */ - if (trigrec->tgattr.dim1 > 0) - { - int i; - - appendStringInfoString(&buf, " OF "); - for (i = 0; i < trigrec->tgattr.dim1; i++) - { - char *attname; - - if (i > 0) - appendStringInfoString(&buf, ", "); - attname = get_relid_attribute_name(trigrec->tgrelid, - trigrec->tgattr.values[i]); - appendStringInfoString(&buf, quote_identifier(attname)); - } - } - } - if (TRIGGER_FOR_TRUNCATE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR TRUNCATE"); - else - appendStringInfoString(&buf, " TRUNCATE"); - findx++; - } - appendStringInfo(&buf, " ON %s ", - generate_relation_name(trigrec->tgrelid, NIL)); - - if (OidIsValid(trigrec->tgconstraint)) - { - if (OidIsValid(trigrec->tgconstrrelid)) - appendStringInfo(&buf, "FROM %s ", - generate_relation_name(trigrec->tgconstrrelid, NIL)); - if (!trigrec->tgdeferrable) - appendStringInfoString(&buf, "NOT "); - appendStringInfoString(&buf, "DEFERRABLE INITIALLY "); - if (trigrec->tginitdeferred) - appendStringInfoString(&buf, "DEFERRED "); - else - appendStringInfoString(&buf, "IMMEDIATE "); - } - - value = fastgetattr(ht_trig, Anum_pg_trigger_tgoldtable, - tgrel->rd_att, &isnull); - if (!isnull) - tgoldtable = NameStr(*((NameData *) DatumGetPointer(value))); - else - tgoldtable = NULL; - value = fastgetattr(ht_trig, Anum_pg_trigger_tgnewtable, - tgrel->rd_att, &isnull); - if (!isnull) - tgnewtable = NameStr(*((NameData *) DatumGetPointer(value))); - else - tgnewtable = NULL; - if (tgoldtable != NULL || tgnewtable != NULL) - { - appendStringInfoString(&buf, "REFERENCING "); - if (tgoldtable != NULL) - appendStringInfo(&buf, "OLD TABLE AS %s ", tgoldtable); - if (tgnewtable != NULL) - appendStringInfo(&buf, "NEW TABLE AS %s ", tgnewtable); - } - - if (TRIGGER_FOR_ROW(trigrec->tgtype)) - appendStringInfoString(&buf, "FOR EACH ROW "); - else - appendStringInfoString(&buf, "FOR EACH STATEMENT "); - - /* If the trigger has a WHEN qualification, add that */ - value = fastgetattr(ht_trig, Anum_pg_trigger_tgqual, - tgrel->rd_att, &isnull); - if (!isnull) - { - Node *qual; - char relkind; - deparse_context context; - deparse_namespace dpns; - RangeTblEntry *oldrte; - RangeTblEntry *newrte; - - appendStringInfoString(&buf, "WHEN ("); - - qual = stringToNode(TextDatumGetCString(value)); - - relkind = get_rel_relkind(trigrec->tgrelid); - - /* Build minimal OLD and NEW RTEs for the rel */ - oldrte = makeNode(RangeTblEntry); - oldrte->rtekind = RTE_RELATION; - oldrte->relid = trigrec->tgrelid; - oldrte->relkind = relkind; - oldrte->alias = makeAlias("old", NIL); - oldrte->eref = oldrte->alias; - oldrte->lateral = false; - oldrte->inh = false; - 
oldrte->inFromCl = true; - - newrte = makeNode(RangeTblEntry); - newrte->rtekind = RTE_RELATION; - newrte->relid = trigrec->tgrelid; - newrte->relkind = relkind; - newrte->alias = makeAlias("new", NIL); - newrte->eref = newrte->alias; - newrte->lateral = false; - newrte->inh = false; - newrte->inFromCl = true; - - /* Build two-element rtable */ - memset(&dpns, 0, sizeof(dpns)); - dpns.rtable = list_make2(oldrte, newrte); - dpns.ctes = NIL; - set_rtable_names(&dpns, NIL, NULL); - set_simple_column_names(&dpns); - - /* Set up context with one-deep namespace stack */ - context.buf = &buf; - context.namespaces = list_make1(&dpns); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = true; - context.prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = PRETTYINDENT_STD; - context.special_exprkind = EXPR_KIND_NONE; - - get_rule_expr(qual, &context, false); - - appendStringInfoString(&buf, ") "); - } - - appendStringInfo(&buf, "EXECUTE PROCEDURE %s(", - generate_function_name(trigrec->tgfoid, 0, - NIL, argtypes, - false, NULL, EXPR_KIND_NONE)); - - if (trigrec->tgnargs > 0) - { - char *p; - int i; - - value = fastgetattr(ht_trig, Anum_pg_trigger_tgargs, - tgrel->rd_att, &isnull); - if (isnull) - elog(ERROR, "tgargs is null for trigger %u", trigid); - p = (char *) VARDATA_ANY(DatumGetByteaPP(value)); - for (i = 0; i < trigrec->tgnargs; i++) - { - if (i > 0) - appendStringInfoString(&buf, ", "); - simple_quote_literal(&buf, p); - /* advance p to next string embedded in tgargs */ - while (*p) - p++; - p++; - } - } - - /* We deliberately do not put semi-colon at end */ - appendStringInfoChar(&buf, ')'); - - /* Clean up */ - systable_endscan(tgscan); - - heap_close(tgrel, AccessShareLock); - - return buf.data; -} - -/* ---------- - * get_indexdef - Get the definition of an index - * - * In the extended version, there is a colno argument as well as pretty bool. - * if colno == 0, we want a complete index definition. - * if colno > 0, we only want the Nth index key's variable or expression. - * - * Note that the SQL-function versions of this omit any info about the - * index tablespace; this is intentional because pg_dump wants it that way. - * However pg_get_indexdef_string() includes the index tablespace. - * ---------- - */ -Datum -pg_get_indexdef(PG_FUNCTION_ARGS) -{ - Oid indexrelid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, - prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_indexdef_ext(PG_FUNCTION_ARGS) -{ - Oid indexrelid = PG_GETARG_OID(0); - int32 colno = PG_GETARG_INT32(1); - bool pretty = PG_GETARG_BOOL(2); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal version for use by ALTER TABLE. - * Includes a tablespace clause in the result. - * Returns a palloc'd C string; no pretty-printing. 
- */ -char * -pg_get_indexdef_string(Oid indexrelid) -{ - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); -} - -/* Internal version that just reports the column definitions */ -char * -pg_get_indexdef_columns(Oid indexrelid, bool pretty) -{ - int prettyFlags; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, - prettyFlags, false); -} - -/* - * Internal workhorse to decompile an index definition. - * - * This is now used for exclusion constraints as well: if excludeOps is not - * NULL then it points to an array of exclusion operator OIDs. - */ -static char * -pg_get_indexdef_worker(Oid indexrelid, int colno, - const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, - int prettyFlags, bool missing_ok) -{// #lizard forgives - /* might want a separate isConstraint parameter later */ - bool isConstraint = (excludeOps != NULL); - HeapTuple ht_idx; - HeapTuple ht_idxrel; - HeapTuple ht_am; - Form_pg_index idxrec; - Form_pg_class idxrelrec; - Form_pg_am amrec; - IndexAmRoutine *amroutine; - List *indexprs; - ListCell *indexpr_item; - List *context; - Oid indrelid; - int keyno; - Datum indcollDatum; - Datum indclassDatum; - Datum indoptionDatum; - bool isnull; - oidvector *indcollation; - oidvector *indclass; - int2vector *indoption; - StringInfoData buf; - char *str; - char *sep; -#ifdef __TBASE__ - bool is_interval_child = false; - HeapTuple ht_parent_idx; -#endif - /* - * Fetch the pg_index tuple by the Oid of the index - */ - ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexrelid)); - if (!HeapTupleIsValid(ht_idx)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for index %u", indexrelid); - } - idxrec = (Form_pg_index) GETSTRUCT(ht_idx); - - indrelid = idxrec->indrelid; - Assert(indexrelid == idxrec->indexrelid); - - /* Must get indcollation, indclass, and indoption the hard way */ - indcollDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indcollation, &isnull); - Assert(!isnull); - indcollation = (oidvector *) DatumGetPointer(indcollDatum); - - indclassDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indclass, &isnull); - Assert(!isnull); - indclass = (oidvector *) DatumGetPointer(indclassDatum); - - indoptionDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indoption, &isnull); - Assert(!isnull); - indoption = (int2vector *) DatumGetPointer(indoptionDatum); - - /* - * Fetch the pg_class tuple of the index relation - */ - ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexrelid)); - if (!HeapTupleIsValid(ht_idxrel)) - elog(ERROR, "cache lookup failed for relation %u", indexrelid); - idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); - - /* - * Fetch the pg_am tuple of the index' access method - */ - ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(idxrelrec->relam)); - if (!HeapTupleIsValid(ht_am)) - elog(ERROR, "cache lookup failed for access method %u", - idxrelrec->relam); - amrec = (Form_pg_am) GETSTRUCT(ht_am); - - /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); - - /* - * Get the index expressions, if any. (NOTE: we do not use the relcache - * versions of the expressions and predicate, because we want to display - * non-const-folded expressions.) 
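For context, the worker being assembled here ultimately serves the SQL-callable pg_get_indexdef functions; typical calls, using a hypothetical index name, look like:

    SELECT pg_get_indexdef('my_index'::regclass);            -- full CREATE INDEX statement
    SELECT pg_get_indexdef('my_index'::regclass, 1, true);   -- only the first key column, pretty-printed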
- */ - if (!heap_attisnull(ht_idx, Anum_pg_index_indexprs, NULL)) - { - Datum exprsDatum; - bool isnull; - char *exprsString; - - exprsDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indexprs, &isnull); - Assert(!isnull); - exprsString = TextDatumGetCString(exprsDatum); - indexprs = (List *) stringToNode(exprsString); - pfree(exprsString); - } - else - indexprs = NIL; - - indexpr_item = list_head(indexprs); - - context = deparse_context_for(get_relation_name(indrelid), indrelid); - - /* - * Start the index definition. Note that the index's name should never be - * schema-qualified, but the indexed rel's name may be. - */ - initStringInfo(&buf); - - if (!attrsOnly) - { - if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", - idxrec->indisunique ? "UNIQUE " : "", - quote_identifier(NameStr(idxrelrec->relname)), - generate_relation_name(indrelid, NIL), - quote_identifier(NameStr(amrec->amname))); - else /* currently, must be EXCLUDE constraint */ - appendStringInfo(&buf, "EXCLUDE USING %s (", - quote_identifier(NameStr(amrec->amname))); - } - - /* - * Report the indexed attributes - */ -#ifdef __TBASE__ - { - Relation rel = relation_open(indrelid, NoLock); - if (rel->rd_rel->relkind == RELKIND_RELATION && RELATION_IS_CHILD(rel)) - { - Oid parentIndexId = get_interval_parent_relid(indexrelid); - Oid parentId = get_interval_parent_relid(indrelid); - if (!OidIsValid(parentId)) - { - elog(ERROR, "could not get interval parent for relation %u", - indrelid); - } - indrelid = parentId; - - if (OidIsValid(parentIndexId)) - { - ht_parent_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(parentIndexId)); - if (!HeapTupleIsValid(ht_parent_idx)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for index %u", parentIndexId); - } - idxrec = (Form_pg_index) GETSTRUCT(ht_parent_idx); - is_interval_child = true; - } - } - heap_close(rel, NoLock); - } -#endif - sep = ""; - for (keyno = 0; keyno < idxrec->indnatts; keyno++) - { - AttrNumber attnum = idxrec->indkey.values[keyno]; - int16 opt = indoption->values[keyno]; - Oid keycoltype; - Oid keycolcollation; - - if (!colno) - appendStringInfoString(&buf, sep); - sep = ", "; - - if (attnum != 0) - { - /* Simple index column */ - char *attname; - int32 keycoltypmod; - - attname = get_relid_attribute_name(indrelid, attnum); - if (!colno || colno == keyno + 1) - appendStringInfoString(&buf, quote_identifier(attname)); - get_atttypetypmodcoll(indrelid, attnum, - &keycoltype, &keycoltypmod, - &keycolcollation); - } - else - { - /* expressional index */ - Node *indexkey; - - if (indexpr_item == NULL) - elog(ERROR, "too few entries in indexprs list"); - indexkey = (Node *) lfirst(indexpr_item); - indexpr_item = lnext(indexpr_item); - /* Deparse */ - str = deparse_expression_pretty(indexkey, context, false, false, - prettyFlags, 0); - if (!colno || colno == keyno + 1) - { - /* Need parens if it's not a bare function call */ - if (looks_like_function(indexkey)) - appendStringInfoString(&buf, str); - else - appendStringInfo(&buf, "(%s)", str); - } - keycoltype = exprType(indexkey); - keycolcollation = exprCollation(indexkey); - } - - if (!attrsOnly && (!colno || colno == keyno + 1)) - { - Oid indcoll; - - /* Add collation, if not default for column */ - indcoll = indcollation->values[keyno]; - if (OidIsValid(indcoll) && indcoll != keycolcollation) - appendStringInfo(&buf, " COLLATE %s", - generate_collation_name((indcoll))); - - /* Add the operator class name, if not default */ - 
get_opclass_name(indclass->values[keyno], keycoltype, &buf); - - /* Add options if relevant */ - if (amroutine->amcanorder) - { - /* if it supports sort ordering, report DESC and NULLS opts */ - if (opt & INDOPTION_DESC) - { - appendStringInfoString(&buf, " DESC"); - /* NULLS FIRST is the default in this case */ - if (!(opt & INDOPTION_NULLS_FIRST)) - appendStringInfoString(&buf, " NULLS LAST"); - } - else - { - if (opt & INDOPTION_NULLS_FIRST) - appendStringInfoString(&buf, " NULLS FIRST"); - } - } - - /* Add the exclusion operator if relevant */ - if (excludeOps != NULL) - appendStringInfo(&buf, " WITH %s", - generate_operator_name(excludeOps[keyno], - keycoltype, - keycoltype)); - } - } - - if (!attrsOnly) - { - appendStringInfoChar(&buf, ')'); - - /* - * If it has options, append "WITH (options)" - */ - str = flatten_reloptions(indexrelid); - if (str) - { - appendStringInfo(&buf, " WITH (%s)", str); - pfree(str); - } - - /* - * Print tablespace, but only if requested - */ - if (showTblSpc) - { - Oid tblspc; - - tblspc = get_rel_tablespace(indexrelid); - if (!OidIsValid(tblspc)) - tblspc = MyDatabaseTableSpace; - if (isConstraint) - appendStringInfoString(&buf, " USING INDEX"); - appendStringInfo(&buf, " TABLESPACE %s", - quote_identifier(get_tablespace_name(tblspc))); - } - - /* - * If it's a partial index, decompile and append the predicate - */ - if (!heap_attisnull(ht_idx, Anum_pg_index_indpred, NULL)) - { - Node *node; - Datum predDatum; - bool isnull; - char *predString; - - /* Convert text string to node tree */ - predDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indpred, &isnull); - Assert(!isnull); - predString = TextDatumGetCString(predDatum); - node = (Node *) stringToNode(predString); - pfree(predString); - - /* Deparse */ - str = deparse_expression_pretty(node, context, false, false, - prettyFlags, 0); - if (isConstraint) - appendStringInfo(&buf, " WHERE (%s)", str); - else - appendStringInfo(&buf, " WHERE %s", str); - } - } - - /* Clean up */ - ReleaseSysCache(ht_idx); - ReleaseSysCache(ht_idxrel); - ReleaseSysCache(ht_am); -#ifdef __TBASE__ - if (is_interval_child) - { - ReleaseSysCache(ht_parent_idx); - } -#endif - return buf.data; -} - -/* - * pg_get_statisticsobjdef - * Get the definition of an extended statistics object - */ -Datum -pg_get_statisticsobjdef(PG_FUNCTION_ARGS) -{ - Oid statextid = PG_GETARG_OID(0); - char *res; - - res = pg_get_statisticsobj_worker(statextid, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal workhorse to decompile an extended statistics object. - */ -static char * -pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) -{// #lizard forgives - Form_pg_statistic_ext statextrec; - HeapTuple statexttup; - StringInfoData buf; - int colno; - char *nsp; - ArrayType *arr; - char *enabled; - Datum datum; - bool isnull; - bool ndistinct_enabled; - bool dependencies_enabled; - int i; - - statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); - - if (!HeapTupleIsValid(statexttup)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for statistics object %u", statextid); - } - - statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); - - initStringInfo(&buf); - - nsp = get_namespace_name(statextrec->stxnamespace); - appendStringInfo(&buf, "CREATE STATISTICS %s", - quote_qualified_identifier(nsp, - NameStr(statextrec->stxname))); - - /* - * Decode the stxkind column so that we know which stats types to print. 
- */ - datum = SysCacheGetAttr(STATEXTOID, statexttup, - Anum_pg_statistic_ext_stxkind, &isnull); - Assert(!isnull); - arr = DatumGetArrayTypeP(datum); - if (ARR_NDIM(arr) != 1 || - ARR_HASNULL(arr) || - ARR_ELEMTYPE(arr) != CHAROID) - elog(ERROR, "stxkind is not a 1-D char array"); - enabled = (char *) ARR_DATA_PTR(arr); - - ndistinct_enabled = false; - dependencies_enabled = false; - - for (i = 0; i < ARR_DIMS(arr)[0]; i++) - { - if (enabled[i] == STATS_EXT_NDISTINCT) - ndistinct_enabled = true; - if (enabled[i] == STATS_EXT_DEPENDENCIES) - dependencies_enabled = true; - } - - /* - * If any option is disabled, then we'll need to append the types clause - * to show which options are enabled. We omit the types clause on purpose - * when all options are enabled, so a pg_dump/pg_restore will create all - * statistics types on a newer postgres version, if the statistics had all - * options enabled on the original version. - */ - if (!ndistinct_enabled || !dependencies_enabled) - { - appendStringInfoString(&buf, " ("); - if (ndistinct_enabled) - appendStringInfoString(&buf, "ndistinct"); - else if (dependencies_enabled) - appendStringInfoString(&buf, "dependencies"); - appendStringInfoChar(&buf, ')'); - } - - appendStringInfoString(&buf, " ON "); - - for (colno = 0; colno < statextrec->stxkeys.dim1; colno++) - { - AttrNumber attnum = statextrec->stxkeys.values[colno]; - char *attname; - - if (colno > 0) - appendStringInfoString(&buf, ", "); - - attname = get_relid_attribute_name(statextrec->stxrelid, attnum); - - appendStringInfoString(&buf, quote_identifier(attname)); - } - - appendStringInfo(&buf, " FROM %s", - generate_relation_name(statextrec->stxrelid, NIL)); - - ReleaseSysCache(statexttup); - - return buf.data; -} - -/* - * pg_get_partkeydef - * - * Returns the partition key specification, ie, the following: - * - * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...]) - */ -Datum -pg_get_partkeydef(PG_FUNCTION_ARGS) -{ - Oid relid = PG_GETARG_OID(0); - char *res; - - res = pg_get_partkeydef_worker(relid, PRETTYFLAG_INDENT, false, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* Internal version that just reports the column definitions */ -char * -pg_get_partkeydef_columns(Oid relid, bool pretty) -{ - int prettyFlags; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_partkeydef_worker(relid, prettyFlags, true, false); -} - -/* - * Internal workhorse to decompile a partition key definition. 
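The partition-key worker that follows backs pg_get_partkeydef(); a sketch of its use, with a hypothetical partitioned table:

    SELECT pg_get_partkeydef('my_parted_table'::regclass);
    -- yields text of the form documented above, e.g. PARTITION BY RANGE (some_column)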
- */ -static char * -pg_get_partkeydef_worker(Oid relid, int prettyFlags, - bool attrsOnly, bool missing_ok) -{// #lizard forgives - Form_pg_partitioned_table form; - HeapTuple tuple; - oidvector *partclass; - oidvector *partcollation; - List *partexprs; - ListCell *partexpr_item; - List *context; - Datum datum; - bool isnull; - StringInfoData buf; - int keyno; - char *str; - char *sep; - - tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for partition key of %u", relid); - } - - form = (Form_pg_partitioned_table) GETSTRUCT(tuple); - - Assert(form->partrelid == relid); - - /* Must get partclass and partcollation the hard way */ - datum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partclass, &isnull); - Assert(!isnull); - partclass = (oidvector *) DatumGetPointer(datum); - - datum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partcollation, &isnull); - Assert(!isnull); - partcollation = (oidvector *) DatumGetPointer(datum); - - - /* - * Get the expressions, if any. (NOTE: we do not use the relcache - * versions of the expressions, because we want to display - * non-const-folded expressions.) - */ - if (!heap_attisnull(tuple, Anum_pg_partitioned_table_partexprs, NULL)) - { - Datum exprsDatum; - bool isnull; - char *exprsString; - - exprsDatum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partexprs, &isnull); - Assert(!isnull); - exprsString = TextDatumGetCString(exprsDatum); - partexprs = (List *) stringToNode(exprsString); - - if (!IsA(partexprs, List)) - elog(ERROR, "unexpected node type found in partexprs: %d", - (int) nodeTag(partexprs)); - - pfree(exprsString); - } - else - partexprs = NIL; - - partexpr_item = list_head(partexprs); - context = deparse_context_for(get_relation_name(relid), relid); - - initStringInfo(&buf); - - switch (form->partstrat) - { - case PARTITION_STRATEGY_LIST: - if (!attrsOnly) - appendStringInfo(&buf, "LIST"); - break; - case PARTITION_STRATEGY_RANGE: - if (!attrsOnly) - appendStringInfo(&buf, "RANGE"); - break; - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) form->partstrat); - } - - if (!attrsOnly) - appendStringInfo(&buf, " ("); - sep = ""; - for (keyno = 0; keyno < form->partnatts; keyno++) - { - AttrNumber attnum = form->partattrs.values[keyno]; - Oid keycoltype; - Oid keycolcollation; - Oid partcoll; - - appendStringInfoString(&buf, sep); - sep = ", "; - if (attnum != 0) - { - /* Simple attribute reference */ - char *attname; - int32 keycoltypmod; - - attname = get_relid_attribute_name(relid, attnum); - appendStringInfoString(&buf, quote_identifier(attname)); - get_atttypetypmodcoll(relid, attnum, - &keycoltype, &keycoltypmod, - &keycolcollation); - } - else - { - /* Expression */ - Node *partkey; - - if (partexpr_item == NULL) - elog(ERROR, "too few entries in partexprs list"); - partkey = (Node *) lfirst(partexpr_item); - partexpr_item = lnext(partexpr_item); - - /* Deparse */ - str = deparse_expression_pretty(partkey, context, false, false, - prettyFlags, 0); - /* Need parens if it's not a bare function call */ - if (looks_like_function(partkey)) - appendStringInfoString(&buf, str); - else - appendStringInfo(&buf, "(%s)", str); - - keycoltype = exprType(partkey); - keycolcollation = exprCollation(partkey); - } - - /* Add collation, if not default for column */ - partcoll = partcollation->values[keyno]; - if (!attrsOnly && OidIsValid(partcoll) && partcoll != 
keycolcollation) - appendStringInfo(&buf, " COLLATE %s", - generate_collation_name((partcoll))); - - /* Add the operator class name, if not default */ - if (!attrsOnly) - get_opclass_name(partclass->values[keyno], keycoltype, &buf); - } - - if (!attrsOnly) - appendStringInfoChar(&buf, ')'); - - /* Clean up */ - ReleaseSysCache(tuple); - - return buf.data; -} - -/* - * pg_get_partition_constraintdef - * - * Returns partition constraint expression as a string for the input relation - */ -Datum -pg_get_partition_constraintdef(PG_FUNCTION_ARGS) -{ - Oid relationId = PG_GETARG_OID(0); - Expr *constr_expr; - int prettyFlags; - List *context; - char *consrc; - - constr_expr = get_partition_qual_relid(relationId); - - /* Quick exit if not a partition */ - if (constr_expr == NULL) - PG_RETURN_NULL(); - - /* - * Deparse and return the constraint expression. - */ - prettyFlags = PRETTYFLAG_INDENT; - context = deparse_context_for(get_relation_name(relationId), relationId); - consrc = deparse_expression_pretty((Node *) constr_expr, context, false, - false, prettyFlags, 0); - - PG_RETURN_TEXT_P(string_to_text(consrc)); -} - -/* - * pg_get_constraintdef - * - * Returns the definition for the constraint, ie, everything that needs to - * appear after "ALTER TABLE ... ADD CONSTRAINT ". - */ -Datum -pg_get_constraintdef(PG_FUNCTION_ARGS) -{ - Oid constraintId = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_constraintdef_ext(PG_FUNCTION_ARGS) -{ - Oid constraintId = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal version that returns a full ALTER TABLE ... ADD CONSTRAINT command - */ -char * -pg_get_constraintdef_command(Oid constraintId) -{ - return pg_get_constraintdef_worker(constraintId, true, 0, false); -} - -/* - * As of 9.4, we now use an MVCC snapshot for this. - */ -static char * -pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, - int prettyFlags, bool missing_ok) -{// #lizard forgives - HeapTuple tup; - Form_pg_constraint conForm; - StringInfoData buf; - SysScanDesc scandesc; - ScanKeyData scankey[1]; - Snapshot snapshot = RegisterSnapshot(GetTransactionSnapshot()); - Relation relation = heap_open(ConstraintRelationId, AccessShareLock); - - ScanKeyInit(&scankey[0], - ObjectIdAttributeNumber, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(constraintId)); - - scandesc = systable_beginscan(relation, - ConstraintOidIndexId, - true, - snapshot, - 1, - scankey); - - /* - * We later use the tuple with SysCacheGetAttr() as if we had obtained it - * via SearchSysCache, which works fine. 
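The constraint decompiler being set up here is what a catalog query like the following exercises (table name hypothetical):

    SELECT conname, pg_get_constraintdef(oid, true)
    FROM   pg_constraint
    WHERE  conrelid = 'my_table'::regclass;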
- */ - tup = systable_getnext(scandesc); - - UnregisterSnapshot(snapshot); - - if (!HeapTupleIsValid(tup)) - { - if (missing_ok) - { - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - return NULL; - } - elog(ERROR, "could not find tuple for constraint %u", constraintId); - } - - conForm = (Form_pg_constraint) GETSTRUCT(tup); - - initStringInfo(&buf); - - if (fullCommand) - { - /* - * Currently, callers want ALTER TABLE (without ONLY) for CHECK - * constraints, and other types of constraints don't inherit anyway so - * it doesn't matter whether we say ONLY or not. Someday we might - * need to let callers specify whether to put ONLY in the command. - */ - appendStringInfo(&buf, "ALTER TABLE %s ADD CONSTRAINT %s ", - generate_qualified_relation_name(conForm->conrelid), - quote_identifier(NameStr(conForm->conname))); - } - - switch (conForm->contype) - { - case CONSTRAINT_FOREIGN: - { - Datum val; - bool isnull; - const char *string; - - /* Start off the constraint definition */ - appendStringInfoString(&buf, "FOREIGN KEY ("); - - /* Fetch and build referencing-column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conkey, &isnull); - if (isnull) - elog(ERROR, "null conkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->conrelid, &buf); - - /* add foreign relation name */ - appendStringInfo(&buf, ") REFERENCES %s(", - generate_relation_name(conForm->confrelid, - NIL)); - - /* Fetch and build referenced-column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_confkey, &isnull); - if (isnull) - elog(ERROR, "null confkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->confrelid, &buf); - - appendStringInfoChar(&buf, ')'); - - /* Add match type */ - switch (conForm->confmatchtype) - { - case FKCONSTR_MATCH_FULL: - string = " MATCH FULL"; - break; - case FKCONSTR_MATCH_PARTIAL: - string = " MATCH PARTIAL"; - break; - case FKCONSTR_MATCH_SIMPLE: - string = ""; - break; - default: - elog(ERROR, "unrecognized confmatchtype: %d", - conForm->confmatchtype); - string = ""; /* keep compiler quiet */ - break; - } - appendStringInfoString(&buf, string); - - /* Add ON UPDATE and ON DELETE clauses, if needed */ - switch (conForm->confupdtype) - { - case FKCONSTR_ACTION_NOACTION: - string = NULL; /* suppress default */ - break; - case FKCONSTR_ACTION_RESTRICT: - string = "RESTRICT"; - break; - case FKCONSTR_ACTION_CASCADE: - string = "CASCADE"; - break; - case FKCONSTR_ACTION_SETNULL: - string = "SET NULL"; - break; - case FKCONSTR_ACTION_SETDEFAULT: - string = "SET DEFAULT"; - break; - default: - elog(ERROR, "unrecognized confupdtype: %d", - conForm->confupdtype); - string = NULL; /* keep compiler quiet */ - break; - } - if (string) - appendStringInfo(&buf, " ON UPDATE %s", string); - - switch (conForm->confdeltype) - { - case FKCONSTR_ACTION_NOACTION: - string = NULL; /* suppress default */ - break; - case FKCONSTR_ACTION_RESTRICT: - string = "RESTRICT"; - break; - case FKCONSTR_ACTION_CASCADE: - string = "CASCADE"; - break; - case FKCONSTR_ACTION_SETNULL: - string = "SET NULL"; - break; - case FKCONSTR_ACTION_SETDEFAULT: - string = "SET DEFAULT"; - break; - default: - elog(ERROR, "unrecognized confdeltype: %d", - conForm->confdeltype); - string = NULL; /* keep compiler quiet */ - break; - } - if (string) - appendStringInfo(&buf, " ON DELETE %s", string); - - break; - } - case CONSTRAINT_PRIMARY: - case CONSTRAINT_UNIQUE: - { - Datum val; - bool isnull; - Oid indexId; - - /* 
Start off the constraint definition */ - if (conForm->contype == CONSTRAINT_PRIMARY) - appendStringInfoString(&buf, "PRIMARY KEY ("); - else - appendStringInfoString(&buf, "UNIQUE ("); - - /* Fetch and build target column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conkey, &isnull); - if (isnull) - elog(ERROR, "null conkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->conrelid, &buf); - - appendStringInfoChar(&buf, ')'); - - indexId = get_constraint_index(constraintId); - - /* XXX why do we only print these bits if fullCommand? */ - if (fullCommand && OidIsValid(indexId)) - { - char *options = flatten_reloptions(indexId); - Oid tblspc; - - if (options) - { - appendStringInfo(&buf, " WITH (%s)", options); - pfree(options); - } - - tblspc = get_rel_tablespace(indexId); - if (OidIsValid(tblspc)) - appendStringInfo(&buf, " USING INDEX TABLESPACE %s", - quote_identifier(get_tablespace_name(tblspc))); - } - - break; - } - case CONSTRAINT_CHECK: - { - Datum val; - bool isnull; - char *conbin; - char *consrc; - Node *expr; - List *context; - - /* Fetch constraint expression in parsetree form */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conbin, &isnull); - if (isnull) - elog(ERROR, "null conbin for constraint %u", - constraintId); - - conbin = TextDatumGetCString(val); - expr = stringToNode(conbin); - - /* Set up deparsing context for Var nodes in constraint */ - if (conForm->conrelid != InvalidOid) - { - /* relation constraint */ - context = deparse_context_for(get_relation_name(conForm->conrelid), - conForm->conrelid); - } - else - { - /* domain constraint --- can't have Vars */ - context = NIL; - } - - consrc = deparse_expression_pretty(expr, context, false, false, - prettyFlags, 0); - - /* - * Now emit the constraint definition, adding NO INHERIT if - * necessary. - * - * There are cases where the constraint expression will be - * fully parenthesized and we don't need the outer parens ... - * but there are other cases where we do need 'em. Be - * conservative for now. - * - * Note that simply checking for leading '(' and trailing ')' - * would NOT be good enough, consider "(x > 0) AND (y > 0)". - */ - appendStringInfo(&buf, "CHECK (%s)%s", - consrc, - conForm->connoinherit ? " NO INHERIT" : ""); - break; - } - case CONSTRAINT_TRIGGER: - - /* - * There isn't an ALTER TABLE syntax for creating a user-defined - * constraint trigger, but it seems better to print something than - * throw an error; if we throw error then this function couldn't - * safely be applied to all rows of pg_constraint. 
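For reference, the text assembled by this worker is what the SQL-callable wrappers return. A quick way to inspect it from psql might look like the following; the relation names my_table and my_partition are only illustrative assumptions:

    -- my_table / my_partition are placeholders, not objects defined by this patch
    SELECT conname, pg_get_constraintdef(oid, true)
      FROM pg_constraint
     WHERE conrelid = 'my_table'::regclass;

    SELECT pg_get_partition_constraintdef('my_partition'::regclass);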
- */ - appendStringInfoString(&buf, "TRIGGER"); - break; - case CONSTRAINT_EXCLUSION: - { - Oid indexOid = conForm->conindid; - Datum val; - bool isnull; - Datum *elems; - int nElems; - int i; - Oid *operators; - - /* Extract operator OIDs from the pg_constraint tuple */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conexclop, - &isnull); - if (isnull) - elog(ERROR, "null conexclop for constraint %u", - constraintId); - - deconstruct_array(DatumGetArrayTypeP(val), - OIDOID, sizeof(Oid), true, 'i', - &elems, NULL, &nElems); - - operators = (Oid *) palloc(nElems * sizeof(Oid)); - for (i = 0; i < nElems; i++) - operators[i] = DatumGetObjectId(elems[i]); - - /* pg_get_indexdef_worker does the rest */ - /* suppress tablespace because pg_dump wants it that way */ - appendStringInfoString(&buf, - pg_get_indexdef_worker(indexOid, - 0, - operators, - false, - false, - prettyFlags, - false)); - break; - } - default: - elog(ERROR, "invalid constraint type \"%c\"", conForm->contype); - break; - } - - if (conForm->condeferrable) - appendStringInfoString(&buf, " DEFERRABLE"); - if (conForm->condeferred) - appendStringInfoString(&buf, " INITIALLY DEFERRED"); - if (!conForm->convalidated) - appendStringInfoString(&buf, " NOT VALID"); - - /* Cleanup */ - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - - return buf.data; -} - - -/* - * Convert an int16[] Datum into a comma-separated list of column names - * for the indicated relation; append the list to buf. - */ -static void -decompile_column_index_array(Datum column_index_array, Oid relId, - StringInfo buf) -{ - Datum *keys; - int nKeys; - int j; - - /* Extract data from array of int16 */ - deconstruct_array(DatumGetArrayTypeP(column_index_array), - INT2OID, 2, true, 's', - &keys, NULL, &nKeys); - - for (j = 0; j < nKeys; j++) - { - char *colName; - - colName = get_relid_attribute_name(relId, DatumGetInt16(keys[j])); - - if (j == 0) - appendStringInfoString(buf, quote_identifier(colName)); - else - appendStringInfo(buf, ", %s", quote_identifier(colName)); - } -} - - -/* ---------- - * get_expr - Decompile an expression tree - * - * Input: an expression tree in nodeToString form, and a relation OID - * - * Output: reverse-listed expression - * - * Currently, the expression can only refer to a single relation, namely - * the one specified by the second parameter. This is sufficient for - * partial indexes, column default expressions, etc. We also support - * Var-free expressions, for which the OID can be InvalidOid. - * ---------- - */ -Datum -pg_get_expr(PG_FUNCTION_ARGS) -{ - text *expr = PG_GETARG_TEXT_PP(0); - Oid relid = PG_GETARG_OID(1); - int prettyFlags; - char *relname; - - prettyFlags = PRETTYFLAG_INDENT; - - if (OidIsValid(relid)) - { - /* Get the name for the relation */ - relname = get_rel_name(relid); - - /* - * If the OID isn't actually valid, don't throw an error, just return - * NULL. This is a bit questionable, but it's what we've done - * historically, and it can help avoid unwanted failures when - * examining catalog entries for just-deleted relations. - */ - if (relname == NULL) - PG_RETURN_NULL(); - } - else - relname = NULL; - - PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); -} - -Datum -pg_get_expr_ext(PG_FUNCTION_ARGS) -{ - text *expr = PG_GETARG_TEXT_PP(0); - Oid relid = PG_GETARG_OID(1); - bool pretty = PG_GETARG_BOOL(2); - int prettyFlags; - char *relname; - - prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - if (OidIsValid(relid)) - { - /* Get the name for the relation */ - relname = get_rel_name(relid); - /* See notes above */ - if (relname == NULL) - PG_RETURN_NULL(); - } - else - relname = NULL; - - PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); -} - -static text * -pg_get_expr_worker(text *expr, Oid relid, const char *relname, int prettyFlags) -{ - Node *node; - List *context; - char *exprstr; - char *str; - - /* Convert input TEXT object to C string */ - exprstr = text_to_cstring(expr); - - /* Convert expression to node tree */ - node = (Node *) stringToNode(exprstr); - - pfree(exprstr); - - /* Prepare deparse context if needed */ - if (OidIsValid(relid)) - context = deparse_context_for(relname, relid); - else - context = NIL; - - /* Deparse */ - str = deparse_expression_pretty(node, context, false, false, - prettyFlags, 0); - - return string_to_text(str); -} - - -/* ---------- - * get_userbyid - Get a user name by roleid and - * fallback to 'unknown (OID=n)' - * ---------- - */ -Datum -pg_get_userbyid(PG_FUNCTION_ARGS) -{ - Oid roleid = PG_GETARG_OID(0); - Name result; - HeapTuple roletup; - Form_pg_authid role_rec; - - /* - * Allocate space for the result - */ - result = (Name) palloc(NAMEDATALEN); - memset(NameStr(*result), 0, NAMEDATALEN); - - /* - * Get the pg_authid entry and print the result - */ - roletup = SearchSysCache1(AUTHOID, ObjectIdGetDatum(roleid)); - if (HeapTupleIsValid(roletup)) - { - role_rec = (Form_pg_authid) GETSTRUCT(roletup); - StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); - ReleaseSysCache(roletup); - } - else - sprintf(NameStr(*result), "unknown (OID=%u)", roleid); - - PG_RETURN_NAME(result); -} - - -/* - * pg_get_serial_sequence - * Get the name of the sequence used by a serial column, - * formatted suitably for passing to setval, nextval or currval. - * First parameter is not treated as double-quoted, second parameter - * is --- see documentation for reason. - */ -Datum -pg_get_serial_sequence(PG_FUNCTION_ARGS) -{// #lizard forgives - text *tablename = PG_GETARG_TEXT_PP(0); - text *columnname = PG_GETARG_TEXT_PP(1); - RangeVar *tablerv; - Oid tableOid; - char *column; - AttrNumber attnum; - Oid sequenceId = InvalidOid; - Relation depRel; - ScanKeyData key[3]; - SysScanDesc scan; - HeapTuple tup; - - /* Look up table name. Can't lock it - we might not have privileges. 
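pg_get_expr and pg_get_userbyid are typically combined with catalog queries; a hedged sketch that should run against any TBase/PostgreSQL database is:

    -- deparse stored column-default expressions and show relation owners
    SELECT adrelid::regclass AS tab,
           adnum,
           pg_get_expr(adbin, adrelid) AS default_expr
      FROM pg_attrdef;

    SELECT relname, pg_get_userbyid(relowner) AS owner
      FROM pg_class
     LIMIT 5;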
*/ - tablerv = makeRangeVarFromNameList(textToQualifiedNameList(tablename)); - tableOid = RangeVarGetRelid(tablerv, NoLock, false); - - /* Get the number of the column */ - column = text_to_cstring(columnname); - - attnum = get_attnum(tableOid, column); - if (attnum == InvalidAttrNumber) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" of relation \"%s\" does not exist", - column, tablerv->relname))); - - /* Search the dependency table for the dependent sequence */ - depRel = heap_open(DependRelationId, AccessShareLock); - - ScanKeyInit(&key[0], - Anum_pg_depend_refclassid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationRelationId)); - ScanKeyInit(&key[1], - Anum_pg_depend_refobjid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(tableOid)); - ScanKeyInit(&key[2], - Anum_pg_depend_refobjsubid, - BTEqualStrategyNumber, F_INT4EQ, - Int32GetDatum(attnum)); - - scan = systable_beginscan(depRel, DependReferenceIndexId, true, - NULL, 3, key); - - while (HeapTupleIsValid(tup = systable_getnext(scan))) - { - Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup); - - /* - * We assume any auto dependency of a sequence on a column must be - * what we are looking for. (We need the relkind test because indexes - * can also have auto dependencies on columns.) - */ - if (deprec->classid == RelationRelationId && - deprec->objsubid == 0 && - deprec->deptype == DEPENDENCY_AUTO && - get_rel_relkind(deprec->objid) == RELKIND_SEQUENCE) - { - sequenceId = deprec->objid; - break; - } - } - - systable_endscan(scan); - heap_close(depRel, AccessShareLock); - - if (OidIsValid(sequenceId)) - { - char *result; - - result = generate_qualified_relation_name(sequenceId); - - PG_RETURN_TEXT_P(string_to_text(result)); - } - - PG_RETURN_NULL(); -} - - -/* - * pg_get_functiondef - * Returns the complete "CREATE OR REPLACE FUNCTION ..." statement for - * the specified function. - * - * Note: if you change the output format of this function, be careful not - * to break psql's rules (in \ef and \sf) for identifying the start of the - * function body. To wit: the function body starts on a line that begins - * with "AS ", and no preceding line will look like that. - */ -Datum -pg_get_functiondef(PG_FUNCTION_ARGS) -{// #lizard forgives - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - StringInfoData dq; - HeapTuple proctup; - Form_pg_proc proc; - Datum tmp; - bool isnull; - const char *prosrc; - const char *name; - const char *nsp; - float4 procost; - int oldlen; - - initStringInfo(&buf); - - /* Look up the function */ - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - proc = (Form_pg_proc) GETSTRUCT(proctup); - name = NameStr(proc->proname); - - if (proc->proisagg) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is an aggregate function", name))); - - /* - * We always qualify the function name, to ensure the right function gets - * replaced. 
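A common use of pg_get_serial_sequence is resynchronizing a serial column's sequence after a bulk load. A sketch, assuming a table my_table with a serial column id (both names are illustrative):

    -- my_table and id are assumed example names
    SELECT pg_get_serial_sequence('my_table', 'id');

    SELECT setval(pg_get_serial_sequence('my_table', 'id')::regclass,
                  (SELECT coalesce(max(id), 1) FROM my_table));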
- */ - nsp = get_namespace_name(proc->pronamespace); - appendStringInfo(&buf, "CREATE OR REPLACE FUNCTION %s(", - quote_qualified_identifier(nsp, name)); - (void) print_function_arguments(&buf, proctup, false, true); - appendStringInfoString(&buf, ")\n RETURNS "); - print_function_rettype(&buf, proctup); - - print_function_trftypes(&buf, proctup); - - appendStringInfo(&buf, "\n LANGUAGE %s\n", - quote_identifier(get_language_name(proc->prolang, false))); - - /* Emit some miscellaneous options on one line */ - oldlen = buf.len; - - if (proc->proiswindow) - appendStringInfoString(&buf, " WINDOW"); - switch (proc->provolatile) - { - case PROVOLATILE_IMMUTABLE: - appendStringInfoString(&buf, " IMMUTABLE"); - break; - case PROVOLATILE_STABLE: - appendStringInfoString(&buf, " STABLE"); - break; - case PROVOLATILE_VOLATILE: - break; - } - - switch (proc->proparallel) - { - case PROPARALLEL_SAFE: - appendStringInfoString(&buf, " PARALLEL SAFE"); - break; - case PROPARALLEL_RESTRICTED: - appendStringInfoString(&buf, " PARALLEL RESTRICTED"); - break; - case PROPARALLEL_UNSAFE: - break; - } - - if (proc->proisstrict) - appendStringInfoString(&buf, " STRICT"); - if (proc->prosecdef) - appendStringInfoString(&buf, " SECURITY DEFINER"); - if (proc->proleakproof) - appendStringInfoString(&buf, " LEAKPROOF"); - - /* This code for the default cost and rows should match functioncmds.c */ - if (proc->prolang == INTERNALlanguageId || - proc->prolang == ClanguageId) - procost = 1; - else - procost = 100; - if (proc->procost != procost) - appendStringInfo(&buf, " COST %g", proc->procost); - - if (proc->prorows > 0 && proc->prorows != 1000) - appendStringInfo(&buf, " ROWS %g", proc->prorows); - - if (oldlen != buf.len) - appendStringInfoChar(&buf, '\n'); - - /* Emit any proconfig options, one per line */ - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_proconfig, &isnull); - if (!isnull) - { - ArrayType *a = DatumGetArrayTypeP(tmp); - int i; - - Assert(ARR_ELEMTYPE(a) == TEXTOID); - Assert(ARR_NDIM(a) == 1); - Assert(ARR_LBOUND(a)[0] == 1); - - for (i = 1; i <= ARR_DIMS(a)[0]; i++) - { - Datum d; - - d = array_ref(a, 1, &i, - -1 /* varlenarray */ , - -1 /* TEXT's typlen */ , - false /* TEXT's typbyval */ , - 'i' /* TEXT's typalign */ , - &isnull); - if (!isnull) - { - char *configitem = TextDatumGetCString(d); - char *pos; - - pos = strchr(configitem, '='); - if (pos == NULL) - continue; - *pos++ = '\0'; - - appendStringInfo(&buf, " SET %s TO ", - quote_identifier(configitem)); - - /* - * Some GUC variable names are 'LIST' type and hence must not - * be quoted. - */ - if (pg_strcasecmp(configitem, "DateStyle") == 0 - || pg_strcasecmp(configitem, "search_path") == 0) - appendStringInfoString(&buf, pos); - else - simple_quote_literal(&buf, pos); - appendStringInfoChar(&buf, '\n'); - } - } - } - - /* And finally the function definition ... */ - appendStringInfoString(&buf, "AS "); - - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_probin, &isnull); - if (!isnull) - { - simple_quote_literal(&buf, TextDatumGetCString(tmp)); - appendStringInfoString(&buf, ", "); /* assume prosrc isn't null */ - } - - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_prosrc, &isnull); - if (isnull) - elog(ERROR, "null prosrc"); - prosrc = TextDatumGetCString(tmp); - - /* - * We always use dollar quoting. Figure out a suitable delimiter. - * - * Since the user is likely to be editing the function body string, we - * shouldn't use a short delimiter that he might easily create a conflict - * with. 
Hence prefer "$function$", but extend if needed. - */ - initStringInfo(&dq); - appendStringInfoString(&dq, "$function"); - while (strstr(prosrc, dq.data) != NULL) - appendStringInfoChar(&dq, 'x'); - appendStringInfoChar(&dq, '$'); - - appendStringInfoString(&buf, dq.data); - appendStringInfoString(&buf, prosrc); - appendStringInfoString(&buf, dq.data); - - appendStringInfoChar(&buf, '\n'); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_arguments - * Get a nicely-formatted list of arguments for a function. - * This is everything that would go between the parentheses in - * CREATE FUNCTION. - */ -Datum -pg_get_function_arguments(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - (void) print_function_arguments(&buf, proctup, false, true); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_identity_arguments - * Get a formatted list of arguments for a function. - * This is everything that would go between the parentheses in - * ALTER FUNCTION, etc. In particular, don't print defaults. - */ -Datum -pg_get_function_identity_arguments(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - (void) print_function_arguments(&buf, proctup, false, false); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_result - * Get a nicely-formatted version of the result type of a function. - * This is what would appear after RETURNS in CREATE FUNCTION. - */ -Datum -pg_get_function_result(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - print_function_rettype(&buf, proctup); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * Guts of pg_get_function_result: append the function's return type - * to the specified buffer. - */ -static void -print_function_rettype(StringInfo buf, HeapTuple proctup) -{ - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); - int ntabargs = 0; - StringInfoData rbuf; - - initStringInfo(&rbuf); - - if (proc->proretset) - { - /* It might be a table function; try to print the arguments */ - appendStringInfoString(&rbuf, "TABLE("); - ntabargs = print_function_arguments(&rbuf, proctup, true, false); - if (ntabargs > 0) - appendStringInfoChar(&rbuf, ')'); - else - resetStringInfo(&rbuf); - } - - if (ntabargs == 0) - { - /* Not a table function, so do the normal thing */ - if (proc->proretset) - appendStringInfoString(&rbuf, "SETOF "); - appendStringInfoString(&rbuf, format_type_be(proc->prorettype)); - } - - appendStringInfoString(buf, rbuf.data); -} - -/* - * Common code for pg_get_function_arguments and pg_get_function_result: - * append the desired subset of arguments to buf. We print only TABLE - * arguments when print_table_args is true, and all the others when it's false. - * We print argument defaults only if print_defaults is true. - * Function return value is the number of arguments printed. 
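The generated CREATE OR REPLACE FUNCTION text can be fetched directly; my_func(integer) below is a placeholder signature, not a function shipped by this patch:

    -- my_func(integer) is an assumed example function
    SELECT pg_get_functiondef('my_func(integer)'::regprocedure);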
- */ -static int -print_function_arguments(StringInfo buf, HeapTuple proctup, - bool print_table_args, bool print_defaults) -{// #lizard forgives - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); - int numargs; - Oid *argtypes; - char **argnames; - char *argmodes; - int insertorderbyat = -1; - int argsprinted; - int inputargno; - int nlackdefaults; - ListCell *nextargdefault = NULL; - int i; - - numargs = get_func_arg_info(proctup, - &argtypes, &argnames, &argmodes); - - nlackdefaults = numargs; - if (print_defaults && proc->pronargdefaults > 0) - { - Datum proargdefaults; - bool isnull; - - proargdefaults = SysCacheGetAttr(PROCOID, proctup, - Anum_pg_proc_proargdefaults, - &isnull); - if (!isnull) - { - char *str; - List *argdefaults; - - str = TextDatumGetCString(proargdefaults); - argdefaults = castNode(List, stringToNode(str)); - pfree(str); - nextargdefault = list_head(argdefaults); - /* nlackdefaults counts only *input* arguments lacking defaults */ - nlackdefaults = proc->pronargs - list_length(argdefaults); - } - } - - /* Check for special treatment of ordered-set aggregates */ - if (proc->proisagg) - { - HeapTuple aggtup; - Form_pg_aggregate agg; - - aggtup = SearchSysCache1(AGGFNOID, - ObjectIdGetDatum(HeapTupleGetOid(proctup))); - if (!HeapTupleIsValid(aggtup)) - elog(ERROR, "cache lookup failed for aggregate %u", - HeapTupleGetOid(proctup)); - agg = (Form_pg_aggregate) GETSTRUCT(aggtup); - if (AGGKIND_IS_ORDERED_SET(agg->aggkind)) - insertorderbyat = agg->aggnumdirectargs; - ReleaseSysCache(aggtup); - } - - argsprinted = 0; - inputargno = 0; - for (i = 0; i < numargs; i++) - { - Oid argtype = argtypes[i]; - char *argname = argnames ? argnames[i] : NULL; - char argmode = argmodes ? argmodes[i] : PROARGMODE_IN; - const char *modename; - bool isinput; - - switch (argmode) - { - case PROARGMODE_IN: - modename = ""; - isinput = true; - break; - case PROARGMODE_INOUT: - modename = "INOUT "; - isinput = true; - break; - case PROARGMODE_OUT: - modename = "OUT "; - isinput = false; - break; - case PROARGMODE_VARIADIC: - modename = "VARIADIC "; - isinput = true; - break; - case PROARGMODE_TABLE: - modename = ""; - isinput = false; - break; - default: - elog(ERROR, "invalid parameter mode '%c'", argmode); - modename = NULL; /* keep compiler quiet */ - isinput = false; - break; - } - if (isinput) - inputargno++; /* this is a 1-based counter */ - - if (print_table_args != (argmode == PROARGMODE_TABLE)) - continue; - - if (argsprinted == insertorderbyat) - { - if (argsprinted) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "ORDER BY "); - } - else if (argsprinted) - appendStringInfoString(buf, ", "); - - appendStringInfoString(buf, modename); - if (argname && argname[0]) - appendStringInfo(buf, "%s ", quote_identifier(argname)); - appendStringInfoString(buf, format_type_be(argtype)); - if (print_defaults && isinput && inputargno > nlackdefaults) - { - Node *expr; - - Assert(nextargdefault != NULL); - expr = (Node *) lfirst(nextargdefault); - nextargdefault = lnext(nextargdefault); - - appendStringInfo(buf, " DEFAULT %s", - deparse_expression(expr, NIL, false, false)); - } - argsprinted++; - - /* nasty hack: print the last arg twice for variadic ordered-set agg */ - if (argsprinted == insertorderbyat && i == numargs - 1) - { - i--; - /* aggs shouldn't have defaults anyway, but just to be sure ... 
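These helpers back the three SQL-level argument/result functions; for an ordered-set aggregate the argument list is expected to include the ORDER BY keyword selected above. An illustrative query, with my_func assumed to exist:

    -- my_func is an assumed example name
    SELECT p.proname,
           pg_get_function_arguments(p.oid)          AS create_args,
           pg_get_function_identity_arguments(p.oid) AS identity_args,
           pg_get_function_result(p.oid)             AS result_type
      FROM pg_proc p
     WHERE p.proname = 'my_func';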
*/ - print_defaults = false; - } - } - - return argsprinted; -} - -static bool -is_input_argument(int nth, const char *argmodes) -{ - return (!argmodes - || argmodes[nth] == PROARGMODE_IN - || argmodes[nth] == PROARGMODE_INOUT - || argmodes[nth] == PROARGMODE_VARIADIC); -} - -/* - * Append used transformed types to specified buffer - */ -static void -print_function_trftypes(StringInfo buf, HeapTuple proctup) -{ - Oid *trftypes; - int ntypes; - - ntypes = get_func_trftypes(proctup, &trftypes); - if (ntypes > 0) - { - int i; - - appendStringInfoString(buf, "\n TRANSFORM "); - for (i = 0; i < ntypes; i++) - { - if (i != 0) - appendStringInfoString(buf, ", "); - appendStringInfo(buf, "FOR TYPE %s", format_type_be(trftypes[i])); - } - } -} - -/* - * Get textual representation of a function argument's default value. The - * second argument of this function is the argument number among all arguments - * (i.e. proallargtypes, *not* proargtypes), starting with 1, because that's - * how information_schema.sql uses it. - */ -Datum -pg_get_function_arg_default(PG_FUNCTION_ARGS) -{// #lizard forgives - Oid funcid = PG_GETARG_OID(0); - int32 nth_arg = PG_GETARG_INT32(1); - HeapTuple proctup; - Form_pg_proc proc; - int numargs; - Oid *argtypes; - char **argnames; - char *argmodes; - int i; - List *argdefaults; - Node *node; - char *str; - int nth_inputarg; - Datum proargdefaults; - bool isnull; - int nth_default; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - numargs = get_func_arg_info(proctup, &argtypes, &argnames, &argmodes); - if (nth_arg < 1 || nth_arg > numargs || !is_input_argument(nth_arg - 1, argmodes)) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - - nth_inputarg = 0; - for (i = 0; i < nth_arg; i++) - if (is_input_argument(i, argmodes)) - nth_inputarg++; - - proargdefaults = SysCacheGetAttr(PROCOID, proctup, - Anum_pg_proc_proargdefaults, - &isnull); - if (isnull) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - - str = TextDatumGetCString(proargdefaults); - argdefaults = castNode(List, stringToNode(str)); - pfree(str); - - proc = (Form_pg_proc) GETSTRUCT(proctup); - - /* - * Calculate index into proargdefaults: proargdefaults corresponds to the - * last N input arguments, where N = pronargdefaults. - */ - nth_default = nth_inputarg - 1 - (proc->pronargs - proc->pronargdefaults); - - if (nth_default < 0 || nth_default >= list_length(argdefaults)) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - node = list_nth(argdefaults, nth_default); - str = deparse_expression(node, NIL, false, false); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(str)); -} - - -/* - * deparse_expression - General utility for deparsing expressions - * - * calls deparse_expression_pretty with all prettyPrinting disabled - */ -char * -deparse_expression(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit) -{ - return deparse_expression_pretty(expr, dpcontext, forceprefix, - showimplicit, 0, 0); -} - -/* ---------- - * deparse_expression_pretty - General utility for deparsing expressions - * - * expr is the node tree to be deparsed. It must be a transformed expression - * tree (ie, not the raw output of gram.y). - * - * dpcontext is a list of deparse_namespace nodes representing the context - * for interpreting Vars in the node tree. It can be NIL if no Vars are - * expected. - * - * forceprefix is TRUE to force all Vars to be prefixed with their table names. 
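The position passed to pg_get_function_arg_default counts all arguments (proallargtypes order), 1-based, which is how information_schema consumes it. A hedged example against an assumed two-argument function:

    -- my_func(integer, integer) is an assumed example signature
    SELECT pg_get_function_arg_default('my_func(integer, integer)'::regprocedure, 2);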
- * - * showimplicit is TRUE to force all implicit casts to be shown explicitly. - * - * Tries to pretty up the output according to prettyFlags and startIndent. - * - * The result is a palloc'd string. - * ---------- - */ -static char * -deparse_expression_pretty(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit, - int prettyFlags, int startIndent) -{ - StringInfoData buf; - deparse_context context; - - initStringInfo(&buf); - context.buf = &buf; - context.namespaces = dpcontext; - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = forceprefix; - context.prettyFlags = prettyFlags; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = startIndent; - context.special_exprkind = EXPR_KIND_NONE; - - get_rule_expr(expr, &context, showimplicit); - - return buf.data; -} - -/* ---------- - * deparse_context_for - Build deparse context for a single relation - * - * Given the reference name (alias) and OID of a relation, build deparsing - * context for an expression referencing only that relation (as varno 1, - * varlevelsup 0). This is sufficient for many uses of deparse_expression. - * ---------- - */ -List * -deparse_context_for(const char *aliasname, Oid relid) -{ - deparse_namespace *dpns; - RangeTblEntry *rte; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Build a minimal RTE for the rel */ - rte = makeNode(RangeTblEntry); - rte->rtekind = RTE_RELATION; - rte->relid = relid; - rte->relkind = RELKIND_RELATION; /* no need for exactness here */ - rte->alias = makeAlias(aliasname, NIL); - rte->eref = rte->alias; - rte->lateral = false; - rte->inh = false; - rte->inFromCl = true; - - /* Build one-element rtable */ - dpns->rtable = list_make1(rte); - dpns->ctes = NIL; - set_rtable_names(dpns, NIL, NULL); - set_simple_column_names(dpns); - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * deparse_context_for_plan_rtable - Build deparse context for a plan's rtable - * - * When deparsing an expression in a Plan tree, we use the plan's rangetable - * to resolve names of simple Vars. The initialization of column names for - * this is rather expensive if the rangetable is large, and it'll be the same - * for every expression in the Plan tree; so we do it just once and re-use - * the result of this function for each expression. (Note that the result - * is not usable until set_deparse_context_planstate() is applied to it.) - * - * In addition to the plan's rangetable list, pass the per-RTE alias names - * assigned by a previous call to select_rtable_names_for_explain. - */ -List * -deparse_context_for_plan_rtable(List *rtable, List *rtable_names) -{ - deparse_namespace *dpns; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Initialize fields that stay the same across the whole plan tree */ - dpns->rtable = rtable; - dpns->rtable_names = rtable_names; - dpns->ctes = NIL; - - /* - * Set up column name aliases. We will get rather bogus results for join - * RTEs, but that doesn't matter because plan trees don't contain any join - * alias Vars. - */ - set_simple_column_names(dpns); - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * set_deparse_context_planstate - Specify Plan node containing expression - * - * When deparsing an expression in a Plan tree, we might have to resolve - * OUTER_VAR, INNER_VAR, or INDEX_VAR references. To do this, the caller must - * provide the parent PlanState node. 
Then OUTER_VAR and INNER_VAR references - * can be resolved by drilling down into the left and right child plans. - * Similarly, INDEX_VAR references can be resolved by reference to the - * indextlist given in a parent IndexOnlyScan node, or to the scan tlist in - * ForeignScan and CustomScan nodes. (Note that we don't currently support - * deparsing of indexquals in regular IndexScan or BitmapIndexScan nodes; - * for those, we can only deparse the indexqualorig fields, which won't - * contain INDEX_VAR Vars.) - * - * Note: planstate really ought to be declared as "PlanState *", but we use - * "Node *" to avoid having to include execnodes.h in ruleutils.h. - * - * The ancestors list is a list of the PlanState's parent PlanStates, the - * most-closely-nested first. This is needed to resolve PARAM_EXEC Params. - * Note we assume that all the PlanStates share the same rtable. - * - * Once this function has been called, deparse_expression() can be called on - * subsidiary expression(s) of the specified PlanState node. To deparse - * expressions of a different Plan node in the same Plan tree, re-call this - * function to identify the new parent Plan node. - * - * The result is the same List passed in; this is a notational convenience. - */ -List * -set_deparse_context_planstate(List *dpcontext, - Node *planstate, List *ancestors) -{ - deparse_namespace *dpns; - - - /* Should always have one-entry namespace list for Plan deparsing */ - Assert(list_length(dpcontext) == 1); - dpns = (deparse_namespace *) linitial(dpcontext); - - /* Set our attention on the specific plan node passed in */ - set_deparse_planstate(dpns, (PlanState *) planstate); - dpns->ancestors = ancestors; - - return dpcontext; -} - -/* - * select_rtable_names_for_explain - Select RTE aliases for EXPLAIN - * - * Determine the relation aliases we'll use during an EXPLAIN operation. - * This is just a frontend to set_rtable_names. We have to expose the aliases - * to EXPLAIN because EXPLAIN needs to know the right alias names to print. - */ -List * -select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used) -{ - deparse_namespace dpns; - - memset(&dpns, 0, sizeof(dpns)); - dpns.rtable = rtable; - dpns.ctes = NIL; - set_rtable_names(&dpns, NIL, rels_used); - /* We needn't bother computing column aliases yet */ - - return dpns.rtable_names; -} - -#ifdef PGXC -/* - * This is a special case deparse context to be used at the planning time to - * generate query strings and expressions for remote shipping. - * - * XXX We should be careful while using this since the support is quite - * limited. The only supported use case at this point is for remote join - * reduction and some simple plan trees rooted by Agg node having a single - * RemoteQuery node as leftree. - */ -List * -deparse_context_for_plan(Node *plan, List *ancestors, - List *rtable) -{ - deparse_namespace *dpns; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Initialize fields that stay the same across the whole plan tree */ - dpns->rtable = rtable; - dpns->ctes = NIL; - - /* Set our attention on the specific plan node passed in */ - set_deparse_plan(dpns, (Plan *) plan); - dpns->ancestors = ancestors; - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * Set deparse context for Plan. Only those plan nodes which are immediate (or - * through simple nodes) parents of RemoteQuery nodes are supported right now. 
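EXPLAIN is the main consumer of these plan-deparsing entry points, so any verbose plan exercises them. Sketch only; my_table and column a are assumptions:

    -- my_table is an assumed example table with a column a
    EXPLAIN (VERBOSE, COSTS OFF)
    SELECT t1.a
      FROM my_table t1
      JOIN my_table t2 USING (a);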
- * - * This is a kind of work-around since the new deparse interface (since 9.1) - * expects a PlanState node. But planstates are instantiated only at execution - * time when InitPlan is called. But we are required to deparse the query - * during planning time, so we hand-cook these dummy PlanState nodes instead of - * init-ing the plan. Another approach could have been to delay the query - * generation to the execution time, but we are not yet sure if this can be - * safely done, especially for remote join reduction. - */ -static void -set_deparse_plan(deparse_namespace *dpns, Plan *plan) -{// #lizard forgives - - if (IsA(plan, NestLoop)) - { - NestLoop *nestloop = (NestLoop *) plan; - - dpns->planstate = (PlanState *) makeNode(NestLoopState); - dpns->planstate->plan = plan; - - dpns->outer_planstate = (PlanState *) makeNode(PlanState); - dpns->outer_planstate->plan = nestloop->join.plan.lefttree; - - dpns->inner_planstate = (PlanState *) makeNode(PlanState); - dpns->inner_planstate->plan = nestloop->join.plan.righttree; - } - else if (IsA(plan, RemoteQuery)) - { - dpns->planstate = (PlanState *) makeNode(PlanState); - dpns->planstate->plan = plan; - } - else if (IsA(plan, Agg) || IsA(plan, Group)) - { - /* - * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, - * Result, Material nodes are optional. Sort is compulsory for Group but not - * for Agg. - * anything else is not handled right now. - */ - Plan *temp_plan = plan->lefttree; - Plan *remote_scan = NULL; - - if (temp_plan && IsA(temp_plan, Sort)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, Result)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, Material)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, RemoteQuery)) - remote_scan = temp_plan; - - if (!remote_scan) - elog(ERROR, "Deparse of this query at planning is not supported yet"); - - dpns->planstate = (PlanState *) makeNode(PlanState); - dpns->planstate->plan = plan; - } - else - elog(ERROR, "Deparse of this query at planning not supported yet"); -} - -#endif -/* - * set_rtable_names: select RTE aliases to be used in printing a query - * - * We fill in dpns->rtable_names with a list of names that is one-for-one with - * the already-filled dpns->rtable list. Each RTE name is unique among those - * in the new namespace plus any ancestor namespaces listed in - * parent_namespaces. - * - * If rels_used isn't NULL, only RTE indexes listed in it are given aliases. - * - * Note that this function is only concerned with relation names, not column - * names. - */ -static void -set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, - Bitmapset *rels_used) -{// #lizard forgives - HASHCTL hash_ctl; - HTAB *names_hash; - NameHashEntry *hentry; - bool found; - int rtindex; - ListCell *lc; - - dpns->rtable_names = NIL; - /* nothing more to do if empty rtable */ - if (dpns->rtable == NIL) - return; - - /* - * We use a hash table to hold known names, so that this process is O(N) - * not O(N^2) for N names. 
- */ - MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = NAMEDATALEN; - hash_ctl.entrysize = sizeof(NameHashEntry); - hash_ctl.hcxt = CurrentMemoryContext; - names_hash = hash_create("set_rtable_names names", - list_length(dpns->rtable), - &hash_ctl, - HASH_ELEM | HASH_CONTEXT); - /* Preload the hash table with names appearing in parent_namespaces */ - foreach(lc, parent_namespaces) - { - deparse_namespace *olddpns = (deparse_namespace *) lfirst(lc); - ListCell *lc2; - - foreach(lc2, olddpns->rtable_names) - { - char *oldname = (char *) lfirst(lc2); - - if (oldname == NULL) - continue; - hentry = (NameHashEntry *) hash_search(names_hash, - oldname, - HASH_ENTER, - &found); - /* we do not complain about duplicate names in parent namespaces */ - hentry->counter = 0; - } - } - - /* Now we can scan the rtable */ - rtindex = 1; - foreach(lc, dpns->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - char *refname; - - /* Just in case this takes an unreasonable amount of time ... */ - CHECK_FOR_INTERRUPTS(); - - if (rels_used && !bms_is_member(rtindex, rels_used)) - { - /* Ignore unreferenced RTE */ - refname = NULL; - } - else if (rte->alias) - { - /* If RTE has a user-defined alias, prefer that */ - refname = rte->alias->aliasname; - } - else if (rte->rtekind == RTE_RELATION) - { - /* Use the current actual name of the relation */ - refname = get_rel_name(rte->relid); - } - else if (rte->rtekind == RTE_JOIN) - { - /* Unnamed join has no refname */ - refname = NULL; - } - else - { - /* Otherwise use whatever the parser assigned */ - refname = rte->eref->aliasname; - } - - /* - * If the selected name isn't unique, append digits to make it so, and - * make a new hash entry for it once we've got a unique name. For a - * very long input name, we might have to truncate to stay within - * NAMEDATALEN. - */ - if (refname) - { - hentry = (NameHashEntry *) hash_search(names_hash, - refname, - HASH_ENTER, - &found); - if (found) - { - /* Name already in use, must choose a new one */ - int refnamelen = strlen(refname); - char *modname = (char *) palloc(refnamelen + 16); - NameHashEntry *hentry2; - - do - { - hentry->counter++; - for (;;) - { - /* - * We avoid using %.*s here because it can misbehave - * if the data is not valid in what libc thinks is the - * prevailing encoding. - */ - memcpy(modname, refname, refnamelen); - sprintf(modname + refnamelen, "_%d", hentry->counter); - if (strlen(modname) < NAMEDATALEN) - break; - /* drop chars from refname to keep all the digits */ - refnamelen = pg_mbcliplen(refname, refnamelen, - refnamelen - 1); - } - hentry2 = (NameHashEntry *) hash_search(names_hash, - modname, - HASH_ENTER, - &found); - } while (found); - hentry2->counter = 0; /* init new hash entry */ - refname = modname; - } - else - { - /* Name not previously used, need only initialize hentry */ - hentry->counter = 0; - } - } - - dpns->rtable_names = lappend(dpns->rtable_names, refname); - rtindex++; - } - - hash_destroy(names_hash); -} - -/* - * set_deparse_for_query: set up deparse_namespace for deparsing a Query tree - * - * For convenience, this is defined to initialize the deparse_namespace struct - * from scratch. 
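When two range-table entries would otherwise print under the same name, this uniquification appends a numeric suffix, which can be observed in EXPLAIN VERBOSE output when the same relation is referenced twice without user aliases. Behavior sketched, not exact output; my_table is an assumed name:

    -- my_table is an assumed example table with a column a
    EXPLAIN (VERBOSE, COSTS OFF)
    SELECT a
      FROM my_table
     WHERE a IN (SELECT a FROM my_table);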
- */ -static void -set_deparse_for_query(deparse_namespace *dpns, Query *query, - List *parent_namespaces) -{ - ListCell *lc; - ListCell *lc2; - - /* Initialize *dpns and fill rtable/ctes links */ - memset(dpns, 0, sizeof(deparse_namespace)); - dpns->rtable = query->rtable; - dpns->ctes = query->cteList; - - /* Assign a unique relation alias to each RTE */ - set_rtable_names(dpns, parent_namespaces, NULL); - - /* Initialize dpns->rtable_columns to contain zeroed structs */ - dpns->rtable_columns = NIL; - while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) - dpns->rtable_columns = lappend(dpns->rtable_columns, - palloc0(sizeof(deparse_columns))); - - /* If it's a utility query, it won't have a jointree */ - if (query->jointree) - { - /* Detect whether global uniqueness of USING names is needed */ - dpns->unique_using = - has_dangerous_join_using(dpns, (Node *) query->jointree); - - /* - * Select names for columns merged by USING, via a recursive pass over - * the query jointree. - */ - set_using_names(dpns, (Node *) query->jointree, NIL); - } - - /* - * Now assign remaining column aliases for each RTE. We do this in a - * linear scan of the rtable, so as to process RTEs whether or not they - * are in the jointree (we mustn't miss NEW.*, INSERT target relations, - * etc). JOIN RTEs must be processed after their children, but this is - * okay because they appear later in the rtable list than their children - * (cf Asserts in identify_join_columns()). - */ - forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); - - if (rte->rtekind == RTE_JOIN) - set_join_column_names(dpns, rte, colinfo); - else - set_relation_column_names(dpns, rte, colinfo); - } -} - -/* - * set_simple_column_names: fill in column aliases for non-query situations - * - * This handles EXPLAIN and cases where we only have relation RTEs. Without - * a join tree, we can't do anything smart about join RTEs, but we don't - * need to (note that EXPLAIN should never see join alias Vars anyway). - * If we do hit a join RTE we'll just process it like a non-table base RTE. - */ -static void -set_simple_column_names(deparse_namespace *dpns) -{ - ListCell *lc; - ListCell *lc2; - - /* Initialize dpns->rtable_columns to contain zeroed structs */ - dpns->rtable_columns = NIL; - while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) - dpns->rtable_columns = lappend(dpns->rtable_columns, - palloc0(sizeof(deparse_columns))); - - /* Assign unique column aliases within each RTE */ - forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); - - set_relation_column_names(dpns, rte, colinfo); - } -} - -/* - * has_dangerous_join_using: search jointree for unnamed JOIN USING - * - * Merged columns of a JOIN USING may act differently from either of the input - * columns, either because they are merged with COALESCE (in a FULL JOIN) or - * because an implicit coercion of the underlying input column is required. - * In such a case the column must be referenced as a column of the JOIN not as - * a column of either input. And this is problematic if the join is unnamed - * (alias-less): we cannot qualify the column's name with an RTE name, since - * there is none. (Forcibly assigning an alias to the join is not a solution, - * since that will prevent legal references to tables below the join.) 
- * To ensure that every column in the query is unambiguously referenceable, - * we must assign such merged columns names that are globally unique across - * the whole query, aliasing other columns out of the way as necessary. - * - * Because the ensuing re-aliasing is fairly damaging to the readability of - * the query, we don't do this unless we have to. So, we must pre-scan - * the join tree to see if we have to, before starting set_using_names(). - */ -static bool -has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode) -{// #lizard forgives - if (IsA(jtnode, RangeTblRef)) - { - /* nothing to do here */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *lc; - - foreach(lc, f->fromlist) - { - if (has_dangerous_join_using(dpns, (Node *) lfirst(lc))) - return true; - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - - /* Is it an unnamed JOIN with USING? */ - if (j->alias == NULL && j->usingClause) - { - /* - * Yes, so check each join alias var to see if any of them are not - * simple references to underlying columns. If so, we have a - * dangerous situation and must pick unique aliases. - */ - RangeTblEntry *jrte = rt_fetch(j->rtindex, dpns->rtable); - ListCell *lc; - - foreach(lc, jrte->joinaliasvars) - { - Var *aliasvar = (Var *) lfirst(lc); - - if (aliasvar != NULL && !IsA(aliasvar, Var)) - return true; - } - } - - /* Nope, but inspect children */ - if (has_dangerous_join_using(dpns, j->larg)) - return true; - if (has_dangerous_join_using(dpns, j->rarg)) - return true; - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return false; -} - -/* - * set_using_names: select column aliases to be used for merged USING columns - * - * We do this during a recursive descent of the query jointree. - * dpns->unique_using must already be set to determine the global strategy. - * - * Column alias info is saved in the dpns->rtable_columns list, which is - * assumed to be filled with pre-zeroed deparse_columns structs. - * - * parentUsing is a list of all USING aliases assigned in parent joins of - * the current jointree node. (The passed-in list must not be modified.) - */ -static void -set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing) -{// #lizard forgives - if (IsA(jtnode, RangeTblRef)) - { - /* nothing to do now */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *lc; - - foreach(lc, f->fromlist) - set_using_names(dpns, (Node *) lfirst(lc), parentUsing); - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - RangeTblEntry *rte = rt_fetch(j->rtindex, dpns->rtable); - deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); - int *leftattnos; - int *rightattnos; - deparse_columns *leftcolinfo; - deparse_columns *rightcolinfo; - int i; - ListCell *lc; - - /* Get info about the shape of the join */ - identify_join_columns(j, rte, colinfo); - leftattnos = colinfo->leftattnos; - rightattnos = colinfo->rightattnos; - - /* Look up the not-yet-filled-in child deparse_columns structs */ - leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); - rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); - - /* - * If this join is unnamed, then we cannot substitute new aliases at - * this level, so any name requirements pushed down to here must be - * pushed down again to the children. 
- */ - if (rte->alias == NULL) - { - for (i = 0; i < colinfo->num_cols; i++) - { - char *colname = colinfo->colnames[i]; - - if (colname == NULL) - continue; - - /* Push down to left column, unless it's a system column */ - if (leftattnos[i] > 0) - { - expand_colnames_array_to(leftcolinfo, leftattnos[i]); - leftcolinfo->colnames[leftattnos[i] - 1] = colname; - } - - /* Same on the righthand side */ - if (rightattnos[i] > 0) - { - expand_colnames_array_to(rightcolinfo, rightattnos[i]); - rightcolinfo->colnames[rightattnos[i] - 1] = colname; - } - } - } - - /* - * If there's a USING clause, select the USING column names and push - * those names down to the children. We have two strategies: - * - * If dpns->unique_using is TRUE, we force all USING names to be - * unique across the whole query level. In principle we'd only need - * the names of dangerous USING columns to be globally unique, but to - * safely assign all USING names in a single pass, we have to enforce - * the same uniqueness rule for all of them. However, if a USING - * column's name has been pushed down from the parent, we should use - * it as-is rather than making a uniqueness adjustment. This is - * necessary when we're at an unnamed join, and it creates no risk of - * ambiguity. Also, if there's a user-written output alias for a - * merged column, we prefer to use that rather than the input name; - * this simplifies the logic and seems likely to lead to less aliasing - * overall. - * - * If dpns->unique_using is FALSE, we only need USING names to be - * unique within their own join RTE. We still need to honor - * pushed-down names, though. - * - * Though significantly different in results, these two strategies are - * implemented by the same code, with only the difference of whether - * to put assigned names into dpns->using_names. 
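The USING column names chosen here become user visible when a stored rule or view is later deparsed. A hedged illustration, where my_t1, my_t2 and my_view are assumed names:

    -- my_t1, my_t2 and my_view are assumed example objects sharing a column a
    CREATE VIEW my_view AS
      SELECT *
        FROM my_t1
        JOIN my_t2 USING (a);

    SELECT pg_get_viewdef('my_view'::regclass, true);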
- */ - if (j->usingClause) - { - /* Copy the input parentUsing list so we don't modify it */ - parentUsing = list_copy(parentUsing); - - /* USING names must correspond to the first join output columns */ - expand_colnames_array_to(colinfo, list_length(j->usingClause)); - i = 0; - foreach(lc, j->usingClause) - { - char *colname = strVal(lfirst(lc)); - - /* Assert it's a merged column */ - Assert(leftattnos[i] != 0 && rightattnos[i] != 0); - - /* Adopt passed-down name if any, else select unique name */ - if (colinfo->colnames[i] != NULL) - colname = colinfo->colnames[i]; - else - { - /* Prefer user-written output alias if any */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - /* Make it appropriately unique */ - colname = make_colname_unique(colname, dpns, colinfo); - if (dpns->unique_using) - dpns->using_names = lappend(dpns->using_names, - colname); - /* Save it as output column name, too */ - colinfo->colnames[i] = colname; - } - - /* Remember selected names for use later */ - colinfo->usingNames = lappend(colinfo->usingNames, colname); - parentUsing = lappend(parentUsing, colname); - - /* Push down to left column, unless it's a system column */ - if (leftattnos[i] > 0) - { - expand_colnames_array_to(leftcolinfo, leftattnos[i]); - leftcolinfo->colnames[leftattnos[i] - 1] = colname; - } - - /* Same on the righthand side */ - if (rightattnos[i] > 0) - { - expand_colnames_array_to(rightcolinfo, rightattnos[i]); - rightcolinfo->colnames[rightattnos[i] - 1] = colname; - } - - i++; - } - } - - /* Mark child deparse_columns structs with correct parentUsing info */ - leftcolinfo->parentUsing = parentUsing; - rightcolinfo->parentUsing = parentUsing; - - /* Now recursively assign USING column names in children */ - set_using_names(dpns, j->larg, parentUsing); - set_using_names(dpns, j->rarg, parentUsing); - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); -} - -/* - * set_relation_column_names: select column aliases for a non-join RTE - * - * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. - * If any colnames entries are already filled in, those override local - * choices. - */ -static void -set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo) -{// #lizard forgives - int ncolumns; - char **real_colnames; - bool changed_any; - int noldcolumns; - int i; - int j; - - /* - * Extract the RTE's "real" column names. This is comparable to - * get_rte_attribute_name, except that it's important to disregard dropped - * columns. We put NULL into the array for a dropped column. 
- */ - if (rte->rtekind == RTE_RELATION) - { - /* Relation --- look to the system catalogs for up-to-date info */ - Relation rel; - TupleDesc tupdesc; - - rel = relation_open(rte->relid, AccessShareLock); - tupdesc = RelationGetDescr(rel); - - ncolumns = tupdesc->natts; - real_colnames = (char **) palloc(ncolumns * sizeof(char *)); - - for (i = 0; i < ncolumns; i++) - { - if (tupdesc->attrs[i]->attisdropped) - real_colnames[i] = NULL; - else - real_colnames[i] = pstrdup(NameStr(tupdesc->attrs[i]->attname)); - } - relation_close(rel, AccessShareLock); - } - else - { - /* Otherwise use the column names from eref */ - ListCell *lc; - - ncolumns = list_length(rte->eref->colnames); - real_colnames = (char **) palloc(ncolumns * sizeof(char *)); - - i = 0; - foreach(lc, rte->eref->colnames) - { - /* - * If the column name shown in eref is an empty string, then it's - * a column that was dropped at the time of parsing the query, so - * treat it as dropped. - */ - char *cname = strVal(lfirst(lc)); - - if (cname[0] == '\0') - cname = NULL; - real_colnames[i] = cname; - i++; - } - } - - /* - * Ensure colinfo->colnames has a slot for each column. (It could be long - * enough already, if we pushed down a name for the last column.) Note: - * it's possible that there are now more columns than there were when the - * query was parsed, ie colnames could be longer than rte->eref->colnames. - * We must assign unique aliases to the new columns too, else there could - * be unresolved conflicts when the view/rule is reloaded. - */ - expand_colnames_array_to(colinfo, ncolumns); - Assert(colinfo->num_cols == ncolumns); - - /* - * Make sufficiently large new_colnames and is_new_col arrays, too. - * - * Note: because we leave colinfo->num_new_cols zero until after the loop, - * colname_is_unique will not consult that array, which is fine because it - * would only be duplicate effort. - */ - colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *)); - colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool)); - - /* - * Scan the columns, select a unique alias for each one, and store it in - * colinfo->colnames and colinfo->new_colnames. The former array has NULL - * entries for dropped columns, the latter omits them. Also mark - * new_colnames entries as to whether they are new since parse time; this - * is the case for entries beyond the length of rte->eref->colnames. 
- */ - noldcolumns = list_length(rte->eref->colnames); - changed_any = false; - j = 0; - for (i = 0; i < ncolumns; i++) - { - char *real_colname = real_colnames[i]; - char *colname = colinfo->colnames[i]; - - /* Skip dropped columns */ - if (real_colname == NULL) - { - Assert(colname == NULL); /* colnames[i] is already NULL */ - continue; - } - - /* If alias already assigned, that's what to use */ - if (colname == NULL) - { - /* If user wrote an alias, prefer that over real column name */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - else - colname = real_colname; - - /* Unique-ify and insert into colinfo */ - colname = make_colname_unique(colname, dpns, colinfo); - - colinfo->colnames[i] = colname; - } - - /* Put names of non-dropped columns in new_colnames[] too */ - colinfo->new_colnames[j] = colname; - /* And mark them as new or not */ - colinfo->is_new_col[j] = (i >= noldcolumns); - j++; - - /* Remember if any assigned aliases differ from "real" name */ - if (!changed_any && strcmp(colname, real_colname) != 0) - changed_any = true; - } - - /* - * Set correct length for new_colnames[] array. (Note: if columns have - * been added, colinfo->num_cols includes them, which is not really quite - * right but is harmless, since any new columns must be at the end where - * they won't affect varattnos of pre-existing columns.) - */ - colinfo->num_new_cols = j; - - /* - * For a relation RTE, we need only print the alias column names if any - * are different from the underlying "real" names. For a function RTE, - * always emit a complete column alias list; this is to protect against - * possible instability of the default column names (eg, from altering - * parameter names). For tablefunc RTEs, we never print aliases, because - * the column names are part of the clause itself. For other RTE types, - * print if we changed anything OR if there were user-written column - * aliases (since the latter would be part of the underlying "reality"). - */ - if (rte->rtekind == RTE_RELATION) - colinfo->printaliases = changed_any; - else if (rte->rtekind == RTE_FUNCTION) - colinfo->printaliases = true; - else if (rte->rtekind == RTE_TABLEFUNC) - colinfo->printaliases = false; - else if (rte->alias && rte->alias->colnames != NIL) - colinfo->printaliases = true; - else - colinfo->printaliases = changed_any; -} - -/* - * set_join_column_names: select column aliases for a join RTE - * - * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. - * If any colnames entries are already filled in, those override local - * choices. Also, names for USING columns were already chosen by - * set_using_names(). We further expect that column alias selection has been - * completed for both input RTEs. - */ -static void -set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo) -{// #lizard forgives - deparse_columns *leftcolinfo; - deparse_columns *rightcolinfo; - bool changed_any; - int noldcolumns; - int nnewcolumns; - Bitmapset *leftmerged = NULL; - Bitmapset *rightmerged = NULL; - int i; - int j; - int ic; - int jc; - - /* Look up the previously-filled-in child deparse_columns structs */ - leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); - rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); - - /* - * Ensure colinfo->colnames has a slot for each column. (It could be long - * enough already, if we pushed down a name for the last column.) 
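Columns added to a base relation after a view was parsed are exactly the "new" entries this bookkeeping accounts for; the stored definition must still deparse with its original column references intact. Continuing the assumed example above (extra_col is likewise illustrative):

    -- extra_col is an assumed example column
    ALTER TABLE my_t1 ADD COLUMN extra_col integer;

    SELECT pg_get_viewdef('my_view'::regclass, true);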
Note: - * it's possible that one or both inputs now have more columns than there - * were when the query was parsed, but we'll deal with that below. We - * only need entries in colnames for pre-existing columns. - */ - noldcolumns = list_length(rte->eref->colnames); - expand_colnames_array_to(colinfo, noldcolumns); - Assert(colinfo->num_cols == noldcolumns); - - /* - * Scan the join output columns, select an alias for each one, and store - * it in colinfo->colnames. If there are USING columns, set_using_names() - * already selected their names, so we can start the loop at the first - * non-merged column. - */ - changed_any = false; - for (i = list_length(colinfo->usingNames); i < noldcolumns; i++) - { - char *colname = colinfo->colnames[i]; - char *real_colname; - - /* Ignore dropped column (only possible for non-merged column) */ - if (colinfo->leftattnos[i] == 0 && colinfo->rightattnos[i] == 0) - { - Assert(colname == NULL); - continue; - } - - /* Get the child column name */ - if (colinfo->leftattnos[i] > 0) - real_colname = leftcolinfo->colnames[colinfo->leftattnos[i] - 1]; - else if (colinfo->rightattnos[i] > 0) - real_colname = rightcolinfo->colnames[colinfo->rightattnos[i] - 1]; - else - { - /* We're joining system columns --- use eref name */ - real_colname = strVal(list_nth(rte->eref->colnames, i)); - } - Assert(real_colname != NULL); - - /* In an unnamed join, just report child column names as-is */ - if (rte->alias == NULL) - { - colinfo->colnames[i] = real_colname; - continue; - } - - /* If alias already assigned, that's what to use */ - if (colname == NULL) - { - /* If user wrote an alias, prefer that over real column name */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - else - colname = real_colname; - - /* Unique-ify and insert into colinfo */ - colname = make_colname_unique(colname, dpns, colinfo); - - colinfo->colnames[i] = colname; - } - - /* Remember if any assigned aliases differ from "real" name */ - if (!changed_any && strcmp(colname, real_colname) != 0) - changed_any = true; - } - - /* - * Calculate number of columns the join would have if it were re-parsed - * now, and create storage for the new_colnames and is_new_col arrays. - * - * Note: colname_is_unique will be consulting new_colnames[] during the - * loops below, so its not-yet-filled entries must be zeroes. - */ - nnewcolumns = leftcolinfo->num_new_cols + rightcolinfo->num_new_cols - - list_length(colinfo->usingNames); - colinfo->num_new_cols = nnewcolumns; - colinfo->new_colnames = (char **) palloc0(nnewcolumns * sizeof(char *)); - colinfo->is_new_col = (bool *) palloc0(nnewcolumns * sizeof(bool)); - - /* - * Generating the new_colnames array is a bit tricky since any new columns - * added since parse time must be inserted in the right places. This code - * must match the parser, which will order a join's columns as merged - * columns first (in USING-clause order), then non-merged columns from the - * left input (in attnum order), then non-merged columns from the right - * input (ditto). If one of the inputs is itself a join, its columns will - * be ordered according to the same rule, which means newly-added columns - * might not be at the end. We can figure out what's what by consulting - * the leftattnos and rightattnos arrays plus the input is_new_col arrays. 
- * - * In these loops, i indexes leftattnos/rightattnos (so it's join varattno - * less one), j indexes new_colnames/is_new_col, and ic/jc have similar - * meanings for the current child RTE. - */ - - /* Handle merged columns; they are first and can't be new */ - i = j = 0; - while (i < noldcolumns && - colinfo->leftattnos[i] != 0 && - colinfo->rightattnos[i] != 0) - { - /* column name is already determined and known unique */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - colinfo->is_new_col[j] = false; - - /* build bitmapsets of child attnums of merged columns */ - if (colinfo->leftattnos[i] > 0) - leftmerged = bms_add_member(leftmerged, colinfo->leftattnos[i]); - if (colinfo->rightattnos[i] > 0) - rightmerged = bms_add_member(rightmerged, colinfo->rightattnos[i]); - - i++, j++; - } - - /* Handle non-merged left-child columns */ - ic = 0; - for (jc = 0; jc < leftcolinfo->num_new_cols; jc++) - { - char *child_colname = leftcolinfo->new_colnames[jc]; - - if (!leftcolinfo->is_new_col[jc]) - { - /* Advance ic to next non-dropped old column of left child */ - while (ic < leftcolinfo->num_cols && - leftcolinfo->colnames[ic] == NULL) - ic++; - Assert(ic < leftcolinfo->num_cols); - ic++; - /* If it is a merged column, we already processed it */ - if (bms_is_member(ic, leftmerged)) - continue; - /* Else, advance i to the corresponding existing join column */ - while (i < colinfo->num_cols && - colinfo->colnames[i] == NULL) - i++; - Assert(i < colinfo->num_cols); - Assert(ic == colinfo->leftattnos[i]); - /* Use the already-assigned name of this column */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - i++; - } - else - { - /* - * Unique-ify the new child column name and assign, unless we're - * in an unnamed join, in which case just copy - */ - if (rte->alias != NULL) - { - colinfo->new_colnames[j] = - make_colname_unique(child_colname, dpns, colinfo); - if (!changed_any && - strcmp(colinfo->new_colnames[j], child_colname) != 0) - changed_any = true; - } - else - colinfo->new_colnames[j] = child_colname; - } - - colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc]; - j++; - } - - /* Handle non-merged right-child columns in exactly the same way */ - ic = 0; - for (jc = 0; jc < rightcolinfo->num_new_cols; jc++) - { - char *child_colname = rightcolinfo->new_colnames[jc]; - - if (!rightcolinfo->is_new_col[jc]) - { - /* Advance ic to next non-dropped old column of right child */ - while (ic < rightcolinfo->num_cols && - rightcolinfo->colnames[ic] == NULL) - ic++; - Assert(ic < rightcolinfo->num_cols); - ic++; - /* If it is a merged column, we already processed it */ - if (bms_is_member(ic, rightmerged)) - continue; - /* Else, advance i to the corresponding existing join column */ - while (i < colinfo->num_cols && - colinfo->colnames[i] == NULL) - i++; - Assert(i < colinfo->num_cols); - Assert(ic == colinfo->rightattnos[i]); - /* Use the already-assigned name of this column */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - i++; - } - else - { - /* - * Unique-ify the new child column name and assign, unless we're - * in an unnamed join, in which case just copy - */ - if (rte->alias != NULL) - { - colinfo->new_colnames[j] = - make_colname_unique(child_colname, dpns, colinfo); - if (!changed_any && - strcmp(colinfo->new_colnames[j], child_colname) != 0) - changed_any = true; - } - else - colinfo->new_colnames[j] = child_colname; - } - - colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc]; - j++; - } - - /* Assert we processed the right number of columns */ -#ifdef USE_ASSERT_CHECKING - 
while (i < colinfo->num_cols && colinfo->colnames[i] == NULL) - i++; - Assert(i == colinfo->num_cols); - Assert(j == nnewcolumns); -#endif - - /* - * For a named join, print column aliases if we changed any from the child - * names. Unnamed joins cannot print aliases. - */ - if (rte->alias != NULL) - colinfo->printaliases = changed_any; - else - colinfo->printaliases = false; -} - -/* - * colname_is_unique: is colname distinct from already-chosen column names? - * - * dpns is query-wide info, colinfo is for the column's RTE - */ -static bool -colname_is_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo) -{// #lizard forgives - int i; - ListCell *lc; - - /* Check against already-assigned column aliases within RTE */ - for (i = 0; i < colinfo->num_cols; i++) - { - char *oldname = colinfo->colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) - return false; - } - - /* - * If we're building a new_colnames array, check that too (this will be - * partially but not completely redundant with the previous checks) - */ - for (i = 0; i < colinfo->num_new_cols; i++) - { - char *oldname = colinfo->new_colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) - return false; - } - - /* Also check against USING-column names that must be globally unique */ - foreach(lc, dpns->using_names) - { - char *oldname = (char *) lfirst(lc); - - if (strcmp(oldname, colname) == 0) - return false; - } - - /* Also check against names already assigned for parent-join USING cols */ - foreach(lc, colinfo->parentUsing) - { - char *oldname = (char *) lfirst(lc); - - if (strcmp(oldname, colname) == 0) - return false; - } - - return true; -} - -/* - * make_colname_unique: modify colname if necessary to make it unique - * - * dpns is query-wide info, colinfo is for the column's RTE - */ -static char * -make_colname_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo) -{ - /* - * If the selected name isn't unique, append digits to make it so. For a - * very long input name, we might have to truncate to stay within - * NAMEDATALEN. - */ - if (!colname_is_unique(colname, dpns, colinfo)) - { - int colnamelen = strlen(colname); - char *modname = (char *) palloc(colnamelen + 16); - int i = 0; - - do - { - i++; - for (;;) - { - /* - * We avoid using %.*s here because it can misbehave if the - * data is not valid in what libc thinks is the prevailing - * encoding. - */ - memcpy(modname, colname, colnamelen); - sprintf(modname + colnamelen, "_%d", i); - if (strlen(modname) < NAMEDATALEN) - break; - /* drop chars from colname to keep all the digits */ - colnamelen = pg_mbcliplen(colname, colnamelen, - colnamelen - 1); - } - } while (!colname_is_unique(modname, dpns, colinfo)); - colname = modname; - } - return colname; -} - -/* - * expand_colnames_array_to: make colinfo->colnames at least n items long - * - * Any added array entries are initialized to zero. - */ -static void -expand_colnames_array_to(deparse_columns *colinfo, int n) -{ - if (n > colinfo->num_cols) - { - if (colinfo->colnames == NULL) - colinfo->colnames = (char **) palloc0(n * sizeof(char *)); - else - { - colinfo->colnames = (char **) repalloc(colinfo->colnames, - n * sizeof(char *)); - memset(colinfo->colnames + colinfo->num_cols, 0, - (n - colinfo->num_cols) * sizeof(char *)); - } - colinfo->num_cols = n; - } -} - -/* - * identify_join_columns: figure out where columns of a join come from - * - * Fills the join-specific fields of the colinfo struct, except for - * usingNames which is filled later. 
- */ -static void -identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, - deparse_columns *colinfo) -{// #lizard forgives - int numjoincols; - int i; - ListCell *lc; - - /* Extract left/right child RT indexes */ - if (IsA(j->larg, RangeTblRef)) - colinfo->leftrti = ((RangeTblRef *) j->larg)->rtindex; - else if (IsA(j->larg, JoinExpr)) - colinfo->leftrti = ((JoinExpr *) j->larg)->rtindex; - else - elog(ERROR, "unrecognized node type in jointree: %d", - (int) nodeTag(j->larg)); - if (IsA(j->rarg, RangeTblRef)) - colinfo->rightrti = ((RangeTblRef *) j->rarg)->rtindex; - else if (IsA(j->rarg, JoinExpr)) - colinfo->rightrti = ((JoinExpr *) j->rarg)->rtindex; - else - elog(ERROR, "unrecognized node type in jointree: %d", - (int) nodeTag(j->rarg)); - - /* Assert children will be processed earlier than join in second pass */ - Assert(colinfo->leftrti < j->rtindex); - Assert(colinfo->rightrti < j->rtindex); - - /* Initialize result arrays with zeroes */ - numjoincols = list_length(jrte->joinaliasvars); - Assert(numjoincols == list_length(jrte->eref->colnames)); - colinfo->leftattnos = (int *) palloc0(numjoincols * sizeof(int)); - colinfo->rightattnos = (int *) palloc0(numjoincols * sizeof(int)); - - /* Scan the joinaliasvars list to identify simple column references */ - i = 0; - foreach(lc, jrte->joinaliasvars) - { - Var *aliasvar = (Var *) lfirst(lc); - - /* get rid of any implicit coercion above the Var */ - aliasvar = (Var *) strip_implicit_coercions((Node *) aliasvar); - - if (aliasvar == NULL) - { - /* It's a dropped column; nothing to do here */ - } - else if (IsA(aliasvar, Var)) - { - Assert(aliasvar->varlevelsup == 0); - Assert(aliasvar->varattno != 0); - if (aliasvar->varno == colinfo->leftrti) - colinfo->leftattnos[i] = aliasvar->varattno; - else if (aliasvar->varno == colinfo->rightrti) - colinfo->rightattnos[i] = aliasvar->varattno; - else - elog(ERROR, "unexpected varno %d in JOIN RTE", - aliasvar->varno); - } - else if (IsA(aliasvar, CoalesceExpr)) - { - /* - * It's a merged column in FULL JOIN USING. Ignore it for now and - * let the code below identify the merged columns. - */ - } - else - elog(ERROR, "unrecognized node type in join alias vars: %d", - (int) nodeTag(aliasvar)); - - i++; - } - - /* - * If there's a USING clause, deconstruct the join quals to identify the - * merged columns. This is a tad painful but if we cannot rely on the - * column names, there is no other representation of which columns were - * joined by USING. (Unless the join type is FULL, we can't tell from the - * joinaliasvars list which columns are merged.) Note: we assume that the - * merged columns are the first output column(s) of the join. 
- */ - if (j->usingClause) - { - List *leftvars = NIL; - List *rightvars = NIL; - ListCell *lc2; - - /* Extract left- and right-side Vars from the qual expression */ - flatten_join_using_qual(j->quals, &leftvars, &rightvars); - Assert(list_length(leftvars) == list_length(j->usingClause)); - Assert(list_length(rightvars) == list_length(j->usingClause)); - - /* Mark the output columns accordingly */ - i = 0; - forboth(lc, leftvars, lc2, rightvars) - { - Var *leftvar = (Var *) lfirst(lc); - Var *rightvar = (Var *) lfirst(lc2); - - Assert(leftvar->varlevelsup == 0); - Assert(leftvar->varattno != 0); - if (leftvar->varno != colinfo->leftrti) - elog(ERROR, "unexpected varno %d in JOIN USING qual", - leftvar->varno); - colinfo->leftattnos[i] = leftvar->varattno; - - Assert(rightvar->varlevelsup == 0); - Assert(rightvar->varattno != 0); - if (rightvar->varno != colinfo->rightrti) - elog(ERROR, "unexpected varno %d in JOIN USING qual", - rightvar->varno); - colinfo->rightattnos[i] = rightvar->varattno; - - i++; - } - } -} - -/* - * flatten_join_using_qual: extract Vars being joined from a JOIN/USING qual - * - * We assume that transformJoinUsingClause won't have produced anything except - * AND nodes, equality operator nodes, and possibly implicit coercions, and - * that the AND node inputs match left-to-right with the original USING list. - * - * Caller must initialize the result lists to NIL. - */ -static void -flatten_join_using_qual(Node *qual, List **leftvars, List **rightvars) -{ - if (IsA(qual, BoolExpr)) - { - /* Handle AND nodes by recursion */ - BoolExpr *b = (BoolExpr *) qual; - ListCell *lc; - - Assert(b->boolop == AND_EXPR); - foreach(lc, b->args) - { - flatten_join_using_qual((Node *) lfirst(lc), - leftvars, rightvars); - } - } - else if (IsA(qual, OpExpr)) - { - /* Otherwise we should have an equality operator */ - OpExpr *op = (OpExpr *) qual; - Var *var; - - if (list_length(op->args) != 2) - elog(ERROR, "unexpected unary operator in JOIN/USING qual"); - /* Arguments should be Vars with perhaps implicit coercions */ - var = (Var *) strip_implicit_coercions((Node *) linitial(op->args)); - if (!IsA(var, Var)) - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(var)); - *leftvars = lappend(*leftvars, var); - var = (Var *) strip_implicit_coercions((Node *) lsecond(op->args)); - if (!IsA(var, Var)) - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(var)); - *rightvars = lappend(*rightvars, var); - } - else - { - /* Perhaps we have an implicit coercion to boolean? */ - Node *q = strip_implicit_coercions(qual); - - if (q != qual) - flatten_join_using_qual(q, leftvars, rightvars); - else - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(qual)); - } -} - -/* - * get_rtable_name: convenience function to get a previously assigned RTE alias - * - * The RTE must belong to the topmost namespace level in "context". - */ -static char * -get_rtable_name(int rtindex, deparse_context *context) -{ - deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); - - Assert(rtindex > 0 && rtindex <= list_length(dpns->rtable_names)); - return (char *) list_nth(dpns->rtable_names, rtindex - 1); -} - -/* - * set_deparse_planstate: set up deparse_namespace to parse subexpressions - * of a given PlanState node - * - * This sets the planstate, outer_planstate, inner_planstate, outer_tlist, - * inner_tlist, and index_tlist fields. Caller is responsible for adjusting - * the ancestors list if necessary. 
Note that the rtable and ctes fields do - * not need to change when shifting attention to different plan nodes in a - * single plan tree. - */ -static void -set_deparse_planstate(deparse_namespace *dpns, PlanState *ps) -{// #lizard forgives - dpns->planstate = ps; - - /* - * We special-case Append and MergeAppend to pretend that the first child - * plan is the OUTER referent; we have to interpret OUTER Vars in their - * tlists according to one of the children, and the first one is the most - * natural choice. Likewise special-case ModifyTable to pretend that the - * first child plan is the OUTER referent; this is to support RETURNING - * lists containing references to non-target relations. - */ - if (IsA(ps, AppendState)) - dpns->outer_planstate = ((AppendState *) ps)->appendplans[0]; - else if (IsA(ps, MergeAppendState)) - dpns->outer_planstate = ((MergeAppendState *) ps)->mergeplans[0]; - else if (IsA(ps, ModifyTableState)) - dpns->outer_planstate = ((ModifyTableState *) ps)->mt_plans[0]; - else - dpns->outer_planstate = outerPlanState(ps); - - if (dpns->outer_planstate) - dpns->outer_tlist = dpns->outer_planstate->plan->targetlist; - else - dpns->outer_tlist = NIL; - - /* - * For a SubqueryScan, pretend the subplan is INNER referent. (We don't - * use OUTER because that could someday conflict with the normal meaning.) - * Likewise, for a CteScan, pretend the subquery's plan is INNER referent. - * For ON CONFLICT .. UPDATE we just need the inner tlist to point to the - * excluded expression's tlist. (Similar to the SubqueryScan we don't want - * to reuse OUTER, it's used for RETURNING in some modify table cases, - * although not INSERT .. CONFLICT). - */ - if (IsA(ps, SubqueryScanState)) - dpns->inner_planstate = ((SubqueryScanState *) ps)->subplan; - else if (IsA(ps, CteScanState)) - dpns->inner_planstate = ((CteScanState *) ps)->cteplanstate; - else if (IsA(ps, ModifyTableState)) - dpns->inner_planstate = ps; - else - dpns->inner_planstate = innerPlanState(ps); - - if (IsA(ps, ModifyTableState)) - dpns->inner_tlist = ((ModifyTableState *) ps)->mt_excludedtlist; - else if (dpns->inner_planstate) - dpns->inner_tlist = dpns->inner_planstate->plan->targetlist; - else - dpns->inner_tlist = NIL; - - /* Set up referent for INDEX_VAR Vars, if needed */ - if (IsA(ps->plan, IndexOnlyScan)) - dpns->index_tlist = ((IndexOnlyScan *) ps->plan)->indextlist; - else if (IsA(ps->plan, ForeignScan)) - dpns->index_tlist = ((ForeignScan *) ps->plan)->fdw_scan_tlist; - else if (IsA(ps->plan, CustomScan)) - dpns->index_tlist = ((CustomScan *) ps->plan)->custom_scan_tlist; - else - dpns->index_tlist = NIL; -} - -/* - * push_child_plan: temporarily transfer deparsing attention to a child plan - * - * When expanding an OUTER_VAR or INNER_VAR reference, we must adjust the - * deparse context in case the referenced expression itself uses - * OUTER_VAR/INNER_VAR. We modify the top stack entry in-place to avoid - * affecting levelsup issues (although in a Plan tree there really shouldn't - * be any). - * - * Caller must provide a local deparse_namespace variable to save the - * previous state for pop_child_plan. 
- */ -static void -push_child_plan(deparse_namespace *dpns, PlanState *ps, - deparse_namespace *save_dpns) -{ - /* Save state for restoration later */ - *save_dpns = *dpns; - - /* Link current plan node into ancestors list */ - dpns->ancestors = lcons(dpns->planstate, dpns->ancestors); - - /* Set attention on selected child */ - set_deparse_planstate(dpns, ps); -} - -/* - * pop_child_plan: undo the effects of push_child_plan - */ -static void -pop_child_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) -{ - List *ancestors; - - /* Get rid of ancestors list cell added by push_child_plan */ - ancestors = list_delete_first(dpns->ancestors); - - /* Restore fields changed by push_child_plan */ - *dpns = *save_dpns; - - /* Make sure dpns->ancestors is right (may be unnecessary) */ - dpns->ancestors = ancestors; -} - -/* - * push_ancestor_plan: temporarily transfer deparsing attention to an - * ancestor plan - * - * When expanding a Param reference, we must adjust the deparse context - * to match the plan node that contains the expression being printed; - * otherwise we'd fail if that expression itself contains a Param or - * OUTER_VAR/INNER_VAR/INDEX_VAR variable. - * - * The target ancestor is conveniently identified by the ListCell holding it - * in dpns->ancestors. - * - * Caller must provide a local deparse_namespace variable to save the - * previous state for pop_ancestor_plan. - */ -static void -push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, - deparse_namespace *save_dpns) -{ - PlanState *ps = (PlanState *) lfirst(ancestor_cell); - List *ancestors; - - /* Save state for restoration later */ - *save_dpns = *dpns; - - /* Build a new ancestor list with just this node's ancestors */ - ancestors = NIL; - while ((ancestor_cell = lnext(ancestor_cell)) != NULL) - ancestors = lappend(ancestors, lfirst(ancestor_cell)); - dpns->ancestors = ancestors; - - /* Set attention on selected ancestor */ - set_deparse_planstate(dpns, ps); -} - -/* - * pop_ancestor_plan: undo the effects of push_ancestor_plan - */ -static void -pop_ancestor_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) -{ - /* Free the ancestor list made in push_ancestor_plan */ - list_free(dpns->ancestors); - - /* Restore fields changed by push_ancestor_plan */ - *dpns = *save_dpns; -} - - -/* ---------- - * make_ruledef - reconstruct the CREATE RULE command - * for a given pg_rewrite tuple - * ---------- - */ -static void -make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags) -{// #lizard forgives - char *rulename; - char ev_type; - Oid ev_class; - bool is_instead; - char *ev_qual; - char *ev_action; - List *actions = NIL; - Relation ev_relation; - TupleDesc viewResultDesc = NULL; - int fno; - Datum dat; - bool isnull; - - /* - * Get the attribute values from the rules tuple - */ - fno = SPI_fnumber(rulettc, "rulename"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - rulename = NameStr(*(DatumGetName(dat))); - - fno = SPI_fnumber(rulettc, "ev_type"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_type = DatumGetChar(dat); - - fno = SPI_fnumber(rulettc, "ev_class"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_class = DatumGetObjectId(dat); - - fno = SPI_fnumber(rulettc, "is_instead"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - is_instead = DatumGetBool(dat); - - /* these could be nulls */ - fno = SPI_fnumber(rulettc, "ev_qual"); - ev_qual = 
SPI_getvalue(ruletup, rulettc, fno); - - fno = SPI_fnumber(rulettc, "ev_action"); - ev_action = SPI_getvalue(ruletup, rulettc, fno); - if (ev_action != NULL) - actions = (List *) stringToNode(ev_action); - - ev_relation = heap_open(ev_class, AccessShareLock); - - /* - * Build the rules definition text - */ - appendStringInfo(buf, "CREATE RULE %s AS", - quote_identifier(rulename)); - - if (prettyFlags & PRETTYFLAG_INDENT) - appendStringInfoString(buf, "\n ON "); - else - appendStringInfoString(buf, " ON "); - - /* The event the rule is fired for */ - switch (ev_type) - { - case '1': - appendStringInfoString(buf, "SELECT"); - viewResultDesc = RelationGetDescr(ev_relation); - break; - - case '2': - appendStringInfoString(buf, "UPDATE"); - break; - - case '3': - appendStringInfoString(buf, "INSERT"); - break; - - case '4': - appendStringInfoString(buf, "DELETE"); - break; - - default: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("rule \"%s\" has unsupported event type %d", - rulename, ev_type))); - break; - } - - /* The relation the rule is fired on */ - appendStringInfo(buf, " TO %s", generate_relation_name(ev_class, NIL)); - - /* If the rule has an event qualification, add it */ - if (ev_qual == NULL) - ev_qual = ""; - if (strlen(ev_qual) > 0 && strcmp(ev_qual, "<>") != 0) - { - Node *qual; - Query *query; - deparse_context context; - deparse_namespace dpns; - - if (prettyFlags & PRETTYFLAG_INDENT) - appendStringInfoString(buf, "\n "); - appendStringInfoString(buf, " WHERE "); - - qual = stringToNode(ev_qual); - - /* - * We need to make a context for recognizing any Vars in the qual - * (which can only be references to OLD and NEW). Use the rtable of - * the first query in the action list for this purpose. - */ - query = (Query *) linitial(actions); - - /* - * If the action is INSERT...SELECT, OLD/NEW have been pushed down - * into the SELECT, and that's what we need to look at. (Ugly kluge - * ... try to fix this when we redesign querytrees.) 
- */ - query = getInsertSelectQuery(query, NULL); - - /* Must acquire locks right away; see notes in get_query_def() */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = list_make1(&dpns); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (list_length(query->rtable) != 1); - context.prettyFlags = prettyFlags; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = PRETTYINDENT_STD; - context.special_exprkind = EXPR_KIND_NONE; - - set_deparse_for_query(&dpns, query, NIL); - - get_rule_expr(qual, &context, false); - } - - appendStringInfoString(buf, " DO "); - - /* The INSTEAD keyword (if so) */ - if (is_instead) - appendStringInfoString(buf, "INSTEAD "); - - /* Finally the rules actions */ - if (list_length(actions) > 1) - { - ListCell *action; - Query *query; - - appendStringInfoChar(buf, '('); - foreach(action, actions) - { - query = (Query *) lfirst(action); - get_query_def(query, buf, NIL, viewResultDesc, - prettyFlags, WRAP_COLUMN_DEFAULT, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - if (prettyFlags) - appendStringInfoString(buf, ";\n"); - else - appendStringInfoString(buf, "; "); - } - appendStringInfoString(buf, ");"); - } - else if (list_length(actions) == 0) - { - appendStringInfoString(buf, "NOTHING;"); - } - else - { - Query *query; - - query = (Query *) linitial(actions); - get_query_def(query, buf, NIL, viewResultDesc, - prettyFlags, WRAP_COLUMN_DEFAULT, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - appendStringInfo(buf, ";"); - } - - heap_close(ev_relation, AccessShareLock); -} - - -/* ---------- - * make_viewdef - reconstruct the SELECT part of a - * view rewrite rule - * ---------- - */ -static void -make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags, int wrapColumn) -{// #lizard forgives - Query *query; - char ev_type; - Oid ev_class; - bool is_instead; - char *ev_qual; - char *ev_action; - List *actions = NIL; - Relation ev_relation; - int fno; - Datum dat; - bool isnull; - - /* - * Get the attribute values from the rules tuple - */ - fno = SPI_fnumber(rulettc, "ev_type"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_type = DatumGetChar(dat); - - fno = SPI_fnumber(rulettc, "ev_class"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_class = DatumGetObjectId(dat); - - fno = SPI_fnumber(rulettc, "is_instead"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - is_instead = DatumGetBool(dat); - - /* these could be nulls */ - fno = SPI_fnumber(rulettc, "ev_qual"); - ev_qual = SPI_getvalue(ruletup, rulettc, fno); - - fno = SPI_fnumber(rulettc, "ev_action"); - ev_action = SPI_getvalue(ruletup, rulettc, fno); - if (ev_action != NULL) - actions = (List *) stringToNode(ev_action); - - if (list_length(actions) != 1) - { - /* keep output buffer empty and leave */ - return; - } - - query = (Query *) linitial(actions); - - if (ev_type != '1' || !is_instead || - strcmp(ev_qual, "<>") != 0 || query->commandType != CMD_SELECT) - { - /* keep output buffer empty and leave */ - return; - } - - ev_relation = heap_open(ev_class, AccessShareLock); - - get_query_def(query, buf, NIL, RelationGetDescr(ev_relation), - prettyFlags, wrapColumn, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - appendStringInfo(buf, ";"); - - heap_close(ev_relation, AccessShareLock); -} - -#ifdef PGXC -/* ---------- - * deparse_query - Parse back one query parsetree - * - * Purpose 
of this function is to build up statement for a RemoteQuery - * It just calls get_query_def without pretty print flags - * ---------- - */ -void -deparse_query(Query *query, StringInfo buf, List *parentnamespace, - bool finalise_aggs, bool sortgroup_colno) -{ - get_query_def(query, buf, parentnamespace, NULL, 0, 0, 0, finalise_aggs, - sortgroup_colno); -} - -/* code borrowed from get_insert_query_def */ -void -get_query_def_from_valuesList(Query *query, StringInfo buf) -{// #lizard forgives - - RangeTblEntry *select_rte = NULL; - RangeTblEntry *values_rte = NULL; - RangeTblEntry *rte; - char *sep; - ListCell *values_cell; - ListCell *l; - List *strippedexprs; - deparse_context context; - deparse_namespace dpns; - - /* - * Before we begin to examine the query, acquire locks on referenced - * relations, and fix up deleted columns in JOIN RTEs. This ensures - * consistent results. Note we assume it's OK to scribble on the passed - * querytree! - */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = NIL; - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (list_length(query->rtable) != 1); - context.prettyFlags = 0; - context.indentLevel = 0; - context.wrapColumn = 0; - - dpns.rtable = query->rtable; - dpns.ctes = query->cteList; - dpns.planstate = NULL; - dpns.ancestors = NIL; - dpns.outer_planstate = dpns.inner_planstate = NULL; - - /* - * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be - * a single RTE for the SELECT or VALUES. - */ - foreach(l, query->rtable) - { - rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_SUBQUERY) - { - if (select_rte) - elog(ERROR, "too many subquery RTEs in INSERT"); - select_rte = rte; - } - - if (rte->rtekind == RTE_VALUES) - { - if (values_rte) - elog(ERROR, "too many values RTEs in INSERT"); - values_rte = rte; - } - } - if (select_rte && values_rte) - elog(ERROR, "both subquery and values RTEs in INSERT"); - - /* - * Start the query with INSERT INTO relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - - appendStringInfo(buf, "INSERT INTO %s (", - generate_relation_name(rte->relid, NIL)); - - /* - * Add the insert-column-names list. To handle indirection properly, we - * need to look for indirection nodes in the top targetlist (if it's - * INSERT ... SELECT or INSERT ... single VALUES), or in the first - * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We - * assume that all the expression lists will have similar indirection in - * the latter case. - */ - if (values_rte) - values_cell = list_head((List *) linitial(values_rte->values_lists)); - else - values_cell = NULL; - strippedexprs = NIL; - sep = ""; - foreach(l, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); - if (tle->resjunk || !IsA(tle->expr, Var)) - continue; /* ignore junk entries */ - - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. 
- */ - if (values_cell) - { - /* we discard the stripped expression in this case */ - processIndirection((Node *) lfirst(values_cell), &context); - values_cell = lnext(values_cell); - } - else - { - /* we keep a list of the stripped expressions in this case */ - strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context)); - } - } - appendStringInfo(buf, ") "); - - if (select_rte) - { - /* Add the SELECT */ - get_query_def(select_rte->subquery, buf, NIL, NULL, - context.prettyFlags, context.wrapColumn, - context.indentLevel, - context.finalise_aggs, context.sortgroup_colno); - } - else if (values_rte) - { - /* A WITH clause is possible here */ - get_with_clause(query, &context); - /* Add the multi-VALUES expression lists */ - get_values_def(values_rte->values_lists, &context); - } - else - { - /* A WITH clause is possible here */ - get_with_clause(query, &context); - /* Add the single-VALUES expression list */ - appendContextKeyword(&context, "VALUES (", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - get_rule_expr((Node *) strippedexprs, &context, false); - appendStringInfoChar(buf, ')'); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(&context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, &context, NULL); - } -} -#endif -/* ---------- - * get_query_def - Parse back one query parsetree - * - * If resultDesc is not NULL, then it is the output tuple descriptor for - * the view represented by a SELECT query. - * ---------- - */ -static void -get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, - int prettyFlags, int wrapColumn, int startIndent, - bool finalise_aggs, bool sortgroup_colno) -{// #lizard forgives - deparse_context context; - deparse_namespace dpns; - - /* Guard against excessively long or deeply-nested queries */ - CHECK_FOR_INTERRUPTS(); - check_stack_depth(); - - /* - * Before we begin to examine the query, acquire locks on referenced - * relations, and fix up deleted columns in JOIN RTEs. This ensures - * consistent results. Note we assume it's OK to scribble on the passed - * querytree! - * - * We are only deparsing the query (we are not about to execute it), so we - * only need AccessShareLock on the relations it mentions. 
- */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = lcons(&dpns, list_copy(parentnamespace)); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (parentnamespace != NIL || - list_length(query->rtable) != 1); - context.prettyFlags = prettyFlags; - context.wrapColumn = wrapColumn; - context.indentLevel = startIndent; - context.special_exprkind = EXPR_KIND_NONE; - context.finalise_aggs = finalise_aggs; - context.sortgroup_colno = sortgroup_colno; - - set_deparse_for_query(&dpns, query, parentnamespace); - - switch (query->commandType) - { - case CMD_SELECT: - get_select_query_def(query, &context, resultDesc); - break; - - case CMD_UPDATE: - get_update_query_def(query, &context); - break; - - case CMD_INSERT: - get_insert_query_def(query, &context); - break; - - case CMD_DELETE: - get_delete_query_def(query, &context); - break; - - case CMD_NOTHING: - appendStringInfoString(buf, "NOTHING"); - break; - - case CMD_UTILITY: - get_utility_query_def(query, &context); - break; - - default: - elog(ERROR, "unrecognized query command type: %d", - query->commandType); - break; - } -} - -/* ---------- - * get_values_def - Parse back a VALUES list - * ---------- - */ -static void -get_values_def(List *values_lists, deparse_context *context) -{ - StringInfo buf = context->buf; - bool first_list = true; - ListCell *vtl; - - appendStringInfoString(buf, "VALUES "); - - foreach(vtl, values_lists) - { - List *sublist = (List *) lfirst(vtl); - bool first_col = true; - ListCell *lc; - - if (first_list) - first_list = false; - else - appendStringInfoString(buf, ", "); - - appendStringInfoChar(buf, '('); - foreach(lc, sublist) - { - Node *col = (Node *) lfirst(lc); - - if (first_col) - first_col = false; - else - appendStringInfoChar(buf, ','); - - /* - * Print the value. Whole-row Vars need special treatment. 
- */ - get_rule_expr_toplevel(col, context, false); - } - appendStringInfoChar(buf, ')'); - } -} - -/* ---------- - * get_with_clause - Parse back a WITH clause - * ---------- - */ -static void -get_with_clause(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - if (query->cteList == NIL) - return; - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - - if (query->hasRecursive) - sep = "WITH RECURSIVE "; - else - sep = "WITH "; - foreach(l, query->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(l); - - appendStringInfoString(buf, sep); - appendStringInfoString(buf, quote_identifier(cte->ctename)); - if (cte->aliascolnames) - { - bool first = true; - ListCell *col; - - appendStringInfoChar(buf, '('); - foreach(col, cte->aliascolnames) - { - if (first) - first = false; - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, - quote_identifier(strVal(lfirst(col)))); - } - appendStringInfoChar(buf, ')'); - } - appendStringInfoString(buf, " AS ("); - if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", 0, 0, 0); - get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", 0, 0, 0); - appendStringInfoChar(buf, ')'); - sep = ", "; - } - - if (PRETTY_INDENT(context)) - { - context->indentLevel -= PRETTYINDENT_STD; - appendContextKeyword(context, "", 0, 0, 0); - } - else - appendStringInfoChar(buf, ' '); -} - -/* ---------- - * get_select_query_def - Parse back a SELECT parsetree - * ---------- - */ -static void -get_select_query_def(Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - List *save_windowclause; - List *save_windowtlist; - bool force_colno; - ListCell *l; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* Set up context for possible window functions */ - save_windowclause = context->windowClause; - context->windowClause = query->windowClause; - save_windowtlist = context->windowTList; - context->windowTList = query->targetList; - - /* - * If the Query node has a setOperations tree, then it's the top level of - * a UNION/INTERSECT/EXCEPT query; only the WITH, ORDER BY and LIMIT - * fields are interesting in the top query itself. 
- */ - if (query->setOperations) - { - get_setop_query(query->setOperations, query, context, resultDesc); - /* ORDER BY clauses must be simple in this case */ - force_colno = true; - } - else - { - get_basic_select_query(query, context, resultDesc); - force_colno = false; - } - - /* Add the ORDER BY clause if given */ - if (query->sortClause != NIL) - { - appendContextKeyword(context, " ORDER BY ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_orderby(query->sortClause, query->targetList, - force_colno, context); - } - - /* Add the LIMIT clause if given */ - if (query->limitOffset != NULL) - { - appendContextKeyword(context, " OFFSET ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - get_rule_expr(query->limitOffset, context, false); - } - if (query->limitCount != NULL) - { - appendContextKeyword(context, " LIMIT ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - if (IsA(query->limitCount, Const) && - ((Const *) query->limitCount)->constisnull) - appendStringInfoString(buf, "ALL"); - else - get_rule_expr(query->limitCount, context, false); - } - - /* Add FOR [KEY] UPDATE/SHARE clauses if present */ - if (query->hasForUpdate) - { - foreach(l, query->rowMarks) - { - RowMarkClause *rc = (RowMarkClause *) lfirst(l); - - /* don't print implicit clauses */ - if (rc->pushedDown) - continue; - - switch (rc->strength) - { - case LCS_NONE: - /* we intentionally throw an error for LCS_NONE */ - elog(ERROR, "unrecognized LockClauseStrength %d", - (int) rc->strength); - break; - case LCS_FORKEYSHARE: - appendContextKeyword(context, " FOR KEY SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORSHARE: - appendContextKeyword(context, " FOR SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORNOKEYUPDATE: - appendContextKeyword(context, " FOR NO KEY UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORUPDATE: - appendContextKeyword(context, " FOR UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - } - - appendStringInfo(buf, " OF %s", - quote_identifier(get_rtable_name(rc->rti, - context))); - if (rc->waitPolicy == LockWaitError) - appendStringInfoString(buf, " NOWAIT"); - else if (rc->waitPolicy == LockWaitSkip) - appendStringInfoString(buf, " SKIP LOCKED"); - } - } - - context->windowClause = save_windowclause; - context->windowTList = save_windowtlist; -} - -/* - * Detect whether query looks like SELECT ... FROM VALUES(); - * if so, return the VALUES RTE. Otherwise return NULL. - */ -static RangeTblEntry * -get_simple_values_rte(Query *query) -{// #lizard forgives - RangeTblEntry *result = NULL; - ListCell *lc; - - /* - * We want to return TRUE even if the Query also contains OLD or NEW rule - * RTEs. So the idea is to scan the rtable and see if there is only one - * inFromCl RTE that is a VALUES RTE. - */ - foreach(lc, query->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - - if (rte->rtekind == RTE_VALUES && rte->inFromCl) - { - if (result) - return NULL; /* multiple VALUES (probably not possible) */ - result = rte; - } - else if (rte->rtekind == RTE_RELATION && !rte->inFromCl) - continue; /* ignore rule entries */ - else - return NULL; /* something else -> not simple VALUES */ - } - - /* - * We don't need to check the targetlist in any great detail, because - * parser/analyze.c will never generate a "bare" VALUES RTE --- they only - * appear inside auto-generated sub-queries with very restricted - * structure. 
However, DefineView might have modified the tlist by - * injecting new column aliases; so compare tlist resnames against the - * RTE's names to detect that. - */ - if (result) - { - ListCell *lcn; - - if (list_length(query->targetList) != list_length(result->eref->colnames)) - return NULL; /* this probably cannot happen */ - forboth(lc, query->targetList, lcn, result->eref->colnames) - { - TargetEntry *tle = (TargetEntry *) lfirst(lc); - char *cname = strVal(lfirst(lcn)); - - if (tle->resjunk) - return NULL; /* this probably cannot happen */ - if (tle->resname == NULL || strcmp(tle->resname, cname) != 0) - return NULL; /* column name has been changed */ - } - } - - return result; -} - -static void -get_basic_select_query(Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *values_rte; - char *sep; - ListCell *l; - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - - /* - * If the query looks like SELECT * FROM (VALUES ...), then print just the - * VALUES part. This reverses what transformValuesClause() did at parse - * time. - */ - values_rte = get_simple_values_rte(query); - if (values_rte) - { - get_values_def(values_rte->values_lists, context); - return; - } - - /* - * Build up the query string - first we say SELECT - */ - appendStringInfoString(buf, "SELECT"); - - /* Add the DISTINCT clause if given */ - if (query->distinctClause != NIL) - { - if (query->hasDistinctOn) - { - appendStringInfoString(buf, " DISTINCT ON ("); - sep = ""; - foreach(l, query->distinctClause) - { - SortGroupClause *srt = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(srt->tleSortGroupRef, query->targetList, - false, context); - sep = ", "; - } - appendStringInfoChar(buf, ')'); - } - else - appendStringInfoString(buf, " DISTINCT"); - } - - /* Then we tell what to select (the targetlist) */ - get_target_list(query->targetList, context, resultDesc); - - /* Add the FROM clause if needed */ - get_from_clause(query, " FROM ", context); - - /* Add the WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add the GROUP BY clause if given */ - if (query->groupClause != NULL || query->groupingSets != NULL) - { - ParseExprKind save_exprkind; - - appendContextKeyword(context, " GROUP BY ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - - save_exprkind = context->special_exprkind; - context->special_exprkind = EXPR_KIND_GROUP_BY; - - if (query->groupingSets == NIL) - { - sep = ""; - foreach(l, query->groupClause) - { - SortGroupClause *grp = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(grp->tleSortGroupRef, query->targetList, - false, context); - sep = ", "; - } - } - else - { - sep = ""; - foreach(l, query->groupingSets) - { - GroupingSet *grp = lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_groupingset(grp, query->targetList, true, context); - sep = ", "; - } - } - - context->special_exprkind = save_exprkind; - } - - /* Add the HAVING clause if given */ - if (query->havingQual != NULL) - { - appendContextKeyword(context, " HAVING ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - get_rule_expr(query->havingQual, context, false); - } - - /* Add the WINDOW clause if needed */ - if (query->windowClause != 
NIL) - get_rule_windowclause(query, context); -} - -/* ---------- - * get_target_list - Parse back a SELECT target list - * - * This is also used for RETURNING lists in INSERT/UPDATE/DELETE. - * ---------- - */ -static void -get_target_list(List *targetList, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - StringInfoData targetbuf; - bool last_was_multiline = false; - char *sep; - int colno; - ListCell *l; -#ifdef PGXC - bool no_targetlist = true; -#endif - - /* we use targetbuf to hold each TLE's text temporarily */ - initStringInfo(&targetbuf); - - sep = " "; - colno = 0; - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - char *colname; - char *attname; - - if (tle->resjunk) - continue; /* ignore junk entries */ - -#ifdef PGXC - /* Found at least one element in the target list */ - if (no_targetlist) - no_targetlist = false; -#endif - - appendStringInfoString(buf, sep); - sep = ", "; - colno++; - - /* - * Put the new field text into targetbuf so we can decide after we've - * got it whether or not it needs to go on a new line. - */ - resetStringInfo(&targetbuf); - context->buf = &targetbuf; - - /* - * We special-case Var nodes rather than using get_rule_expr. This is - * needed because get_rule_expr will display a whole-row Var as - * "foo.*", which is the preferred notation in most contexts, but at - * the top level of a SELECT list it's not right (the parser will - * expand that notation into multiple columns, yielding behavior - * different from a whole-row Var). We need to call get_variable - * directly so that we can tell it to do the right thing, and so that - * we can get the attribute name which is the default AS label. - */ - if (tle->expr && (IsA(tle->expr, Var))) - { - attname = get_variable((Var *) tle->expr, 0, true, context); - } - else - { - get_rule_expr((Node *) tle->expr, context, true); - /* We'll show the AS name unless it's this: */ - attname = "?column?"; - } - - /* - * Figure out what the result column should be called. In the context - * of a view, use the view's tuple descriptor (so as to pick up the - * effects of any column RENAME that's been done on the view). - * Otherwise, just use what we can find in the TLE. - */ - if (resultDesc && colno <= resultDesc->natts) - colname = NameStr(resultDesc->attrs[colno - 1]->attname); - else - colname = tle->resname; - - /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ - { - if (attname == NULL || strcmp(attname, colname) != 0) - appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); - } - - /* Restore context's output buffer */ - context->buf = buf; - - /* Consider line-wrapping if enabled */ - if (PRETTY_INDENT(context) && context->wrapColumn >= 0) - { - int leading_nl_pos; - - /* Does the new field start with a new line? 
*/ - if (targetbuf.len > 0 && targetbuf.data[0] == '\n') - leading_nl_pos = 0; - else - leading_nl_pos = -1; - - /* If so, we shouldn't add anything */ - if (leading_nl_pos >= 0) - { - /* instead, remove any trailing spaces currently in buf */ - removeStringInfoSpaces(buf); - } - else - { - char *trailing_nl; - - /* Locate the start of the current line in the output buffer */ - trailing_nl = strrchr(buf->data, '\n'); - if (trailing_nl == NULL) - trailing_nl = buf->data; - else - trailing_nl++; - - /* - * Add a newline, plus some indentation, if the new field is - * not the first and either the new field would cause an - * overflow or the last field used more than one line. - */ - if (colno > 1 && - ((strlen(trailing_nl) + targetbuf.len > context->wrapColumn) || - last_was_multiline)) - appendContextKeyword(context, "", -PRETTYINDENT_STD, - PRETTYINDENT_STD, PRETTYINDENT_VAR); - } - - /* Remember this field's multiline status for next iteration */ - last_was_multiline = - (strchr(targetbuf.data + leading_nl_pos + 1, '\n') != NULL); - } - - /* Add the new field */ - appendStringInfoString(buf, targetbuf.data); - } - -#ifdef PGXC - /* - * Because the empty target list can generate invalid SQL - * clause. Here, just fill a '*' to process a table without - * any columns, this statement will be sent to Datanodes - * and treated correctly on remote nodes. - */ - if (no_targetlist) - appendStringInfo(buf, " *"); -#endif - /* clean up */ - pfree(targetbuf.data); -} - -static void -get_setop_query(Node *setOp, Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - bool need_paren; - - /* Guard against excessively long or deeply-nested queries */ - CHECK_FOR_INTERRUPTS(); - check_stack_depth(); - - if (IsA(setOp, RangeTblRef)) - { - RangeTblRef *rtr = (RangeTblRef *) setOp; - RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); - Query *subquery = rte->subquery; - - Assert(subquery != NULL); - Assert(subquery->setOperations == NULL); - /* Need parens if WITH, ORDER BY, FOR UPDATE, or LIMIT; see gram.y */ - need_paren = (subquery->cteList || - subquery->sortClause || - subquery->rowMarks || - subquery->limitOffset || - subquery->limitCount); - if (need_paren) - appendStringInfoChar(buf, '('); - get_query_def(subquery, buf, context->namespaces, resultDesc, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - if (need_paren) - appendStringInfoChar(buf, ')'); - } - else if (IsA(setOp, SetOperationStmt)) - { - SetOperationStmt *op = (SetOperationStmt *) setOp; - int subindent; - - /* - * We force parens when nesting two SetOperationStmts, except when the - * lefthand input is another setop of the same kind. Syntactically, - * we could omit parens in rather more cases, but it seems best to use - * parens to flag cases where the setop operator changes. If we use - * parens, we also increase the indentation level for the child query. - * - * There are some cases in which parens are needed around a leaf query - * too, but those are more easily handled at the next level down (see - * code above). 
- */ - if (IsA(op->larg, SetOperationStmt)) - { - SetOperationStmt *lop = (SetOperationStmt *) op->larg; - - if (op->op == lop->op && op->all == lop->all) - need_paren = false; - else - need_paren = true; - } - else - need_paren = false; - - if (need_paren) - { - appendStringInfoChar(buf, '('); - subindent = PRETTYINDENT_STD; - appendContextKeyword(context, "", subindent, 0, 0); - } - else - subindent = 0; - - get_setop_query(op->larg, query, context, resultDesc); - - if (need_paren) - appendContextKeyword(context, ") ", -subindent, 0, 0); - else if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", -subindent, 0, 0); - else - appendStringInfoChar(buf, ' '); - - switch (op->op) - { - case SETOP_UNION: - appendStringInfoString(buf, "UNION "); - break; - case SETOP_INTERSECT: - appendStringInfoString(buf, "INTERSECT "); - break; - case SETOP_EXCEPT: - appendStringInfoString(buf, "EXCEPT "); - break; - default: - elog(ERROR, "unrecognized set op: %d", - (int) op->op); - } - if (op->all) - appendStringInfoString(buf, "ALL "); - - /* Always parenthesize if RHS is another setop */ - need_paren = IsA(op->rarg, SetOperationStmt); - - /* - * The indentation code here is deliberately a bit different from that - * for the lefthand input, because we want the line breaks in - * different places. - */ - if (need_paren) - { - appendStringInfoChar(buf, '('); - subindent = PRETTYINDENT_STD; - } - else - subindent = 0; - appendContextKeyword(context, "", subindent, 0, 0); - - get_setop_query(op->rarg, query, context, resultDesc); - - if (PRETTY_INDENT(context)) - context->indentLevel -= subindent; - if (need_paren) - appendContextKeyword(context, ")", 0, 0, 0); - } - else - { - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(setOp)); - } -} - -/* - * Display a sort/group clause. - * - * Also returns the expression tree, so caller need not find it again. - */ -static Node * -get_rule_sortgroupclause(Index ref, List *tlist, bool force_colno, - deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - TargetEntry *tle; - Node *expr; - - tle = get_sortgroupref_tle(ref, tlist); - expr = (Node *) tle->expr; - - /* - * Use column-number form if requested by caller. Otherwise, if - * expression is a constant, force it to be dumped with an explicit cast - * as decoration --- this is because a simple integer constant is - * ambiguous (and will be misinterpreted by findTargetlistEntry()) if we - * dump it without any decoration. If it's anything more complex than a - * simple Var, then force extra parens around it, to ensure it can't be - * misinterpreted as a cube() or rollup() construct. - */ - if (force_colno) - { - Assert(!tle->resjunk); - appendStringInfo(buf, "%d", tle->resno); - } - else if (expr && IsA(expr, Const)) - get_const_expr((Const *) expr, context, 1); - else if (!expr || IsA(expr, Var)) - get_rule_expr(expr, context, true); - else - { - /* - * We must force parens for function-like expressions even if - * PRETTY_PAREN is off, since those are the ones in danger of - * misparsing. For other expressions we need to force them only if - * PRETTY_PAREN is on, since otherwise the expression will output them - * itself. (We can't skip the parens.) 
- */ - bool need_paren = (PRETTY_PAREN(context) - || IsA(expr, FuncExpr) - ||IsA(expr, Aggref) - ||IsA(expr, WindowFunc)); - - if (need_paren) - appendStringInfoString(context->buf, "("); - get_rule_expr(expr, context, true); - if (need_paren) - appendStringInfoString(context->buf, ")"); - } - - return expr; -} - -/* - * Display a GroupingSet - */ -static void -get_rule_groupingset(GroupingSet *gset, List *targetlist, - bool omit_parens, deparse_context *context) -{// #lizard forgives - ListCell *l; - StringInfo buf = context->buf; - bool omit_child_parens = true; - char *sep = ""; - - switch (gset->kind) - { - case GROUPING_SET_EMPTY: - appendStringInfoString(buf, "()"); - return; - - case GROUPING_SET_SIMPLE: - { - if (!omit_parens || list_length(gset->content) != 1) - appendStringInfoString(buf, "("); - - foreach(l, gset->content) - { - Index ref = lfirst_int(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(ref, targetlist, - false, context); - sep = ", "; - } - - if (!omit_parens || list_length(gset->content) != 1) - appendStringInfoString(buf, ")"); - } - return; - - case GROUPING_SET_ROLLUP: - appendStringInfoString(buf, "ROLLUP("); - break; - case GROUPING_SET_CUBE: - appendStringInfoString(buf, "CUBE("); - break; - case GROUPING_SET_SETS: - appendStringInfoString(buf, "GROUPING SETS ("); - omit_child_parens = false; - break; - } - - foreach(l, gset->content) - { - appendStringInfoString(buf, sep); - get_rule_groupingset(lfirst(l), targetlist, omit_child_parens, context); - sep = ", "; - } - - appendStringInfoString(buf, ")"); -} - -/* - * Display an ORDER BY list. - */ -static void -get_rule_orderby(List *orderList, List *targetList, - bool force_colno, deparse_context *context) -{ - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - sep = ""; - foreach(l, orderList) - { - SortGroupClause *srt = (SortGroupClause *) lfirst(l); - Node *sortexpr; - Oid sortcoltype; - TypeCacheEntry *typentry; - - appendStringInfoString(buf, sep); - sortexpr = get_rule_sortgroupclause(srt->tleSortGroupRef, targetList, - force_colno, context); - sortcoltype = exprType(sortexpr); - /* See whether operator is default < or > for datatype */ - typentry = lookup_type_cache(sortcoltype, - TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); - if (srt->sortop == typentry->lt_opr) - { - /* ASC is default, so emit nothing for it */ - if (srt->nulls_first) - appendStringInfoString(buf, " NULLS FIRST"); - } - else if (srt->sortop == typentry->gt_opr) - { - appendStringInfoString(buf, " DESC"); - /* DESC defaults to NULLS FIRST */ - if (!srt->nulls_first) - appendStringInfoString(buf, " NULLS LAST"); - } - else - { - appendStringInfo(buf, " USING %s", - generate_operator_name(srt->sortop, - sortcoltype, - sortcoltype)); - /* be specific to eliminate ambiguity */ - if (srt->nulls_first) - appendStringInfoString(buf, " NULLS FIRST"); - else - appendStringInfoString(buf, " NULLS LAST"); - } - sep = ", "; - } -} - -/* - * Display a WINDOW clause. - * - * Note that the windowClause list might contain only anonymous window - * specifications, in which case we should print nothing here. 
- */ -static void -get_rule_windowclause(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - sep = NULL; - foreach(l, query->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); - - if (wc->name == NULL) - continue; /* ignore anonymous windows */ - - if (sep == NULL) - appendContextKeyword(context, " WINDOW ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - else - appendStringInfoString(buf, sep); - - appendStringInfo(buf, "%s AS ", quote_identifier(wc->name)); - - get_rule_windowspec(wc, query->targetList, context); - - sep = ", "; - } -} - -/* - * Display a window definition - */ -static void -get_rule_windowspec(WindowClause *wc, List *targetList, - deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - bool needspace = false; - const char *sep; - ListCell *l; - - appendStringInfoChar(buf, '('); - if (wc->refname) - { - appendStringInfoString(buf, quote_identifier(wc->refname)); - needspace = true; - } - /* partition clauses are always inherited, so only print if no refname */ - if (wc->partitionClause && !wc->refname) - { - if (needspace) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "PARTITION BY "); - sep = ""; - foreach(l, wc->partitionClause) - { - SortGroupClause *grp = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(grp->tleSortGroupRef, targetList, - false, context); - sep = ", "; - } - needspace = true; - } - /* print ordering clause only if not inherited */ - if (wc->orderClause && !wc->copiedOrder) - { - if (needspace) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "ORDER BY "); - get_rule_orderby(wc->orderClause, targetList, false, context); - needspace = true; - } - /* framing clause is never inherited, so print unless it's default */ - if (wc->frameOptions & FRAMEOPTION_NONDEFAULT) - { - if (needspace) - appendStringInfoChar(buf, ' '); - if (wc->frameOptions & FRAMEOPTION_RANGE) - appendStringInfoString(buf, "RANGE "); - else if (wc->frameOptions & FRAMEOPTION_ROWS) - appendStringInfoString(buf, "ROWS "); - else - Assert(false); - if (wc->frameOptions & FRAMEOPTION_BETWEEN) - appendStringInfoString(buf, "BETWEEN "); - if (wc->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) - appendStringInfoString(buf, "UNBOUNDED PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_START_CURRENT_ROW) - appendStringInfoString(buf, "CURRENT ROW "); - else if (wc->frameOptions & FRAMEOPTION_START_VALUE) - { - get_rule_expr(wc->startOffset, context, false); - if (wc->frameOptions & FRAMEOPTION_START_VALUE_PRECEDING) - appendStringInfoString(buf, " PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_START_VALUE_FOLLOWING) - appendStringInfoString(buf, " FOLLOWING "); - else - Assert(false); - } - else - Assert(false); - if (wc->frameOptions & FRAMEOPTION_BETWEEN) - { - appendStringInfoString(buf, "AND "); - if (wc->frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) - appendStringInfoString(buf, "UNBOUNDED FOLLOWING "); - else if (wc->frameOptions & FRAMEOPTION_END_CURRENT_ROW) - appendStringInfoString(buf, "CURRENT ROW "); - else if (wc->frameOptions & FRAMEOPTION_END_VALUE) - { - get_rule_expr(wc->endOffset, context, false); - if (wc->frameOptions & FRAMEOPTION_END_VALUE_PRECEDING) - appendStringInfoString(buf, " PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_END_VALUE_FOLLOWING) - appendStringInfoString(buf, " FOLLOWING "); - else - Assert(false); - } - else - Assert(false); - } - /* we 
will now have a trailing space; remove it */ - buf->len--; - } - appendStringInfoChar(buf, ')'); -} - -/* ---------- - * get_insert_query_def - Parse back an INSERT parsetree - * ---------- - */ -static void -get_insert_query_def(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *select_rte = NULL; - RangeTblEntry *values_rte = NULL; - RangeTblEntry *rte; - char *sep; - ListCell *l; - List *strippedexprs; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - -#ifdef __TBASE__ - /* - * If query has unshippable triggers, we have to do INSERT on coordinator, - * and we do not need select_rte and values_rte. - * Hence we keep both select_rte and values_rte NULL. - */ - if (!query->hasUnshippableTriggers) - { -#endif - /* - * If it's an INSERT ... SELECT or multi-row VALUES, there will be a - * single RTE for the SELECT or VALUES. Plain VALUES has neither. - */ - foreach(l, query->rtable) - { - rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_SUBQUERY) - { - if (select_rte) - elog(ERROR, "too many subquery RTEs in INSERT"); - select_rte = rte; - } - - if (rte->rtekind == RTE_VALUES) - { - if (values_rte) - elog(ERROR, "too many values RTEs in INSERT"); - values_rte = rte; - } - } -#ifdef __TBASE__ - } -#endif - if (select_rte && values_rte) - elog(ERROR, "both subquery and values RTEs in INSERT"); - - /* - * Start the query with INSERT INTO relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - appendStringInfo(buf, "INSERT INTO %s ", - generate_relation_name(rte->relid, NIL)); - /* INSERT requires AS keyword for target alias */ - if (rte->alias != NULL) - appendStringInfo(buf, "AS %s ", - quote_identifier(rte->alias->aliasname)); - - /* - * Add the insert-column-names list. Any indirection decoration needed on - * the column names can be inferred from the top targetlist. - */ - strippedexprs = NIL; - sep = ""; - if (query->targetList) - appendStringInfoChar(buf, '('); - foreach(l, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - if (tle->resjunk) - continue; /* ignore junk entries */ - - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. - * Add the stripped expressions to strippedexprs. (If it's a - * single-VALUES statement, the stripped expressions are the VALUES to - * print below. Otherwise they're just Vars and not really - * interesting.) 
- */ - strippedexprs = lappend(strippedexprs, - processIndirection((Node *) tle->expr, - context)); - } - if (query->targetList) - appendStringInfoString(buf, ") "); - - if (query->override) - { - if (query->override == OVERRIDING_SYSTEM_VALUE) - appendStringInfoString(buf, "OVERRIDING SYSTEM VALUE "); - else if (query->override == OVERRIDING_USER_VALUE) - appendStringInfoString(buf, "OVERRIDING USER VALUE "); - } - - if (select_rte) - { - /* Add the SELECT */ - get_query_def(select_rte->subquery, buf, NIL, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - } - else if (values_rte) - { - /* Add the multi-VALUES expression lists */ - get_values_def(values_rte->values_lists, context); - } - else if (strippedexprs) - { - /* Add the single-VALUES expression list */ - appendContextKeyword(context, "VALUES (", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - get_rule_expr((Node *) strippedexprs, context, false); - appendStringInfoChar(buf, ')'); - } - else - { - /* No expressions, so it must be DEFAULT VALUES */ - appendStringInfoString(buf, "DEFAULT VALUES"); - } - - /* Add ON CONFLICT if present */ - if (query->onConflict) - { - OnConflictExpr *confl = query->onConflict; - - appendStringInfoString(buf, " ON CONFLICT"); - - if (confl->arbiterElems) - { - /* Add the single-VALUES expression list */ - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) confl->arbiterElems, context, false); - appendStringInfoChar(buf, ')'); - - /* Add a WHERE clause (for partial indexes) if given */ - if (confl->arbiterWhere != NULL) - { - bool save_varprefix; - - /* - * Force non-prefixing of Vars, since parser assumes that they - * belong to target relation. WHERE clause does not use - * InferenceElem, so this is separately required. 
- */ - save_varprefix = context->varprefix; - context->varprefix = false; - - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(confl->arbiterWhere, context, false); - - context->varprefix = save_varprefix; - } - } - else if (OidIsValid(confl->constraint)) - { - char *constraint = get_constraint_name(confl->constraint); - - if (!constraint) - elog(ERROR, "cache lookup failed for constraint %u", - confl->constraint); - appendStringInfo(buf, " ON CONSTRAINT %s", - quote_identifier(constraint)); - } - - if (confl->action == ONCONFLICT_NOTHING) - { - appendStringInfoString(buf, " DO NOTHING"); - } - else - { - appendStringInfoString(buf, " DO UPDATE SET "); - /* Deparse targetlist */ - get_update_query_targetlist_def(query, confl->onConflictSet, - context, rte); - - /* Add a WHERE clause if given */ - if (confl->onConflictWhere != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(confl->onConflictWhere, context, false); - } - } - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_update_query_def - Parse back an UPDATE parsetree - * ---------- - */ -static void -get_update_query_def(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - RangeTblEntry *rte; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* - * Start the query with UPDATE relname SET - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - if (PRETTY_INDENT(context)) - { - appendStringInfoChar(buf, ' '); - context->indentLevel += PRETTYINDENT_STD; - } - appendStringInfo(buf, "UPDATE %s%s", - only_marker(rte), - generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); - appendStringInfoString(buf, " SET "); - - /* Deparse targetlist */ - get_update_query_targetlist_def(query, query->targetList, context, rte); - - /* Add the FROM clause if needed */ - get_from_clause(query, " FROM ", context); - - /* Add a WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_update_query_targetlist_def - Parse back an UPDATE targetlist - * ---------- - */ -static void -get_update_query_targetlist_def(Query *query, List *targetList, - deparse_context *context, RangeTblEntry *rte) -{// #lizard forgives - StringInfo buf = context->buf; - ListCell *l; - ListCell *next_ma_cell; - int remaining_ma_columns; - const char *sep; - SubLink *cur_ma_sublink; - List *ma_sublinks; - - /* - * Prepare to deal with MULTIEXPR assignments: collect the source SubLinks - * into a list. We expect them to appear, in ID order, in resjunk tlist - * entries. 
- */ - ma_sublinks = NIL; - if (query->hasSubLinks) /* else there can't be any */ - { - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - if (tle->resjunk && IsA(tle->expr, SubLink)) - { - SubLink *sl = (SubLink *) tle->expr; - - if (sl->subLinkType == MULTIEXPR_SUBLINK) - { - ma_sublinks = lappend(ma_sublinks, sl); - Assert(sl->subLinkId == list_length(ma_sublinks)); - } - } - } - } - next_ma_cell = list_head(ma_sublinks); - cur_ma_sublink = NULL; - remaining_ma_columns = 0; - - /* Add the comma separated list of 'attname = value' */ - sep = ""; - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - Node *expr; - - if (tle->resjunk) - continue; /* ignore junk entries */ - - /* Emit separator (OK whether we're in multiassignment or not) */ - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Check to see if we're starting a multiassignment group: if so, - * output a left paren. - */ - if (next_ma_cell != NULL && cur_ma_sublink == NULL) - { - /* - * We must dig down into the expr to see if it's a PARAM_MULTIEXPR - * Param. That could be buried under FieldStores and ArrayRefs - * and CoerceToDomains (cf processIndirection()), and underneath - * those there could be an implicit type coercion. Because we - * would ignore implicit type coercions anyway, we don't need to - * be as careful as processIndirection() is about descending past - * implicit CoerceToDomains. - */ - expr = (Node *) tle->expr; - while (expr) - { - if (IsA(expr, FieldStore)) - { - FieldStore *fstore = (FieldStore *) expr; - - expr = (Node *) linitial(fstore->newvals); - } - else if (IsA(expr, ArrayRef)) - { - ArrayRef *aref = (ArrayRef *) expr; - - if (aref->refassgnexpr == NULL) - break; - expr = (Node *) aref->refassgnexpr; - } - else if (IsA(expr, CoerceToDomain)) - { - CoerceToDomain *cdomain = (CoerceToDomain *) expr; - - if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) - break; - expr = (Node *) cdomain->arg; - } - else - break; - } - expr = strip_implicit_coercions(expr); - - if (expr && IsA(expr, Param) && - ((Param *) expr)->paramkind == PARAM_MULTIEXPR) - { - cur_ma_sublink = (SubLink *) lfirst(next_ma_cell); - next_ma_cell = lnext(next_ma_cell); - remaining_ma_columns = count_nonjunk_tlist_entries( - ((Query *) cur_ma_sublink->subselect)->targetList); - Assert(((Param *) expr)->paramid == - ((cur_ma_sublink->subLinkId << 16) | 1)); - appendStringInfoChar(buf, '('); - } - } - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. - */ - expr = processIndirection((Node *) tle->expr, context); - - /* - * If we're in a multiassignment, skip printing anything more, unless - * this is the last column; in which case, what we print should be the - * sublink, not the Param. 
- */ - if (cur_ma_sublink != NULL) - { - if (--remaining_ma_columns > 0) - continue; /* not the last column of multiassignment */ - appendStringInfoChar(buf, ')'); - expr = (Node *) cur_ma_sublink; - cur_ma_sublink = NULL; - } - - appendStringInfoString(buf, " = "); - - get_rule_expr(expr, context, false); - } -} - - -/* ---------- - * get_delete_query_def - Parse back a DELETE parsetree - * ---------- - */ -static void -get_delete_query_def(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - RangeTblEntry *rte; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* - * Start the query with DELETE FROM relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - if (PRETTY_INDENT(context)) - { - appendStringInfoChar(buf, ' '); - context->indentLevel += PRETTYINDENT_STD; - } - appendStringInfo(buf, "DELETE FROM %s%s", - only_marker(rte), - generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); - - /* Add the USING clause if given */ - get_from_clause(query, " USING ", context); - - /* Add a WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_utility_query_def - Parse back a UTILITY parsetree - * ---------- - */ -static void -get_utility_query_def(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - - if (query->utilityStmt && IsA(query->utilityStmt, NotifyStmt)) - { - NotifyStmt *stmt = (NotifyStmt *) query->utilityStmt; - - appendContextKeyword(context, "", - 0, PRETTYINDENT_STD, 1); - appendStringInfo(buf, "NOTIFY %s", - quote_identifier(stmt->conditionname)); - if (stmt->payload) - { - appendStringInfoString(buf, ", "); - simple_quote_literal(buf, stmt->payload); - } - } -#ifdef PGXC - else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt)) - { - CreateStmt *stmt = (CreateStmt *) query->utilityStmt; - ListCell *column; - const char *delimiter = ""; - RangeVar *relation = stmt->relation; - bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP); - bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED); - - appendStringInfo(buf, "CREATE %s %s %s TABLE %s ", - stmt->islocal ? "LOCAL" : "", - istemp ? "TEMP" : "", - isunlogged ? "UNLOGGED" : "", - stmt->if_not_exists ? 
"IF NOT EXISTS " : ""); - - if (!istemp && relation->schemaname && relation->schemaname[0]) - appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname)); - appendStringInfo(buf, "%s", quote_identifier(relation->relname)); - - appendStringInfo(buf, "("); - foreach(column, stmt->tableElts) - { - Node *node = (Node *) lfirst(column); - - appendStringInfo(buf, "%s", delimiter); - delimiter = ", "; - - if (IsA(node, ColumnDef)) - { - ColumnDef *coldef = (ColumnDef *) node; - TypeName *typename = coldef->typeName; -#ifdef XCP - appendStringInfo(buf, "%s %s", - quote_identifier(coldef->colname), - format_type_with_typemod(typename->typeOid, - typename->typemod)); -#else - - /* error out if we have no recourse at all */ - if (!OidIsValid(typename->typeOid)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("improper type oid: \"%u\"", typename->typeOid))); - - /* get typename from the oid */ - type = typeidType(typename->typeOid); - - if (!HeapTupleIsValid(type)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("type \"%u\" does not exist", - typename->typeOid))); - appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname), - typeTypeName(type)); - ReleaseSysCache(type); -#endif - } - else - elog(ERROR, "Invalid table column definition."); - } - appendStringInfo(buf, ")"); - - /* Append storage parameters, like for instance WITH (OIDS) */ - if (list_length(stmt->options) > 0) - { - Datum reloptions; - static char *validnsps[] = HEAP_RELOPT_NAMESPACES; - - reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, - false, false); - - if (reloptions) - { - Datum sep, txt; - /* Below is inspired from flatten_reloptions() */ - sep = CStringGetTextDatum(", "); - txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep); - appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt)); - } - } - - /* add the on commit clauses for temporary tables */ - switch (stmt->oncommit) - { - case ONCOMMIT_NOOP: - /* do nothing */ - break; - - case ONCOMMIT_PRESERVE_ROWS: - appendStringInfo(buf, " ON COMMIT PRESERVE ROWS"); - break; - - case ONCOMMIT_DELETE_ROWS: - appendStringInfo(buf, " ON COMMIT DELETE ROWS"); - break; - - case ONCOMMIT_DROP: - appendStringInfo(buf, " ON COMMIT DROP"); - break; - } - - if (stmt->distributeby) - { - /* add the on commit clauses for temporary tables */ - switch (stmt->distributeby->disttype) - { - case DISTTYPE_REPLICATION: - appendStringInfo(buf, " DISTRIBUTE BY REPLICATION"); - break; - - case DISTTYPE_HASH: -#ifdef __COLD_HOT__ - appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", strVal(linitial(stmt->distributeby->colname))); -#else - appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname); -#endif - break; - - case DISTTYPE_ROUNDROBIN: - appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN"); - break; - - case DISTTYPE_MODULO: -#ifdef __COLD_HOT__ - appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", - quote_identifier(strVal(linitial(stmt->distributeby->colname)))); -#else - appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", - quote_identifier(stmt->distributeby->colname)); -#endif - break; - - default: - ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("Invalid distribution type"))); - - } - } - - if (stmt->subcluster) - { - ListCell *cell; - - switch (stmt->subcluster->clustertype) - { - case SUBCLUSTER_NODE: - appendStringInfo(buf, " TO NODE ("); - - /* Add node members */ - Assert(stmt->subcluster->members); - foreach(cell, stmt->subcluster->members) - { - appendStringInfo(buf, " %s", - 
quote_identifier(strVal(lfirst(cell)))); - if (cell->next) - appendStringInfo(buf, ","); - } - appendStringInfo(buf, ")"); - break; - - case SUBCLUSTER_GROUP: - appendStringInfo(buf, " TO GROUP"); - - /* Add group members */ - Assert(stmt->subcluster->members); - foreach(cell, stmt->subcluster->members) - { - appendStringInfo(buf, " %s", - quote_identifier(strVal(lfirst(cell)))); - if (cell->next) - appendStringInfo(buf, ","); - } - break; - - case SUBCLUSTER_NONE: - default: - /* Nothing to do */ - break; - } - } - } -#endif - else - { - /* Currently only NOTIFY utility commands can appear in rules */ - elog(ERROR, "unexpected utility statement type"); - } -} - -/* - * Display a Var appropriately. - * - * In some cases (currently only when recursing into an unnamed join) - * the Var's varlevelsup has to be interpreted with respect to a context - * above the current one; levelsup indicates the offset. - * - * If istoplevel is TRUE, the Var is at the top level of a SELECT's - * targetlist, which means we need special treatment of whole-row Vars. - * Instead of the normal "tab.*", we'll print "tab.*::typename", which is a - * dirty hack to prevent "tab.*" from being expanded into multiple columns. - * (The parser will strip the useless coercion, so no inefficiency is added in - * dump and reload.) We used to print just "tab" in such cases, but that is - * ambiguous and will yield the wrong result if "tab" is also a plain column - * name in the query. - * - * Returns the attname of the Var, or NULL if the Var has no attname (because - * it is a whole-row Var or a subplan output reference). - */ -static char * -get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *rte; - AttrNumber attnum; - int netlevelsup; - deparse_namespace *dpns; - deparse_columns *colinfo; - char *refname; - char *attname; - - /* Find appropriate nesting depth */ - netlevelsup = var->varlevelsup + levelsup; - if (netlevelsup >= list_length(context->namespaces)) - elog(ERROR, "bogus varlevelsup: %d offset %d", - var->varlevelsup, levelsup); - dpns = (deparse_namespace *) list_nth(context->namespaces, - netlevelsup); - - /* - * Try to find the relevant RTE in this rtable. In a plan tree, it's - * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig - * down into the subplans, or INDEX_VAR, which is resolved similarly. Also - * find the aliases previously assigned for this RTE. - */ - if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) - { - rte = rt_fetch(var->varno, dpns->rtable); - refname = (char *) list_nth(dpns->rtable_names, var->varno - 1); - colinfo = deparse_columns_fetch(var->varno, dpns); - attnum = var->varattno; - } - else - { - resolve_special_varno((Node *) var, context, NULL, - get_special_variable); - return NULL; - } - - /* - * The planner will sometimes emit Vars referencing resjunk elements of a - * subquery's target list (this is currently only possible if it chooses - * to generate a "physical tlist" for a SubqueryScan or CteScan node). - * Although we prefer to print subquery-referencing Vars using the - * subquery's alias, that's not possible for resjunk items since they have - * no alias. So in that case, drill down to the subplan and print the - * contents of the referenced tlist item. This works because in a plan - * tree, such Vars can only occur in a SubqueryScan or CteScan node, and - * we'll have set dpns->inner_planstate to reference the child plan node. 
- */ - if ((rte->rtekind == RTE_SUBQUERY || rte->rtekind == RTE_CTE) && - attnum > list_length(rte->eref->colnames) && - dpns->inner_planstate) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "invalid attnum %d for relation \"%s\"", - var->varattno, rte->eref->aliasname); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - /* - * Force parentheses because our caller probably assumed a Var is a - * simple expression. - */ - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tle->expr, context, true); - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, ')'); - - pop_child_plan(dpns, &save_dpns); - return NULL; - } - -#ifdef PGXC - if (rte->rtekind == RTE_REMOTE_DUMMY && - attnum > list_length(rte->eref->colnames) && - dpns->planstate) - { - TargetEntry *tle; - RemoteQuery *rqplan; - Assert(IsA(dpns->planstate, RemoteQueryState)); - Assert(netlevelsup == 0); - - /* - * Get the expression representing the given Var from base_tlist of the - * RemoteQuery - */ - rqplan = (RemoteQuery *)dpns->planstate->plan; - Assert(IsA(rqplan, RemoteQuery)); - tle = get_tle_by_resno(rqplan->base_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for remotequery var: %d", var->varattno); - /* - * Force parentheses because our caller probably assumed a Var is a - * simple expression. - */ - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tle->expr, context, true); - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, ')'); - - return NULL; - } -#endif /* PGXC */ - - /* - * If it's an unnamed join, look at the expansion of the alias variable. - * If it's a simple reference to one of the input vars, then recursively - * print the name of that var instead. When it's not a simple reference, - * we have to just print the unqualified join column name. (This can only - * happen with "dangerous" merged columns in a JOIN USING; we took pains - * previously to make the unqualified column name unique in such cases.) - * - * This wouldn't work in decompiling plan trees, because we don't store - * joinaliasvars lists after planning; but a plan tree should never - * contain a join alias variable. - */ - if (rte->rtekind == RTE_JOIN && rte->alias == NULL) - { - if (rte->joinaliasvars == NIL) - elog(ERROR, "cannot decompile join alias var in plan tree"); - if (attnum > 0) - { - Var *aliasvar; - - aliasvar = (Var *) list_nth(rte->joinaliasvars, attnum - 1); - /* we intentionally don't strip implicit coercions here */ - if (aliasvar && IsA(aliasvar, Var)) - { - return get_variable(aliasvar, var->varlevelsup + levelsup, - istoplevel, context); - } - } - - /* - * Unnamed join has no refname. (Note: since it's unnamed, there is - * no way the user could have referenced it to create a whole-row Var - * for it. So we don't have to cover that case below.) - */ - Assert(refname == NULL); - } - - if (attnum == InvalidAttrNumber) - attname = NULL; - else if (attnum > 0) - { - /* Get column name to use from the colinfo struct */ - if (attnum > colinfo->num_cols) - elog(ERROR, "invalid attnum %d for relation \"%s\"", - attnum, rte->eref->aliasname); - attname = colinfo->colnames[attnum - 1]; - if (attname == NULL) /* dropped column? 
*/ - elog(ERROR, "invalid attnum %d for relation \"%s\"", - attnum, rte->eref->aliasname); - } - else - { - /* System column - name is fixed, get it from the catalog */ - attname = get_rte_attribute_name(rte, attnum); - } - - if (refname && (context->varprefix || attname == NULL)) - { - appendStringInfoString(buf, quote_identifier(refname)); - appendStringInfoChar(buf, '.'); - } - if (attname) - appendStringInfoString(buf, quote_identifier(attname)); - else - { - appendStringInfoChar(buf, '*'); - if (istoplevel) - appendStringInfo(buf, "::%s", - format_type_with_typemod(var->vartype, - var->vartypmod)); - } - - return attname; -} - -/* - * Deparse a Var which references OUTER_VAR, INNER_VAR, or INDEX_VAR. This - * routine is actually a callback for get_special_varno, which handles finding - * the correct TargetEntry. We get the expression contained in that - * TargetEntry and just need to deparse it, a job we can throw back on - * get_rule_expr. - */ -static void -get_special_variable(Node *node, deparse_context *context, void *private) -{ - StringInfo buf = context->buf; - - /* - * Force parentheses because our caller probably assumed a Var is a simple - * expression. - */ - if (!IsA(node, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr(node, context, true); - if (!IsA(node, Var)) - appendStringInfoChar(buf, ')'); -} - -/* - * Chase through plan references to special varnos (OUTER_VAR, INNER_VAR, - * INDEX_VAR) until we find a real Var or some kind of non-Var node; then, - * invoke the callback provided. - */ -static void -resolve_special_varno(Node *node, deparse_context *context, void *private, - void (*callback) (Node *, deparse_context *, void *)) -{// #lizard forgives - Var *var; - deparse_namespace *dpns; - - /* If it's not a Var, invoke the callback. */ - if (!IsA(node, Var)) - { - callback(node, context, private); - return; - } - - /* Find appropriate nesting depth */ - var = (Var *) node; - dpns = (deparse_namespace *) list_nth(context->namespaces, - var->varlevelsup); - - /* - * It's a special RTE, so recurse. - */ - if (var->varno == OUTER_VAR && dpns->outer_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); - - push_child_plan(dpns, dpns->outer_planstate, &save_dpns); - resolve_special_varno((Node *) tle->expr, context, private, callback); - pop_child_plan(dpns, &save_dpns); - return; - } - else if (var->varno == INNER_VAR && dpns->inner_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); - - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - resolve_special_varno((Node *) tle->expr, context, private, callback); - pop_child_plan(dpns, &save_dpns); - return; - } - else if (var->varno == INDEX_VAR && dpns->index_tlist) - { - TargetEntry *tle; - - tle = get_tle_by_resno(dpns->index_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); - - resolve_special_varno((Node *) tle->expr, context, private, callback); - return; - } - else if (var->varno < 1 || var->varno > list_length(dpns->rtable)) - elog(ERROR, "bogus varno: %d", var->varno); - - /* Not special. Just invoke the callback. */ - callback(node, context, private); -} - -/* - * Get the name of a field of an expression of composite type. 
The - * expression is usually a Var, but we handle other cases too. - * - * levelsup is an extra offset to interpret the Var's varlevelsup correctly. - * - * This is fairly straightforward when the expression has a named composite - * type; we need only look up the type in the catalogs. However, the type - * could also be RECORD. Since no actual table or view column is allowed to - * have type RECORD, a Var of type RECORD must refer to a JOIN or FUNCTION RTE - * or to a subquery output. We drill down to find the ultimate defining - * expression and attempt to infer the field name from it. We ereport if we - * can't determine the name. - * - * Similarly, a PARAM of type RECORD has to refer to some expression of - * a determinable composite type. - */ -static const char * -get_name_for_var_field(Var *var, int fieldno, - int levelsup, deparse_context *context) -{// #lizard forgives - RangeTblEntry *rte; - AttrNumber attnum; - int netlevelsup; - deparse_namespace *dpns; - TupleDesc tupleDesc; - Node *expr; - - /* - * If it's a RowExpr that was expanded from a whole-row Var, use the - * column names attached to it. - */ - if (IsA(var, RowExpr)) - { - RowExpr *r = (RowExpr *) var; - - if (fieldno > 0 && fieldno <= list_length(r->colnames)) - return strVal(list_nth(r->colnames, fieldno - 1)); - } - - /* - * If it's a Param of type RECORD, try to find what the Param refers to. - */ - if (IsA(var, Param)) - { - Param *param = (Param *) var; - ListCell *ancestor_cell; - - expr = find_param_referent(param, context, &dpns, &ancestor_cell); - if (expr) - { - /* Found a match, so recurse to decipher the field name */ - deparse_namespace save_dpns; - const char *result; - - push_ancestor_plan(dpns, ancestor_cell, &save_dpns); - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - pop_ancestor_plan(dpns, &save_dpns); - return result; - } - } - - /* - * If it's a Var of type RECORD, we have to find what the Var refers to; - * if not, we can use get_expr_result_type. If that fails, we try - * lookup_rowtype_tupdesc, which will probably fail too, but will ereport - * an acceptable message. - */ - if (!IsA(var, Var) || - var->vartype != RECORDOID) - { - if (get_expr_result_type((Node *) var, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) - tupleDesc = lookup_rowtype_tupdesc_copy(exprType((Node *) var), - exprTypmod((Node *) var)); - Assert(tupleDesc); - /* Got the tupdesc, so we can extract the field name */ - Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); - return NameStr(tupleDesc->attrs[fieldno - 1]->attname); - } - - /* Find appropriate nesting depth */ - netlevelsup = var->varlevelsup + levelsup; - if (netlevelsup >= list_length(context->namespaces)) - elog(ERROR, "bogus varlevelsup: %d offset %d", - var->varlevelsup, levelsup); - dpns = (deparse_namespace *) list_nth(context->namespaces, - netlevelsup); - - /* - * Try to find the relevant RTE in this rtable. In a plan tree, it's - * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig - * down into the subplans, or INDEX_VAR, which is resolved similarly. 
- */ - if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) - { - rte = rt_fetch(var->varno, dpns->rtable); - attnum = var->varattno; - } - else if (var->varno == OUTER_VAR && dpns->outer_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->outer_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - else if (var->varno == INNER_VAR && dpns->inner_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - else if (var->varno == INDEX_VAR && dpns->index_tlist) - { - TargetEntry *tle; - const char *result; - - tle = get_tle_by_resno(dpns->index_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - return result; - } - else - { - elog(ERROR, "bogus varno: %d", var->varno); - return NULL; /* keep compiler quiet */ - } - - if (attnum == InvalidAttrNumber) - { - /* Var is whole-row reference to RTE, so select the right field */ - return get_rte_attribute_name(rte, fieldno); - } - - /* - * This part has essentially the same logic as the parser's - * expandRecordVariable() function, but we are dealing with a different - * representation of the input context, and we only need one field name - * not a TupleDesc. Also, we need special cases for finding subquery and - * CTE subplans when deparsing Plan trees. - */ - expr = (Node *) var; /* default if we can't drill down */ - - switch (rte->rtekind) - { - case RTE_RELATION: - case RTE_VALUES: - case RTE_NAMEDTUPLESTORE: - - /* - * This case should not occur: a column of a table or values list - * shouldn't have type RECORD. Fall through and fail (most - * likely) at the bottom. - */ - break; - case RTE_SUBQUERY: - /* Subselect-in-FROM: examine sub-select's output expr */ - { - if (rte->subquery) - { - TargetEntry *ste = get_tle_by_resno(rte->subquery->targetList, - attnum); - - if (ste == NULL || ste->resjunk) - elog(ERROR, "subquery %s does not have attribute %d", - rte->eref->aliasname, attnum); - expr = (Node *) ste->expr; - if (IsA(expr, Var)) - { - /* - * Recurse into the sub-select to see what its Var - * refers to. We have to build an additional level of - * namespace to keep in step with varlevelsup in the - * subselect. 
- */ - deparse_namespace mydpns; - const char *result; - - set_deparse_for_query(&mydpns, rte->subquery, - context->namespaces); - - context->namespaces = lcons(&mydpns, - context->namespaces); - - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - - context->namespaces = - list_delete_first(context->namespaces); - - return result; - } - /* else fall through to inspect the expression */ - } - else - { - /* - * We're deparsing a Plan tree so we don't have complete - * RTE entries (in particular, rte->subquery is NULL). But - * the only place we'd see a Var directly referencing a - * SUBQUERY RTE is in a SubqueryScan plan node, and we can - * look into the child plan's tlist instead. - */ - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - if (!dpns->inner_planstate) - elog(ERROR, "failed to find plan for subquery %s", - rte->eref->aliasname); - tle = get_tle_by_resno(dpns->inner_tlist, attnum); - if (!tle) - elog(ERROR, "bogus varattno for subquery var: %d", - attnum); - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - } - break; - case RTE_JOIN: - /* Join RTE --- recursively inspect the alias variable */ - if (rte->joinaliasvars == NIL) - elog(ERROR, "cannot decompile join alias var in plan tree"); - Assert(attnum > 0 && attnum <= list_length(rte->joinaliasvars)); - expr = (Node *) list_nth(rte->joinaliasvars, attnum - 1); - Assert(expr != NULL); - /* we intentionally don't strip implicit coercions here */ - if (IsA(expr, Var)) - return get_name_for_var_field((Var *) expr, fieldno, - var->varlevelsup + levelsup, - context); - /* else fall through to inspect the expression */ - break; - case RTE_FUNCTION: - case RTE_TABLEFUNC: - - /* - * We couldn't get here unless a function is declared with one of - * its result columns as RECORD, which is not allowed. - */ - break; - case RTE_CTE: - /* CTE reference: examine subquery's output expr */ - { - CommonTableExpr *cte = NULL; - Index ctelevelsup; - ListCell *lc; - - /* - * Try to find the referenced CTE using the namespace stack. - */ - ctelevelsup = rte->ctelevelsup + netlevelsup; - if (ctelevelsup >= list_length(context->namespaces)) - lc = NULL; - else - { - deparse_namespace *ctedpns; - - ctedpns = (deparse_namespace *) - list_nth(context->namespaces, ctelevelsup); - foreach(lc, ctedpns->ctes) - { - cte = (CommonTableExpr *) lfirst(lc); - if (strcmp(cte->ctename, rte->ctename) == 0) - break; - } - } - if (lc != NULL) - { - Query *ctequery = (Query *) cte->ctequery; - TargetEntry *ste = get_tle_by_resno(GetCTETargetList(cte), - attnum); - - if (ste == NULL || ste->resjunk) - elog(ERROR, "subquery %s does not have attribute %d", - rte->eref->aliasname, attnum); - expr = (Node *) ste->expr; - if (IsA(expr, Var)) - { - /* - * Recurse into the CTE to see what its Var refers to. - * We have to build an additional level of namespace - * to keep in step with varlevelsup in the CTE. - * Furthermore it could be an outer CTE, so we may - * have to delete some levels of namespace. 
- */ - List *save_nslist = context->namespaces; - List *new_nslist; - deparse_namespace mydpns; - const char *result; - - set_deparse_for_query(&mydpns, ctequery, - context->namespaces); - - new_nslist = list_copy_tail(context->namespaces, - ctelevelsup); - context->namespaces = lcons(&mydpns, new_nslist); - - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - - context->namespaces = save_nslist; - - return result; - } - /* else fall through to inspect the expression */ - } - else - { - /* - * We're deparsing a Plan tree so we don't have a CTE - * list. But the only place we'd see a Var directly - * referencing a CTE RTE is in a CteScan plan node, and we - * can look into the subplan's tlist instead. - */ - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - if (!dpns->inner_planstate) - elog(ERROR, "failed to find plan for CTE %s", - rte->eref->aliasname); - tle = get_tle_by_resno(dpns->inner_tlist, attnum); - if (!tle) - elog(ERROR, "bogus varattno for subquery var: %d", - attnum); - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - } - break; -#ifdef PGXC - case RTE_REMOTE_DUMMY: - elog(ERROR, "Invalid RTE found"); - break; -#endif /* PGXC */ - } - - /* - * We now have an expression we can't expand any more, so see if - * get_expr_result_type() can do anything with it. If not, pass to - * lookup_rowtype_tupdesc() which will probably fail, but will give an - * appropriate error message while failing. - */ - if (get_expr_result_type(expr, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) - tupleDesc = lookup_rowtype_tupdesc_copy(exprType(expr), - exprTypmod(expr)); - Assert(tupleDesc); - /* Got the tupdesc, so we can extract the field name */ - Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); - return NameStr(tupleDesc->attrs[fieldno - 1]->attname); -} - -/* - * Try to find the referenced expression for a PARAM_EXEC Param that might - * reference a parameter supplied by an upper NestLoop or SubPlan plan node. - * - * If successful, return the expression and set *dpns_p and *ancestor_cell_p - * appropriately for calling push_ancestor_plan(). If no referent can be - * found, return NULL. - */ -static Node * -find_param_referent(Param *param, deparse_context *context, - deparse_namespace **dpns_p, ListCell **ancestor_cell_p) -{// #lizard forgives - /* Initialize output parameters to prevent compiler warnings */ - *dpns_p = NULL; - *ancestor_cell_p = NULL; - - /* - * If it's a PARAM_EXEC parameter, look for a matching NestLoopParam or - * SubPlan argument. This will necessarily be in some ancestor of the - * current expression's PlanState. - */ - if (param->paramkind == PARAM_EXEC) - { - deparse_namespace *dpns; - PlanState *child_ps; - bool in_same_plan_level; - ListCell *lc; - - dpns = (deparse_namespace *) linitial(context->namespaces); - child_ps = dpns->planstate; - in_same_plan_level = true; - - foreach(lc, dpns->ancestors) - { - PlanState *ps = (PlanState *) lfirst(lc); - ListCell *lc2; - - /* - * NestLoops transmit params to their inner child only; also, once - * we've crawled up out of a subplan, this couldn't possibly be - * the right match. 
- */ - if (IsA(ps, NestLoopState) && - child_ps == innerPlanState(ps) && - in_same_plan_level) - { - NestLoop *nl = (NestLoop *) ps->plan; - - foreach(lc2, nl->nestParams) - { - NestLoopParam *nlp = (NestLoopParam *) lfirst(lc2); - - if (nlp->paramno == param->paramid) - { - /* Found a match, so return it */ - *dpns_p = dpns; - *ancestor_cell_p = lc; - return (Node *) nlp->paramval; - } - } - } - - /* - * Check to see if we're crawling up from a subplan. - */ - foreach(lc2, ps->subPlan) - { - SubPlanState *sstate = (SubPlanState *) lfirst(lc2); - SubPlan *subplan = sstate->subplan; - ListCell *lc3; - ListCell *lc4; - - if (child_ps != sstate->planstate) - continue; - - /* Matched subplan, so check its arguments */ - forboth(lc3, subplan->parParam, lc4, subplan->args) - { - int paramid = lfirst_int(lc3); - Node *arg = (Node *) lfirst(lc4); - - if (paramid == param->paramid) - { - /* Found a match, so return it */ - *dpns_p = dpns; - *ancestor_cell_p = lc; - return arg; - } - } - - /* Keep looking, but we are emerging from a subplan. */ - in_same_plan_level = false; - break; - } - - /* - * Likewise check to see if we're emerging from an initplan. - * Initplans never have any parParams, so no need to search that - * list, but we need to know if we should reset - * in_same_plan_level. - */ - foreach(lc2, ps->initPlan) - { - SubPlanState *sstate = (SubPlanState *) lfirst(lc2); - - if (child_ps != sstate->planstate) - continue; - - /* No parameters to be had here. */ - Assert(sstate->subplan->parParam == NIL); - - /* Keep looking, but we are emerging from an initplan. */ - in_same_plan_level = false; - break; - } - - /* No luck, crawl up to next ancestor */ - child_ps = ps; - } - } - - /* No referent found */ - return NULL; -} - -/* - * Display a Param appropriately. - */ -static void -get_parameter(Param *param, deparse_context *context) -{// #lizard forgives - Node *expr; - deparse_namespace *dpns; - ListCell *ancestor_cell; - - /* - * If it's a PARAM_EXEC parameter, try to locate the expression from which - * the parameter was computed. Note that failing to find a referent isn't - * an error, since the Param might well be a subplan output rather than an - * input. - */ - expr = find_param_referent(param, context, &dpns, &ancestor_cell); - if (expr) - { - /* Found a match, so print it */ - deparse_namespace save_dpns; - bool save_varprefix; - bool need_paren; - - /* Switch attention to the ancestor plan node */ - push_ancestor_plan(dpns, ancestor_cell, &save_dpns); - - /* - * Force prefixing of Vars, since they won't belong to the relation - * being scanned in the original plan node. - */ - save_varprefix = context->varprefix; - context->varprefix = true; - - /* - * A Param's expansion is typically a Var, Aggref, or upper-level - * Param, which wouldn't need extra parentheses. Otherwise, insert - * parens to ensure the expression looks atomic. - */ - need_paren = !(IsA(expr, Var) || - IsA(expr, Aggref) || - IsA(expr, Param)); - if (need_paren) - appendStringInfoChar(context->buf, '('); - - get_rule_expr(expr, context, false); - - if (need_paren) - appendStringInfoChar(context->buf, ')'); - - context->varprefix = save_varprefix; - - pop_ancestor_plan(dpns, &save_dpns); - - return; - } - - /* - * Not PARAM_EXEC, or couldn't find referent: just print $N. 
- */ - appendStringInfo(context->buf, "$%d", param->paramid); - -#ifdef __TBASE__ - /* param need explicit cast */ - if (param->explicit_cast) - { - appendStringInfo(context->buf, "::%s", - format_type_with_typemod(param->paramtype, param->paramtypmod)); - } -#endif -} - -/* - * get_simple_binary_op_name - * - * helper function for isSimpleNode - * will return single char binary operator name, or NULL if it's not - */ -static const char * -get_simple_binary_op_name(OpExpr *expr) -{ - List *args = expr->args; - - if (list_length(args) == 2) - { - /* binary operator */ - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - const char *op; - - op = generate_operator_name(expr->opno, exprType(arg1), exprType(arg2)); - if (strlen(op) == 1) - return op; - } - return NULL; -} - - -/* - * isSimpleNode - check if given node is simple (doesn't need parenthesizing) - * - * true : simple in the context of parent node's type - * false : not simple - */ -static bool -isSimpleNode(Node *node, Node *parentNode, int prettyFlags) -{// #lizard forgives - if (!node) - return false; - - switch (nodeTag(node)) - { - case T_Var: - case T_Const: - case T_Param: - case T_CoerceToDomainValue: - case T_SetToDefault: - case T_CurrentOfExpr: - /* single words: always simple */ - return true; - - case T_ArrayRef: - case T_ArrayExpr: - case T_RowExpr: - case T_CoalesceExpr: - case T_MinMaxExpr: - case T_SQLValueFunction: - case T_XmlExpr: - case T_NextValueExpr: - case T_NullIfExpr: - case T_Aggref: - case T_WindowFunc: - case T_FuncExpr: - /* function-like: name(..) or name[..] */ - return true; - - /* CASE keywords act as parentheses */ - case T_CaseExpr: - return true; - - case T_FieldSelect: - - /* - * appears simple since . has top precedence, unless parent is - * T_FieldSelect itself! - */ - return (IsA(parentNode, FieldSelect) ? false : true); - - case T_FieldStore: - - /* - * treat like FieldSelect (probably doesn't matter) - */ - return (IsA(parentNode, FieldStore) ? 
false : true); - - case T_CoerceToDomain: - /* maybe simple, check args */ - return isSimpleNode((Node *) ((CoerceToDomain *) node)->arg, - node, prettyFlags); - case T_RelabelType: - return isSimpleNode((Node *) ((RelabelType *) node)->arg, - node, prettyFlags); - case T_CoerceViaIO: - return isSimpleNode((Node *) ((CoerceViaIO *) node)->arg, - node, prettyFlags); - case T_ArrayCoerceExpr: - return isSimpleNode((Node *) ((ArrayCoerceExpr *) node)->arg, - node, prettyFlags); - case T_ConvertRowtypeExpr: - return isSimpleNode((Node *) ((ConvertRowtypeExpr *) node)->arg, - node, prettyFlags); - - case T_OpExpr: - { - /* depends on parent node type; needs further checking */ - if (prettyFlags & PRETTYFLAG_PAREN && IsA(parentNode, OpExpr)) - { - const char *op; - const char *parentOp; - bool is_lopriop; - bool is_hipriop; - bool is_lopriparent; - bool is_hipriparent; - - op = get_simple_binary_op_name((OpExpr *) node); - if (!op) - return false; - - /* We know only the basic operators + - and * / % */ - is_lopriop = (strchr("+-", *op) != NULL); - is_hipriop = (strchr("*/%", *op) != NULL); - if (!(is_lopriop || is_hipriop)) - return false; - - parentOp = get_simple_binary_op_name((OpExpr *) parentNode); - if (!parentOp) - return false; - - is_lopriparent = (strchr("+-", *parentOp) != NULL); - is_hipriparent = (strchr("*/%", *parentOp) != NULL); - if (!(is_lopriparent || is_hipriparent)) - return false; - - if (is_hipriop && is_lopriparent) - return true; /* op binds tighter than parent */ - - if (is_lopriop && is_hipriparent) - return false; - - /* - * Operators are same priority --- can skip parens only if - * we have (a - b) - c, not a - (b - c). - */ - if (node == (Node *) linitial(((OpExpr *) parentNode)->args)) - return true; - - return false; - } - /* else do the same stuff as for T_SubLink et al. 
*/ - /* FALL THROUGH */ - } - - case T_SubLink: - case T_NullTest: - case T_BooleanTest: - case T_DistinctExpr: - switch (nodeTag(parentNode)) - { - case T_FuncExpr: - { - /* special handling for casts */ - CoercionForm type = ((FuncExpr *) parentNode)->funcformat; - - if (type == COERCE_EXPLICIT_CAST || - type == COERCE_IMPLICIT_CAST) - return false; - return true; /* own parentheses */ - } - case T_BoolExpr: /* lower precedence */ - case T_ArrayRef: /* other separators */ - case T_ArrayExpr: /* other separators */ - case T_RowExpr: /* other separators */ - case T_CoalesceExpr: /* own parentheses */ - case T_MinMaxExpr: /* own parentheses */ - case T_XmlExpr: /* own parentheses */ - case T_NullIfExpr: /* other separators */ - case T_Aggref: /* own parentheses */ - case T_WindowFunc: /* own parentheses */ - case T_CaseExpr: /* other separators */ - return true; - default: - return false; - } - - case T_BoolExpr: - switch (nodeTag(parentNode)) - { - case T_BoolExpr: - if (prettyFlags & PRETTYFLAG_PAREN) - { - BoolExprType type; - BoolExprType parentType; - - type = ((BoolExpr *) node)->boolop; - parentType = ((BoolExpr *) parentNode)->boolop; - switch (type) - { - case NOT_EXPR: - case AND_EXPR: - if (parentType == AND_EXPR || parentType == OR_EXPR) - return true; - break; - case OR_EXPR: - if (parentType == OR_EXPR) - return true; - break; - } - } - return false; - case T_FuncExpr: - { - /* special handling for casts */ - CoercionForm type = ((FuncExpr *) parentNode)->funcformat; - - if (type == COERCE_EXPLICIT_CAST || - type == COERCE_IMPLICIT_CAST) - return false; - return true; /* own parentheses */ - } - case T_ArrayRef: /* other separators */ - case T_ArrayExpr: /* other separators */ - case T_RowExpr: /* other separators */ - case T_CoalesceExpr: /* own parentheses */ - case T_MinMaxExpr: /* own parentheses */ - case T_XmlExpr: /* own parentheses */ - case T_NullIfExpr: /* other separators */ - case T_Aggref: /* own parentheses */ - case T_WindowFunc: /* own parentheses */ - case T_CaseExpr: /* other separators */ - return true; - default: - return false; - } - - default: - break; - } - /* those we don't know: in dubio complexo */ - return false; -} - - -/* - * appendContextKeyword - append a keyword to buffer - * - * If prettyPrint is enabled, perform a line break, and adjust indentation. - * Otherwise, just append the keyword. - */ -static void -appendContextKeyword(deparse_context *context, const char *str, - int indentBefore, int indentAfter, int indentPlus) -{ - StringInfo buf = context->buf; - - if (PRETTY_INDENT(context)) - { - int indentAmount; - - context->indentLevel += indentBefore; - - /* remove any trailing spaces currently in the buffer ... */ - removeStringInfoSpaces(buf); - /* ... then add a newline and some spaces */ - appendStringInfoChar(buf, '\n'); - - if (context->indentLevel < PRETTYINDENT_LIMIT) - indentAmount = Max(context->indentLevel, 0) + indentPlus; - else - { - /* - * If we're indented more than PRETTYINDENT_LIMIT characters, try - * to conserve horizontal space by reducing the per-level - * indentation. For best results the scale factor here should - * divide all the indent amounts that get added to indentLevel - * (PRETTYINDENT_STD, etc). It's important that the indentation - * not grow unboundedly, else deeply-nested trees use O(N^2) - * whitespace; so we also wrap modulo PRETTYINDENT_LIMIT. 
- */
-            indentAmount = PRETTYINDENT_LIMIT +
-                (context->indentLevel - PRETTYINDENT_LIMIT) /
-                (PRETTYINDENT_STD / 2);
-            indentAmount %= PRETTYINDENT_LIMIT;
-            /* scale/wrap logic affects indentLevel, but not indentPlus */
-            indentAmount += indentPlus;
-        }
-        appendStringInfoSpaces(buf, indentAmount);
-
-        appendStringInfoString(buf, str);
-
-        context->indentLevel += indentAfter;
-        if (context->indentLevel < 0)
-            context->indentLevel = 0;
-    }
-    else
-        appendStringInfoString(buf, str);
-}
-
-/*
- * removeStringInfoSpaces - delete trailing spaces from a buffer.
- *
- * Possibly this should move to stringinfo.c at some point.
- */
-static void
-removeStringInfoSpaces(StringInfo str)
-{
-    while (str->len > 0 && str->data[str->len - 1] == ' ')
-        str->data[--(str->len)] = '\0';
-}
-
-
-/*
- * get_rule_expr_paren - deparse expr using get_rule_expr,
- *      embracing the string with parentheses if necessary for prettyPrint.
- *
- * Never embrace if prettyFlags=0, because it's done in the calling node.
- *
- * Any node that does *not* embrace its argument node by sql syntax (with
- * parentheses, non-operator keywords like CASE/WHEN/ON, or comma etc) should
- * use get_rule_expr_paren instead of get_rule_expr so parentheses can be
- * added.
- */
-static void
-get_rule_expr_paren(Node *node, deparse_context *context,
-                    bool showimplicit, Node *parentNode)
-{
-    bool need_paren;
-
-    need_paren = PRETTY_PAREN(context) &&
-        !isSimpleNode(node, parentNode, context->prettyFlags);
-
-    if (need_paren)
-        appendStringInfoChar(context->buf, '(');
-
-    get_rule_expr(node, context, showimplicit);
-
-    if (need_paren)
-        appendStringInfoChar(context->buf, ')');
-}
-
-
-/* ----------
- * get_rule_expr - Parse back an expression
- *
- * Note: showimplicit determines whether we display any implicit cast that
- * is present at the top of the expression tree. It is a passed argument,
- * not a field of the context struct, because we change the value as we
- * recurse down into the expression. In general we suppress implicit casts
- * when the result type is known with certainty (eg, the arguments of an
- * OR must be boolean). We display implicit casts for arguments of functions
- * and operators, since this is needed to be certain that the same function
- * or operator will be chosen when the expression is re-parsed.
- * ----------
- */
-static void
-get_rule_expr(Node *node, deparse_context *context,
-              bool showimplicit)
-{// #lizard forgives
-    StringInfo buf = context->buf;
-
-    if (node == NULL)
-        return;
-
-    /* Guard against excessively long or deeply-nested queries */
-    CHECK_FOR_INTERRUPTS();
-    check_stack_depth();
-
-    /*
-     * Each level of get_rule_expr must emit an indivisible term
-     * (parenthesized if necessary) to ensure result is reparsed into the same
-     * expression tree. The only exception is that when the input is a List,
-     * we emit the component items comma-separated with no surrounding
-     * decoration; this is convenient for most callers.
- */ - switch (nodeTag(node)) - { - case T_Var: - (void) get_variable((Var *) node, 0, false, context); - break; - - case T_Const: - get_const_expr((Const *) node, context, 0); - break; - - case T_Param: - get_parameter((Param *) node, context); - break; - - case T_Aggref: - get_agg_expr((Aggref *) node, context, (Aggref *) node); - break; - - case T_GroupingFunc: - { - GroupingFunc *gexpr = (GroupingFunc *) node; - - appendStringInfoString(buf, "GROUPING("); - get_rule_expr((Node *) gexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_WindowFunc: - get_windowfunc_expr((WindowFunc *) node, context); - break; - - case T_ArrayRef: - { - ArrayRef *aref = (ArrayRef *) node; - bool need_parens; - - /* - * If the argument is a CaseTestExpr, we must be inside a - * FieldStore, ie, we are assigning to an element of an array - * within a composite column. Since we already punted on - * displaying the FieldStore's target information, just punt - * here too, and display only the assignment source - * expression. - */ - if (IsA(aref->refexpr, CaseTestExpr)) - { - Assert(aref->refassgnexpr); - get_rule_expr((Node *) aref->refassgnexpr, - context, showimplicit); - break; - } - - /* - * Parenthesize the argument unless it's a simple Var or a - * FieldSelect. (In particular, if it's another ArrayRef, we - * *must* parenthesize to avoid confusion.) - */ - need_parens = !IsA(aref->refexpr, Var) && - !IsA(aref->refexpr, FieldSelect); - if (need_parens) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) aref->refexpr, context, showimplicit); - if (need_parens) - appendStringInfoChar(buf, ')'); - - /* - * If there's a refassgnexpr, we want to print the node in the - * format "array[subscripts] := refassgnexpr". This is not - * legal SQL, so decompilation of INSERT or UPDATE statements - * should always use processIndirection as part of the - * statement-level syntax. We should only see this when - * EXPLAIN tries to print the targetlist of a plan resulting - * from such a statement. - */ - if (aref->refassgnexpr) - { - Node *refassgnexpr; - - /* - * Use processIndirection to print this node's subscripts - * as well as any additional field selections or - * subscripting in immediate descendants. It returns the - * RHS expr that is actually being "assigned". 
- */ - refassgnexpr = processIndirection(node, context); - appendStringInfoString(buf, " := "); - get_rule_expr(refassgnexpr, context, showimplicit); - } - else - { - /* Just an ordinary array fetch, so print subscripts */ - printSubscripts(aref, context); - } - } - break; - - case T_FuncExpr: - get_func_expr((FuncExpr *) node, context, showimplicit); - break; - - case T_NamedArgExpr: - { - NamedArgExpr *na = (NamedArgExpr *) node; - - appendStringInfo(buf, "%s => ", quote_identifier(na->name)); - get_rule_expr((Node *) na->arg, context, showimplicit); - } - break; - - case T_OpExpr: - get_oper_expr((OpExpr *) node, context); - break; - - case T_DistinctExpr: - { - DistinctExpr *expr = (DistinctExpr *) node; - List *args = expr->args; - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg1, context, true, node); - appendStringInfoString(buf, " IS DISTINCT FROM "); - get_rule_expr_paren(arg2, context, true, node); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_NullIfExpr: - { - NullIfExpr *nullifexpr = (NullIfExpr *) node; - - appendStringInfoString(buf, "NULLIF("); - get_rule_expr((Node *) nullifexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_ScalarArrayOpExpr: - { - ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node; - List *args = expr->args; - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg1, context, true, node); - appendStringInfo(buf, " %s %s (", - generate_operator_name(expr->opno, - exprType(arg1), - get_base_element_type(exprType(arg2))), - expr->useOr ? "ANY" : "ALL"); - get_rule_expr_paren(arg2, context, true, node); - - /* - * There's inherent ambiguity in "x op ANY/ALL (y)" when y is - * a bare sub-SELECT. Since we're here, the sub-SELECT must - * be meant as a scalar sub-SELECT yielding an array value to - * be used in ScalarArrayOpExpr; but the grammar will - * preferentially interpret such a construct as an ANY/ALL - * SubLink. To prevent misparsing the output that way, insert - * a dummy coercion (which will be stripped by parse analysis, - * so no inefficiency is added in dump and reload). This is - * indeed most likely what the user wrote to get the construct - * accepted in the first place. 
-                 */
-                if (IsA(arg2, SubLink) &&
-                    ((SubLink *) arg2)->subLinkType == EXPR_SUBLINK)
-                    appendStringInfo(buf, "::%s",
-                                     format_type_with_typemod(exprType(arg2),
-                                                              exprTypmod(arg2)));
-                appendStringInfoChar(buf, ')');
-                if (!PRETTY_PAREN(context))
-                    appendStringInfoChar(buf, ')');
-            }
-            break;
-
-        case T_BoolExpr:
-            {
-                BoolExpr *expr = (BoolExpr *) node;
-                Node *first_arg = linitial(expr->args);
-                ListCell *arg = lnext(list_head(expr->args));
-
-                switch (expr->boolop)
-                {
-                    case AND_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        while (arg)
-                        {
-                            appendStringInfoString(buf, " AND ");
-                            get_rule_expr_paren((Node *) lfirst(arg), context,
-                                                false, node);
-                            arg = lnext(arg);
-                        }
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    case OR_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        while (arg)
-                        {
-                            appendStringInfoString(buf, " OR ");
-                            get_rule_expr_paren((Node *) lfirst(arg), context,
-                                                false, node);
-                            arg = lnext(arg);
-                        }
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    case NOT_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        appendStringInfoString(buf, "NOT ");
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    default:
-                        elog(ERROR, "unrecognized boolop: %d",
-                             (int) expr->boolop);
-                }
-            }
-            break;
-
-        case T_SubLink:
-            get_sublink_expr((SubLink *) node, context);
-            break;
-
-        case T_SubPlan:
-            {
-                SubPlan *subplan = (SubPlan *) node;
-
-                /*
-                 * We cannot see an already-planned subplan in rule deparsing,
-                 * only while EXPLAINing a query plan. We don't try to
-                 * reconstruct the original SQL, just reference the subplan
-                 * that appears elsewhere in EXPLAIN's result.
-                 */
-                if (subplan->useHashTable)
-                    appendStringInfo(buf, "(hashed %s)", subplan->plan_name);
-                else
-                    appendStringInfo(buf, "(%s)", subplan->plan_name);
-            }
-            break;
-
-        case T_AlternativeSubPlan:
-            {
-                AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
-                ListCell *lc;
-
-                /* As above, this can only happen during EXPLAIN */
-                appendStringInfoString(buf, "(alternatives: ");
-                foreach(lc, asplan->subplans)
-                {
-                    SubPlan *splan = lfirst_node(SubPlan, lc);
-
-                    if (splan->useHashTable)
-                        appendStringInfo(buf, "hashed %s", splan->plan_name);
-                    else
-                        appendStringInfoString(buf, splan->plan_name);
-                    if (lnext(lc))
-                        appendStringInfoString(buf, " or ");
-                }
-                appendStringInfoChar(buf, ')');
-            }
-            break;
-
-        case T_FieldSelect:
-            {
-                FieldSelect *fselect = (FieldSelect *) node;
-                Node *arg = (Node *) fselect->arg;
-                int fno = fselect->fieldnum;
-                const char *fieldname;
-                bool need_parens;
-
-                /*
-                 * Parenthesize the argument unless it's an ArrayRef or
-                 * another FieldSelect. Note in particular that it would be
-                 * WRONG to not parenthesize a Var argument; simplicity is not
-                 * the issue here, having the right number of names is.
-                 */
-                need_parens = !IsA(arg, ArrayRef) &&!IsA(arg, FieldSelect);
-                if (need_parens)
-                    appendStringInfoChar(buf, '(');
-                get_rule_expr(arg, context, true);
-                if (need_parens)
-                    appendStringInfoChar(buf, ')');
-
-                /*
-                 * Get and print the field name.
- */ - fieldname = get_name_for_var_field((Var *) arg, fno, - 0, context); - appendStringInfo(buf, ".%s", quote_identifier(fieldname)); - } - break; - - case T_FieldStore: - { - FieldStore *fstore = (FieldStore *) node; - bool need_parens; - - /* - * There is no good way to represent a FieldStore as real SQL, - * so decompilation of INSERT or UPDATE statements should - * always use processIndirection as part of the - * statement-level syntax. We should only get here when - * EXPLAIN tries to print the targetlist of a plan resulting - * from such a statement. The plan case is even harder than - * ordinary rules would be, because the planner tries to - * collapse multiple assignments to the same field or subfield - * into one FieldStore; so we can see a list of target fields - * not just one, and the arguments could be FieldStores - * themselves. We don't bother to try to print the target - * field names; we just print the source arguments, with a - * ROW() around them if there's more than one. This isn't - * terribly complete, but it's probably good enough for - * EXPLAIN's purposes; especially since anything more would be - * either hopelessly confusing or an even poorer - * representation of what the plan is actually doing. - */ - need_parens = (list_length(fstore->newvals) != 1); - if (need_parens) - appendStringInfoString(buf, "ROW("); - get_rule_expr((Node *) fstore->newvals, context, showimplicit); - if (need_parens) - appendStringInfoChar(buf, ')'); - } - break; - - case T_RelabelType: - { - RelabelType *relabel = (RelabelType *) node; - Node *arg = (Node *) relabel->arg; - - if (relabel->relabelformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - relabel->resulttype, - relabel->resulttypmod, - node); - } - } - break; - - case T_CoerceViaIO: - { - CoerceViaIO *iocoerce = (CoerceViaIO *) node; - Node *arg = (Node *) iocoerce->arg; - - if (iocoerce->coerceformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - iocoerce->resulttype, - -1, - node); - } - } - break; - - case T_ArrayCoerceExpr: - { - ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; - Node *arg = (Node *) acoerce->arg; - - if (acoerce->coerceformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - acoerce->resulttype, - acoerce->resulttypmod, - node); - } - } - break; - - case T_ConvertRowtypeExpr: - { - ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; - Node *arg = (Node *) convert->arg; - - if (convert->convertformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - convert->resulttype, -1, - node); - } - } - break; - - case T_CollateExpr: - { - CollateExpr *collate = (CollateExpr *) node; - Node *arg = (Node *) collate->arg; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg, context, showimplicit, node); - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(collate->collOid)); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_CaseExpr: - { - CaseExpr *caseexpr = (CaseExpr *) 
node; - ListCell *temp; - - appendContextKeyword(context, "CASE", - 0, PRETTYINDENT_VAR, 0); - if (caseexpr->arg) - { - appendStringInfoChar(buf, ' '); - get_rule_expr((Node *) caseexpr->arg, context, true); - } - foreach(temp, caseexpr->args) - { - CaseWhen *when = (CaseWhen *) lfirst(temp); - Node *w = (Node *) when->expr; - - if (caseexpr->arg) - { - /* - * The parser should have produced WHEN clauses of the - * form "CaseTestExpr = RHS", possibly with an - * implicit coercion inserted above the CaseTestExpr. - * For accurate decompilation of rules it's essential - * that we show just the RHS. However in an - * expression that's been through the optimizer, the - * WHEN clause could be almost anything (since the - * equality operator could have been expanded into an - * inline function). If we don't recognize the form - * of the WHEN clause, just punt and display it as-is. - */ - if (IsA(w, OpExpr)) - { - List *args = ((OpExpr *) w)->args; - - if (list_length(args) == 2 && - IsA(strip_implicit_coercions(linitial(args)), - CaseTestExpr)) - w = (Node *) lsecond(args); - } - } - - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "WHEN ", - 0, 0, 0); - get_rule_expr(w, context, false); - appendStringInfoString(buf, " THEN "); - get_rule_expr((Node *) when->result, context, true); - } - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "ELSE ", - 0, 0, 0); - get_rule_expr((Node *) caseexpr->defresult, context, true); - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "END", - -PRETTYINDENT_VAR, 0, 0); - } - break; - - case T_CaseTestExpr: - { - /* - * Normally we should never get here, since for expressions - * that can contain this node type we attempt to avoid - * recursing to it. But in an optimized expression we might - * be unable to avoid that (see comments for CaseExpr). If we - * do see one, print it as CASE_TEST_EXPR. - */ - appendStringInfoString(buf, "CASE_TEST_EXPR"); - } - break; - - case T_ArrayExpr: - { - ArrayExpr *arrayexpr = (ArrayExpr *) node; - - appendStringInfoString(buf, "ARRAY["); - get_rule_expr((Node *) arrayexpr->elements, context, true); - appendStringInfoChar(buf, ']'); - - /* - * If the array isn't empty, we assume its elements are - * coerced to the desired type. If it's empty, though, we - * need an explicit coercion to the array type. - */ - if (arrayexpr->elements == NIL) - appendStringInfo(buf, "::%s", - format_type_with_typemod(arrayexpr->array_typeid, -1)); - } - break; - - case T_RowExpr: - { - RowExpr *rowexpr = (RowExpr *) node; - TupleDesc tupdesc = NULL; - ListCell *arg; - int i; - char *sep; - - /* - * If it's a named type and not RECORD, we may have to skip - * dropped columns and/or claim there are NULLs for added - * columns. - */ - if (rowexpr->row_typeid != RECORDOID) - { - tupdesc = lookup_rowtype_tupdesc(rowexpr->row_typeid, -1); - Assert(list_length(rowexpr->args) <= tupdesc->natts); - } - - /* - * SQL99 allows "ROW" to be omitted when there is more than - * one column, but for simplicity we always print it. 
- */ - appendStringInfoString(buf, "ROW("); - sep = ""; - i = 0; - foreach(arg, rowexpr->args) - { - Node *e = (Node *) lfirst(arg); - - if (tupdesc == NULL || - !tupdesc->attrs[i]->attisdropped) - { - appendStringInfoString(buf, sep); - /* Whole-row Vars need special treatment here */ - get_rule_expr_toplevel(e, context, true); - sep = ", "; - } - i++; - } - if (tupdesc != NULL) - { - while (i < tupdesc->natts) - { - if (!tupdesc->attrs[i]->attisdropped) - { - appendStringInfoString(buf, sep); - appendStringInfoString(buf, "NULL"); - sep = ", "; - } - i++; - } - - ReleaseTupleDesc(tupdesc); - } - appendStringInfoChar(buf, ')'); - if (rowexpr->row_format == COERCE_EXPLICIT_CAST) - appendStringInfo(buf, "::%s", - format_type_with_typemod(rowexpr->row_typeid, -1)); - } - break; - - case T_RowCompareExpr: - { - RowCompareExpr *rcexpr = (RowCompareExpr *) node; - ListCell *arg; - char *sep; - - /* - * SQL99 allows "ROW" to be omitted when there is more than - * one column, but for simplicity we always print it. - */ - appendStringInfoString(buf, "(ROW("); - sep = ""; - foreach(arg, rcexpr->largs) - { - Node *e = (Node *) lfirst(arg); - - appendStringInfoString(buf, sep); - get_rule_expr(e, context, true); - sep = ", "; - } - - /* - * We assume that the name of the first-column operator will - * do for all the rest too. This is definitely open to - * failure, eg if some but not all operators were renamed - * since the construct was parsed, but there seems no way to - * be perfect. - */ - appendStringInfo(buf, ") %s ROW(", - generate_operator_name(linitial_oid(rcexpr->opnos), - exprType(linitial(rcexpr->largs)), - exprType(linitial(rcexpr->rargs)))); - sep = ""; - foreach(arg, rcexpr->rargs) - { - Node *e = (Node *) lfirst(arg); - - appendStringInfoString(buf, sep); - get_rule_expr(e, context, true); - sep = ", "; - } - appendStringInfoString(buf, "))"); - } - break; - - case T_CoalesceExpr: - { - CoalesceExpr *coalesceexpr = (CoalesceExpr *) node; - - appendStringInfoString(buf, "COALESCE("); - get_rule_expr((Node *) coalesceexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_MinMaxExpr: - { - MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; - - switch (minmaxexpr->op) - { - case IS_GREATEST: - appendStringInfoString(buf, "GREATEST("); - break; - case IS_LEAST: - appendStringInfoString(buf, "LEAST("); - break; - } - get_rule_expr((Node *) minmaxexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_SQLValueFunction: - { - SQLValueFunction *svf = (SQLValueFunction *) node; - - /* - * Note: this code knows that typmod for time, timestamp, and - * timestamptz just prints as integer. 
- */ - switch (svf->op) - { - case SVFOP_CURRENT_DATE: - appendStringInfoString(buf, "CURRENT_DATE"); - break; - case SVFOP_CURRENT_TIME: - appendStringInfoString(buf, "CURRENT_TIME"); - break; - case SVFOP_CURRENT_TIME_N: - appendStringInfo(buf, "CURRENT_TIME(%d)", svf->typmod); - break; - case SVFOP_CURRENT_TIMESTAMP: - appendStringInfoString(buf, "CURRENT_TIMESTAMP"); - break; - case SVFOP_CURRENT_TIMESTAMP_N: - appendStringInfo(buf, "CURRENT_TIMESTAMP(%d)", - svf->typmod); - break; - case SVFOP_LOCALTIME: - appendStringInfoString(buf, "LOCALTIME"); - break; - case SVFOP_LOCALTIME_N: - appendStringInfo(buf, "LOCALTIME(%d)", svf->typmod); - break; - case SVFOP_LOCALTIMESTAMP: - appendStringInfoString(buf, "LOCALTIMESTAMP"); - break; - case SVFOP_LOCALTIMESTAMP_N: - appendStringInfo(buf, "LOCALTIMESTAMP(%d)", - svf->typmod); - break; - case SVFOP_CURRENT_ROLE: - appendStringInfoString(buf, "CURRENT_ROLE"); - break; - case SVFOP_CURRENT_USER: - appendStringInfoString(buf, "CURRENT_USER"); - break; - case SVFOP_USER: - appendStringInfoString(buf, "USER"); - break; - case SVFOP_SESSION_USER: - appendStringInfoString(buf, "SESSION_USER"); - break; - case SVFOP_CURRENT_CATALOG: - appendStringInfoString(buf, "CURRENT_CATALOG"); - break; - case SVFOP_CURRENT_SCHEMA: - appendStringInfoString(buf, "CURRENT_SCHEMA"); - break; - } - } - break; - - case T_NextValueExpr: - { - /* - * This gets invoked by Fast Query Shipping code to deparse a - * query. It seems enough to just generate a "DEFAULT" clause - * and let the remote datanode handle finding the correct - * sequence for replica identity. - * - * XXX PG10MERGE: If we do see issues with this, it might be - * worthwhile to consider generating an expression such as, - * nextval('sequence_name'::regclass) - */ - appendStringInfoString(buf, "DEFAULT"); - } - break; - - case T_XmlExpr: - { - XmlExpr *xexpr = (XmlExpr *) node; - bool needcomma = false; - ListCell *arg; - ListCell *narg; - Const *con; - - switch (xexpr->op) - { - case IS_XMLCONCAT: - appendStringInfoString(buf, "XMLCONCAT("); - break; - case IS_XMLELEMENT: - appendStringInfoString(buf, "XMLELEMENT("); - break; - case IS_XMLFOREST: - appendStringInfoString(buf, "XMLFOREST("); - break; - case IS_XMLPARSE: - appendStringInfoString(buf, "XMLPARSE("); - break; - case IS_XMLPI: - appendStringInfoString(buf, "XMLPI("); - break; - case IS_XMLROOT: - appendStringInfoString(buf, "XMLROOT("); - break; - case IS_XMLSERIALIZE: - appendStringInfoString(buf, "XMLSERIALIZE("); - break; - case IS_DOCUMENT: - break; - } - if (xexpr->op == IS_XMLPARSE || xexpr->op == IS_XMLSERIALIZE) - { - if (xexpr->xmloption == XMLOPTION_DOCUMENT) - appendStringInfoString(buf, "DOCUMENT "); - else - appendStringInfoString(buf, "CONTENT "); - } - if (xexpr->name) - { - appendStringInfo(buf, "NAME %s", - quote_identifier(map_xml_name_to_sql_identifier(xexpr->name))); - needcomma = true; - } - if (xexpr->named_args) - { - if (xexpr->op != IS_XMLFOREST) - { - if (needcomma) - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, "XMLATTRIBUTES("); - needcomma = false; - } - forboth(arg, xexpr->named_args, narg, xexpr->arg_names) - { - Node *e = (Node *) lfirst(arg); - char *argname = strVal(lfirst(narg)); - - if (needcomma) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) e, context, true); - appendStringInfo(buf, " AS %s", - quote_identifier(map_xml_name_to_sql_identifier(argname))); - needcomma = true; - } - if (xexpr->op != IS_XMLFOREST) - appendStringInfoChar(buf, ')'); - } - if (xexpr->args) - 
{ - if (needcomma) - appendStringInfoString(buf, ", "); - switch (xexpr->op) - { - case IS_XMLCONCAT: - case IS_XMLELEMENT: - case IS_XMLFOREST: - case IS_XMLPI: - case IS_XMLSERIALIZE: - /* no extra decoration needed */ - get_rule_expr((Node *) xexpr->args, context, true); - break; - case IS_XMLPARSE: - Assert(list_length(xexpr->args) == 2); - - get_rule_expr((Node *) linitial(xexpr->args), - context, true); - - con = lsecond_node(Const, xexpr->args); - Assert(!con->constisnull); - if (DatumGetBool(con->constvalue)) - appendStringInfoString(buf, - " PRESERVE WHITESPACE"); - else - appendStringInfoString(buf, - " STRIP WHITESPACE"); - break; - case IS_XMLROOT: - Assert(list_length(xexpr->args) == 3); - - get_rule_expr((Node *) linitial(xexpr->args), - context, true); - - appendStringInfoString(buf, ", VERSION "); - con = (Const *) lsecond(xexpr->args); - if (IsA(con, Const) && - con->constisnull) - appendStringInfoString(buf, "NO VALUE"); - else - get_rule_expr((Node *) con, context, false); - - con = lthird_node(Const, xexpr->args); - if (con->constisnull) - /* suppress STANDALONE NO VALUE */ ; - else - { - switch (DatumGetInt32(con->constvalue)) - { - case XML_STANDALONE_YES: - appendStringInfoString(buf, - ", STANDALONE YES"); - break; - case XML_STANDALONE_NO: - appendStringInfoString(buf, - ", STANDALONE NO"); - break; - case XML_STANDALONE_NO_VALUE: - appendStringInfoString(buf, - ", STANDALONE NO VALUE"); - break; - default: - break; - } - } - break; - case IS_DOCUMENT: - get_rule_expr_paren((Node *) xexpr->args, context, false, node); - break; - } - - } - if (xexpr->op == IS_XMLSERIALIZE) - appendStringInfo(buf, " AS %s", - format_type_with_typemod(xexpr->type, - xexpr->typmod)); - if (xexpr->op == IS_DOCUMENT) - appendStringInfoString(buf, " IS DOCUMENT"); - else - appendStringInfoChar(buf, ')'); - } - break; - - case T_NullTest: - { - NullTest *ntest = (NullTest *) node; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren((Node *) ntest->arg, context, true, node); - - /* - * For scalar inputs, we prefer to print as IS [NOT] NULL, - * which is shorter and traditional. If it's a rowtype input - * but we're applying a scalar test, must print IS [NOT] - * DISTINCT FROM NULL to be semantically correct. 
- */ - if (ntest->argisrow || - !type_is_rowtype(exprType((Node *) ntest->arg))) - { - switch (ntest->nulltesttype) - { - case IS_NULL: - appendStringInfoString(buf, " IS NULL"); - break; - case IS_NOT_NULL: - appendStringInfoString(buf, " IS NOT NULL"); - break; - default: - elog(ERROR, "unrecognized nulltesttype: %d", - (int) ntest->nulltesttype); - } - } - else - { - switch (ntest->nulltesttype) - { - case IS_NULL: - appendStringInfoString(buf, " IS NOT DISTINCT FROM NULL"); - break; - case IS_NOT_NULL: - appendStringInfoString(buf, " IS DISTINCT FROM NULL"); - break; - default: - elog(ERROR, "unrecognized nulltesttype: %d", - (int) ntest->nulltesttype); - } - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_BooleanTest: - { - BooleanTest *btest = (BooleanTest *) node; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren((Node *) btest->arg, context, false, node); - switch (btest->booltesttype) - { - case IS_TRUE: - appendStringInfoString(buf, " IS TRUE"); - break; - case IS_NOT_TRUE: - appendStringInfoString(buf, " IS NOT TRUE"); - break; - case IS_FALSE: - appendStringInfoString(buf, " IS FALSE"); - break; - case IS_NOT_FALSE: - appendStringInfoString(buf, " IS NOT FALSE"); - break; - case IS_UNKNOWN: - appendStringInfoString(buf, " IS UNKNOWN"); - break; - case IS_NOT_UNKNOWN: - appendStringInfoString(buf, " IS NOT UNKNOWN"); - break; - default: - elog(ERROR, "unrecognized booltesttype: %d", - (int) btest->booltesttype); - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_CoerceToDomain: - { - CoerceToDomain *ctest = (CoerceToDomain *) node; - Node *arg = (Node *) ctest->arg; - - if (ctest->coercionformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr(arg, context, false); - } - else - { - get_coercion_expr(arg, context, - ctest->resulttype, - ctest->resulttypmod, - node); - } - } - break; - - case T_CoerceToDomainValue: - appendStringInfoString(buf, "VALUE"); - break; - - case T_SetToDefault: - appendStringInfoString(buf, "DEFAULT"); - break; - - case T_CurrentOfExpr: - { - CurrentOfExpr *cexpr = (CurrentOfExpr *) node; - - if (cexpr->cursor_name) - appendStringInfo(buf, "CURRENT OF %s", - quote_identifier(cexpr->cursor_name)); - else - appendStringInfo(buf, "CURRENT OF $%d", - cexpr->cursor_param); - } - break; - - case T_InferenceElem: - { - InferenceElem *iexpr = (InferenceElem *) node; - bool save_varprefix; - bool need_parens; - - /* - * InferenceElem can only refer to target relation, so a - * prefix is not useful, and indeed would cause parse errors. - */ - save_varprefix = context->varprefix; - context->varprefix = false; - - /* - * Parenthesize the element unless it's a simple Var or a bare - * function call. Follows pg_get_indexdef_worker(). 
- */ - need_parens = !IsA(iexpr->expr, Var); - if (IsA(iexpr->expr, FuncExpr) && - ((FuncExpr *) iexpr->expr)->funcformat == - COERCE_EXPLICIT_CALL) - need_parens = false; - - if (need_parens) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) iexpr->expr, - context, false); - if (need_parens) - appendStringInfoChar(buf, ')'); - - context->varprefix = save_varprefix; - - if (iexpr->infercollid) - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(iexpr->infercollid)); - - /* Add the operator class name, if not default */ - if (iexpr->inferopclass) - { - Oid inferopclass = iexpr->inferopclass; - Oid inferopcinputtype = get_opclass_input_type(iexpr->inferopclass); - - get_opclass_name(inferopclass, inferopcinputtype, buf); - } - } - break; - - case T_PartitionBoundSpec: - { - PartitionBoundSpec *spec = (PartitionBoundSpec *) node; - ListCell *cell; - char *sep; - - switch (spec->strategy) - { - case PARTITION_STRATEGY_LIST: - Assert(spec->listdatums != NIL); - - appendStringInfoString(buf, "FOR VALUES IN ("); - sep = ""; - foreach(cell, spec->listdatums) - { - Const *val = castNode(Const, lfirst(cell)); - - appendStringInfoString(buf, sep); - get_const_expr(val, context, -1); - sep = ", "; - } - - appendStringInfoString(buf, ")"); - break; - - case PARTITION_STRATEGY_RANGE: - Assert(spec->lowerdatums != NIL && - spec->upperdatums != NIL && - list_length(spec->lowerdatums) == - list_length(spec->upperdatums)); - - appendStringInfo(buf, "FOR VALUES FROM %s TO %s", - get_range_partbound_string(spec->lowerdatums), - get_range_partbound_string(spec->upperdatums)); - break; - - default: - elog(ERROR, "unrecognized partition strategy: %d", - (int) spec->strategy); - break; - } - } - break; - - case T_List: - { - char *sep; - ListCell *l; - - sep = ""; - foreach(l, (List *) node) - { - appendStringInfoString(buf, sep); - get_rule_expr((Node *) lfirst(l), context, showimplicit); - sep = ", "; - } - } - break; - - case T_TableFunc: - get_tablefunc((TableFunc *) node, context, showimplicit); - break; - - default: - elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); - break; - } -} - -/* - * get_rule_expr_toplevel - Parse back a toplevel expression - * - * Same as get_rule_expr(), except that if the expr is just a Var, we pass - * istoplevel = true not false to get_variable(). This causes whole-row Vars - * to get printed with decoration that will prevent expansion of "*". - * We need to use this in contexts such as ROW() and VALUES(), where the - * parser would expand "foo.*" appearing at top level. (In principle we'd - * use this in get_target_list() too, but that has additional worries about - * whether to print AS, so it needs to invoke get_variable() directly anyway.) - */ -static void -get_rule_expr_toplevel(Node *node, deparse_context *context, - bool showimplicit) -{ - if (node && IsA(node, Var)) - (void) get_variable((Var *) node, 0, true, context); - else - get_rule_expr(node, context, showimplicit); -} - -/* - * get_rule_expr_funccall - Parse back a function-call expression - * - * Same as get_rule_expr(), except that we guarantee that the output will - * look like a function call, or like one of the things the grammar treats as - * equivalent to a function call (see the func_expr_windowless production). - * This is needed in places where the grammar uses func_expr_windowless and - * you can't substitute a parenthesized a_expr. 
If what we have isn't going - * to look like a function call, wrap it in a dummy CAST() expression, which - * will satisfy the grammar --- and, indeed, is likely what the user wrote to - * produce such a thing. - */ -static void -get_rule_expr_funccall(Node *node, deparse_context *context, - bool showimplicit) -{ - if (looks_like_function(node)) - get_rule_expr(node, context, showimplicit); - else - { - StringInfo buf = context->buf; - - appendStringInfoString(buf, "CAST("); - /* no point in showing any top-level implicit cast */ - get_rule_expr(node, context, false); - appendStringInfo(buf, " AS %s)", - format_type_with_typemod(exprType(node), - exprTypmod(node))); - } -} - -/* - * Helper function to identify node types that satisfy func_expr_windowless. - * If in doubt, "false" is always a safe answer. - */ -static bool -looks_like_function(Node *node) -{// #lizard forgives - if (node == NULL) - return false; /* probably shouldn't happen */ - switch (nodeTag(node)) - { - case T_FuncExpr: - /* OK, unless it's going to deparse as a cast */ - return (((FuncExpr *) node)->funcformat == COERCE_EXPLICIT_CALL); - case T_NullIfExpr: - case T_CoalesceExpr: - case T_MinMaxExpr: - case T_SQLValueFunction: - case T_XmlExpr: - /* these are all accepted by func_expr_common_subexpr */ - return true; - default: - break; - } - return false; -} - - -/* - * get_oper_expr - Parse back an OpExpr node - */ -static void -get_oper_expr(OpExpr *expr, deparse_context *context) -{ - StringInfo buf = context->buf; - Oid opno = expr->opno; - List *args = expr->args; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - if (list_length(args) == 2) - { - /* binary operator */ - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - get_rule_expr_paren(arg1, context, true, (Node *) expr); - appendStringInfo(buf, " %s ", - generate_operator_name(opno, - exprType(arg1), - exprType(arg2))); - get_rule_expr_paren(arg2, context, true, (Node *) expr); - } - else - { - /* unary operator --- but which side? */ - Node *arg = (Node *) linitial(args); - HeapTuple tp; - Form_pg_operator optup; - - tp = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for operator %u", opno); - optup = (Form_pg_operator) GETSTRUCT(tp); - switch (optup->oprkind) - { - case 'l': - appendStringInfo(buf, "%s ", - generate_operator_name(opno, - InvalidOid, - exprType(arg))); - get_rule_expr_paren(arg, context, true, (Node *) expr); - break; - case 'r': - get_rule_expr_paren(arg, context, true, (Node *) expr); - appendStringInfo(buf, " %s", - generate_operator_name(opno, - exprType(arg), - InvalidOid)); - break; - default: - elog(ERROR, "bogus oprkind: %d", optup->oprkind); - } - ReleaseSysCache(tp); - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); -} - -/* - * get_func_expr - Parse back a FuncExpr node - */ -static void -get_func_expr(FuncExpr *expr, deparse_context *context, - bool showimplicit) -{// #lizard forgives - StringInfo buf = context->buf; - Oid funcoid = expr->funcid; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; - List *argnames; - bool use_variadic; - ListCell *l; - - /* - * If the function call came from an implicit coercion, then just show the - * first argument --- unless caller wants to see implicit coercions. 
- */ - if (expr->funcformat == COERCE_IMPLICIT_CAST && !showimplicit) - { - get_rule_expr_paren((Node *) linitial(expr->args), context, - false, (Node *) expr); - return; - } - - /* - * If the function call came from a cast, then show the first argument - * plus an explicit cast operation. - */ - if (expr->funcformat == COERCE_EXPLICIT_CAST || - expr->funcformat == COERCE_IMPLICIT_CAST) - { - Node *arg = linitial(expr->args); - Oid rettype = expr->funcresulttype; - int32 coercedTypmod; - - /* Get the typmod if this is a length-coercion function */ - (void) exprIsLengthCoercion((Node *) expr, &coercedTypmod); - - get_coercion_expr(arg, context, - rettype, coercedTypmod, - (Node *) expr); - - return; - } - - /* - * Normal function: display as proname(args). First we need to extract - * the argument datatypes. - */ - if (list_length(expr->args) > FUNC_MAX_ARGS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_ARGUMENTS), - errmsg("too many arguments"))); - nargs = 0; - argnames = NIL; - foreach(l, expr->args) - { - Node *arg = (Node *) lfirst(l); - - if (IsA(arg, NamedArgExpr)) - argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); - argtypes[nargs] = exprType(arg); - nargs++; - } - - appendStringInfo(buf, "%s(", - generate_function_name(funcoid, nargs, - argnames, argtypes, - expr->funcvariadic, - &use_variadic, - context->special_exprkind)); - nargs = 0; - foreach(l, expr->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - if (use_variadic && lnext(l) == NULL) - appendStringInfoString(buf, "VARIADIC "); - get_rule_expr((Node *) lfirst(l), context, true); - } - appendStringInfoChar(buf, ')'); -} - -/* - * get_agg_expr - Parse back an Aggref node - */ -static void -get_agg_expr(Aggref *aggref, deparse_context *context, - Aggref *original_aggref) -{// #lizard forgives - StringInfo buf = context->buf; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; -#ifdef PGXC -// bool added_finalfn = false; -#endif /* PGXC */ - - bool use_variadic; - - /* - * For a combining aggregate, we look up and deparse the corresponding - * partial aggregate instead. This is necessary because our input - * argument list has been replaced; the new argument list always has just - * one element, which will point to a partial Aggref that supplies us with - * transition states to combine. - */ - if (DO_AGGSPLIT_COMBINE(aggref->aggsplit)) - { - TargetEntry *tle = linitial_node(TargetEntry, aggref->args); - - Assert(list_length(aggref->args) == 1); - resolve_special_varno((Node *) tle->expr, context, original_aggref, - get_agg_combine_expr); - return; - } - - /* - * Mark as PARTIAL, if appropriate. We look to the original aggref so as - * to avoid printing this when recursing from the code just above. - */ - if (DO_AGGSPLIT_SKIPFINAL(original_aggref->aggsplit)) - appendStringInfoString(buf, "PARTIAL "); - - /* Extract the argument types as seen by the parser */ - nargs = get_aggregate_argtypes(aggref, argtypes); - - /* Print the aggregate name, schema-qualified if needed */ - appendStringInfo(buf, "%s(%s", - generate_function_name(aggref->aggfnoid, nargs, - NIL, argtypes, - aggref->aggvariadic, - &use_variadic, - context->special_exprkind), - (aggref->aggdistinct != NIL) ? "DISTINCT " : ""); - - if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) - { - /* - * Ordered-set aggregates do not use "*" syntax. Also, we needn't - * worry about inserting VARIADIC. So we can just dump the direct - * args as-is. 
- */ - Assert(!aggref->aggvariadic); - get_rule_expr((Node *) aggref->aggdirectargs, context, true); - Assert(aggref->aggorder != NIL); - appendStringInfoString(buf, ") WITHIN GROUP (ORDER BY "); - get_rule_orderby(aggref->aggorder, aggref->args, false, context); - } - else - { - /* aggstar can be set only in zero-argument aggregates */ - if (aggref->aggstar) - appendStringInfoChar(buf, '*'); - else - { - ListCell *l; - int i; - - i = 0; - foreach(l, aggref->args) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - Node *arg = (Node *) tle->expr; - - Assert(!IsA(arg, NamedArgExpr)); - if (tle->resjunk) - continue; - if (i++ > 0) - appendStringInfoString(buf, ", "); - if (use_variadic && i == nargs) - appendStringInfoString(buf, "VARIADIC "); - get_rule_expr(arg, context, true); - } - } - - if (aggref->aggorder != NIL) - { - appendStringInfoString(buf, " ORDER BY "); - get_rule_orderby(aggref->aggorder, aggref->args, false, context); - } - } - - if (aggref->aggfilter != NULL) - { - appendStringInfoString(buf, ") FILTER (WHERE "); - get_rule_expr((Node *) aggref->aggfilter, context, false); - } - - appendStringInfoChar(buf, ')'); - -} - -/* - * This is a helper function for get_agg_expr(). It's used when we deparse - * a combining Aggref; resolve_special_varno locates the corresponding partial - * Aggref and then calls this. - */ -static void -get_agg_combine_expr(Node *node, deparse_context *context, void *private) -{ - Aggref *aggref; - Aggref *original_aggref = private; - - if (!IsA(node, Aggref)) - elog(ERROR, "combining Aggref does not point to an Aggref"); - - aggref = (Aggref *) node; - get_agg_expr(aggref, context, original_aggref); -} - -/* - * get_windowfunc_expr - Parse back a WindowFunc node - */ -static void -get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; - List *argnames; - ListCell *l; - - if (list_length(wfunc->args) > FUNC_MAX_ARGS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_ARGUMENTS), - errmsg("too many arguments"))); - nargs = 0; - argnames = NIL; - foreach(l, wfunc->args) - { - Node *arg = (Node *) lfirst(l); - - if (IsA(arg, NamedArgExpr)) - argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); - argtypes[nargs] = exprType(arg); - nargs++; - } - - appendStringInfo(buf, "%s(", - generate_function_name(wfunc->winfnoid, nargs, - argnames, argtypes, - false, NULL, - context->special_exprkind)); - /* winstar can be set only in zero-argument aggregates */ - if (wfunc->winstar) - appendStringInfoChar(buf, '*'); - else - get_rule_expr((Node *) wfunc->args, context, true); - - if (wfunc->aggfilter != NULL) - { - appendStringInfoString(buf, ") FILTER (WHERE "); - get_rule_expr((Node *) wfunc->aggfilter, context, false); - } - - appendStringInfoString(buf, ") OVER "); - - foreach(l, context->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); - - if (wc->winref == wfunc->winref) - { - if (wc->name) - appendStringInfoString(buf, quote_identifier(wc->name)); - else - get_rule_windowspec(wc, context->windowTList, context); - break; - } - } - if (l == NULL) - { - if (context->windowClause) - elog(ERROR, "could not find window clause for winref %u", - wfunc->winref); - - /* - * In EXPLAIN, we don't have window context information available, so - * we have to settle for this: - */ - appendStringInfoString(buf, "(?)"); - } -} - -/* ---------- - * get_coercion_expr - * - * Make a string representation of a value coerced to a specific type - * 
---------- - */ -static void -get_coercion_expr(Node *arg, deparse_context *context, - Oid resulttype, int32 resulttypmod, - Node *parentNode) -{ - StringInfo buf = context->buf; - - /* - * Since parse_coerce.c doesn't immediately collapse application of - * length-coercion functions to constants, what we'll typically see in - * such cases is a Const with typmod -1 and a length-coercion function - * right above it. Avoid generating redundant output. However, beware of - * suppressing casts when the user actually wrote something like - * 'foo'::text::char(3). - * - * Note: it might seem that we are missing the possibility of needing to - * print a COLLATE clause for such a Const. However, a Const could only - * have nondefault collation in a post-constant-folding tree, in which the - * length coercion would have been folded too. See also the special - * handling of CollateExpr in coerce_to_target_type(): any collation - * marking will be above the coercion node, not below it. - */ - if (arg && IsA(arg, Const) && - ((Const *) arg)->consttype == resulttype && - ((Const *) arg)->consttypmod == -1) - { - /* Show the constant without normal ::typename decoration */ - get_const_expr((Const *) arg, context, -1); - } - else - { - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg, context, false, parentNode); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - appendStringInfo(buf, "::%s", - format_type_with_typemod(resulttype, resulttypmod)); -} - -/* ---------- - * get_const_expr - * - * Make a string representation of a Const - * - * showtype can be -1 to never show "::typename" decoration, or +1 to always - * show it, or 0 to show it only if the constant wouldn't be assumed to be - * the right type by default. - * - * If the Const's collation isn't default for its type, show that too. - * We mustn't do this when showtype is -1 (since that means the caller will - * print "::typename", and we can't put a COLLATE clause in between). It's - * caller's responsibility that collation isn't missed in such cases. - * ---------- - */ -static void -get_const_expr(Const *constval, deparse_context *context, int showtype) -{// #lizard forgives - StringInfo buf = context->buf; - Oid typoutput; - bool typIsVarlena; - char *extval; - bool needlabel = false; - - if (constval->constisnull) - { - /* - * Always label the type of a NULL constant to prevent misdecisions - * about type when reparsing. - */ - appendStringInfoString(buf, "NULL"); - if (showtype >= 0) - { - appendStringInfo(buf, "::%s", - format_type_with_typemod(constval->consttype, - constval->consttypmod)); - get_const_collation(constval, context); - } - return; - } - - getTypeOutputInfo(constval->consttype, - &typoutput, &typIsVarlena); - - extval = OidOutputFunctionCall(typoutput, constval->constvalue); - - switch (constval->consttype) - { - case INT4OID: - - /* - * INT4 can be printed without any decoration, unless it is - * negative; in that case print it as '-nnn'::integer to ensure - * that the output will re-parse as a constant, not as a constant - * plus operator. In most cases we could get away with printing - * (-nnn) instead, because of the way that gram.y handles negative - * literals; but that doesn't work for INT_MIN, and it doesn't - * seem that much prettier anyway. 
- */ - if (extval[0] != '-') - appendStringInfoString(buf, extval); - else - { - appendStringInfo(buf, "'%s'", extval); - needlabel = true; /* we must attach a cast */ - } - break; - - case NUMERICOID: - - /* - * NUMERIC can be printed without quotes if it looks like a float - * constant (not an integer, and not Infinity or NaN) and doesn't - * have a leading sign (for the same reason as for INT4). - */ - if (isdigit((unsigned char) extval[0]) && - strcspn(extval, "eE.") != strlen(extval)) - { - appendStringInfoString(buf, extval); - } - else - { - appendStringInfo(buf, "'%s'", extval); - needlabel = true; /* we must attach a cast */ - } - break; - - case BITOID: - case VARBITOID: - appendStringInfo(buf, "B'%s'", extval); - break; - - case BOOLOID: - if (strcmp(extval, "t") == 0) - appendStringInfoString(buf, "true"); - else - appendStringInfoString(buf, "false"); - break; - - default: - simple_quote_literal(buf, extval); - break; - } - - pfree(extval); - - if (showtype < 0) - return; - - /* - * For showtype == 0, append ::typename unless the constant will be - * implicitly typed as the right type when it is read in. - * - * XXX this code has to be kept in sync with the behavior of the parser, - * especially make_const. - */ - switch (constval->consttype) - { - case BOOLOID: - case UNKNOWNOID: - /* These types can be left unlabeled */ - needlabel = false; - break; - case INT4OID: - /* We determined above whether a label is needed */ - break; - case NUMERICOID: - - /* - * Float-looking constants will be typed as numeric, which we - * checked above; but if there's a nondefault typmod we need to - * show it. - */ - needlabel |= (constval->consttypmod >= 0); - break; - default: - needlabel = true; - break; - } - if (needlabel || showtype > 0) - appendStringInfo(buf, "::%s", - format_type_with_typemod(constval->consttype, - constval->consttypmod)); - - get_const_collation(constval, context); -} - -/* - * helper for get_const_expr: append COLLATE if needed - */ -static void -get_const_collation(Const *constval, deparse_context *context) -{ - StringInfo buf = context->buf; - - if (OidIsValid(constval->constcollid)) - { - Oid typcollation = get_typcollation(constval->consttype); - - if (constval->constcollid != typcollation) - { - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(constval->constcollid)); - } - } -} - -/* - * simple_quote_literal - Format a string as a SQL literal, append to buf - */ -static void -simple_quote_literal(StringInfo buf, const char *val) -{ - const char *valptr; - - /* - * We form the string literal according to the prevailing setting of - * standard_conforming_strings; we never use E''. User is responsible for - * making sure result is used correctly. - */ - appendStringInfoChar(buf, '\''); - for (valptr = val; *valptr; valptr++) - { - char ch = *valptr; - - if (SQL_STR_DOUBLE(ch, !standard_conforming_strings)) - appendStringInfoChar(buf, ch); - appendStringInfoChar(buf, ch); - } - appendStringInfoChar(buf, '\''); -} - - -/* ---------- - * get_sublink_expr - Parse back a sublink - * ---------- - */ -static void -get_sublink_expr(SubLink *sublink, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - Query *query = (Query *) (sublink->subselect); - char *opname = NULL; - bool need_paren; - - if (sublink->subLinkType == ARRAY_SUBLINK) - appendStringInfoString(buf, "ARRAY("); - else - appendStringInfoChar(buf, '('); - - /* - * Note that we print the name of only the first operator, when there are - * multiple combining operators. 
This is an approximation that could go - * wrong in various scenarios (operators in different schemas, renamed - * operators, etc) but there is not a whole lot we can do about it, since - * the syntax allows only one operator to be shown. - */ - if (sublink->testexpr) - { - if (IsA(sublink->testexpr, OpExpr)) - { - /* single combining operator */ - OpExpr *opexpr = (OpExpr *) sublink->testexpr; - - get_rule_expr(linitial(opexpr->args), context, true); - opname = generate_operator_name(opexpr->opno, - exprType(linitial(opexpr->args)), - exprType(lsecond(opexpr->args))); - } - else if (IsA(sublink->testexpr, BoolExpr)) - { - /* multiple combining operators, = or <> cases */ - char *sep; - ListCell *l; - - appendStringInfoChar(buf, '('); - sep = ""; - foreach(l, ((BoolExpr *) sublink->testexpr)->args) - { - OpExpr *opexpr = lfirst_node(OpExpr, l); - - appendStringInfoString(buf, sep); - get_rule_expr(linitial(opexpr->args), context, true); - if (!opname) - opname = generate_operator_name(opexpr->opno, - exprType(linitial(opexpr->args)), - exprType(lsecond(opexpr->args))); - sep = ", "; - } - appendStringInfoChar(buf, ')'); - } - else if (IsA(sublink->testexpr, RowCompareExpr)) - { - /* multiple combining operators, < <= > >= cases */ - RowCompareExpr *rcexpr = (RowCompareExpr *) sublink->testexpr; - - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) rcexpr->largs, context, true); - opname = generate_operator_name(linitial_oid(rcexpr->opnos), - exprType(linitial(rcexpr->largs)), - exprType(linitial(rcexpr->rargs))); - appendStringInfoChar(buf, ')'); - } - else - elog(ERROR, "unrecognized testexpr type: %d", - (int) nodeTag(sublink->testexpr)); - } - - need_paren = true; - - switch (sublink->subLinkType) - { - case EXISTS_SUBLINK: - appendStringInfoString(buf, "EXISTS "); - break; - - case ANY_SUBLINK: - if (strcmp(opname, "=") == 0) /* Represent = ANY as IN */ - appendStringInfoString(buf, " IN "); - else - appendStringInfo(buf, " %s ANY ", opname); - break; - - case ALL_SUBLINK: - appendStringInfo(buf, " %s ALL ", opname); - break; - - case ROWCOMPARE_SUBLINK: - appendStringInfo(buf, " %s ", opname); - break; - - case EXPR_SUBLINK: - case MULTIEXPR_SUBLINK: - case ARRAY_SUBLINK: - need_paren = false; - break; - - case CTE_SUBLINK: /* shouldn't occur in a SubLink */ - default: - elog(ERROR, "unrecognized sublink type: %d", - (int) sublink->subLinkType); - break; - } - - if (need_paren) - appendStringInfoChar(buf, '('); - - get_query_def(query, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - - if (need_paren) - appendStringInfoString(buf, "))"); - else - appendStringInfoChar(buf, ')'); -} - - -/* ---------- - * get_tablefunc - Parse back a table function - * ---------- - */ -static void -get_tablefunc(TableFunc *tf, deparse_context *context, bool showimplicit) -{// #lizard forgives - StringInfo buf = context->buf; - - /* XMLTABLE is the only existing implementation. 
*/ - - appendStringInfoString(buf, "XMLTABLE("); - - if (tf->ns_uris != NIL) - { - ListCell *lc1, - *lc2; - bool first = true; - - appendStringInfoString(buf, "XMLNAMESPACES ("); - forboth(lc1, tf->ns_uris, lc2, tf->ns_names) - { - Node *expr = (Node *) lfirst(lc1); - char *name = strVal(lfirst(lc2)); - - if (!first) - appendStringInfoString(buf, ", "); - else - first = false; - - if (name != NULL) - { - get_rule_expr(expr, context, showimplicit); - appendStringInfo(buf, " AS %s", name); - } - else - { - appendStringInfoString(buf, "DEFAULT "); - get_rule_expr(expr, context, showimplicit); - } - } - appendStringInfoString(buf, "), "); - } - - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tf->rowexpr, context, showimplicit); - appendStringInfoString(buf, ") PASSING ("); - get_rule_expr((Node *) tf->docexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - - if (tf->colexprs != NIL) - { - ListCell *l1; - ListCell *l2; - ListCell *l3; - ListCell *l4; - ListCell *l5; - int colnum = 0; - - l2 = list_head(tf->coltypes); - l3 = list_head(tf->coltypmods); - l4 = list_head(tf->colexprs); - l5 = list_head(tf->coldefexprs); - - appendStringInfoString(buf, " COLUMNS "); - foreach(l1, tf->colnames) - { - char *colname = strVal(lfirst(l1)); - Oid typid; - int32 typmod; - Node *colexpr; - Node *coldefexpr; - bool ordinality = tf->ordinalitycol == colnum; - bool notnull = bms_is_member(colnum, tf->notnulls); - - typid = lfirst_oid(l2); - l2 = lnext(l2); - typmod = lfirst_int(l3); - l3 = lnext(l3); - colexpr = (Node *) lfirst(l4); - l4 = lnext(l4); - coldefexpr = (Node *) lfirst(l5); - l5 = lnext(l5); - - if (colnum > 0) - appendStringInfoString(buf, ", "); - colnum++; - - appendStringInfo(buf, "%s %s", quote_identifier(colname), - ordinality ? "FOR ORDINALITY" : - format_type_with_typemod(typid, typmod)); - if (ordinality) - continue; - - if (coldefexpr != NULL) - { - appendStringInfoString(buf, " DEFAULT ("); - get_rule_expr((Node *) coldefexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - } - if (colexpr != NULL) - { - appendStringInfoString(buf, " PATH ("); - get_rule_expr((Node *) colexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - } - if (notnull) - appendStringInfoString(buf, " NOT NULL"); - } - } - - appendStringInfoChar(buf, ')'); -} - -/* ---------- - * get_from_clause - Parse back a FROM clause - * - * "prefix" is the keyword that denotes the start of the list of FROM - * elements. It is FROM when used to parse back SELECT and UPDATE, but - * is USING when parsing back DELETE. - * ---------- - */ -static void -get_from_clause(Query *query, const char *prefix, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - bool first = true; - ListCell *l; - - /* - * We use the query's jointree as a guide to what to print. However, we - * must ignore auto-added RTEs that are marked not inFromCl. (These can - * only appear at the top level of the jointree, so it's sufficient to - * check here.) This check also ensures we ignore the rule pseudo-RTEs - * for NEW and OLD. 
- */ - foreach(l, query->jointree->fromlist) - { - Node *jtnode = (Node *) lfirst(l); - - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, query->rtable); - - if (!rte->inFromCl) - continue; - } - - if (first) - { - appendContextKeyword(context, prefix, - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - first = false; - - get_from_clause_item(jtnode, query, context); - } - else - { - StringInfoData itembuf; - - appendStringInfoString(buf, ", "); - - /* - * Put the new FROM item's text into itembuf so we can decide - * after we've got it whether or not it needs to go on a new line. - */ - initStringInfo(&itembuf); - context->buf = &itembuf; - - get_from_clause_item(jtnode, query, context); - - /* Restore context's output buffer */ - context->buf = buf; - - /* Consider line-wrapping if enabled */ - if (PRETTY_INDENT(context) && context->wrapColumn >= 0) - { - /* Does the new item start with a new line? */ - if (itembuf.len > 0 && itembuf.data[0] == '\n') - { - /* If so, we shouldn't add anything */ - /* instead, remove any trailing spaces currently in buf */ - removeStringInfoSpaces(buf); - } - else - { - char *trailing_nl; - - /* Locate the start of the current line in the buffer */ - trailing_nl = strrchr(buf->data, '\n'); - if (trailing_nl == NULL) - trailing_nl = buf->data; - else - trailing_nl++; - - /* - * Add a newline, plus some indentation, if the new item - * would cause an overflow. - */ - if (strlen(trailing_nl) + itembuf.len > context->wrapColumn) - appendContextKeyword(context, "", -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_VAR); - } - } - - /* Add the new item */ - appendStringInfoString(buf, itembuf.data); - - /* clean up */ - pfree(itembuf.data); - } - } -} - -static void -get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); - - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, query->rtable); - char *refname = get_rtable_name(varno, context); - deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); - RangeTblFunction *rtfunc1 = NULL; - bool printalias; - - if (rte->lateral) - appendStringInfoString(buf, "LATERAL "); - - /* Print the FROM item proper */ - switch (rte->rtekind) - { - case RTE_RELATION: - /* Normal relation RTE */ - appendStringInfo(buf, "%s%s", - only_marker(rte), - generate_relation_name(rte->relid, - context->namespaces)); -#ifdef __TBASE__ - /* print for default partition */ - if (rte->intervalparent && rte->isdefault) - { - appendStringInfoString(buf, " PARTITION For Default "); - } -#endif - break; - case RTE_SUBQUERY: - /* Subquery RTE */ - appendStringInfoChar(buf, '('); - get_query_def(rte->subquery, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - appendStringInfoChar(buf, ')'); - break; - case RTE_FUNCTION: - /* Function RTE */ - rtfunc1 = (RangeTblFunction *) linitial(rte->functions); - - /* - * Omit ROWS FROM() syntax for just one function, unless it - * has both a coldeflist and WITH ORDINALITY. If it has both, - * we must use ROWS FROM() syntax to avoid ambiguity about - * whether the coldeflist includes the ordinality column. 
- */ - if (list_length(rte->functions) == 1 && - (rtfunc1->funccolnames == NIL || !rte->funcordinality)) - { - get_rule_expr_funccall(rtfunc1->funcexpr, context, true); - /* we'll print the coldeflist below, if it has one */ - } - else - { - bool all_unnest; - ListCell *lc; - - /* - * If all the function calls in the list are to unnest, - * and none need a coldeflist, then collapse the list back - * down to UNNEST(args). (If we had more than one - * built-in unnest function, this would get more - * difficult.) - * - * XXX This is pretty ugly, since it makes not-terribly- - * future-proof assumptions about what the parser would do - * with the output; but the alternative is to emit our - * nonstandard ROWS FROM() notation for what might have - * been a perfectly spec-compliant multi-argument - * UNNEST(). - */ - all_unnest = true; - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - - if (!IsA(rtfunc->funcexpr, FuncExpr) || - ((FuncExpr *) rtfunc->funcexpr)->funcid != F_ARRAY_UNNEST || - rtfunc->funccolnames != NIL) - { - all_unnest = false; - break; - } - } - - if (all_unnest) - { - List *allargs = NIL; - - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - List *args = ((FuncExpr *) rtfunc->funcexpr)->args; - - allargs = list_concat(allargs, list_copy(args)); - } - - appendStringInfoString(buf, "UNNEST("); - get_rule_expr((Node *) allargs, context, true); - appendStringInfoChar(buf, ')'); - } - else - { - int funcno = 0; - - appendStringInfoString(buf, "ROWS FROM("); - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - - if (funcno > 0) - appendStringInfoString(buf, ", "); - get_rule_expr_funccall(rtfunc->funcexpr, context, true); - if (rtfunc->funccolnames != NIL) - { - /* Reconstruct the column definition list */ - appendStringInfoString(buf, " AS "); - get_from_clause_coldeflist(rtfunc, - NULL, - context); - } - funcno++; - } - appendStringInfoChar(buf, ')'); - } - /* prevent printing duplicate coldeflist below */ - rtfunc1 = NULL; - } - if (rte->funcordinality) - appendStringInfoString(buf, " WITH ORDINALITY"); - break; - case RTE_TABLEFUNC: - get_tablefunc(rte->tablefunc, context, true); - break; - case RTE_VALUES: - /* Values list RTE */ - appendStringInfoChar(buf, '('); - get_values_def(rte->values_lists, context); - appendStringInfoChar(buf, ')'); - break; - case RTE_CTE: - appendStringInfoString(buf, quote_identifier(rte->ctename)); - break; - default: - elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind); - break; - } - - /* Print the relation alias, if needed */ - printalias = false; - if (rte->alias != NULL) - { - /* Always print alias if user provided one */ - printalias = true; - } - else if (colinfo->printaliases) - { - /* Always print alias if we need to print column aliases */ - printalias = true; - } - else if (rte->rtekind == RTE_RELATION) - { - /* - * No need to print alias if it's same as relation name (this - * would normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, get_relation_name(rte->relid)) != 0) - printalias = true; - } -#ifdef PGXC - else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) - { - /* - * - * This condition arises when the from clause is a view. The - * corresponding subquery RTE has its eref set to view name. 
- * The remote query generated has this subquery of which the - * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ - appendStringInfo(buf, " %s", - quote_identifier(rte->eref->aliasname)); - printalias = true; - } -#endif - else if (rte->rtekind == RTE_FUNCTION) - { - /* - * For a function RTE, always print alias. This covers possible - * renaming of the function and/or instability of the - * FigureColname rules for things that aren't simple functions. - * Note we'd need to force it anyway for the columndef list case. - */ - printalias = true; - } - else if (rte->rtekind == RTE_VALUES) - { - /* Alias is syntactically required for VALUES */ - printalias = true; - } - else if (rte->rtekind == RTE_CTE) - { - /* - * No need to print alias if it's same as CTE name (this would - * normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, rte->ctename) != 0) - printalias = true; - } - if (printalias) - appendStringInfo(buf, " %s", quote_identifier(refname)); - - /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) - { - /* Reconstruct the columndef list, which is also the aliases */ - get_from_clause_coldeflist(rtfunc1, colinfo, context); - } - else - { - /* Else print column aliases as needed */ - get_column_alias_list(colinfo, context); - } - - /* Tablesample clause must go after any alias */ - if (rte->rtekind == RTE_RELATION && rte->tablesample) - get_tablesample_def(rte->tablesample, context); - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); - bool need_paren_on_right; - - need_paren_on_right = PRETTY_PAREN(context) && - !IsA(j->rarg, RangeTblRef) && - !(IsA(j->rarg, JoinExpr) &&((JoinExpr *) j->rarg)->alias != NULL); - - if (!PRETTY_PAREN(context) || j->alias != NULL) - appendStringInfoChar(buf, '('); - - get_from_clause_item(j->larg, query, context); - - switch (j->jointype) - { - case JOIN_INNER: - if (j->quals) - appendContextKeyword(context, " JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - else - appendContextKeyword(context, " CROSS JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_LEFT: - appendContextKeyword(context, " LEFT JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_FULL: - appendContextKeyword(context, " FULL JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_RIGHT: - appendContextKeyword(context, " RIGHT JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - } - - if (need_paren_on_right) - appendStringInfoChar(buf, '('); - get_from_clause_item(j->rarg, query, context); - if (need_paren_on_right) - appendStringInfoChar(buf, ')'); - - if (j->usingClause) - { - ListCell *lc; - bool first = true; - - appendStringInfoString(buf, " USING ("); - /* Use the assigned names, not what's in usingClause */ - foreach(lc, colinfo->usingNames) - { - char *colname = (char *) lfirst(lc); - - if (first) - first = false; - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, quote_identifier(colname)); - } - appendStringInfoChar(buf, ')'); - } - else if (j->quals) - { - appendStringInfoString(buf, " ON "); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, 
'('); - get_rule_expr(j->quals, context, false); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - else if (j->jointype != JOIN_INNER) - { - /* If we didn't say CROSS JOIN above, we must provide an ON */ - appendStringInfoString(buf, " ON TRUE"); - } - - if (!PRETTY_PAREN(context) || j->alias != NULL) - appendStringInfoChar(buf, ')'); - - /* Yes, it's correct to put alias after the right paren ... */ - if (j->alias != NULL) - { - appendStringInfo(buf, " %s", - quote_identifier(j->alias->aliasname)); - get_column_alias_list(colinfo, context); - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); -} - -/* - * get_column_alias_list - print column alias list for an RTE - * - * Caller must already have printed the relation's alias name. - */ -static void -get_column_alias_list(deparse_columns *colinfo, deparse_context *context) -{ - StringInfo buf = context->buf; - int i; - bool first = true; - - /* Don't print aliases if not needed */ - if (!colinfo->printaliases) - return; - - for (i = 0; i < colinfo->num_new_cols; i++) - { - char *colname = colinfo->new_colnames[i]; - - if (first) - { - appendStringInfoChar(buf, '('); - first = false; - } - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, quote_identifier(colname)); - } - if (!first) - appendStringInfoChar(buf, ')'); -} - -/* - * get_from_clause_coldeflist - reproduce FROM clause coldeflist - * - * When printing a top-level coldeflist (which is syntactically also the - * relation's column alias list), use column names from colinfo. But when - * printing a coldeflist embedded inside ROWS FROM(), we prefer to use the - * original coldeflist's names, which are available in rtfunc->funccolnames. - * Pass NULL for colinfo to select the latter behavior. - * - * The coldeflist is appended immediately (no space) to buf. Caller is - * responsible for ensuring that an alias or AS is present before it. - */ -static void -get_from_clause_coldeflist(RangeTblFunction *rtfunc, - deparse_columns *colinfo, - deparse_context *context) -{ - StringInfo buf = context->buf; - ListCell *l1; - ListCell *l2; - ListCell *l3; - ListCell *l4; - int i; - - appendStringInfoChar(buf, '('); - - /* there's no forfour(), so must chase one list the hard way */ - i = 0; - l4 = list_head(rtfunc->funccolnames); - forthree(l1, rtfunc->funccoltypes, - l2, rtfunc->funccoltypmods, - l3, rtfunc->funccolcollations) - { - Oid atttypid = lfirst_oid(l1); - int32 atttypmod = lfirst_int(l2); - Oid attcollation = lfirst_oid(l3); - char *attname; - - if (colinfo) - attname = colinfo->colnames[i]; - else - attname = strVal(lfirst(l4)); - - Assert(attname); /* shouldn't be any dropped columns here */ - - if (i > 0) - appendStringInfoString(buf, ", "); - appendStringInfo(buf, "%s %s", - quote_identifier(attname), - format_type_with_typemod(atttypid, atttypmod)); - if (OidIsValid(attcollation) && - attcollation != get_typcollation(atttypid)) - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(attcollation)); - - l4 = lnext(l4); - i++; - } - - appendStringInfoChar(buf, ')'); -} - -/* - * get_tablesample_def - print a TableSampleClause - */ -static void -get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) -{ - StringInfo buf = context->buf; - Oid argtypes[1]; - int nargs; - ListCell *l; - - /* - * We should qualify the handler's function name if it wouldn't be - * resolved by lookup in the current search path. 
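The clause being reassembled here has the shape TABLESAMPLE method(arg, ...) with an optional REPEATABLE (seed) tail, and the method name is qualified only when a bare lookup would not find it. A minimal standalone sketch of the output shape, with the sampling method, its arguments and the seed reduced to plain strings (none of these names come from the patch itself):

    #include <stdio.h>

    static void
    print_tablesample(const char *method, const char *const *args, int nargs,
                      const char *repeatable)
    {
        int i;

        printf(" TABLESAMPLE %s (", method);
        for (i = 0; i < nargs; i++)
            printf("%s%s", i > 0 ? ", " : "", args[i]);
        printf(")");
        if (repeatable)
            printf(" REPEATABLE (%s)", repeatable);
        printf("\n");
    }

    int
    main(void)
    {
        static const char *const args[] = {"10"};

        print_tablesample("bernoulli", args, 1, "42");
        /* prints:  TABLESAMPLE bernoulli (10) REPEATABLE (42) */
        return 0;
    }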
- */ - argtypes[0] = INTERNALOID; - appendStringInfo(buf, " TABLESAMPLE %s (", - generate_function_name(tablesample->tsmhandler, 1, - NIL, argtypes, - false, NULL, EXPR_KIND_NONE)); - - nargs = 0; - foreach(l, tablesample->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) lfirst(l), context, false); - } - appendStringInfoChar(buf, ')'); - - if (tablesample->repeatable != NULL) - { - appendStringInfoString(buf, " REPEATABLE ("); - get_rule_expr((Node *) tablesample->repeatable, context, false); - appendStringInfoChar(buf, ')'); - } -} - -/* - * get_opclass_name - fetch name of an index operator class - * - * The opclass name is appended (after a space) to buf. - * - * Output is suppressed if the opclass is the default for the given - * actual_datatype. (If you don't want this behavior, just pass - * InvalidOid for actual_datatype.) - */ -static void -get_opclass_name(Oid opclass, Oid actual_datatype, - StringInfo buf) -{ - HeapTuple ht_opc; - Form_pg_opclass opcrec; - char *opcname; - char *nspname; - - ht_opc = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); - if (!HeapTupleIsValid(ht_opc)) - elog(ERROR, "cache lookup failed for opclass %u", opclass); - opcrec = (Form_pg_opclass) GETSTRUCT(ht_opc); - - if (!OidIsValid(actual_datatype) || - GetDefaultOpClass(actual_datatype, opcrec->opcmethod) != opclass) - { - /* Okay, we need the opclass name. Do we need to qualify it? */ - opcname = NameStr(opcrec->opcname); - if (OpclassIsVisible(opclass)) - appendStringInfo(buf, " %s", quote_identifier(opcname)); - else - { - nspname = get_namespace_name(opcrec->opcnamespace); - appendStringInfo(buf, " %s.%s", - quote_identifier(nspname), - quote_identifier(opcname)); - } - } - ReleaseSysCache(ht_opc); -} - -/* - * processIndirection - take care of array and subfield assignment - * - * We strip any top-level FieldStore or assignment ArrayRef nodes that - * appear in the input, printing them as decoration for the base column - * name (which we assume the caller just printed). We might also need to - * strip CoerceToDomain nodes, but only ones that appear above assignment - * nodes. - * - * Returns the subexpression that's to be assigned. - */ -static Node * -processIndirection(Node *node, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - CoerceToDomain *cdomain = NULL; - - for (;;) - { - if (node == NULL) - break; - if (IsA(node, FieldStore)) - { - FieldStore *fstore = (FieldStore *) node; - Oid typrelid; - char *fieldname; - - /* lookup tuple type */ - typrelid = get_typ_typrelid(fstore->resulttype); - if (!OidIsValid(typrelid)) - elog(ERROR, "argument type %s of FieldStore is not a tuple type", - format_type_be(fstore->resulttype)); - - /* - * Print the field name. There should only be one target field in - * stored rules. There could be more than that in executable - * target lists, but this function cannot be used for that case. - */ - Assert(list_length(fstore->fieldnums) == 1); - fieldname = get_relid_attribute_name(typrelid, - linitial_int(fstore->fieldnums)); - appendStringInfo(buf, ".%s", quote_identifier(fieldname)); - - /* - * We ignore arg since it should be an uninteresting reference to - * the target column or subcolumn. 
- */ - node = (Node *) linitial(fstore->newvals); - } - else if (IsA(node, ArrayRef)) - { - ArrayRef *aref = (ArrayRef *) node; - - if (aref->refassgnexpr == NULL) - break; - printSubscripts(aref, context); - - /* - * We ignore refexpr since it should be an uninteresting reference - * to the target column or subcolumn. - */ - node = (Node *) aref->refassgnexpr; - } - else if (IsA(node, CoerceToDomain)) - { - cdomain = (CoerceToDomain *) node; - /* If it's an explicit domain coercion, we're done */ - if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) - break; - /* Tentatively descend past the CoerceToDomain */ - node = (Node *) cdomain->arg; - } - else - break; - } - - /* - * If we descended past a CoerceToDomain whose argument turned out not to - * be a FieldStore or array assignment, back up to the CoerceToDomain. - * (This is not enough to be fully correct if there are nested implicit - * CoerceToDomains, but such cases shouldn't ever occur.) - */ - if (cdomain && node == (Node *) cdomain->arg) - node = (Node *) cdomain; - - return node; -} - -static void -printSubscripts(ArrayRef *aref, deparse_context *context) -{ - StringInfo buf = context->buf; - ListCell *lowlist_item; - ListCell *uplist_item; - - lowlist_item = list_head(aref->reflowerindexpr); /* could be NULL */ - foreach(uplist_item, aref->refupperindexpr) - { - appendStringInfoChar(buf, '['); - if (lowlist_item) - { - /* If subexpression is NULL, get_rule_expr prints nothing */ - get_rule_expr((Node *) lfirst(lowlist_item), context, false); - appendStringInfoChar(buf, ':'); - lowlist_item = lnext(lowlist_item); - } - /* If subexpression is NULL, get_rule_expr prints nothing */ - get_rule_expr((Node *) lfirst(uplist_item), context, false); - appendStringInfoChar(buf, ']'); - } -} - -/* - * quote_identifier - Quote an identifier only if needed - * - * When quotes are needed, we palloc the required space; slightly - * space-wasteful but well worth it for notational simplicity. - */ -const char * -quote_identifier(const char *ident) -{// #lizard forgives - /* - * Can avoid quoting if ident starts with a lowercase letter or underscore - * and contains only lowercase letters, digits, and underscores, *and* is - * not any SQL keyword. Otherwise, supply quotes. - */ - int nquotes = 0; - bool safe; - const char *ptr; - char *result; - char *optr; - - /* - * would like to use macros here, but they might yield unwanted - * locale-specific results... - */ - safe = ((ident[0] >= 'a' && ident[0] <= 'z') || ident[0] == '_'); - - for (ptr = ident; *ptr; ptr++) - { - char ch = *ptr; - - if ((ch >= 'a' && ch <= 'z') || - (ch >= '0' && ch <= '9') || - (ch == '_')) - { - /* okay */ - } - else - { - safe = false; - if (ch == '"') - nquotes++; - } - } - - if (quote_all_identifiers) - safe = false; - - if (safe) - { - /* - * Check for keyword. We quote keywords except for unreserved ones. - * (In some cases we could avoid quoting a col_name or type_func_name - * keyword, but it seems much harder than it's worth to tell that.) - * - * Note: ScanKeywordLookup() does case-insensitive comparison, but - * that's fine, since we already know we have all-lower-case. 
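Taken together with the keyword test and the quoting step that follow below, the scan above amounts to a compact rule: an identifier passes through bare only if it starts with a lower-case letter or underscore, contains nothing but lower-case letters, digits and underscores, and is not a reserved word; everything else is wrapped in double quotes with embedded quotes doubled. A self-contained distillation, with keyword lookup reduced to a tiny hard-coded list for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* stand-in for ScanKeywordLookup(): a few reserved words only */
    static int
    is_reserved(const char *s)
    {
        static const char *const kw[] = {"select", "from", "where", "table", NULL};
        int i;

        for (i = 0; kw[i]; i++)
            if (strcmp(s, kw[i]) == 0)
                return 1;
        return 0;
    }

    /* return s unchanged if it can be printed bare, else a malloc'd quoted copy */
    static const char *
    quote_ident(const char *s)
    {
        int         safe = (s[0] >= 'a' && s[0] <= 'z') || s[0] == '_';
        int         nquotes = 0;
        const char *p;
        char       *out, *o;

        for (p = s; *p; p++)
        {
            char ch = *p;

            if ((ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') || ch == '_')
                continue;
            safe = 0;
            if (ch == '"')
                nquotes++;
        }

        if (safe && !is_reserved(s))
            return s;

        out = o = malloc(strlen(s) + nquotes + 3);  /* 2 quotes + NUL */
        *o++ = '"';
        for (p = s; *p; p++)
        {
            if (*p == '"')
                *o++ = '"';             /* double embedded quotes */
            *o++ = *p;
        }
        *o++ = '"';
        *o = '\0';
        return out;
    }

    int
    main(void)
    {
        printf("%s %s %s\n",
               quote_ident("foo_1"), quote_ident("Foo"), quote_ident("from"));
        /* prints: foo_1 "Foo" "from" */
        return 0;
    }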
- */ - const ScanKeyword *keyword = ScanKeywordLookup(ident, - ScanKeywords, - NumScanKeywords); - - if (keyword != NULL && keyword->category != UNRESERVED_KEYWORD) - safe = false; - } - - if (safe) - return ident; /* no change needed */ - - result = (char *) palloc(strlen(ident) + nquotes + 2 + 1); - - optr = result; - *optr++ = '"'; - for (ptr = ident; *ptr; ptr++) - { - char ch = *ptr; - - if (ch == '"') - *optr++ = '"'; - *optr++ = ch; - } - *optr++ = '"'; - *optr = '\0'; - - return result; -} - -/* - * quote_qualified_identifier - Quote a possibly-qualified identifier - * - * Return a name of the form qualifier.ident, or just ident if qualifier - * is NULL, quoting each component if necessary. The result is palloc'd. - */ -char * -quote_qualified_identifier(const char *qualifier, - const char *ident) -{ - StringInfoData buf; - - initStringInfo(&buf); - if (qualifier) - appendStringInfo(&buf, "%s.", quote_identifier(qualifier)); - appendStringInfoString(&buf, quote_identifier(ident)); - return buf.data; -} - -/* - * get_relation_name - * Get the unqualified name of a relation specified by OID - * - * This differs from the underlying get_rel_name() function in that it will - * throw error instead of silently returning NULL if the OID is bad. - */ -static char * -get_relation_name(Oid relid) -{ - char *relname = get_rel_name(relid); - - if (!relname) - elog(ERROR, "cache lookup failed for relation %u", relid); - return relname; -} - -/* - * generate_relation_name - * Compute the name to display for a relation specified by OID - * - * The result includes all necessary quoting and schema-prefixing. - * - * If namespaces isn't NIL, it must be a list of deparse_namespace nodes. - * We will forcibly qualify the relation name if it equals any CTE name - * visible in the namespace list. - */ -static char * -generate_relation_name(Oid relid, List *namespaces) -{ - HeapTuple tp; - Form_pg_class reltup; - bool need_qual; - ListCell *nslist; - char *relname; - char *nspname; - char *result; - - tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for relation %u", relid); - reltup = (Form_pg_class) GETSTRUCT(tp); - relname = NameStr(reltup->relname); - - /* Check for conflicting CTE name */ - need_qual = false; - foreach(nslist, namespaces) - { - deparse_namespace *dpns = (deparse_namespace *) lfirst(nslist); - ListCell *ctlist; - - foreach(ctlist, dpns->ctes) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(ctlist); - - if (strcmp(cte->ctename, relname) == 0) - { - need_qual = true; - break; - } - } - if (need_qual) - break; - } - - /* Otherwise, qualify the name if not visible in search path */ - if (!need_qual) - need_qual = !RelationIsVisible(relid); - - if (need_qual) - nspname = get_namespace_name(reltup->relnamespace); - else - nspname = NULL; - - result = quote_qualified_identifier(nspname, relname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * generate_qualified_relation_name - * Compute the name to display for a relation specified by OID - * - * As above, but unconditionally schema-qualify the name. 
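generate_relation_name, just above, adds the schema prefix only when it has to: when the relation is not visible in the current search_path, or when a CTE of the same name is in scope and would capture the bare reference. The variant that starts here qualifies unconditionally. A small sketch of the conditional decision, with visibility and the CTE list reduced to plain arguments:

    #include <stdio.h>
    #include <string.h>

    /*
     * ctes is a NULL-terminated array of CTE names currently in scope;
     * visible says whether a bare lookup in search_path finds the relation.
     */
    static void
    print_relation(const char *nspname, const char *relname,
                   const char *const *ctes, int visible)
    {
        int need_qual = !visible;
        int i;

        for (i = 0; !need_qual && ctes[i]; i++)
            if (strcmp(ctes[i], relname) == 0)
                need_qual = 1;          /* a CTE would shadow the bare name */

        if (need_qual)
            printf("%s.%s\n", nspname, relname);
        else
            printf("%s\n", relname);
    }

    int
    main(void)
    {
        static const char *const ctes[] = {"t", NULL};

        print_relation("public", "t", ctes, 1);      /* public.t (CTE conflict) */
        print_relation("public", "orders", ctes, 1); /* orders */
        print_relation("audit", "orders", ctes, 0);  /* audit.orders */
        return 0;
    }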
- */ -static char * -generate_qualified_relation_name(Oid relid) -{ - HeapTuple tp; - Form_pg_class reltup; - char *relname; - char *nspname; - char *result; - - tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for relation %u", relid); - reltup = (Form_pg_class) GETSTRUCT(tp); - relname = NameStr(reltup->relname); - - nspname = get_namespace_name(reltup->relnamespace); - if (!nspname) - elog(ERROR, "cache lookup failed for namespace %u", - reltup->relnamespace); - - result = quote_qualified_identifier(nspname, relname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * generate_function_name - * Compute the name to display for a function specified by OID, - * given that it is being called with the specified actual arg names and - * types. (Those matter because of ambiguous-function resolution rules.) - * - * If we're dealing with a potentially variadic function (in practice, this - * means a FuncExpr or Aggref, not some other way of calling a function), then - * has_variadic must specify whether variadic arguments have been merged, - * and *use_variadic_p will be set to indicate whether to print VARIADIC in - * the output. For non-FuncExpr cases, has_variadic should be FALSE and - * use_variadic_p can be NULL. - * - * The result includes all necessary quoting and schema-prefixing. - */ -static char * -generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, - bool has_variadic, bool *use_variadic_p, - ParseExprKind special_exprkind) -{// #lizard forgives - char *result; - HeapTuple proctup; - Form_pg_proc procform; - char *proname; - bool use_variadic; - char *nspname; - FuncDetailCode p_result; - Oid p_funcid; - Oid p_rettype; - bool p_retset; - int p_nvargs; - Oid p_vatype; - Oid *p_true_typeids; - bool force_qualify = false; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - elog(ERROR, "cache lookup failed for function %u", funcid); - procform = (Form_pg_proc) GETSTRUCT(proctup); - proname = NameStr(procform->proname); - - /* - * Due to parser hacks to avoid needing to reserve CUBE, we need to force - * qualification in some special cases. - */ - if (special_exprkind == EXPR_KIND_GROUP_BY) - { - if (strcmp(proname, "cube") == 0 || strcmp(proname, "rollup") == 0) - force_qualify = true; - } - - /* - * Determine whether VARIADIC should be printed. We must do this first - * since it affects the lookup rules in func_get_detail(). - * - * Currently, we always print VARIADIC if the function has a merged - * variadic-array argument. Note that this is always the case for - * functions taking a VARIADIC argument type other than VARIADIC ANY. - * - * In principle, if VARIADIC wasn't originally specified and the array - * actual argument is deconstructable, we could print the array elements - * separately and not print VARIADIC, thus more nearly reproducing the - * original input. For the moment that seems like too much complication - * for the benefit, and anyway we do not know whether VARIADIC was - * originally specified if it's a non-ANY type. 
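The other choice this function makes, whether to schema-qualify the name, follows the same principle as relation names: redo the lookup with the bare name and the actual argument types, and qualify only if that lookup would not land on the function being printed (or if qualification is forced, as for cube and rollup in GROUP BY). A toy version with resolution reduced to a flat, first-match table; the names and oids below are invented for illustration:

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *name; int nargs; int oid; } FuncEnt;

    /* stand-in for func_get_detail(): first match by (name, nargs) wins */
    static int
    lookup(const FuncEnt *tab, int ntab, const char *name, int nargs)
    {
        int i;

        for (i = 0; i < ntab; i++)
            if (strcmp(tab[i].name, name) == 0 && tab[i].nargs == nargs)
                return tab[i].oid;
        return 0;
    }

    static void
    print_func(const FuncEnt *path, int npath, const char *nsp,
               const char *name, int nargs, int oid)
    {
        /* qualify only when the bare name would resolve to something else */
        if (lookup(path, npath, name, nargs) == oid)
            printf("%s(...)\n", name);
        else
            printf("%s.%s(...)\n", nsp, name);
    }

    int
    main(void)
    {
        /* two schemas define f with one argument; the first entry wins bare lookup */
        FuncEnt path[] = { {"f", 1, 100}, {"f", 1, 200} };

        print_func(path, 2, "pg_catalog", "f", 1, 100);  /* f(...) */
        print_func(path, 2, "myschema",   "f", 1, 200);  /* myschema.f(...) */
        return 0;
    }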
- */ - if (use_variadic_p) - { - /* Parser should not have set funcvariadic unless fn is variadic */ - Assert(!has_variadic || OidIsValid(procform->provariadic)); - use_variadic = has_variadic; - *use_variadic_p = use_variadic; - } - else - { - Assert(!has_variadic); - use_variadic = false; - } - - /* - * The idea here is to schema-qualify only if the parser would fail to - * resolve the correct function given the unqualified func name with the - * specified argtypes and VARIADIC flag. But if we already decided to - * force qualification, then we can skip the lookup and pretend we didn't - * find it. - */ - if (!force_qualify) - p_result = func_get_detail(list_make1(makeString(proname)), - NIL, argnames, nargs, argtypes, - !use_variadic, true, - &p_funcid, &p_rettype, - &p_retset, &p_nvargs, &p_vatype, - &p_true_typeids, NULL); - else - { - p_result = FUNCDETAIL_NOTFOUND; - p_funcid = InvalidOid; - } - - if ((p_result == FUNCDETAIL_NORMAL || - p_result == FUNCDETAIL_AGGREGATE || - p_result == FUNCDETAIL_WINDOWFUNC) && - p_funcid == funcid) - nspname = NULL; - else - nspname = get_namespace_name(procform->pronamespace); - - result = quote_qualified_identifier(nspname, proname); - - ReleaseSysCache(proctup); - - return result; -} - -/* - * generate_operator_name - * Compute the name to display for an operator specified by OID, - * given that it is being called with the specified actual arg types. - * (Arg types matter because of ambiguous-operator resolution rules. - * Pass InvalidOid for unused arg of a unary operator.) - * - * The result includes all necessary quoting and schema-prefixing, - * plus the OPERATOR() decoration needed to use a qualified operator name - * in an expression. - */ -static char * -generate_operator_name(Oid operid, Oid arg1, Oid arg2) -{// #lizard forgives - StringInfoData buf; - HeapTuple opertup; - Form_pg_operator operform; - char *oprname; - char *nspname; - Operator p_result; - - initStringInfo(&buf); - - opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(operid)); - if (!HeapTupleIsValid(opertup)) - elog(ERROR, "cache lookup failed for operator %u", operid); - operform = (Form_pg_operator) GETSTRUCT(opertup); - oprname = NameStr(operform->oprname); - - /* - * The idea here is to schema-qualify only if the parser would fail to - * resolve the correct operator given the unqualified op name with the - * specified argtypes. - */ - switch (operform->oprkind) - { - case 'b': - p_result = oper(NULL, list_make1(makeString(oprname)), arg1, arg2, - true, -1); - break; - case 'l': - p_result = left_oper(NULL, list_make1(makeString(oprname)), arg2, - true, -1); - break; - case 'r': - p_result = right_oper(NULL, list_make1(makeString(oprname)), arg1, - true, -1); - break; - default: - elog(ERROR, "unrecognized oprkind: %d", operform->oprkind); - p_result = NULL; /* keep compiler quiet */ - break; - } - - if (p_result != NULL && oprid(p_result) == operid) - nspname = NULL; - else - { - nspname = get_namespace_name(operform->oprnamespace); - appendStringInfo(&buf, "OPERATOR(%s.", quote_identifier(nspname)); - } - - appendStringInfoString(&buf, oprname); - - if (nspname) - appendStringInfoChar(&buf, ')'); - - if (p_result != NULL) - ReleaseSysCache(p_result); - - ReleaseSysCache(opertup); - - return buf.data; -} - -/* - * generate_collation_name - * Compute the name to display for a collation specified by OID - * - * The result includes all necessary quoting and schema-prefixing. 
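Unlike a function, a qualified operator cannot be written as schema.op in an expression; SQL requires the OPERATOR(schema.op) wrapper, which is the decoration mentioned here. A short illustration of the two output forms (the operand names x and y are placeholders):

    #include <stdio.h>

    /* print a binary operator use, qualified only when necessary */
    static void
    print_oper(const char *nspname, const char *oprname, int needs_qual)
    {
        if (needs_qual)
            printf("x OPERATOR(%s.%s) y\n", nspname, oprname);
        else
            printf("x %s y\n", oprname);
    }

    int
    main(void)
    {
        print_oper("pg_catalog", "+", 0);   /* x + y */
        print_oper("myschema",   "+", 1);   /* x OPERATOR(myschema.+) y */
        return 0;
    }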
- */ -char * -generate_collation_name(Oid collid) -{ - HeapTuple tp; - Form_pg_collation colltup; - char *collname; - char *nspname; - char *result; - - tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for collation %u", collid); - colltup = (Form_pg_collation) GETSTRUCT(tp); - collname = NameStr(colltup->collname); - - if (!CollationIsVisible(collid)) - nspname = get_namespace_name(colltup->collnamespace); - else - nspname = NULL; - - result = quote_qualified_identifier(nspname, collname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * Given a C string, produce a TEXT datum. - * - * We assume that the input was palloc'd and may be freed. - */ -static text * -string_to_text(char *str) -{ - text *result; - - result = cstring_to_text(str); - pfree(str); - return result; -} - -/* - * Generate a C string representing a relation's reloptions, or NULL if none. - */ -static char * -flatten_reloptions(Oid relid) -{ - char *result = NULL; - HeapTuple tuple; - Datum reloptions; - bool isnull; - - tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", relid); - - reloptions = SysCacheGetAttr(RELOID, tuple, - Anum_pg_class_reloptions, &isnull); - if (!isnull) - { - StringInfoData buf; - Datum *options; - int noptions; - int i; - - initStringInfo(&buf); - - deconstruct_array(DatumGetArrayTypeP(reloptions), - TEXTOID, -1, false, 'i', - &options, NULL, &noptions); - - for (i = 0; i < noptions; i++) - { - char *option = TextDatumGetCString(options[i]); - char *name; - char *separator; - char *value; - - /* - * Each array element should have the form name=value. If the "=" - * is missing for some reason, treat it like an empty value. - */ - name = option; - separator = strchr(option, '='); - if (separator) - { - *separator = '\0'; - value = separator + 1; - } - else - value = ""; - - if (i > 0) - appendStringInfoString(&buf, ", "); - appendStringInfo(&buf, "%s=", quote_identifier(name)); - - /* - * In general we need to quote the value; but to avoid unnecessary - * clutter, do not quote if it is an identifier that would not - * need quoting. (We could also allow numbers, but that is a bit - * trickier than it looks --- for example, are leading zeroes - * significant? We don't want to assume very much here about what - * custom reloptions might mean.) 
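In outline, flatten_reloptions splits each array element at its first '=', joins the pairs with commas, and quotes the value only when it could not stand on its own as an identifier. A standalone sketch of that formatting, with the bare-value test simplified to lower-case letters, digits and underscores, and quoting shown as a plain SQL string literal:

    #include <stdio.h>
    #include <string.h>

    static int
    value_is_bare(const char *v)
    {
        const char *p;

        if (!((v[0] >= 'a' && v[0] <= 'z') || v[0] == '_'))
            return 0;
        for (p = v; *p; p++)
            if (!((*p >= 'a' && *p <= 'z') || (*p >= '0' && *p <= '9') || *p == '_'))
                return 0;
        return 1;
    }

    static void
    print_reloptions(const char *const *opts, int nopts)
    {
        int i;

        for (i = 0; i < nopts; i++)
        {
            char  buf[128];
            char *eq;

            strncpy(buf, opts[i], sizeof(buf) - 1);
            buf[sizeof(buf) - 1] = '\0';

            eq = strchr(buf, '=');      /* missing '=' means empty value */
            if (eq)
                *eq = '\0';

            if (i > 0)
                printf(", ");
            if (value_is_bare(eq ? eq + 1 : ""))
                printf("%s=%s", buf, eq ? eq + 1 : "");
            else
                printf("%s='%s'", buf, eq ? eq + 1 : "");
        }
        printf("\n");
    }

    int
    main(void)
    {
        static const char *const opts[] = {"fillfactor=70", "autovacuum_enabled=true"};

        print_reloptions(opts, 2);   /* fillfactor='70', autovacuum_enabled=true */
        return 0;
    }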
- */ - if (quote_identifier(value) == value) - appendStringInfoString(&buf, value); - else - simple_quote_literal(&buf, value); - - pfree(option); - } - - result = buf.data; - } - - ReleaseSysCache(tuple); - - return result; -} - -/* - * get_one_range_partition_bound_string - * A C string representation of one range partition bound - */ -char * -get_range_partbound_string(List *bound_datums) -{ - deparse_context context; - StringInfo buf = makeStringInfo(); - ListCell *cell; - char *sep; - - memset(&context, 0, sizeof(deparse_context)); - context.buf = buf; - - appendStringInfoString(buf, "("); - sep = ""; - foreach(cell, bound_datums) - { - PartitionRangeDatum *datum = - castNode(PartitionRangeDatum, lfirst(cell)); - - appendStringInfoString(buf, sep); - if (datum->kind == PARTITION_RANGE_DATUM_MINVALUE) - appendStringInfoString(buf, "MINVALUE"); - else if (datum->kind == PARTITION_RANGE_DATUM_MAXVALUE) - appendStringInfoString(buf, "MAXVALUE"); - else - { - Const *val = castNode(Const, datum->value); - - get_const_expr(val, &context, -1); - } - sep = ", "; - } - appendStringInfoString(buf, ")"); - - return buf->data; -} - -#ifdef __TBASE__ -/* form interval partition child table/index name */ -char * -GetPartitionName(Oid parentrelid, int partidx, bool isindex) -{ - char *partname; - char relname[NAMEDATALEN]; - char *parentname = get_rel_name(parentrelid); - - StrNCpy(relname, parentname, NAMEDATALEN - 12); - - partname = (char *)palloc0(NAMEDATALEN); - - snprintf(partname, NAMEDATALEN, - "%s_part_%d", relname, partidx); - -#if 0 - if(!isindex) - snprintf(partname, NAMEDATALEN, - "part_%d_%d", parentrelid, partidx); - else - snprintf(partname, NAMEDATALEN, - "idx_%d_%d", parentrelid, partidx); -#endif - - return partname; -} - -static int -find_partidx_by_int(int64 start, int step, int partitions, - int64 value, QulificationType qualtype) -{// #lizard forgives - int partidx = -1; - int gap = -1; - int align = -1; - - if(value < start || value >= start + step*partitions) - { - return PARTITION_ROUTER_RESULT_NULL; - } - - gap = (int32)((value - start)/step); - - align = (int32)((value - start)%step); - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - if(align == 0) gap--; - case QULIFICATION_TYPE_LE: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_FULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_EQUAL: - { - if(gap >= partitions || gap < 0 ) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_NULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_FULL; - else - partidx = gap; - } - break; - default: - elog(ERROR, "not supported Qulification Type[%d]", qualtype); - } - - return partidx; -} - -static int get_daysofyear(int startyear, int startmonth, int startday, - int endyear, int endmonth, int endday) -{// #lizard forgives - int result; - - result = 0; - - if(startyear > endyear - || (startyear == endyear && startmonth > endmonth) - || (startyear == endyear && startmonth == endmonth && startday > endday)) - return -1; - - if(startyear == endyear) - { - result = get_daysofmonth(startmonth, startday, endmonth, endday); - } - else - { - result += get_daysofmonth(startmonth,startday, 12, 31); - result += (endyear - startyear - 1)*366; - result += get_daysofmonth(1, 1, endmonth, endday); - } - - return result; -} - -static int 
get_daysofmonth(int startmonth, int startday, - int endmonth, int endday) -{// #lizard forgives - int result; - - if(startmonth <=0 || startmonth > 12 - || startday <= 0 || startday > 31 - || endmonth <=0 || endmonth > 12 - || endday <= 0 || endday > 31) - { - elog(ERROR, "internal error: getdaysofmonth: parameters is invalid"); - } - - result = 0; - - if(startmonth > endmonth || (startmonth == endmonth && startday > endday)) - return -1; - - if(startmonth == endmonth) - { - result = endday - startday; - } - else - { - int monidx = 0; - - result += daysofmonth[startmonth] - startday; - - monidx = startmonth + 1; - while(monidx < endmonth) - result += daysofmonth[monidx++]; - - result += endday; - } - - return result; -} - -static int get_monthesofyear(int startyear, int startmonth, - int endyear, int endmonth) -{ - int32 gap; - if(endyear < startyear || (endyear == startyear && endmonth < startmonth)) - { - gap = -1; - } - else - { - gap = (endyear - startyear) * 12 + (endmonth - startmonth); - } - return gap; -} - - -static int -find_partidx_by_timestamp(TimestampTz start, int step, int steptype, int partitions, - TimestampTz value, QulificationType qualtype) -{// #lizard forgives - int partidx = -1; - int gap; - struct pg_tm start_time; - fsec_t start_sec; - struct pg_tm current_time; - fsec_t current_sec; - bool isalign = false; - - - /* timestamp convert to posix struct */ - if(timestamp2tm(start, NULL, &start_time, &start_sec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - if(timestamp2tm(value, NULL, ¤t_time, ¤t_sec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - if(current_time.tm_hour == 0 && current_time.tm_min == 0 && current_time.tm_sec == 0 && current_sec == 0) - { - isalign = true; - } - - if(isalign && steptype == IntervalType_Month) - { - isalign = (current_time.tm_mday == 1); - } - - /* computer gap*/ - if(steptype == IntervalType_Month) - { - if(current_time.tm_year < start_time.tm_year - || (current_time.tm_year == start_time.tm_year && current_time.tm_mon < start_time.tm_mon)) - { - gap = -1; - } - else - { - gap = (current_time.tm_year - start_time.tm_year)*12 + (current_time.tm_mon - start_time.tm_mon); - } - } - else if(steptype == IntervalType_Day) - { - gap = get_daysofyear(start_time.tm_year, start_time.tm_mon, start_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - } - else - { - elog(ERROR,"step type[%d] is invalid", steptype); - } - - if(gap >= 0) - { - if(isalign) - isalign = (gap % step == 0); - gap = gap/step; - } - else - { - gap = -1; - isalign = false; - } - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - if(isalign) - { - if (!(is_first_day_from_start(step, steptype, &start_time, ¤t_time))) - { - gap--; - } - } - case QULIFICATION_TYPE_LE: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_FULL; /* all partitions*/ - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_EQUAL: - { - if(gap >= partitions || gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_NULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_FULL; - else - partidx = gap; - } - break; - default: - elog(ERROR, "not supported Qulification Type[%d]", 
qualtype); - } - - return partidx; -} - -int -RelationGetPartitionIdxByValue(Relation rel, Datum value) -{ - int partidx = -1; - Form_pg_partition_interval routerinfo = NULL; - - routerinfo = rel->rd_partitions_info; - - if(!routerinfo) - { - elog(ERROR, "relation[%s] is not a partitioned table.", RelationGetRelationName(rel)); - } - - switch(routerinfo->partdatatype) - { - case INT2OID: /* int2 */ - { - int value_int16; - value_int16 = DatumGetInt16(value); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, QULIFICATION_TYPE_EQUAL); - } - break; - case INT4OID: /* int4 */ - { - int value_int32; - value_int32 = DatumGetInt32(value); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, QULIFICATION_TYPE_EQUAL); - } - break; - case INT8OID: /* int8 */ - { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(value), QULIFICATION_TYPE_EQUAL); - } - break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(value), QULIFICATION_TYPE_EQUAL); - break; - default: - elog(ERROR, "unsupported interval type:[%d]", routerinfo->partinterval_type); - } - - return partidx; - -} - -Bitmapset * -RelationGetPartitionByValue(Relation rel, Const *value) -{ - //TODO: - int partidx = -1; - AttrNumber partkey = InvalidAttrNumber; - Form_pg_attribute attr = NULL; - Bitmapset * bms = NULL; - char *partname = NULL; - Oid partoid = InvalidOid; - - partkey = RelationGetPartitionColumnIndex(rel); - attr = rel->rd_att->attrs[partkey-1]; - - if(attr->atttypid != value->consttype) - { - elog(ERROR, "internal error: RelationGetPartitionByValue: data type of parameter is not same as relation definition"); - } - - partidx = RelationGetPartitionIdxByValue(rel,value->constvalue); - - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partidx >= 0 && partoid) - bms = bms_make_singleton(partidx); - else - bms = NULL; - - return bms; -} - -List * -RelationGetAllPartitions(Relation rel) -{ - int nparts = 0; - char *partname = NULL; - Oid partoid = InvalidOid; - int partidx = 0; - List * result = NULL; - - nparts = RelationGetNParts(rel); - - for(partidx = 0; partidx < nparts; partidx++) - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partname) - pfree(partname); - partname = NULL; - - if (InvalidOid == partoid) - { - continue; - } - - result = lappend_oid(result, partoid); - } - - return result; -} - -int -RelationGetChildIndex(Relation rel, Oid childoid) -{ - int nparts = 0; - char *partname = NULL; - Oid partoid = InvalidOid; - int partidx = 0; - int result = -1; - - if (childoid) - { - nparts = RelationGetNParts(rel); - - for(partidx = 0; partidx < nparts; partidx++) - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if (partoid == childoid) - { - result = partidx; - - if(partname) - pfree(partname); - partname = NULL; - - break; - } - - if(partname) - pfree(partname); - partname = NULL; - } - } - - return result; -} - -Oid 
-RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx) -{ - char *partidxname = NULL; - Oid partidxoid = InvalidOid; - partidxname = GetPartitionName(indexOid,partidx,true); - partidxoid = get_relname_relid(partidxname,RelationGetNamespace(rel)); - - pfree(partidxname); - partidxname = NULL; - return partidxoid; -} - -Oid -RelationGetPartition(Relation rel, int partidx, bool isindex) -{ - char *partname = NULL; - Oid partoid = InvalidOid; - - partname = GetPartitionName(RelationGetRelid(rel), partidx, isindex); - - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partname) - pfree(partname); - partname = NULL; - return partoid; -} - -Bitmapset * -RelationGetPartitionsByQuals(Relation rel, List *strictinfos) -{ - Bitmapset * result; - Bitmapset * temp_bms; - Bitmapset * temp_result; - - ListCell *cell; - RestrictInfo *ele; - result = NULL; - temp_bms = NULL; - temp_result = NULL; - - if(list_length(strictinfos) == 0) - return get_full_pruning_result(rel); - - foreach(cell, strictinfos) - { - ele = (RestrictInfo*)lfirst(cell); - temp_bms = pruning_walker(rel,(Node*)ele); - if(result) - temp_result = bms_intersect(result, temp_bms); - else - temp_result = bms_copy(temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - - return result; -} - -static Bitmapset * -pruning_walker(Relation rel, Node *expr) -{ - Bitmapset * result; - result = NULL; - - switch(nodeTag(expr)) - { - case T_OpExpr: - { - result = pruning_opexpr(rel,(OpExpr*)expr); - } - break; - case T_RestrictInfo: - { - RestrictInfo *restricted = (RestrictInfo *)expr; - result = pruning_walker(rel, (Node *)restricted->clause); - } - break; - case T_BoolExpr: - { - BoolExpr *boolexpr = (BoolExpr*)expr; - switch(boolexpr->boolop) - { - ListCell * cell; - Bitmapset * temp_bms; - Bitmapset * temp_result; - Node *ele; - - temp_bms = NULL; - temp_result = NULL; - case AND_EXPR: - { - foreach(cell,boolexpr->args) - { - ele = (Node*)lfirst(cell); - temp_bms = pruning_walker(rel,ele); - if(result) - temp_result = bms_intersect(result, temp_bms); - else - temp_result = bms_copy(temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - } - break; - case OR_EXPR: - { - foreach(cell,boolexpr->args) - { - ele = (Node*)lfirst(cell); - temp_bms = pruning_walker(rel,ele); - temp_result = bms_union(result, temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - } - break; - case NOT_EXPR: - default: - result = get_full_pruning_result(rel); - break; - } - } - break; - default: - result = get_full_pruning_result(rel); - break; - } - - return result; -} - -static Bitmapset * -pruning_opexpr(Relation rel, OpExpr *expr) -{// #lizard forgives - Bitmapset *result = NULL; - char *opname = NULL; - Node *leftarg = NULL; - Node *rightarg = NULL; - Var *arg_var = NULL; - Const *arg_const = NULL; - bool isswap = false; - int npart; - int partidx; - AttrNumber partkey; - //Oid parttype; - QulificationType qualtype = QULIFICATION_TYPE_EQUAL; - Form_pg_partition_interval routerinfo; - - partkey = RelationGetPartitionColumnIndex(rel); - - //parttype = rel->rd_att->attrs[partkey - 1]->atttypid; - - if(list_length(expr->args) != 2) - return get_full_pruning_result(rel); - - leftarg = (Node *)list_nth(expr->args,0); - rightarg = (Node *)list_nth(expr->args,1); - - if(IsA(leftarg,Var) && IsA(rightarg,Const)) - { - arg_var = (Var *)leftarg; - arg_const = (Const *)rightarg; - } - else 
if(IsA(leftarg,Const) && IsA(rightarg,Var)) - { - arg_var = (Var *)rightarg; - arg_const = (Const *)leftarg; - isswap = true; - } - else - { - return get_full_pruning_result(rel); - } - - if(arg_var->varattno != partkey) - { - return get_full_pruning_result(rel); - } - - opname = get_opname(expr->opno); - - if(strcmp("<",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_LS; - else - qualtype = QULIFICATION_TYPE_GT; - } - else if(strcmp("<=",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_LE; - else - qualtype = QULIFICATION_TYPE_GE; - } - else if(strcmp("=",opname) == 0) - { - qualtype = QULIFICATION_TYPE_EQUAL; - } - else if(strcmp(">=",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_GE; - else - qualtype = QULIFICATION_TYPE_LE; - } - else if(strcmp(">",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_GT; - else - qualtype = QULIFICATION_TYPE_LS; - } - else - { - /* any other case, get full partitions */ - return get_full_pruning_result(rel); - } - - routerinfo = rel->rd_partitions_info; - - if(!routerinfo) - { - elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); - } - - switch(arg_const->consttype) - { - case INT2OID: /* int2 */ - { - int value_int16; - value_int16 = DatumGetInt16(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, qualtype); - } - break; - case INT4OID: /* int4 */ - { - int value_int32; - value_int32 = DatumGetInt32(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, qualtype); - } - break; - case INT8OID: /* int8 */ - { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); - } - break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); - break; - default: - elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); - } - - npart = RelationGetNParts(rel); - if(npart <= 0) - { - elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); - } - - if(partidx == PARTITION_ROUTER_RESULT_FULL) - return get_full_pruning_result(rel); - else if(partidx == PARTITION_ROUTER_RESULT_NULL) - return NULL; - else if(partidx >= 0) - { - char *partname = NULL; - Oid partoid = InvalidOid; - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - case QULIFICATION_TYPE_LE: - { - int i; - for(i = 0; i <= partidx; i++) - { - partname = GetPartitionName(RelationGetRelid(rel), i, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = bms_add_member(result, i); - } - } - } - break; - case QULIFICATION_TYPE_EQUAL: - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = bms_make_singleton(partidx); - } - } - break; - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - int i; - for(i = partidx; i < npart; i++) - { - partname = GetPartitionName(RelationGetRelid(rel), i, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = 
bms_add_member(result, i); - } - } - } - break; - default: - //nerver occur - elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); - } - } - - return result; -} - -static Bitmapset * -get_full_pruning_result(Relation rel) -{ - Bitmapset *result = NULL; - int i = 0; - int nparts = RelationGetNParts(rel); - char *partname = NULL; - Oid partoid = InvalidOid; - - Assert(nparts > 0); - - for(i=0; ibitmapplans; - replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); - } - break; - case T_BitmapOr: - { - List *planlist; - planlist = ((BitmapOr*)node)->bitmapplans; - replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); - } - break; - - /* - * scan nodes - */ - case T_TidScan: - case T_SeqScan: - { - SeqScan *seqscan; - seqscan = (SeqScan*)node; - - if(seqscan->ispartchild) - break; - if(seqscan->scanrelid != targetrel) - break; - seqscan->ispartchild = true; - seqscan->childidx = partidx; - } - break; - - case T_IndexScan: - { - IndexScan *indexscan; - indexscan = (IndexScan*)node; - - if(indexscan->scan.ispartchild) - break; - if(indexscan->scan.scanrelid != targetrel) - break; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_IndexOnlyScan: - { - IndexOnlyScan *indexscan; - indexscan = (IndexOnlyScan*)node; - - if(indexscan->scan.ispartchild) - return; - if(indexscan->scan.scanrelid != targetrel) - return; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_BitmapIndexScan: - { - BitmapIndexScan *indexscan; - indexscan = (BitmapIndexScan*)node; - - if(indexscan->scan.ispartchild) - break; - if(indexscan->scan.scanrelid != targetrel) - break; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_BitmapHeapScan: - { - Scan *scan; - scan = (Scan*)node; - - if(scan->ispartchild) - break; - if(scan->scanrelid != targetrel) - break; - - scan->ispartchild = true; - scan->childidx = partidx; - replace_partidx_bitmapheapscan(partitionparent,(Node*)scan->plan.lefttree,partidx); - //replace_target_relation((Node*)scan->scan.plan.lefttree,targetrel,partitionparent,partidx); - } - break; - - case T_SubqueryScan: - break; - - case T_FunctionScan: - case T_ValuesScan: - case T_CteScan: - case T_WorkTableScan: - case T_ForeignScan: - break; - - /* - * join nodes - */ - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - { - Plan *join; - join = (Plan*)node; - replace_target_relation((Node*)join->lefttree,targetrel,partitionparent,partidx); - replace_target_relation((Node*)join->righttree,targetrel,partitionparent,partidx); - } - break; - - /* - * materialization nodes - */ - case T_Material: - case T_Sort: - case T_Hash: - { - Plan *mat = (Plan*)node; - replace_target_relation((Node*)mat->lefttree,targetrel,partitionparent,partidx); - } - break; - case T_Group: - case T_Agg: - case T_WindowAgg: - case T_Unique: - case T_SetOp: - case T_LockRows: - case T_Limit: - break; - case T_List: - { - List * list; - ListCell *cell; - Node *element; - - list = (List *)node; - foreach(cell,list) - { - element = (Node*)lfirst(cell); - replace_target_relation(element,targetrel,partitionparent,partidx); - } - } - break; - 
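Stepping back to the pruning code above: for integer interval partitioning, routing a value is pure arithmetic, partidx = (value - start) / step, and pruning_opexpr then widens one index into a set of candidate partitions, everything at or below it for < and <=, exactly it for =, everything at or above it for >= and >. A standalone sketch of that widening, assuming integer keys and at most 64 partitions so a plain bitmask can stand in for the Bitmapset (the real code has no such limit):

    #include <stdio.h>
    #include <stdint.h>

    enum { OP_LT, OP_LE, OP_EQ, OP_GE, OP_GT };

    /*
     * Bitmask of partitions that may hold rows satisfying "key <op> value",
     * for partitions [start, start+step), [start+step, start+2*step), ...
     * Assumes nparts <= 64.
     */
    static uint64_t
    prune_int(int64_t start, int step, int nparts, int op, int64_t value)
    {
        int64_t  gap;
        int      idx, i;
        uint64_t res = 0;

        if (value < start)
            gap = -1;                   /* below the first partition */
        else
            gap = (value - start) / step;

        /* "key < boundary" does not reach the partition starting at boundary */
        if (op == OP_LT && gap >= 0 && (value - start) % step == 0)
            gap--;

        if (op == OP_EQ)
            return (gap >= 0 && gap < nparts) ? (UINT64_C(1) << gap) : 0;

        if (op == OP_LT || op == OP_LE)
        {
            if (gap < 0)
                return 0;               /* nothing can lie below the range */
            idx = (gap >= nparts) ? nparts - 1 : (int) gap;
            for (i = 0; i <= idx; i++)
                res |= UINT64_C(1) << i;
            return res;
        }

        /* OP_GE / OP_GT */
        if (gap >= nparts)
            return 0;                   /* nothing can lie above the range */
        idx = (gap < 0) ? 0 : (int) gap;
        for (i = idx; i < nparts; i++)
            res |= UINT64_C(1) << i;
        return res;
    }

    int
    main(void)
    {
        /* four partitions of width 100 starting at 0: [0,100) [100,200) ... */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_LE, 150)); /* 0x3 */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_EQ, 250)); /* 0x4 */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_GT, 199)); /* 0xe */
        return 0;
    }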
case T_RemoteSubplan: - { - RemoteSubplan *plan = (RemoteSubplan *)node; - - plan->cursor = get_internal_cursor(); - - replace_target_relation((Node*)((Plan *)plan)->lefttree,targetrel,partitionparent,partidx); - } - break; - case T_RemoteQuery: - elog(ERROR,"internal error: update partitioned parent table is forbidden in coordinator"); - break; - default: - elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); - break; - } -} - -void -replace_partidx_bitmapheapscan(Relation relation, Node *plan, int partidx) -{ - switch(nodeTag(plan)) - { - case T_BitmapAnd: - { - List *planlist; - planlist = ((BitmapAnd*)plan)->bitmapplans; - replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); - } - break; - case T_BitmapOr: - { - List *planlist; - planlist = ((BitmapOr*)plan)->bitmapplans; - replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); - } - break; - case T_BitmapIndexScan: - { - Scan *sscan; - BitmapIndexScan *idxscan_child; - - sscan = (Scan *)plan; - sscan->ispartchild = true; - sscan->childidx = partidx; - - idxscan_child = (BitmapIndexScan *)plan; - idxscan_child->indexid = RelationGetPartitionIndex(relation,idxscan_child->indexid,partidx); - } - break; - case T_List: - { - List * list; - ListCell *cell; - Node *scan; - - list = (List *)plan; - foreach(cell,list) - { - scan = (Node*)lfirst(cell); - replace_partidx_bitmapheapscan(relation, scan, partidx); - } - } - break; - default: - elog(ERROR, "internal error: BitmapHeapScan cannot have this subplan[%d]", nodeTag(plan)); - break; - } -} - -int32 -get_timestamptz_gap(TimestampTz value, int32 interval) -{ - int32 gap; - fsec_t fsec; - struct pg_tm user_time; - - if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - - switch (interval) - { - case IntervalType_Year: - { - gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, 1); - break; - } - - case IntervalType_Month: - { - gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, user_time.tm_mon); - break; - } - - case IntervalType_Day: - { - gap = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time.tm_year, user_time.tm_mon, user_time.tm_mday); - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - return gap; -} - -int32 -get_timestamptz_diff(TimestampTz value, int32 interval) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - struct pg_tm user_time; - - if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - switch (interval) - { - case IntervalType_Month: - { - gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - current_time.tm_year, current_time.tm_mon); - - gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, user_time.tm_mon); - break; 
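Both gaps computed above are month counts measured from the fixed 1970-01 base in g_partition_base_time; dividing such a gap by the partition step yields the partition index, and the difference between the gap for the current time and the gap for a row's timestamp drives the hot/cold decision. A minimal sketch of the month arithmetic:

    #include <stdio.h>

    /* months from (y1, m1) to (y2, m2); -1 if the end lies before the start */
    static int
    month_gap(int y1, int m1, int y2, int m2)
    {
        if (y2 < y1 || (y2 == y1 && m2 < m1))
            return -1;
        return (y2 - y1) * 12 + (m2 - m1);
    }

    int
    main(void)
    {
        int base_year = 1970, base_month = 1;    /* the g_partition_base_time origin */
        int step = 1;                            /* one partition per month */
        int gap = month_gap(base_year, base_month, 2021, 3);

        printf("gap=%d partidx=%d\n", gap, gap / step);   /* gap=614 partidx=614 */
        return 0;
    }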
- } - - case IntervalType_Day: - { - gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time.tm_year, user_time.tm_mon, user_time.tm_mday); - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - - - return gap1 - gap2; -} - -int32 -date_diff(struct pg_tm *user_time) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - current_time.tm_year, current_time.tm_mon); - - gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time->tm_year, user_time->tm_mon); - - - - return gap1 - gap2; -} - -int32 -date_diff_indays(struct pg_tm *user_time) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - - - gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time->tm_year, user_time->tm_mon, user_time->tm_mday); - - return gap1 - gap2; -} - -int get_months_away_from_base(struct pg_tm * user_tm) -{ - return get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_tm->tm_year, user_tm->tm_mon); -} - -int get_days_away_from_base(struct pg_tm * user_tm) -{ - return get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_tm->tm_year, user_tm->tm_mon, user_tm->tm_mday); -} - -bool is_sec_meet_temp_cold_date(TimestampTz secvalue, int32 interval, int step, TimestampTz startValue) -{// #lizard forgives - bool ret; - fsec_t fsec; - struct pg_tm sec_time; - - if(timestamp2tm(secvalue, NULL, &sec_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - switch (interval) - { - case IntervalType_Year: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year); - break; - } - case IntervalType_Month: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) - && (g_TempColdDataTime.tm_mon == sec_time.tm_mon); - break; - } - - case IntervalType_Day: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) - && (g_TempColdDataTime.tm_mon == sec_time.tm_mon) - && (g_TempColdDataTime.tm_mday == sec_time.tm_mday); - if (!ret) - { - struct pg_tm start_time; - - if(timestamp2tm(startValue, NULL, &start_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out 
of range"))); - } - - ret = is_first_day_from_start(step, interval, &start_time, &sec_time); - if (ret) - { - if (g_TempColdDataTime.tm_year + 1 == sec_time.tm_year && - g_TempColdDataTime.tm_mon == 12 && - g_TempColdDataTime.tm_mday == 31) - { - ret = true; - } - else - { - ret = false; - } - } - } - - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - - return ret; -} - -int32 GetPartitionIndex(TimestampTz start, int step, int steptype, int partitions, TimestampTz value) -{ - return find_partidx_by_timestamp(start, step, steptype, partitions, value, QULIFICATION_TYPE_EQUAL); -} - -/* is the first day of next year from start year */ -bool -is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct pg_tm *current_time) -{ - bool result = false; - - /* partition by one day */ - if (step == 1 && steptype == IntervalType_Day) - { - if (current_time->tm_year == start_time->tm_year + 1 && current_time->tm_mon == 1 && - current_time->tm_mday == 1) - { - result = true; - } - } - - return result; -} -#endif +/*------------------------------------------------------------------------- + * + * ruleutils.c + * Functions to convert stored expressions/querytrees back to + * source text + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/ruleutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#ifdef PGXC +#include "access/reloptions.h" +#endif /* PGXC */ +#include "access/amapi.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/partition.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" +#include "catalog/pg_authid.h" +#ifdef PGXC +#include "catalog/pg_aggregate.h" +#endif /* PGXC */ +#include "catalog/pg_collation.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_language.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_partitioned_table.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_trigger.h" +#include "catalog/pg_type.h" +#include "commands/defrem.h" +#include "commands/tablespace.h" +#include "common/keywords.h" +#include "executor/spi.h" +#include "funcapi.h" +#ifdef PGXC +#include "nodes/execnodes.h" +#endif +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/tlist.h" +#include "parser/parse_node.h" +#include "parser/parse_agg.h" +#include "parser/parse_func.h" +#include "parser/parse_oper.h" +#include "parser/parse_type.h" +#include "parser/parser.h" +#include "parser/parsetree.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#endif +#include "rewrite/rewriteHandler.h" +#include "rewrite/rewriteManip.h" +#include "rewrite/rewriteSupport.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/hsearch.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include 
"utils/tqual.h" +#include "utils/typcache.h" +#include "utils/varlena.h" +#include "utils/xml.h" +#ifdef __TBASE__ +#include "optimizer/planmain.h" +#endif +#ifdef __COLD_HOT__ +#include "postmaster/postmaster.h" +#endif + +/* ---------- + * Pretty formatting constants + * ---------- + */ + +/* Indent counts */ +#define PRETTYINDENT_STD 8 +#define PRETTYINDENT_JOIN 4 +#define PRETTYINDENT_VAR 4 + +#define PRETTYINDENT_LIMIT 40 /* wrap limit */ + +/* Pretty flags */ +#define PRETTYFLAG_PAREN 1 +#define PRETTYFLAG_INDENT 2 + +/* Default line length for pretty-print wrapping: 0 means wrap always */ +#define WRAP_COLUMN_DEFAULT 0 + +/* macro to test if pretty action needed */ +#define PRETTY_PAREN(context) ((context)->prettyFlags & PRETTYFLAG_PAREN) +#define PRETTY_INDENT(context) ((context)->prettyFlags & PRETTYFLAG_INDENT) + + +#ifdef __TBASE__ +static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; + +static struct pg_tm g_partition_base_time = { 0, + 0, + 0, + 1, + 1, /* origin 0, not 1 */ + 1970, /* relative to 1900 */ + 1, + 1, + 0, + 0, + NULL + }; +#endif + +/* ---------- + * Local data types + * ---------- + */ + +/* Context info needed for invoking a recursive querytree display routine */ +typedef struct +{ + StringInfo buf; /* output buffer to append to */ + List *namespaces; /* List of deparse_namespace nodes */ + List *windowClause; /* Current query level's WINDOW clause */ + List *windowTList; /* targetlist for resolving WINDOW clause */ + int prettyFlags; /* enabling of pretty-print functions */ + int wrapColumn; /* max line length, or -1 for no limit */ + int indentLevel; /* current indent level for prettyprint */ + bool varprefix; /* TRUE to print prefixes on Vars */ + ParseExprKind special_exprkind; /* set only for exprkinds needing special + * handling */ +#ifdef PGXC + bool finalise_aggs; /* should Datanode finalise the aggregates? */ + bool sortgroup_colno;/* instead of expression use resno for + * sortgrouprefs. + */ +#endif /* PGXC */ +} deparse_context; + +/* + * Each level of query context around a subtree needs a level of Var namespace. + * A Var having varlevelsup=N refers to the N'th item (counting from 0) in + * the current context's namespaces list. + * + * The rangetable is the list of actual RTEs from the query tree, and the + * cte list is the list of actual CTEs. + * + * rtable_names holds the alias name to be used for each RTE (either a C + * string, or NULL for nameless RTEs such as unnamed joins). + * rtable_columns holds the column alias names to be used for each RTE. + * + * In some cases we need to make names of merged JOIN USING columns unique + * across the whole query, not only per-RTE. If so, unique_using is TRUE + * and using_names is a list of C strings representing names already assigned + * to USING columns. + * + * When deparsing plan trees, there is always just a single item in the + * deparse_namespace list (since a plan tree never contains Vars with + * varlevelsup > 0). We store the PlanState node that is the immediate + * parent of the expression to be deparsed, as well as a list of that + * PlanState's ancestors. In addition, we store its outer and inner subplan + * state nodes, as well as their plan nodes' targetlists, and the index tlist + * if the current plan node might contain INDEX_VAR Vars. (These fields could + * be derived on-the-fly from the current PlanState, but it seems notationally + * clearer to set them up as separate fields.) 
+ */ +typedef struct +{ + List *rtable; /* List of RangeTblEntry nodes */ + List *rtable_names; /* Parallel list of names for RTEs */ + List *rtable_columns; /* Parallel list of deparse_columns structs */ + List *ctes; /* List of CommonTableExpr nodes */ + /* Workspace for column alias assignment: */ + bool unique_using; /* Are we making USING names globally unique */ + List *using_names; /* List of assigned names for USING columns */ + /* Remaining fields are used only when deparsing a Plan tree: */ + PlanState *planstate; /* immediate parent of current expression */ + List *ancestors; /* ancestors of planstate */ + PlanState *outer_planstate; /* outer subplan state, or NULL if none */ + PlanState *inner_planstate; /* inner subplan state, or NULL if none */ + List *outer_tlist; /* referent for OUTER_VAR Vars */ + List *inner_tlist; /* referent for INNER_VAR Vars */ + List *index_tlist; /* referent for INDEX_VAR Vars */ +} deparse_namespace; + +/* + * Per-relation data about column alias names. + * + * Selecting aliases is unreasonably complicated because of the need to dump + * rules/views whose underlying tables may have had columns added, deleted, or + * renamed since the query was parsed. We must nonetheless print the rule/view + * in a form that can be reloaded and will produce the same results as before. + * + * For each RTE used in the query, we must assign column aliases that are + * unique within that RTE. SQL does not require this of the original query, + * but due to factors such as *-expansion we need to be able to uniquely + * reference every column in a decompiled query. As long as we qualify all + * column references, per-RTE uniqueness is sufficient for that. + * + * However, we can't ensure per-column name uniqueness for unnamed join RTEs, + * since they just inherit column names from their input RTEs, and we can't + * rename the columns at the join level. Most of the time this isn't an issue + * because we don't need to reference the join's output columns as such; we + * can reference the input columns instead. That approach can fail for merged + * JOIN USING columns, however, so when we have one of those in an unnamed + * join, we have to make that column's alias globally unique across the whole + * query to ensure it can be referenced unambiguously. + * + * Another problem is that a JOIN USING clause requires the columns to be + * merged to have the same aliases in both input RTEs, and that no other + * columns in those RTEs or their children conflict with the USING names. + * To handle that, we do USING-column alias assignment in a recursive + * traversal of the query's jointree. When descending through a JOIN with + * USING, we preassign the USING column names to the child columns, overriding + * other rules for column alias assignment. We also mark each RTE with a list + * of all USING column names selected for joins containing that RTE, so that + * when we assign other columns' aliases later, we can avoid conflicts. + * + * Another problem is that if a JOIN's input tables have had columns added or + * deleted since the query was parsed, we must generate a column alias list + * for the join that matches the current set of input columns --- otherwise, a + * change in the number of columns in the left input would throw off matching + * of aliases to columns of the right input. Thus, positions in the printable + * column alias list are not necessarily one-for-one with varattnos of the + * JOIN, so we need a separate new_colnames[] array for printing purposes. 
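The alias-selection rules described above come down to: propose a name, and if it is already taken within the relevant scope, append a numeric suffix until it is unique, which is what colname_is_unique/make_colname_unique and the NameHashEntry counter further down implement against a hash table. A self-contained sketch of that uniquification loop, using a plain string array instead of the real hash table; all helper names here are invented for the demo:

#include <stdio.h>
#include <string.h>

#define MAX_NAMES 16
#define NAME_LEN  64

/* demo-sized registry of names already assigned; no overflow checks */
static char used_names[MAX_NAMES][NAME_LEN];
static int  n_used = 0;

static int
name_in_use(const char *name)
{
    int i;

    for (i = 0; i < n_used; i++)
        if (strcmp(used_names[i], name) == 0)
            return 1;
    return 0;
}

/* append "_N" with increasing N until the name is unique, then remember it */
static const char *
make_unique(const char *colname)
{
    static char buf[NAME_LEN];
    int         counter = 0;

    strncpy(buf, colname, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';
    while (name_in_use(buf))
        snprintf(buf, sizeof(buf), "%s_%d", colname, ++counter);

    strcpy(used_names[n_used++], buf);
    return buf;
}

int
main(void)
{
    printf("%s\n", make_unique("f1"));   /* f1   */
    printf("%s\n", make_unique("f1"));   /* f1_1 */
    printf("%s\n", make_unique("f1"));   /* f1_2 */
    return 0;
}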
+ */ +typedef struct +{ + /* + * colnames is an array containing column aliases to use for columns that + * existed when the query was parsed. Dropped columns have NULL entries. + * This array can be directly indexed by varattno to get a Var's name. + * + * Non-NULL entries are guaranteed unique within the RTE, *except* when + * this is for an unnamed JOIN RTE. In that case we merely copy up names + * from the two input RTEs. + * + * During the recursive descent in set_using_names(), forcible assignment + * of a child RTE's column name is represented by pre-setting that element + * of the child's colnames array. So at that stage, NULL entries in this + * array just mean that no name has been preassigned, not necessarily that + * the column is dropped. + */ + int num_cols; /* length of colnames[] array */ + char **colnames; /* array of C strings and NULLs */ + + /* + * new_colnames is an array containing column aliases to use for columns + * that would exist if the query was re-parsed against the current + * definitions of its base tables. This is what to print as the column + * alias list for the RTE. This array does not include dropped columns, + * but it will include columns added since original parsing. Indexes in + * it therefore have little to do with current varattno values. As above, + * entries are unique unless this is for an unnamed JOIN RTE. (In such an + * RTE, we never actually print this array, but we must compute it anyway + * for possible use in computing column names of upper joins.) The + * parallel array is_new_col marks which of these columns are new since + * original parsing. Entries with is_new_col false must match the + * non-NULL colnames entries one-for-one. + */ + int num_new_cols; /* length of new_colnames[] array */ + char **new_colnames; /* array of C strings */ + bool *is_new_col; /* array of bool flags */ + + /* This flag tells whether we should actually print a column alias list */ + bool printaliases; + + /* This list has all names used as USING names in joins above this RTE */ + List *parentUsing; /* names assigned to parent merged columns */ + + /* + * If this struct is for a JOIN RTE, we fill these fields during the + * set_using_names() pass to describe its relationship to its child RTEs. + * + * leftattnos and rightattnos are arrays with one entry per existing + * output column of the join (hence, indexable by join varattno). For a + * simple reference to a column of the left child, leftattnos[i] is the + * child RTE's attno and rightattnos[i] is zero; and conversely for a + * column of the right child. But for merged columns produced by JOIN + * USING/NATURAL JOIN, both leftattnos[i] and rightattnos[i] are nonzero. + * Also, if the column has been dropped, both are zero. + * + * If it's a JOIN USING, usingNames holds the alias names selected for the + * merged columns (these might be different from the original USING list, + * if we had to modify names to achieve uniqueness). 
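Per the leftattnos/rightattnos field comments above, each join output column is classified by its (leftattnos[i], rightattnos[i]) pair: only the left entry set, only the right entry set, both set for a merged USING/NATURAL column, or both zero for a dropped column. A standalone sketch of that classification; the sample attno arrays are invented for the demo:

#include <stdio.h>

static const char *
classify_join_col(int leftattno, int rightattno)
{
    if (leftattno != 0 && rightattno != 0)
        return "merged USING/NATURAL column";
    if (leftattno != 0)
        return "plain reference to left child";
    if (rightattno != 0)
        return "plain reference to right child";
    return "dropped column";
}

int
main(void)
{
    /* one entry per join output column, as in leftattnos[]/rightattnos[] */
    int leftattnos[]  = {1, 0, 2, 0};
    int rightattnos[] = {1, 3, 0, 0};
    int i;

    for (i = 0; i < 4; i++)
        printf("join attno %d: %s\n", i + 1,
               classify_join_col(leftattnos[i], rightattnos[i]));
    return 0;
}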
+ */ + int leftrti; /* rangetable index of left child */ + int rightrti; /* rangetable index of right child */ + int *leftattnos; /* left-child varattnos of join cols, or 0 */ + int *rightattnos; /* right-child varattnos of join cols, or 0 */ + List *usingNames; /* names assigned to merged columns */ +} deparse_columns; + +/* This macro is analogous to rt_fetch(), but for deparse_columns structs */ +#define deparse_columns_fetch(rangetable_index, dpns) \ + ((deparse_columns *) list_nth((dpns)->rtable_columns, (rangetable_index)-1)) + +/* + * Entry in set_rtable_names' hash table + */ +typedef struct +{ + char name[NAMEDATALEN]; /* Hash key --- must be first */ + int counter; /* Largest addition used so far for name */ +} NameHashEntry; + + +/* ---------- + * Global data + * ---------- + */ +static SPIPlanPtr plan_getrulebyoid = NULL; +static const char *query_getrulebyoid = "SELECT * FROM pg_catalog.pg_rewrite WHERE oid = $1"; +static SPIPlanPtr plan_getviewrule = NULL; +static const char *query_getviewrule = "SELECT * FROM pg_catalog.pg_rewrite WHERE ev_class = $1 AND rulename = $2"; + +/* GUC parameters */ +bool quote_all_identifiers = false; + + +/* ---------- + * Local functions + * + * Most of these functions used to use fixed-size buffers to build their + * results. Now, they take an (already initialized) StringInfo object + * as a parameter, and append their text output to its contents. + * ---------- + */ +static char *deparse_expression_pretty(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit, + int prettyFlags, int startIndent); +static char *pg_get_viewdef_worker(Oid viewoid, + int prettyFlags, int wrapColumn); +static char *pg_get_triggerdef_worker(Oid trigid, bool pretty); +static void decompile_column_index_array(Datum column_index_array, Oid relId, + StringInfo buf); +static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); +static char *pg_get_indexdef_worker(Oid indexrelid, int colno, + const Oid *excludeOps, + bool attrsOnly, bool showTblSpc, + int prettyFlags, bool missing_ok); +static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); +static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, + bool attrsOnly, bool missing_ok); +static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, + int prettyFlags, bool missing_ok); +static text *pg_get_expr_worker(text *expr, Oid relid, const char *relname, + int prettyFlags); +static int print_function_arguments(StringInfo buf, HeapTuple proctup, + bool print_table_args, bool print_defaults); +static void print_function_rettype(StringInfo buf, HeapTuple proctup); +static void print_function_trftypes(StringInfo buf, HeapTuple proctup); +static void set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, + Bitmapset *rels_used); +static void set_deparse_for_query(deparse_namespace *dpns, Query *query, + List *parent_namespaces); +static void set_simple_column_names(deparse_namespace *dpns); +static bool has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode); +static void set_using_names(deparse_namespace *dpns, Node *jtnode, + List *parentUsing); +static void set_relation_column_names(deparse_namespace *dpns, + RangeTblEntry *rte, + deparse_columns *colinfo); +static void set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo); +static bool colname_is_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo); +static char *make_colname_unique(char *colname, deparse_namespace *dpns, + 
deparse_columns *colinfo); +static void expand_colnames_array_to(deparse_columns *colinfo, int n); +static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, + deparse_columns *colinfo); +static void flatten_join_using_qual(Node *qual, + List **leftvars, List **rightvars); +static char *get_rtable_name(int rtindex, deparse_context *context); +static void set_deparse_planstate(deparse_namespace *dpns, PlanState *ps); +#ifdef PGXC +static void set_deparse_plan(deparse_namespace *dpns, Plan *plan); +#endif +static void push_child_plan(deparse_namespace *dpns, PlanState *ps, + deparse_namespace *save_dpns); +static void pop_child_plan(deparse_namespace *dpns, + deparse_namespace *save_dpns); +static void push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, + deparse_namespace *save_dpns); +static void pop_ancestor_plan(deparse_namespace *dpns, + deparse_namespace *save_dpns); +static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags); +static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags, int wrapColumn); +static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, + TupleDesc resultDesc, + int prettyFlags, int wrapColumn, int startIndent +#ifdef PGXC + , bool finalise_aggregates, bool sortgroup_colno +#endif /* PGXC */ + ); +static void get_values_def(List *values_lists, deparse_context *context); +static void get_with_clause(Query *query, deparse_context *context); +static void get_select_query_def(Query *query, deparse_context *context, + TupleDesc resultDesc); +static void get_insert_query_def(Query *query, deparse_context *context); +static void get_update_query_def(Query *query, deparse_context *context); +static void get_update_query_targetlist_def(Query *query, List *targetList, + deparse_context *context, + RangeTblEntry *rte); +static void get_delete_query_def(Query *query, deparse_context *context); +static void get_utility_query_def(Query *query, deparse_context *context); +static void get_basic_select_query(Query *query, deparse_context *context, + TupleDesc resultDesc); +static void get_target_list(List *targetList, deparse_context *context, + TupleDesc resultDesc); +static void get_setop_query(Node *setOp, Query *query, + deparse_context *context, + TupleDesc resultDesc); +static Node *get_rule_sortgroupclause(Index ref, List *tlist, + bool force_colno, + deparse_context *context); +static void get_rule_groupingset(GroupingSet *gset, List *targetlist, + bool omit_parens, deparse_context *context); +static void get_rule_orderby(List *orderList, List *targetList, + bool force_colno, deparse_context *context); +static void get_rule_windowclause(Query *query, deparse_context *context); +static void get_rule_windowspec(WindowClause *wc, List *targetList, + deparse_context *context); +static char *get_variable(Var *var, int levelsup, bool istoplevel, + deparse_context *context); +static void get_special_variable(Node *node, deparse_context *context, + void *private); +static void resolve_special_varno(Node *node, deparse_context *context, + void *private, + void (*callback) (Node *, deparse_context *, void *)); +static Node *find_param_referent(Param *param, deparse_context *context, + deparse_namespace **dpns_p, ListCell **ancestor_cell_p); +static void get_parameter(Param *param, deparse_context *context); +static const char *get_simple_binary_op_name(OpExpr *expr); +static bool isSimpleNode(Node *node, Node *parentNode, int prettyFlags); +static void 
appendContextKeyword(deparse_context *context, const char *str, + int indentBefore, int indentAfter, int indentPlus); +static void removeStringInfoSpaces(StringInfo str); +static void get_rule_expr(Node *node, deparse_context *context, + bool showimplicit); +static void get_rule_expr_toplevel(Node *node, deparse_context *context, + bool showimplicit); +static void get_rule_expr_funccall(Node *node, deparse_context *context, + bool showimplicit); +static bool looks_like_function(Node *node); +static void get_oper_expr(OpExpr *expr, deparse_context *context); +static void get_func_expr(FuncExpr *expr, deparse_context *context, + bool showimplicit); +static void get_agg_expr(Aggref *aggref, deparse_context *context, + Aggref *original_aggref); +static void get_agg_combine_expr(Node *node, deparse_context *context, + void *private); +static void get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context); +static void get_coercion_expr(Node *arg, deparse_context *context, + Oid resulttype, int32 resulttypmod, + Node *parentNode); +static void get_const_expr(Const *constval, deparse_context *context, + int showtype); +static void get_const_collation(Const *constval, deparse_context *context); +static void simple_quote_literal(StringInfo buf, const char *val); +static void get_sublink_expr(SubLink *sublink, deparse_context *context); +static void get_tablefunc(TableFunc *tf, deparse_context *context, + bool showimplicit); +static void get_from_clause(Query *query, const char *prefix, + deparse_context *context); +static void get_from_clause_item(Node *jtnode, Query *query, + deparse_context *context); +static void get_column_alias_list(deparse_columns *colinfo, + deparse_context *context); +static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, + deparse_columns *colinfo, + deparse_context *context); +static void get_tablesample_def(TableSampleClause *tablesample, + deparse_context *context); +static void get_opclass_name(Oid opclass, Oid actual_datatype, + StringInfo buf); +static Node *processIndirection(Node *node, deparse_context *context); +static void printSubscripts(ArrayRef *aref, deparse_context *context); +static char *get_relation_name(Oid relid); +static char *generate_relation_name(Oid relid, List *namespaces); +static char *generate_qualified_relation_name(Oid relid); +static char *generate_function_name(Oid funcid, int nargs, + List *argnames, Oid *argtypes, + bool has_variadic, bool *use_variadic_p, + ParseExprKind special_exprkind); +static char *generate_operator_name(Oid operid, Oid arg1, Oid arg2); +static text *string_to_text(char *str); +static char *flatten_reloptions(Oid relid); + +#ifdef __TBASE__ +static Bitmapset *pruning_walker(Relation rel, Node *expr); +static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); +static Bitmapset *get_full_pruning_result(Relation rel); +static int get_daysofmonth(int startmonth, int startday, + int endmonth, int endday); +#endif +#define only_marker(rte) ((rte)->inh ? 
"" : "ONLY ") + + +/* ---------- + * get_ruledef - Do it all and return a text + * that could be used as a statement + * to recreate the rule + * ---------- + */ +Datum +pg_get_ruledef(PG_FUNCTION_ARGS) +{ + Oid ruleoid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_ruledef_worker(ruleoid, prettyFlags); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_ruledef_ext(PG_FUNCTION_ARGS) +{ + Oid ruleoid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_ruledef_worker(ruleoid, prettyFlags); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +static char * +pg_get_ruledef_worker(Oid ruleoid, int prettyFlags) +{// #lizard forgives + Datum args[1]; + char nulls[1]; + int spirc; + HeapTuple ruletup; + TupleDesc rulettc; + StringInfoData buf; + + /* + * Do this first so that string is alloc'd in outer context not SPI's. + */ + initStringInfo(&buf); + + /* + * Connect to SPI manager + */ + if (SPI_connect() != SPI_OK_CONNECT) + elog(ERROR, "SPI_connect failed"); + + /* + * On the first call prepare the plan to lookup pg_rewrite. We read + * pg_rewrite over the SPI manager instead of using the syscache to be + * checked for read access on pg_rewrite. + */ + if (plan_getrulebyoid == NULL) + { + Oid argtypes[1]; + SPIPlanPtr plan; + + argtypes[0] = OIDOID; + plan = SPI_prepare(query_getrulebyoid, 1, argtypes); + if (plan == NULL) + elog(ERROR, "SPI_prepare failed for \"%s\"", query_getrulebyoid); + SPI_keepplan(plan); + plan_getrulebyoid = plan; + } + + /* + * Get the pg_rewrite tuple for this rule + */ + args[0] = ObjectIdGetDatum(ruleoid); + nulls[0] = ' '; + spirc = SPI_execute_plan(plan_getrulebyoid, args, nulls, true, 0); + if (spirc != SPI_OK_SELECT) + elog(ERROR, "failed to get pg_rewrite tuple for rule %u", ruleoid); + if (SPI_processed != 1) + { + /* + * There is no tuple data available here, just keep the output buffer + * empty. + */ + } + else + { + /* + * Get the rule's definition and put it into executor's memory + */ + ruletup = SPI_tuptable->vals[0]; + rulettc = SPI_tuptable->tupdesc; + make_ruledef(&buf, ruletup, rulettc, prettyFlags); + } + + /* + * Disconnect from SPI manager + */ + if (SPI_finish() != SPI_OK_FINISH) + elog(ERROR, "SPI_finish failed"); + + if (buf.len == 0) + return NULL; + + return buf.data; +} + + +/* ---------- + * get_viewdef - Mainly the same thing, but we + * only return the SELECT part of a view + * ---------- + */ +Datum +pg_get_viewdef(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_viewdef_ext(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_viewdef_wrap(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + int wrap = PG_GETARG_INT32(1); + int prettyFlags; + char *res; + + /* calling this implies we want pretty printing */ + prettyFlags = PRETTYFLAG_PAREN | PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, wrap); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_viewdef_name(PG_FUNCTION_ARGS) +{ + /* By qualified name */ + text *viewname = PG_GETARG_TEXT_PP(0); + int prettyFlags; + RangeVar *viewrel; + Oid viewoid; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + /* Look up view name. Can't lock it - we might not have privileges. */ + viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); + viewoid = RangeVarGetRelid(viewrel, NoLock, false); + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_viewdef_name_ext(PG_FUNCTION_ARGS) +{ + /* By qualified name */ + text *viewname = PG_GETARG_TEXT_PP(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + RangeVar *viewrel; + Oid viewoid; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + /* Look up view name. Can't lock it - we might not have privileges. */ + viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); + viewoid = RangeVarGetRelid(viewrel, NoLock, false); + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Common code for by-OID and by-name variants of pg_get_viewdef + */ +static char * +pg_get_viewdef_worker(Oid viewoid, int prettyFlags, int wrapColumn) +{// #lizard forgives + Datum args[2]; + char nulls[2]; + int spirc; + HeapTuple ruletup; + TupleDesc rulettc; + StringInfoData buf; + + /* + * Do this first so that string is alloc'd in outer context not SPI's. + */ + initStringInfo(&buf); + + /* + * Connect to SPI manager + */ + if (SPI_connect() != SPI_OK_CONNECT) + elog(ERROR, "SPI_connect failed"); + + /* + * On the first call prepare the plan to lookup pg_rewrite. We read + * pg_rewrite over the SPI manager instead of using the syscache to be + * checked for read access on pg_rewrite. + */ + if (plan_getviewrule == NULL) + { + Oid argtypes[2]; + SPIPlanPtr plan; + + argtypes[0] = OIDOID; + argtypes[1] = NAMEOID; + plan = SPI_prepare(query_getviewrule, 2, argtypes); + if (plan == NULL) + elog(ERROR, "SPI_prepare failed for \"%s\"", query_getviewrule); + SPI_keepplan(plan); + plan_getviewrule = plan; + } + + /* + * Get the pg_rewrite tuple for the view's SELECT rule + */ + args[0] = ObjectIdGetDatum(viewoid); + args[1] = DirectFunctionCall1(namein, CStringGetDatum(ViewSelectRuleName)); + nulls[0] = ' '; + nulls[1] = ' '; + spirc = SPI_execute_plan(plan_getviewrule, args, nulls, true, 0); + if (spirc != SPI_OK_SELECT) + elog(ERROR, "failed to get pg_rewrite tuple for view %u", viewoid); + if (SPI_processed != 1) + { + /* + * There is no tuple data available here, just keep the output buffer + * empty. 
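Both the rule and view workers in this file rely on the same SPI idiom: prepare the pg_rewrite lookup on first use, promote the plan out of SPI's procedure memory with SPI_keepplan(), and cache the SPIPlanPtr in a static variable for later calls. A minimal sketch of that idiom in isolation, assuming a backend (extension) compilation context; the query text and variable names here are illustrative, not the ones used above:

#include "postgres.h"

#include "catalog/pg_type.h"
#include "executor/spi.h"

static SPIPlanPtr cached_plan = NULL;

/* run "SELECT ... WHERE oid = $1" through a plan prepared exactly once */
static void
run_cached_lookup(Oid target)
{
    Datum   values[1];
    char    nulls[1] = {' '};

    if (SPI_connect() != SPI_OK_CONNECT)
        elog(ERROR, "SPI_connect failed");

    if (cached_plan == NULL)
    {
        Oid         argtypes[1] = {OIDOID};
        SPIPlanPtr  plan;

        plan = SPI_prepare("SELECT * FROM pg_catalog.pg_class WHERE oid = $1",
                           1, argtypes);
        if (plan == NULL)
            elog(ERROR, "SPI_prepare failed");
        SPI_keepplan(plan);     /* keeps the plan alive across SPI_finish() */
        cached_plan = plan;
    }

    values[0] = ObjectIdGetDatum(target);
    if (SPI_execute_plan(cached_plan, values, nulls, true, 0) != SPI_OK_SELECT)
        elog(ERROR, "SPI_execute_plan failed");

    /* SPI_processed / SPI_tuptable would be inspected here */

    if (SPI_finish() != SPI_OK_FINISH)
        elog(ERROR, "SPI_finish failed");
}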
+ */ + } + else + { + /* + * Get the rule's definition and put it into executor's memory + */ + ruletup = SPI_tuptable->vals[0]; + rulettc = SPI_tuptable->tupdesc; + make_viewdef(&buf, ruletup, rulettc, prettyFlags, wrapColumn); + } + + /* + * Disconnect from SPI manager + */ + if (SPI_finish() != SPI_OK_FINISH) + elog(ERROR, "SPI_finish failed"); + + if (buf.len == 0) + return NULL; + + return buf.data; +} + +/* ---------- + * get_triggerdef - Get the definition of a trigger + * ---------- + */ +Datum +pg_get_triggerdef(PG_FUNCTION_ARGS) +{ + Oid trigid = PG_GETARG_OID(0); + char *res; + + res = pg_get_triggerdef_worker(trigid, false); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_triggerdef_ext(PG_FUNCTION_ARGS) +{ + Oid trigid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + char *res; + + res = pg_get_triggerdef_worker(trigid, pretty); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +static char * +pg_get_triggerdef_worker(Oid trigid, bool pretty) +{// #lizard forgives + HeapTuple ht_trig; + Form_pg_trigger trigrec; + StringInfoData buf; + Relation tgrel; + ScanKeyData skey[1]; + SysScanDesc tgscan; + int findx = 0; + char *tgname; + char *tgoldtable; + char *tgnewtable; + Oid argtypes[1]; /* dummy */ + Datum value; + bool isnull; + + /* + * Fetch the pg_trigger tuple by the Oid of the trigger + */ + tgrel = heap_open(TriggerRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(trigid)); + + tgscan = systable_beginscan(tgrel, TriggerOidIndexId, true, + NULL, 1, skey); + + ht_trig = systable_getnext(tgscan); + + if (!HeapTupleIsValid(ht_trig)) + { + systable_endscan(tgscan); + heap_close(tgrel, AccessShareLock); + return NULL; + } + + trigrec = (Form_pg_trigger) GETSTRUCT(ht_trig); + + /* + * Start the trigger definition. Note that the trigger's name should never + * be schema-qualified, but the trigger rel's name may be. + */ + initStringInfo(&buf); + + tgname = NameStr(trigrec->tgname); + appendStringInfo(&buf, "CREATE %sTRIGGER %s ", + OidIsValid(trigrec->tgconstraint) ? 
"CONSTRAINT " : "", + quote_identifier(tgname)); + + if (TRIGGER_FOR_BEFORE(trigrec->tgtype)) + appendStringInfoString(&buf, "BEFORE"); + else if (TRIGGER_FOR_AFTER(trigrec->tgtype)) + appendStringInfoString(&buf, "AFTER"); + else if (TRIGGER_FOR_INSTEAD(trigrec->tgtype)) + appendStringInfoString(&buf, "INSTEAD OF"); + else + elog(ERROR, "unexpected tgtype value: %d", trigrec->tgtype); + + if (TRIGGER_FOR_INSERT(trigrec->tgtype)) + { + appendStringInfoString(&buf, " INSERT"); + findx++; + } + if (TRIGGER_FOR_DELETE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR DELETE"); + else + appendStringInfoString(&buf, " DELETE"); + findx++; + } + if (TRIGGER_FOR_UPDATE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR UPDATE"); + else + appendStringInfoString(&buf, " UPDATE"); + findx++; + /* tgattr is first var-width field, so OK to access directly */ + if (trigrec->tgattr.dim1 > 0) + { + int i; + + appendStringInfoString(&buf, " OF "); + for (i = 0; i < trigrec->tgattr.dim1; i++) + { + char *attname; + + if (i > 0) + appendStringInfoString(&buf, ", "); + attname = get_relid_attribute_name(trigrec->tgrelid, + trigrec->tgattr.values[i]); + appendStringInfoString(&buf, quote_identifier(attname)); + } + } + } + if (TRIGGER_FOR_TRUNCATE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR TRUNCATE"); + else + appendStringInfoString(&buf, " TRUNCATE"); + findx++; + } + appendStringInfo(&buf, " ON %s ", + generate_relation_name(trigrec->tgrelid, NIL)); + + if (OidIsValid(trigrec->tgconstraint)) + { + if (OidIsValid(trigrec->tgconstrrelid)) + appendStringInfo(&buf, "FROM %s ", + generate_relation_name(trigrec->tgconstrrelid, NIL)); + if (!trigrec->tgdeferrable) + appendStringInfoString(&buf, "NOT "); + appendStringInfoString(&buf, "DEFERRABLE INITIALLY "); + if (trigrec->tginitdeferred) + appendStringInfoString(&buf, "DEFERRED "); + else + appendStringInfoString(&buf, "IMMEDIATE "); + } + + value = fastgetattr(ht_trig, Anum_pg_trigger_tgoldtable, + tgrel->rd_att, &isnull); + if (!isnull) + tgoldtable = NameStr(*((NameData *) DatumGetPointer(value))); + else + tgoldtable = NULL; + value = fastgetattr(ht_trig, Anum_pg_trigger_tgnewtable, + tgrel->rd_att, &isnull); + if (!isnull) + tgnewtable = NameStr(*((NameData *) DatumGetPointer(value))); + else + tgnewtable = NULL; + if (tgoldtable != NULL || tgnewtable != NULL) + { + appendStringInfoString(&buf, "REFERENCING "); + if (tgoldtable != NULL) + appendStringInfo(&buf, "OLD TABLE AS %s ", tgoldtable); + if (tgnewtable != NULL) + appendStringInfo(&buf, "NEW TABLE AS %s ", tgnewtable); + } + + if (TRIGGER_FOR_ROW(trigrec->tgtype)) + appendStringInfoString(&buf, "FOR EACH ROW "); + else + appendStringInfoString(&buf, "FOR EACH STATEMENT "); + + /* If the trigger has a WHEN qualification, add that */ + value = fastgetattr(ht_trig, Anum_pg_trigger_tgqual, + tgrel->rd_att, &isnull); + if (!isnull) + { + Node *qual; + char relkind; + deparse_context context; + deparse_namespace dpns; + RangeTblEntry *oldrte; + RangeTblEntry *newrte; + + appendStringInfoString(&buf, "WHEN ("); + + qual = stringToNode(TextDatumGetCString(value)); + + relkind = get_rel_relkind(trigrec->tgrelid); + + /* Build minimal OLD and NEW RTEs for the rel */ + oldrte = makeNode(RangeTblEntry); + oldrte->rtekind = RTE_RELATION; + oldrte->relid = trigrec->tgrelid; + oldrte->relkind = relkind; + oldrte->alias = makeAlias("old", NIL); + oldrte->eref = oldrte->alias; + oldrte->lateral = false; + oldrte->inh = false; + 
oldrte->inFromCl = true; + + newrte = makeNode(RangeTblEntry); + newrte->rtekind = RTE_RELATION; + newrte->relid = trigrec->tgrelid; + newrte->relkind = relkind; + newrte->alias = makeAlias("new", NIL); + newrte->eref = newrte->alias; + newrte->lateral = false; + newrte->inh = false; + newrte->inFromCl = true; + + /* Build two-element rtable */ + memset(&dpns, 0, sizeof(dpns)); + dpns.rtable = list_make2(oldrte, newrte); + dpns.ctes = NIL; + set_rtable_names(&dpns, NIL, NULL); + set_simple_column_names(&dpns); + + /* Set up context with one-deep namespace stack */ + context.buf = &buf; + context.namespaces = list_make1(&dpns); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = true; + context.prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = PRETTYINDENT_STD; + context.special_exprkind = EXPR_KIND_NONE; + + get_rule_expr(qual, &context, false); + + appendStringInfoString(&buf, ") "); + } + + appendStringInfo(&buf, "EXECUTE PROCEDURE %s(", + generate_function_name(trigrec->tgfoid, 0, + NIL, argtypes, + false, NULL, EXPR_KIND_NONE)); + + if (trigrec->tgnargs > 0) + { + char *p; + int i; + + value = fastgetattr(ht_trig, Anum_pg_trigger_tgargs, + tgrel->rd_att, &isnull); + if (isnull) + elog(ERROR, "tgargs is null for trigger %u", trigid); + p = (char *) VARDATA_ANY(DatumGetByteaPP(value)); + for (i = 0; i < trigrec->tgnargs; i++) + { + if (i > 0) + appendStringInfoString(&buf, ", "); + simple_quote_literal(&buf, p); + /* advance p to next string embedded in tgargs */ + while (*p) + p++; + p++; + } + } + + /* We deliberately do not put semi-colon at end */ + appendStringInfoChar(&buf, ')'); + + /* Clean up */ + systable_endscan(tgscan); + + heap_close(tgrel, AccessShareLock); + + return buf.data; +} + +/* ---------- + * get_indexdef - Get the definition of an index + * + * In the extended version, there is a colno argument as well as pretty bool. + * if colno == 0, we want a complete index definition. + * if colno > 0, we only want the Nth index key's variable or expression. + * + * Note that the SQL-function versions of this omit any info about the + * index tablespace; this is intentional because pg_dump wants it that way. + * However pg_get_indexdef_string() includes the index tablespace. + * ---------- + */ +Datum +pg_get_indexdef(PG_FUNCTION_ARGS) +{ + Oid indexrelid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, + prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_indexdef_ext(PG_FUNCTION_ARGS) +{ + Oid indexrelid = PG_GETARG_OID(0); + int32 colno = PG_GETARG_INT32(1); + bool pretty = PG_GETARG_BOOL(2); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, + prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal version for use by ALTER TABLE. + * Includes a tablespace clause in the result. + * Returns a palloc'd C string; no pretty-printing. 
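pg_get_triggerdef_worker above walks tgargs as a packed sequence of NUL-terminated strings: print the current one, then step past its terminator to reach the next. The same walk in a standalone form; the byte buffer below is a fabricated stand-in for the tgargs bytea payload:

#include <stdio.h>

int
main(void)
{
    /* three trigger arguments packed back-to-back, each NUL-terminated */
    const char  args[] = "10\0name\0t\0";
    int         nargs = 3;
    const char *p = args;
    int         i;

    for (i = 0; i < nargs; i++)
    {
        if (i > 0)
            printf(", ");
        printf("'%s'", p);

        /* advance p to the next string embedded in the buffer */
        while (*p)
            p++;
        p++;
    }
    printf("\n");       /* prints: '10', 'name', 't' */
    return 0;
}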
+ */ +char * +pg_get_indexdef_string(Oid indexrelid) +{ + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); +} + +/* Internal version that just reports the column definitions */ +char * +pg_get_indexdef_columns(Oid indexrelid, bool pretty) +{ + int prettyFlags; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, + prettyFlags, false); +} + +/* + * Internal workhorse to decompile an index definition. + * + * This is now used for exclusion constraints as well: if excludeOps is not + * NULL then it points to an array of exclusion operator OIDs. + */ +static char * +pg_get_indexdef_worker(Oid indexrelid, int colno, + const Oid *excludeOps, + bool attrsOnly, bool showTblSpc, + int prettyFlags, bool missing_ok) +{// #lizard forgives + /* might want a separate isConstraint parameter later */ + bool isConstraint = (excludeOps != NULL); + HeapTuple ht_idx; + HeapTuple ht_idxrel; + HeapTuple ht_am; + Form_pg_index idxrec; + Form_pg_class idxrelrec; + Form_pg_am amrec; + IndexAmRoutine *amroutine; + List *indexprs; + ListCell *indexpr_item; + List *context; + Oid indrelid; + int keyno; + Datum indcollDatum; + Datum indclassDatum; + Datum indoptionDatum; + bool isnull; + oidvector *indcollation; + oidvector *indclass; + int2vector *indoption; + StringInfoData buf; + char *str; + char *sep; +#ifdef __TBASE__ + bool is_interval_child = false; + HeapTuple ht_parent_idx; +#endif + /* + * Fetch the pg_index tuple by the Oid of the index + */ + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexrelid)); + if (!HeapTupleIsValid(ht_idx)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for index %u", indexrelid); + } + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + + indrelid = idxrec->indrelid; + Assert(indexrelid == idxrec->indexrelid); + + /* Must get indcollation, indclass, and indoption the hard way */ + indcollDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indcollation, &isnull); + Assert(!isnull); + indcollation = (oidvector *) DatumGetPointer(indcollDatum); + + indclassDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indclass, &isnull); + Assert(!isnull); + indclass = (oidvector *) DatumGetPointer(indclassDatum); + + indoptionDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indoption, &isnull); + Assert(!isnull); + indoption = (int2vector *) DatumGetPointer(indoptionDatum); + + /* + * Fetch the pg_class tuple of the index relation + */ + ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexrelid)); + if (!HeapTupleIsValid(ht_idxrel)) + elog(ERROR, "cache lookup failed for relation %u", indexrelid); + idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); + + /* + * Fetch the pg_am tuple of the index' access method + */ + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(idxrelrec->relam)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + idxrelrec->relam); + amrec = (Form_pg_am) GETSTRUCT(ht_am); + + /* Fetch the index AM's API struct */ + amroutine = GetIndexAmRoutine(amrec->amhandler); + + /* + * Get the index expressions, if any. (NOTE: we do not use the relcache + * versions of the expressions and predicate, because we want to display + * non-const-folded expressions.) 
+ */ + if (!heap_attisnull(ht_idx, Anum_pg_index_indexprs, NULL)) + { + Datum exprsDatum; + bool isnull; + char *exprsString; + + exprsDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indexprs, &isnull); + Assert(!isnull); + exprsString = TextDatumGetCString(exprsDatum); + indexprs = (List *) stringToNode(exprsString); + pfree(exprsString); + } + else + indexprs = NIL; + + indexpr_item = list_head(indexprs); + + context = deparse_context_for(get_relation_name(indrelid), indrelid); + + /* + * Start the index definition. Note that the index's name should never be + * schema-qualified, but the indexed rel's name may be. + */ + initStringInfo(&buf); + + if (!attrsOnly) + { + if (!isConstraint) + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", + idxrec->indisunique ? "UNIQUE " : "", + quote_identifier(NameStr(idxrelrec->relname)), + generate_relation_name(indrelid, NIL), + quote_identifier(NameStr(amrec->amname))); + else /* currently, must be EXCLUDE constraint */ + appendStringInfo(&buf, "EXCLUDE USING %s (", + quote_identifier(NameStr(amrec->amname))); + } + + /* + * Report the indexed attributes + */ +#ifdef __TBASE__ + { + Relation rel = relation_open(indrelid, NoLock); + if (rel->rd_rel->relkind == RELKIND_RELATION && RELATION_IS_CHILD(rel)) + { + Oid parentIndexId = get_interval_parent_relid(indexrelid); + Oid parentId = get_interval_parent_relid(indrelid); + if (!OidIsValid(parentId)) + { + elog(ERROR, "could not get interval parent for relation %u", + indrelid); + } + indrelid = parentId; + + if (OidIsValid(parentIndexId)) + { + ht_parent_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(parentIndexId)); + if (!HeapTupleIsValid(ht_parent_idx)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for index %u", parentIndexId); + } + idxrec = (Form_pg_index) GETSTRUCT(ht_parent_idx); + is_interval_child = true; + } + } + heap_close(rel, NoLock); + } +#endif + sep = ""; + for (keyno = 0; keyno < idxrec->indnatts; keyno++) + { + AttrNumber attnum = idxrec->indkey.values[keyno]; + int16 opt = indoption->values[keyno]; + Oid keycoltype; + Oid keycolcollation; + + if (!colno) + appendStringInfoString(&buf, sep); + sep = ", "; + + if (attnum != 0) + { + /* Simple index column */ + char *attname; + int32 keycoltypmod; + + attname = get_relid_attribute_name(indrelid, attnum); + if (!colno || colno == keyno + 1) + appendStringInfoString(&buf, quote_identifier(attname)); + get_atttypetypmodcoll(indrelid, attnum, + &keycoltype, &keycoltypmod, + &keycolcollation); + } + else + { + /* expressional index */ + Node *indexkey; + + if (indexpr_item == NULL) + elog(ERROR, "too few entries in indexprs list"); + indexkey = (Node *) lfirst(indexpr_item); + indexpr_item = lnext(indexpr_item); + /* Deparse */ + str = deparse_expression_pretty(indexkey, context, false, false, + prettyFlags, 0); + if (!colno || colno == keyno + 1) + { + /* Need parens if it's not a bare function call */ + if (looks_like_function(indexkey)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + } + keycoltype = exprType(indexkey); + keycolcollation = exprCollation(indexkey); + } + + if (!attrsOnly && (!colno || colno == keyno + 1)) + { + Oid indcoll; + + /* Add collation, if not default for column */ + indcoll = indcollation->values[keyno]; + if (OidIsValid(indcoll) && indcoll != keycolcollation) + appendStringInfo(&buf, " COLLATE %s", + generate_collation_name((indcoll))); + + /* Add the operator class name, if not default */ + 
get_opclass_name(indclass->values[keyno], keycoltype, &buf); + + /* Add options if relevant */ + if (amroutine->amcanorder) + { + /* if it supports sort ordering, report DESC and NULLS opts */ + if (opt & INDOPTION_DESC) + { + appendStringInfoString(&buf, " DESC"); + /* NULLS FIRST is the default in this case */ + if (!(opt & INDOPTION_NULLS_FIRST)) + appendStringInfoString(&buf, " NULLS LAST"); + } + else + { + if (opt & INDOPTION_NULLS_FIRST) + appendStringInfoString(&buf, " NULLS FIRST"); + } + } + + /* Add the exclusion operator if relevant */ + if (excludeOps != NULL) + appendStringInfo(&buf, " WITH %s", + generate_operator_name(excludeOps[keyno], + keycoltype, + keycoltype)); + } + } + + if (!attrsOnly) + { + appendStringInfoChar(&buf, ')'); + + /* + * If it has options, append "WITH (options)" + */ + str = flatten_reloptions(indexrelid); + if (str) + { + appendStringInfo(&buf, " WITH (%s)", str); + pfree(str); + } + + /* + * Print tablespace, but only if requested + */ + if (showTblSpc) + { + Oid tblspc; + + tblspc = get_rel_tablespace(indexrelid); + if (!OidIsValid(tblspc)) + tblspc = MyDatabaseTableSpace; + if (isConstraint) + appendStringInfoString(&buf, " USING INDEX"); + appendStringInfo(&buf, " TABLESPACE %s", + quote_identifier(get_tablespace_name(tblspc))); + } + + /* + * If it's a partial index, decompile and append the predicate + */ + if (!heap_attisnull(ht_idx, Anum_pg_index_indpred, NULL)) + { + Node *node; + Datum predDatum; + bool isnull; + char *predString; + + /* Convert text string to node tree */ + predDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indpred, &isnull); + Assert(!isnull); + predString = TextDatumGetCString(predDatum); + node = (Node *) stringToNode(predString); + pfree(predString); + + /* Deparse */ + str = deparse_expression_pretty(node, context, false, false, + prettyFlags, 0); + if (isConstraint) + appendStringInfo(&buf, " WHERE (%s)", str); + else + appendStringInfo(&buf, " WHERE %s", str); + } + } + + /* Clean up */ + ReleaseSysCache(ht_idx); + ReleaseSysCache(ht_idxrel); + ReleaseSysCache(ht_am); +#ifdef __TBASE__ + if (is_interval_child) + { + ReleaseSysCache(ht_parent_idx); + } +#endif + return buf.data; +} + +/* + * pg_get_statisticsobjdef + * Get the definition of an extended statistics object + */ +Datum +pg_get_statisticsobjdef(PG_FUNCTION_ARGS) +{ + Oid statextid = PG_GETARG_OID(0); + char *res; + + res = pg_get_statisticsobj_worker(statextid, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal workhorse to decompile an extended statistics object. + */ +static char * +pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) +{// #lizard forgives + Form_pg_statistic_ext statextrec; + HeapTuple statexttup; + StringInfoData buf; + int colno; + char *nsp; + ArrayType *arr; + char *enabled; + Datum datum; + bool isnull; + bool ndistinct_enabled; + bool dependencies_enabled; + int i; + + statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); + + if (!HeapTupleIsValid(statexttup)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for statistics object %u", statextid); + } + + statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); + + initStringInfo(&buf); + + nsp = get_namespace_name(statextrec->stxnamespace); + appendStringInfo(&buf, "CREATE STATISTICS %s", + quote_qualified_identifier(nsp, + NameStr(statextrec->stxname))); + + /* + * Decode the stxkind column so that we know which stats types to print. 
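The stxkind decode that follows only emits a parenthesized types clause when at least one statistics kind is disabled, so fully-default statistics objects dump as plain CREATE STATISTICS. A standalone reduction of that decision; the kind letters are defined locally here as stand-ins for the STATS_EXT_* constants:

#include <stdio.h>
#include <string.h>

/* local stand-ins for the kind letters stored in stxkind */
#define KIND_NDISTINCT    'd'
#define KIND_DEPENDENCIES 'f'

/* build the optional "(kinds)" clause the same way the worker below does */
static void
build_types_clause(const char *kinds, int nkinds, char *out)
{
    int i;
    int ndistinct_enabled = 0;
    int dependencies_enabled = 0;

    for (i = 0; i < nkinds; i++)
    {
        if (kinds[i] == KIND_NDISTINCT)
            ndistinct_enabled = 1;
        if (kinds[i] == KIND_DEPENDENCIES)
            dependencies_enabled = 1;
    }

    out[0] = '\0';

    /* omit the clause entirely when every kind is enabled */
    if (!ndistinct_enabled || !dependencies_enabled)
    {
        strcat(out, " (");
        if (ndistinct_enabled)
            strcat(out, "ndistinct");
        else if (dependencies_enabled)
            strcat(out, "dependencies");
        strcat(out, ")");
    }
}

int
main(void)
{
    char clause[32];

    build_types_clause("df", 2, clause);
    printf("all kinds enabled: CREATE STATISTICS s1%s ON a, b FROM t\n", clause);

    build_types_clause("d", 1, clause);
    printf("ndistinct only:    CREATE STATISTICS s2%s ON a, b FROM t\n", clause);
    return 0;
}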
+ */ + datum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxkind, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "stxkind is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); + + ndistinct_enabled = false; + dependencies_enabled = false; + + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + if (enabled[i] == STATS_EXT_NDISTINCT) + ndistinct_enabled = true; + if (enabled[i] == STATS_EXT_DEPENDENCIES) + dependencies_enabled = true; + } + + /* + * If any option is disabled, then we'll need to append the types clause + * to show which options are enabled. We omit the types clause on purpose + * when all options are enabled, so a pg_dump/pg_restore will create all + * statistics types on a newer postgres version, if the statistics had all + * options enabled on the original version. + */ + if (!ndistinct_enabled || !dependencies_enabled) + { + appendStringInfoString(&buf, " ("); + if (ndistinct_enabled) + appendStringInfoString(&buf, "ndistinct"); + else if (dependencies_enabled) + appendStringInfoString(&buf, "dependencies"); + appendStringInfoChar(&buf, ')'); + } + + appendStringInfoString(&buf, " ON "); + + for (colno = 0; colno < statextrec->stxkeys.dim1; colno++) + { + AttrNumber attnum = statextrec->stxkeys.values[colno]; + char *attname; + + if (colno > 0) + appendStringInfoString(&buf, ", "); + + attname = get_relid_attribute_name(statextrec->stxrelid, attnum); + + appendStringInfoString(&buf, quote_identifier(attname)); + } + + appendStringInfo(&buf, " FROM %s", + generate_relation_name(statextrec->stxrelid, NIL)); + + ReleaseSysCache(statexttup); + + return buf.data; +} + +/* + * pg_get_partkeydef + * + * Returns the partition key specification, ie, the following: + * + * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...]) + */ +Datum +pg_get_partkeydef(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + char *res; + + res = pg_get_partkeydef_worker(relid, PRETTYFLAG_INDENT, false, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* Internal version that just reports the column definitions */ +char * +pg_get_partkeydef_columns(Oid relid, bool pretty) +{ + int prettyFlags; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + return pg_get_partkeydef_worker(relid, prettyFlags, true, false); +} + +/* + * Internal workhorse to decompile a partition key definition. 
+ */ +static char * +pg_get_partkeydef_worker(Oid relid, int prettyFlags, + bool attrsOnly, bool missing_ok) +{// #lizard forgives + Form_pg_partitioned_table form; + HeapTuple tuple; + oidvector *partclass; + oidvector *partcollation; + List *partexprs; + ListCell *partexpr_item; + List *context; + Datum datum; + bool isnull; + StringInfoData buf; + int keyno; + char *str; + char *sep; + + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for partition key of %u", relid); + } + + form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + + Assert(form->partrelid == relid); + + /* Must get partclass and partcollation the hard way */ + datum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partclass, &isnull); + Assert(!isnull); + partclass = (oidvector *) DatumGetPointer(datum); + + datum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partcollation, &isnull); + Assert(!isnull); + partcollation = (oidvector *) DatumGetPointer(datum); + + + /* + * Get the expressions, if any. (NOTE: we do not use the relcache + * versions of the expressions, because we want to display + * non-const-folded expressions.) + */ + if (!heap_attisnull(tuple, Anum_pg_partitioned_table_partexprs, NULL)) + { + Datum exprsDatum; + bool isnull; + char *exprsString; + + exprsDatum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partexprs, &isnull); + Assert(!isnull); + exprsString = TextDatumGetCString(exprsDatum); + partexprs = (List *) stringToNode(exprsString); + + if (!IsA(partexprs, List)) + elog(ERROR, "unexpected node type found in partexprs: %d", + (int) nodeTag(partexprs)); + + pfree(exprsString); + } + else + partexprs = NIL; + + partexpr_item = list_head(partexprs); + context = deparse_context_for(get_relation_name(relid), relid); + + initStringInfo(&buf); + + switch (form->partstrat) + { + case PARTITION_STRATEGY_LIST: + if (!attrsOnly) + appendStringInfo(&buf, "LIST"); + break; + case PARTITION_STRATEGY_RANGE: + if (!attrsOnly) + appendStringInfo(&buf, "RANGE"); + break; + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) form->partstrat); + } + + if (!attrsOnly) + appendStringInfo(&buf, " ("); + sep = ""; + for (keyno = 0; keyno < form->partnatts; keyno++) + { + AttrNumber attnum = form->partattrs.values[keyno]; + Oid keycoltype; + Oid keycolcollation; + Oid partcoll; + + appendStringInfoString(&buf, sep); + sep = ", "; + if (attnum != 0) + { + /* Simple attribute reference */ + char *attname; + int32 keycoltypmod; + + attname = get_relid_attribute_name(relid, attnum); + appendStringInfoString(&buf, quote_identifier(attname)); + get_atttypetypmodcoll(relid, attnum, + &keycoltype, &keycoltypmod, + &keycolcollation); + } + else + { + /* Expression */ + Node *partkey; + + if (partexpr_item == NULL) + elog(ERROR, "too few entries in partexprs list"); + partkey = (Node *) lfirst(partexpr_item); + partexpr_item = lnext(partexpr_item); + + /* Deparse */ + str = deparse_expression_pretty(partkey, context, false, false, + prettyFlags, 0); + /* Need parens if it's not a bare function call */ + if (looks_like_function(partkey)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + + keycoltype = exprType(partkey); + keycolcollation = exprCollation(partkey); + } + + /* Add collation, if not default for column */ + partcoll = partcollation->values[keyno]; + if (!attrsOnly && OidIsValid(partcoll) && partcoll != 
keycolcollation) + appendStringInfo(&buf, " COLLATE %s", + generate_collation_name((partcoll))); + + /* Add the operator class name, if not default */ + if (!attrsOnly) + get_opclass_name(partclass->values[keyno], keycoltype, &buf); + } + + if (!attrsOnly) + appendStringInfoChar(&buf, ')'); + + /* Clean up */ + ReleaseSysCache(tuple); + + return buf.data; +} + +/* + * pg_get_partition_constraintdef + * + * Returns partition constraint expression as a string for the input relation + */ +Datum +pg_get_partition_constraintdef(PG_FUNCTION_ARGS) +{ + Oid relationId = PG_GETARG_OID(0); + Expr *constr_expr; + int prettyFlags; + List *context; + char *consrc; + + constr_expr = get_partition_qual_relid(relationId); + + /* Quick exit if not a partition */ + if (constr_expr == NULL) + PG_RETURN_NULL(); + + /* + * Deparse and return the constraint expression. + */ + prettyFlags = PRETTYFLAG_INDENT; + context = deparse_context_for(get_relation_name(relationId), relationId); + consrc = deparse_expression_pretty((Node *) constr_expr, context, false, + false, prettyFlags, 0); + + PG_RETURN_TEXT_P(string_to_text(consrc)); +} + +/* + * pg_get_constraintdef + * + * Returns the definition for the constraint, ie, everything that needs to + * appear after "ALTER TABLE ... ADD CONSTRAINT ". + */ +Datum +pg_get_constraintdef(PG_FUNCTION_ARGS) +{ + Oid constraintId = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_constraintdef_ext(PG_FUNCTION_ARGS) +{ + Oid constraintId = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal version that returns a full ALTER TABLE ... ADD CONSTRAINT command + */ +char * +pg_get_constraintdef_command(Oid constraintId) +{ + return pg_get_constraintdef_worker(constraintId, true, 0, false); +} + +/* + * As of 9.4, we now use an MVCC snapshot for this. + */ +static char * +pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, + int prettyFlags, bool missing_ok) +{// #lizard forgives + HeapTuple tup; + Form_pg_constraint conForm; + StringInfoData buf; + SysScanDesc scandesc; + ScanKeyData scankey[1]; + Snapshot snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Relation relation = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&scankey[0], + ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(constraintId)); + + scandesc = systable_beginscan(relation, + ConstraintOidIndexId, + true, + snapshot, + 1, + scankey); + + /* + * We later use the tuple with SysCacheGetAttr() as if we had obtained it + * via SearchSysCache, which works fine. 
+ */ + tup = systable_getnext(scandesc); + + UnregisterSnapshot(snapshot); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + { + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + return NULL; + } + elog(ERROR, "could not find tuple for constraint %u", constraintId); + } + + conForm = (Form_pg_constraint) GETSTRUCT(tup); + + initStringInfo(&buf); + + if (fullCommand) + { + /* + * Currently, callers want ALTER TABLE (without ONLY) for CHECK + * constraints, and other types of constraints don't inherit anyway so + * it doesn't matter whether we say ONLY or not. Someday we might + * need to let callers specify whether to put ONLY in the command. + */ + appendStringInfo(&buf, "ALTER TABLE %s ADD CONSTRAINT %s ", + generate_qualified_relation_name(conForm->conrelid), + quote_identifier(NameStr(conForm->conname))); + } + + switch (conForm->contype) + { + case CONSTRAINT_FOREIGN: + { + Datum val; + bool isnull; + const char *string; + + /* Start off the constraint definition */ + appendStringInfoString(&buf, "FOREIGN KEY ("); + + /* Fetch and build referencing-column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conkey, &isnull); + if (isnull) + elog(ERROR, "null conkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->conrelid, &buf); + + /* add foreign relation name */ + appendStringInfo(&buf, ") REFERENCES %s(", + generate_relation_name(conForm->confrelid, + NIL)); + + /* Fetch and build referenced-column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_confkey, &isnull); + if (isnull) + elog(ERROR, "null confkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->confrelid, &buf); + + appendStringInfoChar(&buf, ')'); + + /* Add match type */ + switch (conForm->confmatchtype) + { + case FKCONSTR_MATCH_FULL: + string = " MATCH FULL"; + break; + case FKCONSTR_MATCH_PARTIAL: + string = " MATCH PARTIAL"; + break; + case FKCONSTR_MATCH_SIMPLE: + string = ""; + break; + default: + elog(ERROR, "unrecognized confmatchtype: %d", + conForm->confmatchtype); + string = ""; /* keep compiler quiet */ + break; + } + appendStringInfoString(&buf, string); + + /* Add ON UPDATE and ON DELETE clauses, if needed */ + switch (conForm->confupdtype) + { + case FKCONSTR_ACTION_NOACTION: + string = NULL; /* suppress default */ + break; + case FKCONSTR_ACTION_RESTRICT: + string = "RESTRICT"; + break; + case FKCONSTR_ACTION_CASCADE: + string = "CASCADE"; + break; + case FKCONSTR_ACTION_SETNULL: + string = "SET NULL"; + break; + case FKCONSTR_ACTION_SETDEFAULT: + string = "SET DEFAULT"; + break; + default: + elog(ERROR, "unrecognized confupdtype: %d", + conForm->confupdtype); + string = NULL; /* keep compiler quiet */ + break; + } + if (string) + appendStringInfo(&buf, " ON UPDATE %s", string); + + switch (conForm->confdeltype) + { + case FKCONSTR_ACTION_NOACTION: + string = NULL; /* suppress default */ + break; + case FKCONSTR_ACTION_RESTRICT: + string = "RESTRICT"; + break; + case FKCONSTR_ACTION_CASCADE: + string = "CASCADE"; + break; + case FKCONSTR_ACTION_SETNULL: + string = "SET NULL"; + break; + case FKCONSTR_ACTION_SETDEFAULT: + string = "SET DEFAULT"; + break; + default: + elog(ERROR, "unrecognized confdeltype: %d", + conForm->confdeltype); + string = NULL; /* keep compiler quiet */ + break; + } + if (string) + appendStringInfo(&buf, " ON DELETE %s", string); + + break; + } + case CONSTRAINT_PRIMARY: + case CONSTRAINT_UNIQUE: + { + Datum val; + bool isnull; + Oid indexId; + + /* 
Start off the constraint definition */ + if (conForm->contype == CONSTRAINT_PRIMARY) + appendStringInfoString(&buf, "PRIMARY KEY ("); + else + appendStringInfoString(&buf, "UNIQUE ("); + + /* Fetch and build target column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conkey, &isnull); + if (isnull) + elog(ERROR, "null conkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->conrelid, &buf); + + appendStringInfoChar(&buf, ')'); + + indexId = get_constraint_index(constraintId); + + /* XXX why do we only print these bits if fullCommand? */ + if (fullCommand && OidIsValid(indexId)) + { + char *options = flatten_reloptions(indexId); + Oid tblspc; + + if (options) + { + appendStringInfo(&buf, " WITH (%s)", options); + pfree(options); + } + + tblspc = get_rel_tablespace(indexId); + if (OidIsValid(tblspc)) + appendStringInfo(&buf, " USING INDEX TABLESPACE %s", + quote_identifier(get_tablespace_name(tblspc))); + } + + break; + } + case CONSTRAINT_CHECK: + { + Datum val; + bool isnull; + char *conbin; + char *consrc; + Node *expr; + List *context; + + /* Fetch constraint expression in parsetree form */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conbin, &isnull); + if (isnull) + elog(ERROR, "null conbin for constraint %u", + constraintId); + + conbin = TextDatumGetCString(val); + expr = stringToNode(conbin); + + /* Set up deparsing context for Var nodes in constraint */ + if (conForm->conrelid != InvalidOid) + { + /* relation constraint */ + context = deparse_context_for(get_relation_name(conForm->conrelid), + conForm->conrelid); + } + else + { + /* domain constraint --- can't have Vars */ + context = NIL; + } + + consrc = deparse_expression_pretty(expr, context, false, false, + prettyFlags, 0); + + /* + * Now emit the constraint definition, adding NO INHERIT if + * necessary. + * + * There are cases where the constraint expression will be + * fully parenthesized and we don't need the outer parens ... + * but there are other cases where we do need 'em. Be + * conservative for now. + * + * Note that simply checking for leading '(' and trailing ')' + * would NOT be good enough, consider "(x > 0) AND (y > 0)". + */ + appendStringInfo(&buf, "CHECK (%s)%s", + consrc, + conForm->connoinherit ? " NO INHERIT" : ""); + break; + } + case CONSTRAINT_TRIGGER: + + /* + * There isn't an ALTER TABLE syntax for creating a user-defined + * constraint trigger, but it seems better to print something than + * throw an error; if we throw error then this function couldn't + * safely be applied to all rows of pg_constraint. 
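The ON UPDATE / ON DELETE handling earlier in pg_get_constraintdef_worker maps each single-character action code to a clause and suppresses the clause entirely for NO ACTION, since that is the default. The same mapping as a standalone helper; the action letters are defined locally as stand-ins for the FKCONSTR_ACTION_* codes:

#include <stdio.h>

/* local stand-ins for the action codes stored in pg_constraint */
#define ACTION_NOACTION   'a'
#define ACTION_RESTRICT   'r'
#define ACTION_CASCADE    'c'
#define ACTION_SETNULL    'n'
#define ACTION_SETDEFAULT 'd'

/* returns NULL for NO ACTION so the caller can omit the clause (the default) */
static const char *
fk_action_clause(char code)
{
    switch (code)
    {
        case ACTION_NOACTION:
            return NULL;
        case ACTION_RESTRICT:
            return "RESTRICT";
        case ACTION_CASCADE:
            return "CASCADE";
        case ACTION_SETNULL:
            return "SET NULL";
        case ACTION_SETDEFAULT:
            return "SET DEFAULT";
        default:
            return NULL;
    }
}

int
main(void)
{
    const char *upd = fk_action_clause(ACTION_CASCADE);
    const char *del = fk_action_clause(ACTION_NOACTION);

    printf("FOREIGN KEY (f) REFERENCES t(k)");
    if (upd)
        printf(" ON UPDATE %s", upd);
    if (del)
        printf(" ON DELETE %s", del);
    printf("\n");       /* ON DELETE clause suppressed for NO ACTION */
    return 0;
}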
+ */ + appendStringInfoString(&buf, "TRIGGER"); + break; + case CONSTRAINT_EXCLUSION: + { + Oid indexOid = conForm->conindid; + Datum val; + bool isnull; + Datum *elems; + int nElems; + int i; + Oid *operators; + + /* Extract operator OIDs from the pg_constraint tuple */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conexclop, + &isnull); + if (isnull) + elog(ERROR, "null conexclop for constraint %u", + constraintId); + + deconstruct_array(DatumGetArrayTypeP(val), + OIDOID, sizeof(Oid), true, 'i', + &elems, NULL, &nElems); + + operators = (Oid *) palloc(nElems * sizeof(Oid)); + for (i = 0; i < nElems; i++) + operators[i] = DatumGetObjectId(elems[i]); + + /* pg_get_indexdef_worker does the rest */ + /* suppress tablespace because pg_dump wants it that way */ + appendStringInfoString(&buf, + pg_get_indexdef_worker(indexOid, + 0, + operators, + false, + false, + prettyFlags, + false)); + break; + } + default: + elog(ERROR, "invalid constraint type \"%c\"", conForm->contype); + break; + } + + if (conForm->condeferrable) + appendStringInfoString(&buf, " DEFERRABLE"); + if (conForm->condeferred) + appendStringInfoString(&buf, " INITIALLY DEFERRED"); + if (!conForm->convalidated) + appendStringInfoString(&buf, " NOT VALID"); + + /* Cleanup */ + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + + return buf.data; +} + + +/* + * Convert an int16[] Datum into a comma-separated list of column names + * for the indicated relation; append the list to buf. + */ +static void +decompile_column_index_array(Datum column_index_array, Oid relId, + StringInfo buf) +{ + Datum *keys; + int nKeys; + int j; + + /* Extract data from array of int16 */ + deconstruct_array(DatumGetArrayTypeP(column_index_array), + INT2OID, 2, true, 's', + &keys, NULL, &nKeys); + + for (j = 0; j < nKeys; j++) + { + char *colName; + + colName = get_relid_attribute_name(relId, DatumGetInt16(keys[j])); + + if (j == 0) + appendStringInfoString(buf, quote_identifier(colName)); + else + appendStringInfo(buf, ", %s", quote_identifier(colName)); + } +} + + +/* ---------- + * get_expr - Decompile an expression tree + * + * Input: an expression tree in nodeToString form, and a relation OID + * + * Output: reverse-listed expression + * + * Currently, the expression can only refer to a single relation, namely + * the one specified by the second parameter. This is sufficient for + * partial indexes, column default expressions, etc. We also support + * Var-free expressions, for which the OID can be InvalidOid. + * ---------- + */ +Datum +pg_get_expr(PG_FUNCTION_ARGS) +{ + text *expr = PG_GETARG_TEXT_PP(0); + Oid relid = PG_GETARG_OID(1); + int prettyFlags; + char *relname; + + prettyFlags = PRETTYFLAG_INDENT; + + if (OidIsValid(relid)) + { + /* Get the name for the relation */ + relname = get_rel_name(relid); + + /* + * If the OID isn't actually valid, don't throw an error, just return + * NULL. This is a bit questionable, but it's what we've done + * historically, and it can help avoid unwanted failures when + * examining catalog entries for just-deleted relations. + */ + if (relname == NULL) + PG_RETURN_NULL(); + } + else + relname = NULL; + + PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); +} + +Datum +pg_get_expr_ext(PG_FUNCTION_ARGS) +{ + text *expr = PG_GETARG_TEXT_PP(0); + Oid relid = PG_GETARG_OID(1); + bool pretty = PG_GETARG_BOOL(2); + int prettyFlags; + char *relname; + + prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + if (OidIsValid(relid)) + { + /* Get the name for the relation */ + relname = get_rel_name(relid); + /* See notes above */ + if (relname == NULL) + PG_RETURN_NULL(); + } + else + relname = NULL; + + PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); +} + +static text * +pg_get_expr_worker(text *expr, Oid relid, const char *relname, int prettyFlags) +{ + Node *node; + List *context; + char *exprstr; + char *str; + + /* Convert input TEXT object to C string */ + exprstr = text_to_cstring(expr); + + /* Convert expression to node tree */ + node = (Node *) stringToNode(exprstr); + + pfree(exprstr); + + /* Prepare deparse context if needed */ + if (OidIsValid(relid)) + context = deparse_context_for(relname, relid); + else + context = NIL; + + /* Deparse */ + str = deparse_expression_pretty(node, context, false, false, + prettyFlags, 0); + + return string_to_text(str); +} + + +/* ---------- + * get_userbyid - Get a user name by roleid and + * fallback to 'unknown (OID=n)' + * ---------- + */ +Datum +pg_get_userbyid(PG_FUNCTION_ARGS) +{ + Oid roleid = PG_GETARG_OID(0); + Name result; + HeapTuple roletup; + Form_pg_authid role_rec; + + /* + * Allocate space for the result + */ + result = (Name) palloc(NAMEDATALEN); + memset(NameStr(*result), 0, NAMEDATALEN); + + /* + * Get the pg_authid entry and print the result + */ + roletup = SearchSysCache1(AUTHOID, ObjectIdGetDatum(roleid)); + if (HeapTupleIsValid(roletup)) + { + role_rec = (Form_pg_authid) GETSTRUCT(roletup); + StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); + ReleaseSysCache(roletup); + } + else + sprintf(NameStr(*result), "unknown (OID=%u)", roleid); + + PG_RETURN_NAME(result); +} + + +/* + * pg_get_serial_sequence + * Get the name of the sequence used by a serial column, + * formatted suitably for passing to setval, nextval or currval. + * First parameter is not treated as double-quoted, second parameter + * is --- see documentation for reason. + */ +Datum +pg_get_serial_sequence(PG_FUNCTION_ARGS) +{// #lizard forgives + text *tablename = PG_GETARG_TEXT_PP(0); + text *columnname = PG_GETARG_TEXT_PP(1); + RangeVar *tablerv; + Oid tableOid; + char *column; + AttrNumber attnum; + Oid sequenceId = InvalidOid; + Relation depRel; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tup; + + /* Look up table name. Can't lock it - we might not have privileges. 
*/ + tablerv = makeRangeVarFromNameList(textToQualifiedNameList(tablename)); + tableOid = RangeVarGetRelid(tablerv, NoLock, false); + + /* Get the number of the column */ + column = text_to_cstring(columnname); + + attnum = get_attnum(tableOid, column); + if (attnum == InvalidAttrNumber) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" of relation \"%s\" does not exist", + column, tablerv->relname))); + + /* Search the dependency table for the dependent sequence */ + depRel = heap_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(tableOid)); + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(attnum)); + + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 3, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup); + + /* + * We assume any auto dependency of a sequence on a column must be + * what we are looking for. (We need the relkind test because indexes + * can also have auto dependencies on columns.) + */ + if (deprec->classid == RelationRelationId && + deprec->objsubid == 0 && + deprec->deptype == DEPENDENCY_AUTO && + get_rel_relkind(deprec->objid) == RELKIND_SEQUENCE) + { + sequenceId = deprec->objid; + break; + } + } + + systable_endscan(scan); + heap_close(depRel, AccessShareLock); + + if (OidIsValid(sequenceId)) + { + char *result; + + result = generate_qualified_relation_name(sequenceId); + + PG_RETURN_TEXT_P(string_to_text(result)); + } + + PG_RETURN_NULL(); +} + + +/* + * pg_get_functiondef + * Returns the complete "CREATE OR REPLACE FUNCTION ..." statement for + * the specified function. + * + * Note: if you change the output format of this function, be careful not + * to break psql's rules (in \ef and \sf) for identifying the start of the + * function body. To wit: the function body starts on a line that begins + * with "AS ", and no preceding line will look like that. + */ +Datum +pg_get_functiondef(PG_FUNCTION_ARGS) +{// #lizard forgives + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + StringInfoData dq; + HeapTuple proctup; + Form_pg_proc proc; + Datum tmp; + bool isnull; + const char *prosrc; + const char *name; + const char *nsp; + float4 procost; + int oldlen; + + initStringInfo(&buf); + + /* Look up the function */ + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + proc = (Form_pg_proc) GETSTRUCT(proctup); + name = NameStr(proc->proname); + + if (proc->proisagg) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an aggregate function", name))); + + /* + * We always qualify the function name, to ensure the right function gets + * replaced. 
+ */ + nsp = get_namespace_name(proc->pronamespace); + appendStringInfo(&buf, "CREATE OR REPLACE FUNCTION %s(", + quote_qualified_identifier(nsp, name)); + (void) print_function_arguments(&buf, proctup, false, true); + appendStringInfoString(&buf, ")\n RETURNS "); + print_function_rettype(&buf, proctup); + + print_function_trftypes(&buf, proctup); + + appendStringInfo(&buf, "\n LANGUAGE %s\n", + quote_identifier(get_language_name(proc->prolang, false))); + + /* Emit some miscellaneous options on one line */ + oldlen = buf.len; + + if (proc->proiswindow) + appendStringInfoString(&buf, " WINDOW"); + switch (proc->provolatile) + { + case PROVOLATILE_IMMUTABLE: + appendStringInfoString(&buf, " IMMUTABLE"); + break; + case PROVOLATILE_STABLE: + appendStringInfoString(&buf, " STABLE"); + break; + case PROVOLATILE_VOLATILE: + break; + } + + switch (proc->proparallel) + { + case PROPARALLEL_SAFE: + appendStringInfoString(&buf, " PARALLEL SAFE"); + break; + case PROPARALLEL_RESTRICTED: + appendStringInfoString(&buf, " PARALLEL RESTRICTED"); + break; + case PROPARALLEL_UNSAFE: + break; + } + + if (proc->proisstrict) + appendStringInfoString(&buf, " STRICT"); + if (proc->prosecdef) + appendStringInfoString(&buf, " SECURITY DEFINER"); + if (proc->proleakproof) + appendStringInfoString(&buf, " LEAKPROOF"); + + /* This code for the default cost and rows should match functioncmds.c */ + if (proc->prolang == INTERNALlanguageId || + proc->prolang == ClanguageId) + procost = 1; + else + procost = 100; + if (proc->procost != procost) + appendStringInfo(&buf, " COST %g", proc->procost); + + if (proc->prorows > 0 && proc->prorows != 1000) + appendStringInfo(&buf, " ROWS %g", proc->prorows); + + if (oldlen != buf.len) + appendStringInfoChar(&buf, '\n'); + + /* Emit any proconfig options, one per line */ + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_proconfig, &isnull); + if (!isnull) + { + ArrayType *a = DatumGetArrayTypeP(tmp); + int i; + + Assert(ARR_ELEMTYPE(a) == TEXTOID); + Assert(ARR_NDIM(a) == 1); + Assert(ARR_LBOUND(a)[0] == 1); + + for (i = 1; i <= ARR_DIMS(a)[0]; i++) + { + Datum d; + + d = array_ref(a, 1, &i, + -1 /* varlenarray */ , + -1 /* TEXT's typlen */ , + false /* TEXT's typbyval */ , + 'i' /* TEXT's typalign */ , + &isnull); + if (!isnull) + { + char *configitem = TextDatumGetCString(d); + char *pos; + + pos = strchr(configitem, '='); + if (pos == NULL) + continue; + *pos++ = '\0'; + + appendStringInfo(&buf, " SET %s TO ", + quote_identifier(configitem)); + + /* + * Some GUC variable names are 'LIST' type and hence must not + * be quoted. + */ + if (pg_strcasecmp(configitem, "DateStyle") == 0 + || pg_strcasecmp(configitem, "search_path") == 0) + appendStringInfoString(&buf, pos); + else + simple_quote_literal(&buf, pos); + appendStringInfoChar(&buf, '\n'); + } + } + } + + /* And finally the function definition ... */ + appendStringInfoString(&buf, "AS "); + + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_probin, &isnull); + if (!isnull) + { + simple_quote_literal(&buf, TextDatumGetCString(tmp)); + appendStringInfoString(&buf, ", "); /* assume prosrc isn't null */ + } + + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_prosrc, &isnull); + if (isnull) + elog(ERROR, "null prosrc"); + prosrc = TextDatumGetCString(tmp); + + /* + * We always use dollar quoting. Figure out a suitable delimiter. + * + * Since the user is likely to be editing the function body string, we + * shouldn't use a short delimiter that he might easily create a conflict + * with. 
Hence prefer "$function$", but extend if needed. + */ + initStringInfo(&dq); + appendStringInfoString(&dq, "$function"); + while (strstr(prosrc, dq.data) != NULL) + appendStringInfoChar(&dq, 'x'); + appendStringInfoChar(&dq, '$'); + + appendStringInfoString(&buf, dq.data); + appendStringInfoString(&buf, prosrc); + appendStringInfoString(&buf, dq.data); + + appendStringInfoChar(&buf, '\n'); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_arguments + * Get a nicely-formatted list of arguments for a function. + * This is everything that would go between the parentheses in + * CREATE FUNCTION. + */ +Datum +pg_get_function_arguments(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + (void) print_function_arguments(&buf, proctup, false, true); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_identity_arguments + * Get a formatted list of arguments for a function. + * This is everything that would go between the parentheses in + * ALTER FUNCTION, etc. In particular, don't print defaults. + */ +Datum +pg_get_function_identity_arguments(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + (void) print_function_arguments(&buf, proctup, false, false); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_result + * Get a nicely-formatted version of the result type of a function. + * This is what would appear after RETURNS in CREATE FUNCTION. + */ +Datum +pg_get_function_result(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + print_function_rettype(&buf, proctup); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * Guts of pg_get_function_result: append the function's return type + * to the specified buffer. + */ +static void +print_function_rettype(StringInfo buf, HeapTuple proctup) +{ + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); + int ntabargs = 0; + StringInfoData rbuf; + + initStringInfo(&rbuf); + + if (proc->proretset) + { + /* It might be a table function; try to print the arguments */ + appendStringInfoString(&rbuf, "TABLE("); + ntabargs = print_function_arguments(&rbuf, proctup, true, false); + if (ntabargs > 0) + appendStringInfoChar(&rbuf, ')'); + else + resetStringInfo(&rbuf); + } + + if (ntabargs == 0) + { + /* Not a table function, so do the normal thing */ + if (proc->proretset) + appendStringInfoString(&rbuf, "SETOF "); + appendStringInfoString(&rbuf, format_type_be(proc->prorettype)); + } + + appendStringInfoString(buf, rbuf.data); +} + +/* + * Common code for pg_get_function_arguments and pg_get_function_result: + * append the desired subset of arguments to buf. We print only TABLE + * arguments when print_table_args is true, and all the others when it's false. + * We print argument defaults only if print_defaults is true. + * Function return value is the number of arguments printed. 
+ */ +static int +print_function_arguments(StringInfo buf, HeapTuple proctup, + bool print_table_args, bool print_defaults) +{// #lizard forgives + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); + int numargs; + Oid *argtypes; + char **argnames; + char *argmodes; + int insertorderbyat = -1; + int argsprinted; + int inputargno; + int nlackdefaults; + ListCell *nextargdefault = NULL; + int i; + + numargs = get_func_arg_info(proctup, + &argtypes, &argnames, &argmodes); + + nlackdefaults = numargs; + if (print_defaults && proc->pronargdefaults > 0) + { + Datum proargdefaults; + bool isnull; + + proargdefaults = SysCacheGetAttr(PROCOID, proctup, + Anum_pg_proc_proargdefaults, + &isnull); + if (!isnull) + { + char *str; + List *argdefaults; + + str = TextDatumGetCString(proargdefaults); + argdefaults = castNode(List, stringToNode(str)); + pfree(str); + nextargdefault = list_head(argdefaults); + /* nlackdefaults counts only *input* arguments lacking defaults */ + nlackdefaults = proc->pronargs - list_length(argdefaults); + } + } + + /* Check for special treatment of ordered-set aggregates */ + if (proc->proisagg) + { + HeapTuple aggtup; + Form_pg_aggregate agg; + + aggtup = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(HeapTupleGetOid(proctup))); + if (!HeapTupleIsValid(aggtup)) + elog(ERROR, "cache lookup failed for aggregate %u", + HeapTupleGetOid(proctup)); + agg = (Form_pg_aggregate) GETSTRUCT(aggtup); + if (AGGKIND_IS_ORDERED_SET(agg->aggkind)) + insertorderbyat = agg->aggnumdirectargs; + ReleaseSysCache(aggtup); + } + + argsprinted = 0; + inputargno = 0; + for (i = 0; i < numargs; i++) + { + Oid argtype = argtypes[i]; + char *argname = argnames ? argnames[i] : NULL; + char argmode = argmodes ? argmodes[i] : PROARGMODE_IN; + const char *modename; + bool isinput; + + switch (argmode) + { + case PROARGMODE_IN: + modename = ""; + isinput = true; + break; + case PROARGMODE_INOUT: + modename = "INOUT "; + isinput = true; + break; + case PROARGMODE_OUT: + modename = "OUT "; + isinput = false; + break; + case PROARGMODE_VARIADIC: + modename = "VARIADIC "; + isinput = true; + break; + case PROARGMODE_TABLE: + modename = ""; + isinput = false; + break; + default: + elog(ERROR, "invalid parameter mode '%c'", argmode); + modename = NULL; /* keep compiler quiet */ + isinput = false; + break; + } + if (isinput) + inputargno++; /* this is a 1-based counter */ + + if (print_table_args != (argmode == PROARGMODE_TABLE)) + continue; + + if (argsprinted == insertorderbyat) + { + if (argsprinted) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "ORDER BY "); + } + else if (argsprinted) + appendStringInfoString(buf, ", "); + + appendStringInfoString(buf, modename); + if (argname && argname[0]) + appendStringInfo(buf, "%s ", quote_identifier(argname)); + appendStringInfoString(buf, format_type_be(argtype)); + if (print_defaults && isinput && inputargno > nlackdefaults) + { + Node *expr; + + Assert(nextargdefault != NULL); + expr = (Node *) lfirst(nextargdefault); + nextargdefault = lnext(nextargdefault); + + appendStringInfo(buf, " DEFAULT %s", + deparse_expression(expr, NIL, false, false)); + } + argsprinted++; + + /* nasty hack: print the last arg twice for variadic ordered-set agg */ + if (argsprinted == insertorderbyat && i == numargs - 1) + { + i--; + /* aggs shouldn't have defaults anyway, but just to be sure ... 
*/ + print_defaults = false; + } + } + + return argsprinted; +} + +static bool +is_input_argument(int nth, const char *argmodes) +{ + return (!argmodes + || argmodes[nth] == PROARGMODE_IN + || argmodes[nth] == PROARGMODE_INOUT + || argmodes[nth] == PROARGMODE_VARIADIC); +} + +/* + * Append used transformed types to specified buffer + */ +static void +print_function_trftypes(StringInfo buf, HeapTuple proctup) +{ + Oid *trftypes; + int ntypes; + + ntypes = get_func_trftypes(proctup, &trftypes); + if (ntypes > 0) + { + int i; + + appendStringInfoString(buf, "\n TRANSFORM "); + for (i = 0; i < ntypes; i++) + { + if (i != 0) + appendStringInfoString(buf, ", "); + appendStringInfo(buf, "FOR TYPE %s", format_type_be(trftypes[i])); + } + } +} + +/* + * Get textual representation of a function argument's default value. The + * second argument of this function is the argument number among all arguments + * (i.e. proallargtypes, *not* proargtypes), starting with 1, because that's + * how information_schema.sql uses it. + */ +Datum +pg_get_function_arg_default(PG_FUNCTION_ARGS) +{// #lizard forgives + Oid funcid = PG_GETARG_OID(0); + int32 nth_arg = PG_GETARG_INT32(1); + HeapTuple proctup; + Form_pg_proc proc; + int numargs; + Oid *argtypes; + char **argnames; + char *argmodes; + int i; + List *argdefaults; + Node *node; + char *str; + int nth_inputarg; + Datum proargdefaults; + bool isnull; + int nth_default; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + numargs = get_func_arg_info(proctup, &argtypes, &argnames, &argmodes); + if (nth_arg < 1 || nth_arg > numargs || !is_input_argument(nth_arg - 1, argmodes)) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + + nth_inputarg = 0; + for (i = 0; i < nth_arg; i++) + if (is_input_argument(i, argmodes)) + nth_inputarg++; + + proargdefaults = SysCacheGetAttr(PROCOID, proctup, + Anum_pg_proc_proargdefaults, + &isnull); + if (isnull) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + + str = TextDatumGetCString(proargdefaults); + argdefaults = castNode(List, stringToNode(str)); + pfree(str); + + proc = (Form_pg_proc) GETSTRUCT(proctup); + + /* + * Calculate index into proargdefaults: proargdefaults corresponds to the + * last N input arguments, where N = pronargdefaults. + */ + nth_default = nth_inputarg - 1 - (proc->pronargs - proc->pronargdefaults); + + if (nth_default < 0 || nth_default >= list_length(argdefaults)) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + node = list_nth(argdefaults, nth_default); + str = deparse_expression(node, NIL, false, false); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(str)); +} + + +/* + * deparse_expression - General utility for deparsing expressions + * + * calls deparse_expression_pretty with all prettyPrinting disabled + */ +char * +deparse_expression(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit) +{ + return deparse_expression_pretty(expr, dpcontext, forceprefix, + showimplicit, 0, 0); +} + +/* ---------- + * deparse_expression_pretty - General utility for deparsing expressions + * + * expr is the node tree to be deparsed. It must be a transformed expression + * tree (ie, not the raw output of gram.y). + * + * dpcontext is a list of deparse_namespace nodes representing the context + * for interpreting Vars in the node tree. It can be NIL if no Vars are + * expected. + * + * forceprefix is TRUE to force all Vars to be prefixed with their table names. 
+ * + * showimplicit is TRUE to force all implicit casts to be shown explicitly. + * + * Tries to pretty up the output according to prettyFlags and startIndent. + * + * The result is a palloc'd string. + * ---------- + */ +static char * +deparse_expression_pretty(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit, + int prettyFlags, int startIndent) +{ + StringInfoData buf; + deparse_context context; + + initStringInfo(&buf); + context.buf = &buf; + context.namespaces = dpcontext; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = forceprefix; + context.prettyFlags = prettyFlags; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = startIndent; + context.special_exprkind = EXPR_KIND_NONE; + + get_rule_expr(expr, &context, showimplicit); + + return buf.data; +} + +/* ---------- + * deparse_context_for - Build deparse context for a single relation + * + * Given the reference name (alias) and OID of a relation, build deparsing + * context for an expression referencing only that relation (as varno 1, + * varlevelsup 0). This is sufficient for many uses of deparse_expression. + * ---------- + */ +List * +deparse_context_for(const char *aliasname, Oid relid) +{ + deparse_namespace *dpns; + RangeTblEntry *rte; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = relid; + rte->relkind = RELKIND_RELATION; /* no need for exactness here */ + rte->alias = makeAlias(aliasname, NIL); + rte->eref = rte->alias; + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + + /* Build one-element rtable */ + dpns->rtable = list_make1(rte); + dpns->ctes = NIL; + set_rtable_names(dpns, NIL, NULL); + set_simple_column_names(dpns); + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * deparse_context_for_plan_rtable - Build deparse context for a plan's rtable + * + * When deparsing an expression in a Plan tree, we use the plan's rangetable + * to resolve names of simple Vars. The initialization of column names for + * this is rather expensive if the rangetable is large, and it'll be the same + * for every expression in the Plan tree; so we do it just once and re-use + * the result of this function for each expression. (Note that the result + * is not usable until set_deparse_context_planstate() is applied to it.) + * + * In addition to the plan's rangetable list, pass the per-RTE alias names + * assigned by a previous call to select_rtable_names_for_explain. + */ +List * +deparse_context_for_plan_rtable(List *rtable, List *rtable_names) +{ + deparse_namespace *dpns; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Initialize fields that stay the same across the whole plan tree */ + dpns->rtable = rtable; + dpns->rtable_names = rtable_names; + dpns->ctes = NIL; + + /* + * Set up column name aliases. We will get rather bogus results for join + * RTEs, but that doesn't matter because plan trees don't contain any join + * alias Vars. + */ + set_simple_column_names(dpns); + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * set_deparse_context_planstate - Specify Plan node containing expression + * + * When deparsing an expression in a Plan tree, we might have to resolve + * OUTER_VAR, INNER_VAR, or INDEX_VAR references. To do this, the caller must + * provide the parent PlanState node. 
Then OUTER_VAR and INNER_VAR references + * can be resolved by drilling down into the left and right child plans. + * Similarly, INDEX_VAR references can be resolved by reference to the + * indextlist given in a parent IndexOnlyScan node, or to the scan tlist in + * ForeignScan and CustomScan nodes. (Note that we don't currently support + * deparsing of indexquals in regular IndexScan or BitmapIndexScan nodes; + * for those, we can only deparse the indexqualorig fields, which won't + * contain INDEX_VAR Vars.) + * + * Note: planstate really ought to be declared as "PlanState *", but we use + * "Node *" to avoid having to include execnodes.h in ruleutils.h. + * + * The ancestors list is a list of the PlanState's parent PlanStates, the + * most-closely-nested first. This is needed to resolve PARAM_EXEC Params. + * Note we assume that all the PlanStates share the same rtable. + * + * Once this function has been called, deparse_expression() can be called on + * subsidiary expression(s) of the specified PlanState node. To deparse + * expressions of a different Plan node in the same Plan tree, re-call this + * function to identify the new parent Plan node. + * + * The result is the same List passed in; this is a notational convenience. + */ +List * +set_deparse_context_planstate(List *dpcontext, + Node *planstate, List *ancestors) +{ + deparse_namespace *dpns; + + + /* Should always have one-entry namespace list for Plan deparsing */ + Assert(list_length(dpcontext) == 1); + dpns = (deparse_namespace *) linitial(dpcontext); + + /* Set our attention on the specific plan node passed in */ + set_deparse_planstate(dpns, (PlanState *) planstate); + dpns->ancestors = ancestors; + + return dpcontext; +} + +/* + * select_rtable_names_for_explain - Select RTE aliases for EXPLAIN + * + * Determine the relation aliases we'll use during an EXPLAIN operation. + * This is just a frontend to set_rtable_names. We have to expose the aliases + * to EXPLAIN because EXPLAIN needs to know the right alias names to print. + */ +List * +select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used) +{ + deparse_namespace dpns; + + memset(&dpns, 0, sizeof(dpns)); + dpns.rtable = rtable; + dpns.ctes = NIL; + set_rtable_names(&dpns, NIL, rels_used); + /* We needn't bother computing column aliases yet */ + + return dpns.rtable_names; +} + +#ifdef PGXC +/* + * This is a special case deparse context to be used at the planning time to + * generate query strings and expressions for remote shipping. + * + * XXX We should be careful while using this since the support is quite + * limited. The only supported use case at this point is for remote join + * reduction and some simple plan trees rooted by Agg node having a single + * RemoteQuery node as leftree. + */ +List * +deparse_context_for_plan(Node *plan, List *ancestors, + List *rtable) +{ + deparse_namespace *dpns; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Initialize fields that stay the same across the whole plan tree */ + dpns->rtable = rtable; + dpns->ctes = NIL; + + /* Set our attention on the specific plan node passed in */ + set_deparse_plan(dpns, (Plan *) plan); + dpns->ancestors = ancestors; + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * Set deparse context for Plan. Only those plan nodes which are immediate (or + * through simple nodes) parents of RemoteQuery nodes are supported right now. 
+ * + * This is a kind of work-around since the new deparse interface (since 9.1) + * expects a PlanState node. But planstates are instantiated only at execution + * time when InitPlan is called. But we are required to deparse the query + * during planning time, so we hand-cook these dummy PlanState nodes instead of + * init-ing the plan. Another approach could have been to delay the query + * generation to the execution time, but we are not yet sure if this can be + * safely done, especially for remote join reduction. + */ +static void +set_deparse_plan(deparse_namespace *dpns, Plan *plan) +{// #lizard forgives + + if (IsA(plan, NestLoop)) + { + NestLoop *nestloop = (NestLoop *) plan; + + dpns->planstate = (PlanState *) makeNode(NestLoopState); + dpns->planstate->plan = plan; + + dpns->outer_planstate = (PlanState *) makeNode(PlanState); + dpns->outer_planstate->plan = nestloop->join.plan.lefttree; + + dpns->inner_planstate = (PlanState *) makeNode(PlanState); + dpns->inner_planstate->plan = nestloop->join.plan.righttree; + } + else if (IsA(plan, RemoteQuery)) + { + dpns->planstate = (PlanState *) makeNode(PlanState); + dpns->planstate->plan = plan; + } + else if (IsA(plan, Agg) || IsA(plan, Group)) + { + /* + * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, + * Result, Material nodes are optional. Sort is compulsory for Group but not + * for Agg. + * anything else is not handled right now. + */ + Plan *temp_plan = plan->lefttree; + Plan *remote_scan = NULL; + + if (temp_plan && IsA(temp_plan, Sort)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, Result)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, Material)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, RemoteQuery)) + remote_scan = temp_plan; + + if (!remote_scan) + elog(ERROR, "Deparse of this query at planning is not supported yet"); + + dpns->planstate = (PlanState *) makeNode(PlanState); + dpns->planstate->plan = plan; + } + else + elog(ERROR, "Deparse of this query at planning not supported yet"); +} + +#endif +/* + * set_rtable_names: select RTE aliases to be used in printing a query + * + * We fill in dpns->rtable_names with a list of names that is one-for-one with + * the already-filled dpns->rtable list. Each RTE name is unique among those + * in the new namespace plus any ancestor namespaces listed in + * parent_namespaces. + * + * If rels_used isn't NULL, only RTE indexes listed in it are given aliases. + * + * Note that this function is only concerned with relation names, not column + * names. + */ +static void +set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, + Bitmapset *rels_used) +{// #lizard forgives + HASHCTL hash_ctl; + HTAB *names_hash; + NameHashEntry *hentry; + bool found; + int rtindex; + ListCell *lc; + + dpns->rtable_names = NIL; + /* nothing more to do if empty rtable */ + if (dpns->rtable == NIL) + return; + + /* + * We use a hash table to hold known names, so that this process is O(N) + * not O(N^2) for N names. 
+ */ + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = sizeof(NameHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + names_hash = hash_create("set_rtable_names names", + list_length(dpns->rtable), + &hash_ctl, + HASH_ELEM | HASH_CONTEXT); + /* Preload the hash table with names appearing in parent_namespaces */ + foreach(lc, parent_namespaces) + { + deparse_namespace *olddpns = (deparse_namespace *) lfirst(lc); + ListCell *lc2; + + foreach(lc2, olddpns->rtable_names) + { + char *oldname = (char *) lfirst(lc2); + + if (oldname == NULL) + continue; + hentry = (NameHashEntry *) hash_search(names_hash, + oldname, + HASH_ENTER, + &found); + /* we do not complain about duplicate names in parent namespaces */ + hentry->counter = 0; + } + } + + /* Now we can scan the rtable */ + rtindex = 1; + foreach(lc, dpns->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + char *refname; + + /* Just in case this takes an unreasonable amount of time ... */ + CHECK_FOR_INTERRUPTS(); + + if (rels_used && !bms_is_member(rtindex, rels_used)) + { + /* Ignore unreferenced RTE */ + refname = NULL; + } + else if (rte->alias) + { + /* If RTE has a user-defined alias, prefer that */ + refname = rte->alias->aliasname; + } + else if (rte->rtekind == RTE_RELATION) + { + /* Use the current actual name of the relation */ + refname = get_rel_name(rte->relid); + } + else if (rte->rtekind == RTE_JOIN) + { + /* Unnamed join has no refname */ + refname = NULL; + } + else + { + /* Otherwise use whatever the parser assigned */ + refname = rte->eref->aliasname; + } + + /* + * If the selected name isn't unique, append digits to make it so, and + * make a new hash entry for it once we've got a unique name. For a + * very long input name, we might have to truncate to stay within + * NAMEDATALEN. + */ + if (refname) + { + hentry = (NameHashEntry *) hash_search(names_hash, + refname, + HASH_ENTER, + &found); + if (found) + { + /* Name already in use, must choose a new one */ + int refnamelen = strlen(refname); + char *modname = (char *) palloc(refnamelen + 16); + NameHashEntry *hentry2; + + do + { + hentry->counter++; + for (;;) + { + /* + * We avoid using %.*s here because it can misbehave + * if the data is not valid in what libc thinks is the + * prevailing encoding. + */ + memcpy(modname, refname, refnamelen); + sprintf(modname + refnamelen, "_%d", hentry->counter); + if (strlen(modname) < NAMEDATALEN) + break; + /* drop chars from refname to keep all the digits */ + refnamelen = pg_mbcliplen(refname, refnamelen, + refnamelen - 1); + } + hentry2 = (NameHashEntry *) hash_search(names_hash, + modname, + HASH_ENTER, + &found); + } while (found); + hentry2->counter = 0; /* init new hash entry */ + refname = modname; + } + else + { + /* Name not previously used, need only initialize hentry */ + hentry->counter = 0; + } + } + + dpns->rtable_names = lappend(dpns->rtable_names, refname); + rtindex++; + } + + hash_destroy(names_hash); +} + +/* + * set_deparse_for_query: set up deparse_namespace for deparsing a Query tree + * + * For convenience, this is defined to initialize the deparse_namespace struct + * from scratch. 
+ */ +static void +set_deparse_for_query(deparse_namespace *dpns, Query *query, + List *parent_namespaces) +{ + ListCell *lc; + ListCell *lc2; + + /* Initialize *dpns and fill rtable/ctes links */ + memset(dpns, 0, sizeof(deparse_namespace)); + dpns->rtable = query->rtable; + dpns->ctes = query->cteList; + + /* Assign a unique relation alias to each RTE */ + set_rtable_names(dpns, parent_namespaces, NULL); + + /* Initialize dpns->rtable_columns to contain zeroed structs */ + dpns->rtable_columns = NIL; + while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) + dpns->rtable_columns = lappend(dpns->rtable_columns, + palloc0(sizeof(deparse_columns))); + + /* If it's a utility query, it won't have a jointree */ + if (query->jointree) + { + /* Detect whether global uniqueness of USING names is needed */ + dpns->unique_using = + has_dangerous_join_using(dpns, (Node *) query->jointree); + + /* + * Select names for columns merged by USING, via a recursive pass over + * the query jointree. + */ + set_using_names(dpns, (Node *) query->jointree, NIL); + } + + /* + * Now assign remaining column aliases for each RTE. We do this in a + * linear scan of the rtable, so as to process RTEs whether or not they + * are in the jointree (we mustn't miss NEW.*, INSERT target relations, + * etc). JOIN RTEs must be processed after their children, but this is + * okay because they appear later in the rtable list than their children + * (cf Asserts in identify_join_columns()). + */ + forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); + + if (rte->rtekind == RTE_JOIN) + set_join_column_names(dpns, rte, colinfo); + else + set_relation_column_names(dpns, rte, colinfo); + } +} + +/* + * set_simple_column_names: fill in column aliases for non-query situations + * + * This handles EXPLAIN and cases where we only have relation RTEs. Without + * a join tree, we can't do anything smart about join RTEs, but we don't + * need to (note that EXPLAIN should never see join alias Vars anyway). + * If we do hit a join RTE we'll just process it like a non-table base RTE. + */ +static void +set_simple_column_names(deparse_namespace *dpns) +{ + ListCell *lc; + ListCell *lc2; + + /* Initialize dpns->rtable_columns to contain zeroed structs */ + dpns->rtable_columns = NIL; + while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) + dpns->rtable_columns = lappend(dpns->rtable_columns, + palloc0(sizeof(deparse_columns))); + + /* Assign unique column aliases within each RTE */ + forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); + + set_relation_column_names(dpns, rte, colinfo); + } +} + +/* + * has_dangerous_join_using: search jointree for unnamed JOIN USING + * + * Merged columns of a JOIN USING may act differently from either of the input + * columns, either because they are merged with COALESCE (in a FULL JOIN) or + * because an implicit coercion of the underlying input column is required. + * In such a case the column must be referenced as a column of the JOIN not as + * a column of either input. And this is problematic if the join is unnamed + * (alias-less): we cannot qualify the column's name with an RTE name, since + * there is none. (Forcibly assigning an alias to the join is not a solution, + * since that will prevent legal references to tables below the join.) 
+ * To ensure that every column in the query is unambiguously referenceable, + * we must assign such merged columns names that are globally unique across + * the whole query, aliasing other columns out of the way as necessary. + * + * Because the ensuing re-aliasing is fairly damaging to the readability of + * the query, we don't do this unless we have to. So, we must pre-scan + * the join tree to see if we have to, before starting set_using_names(). + */ +static bool +has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode) +{// #lizard forgives + if (IsA(jtnode, RangeTblRef)) + { + /* nothing to do here */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *lc; + + foreach(lc, f->fromlist) + { + if (has_dangerous_join_using(dpns, (Node *) lfirst(lc))) + return true; + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + + /* Is it an unnamed JOIN with USING? */ + if (j->alias == NULL && j->usingClause) + { + /* + * Yes, so check each join alias var to see if any of them are not + * simple references to underlying columns. If so, we have a + * dangerous situation and must pick unique aliases. + */ + RangeTblEntry *jrte = rt_fetch(j->rtindex, dpns->rtable); + ListCell *lc; + + foreach(lc, jrte->joinaliasvars) + { + Var *aliasvar = (Var *) lfirst(lc); + + if (aliasvar != NULL && !IsA(aliasvar, Var)) + return true; + } + } + + /* Nope, but inspect children */ + if (has_dangerous_join_using(dpns, j->larg)) + return true; + if (has_dangerous_join_using(dpns, j->rarg)) + return true; + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return false; +} + +/* + * set_using_names: select column aliases to be used for merged USING columns + * + * We do this during a recursive descent of the query jointree. + * dpns->unique_using must already be set to determine the global strategy. + * + * Column alias info is saved in the dpns->rtable_columns list, which is + * assumed to be filled with pre-zeroed deparse_columns structs. + * + * parentUsing is a list of all USING aliases assigned in parent joins of + * the current jointree node. (The passed-in list must not be modified.) + */ +static void +set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing) +{// #lizard forgives + if (IsA(jtnode, RangeTblRef)) + { + /* nothing to do now */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *lc; + + foreach(lc, f->fromlist) + set_using_names(dpns, (Node *) lfirst(lc), parentUsing); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + RangeTblEntry *rte = rt_fetch(j->rtindex, dpns->rtable); + deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); + int *leftattnos; + int *rightattnos; + deparse_columns *leftcolinfo; + deparse_columns *rightcolinfo; + int i; + ListCell *lc; + + /* Get info about the shape of the join */ + identify_join_columns(j, rte, colinfo); + leftattnos = colinfo->leftattnos; + rightattnos = colinfo->rightattnos; + + /* Look up the not-yet-filled-in child deparse_columns structs */ + leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); + rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); + + /* + * If this join is unnamed, then we cannot substitute new aliases at + * this level, so any name requirements pushed down to here must be + * pushed down again to the children. 
+ */ + if (rte->alias == NULL) + { + for (i = 0; i < colinfo->num_cols; i++) + { + char *colname = colinfo->colnames[i]; + + if (colname == NULL) + continue; + + /* Push down to left column, unless it's a system column */ + if (leftattnos[i] > 0) + { + expand_colnames_array_to(leftcolinfo, leftattnos[i]); + leftcolinfo->colnames[leftattnos[i] - 1] = colname; + } + + /* Same on the righthand side */ + if (rightattnos[i] > 0) + { + expand_colnames_array_to(rightcolinfo, rightattnos[i]); + rightcolinfo->colnames[rightattnos[i] - 1] = colname; + } + } + } + + /* + * If there's a USING clause, select the USING column names and push + * those names down to the children. We have two strategies: + * + * If dpns->unique_using is TRUE, we force all USING names to be + * unique across the whole query level. In principle we'd only need + * the names of dangerous USING columns to be globally unique, but to + * safely assign all USING names in a single pass, we have to enforce + * the same uniqueness rule for all of them. However, if a USING + * column's name has been pushed down from the parent, we should use + * it as-is rather than making a uniqueness adjustment. This is + * necessary when we're at an unnamed join, and it creates no risk of + * ambiguity. Also, if there's a user-written output alias for a + * merged column, we prefer to use that rather than the input name; + * this simplifies the logic and seems likely to lead to less aliasing + * overall. + * + * If dpns->unique_using is FALSE, we only need USING names to be + * unique within their own join RTE. We still need to honor + * pushed-down names, though. + * + * Though significantly different in results, these two strategies are + * implemented by the same code, with only the difference of whether + * to put assigned names into dpns->using_names. 
+ */ + if (j->usingClause) + { + /* Copy the input parentUsing list so we don't modify it */ + parentUsing = list_copy(parentUsing); + + /* USING names must correspond to the first join output columns */ + expand_colnames_array_to(colinfo, list_length(j->usingClause)); + i = 0; + foreach(lc, j->usingClause) + { + char *colname = strVal(lfirst(lc)); + + /* Assert it's a merged column */ + Assert(leftattnos[i] != 0 && rightattnos[i] != 0); + + /* Adopt passed-down name if any, else select unique name */ + if (colinfo->colnames[i] != NULL) + colname = colinfo->colnames[i]; + else + { + /* Prefer user-written output alias if any */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + /* Make it appropriately unique */ + colname = make_colname_unique(colname, dpns, colinfo); + if (dpns->unique_using) + dpns->using_names = lappend(dpns->using_names, + colname); + /* Save it as output column name, too */ + colinfo->colnames[i] = colname; + } + + /* Remember selected names for use later */ + colinfo->usingNames = lappend(colinfo->usingNames, colname); + parentUsing = lappend(parentUsing, colname); + + /* Push down to left column, unless it's a system column */ + if (leftattnos[i] > 0) + { + expand_colnames_array_to(leftcolinfo, leftattnos[i]); + leftcolinfo->colnames[leftattnos[i] - 1] = colname; + } + + /* Same on the righthand side */ + if (rightattnos[i] > 0) + { + expand_colnames_array_to(rightcolinfo, rightattnos[i]); + rightcolinfo->colnames[rightattnos[i] - 1] = colname; + } + + i++; + } + } + + /* Mark child deparse_columns structs with correct parentUsing info */ + leftcolinfo->parentUsing = parentUsing; + rightcolinfo->parentUsing = parentUsing; + + /* Now recursively assign USING column names in children */ + set_using_names(dpns, j->larg, parentUsing); + set_using_names(dpns, j->rarg, parentUsing); + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); +} + +/* + * set_relation_column_names: select column aliases for a non-join RTE + * + * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. + * If any colnames entries are already filled in, those override local + * choices. + */ +static void +set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo) +{// #lizard forgives + int ncolumns; + char **real_colnames; + bool changed_any; + int noldcolumns; + int i; + int j; + + /* + * Extract the RTE's "real" column names. This is comparable to + * get_rte_attribute_name, except that it's important to disregard dropped + * columns. We put NULL into the array for a dropped column. 
+ */ + if (rte->rtekind == RTE_RELATION) + { + /* Relation --- look to the system catalogs for up-to-date info */ + Relation rel; + TupleDesc tupdesc; + + rel = relation_open(rte->relid, AccessShareLock); + tupdesc = RelationGetDescr(rel); + + ncolumns = tupdesc->natts; + real_colnames = (char **) palloc(ncolumns * sizeof(char *)); + + for (i = 0; i < ncolumns; i++) + { + if (tupdesc->attrs[i]->attisdropped) + real_colnames[i] = NULL; + else + real_colnames[i] = pstrdup(NameStr(tupdesc->attrs[i]->attname)); + } + relation_close(rel, AccessShareLock); + } + else + { + /* Otherwise use the column names from eref */ + ListCell *lc; + + ncolumns = list_length(rte->eref->colnames); + real_colnames = (char **) palloc(ncolumns * sizeof(char *)); + + i = 0; + foreach(lc, rte->eref->colnames) + { + /* + * If the column name shown in eref is an empty string, then it's + * a column that was dropped at the time of parsing the query, so + * treat it as dropped. + */ + char *cname = strVal(lfirst(lc)); + + if (cname[0] == '\0') + cname = NULL; + real_colnames[i] = cname; + i++; + } + } + + /* + * Ensure colinfo->colnames has a slot for each column. (It could be long + * enough already, if we pushed down a name for the last column.) Note: + * it's possible that there are now more columns than there were when the + * query was parsed, ie colnames could be longer than rte->eref->colnames. + * We must assign unique aliases to the new columns too, else there could + * be unresolved conflicts when the view/rule is reloaded. + */ + expand_colnames_array_to(colinfo, ncolumns); + Assert(colinfo->num_cols == ncolumns); + + /* + * Make sufficiently large new_colnames and is_new_col arrays, too. + * + * Note: because we leave colinfo->num_new_cols zero until after the loop, + * colname_is_unique will not consult that array, which is fine because it + * would only be duplicate effort. + */ + colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *)); + colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool)); + + /* + * Scan the columns, select a unique alias for each one, and store it in + * colinfo->colnames and colinfo->new_colnames. The former array has NULL + * entries for dropped columns, the latter omits them. Also mark + * new_colnames entries as to whether they are new since parse time; this + * is the case for entries beyond the length of rte->eref->colnames. 
+ */ + noldcolumns = list_length(rte->eref->colnames); + changed_any = false; + j = 0; + for (i = 0; i < ncolumns; i++) + { + char *real_colname = real_colnames[i]; + char *colname = colinfo->colnames[i]; + + /* Skip dropped columns */ + if (real_colname == NULL) + { + Assert(colname == NULL); /* colnames[i] is already NULL */ + continue; + } + + /* If alias already assigned, that's what to use */ + if (colname == NULL) + { + /* If user wrote an alias, prefer that over real column name */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + else + colname = real_colname; + + /* Unique-ify and insert into colinfo */ + colname = make_colname_unique(colname, dpns, colinfo); + + colinfo->colnames[i] = colname; + } + + /* Put names of non-dropped columns in new_colnames[] too */ + colinfo->new_colnames[j] = colname; + /* And mark them as new or not */ + colinfo->is_new_col[j] = (i >= noldcolumns); + j++; + + /* Remember if any assigned aliases differ from "real" name */ + if (!changed_any && strcmp(colname, real_colname) != 0) + changed_any = true; + } + + /* + * Set correct length for new_colnames[] array. (Note: if columns have + * been added, colinfo->num_cols includes them, which is not really quite + * right but is harmless, since any new columns must be at the end where + * they won't affect varattnos of pre-existing columns.) + */ + colinfo->num_new_cols = j; + + /* + * For a relation RTE, we need only print the alias column names if any + * are different from the underlying "real" names. For a function RTE, + * always emit a complete column alias list; this is to protect against + * possible instability of the default column names (eg, from altering + * parameter names). For tablefunc RTEs, we never print aliases, because + * the column names are part of the clause itself. For other RTE types, + * print if we changed anything OR if there were user-written column + * aliases (since the latter would be part of the underlying "reality"). + */ + if (rte->rtekind == RTE_RELATION) + colinfo->printaliases = changed_any; + else if (rte->rtekind == RTE_FUNCTION) + colinfo->printaliases = true; + else if (rte->rtekind == RTE_TABLEFUNC) + colinfo->printaliases = false; + else if (rte->alias && rte->alias->colnames != NIL) + colinfo->printaliases = true; + else + colinfo->printaliases = changed_any; +} + +/* + * set_join_column_names: select column aliases for a join RTE + * + * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. + * If any colnames entries are already filled in, those override local + * choices. Also, names for USING columns were already chosen by + * set_using_names(). We further expect that column alias selection has been + * completed for both input RTEs. + */ +static void +set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo) +{// #lizard forgives + deparse_columns *leftcolinfo; + deparse_columns *rightcolinfo; + bool changed_any; + int noldcolumns; + int nnewcolumns; + Bitmapset *leftmerged = NULL; + Bitmapset *rightmerged = NULL; + int i; + int j; + int ic; + int jc; + + /* Look up the previously-filled-in child deparse_columns structs */ + leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); + rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); + + /* + * Ensure colinfo->colnames has a slot for each column. (It could be long + * enough already, if we pushed down a name for the last column.) 
Note: + * it's possible that one or both inputs now have more columns than there + * were when the query was parsed, but we'll deal with that below. We + * only need entries in colnames for pre-existing columns. + */ + noldcolumns = list_length(rte->eref->colnames); + expand_colnames_array_to(colinfo, noldcolumns); + Assert(colinfo->num_cols == noldcolumns); + + /* + * Scan the join output columns, select an alias for each one, and store + * it in colinfo->colnames. If there are USING columns, set_using_names() + * already selected their names, so we can start the loop at the first + * non-merged column. + */ + changed_any = false; + for (i = list_length(colinfo->usingNames); i < noldcolumns; i++) + { + char *colname = colinfo->colnames[i]; + char *real_colname; + + /* Ignore dropped column (only possible for non-merged column) */ + if (colinfo->leftattnos[i] == 0 && colinfo->rightattnos[i] == 0) + { + Assert(colname == NULL); + continue; + } + + /* Get the child column name */ + if (colinfo->leftattnos[i] > 0) + real_colname = leftcolinfo->colnames[colinfo->leftattnos[i] - 1]; + else if (colinfo->rightattnos[i] > 0) + real_colname = rightcolinfo->colnames[colinfo->rightattnos[i] - 1]; + else + { + /* We're joining system columns --- use eref name */ + real_colname = strVal(list_nth(rte->eref->colnames, i)); + } + Assert(real_colname != NULL); + + /* In an unnamed join, just report child column names as-is */ + if (rte->alias == NULL) + { + colinfo->colnames[i] = real_colname; + continue; + } + + /* If alias already assigned, that's what to use */ + if (colname == NULL) + { + /* If user wrote an alias, prefer that over real column name */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + else + colname = real_colname; + + /* Unique-ify and insert into colinfo */ + colname = make_colname_unique(colname, dpns, colinfo); + + colinfo->colnames[i] = colname; + } + + /* Remember if any assigned aliases differ from "real" name */ + if (!changed_any && strcmp(colname, real_colname) != 0) + changed_any = true; + } + + /* + * Calculate number of columns the join would have if it were re-parsed + * now, and create storage for the new_colnames and is_new_col arrays. + * + * Note: colname_is_unique will be consulting new_colnames[] during the + * loops below, so its not-yet-filled entries must be zeroes. + */ + nnewcolumns = leftcolinfo->num_new_cols + rightcolinfo->num_new_cols - + list_length(colinfo->usingNames); + colinfo->num_new_cols = nnewcolumns; + colinfo->new_colnames = (char **) palloc0(nnewcolumns * sizeof(char *)); + colinfo->is_new_col = (bool *) palloc0(nnewcolumns * sizeof(bool)); + + /* + * Generating the new_colnames array is a bit tricky since any new columns + * added since parse time must be inserted in the right places. This code + * must match the parser, which will order a join's columns as merged + * columns first (in USING-clause order), then non-merged columns from the + * left input (in attnum order), then non-merged columns from the right + * input (ditto). If one of the inputs is itself a join, its columns will + * be ordered according to the same rule, which means newly-added columns + * might not be at the end. We can figure out what's what by consulting + * the leftattnos and rightattnos arrays plus the input is_new_col arrays. 
+ * + * In these loops, i indexes leftattnos/rightattnos (so it's join varattno + * less one), j indexes new_colnames/is_new_col, and ic/jc have similar + * meanings for the current child RTE. + */ + + /* Handle merged columns; they are first and can't be new */ + i = j = 0; + while (i < noldcolumns && + colinfo->leftattnos[i] != 0 && + colinfo->rightattnos[i] != 0) + { + /* column name is already determined and known unique */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + colinfo->is_new_col[j] = false; + + /* build bitmapsets of child attnums of merged columns */ + if (colinfo->leftattnos[i] > 0) + leftmerged = bms_add_member(leftmerged, colinfo->leftattnos[i]); + if (colinfo->rightattnos[i] > 0) + rightmerged = bms_add_member(rightmerged, colinfo->rightattnos[i]); + + i++, j++; + } + + /* Handle non-merged left-child columns */ + ic = 0; + for (jc = 0; jc < leftcolinfo->num_new_cols; jc++) + { + char *child_colname = leftcolinfo->new_colnames[jc]; + + if (!leftcolinfo->is_new_col[jc]) + { + /* Advance ic to next non-dropped old column of left child */ + while (ic < leftcolinfo->num_cols && + leftcolinfo->colnames[ic] == NULL) + ic++; + Assert(ic < leftcolinfo->num_cols); + ic++; + /* If it is a merged column, we already processed it */ + if (bms_is_member(ic, leftmerged)) + continue; + /* Else, advance i to the corresponding existing join column */ + while (i < colinfo->num_cols && + colinfo->colnames[i] == NULL) + i++; + Assert(i < colinfo->num_cols); + Assert(ic == colinfo->leftattnos[i]); + /* Use the already-assigned name of this column */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + i++; + } + else + { + /* + * Unique-ify the new child column name and assign, unless we're + * in an unnamed join, in which case just copy + */ + if (rte->alias != NULL) + { + colinfo->new_colnames[j] = + make_colname_unique(child_colname, dpns, colinfo); + if (!changed_any && + strcmp(colinfo->new_colnames[j], child_colname) != 0) + changed_any = true; + } + else + colinfo->new_colnames[j] = child_colname; + } + + colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc]; + j++; + } + + /* Handle non-merged right-child columns in exactly the same way */ + ic = 0; + for (jc = 0; jc < rightcolinfo->num_new_cols; jc++) + { + char *child_colname = rightcolinfo->new_colnames[jc]; + + if (!rightcolinfo->is_new_col[jc]) + { + /* Advance ic to next non-dropped old column of right child */ + while (ic < rightcolinfo->num_cols && + rightcolinfo->colnames[ic] == NULL) + ic++; + Assert(ic < rightcolinfo->num_cols); + ic++; + /* If it is a merged column, we already processed it */ + if (bms_is_member(ic, rightmerged)) + continue; + /* Else, advance i to the corresponding existing join column */ + while (i < colinfo->num_cols && + colinfo->colnames[i] == NULL) + i++; + Assert(i < colinfo->num_cols); + Assert(ic == colinfo->rightattnos[i]); + /* Use the already-assigned name of this column */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + i++; + } + else + { + /* + * Unique-ify the new child column name and assign, unless we're + * in an unnamed join, in which case just copy + */ + if (rte->alias != NULL) + { + colinfo->new_colnames[j] = + make_colname_unique(child_colname, dpns, colinfo); + if (!changed_any && + strcmp(colinfo->new_colnames[j], child_colname) != 0) + changed_any = true; + } + else + colinfo->new_colnames[j] = child_colname; + } + + colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc]; + j++; + } + + /* Assert we processed the right number of columns */ +#ifdef USE_ASSERT_CHECKING + 
while (i < colinfo->num_cols && colinfo->colnames[i] == NULL) + i++; + Assert(i == colinfo->num_cols); + Assert(j == nnewcolumns); +#endif + + /* + * For a named join, print column aliases if we changed any from the child + * names. Unnamed joins cannot print aliases. + */ + if (rte->alias != NULL) + colinfo->printaliases = changed_any; + else + colinfo->printaliases = false; +} + +/* + * colname_is_unique: is colname distinct from already-chosen column names? + * + * dpns is query-wide info, colinfo is for the column's RTE + */ +static bool +colname_is_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo) +{// #lizard forgives + int i; + ListCell *lc; + + /* Check against already-assigned column aliases within RTE */ + for (i = 0; i < colinfo->num_cols; i++) + { + char *oldname = colinfo->colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* + * If we're building a new_colnames array, check that too (this will be + * partially but not completely redundant with the previous checks) + */ + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *oldname = colinfo->new_colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* Also check against USING-column names that must be globally unique */ + foreach(lc, dpns->using_names) + { + char *oldname = (char *) lfirst(lc); + + if (strcmp(oldname, colname) == 0) + return false; + } + + /* Also check against names already assigned for parent-join USING cols */ + foreach(lc, colinfo->parentUsing) + { + char *oldname = (char *) lfirst(lc); + + if (strcmp(oldname, colname) == 0) + return false; + } + + return true; +} + +/* + * make_colname_unique: modify colname if necessary to make it unique + * + * dpns is query-wide info, colinfo is for the column's RTE + */ +static char * +make_colname_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo) +{ + /* + * If the selected name isn't unique, append digits to make it so. For a + * very long input name, we might have to truncate to stay within + * NAMEDATALEN. + */ + if (!colname_is_unique(colname, dpns, colinfo)) + { + int colnamelen = strlen(colname); + char *modname = (char *) palloc(colnamelen + 16); + int i = 0; + + do + { + i++; + for (;;) + { + /* + * We avoid using %.*s here because it can misbehave if the + * data is not valid in what libc thinks is the prevailing + * encoding. + */ + memcpy(modname, colname, colnamelen); + sprintf(modname + colnamelen, "_%d", i); + if (strlen(modname) < NAMEDATALEN) + break; + /* drop chars from colname to keep all the digits */ + colnamelen = pg_mbcliplen(colname, colnamelen, + colnamelen - 1); + } + } while (!colname_is_unique(modname, dpns, colinfo)); + colname = modname; + } + return colname; +} + +/* + * expand_colnames_array_to: make colinfo->colnames at least n items long + * + * Any added array entries are initialized to zero. + */ +static void +expand_colnames_array_to(deparse_columns *colinfo, int n) +{ + if (n > colinfo->num_cols) + { + if (colinfo->colnames == NULL) + colinfo->colnames = (char **) palloc0(n * sizeof(char *)); + else + { + colinfo->colnames = (char **) repalloc(colinfo->colnames, + n * sizeof(char *)); + memset(colinfo->colnames + colinfo->num_cols, 0, + (n - colinfo->num_cols) * sizeof(char *)); + } + colinfo->num_cols = n; + } +} + +/* + * identify_join_columns: figure out where columns of a join come from + * + * Fills the join-specific fields of the colinfo struct, except for + * usingNames which is filled later. 
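+ * A zero entry in leftattnos[] or rightattnos[] means that the join output
+ * column does not come from that side of the join; dropped columns end up
+ * with zeroes on both sides.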
+ */ +static void +identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, + deparse_columns *colinfo) +{// #lizard forgives + int numjoincols; + int i; + ListCell *lc; + + /* Extract left/right child RT indexes */ + if (IsA(j->larg, RangeTblRef)) + colinfo->leftrti = ((RangeTblRef *) j->larg)->rtindex; + else if (IsA(j->larg, JoinExpr)) + colinfo->leftrti = ((JoinExpr *) j->larg)->rtindex; + else + elog(ERROR, "unrecognized node type in jointree: %d", + (int) nodeTag(j->larg)); + if (IsA(j->rarg, RangeTblRef)) + colinfo->rightrti = ((RangeTblRef *) j->rarg)->rtindex; + else if (IsA(j->rarg, JoinExpr)) + colinfo->rightrti = ((JoinExpr *) j->rarg)->rtindex; + else + elog(ERROR, "unrecognized node type in jointree: %d", + (int) nodeTag(j->rarg)); + + /* Assert children will be processed earlier than join in second pass */ + Assert(colinfo->leftrti < j->rtindex); + Assert(colinfo->rightrti < j->rtindex); + + /* Initialize result arrays with zeroes */ + numjoincols = list_length(jrte->joinaliasvars); + Assert(numjoincols == list_length(jrte->eref->colnames)); + colinfo->leftattnos = (int *) palloc0(numjoincols * sizeof(int)); + colinfo->rightattnos = (int *) palloc0(numjoincols * sizeof(int)); + + /* Scan the joinaliasvars list to identify simple column references */ + i = 0; + foreach(lc, jrte->joinaliasvars) + { + Var *aliasvar = (Var *) lfirst(lc); + + /* get rid of any implicit coercion above the Var */ + aliasvar = (Var *) strip_implicit_coercions((Node *) aliasvar); + + if (aliasvar == NULL) + { + /* It's a dropped column; nothing to do here */ + } + else if (IsA(aliasvar, Var)) + { + Assert(aliasvar->varlevelsup == 0); + Assert(aliasvar->varattno != 0); + if (aliasvar->varno == colinfo->leftrti) + colinfo->leftattnos[i] = aliasvar->varattno; + else if (aliasvar->varno == colinfo->rightrti) + colinfo->rightattnos[i] = aliasvar->varattno; + else + elog(ERROR, "unexpected varno %d in JOIN RTE", + aliasvar->varno); + } + else if (IsA(aliasvar, CoalesceExpr)) + { + /* + * It's a merged column in FULL JOIN USING. Ignore it for now and + * let the code below identify the merged columns. + */ + } + else + elog(ERROR, "unrecognized node type in join alias vars: %d", + (int) nodeTag(aliasvar)); + + i++; + } + + /* + * If there's a USING clause, deconstruct the join quals to identify the + * merged columns. This is a tad painful but if we cannot rely on the + * column names, there is no other representation of which columns were + * joined by USING. (Unless the join type is FULL, we can't tell from the + * joinaliasvars list which columns are merged.) Note: we assume that the + * merged columns are the first output column(s) of the join. 
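+	 * For example, "t1 JOIN t2 USING (a, b)" has quals of the form
+	 * "t1.a = t2.a AND t1.b = t2.b"; flatten_join_using_qual() below pulls
+	 * out the t1 and t2 Vars in USING-list order, letting us mark both
+	 * leftattnos and rightattnos for each merged column.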
+ */ + if (j->usingClause) + { + List *leftvars = NIL; + List *rightvars = NIL; + ListCell *lc2; + + /* Extract left- and right-side Vars from the qual expression */ + flatten_join_using_qual(j->quals, &leftvars, &rightvars); + Assert(list_length(leftvars) == list_length(j->usingClause)); + Assert(list_length(rightvars) == list_length(j->usingClause)); + + /* Mark the output columns accordingly */ + i = 0; + forboth(lc, leftvars, lc2, rightvars) + { + Var *leftvar = (Var *) lfirst(lc); + Var *rightvar = (Var *) lfirst(lc2); + + Assert(leftvar->varlevelsup == 0); + Assert(leftvar->varattno != 0); + if (leftvar->varno != colinfo->leftrti) + elog(ERROR, "unexpected varno %d in JOIN USING qual", + leftvar->varno); + colinfo->leftattnos[i] = leftvar->varattno; + + Assert(rightvar->varlevelsup == 0); + Assert(rightvar->varattno != 0); + if (rightvar->varno != colinfo->rightrti) + elog(ERROR, "unexpected varno %d in JOIN USING qual", + rightvar->varno); + colinfo->rightattnos[i] = rightvar->varattno; + + i++; + } + } +} + +/* + * flatten_join_using_qual: extract Vars being joined from a JOIN/USING qual + * + * We assume that transformJoinUsingClause won't have produced anything except + * AND nodes, equality operator nodes, and possibly implicit coercions, and + * that the AND node inputs match left-to-right with the original USING list. + * + * Caller must initialize the result lists to NIL. + */ +static void +flatten_join_using_qual(Node *qual, List **leftvars, List **rightvars) +{ + if (IsA(qual, BoolExpr)) + { + /* Handle AND nodes by recursion */ + BoolExpr *b = (BoolExpr *) qual; + ListCell *lc; + + Assert(b->boolop == AND_EXPR); + foreach(lc, b->args) + { + flatten_join_using_qual((Node *) lfirst(lc), + leftvars, rightvars); + } + } + else if (IsA(qual, OpExpr)) + { + /* Otherwise we should have an equality operator */ + OpExpr *op = (OpExpr *) qual; + Var *var; + + if (list_length(op->args) != 2) + elog(ERROR, "unexpected unary operator in JOIN/USING qual"); + /* Arguments should be Vars with perhaps implicit coercions */ + var = (Var *) strip_implicit_coercions((Node *) linitial(op->args)); + if (!IsA(var, Var)) + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(var)); + *leftvars = lappend(*leftvars, var); + var = (Var *) strip_implicit_coercions((Node *) lsecond(op->args)); + if (!IsA(var, Var)) + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(var)); + *rightvars = lappend(*rightvars, var); + } + else + { + /* Perhaps we have an implicit coercion to boolean? */ + Node *q = strip_implicit_coercions(qual); + + if (q != qual) + flatten_join_using_qual(q, leftvars, rightvars); + else + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(qual)); + } +} + +/* + * get_rtable_name: convenience function to get a previously assigned RTE alias + * + * The RTE must belong to the topmost namespace level in "context". + */ +static char * +get_rtable_name(int rtindex, deparse_context *context) +{ + deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); + + Assert(rtindex > 0 && rtindex <= list_length(dpns->rtable_names)); + return (char *) list_nth(dpns->rtable_names, rtindex - 1); +} + +/* + * set_deparse_planstate: set up deparse_namespace to parse subexpressions + * of a given PlanState node + * + * This sets the planstate, outer_planstate, inner_planstate, outer_tlist, + * inner_tlist, and index_tlist fields. Caller is responsible for adjusting + * the ancestors list if necessary. 
Note that the rtable and ctes fields do + * not need to change when shifting attention to different plan nodes in a + * single plan tree. + */ +static void +set_deparse_planstate(deparse_namespace *dpns, PlanState *ps) +{// #lizard forgives + dpns->planstate = ps; + + /* + * We special-case Append and MergeAppend to pretend that the first child + * plan is the OUTER referent; we have to interpret OUTER Vars in their + * tlists according to one of the children, and the first one is the most + * natural choice. Likewise special-case ModifyTable to pretend that the + * first child plan is the OUTER referent; this is to support RETURNING + * lists containing references to non-target relations. + */ + if (IsA(ps, AppendState)) + dpns->outer_planstate = ((AppendState *) ps)->appendplans[0]; + else if (IsA(ps, MergeAppendState)) + dpns->outer_planstate = ((MergeAppendState *) ps)->mergeplans[0]; + else if (IsA(ps, ModifyTableState)) + dpns->outer_planstate = ((ModifyTableState *) ps)->mt_plans[0]; + else + dpns->outer_planstate = outerPlanState(ps); + + if (dpns->outer_planstate) + dpns->outer_tlist = dpns->outer_planstate->plan->targetlist; + else + dpns->outer_tlist = NIL; + + /* + * For a SubqueryScan, pretend the subplan is INNER referent. (We don't + * use OUTER because that could someday conflict with the normal meaning.) + * Likewise, for a CteScan, pretend the subquery's plan is INNER referent. + * For ON CONFLICT .. UPDATE we just need the inner tlist to point to the + * excluded expression's tlist. (Similar to the SubqueryScan we don't want + * to reuse OUTER, it's used for RETURNING in some modify table cases, + * although not INSERT .. CONFLICT). + */ + if (IsA(ps, SubqueryScanState)) + dpns->inner_planstate = ((SubqueryScanState *) ps)->subplan; + else if (IsA(ps, CteScanState)) + dpns->inner_planstate = ((CteScanState *) ps)->cteplanstate; + else if (IsA(ps, ModifyTableState)) + dpns->inner_planstate = ps; + else + dpns->inner_planstate = innerPlanState(ps); + + if (IsA(ps, ModifyTableState)) + dpns->inner_tlist = ((ModifyTableState *) ps)->mt_excludedtlist; + else if (dpns->inner_planstate) + dpns->inner_tlist = dpns->inner_planstate->plan->targetlist; + else + dpns->inner_tlist = NIL; + + /* Set up referent for INDEX_VAR Vars, if needed */ + if (IsA(ps->plan, IndexOnlyScan)) + dpns->index_tlist = ((IndexOnlyScan *) ps->plan)->indextlist; + else if (IsA(ps->plan, ForeignScan)) + dpns->index_tlist = ((ForeignScan *) ps->plan)->fdw_scan_tlist; + else if (IsA(ps->plan, CustomScan)) + dpns->index_tlist = ((CustomScan *) ps->plan)->custom_scan_tlist; + else + dpns->index_tlist = NIL; +} + +/* + * push_child_plan: temporarily transfer deparsing attention to a child plan + * + * When expanding an OUTER_VAR or INNER_VAR reference, we must adjust the + * deparse context in case the referenced expression itself uses + * OUTER_VAR/INNER_VAR. We modify the top stack entry in-place to avoid + * affecting levelsup issues (although in a Plan tree there really shouldn't + * be any). + * + * Caller must provide a local deparse_namespace variable to save the + * previous state for pop_child_plan. 
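+ *
+ * A typical call sequence (as in get_variable() and similar routines) is:
+ *
+ *		deparse_namespace save_dpns;
+ *
+ *		push_child_plan(dpns, dpns->outer_planstate, &save_dpns);
+ *		... deparse the expression found in the child's targetlist ...
+ *		pop_child_plan(dpns, &save_dpns);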
+ */ +static void +push_child_plan(deparse_namespace *dpns, PlanState *ps, + deparse_namespace *save_dpns) +{ + /* Save state for restoration later */ + *save_dpns = *dpns; + + /* Link current plan node into ancestors list */ + dpns->ancestors = lcons(dpns->planstate, dpns->ancestors); + + /* Set attention on selected child */ + set_deparse_planstate(dpns, ps); +} + +/* + * pop_child_plan: undo the effects of push_child_plan + */ +static void +pop_child_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) +{ + List *ancestors; + + /* Get rid of ancestors list cell added by push_child_plan */ + ancestors = list_delete_first(dpns->ancestors); + + /* Restore fields changed by push_child_plan */ + *dpns = *save_dpns; + + /* Make sure dpns->ancestors is right (may be unnecessary) */ + dpns->ancestors = ancestors; +} + +/* + * push_ancestor_plan: temporarily transfer deparsing attention to an + * ancestor plan + * + * When expanding a Param reference, we must adjust the deparse context + * to match the plan node that contains the expression being printed; + * otherwise we'd fail if that expression itself contains a Param or + * OUTER_VAR/INNER_VAR/INDEX_VAR variable. + * + * The target ancestor is conveniently identified by the ListCell holding it + * in dpns->ancestors. + * + * Caller must provide a local deparse_namespace variable to save the + * previous state for pop_ancestor_plan. + */ +static void +push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, + deparse_namespace *save_dpns) +{ + PlanState *ps = (PlanState *) lfirst(ancestor_cell); + List *ancestors; + + /* Save state for restoration later */ + *save_dpns = *dpns; + + /* Build a new ancestor list with just this node's ancestors */ + ancestors = NIL; + while ((ancestor_cell = lnext(ancestor_cell)) != NULL) + ancestors = lappend(ancestors, lfirst(ancestor_cell)); + dpns->ancestors = ancestors; + + /* Set attention on selected ancestor */ + set_deparse_planstate(dpns, ps); +} + +/* + * pop_ancestor_plan: undo the effects of push_ancestor_plan + */ +static void +pop_ancestor_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) +{ + /* Free the ancestor list made in push_ancestor_plan */ + list_free(dpns->ancestors); + + /* Restore fields changed by push_ancestor_plan */ + *dpns = *save_dpns; +} + + +/* ---------- + * make_ruledef - reconstruct the CREATE RULE command + * for a given pg_rewrite tuple + * ---------- + */ +static void +make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags) +{// #lizard forgives + char *rulename; + char ev_type; + Oid ev_class; + bool is_instead; + char *ev_qual; + char *ev_action; + List *actions = NIL; + Relation ev_relation; + TupleDesc viewResultDesc = NULL; + int fno; + Datum dat; + bool isnull; + + /* + * Get the attribute values from the rules tuple + */ + fno = SPI_fnumber(rulettc, "rulename"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + rulename = NameStr(*(DatumGetName(dat))); + + fno = SPI_fnumber(rulettc, "ev_type"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_type = DatumGetChar(dat); + + fno = SPI_fnumber(rulettc, "ev_class"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_class = DatumGetObjectId(dat); + + fno = SPI_fnumber(rulettc, "is_instead"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + is_instead = DatumGetBool(dat); + + /* these could be nulls */ + fno = SPI_fnumber(rulettc, "ev_qual"); + ev_qual = 
SPI_getvalue(ruletup, rulettc, fno); + + fno = SPI_fnumber(rulettc, "ev_action"); + ev_action = SPI_getvalue(ruletup, rulettc, fno); + if (ev_action != NULL) + actions = (List *) stringToNode(ev_action); + + ev_relation = heap_open(ev_class, AccessShareLock); + + /* + * Build the rules definition text + */ + appendStringInfo(buf, "CREATE RULE %s AS", + quote_identifier(rulename)); + + if (prettyFlags & PRETTYFLAG_INDENT) + appendStringInfoString(buf, "\n ON "); + else + appendStringInfoString(buf, " ON "); + + /* The event the rule is fired for */ + switch (ev_type) + { + case '1': + appendStringInfoString(buf, "SELECT"); + viewResultDesc = RelationGetDescr(ev_relation); + break; + + case '2': + appendStringInfoString(buf, "UPDATE"); + break; + + case '3': + appendStringInfoString(buf, "INSERT"); + break; + + case '4': + appendStringInfoString(buf, "DELETE"); + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("rule \"%s\" has unsupported event type %d", + rulename, ev_type))); + break; + } + + /* The relation the rule is fired on */ + appendStringInfo(buf, " TO %s", generate_relation_name(ev_class, NIL)); + + /* If the rule has an event qualification, add it */ + if (ev_qual == NULL) + ev_qual = ""; + if (strlen(ev_qual) > 0 && strcmp(ev_qual, "<>") != 0) + { + Node *qual; + Query *query; + deparse_context context; + deparse_namespace dpns; + + if (prettyFlags & PRETTYFLAG_INDENT) + appendStringInfoString(buf, "\n "); + appendStringInfoString(buf, " WHERE "); + + qual = stringToNode(ev_qual); + + /* + * We need to make a context for recognizing any Vars in the qual + * (which can only be references to OLD and NEW). Use the rtable of + * the first query in the action list for this purpose. + */ + query = (Query *) linitial(actions); + + /* + * If the action is INSERT...SELECT, OLD/NEW have been pushed down + * into the SELECT, and that's what we need to look at. (Ugly kluge + * ... try to fix this when we redesign querytrees.) 
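+	 * getInsertSelectQuery() returns that contained SELECT when it exists,
+	 * and the original query otherwise, so the OLD/NEW RTEs we need are in
+	 * the rtable of whatever query it hands back.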
+ */ + query = getInsertSelectQuery(query, NULL); + + /* Must acquire locks right away; see notes in get_query_def() */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = list_make1(&dpns); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = prettyFlags; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = PRETTYINDENT_STD; + context.special_exprkind = EXPR_KIND_NONE; + + set_deparse_for_query(&dpns, query, NIL); + + get_rule_expr(qual, &context, false); + } + + appendStringInfoString(buf, " DO "); + + /* The INSTEAD keyword (if so) */ + if (is_instead) + appendStringInfoString(buf, "INSTEAD "); + + /* Finally the rules actions */ + if (list_length(actions) > 1) + { + ListCell *action; + Query *query; + + appendStringInfoChar(buf, '('); + foreach(action, actions) + { + query = (Query *) lfirst(action); + get_query_def(query, buf, NIL, viewResultDesc, + prettyFlags, WRAP_COLUMN_DEFAULT, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + if (prettyFlags) + appendStringInfoString(buf, ";\n"); + else + appendStringInfoString(buf, "; "); + } + appendStringInfoString(buf, ");"); + } + else if (list_length(actions) == 0) + { + appendStringInfoString(buf, "NOTHING;"); + } + else + { + Query *query; + + query = (Query *) linitial(actions); + get_query_def(query, buf, NIL, viewResultDesc, + prettyFlags, WRAP_COLUMN_DEFAULT, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + appendStringInfo(buf, ";"); + } + + heap_close(ev_relation, AccessShareLock); +} + + +/* ---------- + * make_viewdef - reconstruct the SELECT part of a + * view rewrite rule + * ---------- + */ +static void +make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags, int wrapColumn) +{// #lizard forgives + Query *query; + char ev_type; + Oid ev_class; + bool is_instead; + char *ev_qual; + char *ev_action; + List *actions = NIL; + Relation ev_relation; + int fno; + Datum dat; + bool isnull; + + /* + * Get the attribute values from the rules tuple + */ + fno = SPI_fnumber(rulettc, "ev_type"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_type = DatumGetChar(dat); + + fno = SPI_fnumber(rulettc, "ev_class"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_class = DatumGetObjectId(dat); + + fno = SPI_fnumber(rulettc, "is_instead"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + is_instead = DatumGetBool(dat); + + /* these could be nulls */ + fno = SPI_fnumber(rulettc, "ev_qual"); + ev_qual = SPI_getvalue(ruletup, rulettc, fno); + + fno = SPI_fnumber(rulettc, "ev_action"); + ev_action = SPI_getvalue(ruletup, rulettc, fno); + if (ev_action != NULL) + actions = (List *) stringToNode(ev_action); + + if (list_length(actions) != 1) + { + /* keep output buffer empty and leave */ + return; + } + + query = (Query *) linitial(actions); + + if (ev_type != '1' || !is_instead || + strcmp(ev_qual, "<>") != 0 || query->commandType != CMD_SELECT) + { + /* keep output buffer empty and leave */ + return; + } + + ev_relation = heap_open(ev_class, AccessShareLock); + + get_query_def(query, buf, NIL, RelationGetDescr(ev_relation), + prettyFlags, wrapColumn, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + appendStringInfo(buf, ";"); + + heap_close(ev_relation, AccessShareLock); +} + +#ifdef PGXC +/* ---------- + * deparse_query - Parse back one query parsetree + * + * Purpose 
of this function is to build up statement for a RemoteQuery + * It just calls get_query_def without pretty print flags + * ---------- + */ +void +deparse_query(Query *query, StringInfo buf, List *parentnamespace, + bool finalise_aggs, bool sortgroup_colno) +{ + get_query_def(query, buf, parentnamespace, NULL, 0, 0, 0, finalise_aggs, + sortgroup_colno); +} + +/* code borrowed from get_insert_query_def */ +void +get_query_def_from_valuesList(Query *query, StringInfo buf) +{// #lizard forgives + + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *values_cell; + ListCell *l; + List *strippedexprs; + deparse_context context; + deparse_namespace dpns; + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. Note we assume it's OK to scribble on the passed + * querytree! + */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = NIL; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = 0; + context.indentLevel = 0; + context.wrapColumn = 0; + + dpns.rtable = query->rtable; + dpns.ctes = query->cteList; + dpns.planstate = NULL; + dpns.ancestors = NIL; + dpns.outer_planstate = dpns.inner_planstate = NULL; + + /* + * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be + * a single RTE for the SELECT or VALUES. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + appendStringInfo(buf, "INSERT INTO %s (", + generate_relation_name(rte->relid, NIL)); + + /* + * Add the insert-column-names list. To handle indirection properly, we + * need to look for indirection nodes in the top targetlist (if it's + * INSERT ... SELECT or INSERT ... single VALUES), or in the first + * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We + * assume that all the expression lists will have similar indirection in + * the latter case. + */ + if (values_rte) + values_cell = list_head((List *) linitial(values_rte->values_lists)); + else + values_cell = NULL; + strippedexprs = NIL; + sep = ""; + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); + if (tle->resjunk || !IsA(tle->expr, Var)) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. 
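+	 * (processIndirection() prints any ".subfield" or "[subscript]"
+	 * decoration after the column name and returns the expression with the
+	 * corresponding FieldStore/ArrayRef nodes stripped off.)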
+ */ + if (values_cell) + { + /* we discard the stripped expression in this case */ + processIndirection((Node *) lfirst(values_cell), &context); + values_cell = lnext(values_cell); + } + else + { + /* we keep a list of the stripped expressions in this case */ + strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context)); + } + } + appendStringInfo(buf, ") "); + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context.prettyFlags, context.wrapColumn, + context.indentLevel, + context.finalise_aggs, context.sortgroup_colno); + } + else if (values_rte) + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, &context); + } + else + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the single-VALUES expression list */ + appendContextKeyword(&context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, &context, false); + appendStringInfoChar(buf, ')'); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(&context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, &context, NULL); + } +} +#endif +/* ---------- + * get_query_def - Parse back one query parsetree + * + * If resultDesc is not NULL, then it is the output tuple descriptor for + * the view represented by a SELECT query. + * ---------- + */ +static void +get_query_def(Query *query, StringInfo buf, List *parentnamespace, + TupleDesc resultDesc, + int prettyFlags, int wrapColumn, int startIndent, + bool finalise_aggs, bool sortgroup_colno) +{// #lizard forgives + deparse_context context; + deparse_namespace dpns; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. Note we assume it's OK to scribble on the passed + * querytree! + * + * We are only deparsing the query (we are not about to execute it), so we + * only need AccessShareLock on the relations it mentions. 
+ */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = lcons(&dpns, list_copy(parentnamespace)); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (parentnamespace != NIL || + list_length(query->rtable) != 1); + context.prettyFlags = prettyFlags; + context.wrapColumn = wrapColumn; + context.indentLevel = startIndent; + context.special_exprkind = EXPR_KIND_NONE; + context.finalise_aggs = finalise_aggs; + context.sortgroup_colno = sortgroup_colno; + + set_deparse_for_query(&dpns, query, parentnamespace); + + switch (query->commandType) + { + case CMD_SELECT: + get_select_query_def(query, &context, resultDesc); + break; + + case CMD_UPDATE: + get_update_query_def(query, &context); + break; + + case CMD_INSERT: + get_insert_query_def(query, &context); + break; + + case CMD_DELETE: + get_delete_query_def(query, &context); + break; + + case CMD_NOTHING: + appendStringInfoString(buf, "NOTHING"); + break; + + case CMD_UTILITY: + get_utility_query_def(query, &context); + break; + + default: + elog(ERROR, "unrecognized query command type: %d", + query->commandType); + break; + } +} + +/* ---------- + * get_values_def - Parse back a VALUES list + * ---------- + */ +static void +get_values_def(List *values_lists, deparse_context *context) +{ + StringInfo buf = context->buf; + bool first_list = true; + ListCell *vtl; + + appendStringInfoString(buf, "VALUES "); + + foreach(vtl, values_lists) + { + List *sublist = (List *) lfirst(vtl); + bool first_col = true; + ListCell *lc; + + if (first_list) + first_list = false; + else + appendStringInfoString(buf, ", "); + + appendStringInfoChar(buf, '('); + foreach(lc, sublist) + { + Node *col = (Node *) lfirst(lc); + + if (first_col) + first_col = false; + else + appendStringInfoChar(buf, ','); + + /* + * Print the value. Whole-row Vars need special treatment. 
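+			 * get_rule_expr_toplevel() prints a whole-row Var with an
+			 * explicit cast to its composite type so that it is not
+			 * expanded into individual columns when the VALUES list is
+			 * re-parsed.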
+ */ + get_rule_expr_toplevel(col, context, false); + } + appendStringInfoChar(buf, ')'); + } +} + +/* ---------- + * get_with_clause - Parse back a WITH clause + * ---------- + */ +static void +get_with_clause(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + if (query->cteList == NIL) + return; + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + + if (query->hasRecursive) + sep = "WITH RECURSIVE "; + else + sep = "WITH "; + foreach(l, query->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(l); + + appendStringInfoString(buf, sep); + appendStringInfoString(buf, quote_identifier(cte->ctename)); + if (cte->aliascolnames) + { + bool first = true; + ListCell *col; + + appendStringInfoChar(buf, '('); + foreach(col, cte->aliascolnames) + { + if (first) + first = false; + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, + quote_identifier(strVal(lfirst(col)))); + } + appendStringInfoChar(buf, ')'); + } + appendStringInfoString(buf, " AS "); + switch (cte->ctematerialized) + { + case CTEMaterializeDefault: + break; + case CTEMaterializeAlways: + appendStringInfoString(buf, "MATERIALIZED "); + break; + case CTEMaterializeNever: + appendStringInfoString(buf, "NOT MATERIALIZED "); + break; + } + appendStringInfoChar(buf, '('); + if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", 0, 0, 0); + get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", 0, 0, 0); + appendStringInfoChar(buf, ')'); + sep = ", "; + } + + if (PRETTY_INDENT(context)) + { + context->indentLevel -= PRETTYINDENT_STD; + appendContextKeyword(context, "", 0, 0, 0); + } + else + appendStringInfoChar(buf, ' '); +} + +/* ---------- + * get_select_query_def - Parse back a SELECT parsetree + * ---------- + */ +static void +get_select_query_def(Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + List *save_windowclause; + List *save_windowtlist; + bool force_colno; + ListCell *l; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* Set up context for possible window functions */ + save_windowclause = context->windowClause; + context->windowClause = query->windowClause; + save_windowtlist = context->windowTList; + context->windowTList = query->targetList; + + /* + * If the Query node has a setOperations tree, then it's the top level of + * a UNION/INTERSECT/EXCEPT query; only the WITH, ORDER BY and LIMIT + * fields are interesting in the top query itself. 
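+	 * For example, in "(SELECT a FROM t1 UNION SELECT a FROM t2)
+	 * ORDER BY 1 LIMIT 5" the UNION lives in setOperations; we print it via
+	 * get_setop_query() and then add the ORDER BY using column numbers
+	 * (force_colno), since ORDER BY entries above a set operation must be
+	 * simple references to output columns.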
+ */ + if (query->setOperations) + { + get_setop_query(query->setOperations, query, context, resultDesc); + /* ORDER BY clauses must be simple in this case */ + force_colno = true; + } + else + { + get_basic_select_query(query, context, resultDesc); + force_colno = false; + } + + /* Add the ORDER BY clause if given */ + if (query->sortClause != NIL) + { + appendContextKeyword(context, " ORDER BY ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_orderby(query->sortClause, query->targetList, + force_colno, context); + } + + /* Add the LIMIT clause if given */ + if (query->limitOffset != NULL) + { + appendContextKeyword(context, " OFFSET ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + get_rule_expr(query->limitOffset, context, false); + } + if (query->limitCount != NULL) + { + appendContextKeyword(context, " LIMIT ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + if (IsA(query->limitCount, Const) && + ((Const *) query->limitCount)->constisnull) + appendStringInfoString(buf, "ALL"); + else + get_rule_expr(query->limitCount, context, false); + } + + /* Add FOR [KEY] UPDATE/SHARE clauses if present */ + if (query->hasForUpdate) + { + foreach(l, query->rowMarks) + { + RowMarkClause *rc = (RowMarkClause *) lfirst(l); + + /* don't print implicit clauses */ + if (rc->pushedDown) + continue; + + switch (rc->strength) + { + case LCS_NONE: + /* we intentionally throw an error for LCS_NONE */ + elog(ERROR, "unrecognized LockClauseStrength %d", + (int) rc->strength); + break; + case LCS_FORKEYSHARE: + appendContextKeyword(context, " FOR KEY SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORSHARE: + appendContextKeyword(context, " FOR SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORNOKEYUPDATE: + appendContextKeyword(context, " FOR NO KEY UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORUPDATE: + appendContextKeyword(context, " FOR UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + } + + appendStringInfo(buf, " OF %s", + quote_identifier(get_rtable_name(rc->rti, + context))); + if (rc->waitPolicy == LockWaitError) + appendStringInfoString(buf, " NOWAIT"); + else if (rc->waitPolicy == LockWaitSkip) + appendStringInfoString(buf, " SKIP LOCKED"); + } + } + + context->windowClause = save_windowclause; + context->windowTList = save_windowtlist; +} + +/* + * Detect whether query looks like SELECT ... FROM VALUES(); + * if so, return the VALUES RTE. Otherwise return NULL. + */ +static RangeTblEntry * +get_simple_values_rte(Query *query) +{// #lizard forgives + RangeTblEntry *result = NULL; + ListCell *lc; + + /* + * We want to return TRUE even if the Query also contains OLD or NEW rule + * RTEs. So the idea is to scan the rtable and see if there is only one + * inFromCl RTE that is a VALUES RTE. + */ + foreach(lc, query->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + + if (rte->rtekind == RTE_VALUES && rte->inFromCl) + { + if (result) + return NULL; /* multiple VALUES (probably not possible) */ + result = rte; + } + else if (rte->rtekind == RTE_RELATION && !rte->inFromCl) + continue; /* ignore rule entries */ + else + return NULL; /* something else -> not simple VALUES */ + } + + /* + * We don't need to check the targetlist in any great detail, because + * parser/analyze.c will never generate a "bare" VALUES RTE --- they only + * appear inside auto-generated sub-queries with very restricted + * structure. 
However, DefineView might have modified the tlist by + * injecting new column aliases; so compare tlist resnames against the + * RTE's names to detect that. + */ + if (result) + { + ListCell *lcn; + + if (list_length(query->targetList) != list_length(result->eref->colnames)) + return NULL; /* this probably cannot happen */ + forboth(lc, query->targetList, lcn, result->eref->colnames) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + char *cname = strVal(lfirst(lcn)); + + if (tle->resjunk) + return NULL; /* this probably cannot happen */ + if (tle->resname == NULL || strcmp(tle->resname, cname) != 0) + return NULL; /* column name has been changed */ + } + } + + return result; +} + +static void +get_basic_select_query(Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *values_rte; + char *sep; + ListCell *l; + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + + /* + * If the query looks like SELECT * FROM (VALUES ...), then print just the + * VALUES part. This reverses what transformValuesClause() did at parse + * time. + */ + values_rte = get_simple_values_rte(query); + if (values_rte) + { + get_values_def(values_rte->values_lists, context); + return; + } + + /* + * Build up the query string - first we say SELECT + */ + appendStringInfoString(buf, "SELECT"); + + /* Add the DISTINCT clause if given */ + if (query->distinctClause != NIL) + { + if (query->hasDistinctOn) + { + appendStringInfoString(buf, " DISTINCT ON ("); + sep = ""; + foreach(l, query->distinctClause) + { + SortGroupClause *srt = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(srt->tleSortGroupRef, query->targetList, + false, context); + sep = ", "; + } + appendStringInfoChar(buf, ')'); + } + else + appendStringInfoString(buf, " DISTINCT"); + } + + /* Then we tell what to select (the targetlist) */ + get_target_list(query->targetList, context, resultDesc); + + /* Add the FROM clause if needed */ + get_from_clause(query, " FROM ", context); + + /* Add the WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add the GROUP BY clause if given */ + if (query->groupClause != NULL || query->groupingSets != NULL) + { + ParseExprKind save_exprkind; + + appendContextKeyword(context, " GROUP BY ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + + save_exprkind = context->special_exprkind; + context->special_exprkind = EXPR_KIND_GROUP_BY; + + if (query->groupingSets == NIL) + { + sep = ""; + foreach(l, query->groupClause) + { + SortGroupClause *grp = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(grp->tleSortGroupRef, query->targetList, + false, context); + sep = ", "; + } + } + else + { + sep = ""; + foreach(l, query->groupingSets) + { + GroupingSet *grp = lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_groupingset(grp, query->targetList, true, context); + sep = ", "; + } + } + + context->special_exprkind = save_exprkind; + } + + /* Add the HAVING clause if given */ + if (query->havingQual != NULL) + { + appendContextKeyword(context, " HAVING ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + get_rule_expr(query->havingQual, context, false); + } + + /* Add the WINDOW clause if needed */ + if (query->windowClause != 
NIL) + get_rule_windowclause(query, context); +} + +/* ---------- + * get_target_list - Parse back a SELECT target list + * + * This is also used for RETURNING lists in INSERT/UPDATE/DELETE. + * ---------- + */ +static void +get_target_list(List *targetList, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + StringInfoData targetbuf; + bool last_was_multiline = false; + char *sep; + int colno; + ListCell *l; +#ifdef PGXC + bool no_targetlist = true; +#endif + + /* we use targetbuf to hold each TLE's text temporarily */ + initStringInfo(&targetbuf); + + sep = " "; + colno = 0; + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + char *colname; + char *attname; + + if (tle->resjunk) + continue; /* ignore junk entries */ + +#ifdef PGXC + /* Found at least one element in the target list */ + if (no_targetlist) + no_targetlist = false; +#endif + + appendStringInfoString(buf, sep); + sep = ", "; + colno++; + + /* + * Put the new field text into targetbuf so we can decide after we've + * got it whether or not it needs to go on a new line. + */ + resetStringInfo(&targetbuf); + context->buf = &targetbuf; + + /* + * We special-case Var nodes rather than using get_rule_expr. This is + * needed because get_rule_expr will display a whole-row Var as + * "foo.*", which is the preferred notation in most contexts, but at + * the top level of a SELECT list it's not right (the parser will + * expand that notation into multiple columns, yielding behavior + * different from a whole-row Var). We need to call get_variable + * directly so that we can tell it to do the right thing, and so that + * we can get the attribute name which is the default AS label. + */ + if (tle->expr && (IsA(tle->expr, Var))) + { + attname = get_variable((Var *) tle->expr, 0, true, context); + } + else + { + get_rule_expr((Node *) tle->expr, context, true); + /* We'll show the AS name unless it's this: */ + attname = "?column?"; + } + + /* + * Figure out what the result column should be called. In the context + * of a view, use the view's tuple descriptor (so as to pick up the + * effects of any column RENAME that's been done on the view). + * Otherwise, just use what we can find in the TLE. + */ + if (resultDesc && colno <= resultDesc->natts) + colname = NameStr(resultDesc->attrs[colno - 1]->attname); + else + colname = tle->resname; + + /* Show AS unless the column's name is correct as-is */ + if (colname) /* resname could be NULL */ + { + if (attname == NULL || strcmp(attname, colname) != 0) + appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); + } + + /* Restore context's output buffer */ + context->buf = buf; + + /* Consider line-wrapping if enabled */ + if (PRETTY_INDENT(context) && context->wrapColumn >= 0) + { + int leading_nl_pos; + + /* Does the new field start with a new line? 
*/ + if (targetbuf.len > 0 && targetbuf.data[0] == '\n') + leading_nl_pos = 0; + else + leading_nl_pos = -1; + + /* If so, we shouldn't add anything */ + if (leading_nl_pos >= 0) + { + /* instead, remove any trailing spaces currently in buf */ + removeStringInfoSpaces(buf); + } + else + { + char *trailing_nl; + + /* Locate the start of the current line in the output buffer */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * Add a newline, plus some indentation, if the new field is + * not the first and either the new field would cause an + * overflow or the last field used more than one line. + */ + if (colno > 1 && + ((strlen(trailing_nl) + targetbuf.len > context->wrapColumn) || + last_was_multiline)) + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, PRETTYINDENT_VAR); + } + + /* Remember this field's multiline status for next iteration */ + last_was_multiline = + (strchr(targetbuf.data + leading_nl_pos + 1, '\n') != NULL); + } + + /* Add the new field */ + appendStringInfoString(buf, targetbuf.data); + } + +#ifdef PGXC + /* + * Because the empty target list can generate invalid SQL + * clause. Here, just fill a '*' to process a table without + * any columns, this statement will be sent to Datanodes + * and treated correctly on remote nodes. + */ + if (no_targetlist) + appendStringInfo(buf, " *"); +#endif + /* clean up */ + pfree(targetbuf.data); +} + +static void +get_setop_query(Node *setOp, Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + bool need_paren; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + if (IsA(setOp, RangeTblRef)) + { + RangeTblRef *rtr = (RangeTblRef *) setOp; + RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); + Query *subquery = rte->subquery; + + Assert(subquery != NULL); + Assert(subquery->setOperations == NULL); + /* Need parens if WITH, ORDER BY, FOR UPDATE, or LIMIT; see gram.y */ + need_paren = (subquery->cteList || + subquery->sortClause || + subquery->rowMarks || + subquery->limitOffset || + subquery->limitCount); + if (need_paren) + appendStringInfoChar(buf, '('); + get_query_def(subquery, buf, context->namespaces, resultDesc, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + if (need_paren) + appendStringInfoChar(buf, ')'); + } + else if (IsA(setOp, SetOperationStmt)) + { + SetOperationStmt *op = (SetOperationStmt *) setOp; + int subindent; + + /* + * We force parens when nesting two SetOperationStmts, except when the + * lefthand input is another setop of the same kind. Syntactically, + * we could omit parens in rather more cases, but it seems best to use + * parens to flag cases where the setop operator changes. If we use + * parens, we also increase the indentation level for the child query. + * + * There are some cases in which parens are needed around a leaf query + * too, but those are more easily handled at the next level down (see + * code above). 
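+		 * For example, the lefthand input of "x UNION y UNION z" gets no
+		 * parens, but in "(x UNION y) INTERSECT z" or "(x UNION ALL y)
+		 * UNION z" it does, because the operator or its ALL flag changes.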
+ */ + if (IsA(op->larg, SetOperationStmt)) + { + SetOperationStmt *lop = (SetOperationStmt *) op->larg; + + if (op->op == lop->op && op->all == lop->all) + need_paren = false; + else + need_paren = true; + } + else + need_paren = false; + + if (need_paren) + { + appendStringInfoChar(buf, '('); + subindent = PRETTYINDENT_STD; + appendContextKeyword(context, "", subindent, 0, 0); + } + else + subindent = 0; + + get_setop_query(op->larg, query, context, resultDesc); + + if (need_paren) + appendContextKeyword(context, ") ", -subindent, 0, 0); + else if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", -subindent, 0, 0); + else + appendStringInfoChar(buf, ' '); + + switch (op->op) + { + case SETOP_UNION: + appendStringInfoString(buf, "UNION "); + break; + case SETOP_INTERSECT: + appendStringInfoString(buf, "INTERSECT "); + break; + case SETOP_EXCEPT: + appendStringInfoString(buf, "EXCEPT "); + break; + default: + elog(ERROR, "unrecognized set op: %d", + (int) op->op); + } + if (op->all) + appendStringInfoString(buf, "ALL "); + + /* Always parenthesize if RHS is another setop */ + need_paren = IsA(op->rarg, SetOperationStmt); + + /* + * The indentation code here is deliberately a bit different from that + * for the lefthand input, because we want the line breaks in + * different places. + */ + if (need_paren) + { + appendStringInfoChar(buf, '('); + subindent = PRETTYINDENT_STD; + } + else + subindent = 0; + appendContextKeyword(context, "", subindent, 0, 0); + + get_setop_query(op->rarg, query, context, resultDesc); + + if (PRETTY_INDENT(context)) + context->indentLevel -= subindent; + if (need_paren) + appendContextKeyword(context, ")", 0, 0, 0); + } + else + { + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(setOp)); + } +} + +/* + * Display a sort/group clause. + * + * Also returns the expression tree, so caller need not find it again. + */ +static Node * +get_rule_sortgroupclause(Index ref, List *tlist, bool force_colno, + deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + TargetEntry *tle; + Node *expr; + + tle = get_sortgroupref_tle(ref, tlist); + expr = (Node *) tle->expr; + + /* + * Use column-number form if requested by caller. Otherwise, if + * expression is a constant, force it to be dumped with an explicit cast + * as decoration --- this is because a simple integer constant is + * ambiguous (and will be misinterpreted by findTargetlistEntry()) if we + * dump it without any decoration. If it's anything more complex than a + * simple Var, then force extra parens around it, to ensure it can't be + * misinterpreted as a cube() or rollup() construct. + */ + if (force_colno) + { + Assert(!tle->resjunk); + appendStringInfo(buf, "%d", tle->resno); + } + else if (expr && IsA(expr, Const)) + get_const_expr((Const *) expr, context, 1); + else if (!expr || IsA(expr, Var)) + get_rule_expr(expr, context, true); + else + { + /* + * We must force parens for function-like expressions even if + * PRETTY_PAREN is off, since those are the ones in danger of + * misparsing. For other expressions we need to force them only if + * PRETTY_PAREN is on, since otherwise the expression will output them + * itself. (We can't skip the parens.) 
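+		 * For example, a GROUP BY expression that is literally a call to a
+		 * function named cube() or rollup() must be printed as "(cube(x))";
+		 * without the parens it would be re-read as the CUBE grouping
+		 * construct.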
+ */ + bool need_paren = (PRETTY_PAREN(context) + || IsA(expr, FuncExpr) + ||IsA(expr, Aggref) + ||IsA(expr, WindowFunc)); + + if (need_paren) + appendStringInfoString(context->buf, "("); + get_rule_expr(expr, context, true); + if (need_paren) + appendStringInfoString(context->buf, ")"); + } + + return expr; +} + +/* + * Display a GroupingSet + */ +static void +get_rule_groupingset(GroupingSet *gset, List *targetlist, + bool omit_parens, deparse_context *context) +{// #lizard forgives + ListCell *l; + StringInfo buf = context->buf; + bool omit_child_parens = true; + char *sep = ""; + + switch (gset->kind) + { + case GROUPING_SET_EMPTY: + appendStringInfoString(buf, "()"); + return; + + case GROUPING_SET_SIMPLE: + { + if (!omit_parens || list_length(gset->content) != 1) + appendStringInfoString(buf, "("); + + foreach(l, gset->content) + { + Index ref = lfirst_int(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(ref, targetlist, + false, context); + sep = ", "; + } + + if (!omit_parens || list_length(gset->content) != 1) + appendStringInfoString(buf, ")"); + } + return; + + case GROUPING_SET_ROLLUP: + appendStringInfoString(buf, "ROLLUP("); + break; + case GROUPING_SET_CUBE: + appendStringInfoString(buf, "CUBE("); + break; + case GROUPING_SET_SETS: + appendStringInfoString(buf, "GROUPING SETS ("); + omit_child_parens = false; + break; + } + + foreach(l, gset->content) + { + appendStringInfoString(buf, sep); + get_rule_groupingset(lfirst(l), targetlist, omit_child_parens, context); + sep = ", "; + } + + appendStringInfoString(buf, ")"); +} + +/* + * Display an ORDER BY list. + */ +static void +get_rule_orderby(List *orderList, List *targetList, + bool force_colno, deparse_context *context) +{ + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + sep = ""; + foreach(l, orderList) + { + SortGroupClause *srt = (SortGroupClause *) lfirst(l); + Node *sortexpr; + Oid sortcoltype; + TypeCacheEntry *typentry; + + appendStringInfoString(buf, sep); + sortexpr = get_rule_sortgroupclause(srt->tleSortGroupRef, targetList, + force_colno, context); + sortcoltype = exprType(sortexpr); + /* See whether operator is default < or > for datatype */ + typentry = lookup_type_cache(sortcoltype, + TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); + if (srt->sortop == typentry->lt_opr) + { + /* ASC is default, so emit nothing for it */ + if (srt->nulls_first) + appendStringInfoString(buf, " NULLS FIRST"); + } + else if (srt->sortop == typentry->gt_opr) + { + appendStringInfoString(buf, " DESC"); + /* DESC defaults to NULLS FIRST */ + if (!srt->nulls_first) + appendStringInfoString(buf, " NULLS LAST"); + } + else + { + appendStringInfo(buf, " USING %s", + generate_operator_name(srt->sortop, + sortcoltype, + sortcoltype)); + /* be specific to eliminate ambiguity */ + if (srt->nulls_first) + appendStringInfoString(buf, " NULLS FIRST"); + else + appendStringInfoString(buf, " NULLS LAST"); + } + sep = ", "; + } +} + +/* + * Display a WINDOW clause. + * + * Note that the windowClause list might contain only anonymous window + * specifications, in which case we should print nothing here. 
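+ * For example, "count(*) OVER (PARTITION BY x)" creates an unnamed entry in
+ * windowClause; only windows declared with an explicit "WINDOW w AS (...)"
+ * clause carry a name and are printed here, while anonymous specifications
+ * are emitted inline in the OVER clause of the window function itself.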
+ */ +static void +get_rule_windowclause(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + sep = NULL; + foreach(l, query->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); + + if (wc->name == NULL) + continue; /* ignore anonymous windows */ + + if (sep == NULL) + appendContextKeyword(context, " WINDOW ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + else + appendStringInfoString(buf, sep); + + appendStringInfo(buf, "%s AS ", quote_identifier(wc->name)); + + get_rule_windowspec(wc, query->targetList, context); + + sep = ", "; + } +} + +/* + * Display a window definition + */ +static void +get_rule_windowspec(WindowClause *wc, List *targetList, + deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + bool needspace = false; + const char *sep; + ListCell *l; + + appendStringInfoChar(buf, '('); + if (wc->refname) + { + appendStringInfoString(buf, quote_identifier(wc->refname)); + needspace = true; + } + /* partition clauses are always inherited, so only print if no refname */ + if (wc->partitionClause && !wc->refname) + { + if (needspace) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "PARTITION BY "); + sep = ""; + foreach(l, wc->partitionClause) + { + SortGroupClause *grp = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(grp->tleSortGroupRef, targetList, + false, context); + sep = ", "; + } + needspace = true; + } + /* print ordering clause only if not inherited */ + if (wc->orderClause && !wc->copiedOrder) + { + if (needspace) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "ORDER BY "); + get_rule_orderby(wc->orderClause, targetList, false, context); + needspace = true; + } + /* framing clause is never inherited, so print unless it's default */ + if (wc->frameOptions & FRAMEOPTION_NONDEFAULT) + { + if (needspace) + appendStringInfoChar(buf, ' '); + if (wc->frameOptions & FRAMEOPTION_RANGE) + appendStringInfoString(buf, "RANGE "); + else if (wc->frameOptions & FRAMEOPTION_ROWS) + appendStringInfoString(buf, "ROWS "); + else + Assert(false); + if (wc->frameOptions & FRAMEOPTION_BETWEEN) + appendStringInfoString(buf, "BETWEEN "); + if (wc->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + appendStringInfoString(buf, "UNBOUNDED PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_START_CURRENT_ROW) + appendStringInfoString(buf, "CURRENT ROW "); + else if (wc->frameOptions & FRAMEOPTION_START_VALUE) + { + get_rule_expr(wc->startOffset, context, false); + if (wc->frameOptions & FRAMEOPTION_START_VALUE_PRECEDING) + appendStringInfoString(buf, " PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_START_VALUE_FOLLOWING) + appendStringInfoString(buf, " FOLLOWING "); + else + Assert(false); + } + else + Assert(false); + if (wc->frameOptions & FRAMEOPTION_BETWEEN) + { + appendStringInfoString(buf, "AND "); + if (wc->frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) + appendStringInfoString(buf, "UNBOUNDED FOLLOWING "); + else if (wc->frameOptions & FRAMEOPTION_END_CURRENT_ROW) + appendStringInfoString(buf, "CURRENT ROW "); + else if (wc->frameOptions & FRAMEOPTION_END_VALUE) + { + get_rule_expr(wc->endOffset, context, false); + if (wc->frameOptions & FRAMEOPTION_END_VALUE_PRECEDING) + appendStringInfoString(buf, " PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_END_VALUE_FOLLOWING) + appendStringInfoString(buf, " FOLLOWING "); + else + Assert(false); + } + else + Assert(false); + } + /* we 
will now have a trailing space; remove it */ + buf->len--; + } + appendStringInfoChar(buf, ')'); +} + +/* ---------- + * get_insert_query_def - Parse back an INSERT parsetree + * ---------- + */ +static void +get_insert_query_def(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *l; + List *strippedexprs; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + +#ifdef __TBASE__ + /* + * If query has unshippable triggers, we have to do INSERT on coordinator, + * and we do not need select_rte and values_rte. + * Hence we keep both select_rte and values_rte NULL. + */ + if (!query->hasUnshippableTriggers) + { +#endif + /* + * If it's an INSERT ... SELECT or multi-row VALUES, there will be a + * single RTE for the SELECT or VALUES. Plain VALUES has neither. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } +#ifdef __TBASE__ + } +#endif + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + appendStringInfo(buf, "INSERT INTO %s ", + generate_relation_name(rte->relid, NIL)); + /* INSERT requires AS keyword for target alias */ + if (rte->alias != NULL) + appendStringInfo(buf, "AS %s ", + quote_identifier(rte->alias->aliasname)); + + /* + * Add the insert-column-names list. Any indirection decoration needed on + * the column names can be inferred from the top targetlist. + */ + strippedexprs = NIL; + sep = ""; + if (query->targetList) + appendStringInfoChar(buf, '('); + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (tle->resjunk) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf, + quote_identifier(get_relid_attribute_name(rte->relid, + tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + * Add the stripped expressions to strippedexprs. (If it's a + * single-VALUES statement, the stripped expressions are the VALUES to + * print below. Otherwise they're just Vars and not really + * interesting.) 
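+ * For example, in "INSERT INTO t (arr[1], comp.f) VALUES (10, 20)" the
+ * "[1]" and ".f" decoration is printed here, while the stripped
+ * expressions 10 and 20 become the VALUES list below.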
+ */ + strippedexprs = lappend(strippedexprs, + processIndirection((Node *) tle->expr, + context)); + } + if (query->targetList) + appendStringInfoString(buf, ") "); + + if (query->override) + { + if (query->override == OVERRIDING_SYSTEM_VALUE) + appendStringInfoString(buf, "OVERRIDING SYSTEM VALUE "); + else if (query->override == OVERRIDING_USER_VALUE) + appendStringInfoString(buf, "OVERRIDING USER VALUE "); + } + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + } + else if (values_rte) + { + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, context); + } + else if (strippedexprs) + { + /* Add the single-VALUES expression list */ + appendContextKeyword(context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, context, false); + appendStringInfoChar(buf, ')'); + } + else + { + /* No expressions, so it must be DEFAULT VALUES */ + appendStringInfoString(buf, "DEFAULT VALUES"); + } + + /* Add ON CONFLICT if present */ + if (query->onConflict) + { + OnConflictExpr *confl = query->onConflict; + + appendStringInfoString(buf, " ON CONFLICT"); + + if (confl->arbiterElems) + { + /* Add the single-VALUES expression list */ + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) confl->arbiterElems, context, false); + appendStringInfoChar(buf, ')'); + + /* Add a WHERE clause (for partial indexes) if given */ + if (confl->arbiterWhere != NULL) + { + bool save_varprefix; + + /* + * Force non-prefixing of Vars, since parser assumes that they + * belong to target relation. WHERE clause does not use + * InferenceElem, so this is separately required. 
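+ * For example, an arbiter over a partial index deparses as
+ * "ON CONFLICT (f1) WHERE f1 > 0 ..." with the column left unqualified.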
+ */ + save_varprefix = context->varprefix; + context->varprefix = false; + + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(confl->arbiterWhere, context, false); + + context->varprefix = save_varprefix; + } + } + else if (OidIsValid(confl->constraint)) + { + char *constraint = get_constraint_name(confl->constraint); + + if (!constraint) + elog(ERROR, "cache lookup failed for constraint %u", + confl->constraint); + appendStringInfo(buf, " ON CONSTRAINT %s", + quote_identifier(constraint)); + } + + if (confl->action == ONCONFLICT_NOTHING) + { + appendStringInfoString(buf, " DO NOTHING"); + } + else + { + appendStringInfoString(buf, " DO UPDATE SET "); + /* Deparse targetlist */ + get_update_query_targetlist_def(query, confl->onConflictSet, + context, rte); + + /* Add a WHERE clause if given */ + if (confl->onConflictWhere != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(confl->onConflictWhere, context, false); + } + } + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_update_query_def - Parse back an UPDATE parsetree + * ---------- + */ +static void +get_update_query_def(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + RangeTblEntry *rte; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* + * Start the query with UPDATE relname SET + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + if (PRETTY_INDENT(context)) + { + appendStringInfoChar(buf, ' '); + context->indentLevel += PRETTYINDENT_STD; + } + appendStringInfo(buf, "UPDATE %s%s", + only_marker(rte), + generate_relation_name(rte->relid, NIL)); + if (rte->alias != NULL) + appendStringInfo(buf, " %s", + quote_identifier(rte->alias->aliasname)); + appendStringInfoString(buf, " SET "); + + /* Deparse targetlist */ + get_update_query_targetlist_def(query, query->targetList, context, rte); + + /* Add the FROM clause if needed */ + get_from_clause(query, " FROM ", context); + + /* Add a WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_update_query_targetlist_def - Parse back an UPDATE targetlist + * ---------- + */ +static void +get_update_query_targetlist_def(Query *query, List *targetList, + deparse_context *context, RangeTblEntry *rte) +{// #lizard forgives + StringInfo buf = context->buf; + ListCell *l; + ListCell *next_ma_cell; + int remaining_ma_columns; + const char *sep; + SubLink *cur_ma_sublink; + List *ma_sublinks; + + /* + * Prepare to deal with MULTIEXPR assignments: collect the source SubLinks + * into a list. We expect them to appear, in ID order, in resjunk tlist + * entries. 
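+ * For example, "UPDATE t SET (a, b) = (SELECT x, y FROM s)" carries a
+ * single MULTIEXPR SubLink covering both target columns.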
+ */ + ma_sublinks = NIL; + if (query->hasSubLinks) /* else there can't be any */ + { + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (tle->resjunk && IsA(tle->expr, SubLink)) + { + SubLink *sl = (SubLink *) tle->expr; + + if (sl->subLinkType == MULTIEXPR_SUBLINK) + { + ma_sublinks = lappend(ma_sublinks, sl); + Assert(sl->subLinkId == list_length(ma_sublinks)); + } + } + } + } + next_ma_cell = list_head(ma_sublinks); + cur_ma_sublink = NULL; + remaining_ma_columns = 0; + + /* Add the comma separated list of 'attname = value' */ + sep = ""; + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + Node *expr; + + if (tle->resjunk) + continue; /* ignore junk entries */ + + /* Emit separator (OK whether we're in multiassignment or not) */ + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Check to see if we're starting a multiassignment group: if so, + * output a left paren. + */ + if (next_ma_cell != NULL && cur_ma_sublink == NULL) + { + /* + * We must dig down into the expr to see if it's a PARAM_MULTIEXPR + * Param. That could be buried under FieldStores and ArrayRefs + * and CoerceToDomains (cf processIndirection()), and underneath + * those there could be an implicit type coercion. Because we + * would ignore implicit type coercions anyway, we don't need to + * be as careful as processIndirection() is about descending past + * implicit CoerceToDomains. + */ + expr = (Node *) tle->expr; + while (expr) + { + if (IsA(expr, FieldStore)) + { + FieldStore *fstore = (FieldStore *) expr; + + expr = (Node *) linitial(fstore->newvals); + } + else if (IsA(expr, ArrayRef)) + { + ArrayRef *aref = (ArrayRef *) expr; + + if (aref->refassgnexpr == NULL) + break; + expr = (Node *) aref->refassgnexpr; + } + else if (IsA(expr, CoerceToDomain)) + { + CoerceToDomain *cdomain = (CoerceToDomain *) expr; + + if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) + break; + expr = (Node *) cdomain->arg; + } + else + break; + } + expr = strip_implicit_coercions(expr); + + if (expr && IsA(expr, Param) && + ((Param *) expr)->paramkind == PARAM_MULTIEXPR) + { + cur_ma_sublink = (SubLink *) lfirst(next_ma_cell); + next_ma_cell = lnext(next_ma_cell); + remaining_ma_columns = count_nonjunk_tlist_entries( + ((Query *) cur_ma_sublink->subselect)->targetList); + Assert(((Param *) expr)->paramid == + ((cur_ma_sublink->subLinkId << 16) | 1)); + appendStringInfoChar(buf, '('); + } + } + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf, + quote_identifier(get_relid_attribute_name(rte->relid, + tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + */ + expr = processIndirection((Node *) tle->expr, context); + + /* + * If we're in a multiassignment, skip printing anything more, unless + * this is the last column; in which case, what we print should be the + * sublink, not the Param. 
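+ * That way the whole group deparses back as "(a, b) = (SELECT ...)"
+ * rather than as individual PARAM_MULTIEXPR references.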
+ */ + if (cur_ma_sublink != NULL) + { + if (--remaining_ma_columns > 0) + continue; /* not the last column of multiassignment */ + appendStringInfoChar(buf, ')'); + expr = (Node *) cur_ma_sublink; + cur_ma_sublink = NULL; + } + + appendStringInfoString(buf, " = "); + + get_rule_expr(expr, context, false); + } +} + + +/* ---------- + * get_delete_query_def - Parse back a DELETE parsetree + * ---------- + */ +static void +get_delete_query_def(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + RangeTblEntry *rte; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* + * Start the query with DELETE FROM relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + if (PRETTY_INDENT(context)) + { + appendStringInfoChar(buf, ' '); + context->indentLevel += PRETTYINDENT_STD; + } + appendStringInfo(buf, "DELETE FROM %s%s", + only_marker(rte), + generate_relation_name(rte->relid, NIL)); + if (rte->alias != NULL) + appendStringInfo(buf, " %s", + quote_identifier(rte->alias->aliasname)); + + /* Add the USING clause if given */ + get_from_clause(query, " USING ", context); + + /* Add a WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_utility_query_def - Parse back a UTILITY parsetree + * ---------- + */ +static void +get_utility_query_def(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + + if (query->utilityStmt && IsA(query->utilityStmt, NotifyStmt)) + { + NotifyStmt *stmt = (NotifyStmt *) query->utilityStmt; + + appendContextKeyword(context, "", + 0, PRETTYINDENT_STD, 1); + appendStringInfo(buf, "NOTIFY %s", + quote_identifier(stmt->conditionname)); + if (stmt->payload) + { + appendStringInfoString(buf, ", "); + simple_quote_literal(buf, stmt->payload); + } + } +#ifdef PGXC + else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt)) + { + CreateStmt *stmt = (CreateStmt *) query->utilityStmt; + ListCell *column; + const char *delimiter = ""; + RangeVar *relation = stmt->relation; + bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP); + bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED); + + appendStringInfo(buf, "CREATE %s %s %s TABLE %s ", + stmt->islocal ? "LOCAL" : "", + istemp ? "TEMP" : "", + isunlogged ? "UNLOGGED" : "", + stmt->if_not_exists ? 
"IF NOT EXISTS " : ""); + + if (!istemp && relation->schemaname && relation->schemaname[0]) + appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname)); + appendStringInfo(buf, "%s", quote_identifier(relation->relname)); + + appendStringInfo(buf, "("); + foreach(column, stmt->tableElts) + { + Node *node = (Node *) lfirst(column); + + appendStringInfo(buf, "%s", delimiter); + delimiter = ", "; + + if (IsA(node, ColumnDef)) + { + ColumnDef *coldef = (ColumnDef *) node; + TypeName *typename = coldef->typeName; +#ifdef XCP + appendStringInfo(buf, "%s %s", + quote_identifier(coldef->colname), + format_type_with_typemod(typename->typeOid, + typename->typemod)); +#else + + /* error out if we have no recourse at all */ + if (!OidIsValid(typename->typeOid)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("improper type oid: \"%u\"", typename->typeOid))); + + /* get typename from the oid */ + type = typeidType(typename->typeOid); + + if (!HeapTupleIsValid(type)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("type \"%u\" does not exist", + typename->typeOid))); + appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname), + typeTypeName(type)); + ReleaseSysCache(type); +#endif + } + else + elog(ERROR, "Invalid table column definition."); + } + appendStringInfo(buf, ")"); + + /* Append storage parameters, like for instance WITH (OIDS) */ + if (list_length(stmt->options) > 0) + { + Datum reloptions; + static char *validnsps[] = HEAP_RELOPT_NAMESPACES; + + reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, + false, false); + + if (reloptions) + { + Datum sep, txt; + /* Below is inspired from flatten_reloptions() */ + sep = CStringGetTextDatum(", "); + txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep); + appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt)); + } + } + + /* add the on commit clauses for temporary tables */ + switch (stmt->oncommit) + { + case ONCOMMIT_NOOP: + /* do nothing */ + break; + + case ONCOMMIT_PRESERVE_ROWS: + appendStringInfo(buf, " ON COMMIT PRESERVE ROWS"); + break; + + case ONCOMMIT_DELETE_ROWS: + appendStringInfo(buf, " ON COMMIT DELETE ROWS"); + break; + + case ONCOMMIT_DROP: + appendStringInfo(buf, " ON COMMIT DROP"); + break; + } + + if (stmt->distributeby) + { + /* add the on commit clauses for temporary tables */ + switch (stmt->distributeby->disttype) + { + case DISTTYPE_REPLICATION: + appendStringInfo(buf, " DISTRIBUTE BY REPLICATION"); + break; + + case DISTTYPE_HASH: +#ifdef __COLD_HOT__ + appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", strVal(linitial(stmt->distributeby->colname))); +#else + appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname); +#endif + break; + + case DISTTYPE_ROUNDROBIN: + appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN"); + break; + + case DISTTYPE_MODULO: +#ifdef __COLD_HOT__ + appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", + quote_identifier(strVal(linitial(stmt->distributeby->colname)))); +#else + appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", + quote_identifier(stmt->distributeby->colname)); +#endif + break; + + default: + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Invalid distribution type"))); + + } + } + + if (stmt->subcluster) + { + ListCell *cell; + + switch (stmt->subcluster->clustertype) + { + case SUBCLUSTER_NODE: + appendStringInfo(buf, " TO NODE ("); + + /* Add node members */ + Assert(stmt->subcluster->members); + foreach(cell, stmt->subcluster->members) + { + appendStringInfo(buf, " %s", + 
quote_identifier(strVal(lfirst(cell)))); + if (cell->next) + appendStringInfo(buf, ","); + } + appendStringInfo(buf, ")"); + break; + + case SUBCLUSTER_GROUP: + appendStringInfo(buf, " TO GROUP"); + + /* Add group members */ + Assert(stmt->subcluster->members); + foreach(cell, stmt->subcluster->members) + { + appendStringInfo(buf, " %s", + quote_identifier(strVal(lfirst(cell)))); + if (cell->next) + appendStringInfo(buf, ","); + } + break; + + case SUBCLUSTER_NONE: + default: + /* Nothing to do */ + break; + } + } + } +#endif + else + { + /* Currently only NOTIFY utility commands can appear in rules */ + elog(ERROR, "unexpected utility statement type"); + } +} + +/* + * Display a Var appropriately. + * + * In some cases (currently only when recursing into an unnamed join) + * the Var's varlevelsup has to be interpreted with respect to a context + * above the current one; levelsup indicates the offset. + * + * If istoplevel is TRUE, the Var is at the top level of a SELECT's + * targetlist, which means we need special treatment of whole-row Vars. + * Instead of the normal "tab.*", we'll print "tab.*::typename", which is a + * dirty hack to prevent "tab.*" from being expanded into multiple columns. + * (The parser will strip the useless coercion, so no inefficiency is added in + * dump and reload.) We used to print just "tab" in such cases, but that is + * ambiguous and will yield the wrong result if "tab" is also a plain column + * name in the query. + * + * Returns the attname of the Var, or NULL if the Var has no attname (because + * it is a whole-row Var or a subplan output reference). + */ +static char * +get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *rte; + AttrNumber attnum; + int netlevelsup; + deparse_namespace *dpns; + deparse_columns *colinfo; + char *refname; + char *attname; + + /* Find appropriate nesting depth */ + netlevelsup = var->varlevelsup + levelsup; + if (netlevelsup >= list_length(context->namespaces)) + elog(ERROR, "bogus varlevelsup: %d offset %d", + var->varlevelsup, levelsup); + dpns = (deparse_namespace *) list_nth(context->namespaces, + netlevelsup); + + /* + * Try to find the relevant RTE in this rtable. In a plan tree, it's + * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig + * down into the subplans, or INDEX_VAR, which is resolved similarly. Also + * find the aliases previously assigned for this RTE. + */ + if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) + { + rte = rt_fetch(var->varno, dpns->rtable); + refname = (char *) list_nth(dpns->rtable_names, var->varno - 1); + colinfo = deparse_columns_fetch(var->varno, dpns); + attnum = var->varattno; + } + else + { + resolve_special_varno((Node *) var, context, NULL, + get_special_variable); + return NULL; + } + + /* + * The planner will sometimes emit Vars referencing resjunk elements of a + * subquery's target list (this is currently only possible if it chooses + * to generate a "physical tlist" for a SubqueryScan or CteScan node). + * Although we prefer to print subquery-referencing Vars using the + * subquery's alias, that's not possible for resjunk items since they have + * no alias. So in that case, drill down to the subplan and print the + * contents of the referenced tlist item. This works because in a plan + * tree, such Vars can only occur in a SubqueryScan or CteScan node, and + * we'll have set dpns->inner_planstate to reference the child plan node. 
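+ * (One way to reach this path is EXPLAIN VERBOSE of a plan whose
+ * SubqueryScan keeps a resjunk sort column in its physical tlist.)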
+ */ + if ((rte->rtekind == RTE_SUBQUERY || rte->rtekind == RTE_CTE) && + attnum > list_length(rte->eref->colnames) && + dpns->inner_planstate) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "invalid attnum %d for relation \"%s\"", + var->varattno, rte->eref->aliasname); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + /* + * Force parentheses because our caller probably assumed a Var is a + * simple expression. + */ + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tle->expr, context, true); + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, ')'); + + pop_child_plan(dpns, &save_dpns); + return NULL; + } + +#ifdef PGXC + if (rte->rtekind == RTE_REMOTE_DUMMY && + attnum > list_length(rte->eref->colnames) && + dpns->planstate) + { + TargetEntry *tle; + RemoteQuery *rqplan; + Assert(IsA(dpns->planstate, RemoteQueryState)); + Assert(netlevelsup == 0); + + /* + * Get the expression representing the given Var from base_tlist of the + * RemoteQuery + */ + rqplan = (RemoteQuery *)dpns->planstate->plan; + Assert(IsA(rqplan, RemoteQuery)); + tle = get_tle_by_resno(rqplan->base_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for remotequery var: %d", var->varattno); + /* + * Force parentheses because our caller probably assumed a Var is a + * simple expression. + */ + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tle->expr, context, true); + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, ')'); + + return NULL; + } +#endif /* PGXC */ + + /* + * If it's an unnamed join, look at the expansion of the alias variable. + * If it's a simple reference to one of the input vars, then recursively + * print the name of that var instead. When it's not a simple reference, + * we have to just print the unqualified join column name. (This can only + * happen with "dangerous" merged columns in a JOIN USING; we took pains + * previously to make the unqualified column name unique in such cases.) + * + * This wouldn't work in decompiling plan trees, because we don't store + * joinaliasvars lists after planning; but a plan tree should never + * contain a join alias variable. + */ + if (rte->rtekind == RTE_JOIN && rte->alias == NULL) + { + if (rte->joinaliasvars == NIL) + elog(ERROR, "cannot decompile join alias var in plan tree"); + if (attnum > 0) + { + Var *aliasvar; + + aliasvar = (Var *) list_nth(rte->joinaliasvars, attnum - 1); + /* we intentionally don't strip implicit coercions here */ + if (aliasvar && IsA(aliasvar, Var)) + { + return get_variable(aliasvar, var->varlevelsup + levelsup, + istoplevel, context); + } + } + + /* + * Unnamed join has no refname. (Note: since it's unnamed, there is + * no way the user could have referenced it to create a whole-row Var + * for it. So we don't have to cover that case below.) + */ + Assert(refname == NULL); + } + + if (attnum == InvalidAttrNumber) + attname = NULL; + else if (attnum > 0) + { + /* Get column name to use from the colinfo struct */ + if (attnum > colinfo->num_cols) + elog(ERROR, "invalid attnum %d for relation \"%s\"", + attnum, rte->eref->aliasname); + attname = colinfo->colnames[attnum - 1]; + if (attname == NULL) /* dropped column? 
*/ + elog(ERROR, "invalid attnum %d for relation \"%s\"", + attnum, rte->eref->aliasname); + } + else + { + /* System column - name is fixed, get it from the catalog */ + attname = get_rte_attribute_name(rte, attnum); + } + + if (refname && (context->varprefix || attname == NULL)) + { + appendStringInfoString(buf, quote_identifier(refname)); + appendStringInfoChar(buf, '.'); + } + if (attname) + appendStringInfoString(buf, quote_identifier(attname)); + else + { + appendStringInfoChar(buf, '*'); + if (istoplevel) + appendStringInfo(buf, "::%s", + format_type_with_typemod(var->vartype, + var->vartypmod)); + } + + return attname; +} + +/* + * Deparse a Var which references OUTER_VAR, INNER_VAR, or INDEX_VAR. This + * routine is actually a callback for get_special_varno, which handles finding + * the correct TargetEntry. We get the expression contained in that + * TargetEntry and just need to deparse it, a job we can throw back on + * get_rule_expr. + */ +static void +get_special_variable(Node *node, deparse_context *context, void *private) +{ + StringInfo buf = context->buf; + + /* + * Force parentheses because our caller probably assumed a Var is a simple + * expression. + */ + if (!IsA(node, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr(node, context, true); + if (!IsA(node, Var)) + appendStringInfoChar(buf, ')'); +} + +/* + * Chase through plan references to special varnos (OUTER_VAR, INNER_VAR, + * INDEX_VAR) until we find a real Var or some kind of non-Var node; then, + * invoke the callback provided. + */ +static void +resolve_special_varno(Node *node, deparse_context *context, void *private, + void (*callback) (Node *, deparse_context *, void *)) +{// #lizard forgives + Var *var; + deparse_namespace *dpns; + + /* If it's not a Var, invoke the callback. */ + if (!IsA(node, Var)) + { + callback(node, context, private); + return; + } + + /* Find appropriate nesting depth */ + var = (Var *) node; + dpns = (deparse_namespace *) list_nth(context->namespaces, + var->varlevelsup); + + /* + * It's a special RTE, so recurse. + */ + if (var->varno == OUTER_VAR && dpns->outer_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); + + push_child_plan(dpns, dpns->outer_planstate, &save_dpns); + resolve_special_varno((Node *) tle->expr, context, private, callback); + pop_child_plan(dpns, &save_dpns); + return; + } + else if (var->varno == INNER_VAR && dpns->inner_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); + + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + resolve_special_varno((Node *) tle->expr, context, private, callback); + pop_child_plan(dpns, &save_dpns); + return; + } + else if (var->varno == INDEX_VAR && dpns->index_tlist) + { + TargetEntry *tle; + + tle = get_tle_by_resno(dpns->index_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); + + resolve_special_varno((Node *) tle->expr, context, private, callback); + return; + } + else if (var->varno < 1 || var->varno > list_length(dpns->rtable)) + elog(ERROR, "bogus varno: %d", var->varno); + + /* Not special. Just invoke the callback. */ + callback(node, context, private); +} + +/* + * Get the name of a field of an expression of composite type. 
The + * expression is usually a Var, but we handle other cases too. + * + * levelsup is an extra offset to interpret the Var's varlevelsup correctly. + * + * This is fairly straightforward when the expression has a named composite + * type; we need only look up the type in the catalogs. However, the type + * could also be RECORD. Since no actual table or view column is allowed to + * have type RECORD, a Var of type RECORD must refer to a JOIN or FUNCTION RTE + * or to a subquery output. We drill down to find the ultimate defining + * expression and attempt to infer the field name from it. We ereport if we + * can't determine the name. + * + * Similarly, a PARAM of type RECORD has to refer to some expression of + * a determinable composite type. + */ +static const char * +get_name_for_var_field(Var *var, int fieldno, + int levelsup, deparse_context *context) +{// #lizard forgives + RangeTblEntry *rte; + AttrNumber attnum; + int netlevelsup; + deparse_namespace *dpns; + TupleDesc tupleDesc; + Node *expr; + + /* + * If it's a RowExpr that was expanded from a whole-row Var, use the + * column names attached to it. + */ + if (IsA(var, RowExpr)) + { + RowExpr *r = (RowExpr *) var; + + if (fieldno > 0 && fieldno <= list_length(r->colnames)) + return strVal(list_nth(r->colnames, fieldno - 1)); + } + + /* + * If it's a Param of type RECORD, try to find what the Param refers to. + */ + if (IsA(var, Param)) + { + Param *param = (Param *) var; + ListCell *ancestor_cell; + + expr = find_param_referent(param, context, &dpns, &ancestor_cell); + if (expr) + { + /* Found a match, so recurse to decipher the field name */ + deparse_namespace save_dpns; + const char *result; + + push_ancestor_plan(dpns, ancestor_cell, &save_dpns); + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + pop_ancestor_plan(dpns, &save_dpns); + return result; + } + } + + /* + * If it's a Var of type RECORD, we have to find what the Var refers to; + * if not, we can use get_expr_result_type. If that fails, we try + * lookup_rowtype_tupdesc, which will probably fail too, but will ereport + * an acceptable message. + */ + if (!IsA(var, Var) || + var->vartype != RECORDOID) + { + if (get_expr_result_type((Node *) var, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) + tupleDesc = lookup_rowtype_tupdesc_copy(exprType((Node *) var), + exprTypmod((Node *) var)); + Assert(tupleDesc); + /* Got the tupdesc, so we can extract the field name */ + Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); + return NameStr(tupleDesc->attrs[fieldno - 1]->attname); + } + + /* Find appropriate nesting depth */ + netlevelsup = var->varlevelsup + levelsup; + if (netlevelsup >= list_length(context->namespaces)) + elog(ERROR, "bogus varlevelsup: %d offset %d", + var->varlevelsup, levelsup); + dpns = (deparse_namespace *) list_nth(context->namespaces, + netlevelsup); + + /* + * Try to find the relevant RTE in this rtable. In a plan tree, it's + * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig + * down into the subplans, or INDEX_VAR, which is resolved similarly. 
+ */ + if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) + { + rte = rt_fetch(var->varno, dpns->rtable); + attnum = var->varattno; + } + else if (var->varno == OUTER_VAR && dpns->outer_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->outer_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + else if (var->varno == INNER_VAR && dpns->inner_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + else if (var->varno == INDEX_VAR && dpns->index_tlist) + { + TargetEntry *tle; + const char *result; + + tle = get_tle_by_resno(dpns->index_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + return result; + } + else + { + elog(ERROR, "bogus varno: %d", var->varno); + return NULL; /* keep compiler quiet */ + } + + if (attnum == InvalidAttrNumber) + { + /* Var is whole-row reference to RTE, so select the right field */ + return get_rte_attribute_name(rte, fieldno); + } + + /* + * This part has essentially the same logic as the parser's + * expandRecordVariable() function, but we are dealing with a different + * representation of the input context, and we only need one field name + * not a TupleDesc. Also, we need special cases for finding subquery and + * CTE subplans when deparsing Plan trees. + */ + expr = (Node *) var; /* default if we can't drill down */ + + switch (rte->rtekind) + { + case RTE_RELATION: + case RTE_VALUES: + case RTE_NAMEDTUPLESTORE: + + /* + * This case should not occur: a column of a table or values list + * shouldn't have type RECORD. Fall through and fail (most + * likely) at the bottom. + */ + break; + case RTE_SUBQUERY: + /* Subselect-in-FROM: examine sub-select's output expr */ + { + if (rte->subquery) + { + TargetEntry *ste = get_tle_by_resno(rte->subquery->targetList, + attnum); + + if (ste == NULL || ste->resjunk) + elog(ERROR, "subquery %s does not have attribute %d", + rte->eref->aliasname, attnum); + expr = (Node *) ste->expr; + if (IsA(expr, Var)) + { + /* + * Recurse into the sub-select to see what its Var + * refers to. We have to build an additional level of + * namespace to keep in step with varlevelsup in the + * subselect. 
+ */ + deparse_namespace mydpns; + const char *result; + + set_deparse_for_query(&mydpns, rte->subquery, + context->namespaces); + + context->namespaces = lcons(&mydpns, + context->namespaces); + + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + + context->namespaces = + list_delete_first(context->namespaces); + + return result; + } + /* else fall through to inspect the expression */ + } + else + { + /* + * We're deparsing a Plan tree so we don't have complete + * RTE entries (in particular, rte->subquery is NULL). But + * the only place we'd see a Var directly referencing a + * SUBQUERY RTE is in a SubqueryScan plan node, and we can + * look into the child plan's tlist instead. + */ + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + if (!dpns->inner_planstate) + elog(ERROR, "failed to find plan for subquery %s", + rte->eref->aliasname); + tle = get_tle_by_resno(dpns->inner_tlist, attnum); + if (!tle) + elog(ERROR, "bogus varattno for subquery var: %d", + attnum); + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + } + break; + case RTE_JOIN: + /* Join RTE --- recursively inspect the alias variable */ + if (rte->joinaliasvars == NIL) + elog(ERROR, "cannot decompile join alias var in plan tree"); + Assert(attnum > 0 && attnum <= list_length(rte->joinaliasvars)); + expr = (Node *) list_nth(rte->joinaliasvars, attnum - 1); + Assert(expr != NULL); + /* we intentionally don't strip implicit coercions here */ + if (IsA(expr, Var)) + return get_name_for_var_field((Var *) expr, fieldno, + var->varlevelsup + levelsup, + context); + /* else fall through to inspect the expression */ + break; + case RTE_FUNCTION: + case RTE_TABLEFUNC: + + /* + * We couldn't get here unless a function is declared with one of + * its result columns as RECORD, which is not allowed. + */ + break; + case RTE_CTE: + /* CTE reference: examine subquery's output expr */ + { + CommonTableExpr *cte = NULL; + Index ctelevelsup; + ListCell *lc; + + /* + * Try to find the referenced CTE using the namespace stack. + */ + ctelevelsup = rte->ctelevelsup + netlevelsup; + if (ctelevelsup >= list_length(context->namespaces)) + lc = NULL; + else + { + deparse_namespace *ctedpns; + + ctedpns = (deparse_namespace *) + list_nth(context->namespaces, ctelevelsup); + foreach(lc, ctedpns->ctes) + { + cte = (CommonTableExpr *) lfirst(lc); + if (strcmp(cte->ctename, rte->ctename) == 0) + break; + } + } + if (lc != NULL) + { + Query *ctequery = (Query *) cte->ctequery; + TargetEntry *ste = get_tle_by_resno(GetCTETargetList(cte), + attnum); + + if (ste == NULL || ste->resjunk) + elog(ERROR, "subquery %s does not have attribute %d", + rte->eref->aliasname, attnum); + expr = (Node *) ste->expr; + if (IsA(expr, Var)) + { + /* + * Recurse into the CTE to see what its Var refers to. + * We have to build an additional level of namespace + * to keep in step with varlevelsup in the CTE. + * Furthermore it could be an outer CTE, so we may + * have to delete some levels of namespace. 
+ */ + List *save_nslist = context->namespaces; + List *new_nslist; + deparse_namespace mydpns; + const char *result; + + set_deparse_for_query(&mydpns, ctequery, + context->namespaces); + + new_nslist = list_copy_tail(context->namespaces, + ctelevelsup); + context->namespaces = lcons(&mydpns, new_nslist); + + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + + context->namespaces = save_nslist; + + return result; + } + /* else fall through to inspect the expression */ + } + else + { + /* + * We're deparsing a Plan tree so we don't have a CTE + * list. But the only place we'd see a Var directly + * referencing a CTE RTE is in a CteScan plan node, and we + * can look into the subplan's tlist instead. + */ + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + if (!dpns->inner_planstate) + elog(ERROR, "failed to find plan for CTE %s", + rte->eref->aliasname); + tle = get_tle_by_resno(dpns->inner_tlist, attnum); + if (!tle) + elog(ERROR, "bogus varattno for subquery var: %d", + attnum); + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + } + break; +#ifdef PGXC + case RTE_REMOTE_DUMMY: + elog(ERROR, "Invalid RTE found"); + break; +#endif /* PGXC */ + } + + /* + * We now have an expression we can't expand any more, so see if + * get_expr_result_type() can do anything with it. If not, pass to + * lookup_rowtype_tupdesc() which will probably fail, but will give an + * appropriate error message while failing. + */ + if (get_expr_result_type(expr, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) + tupleDesc = lookup_rowtype_tupdesc_copy(exprType(expr), + exprTypmod(expr)); + Assert(tupleDesc); + /* Got the tupdesc, so we can extract the field name */ + Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); + return NameStr(tupleDesc->attrs[fieldno - 1]->attname); +} + +/* + * Try to find the referenced expression for a PARAM_EXEC Param that might + * reference a parameter supplied by an upper NestLoop or SubPlan plan node. + * + * If successful, return the expression and set *dpns_p and *ancestor_cell_p + * appropriately for calling push_ancestor_plan(). If no referent can be + * found, return NULL. + */ +static Node * +find_param_referent(Param *param, deparse_context *context, + deparse_namespace **dpns_p, ListCell **ancestor_cell_p) +{// #lizard forgives + /* Initialize output parameters to prevent compiler warnings */ + *dpns_p = NULL; + *ancestor_cell_p = NULL; + + /* + * If it's a PARAM_EXEC parameter, look for a matching NestLoopParam or + * SubPlan argument. This will necessarily be in some ancestor of the + * current expression's PlanState. + */ + if (param->paramkind == PARAM_EXEC) + { + deparse_namespace *dpns; + PlanState *child_ps; + bool in_same_plan_level; + ListCell *lc; + + dpns = (deparse_namespace *) linitial(context->namespaces); + child_ps = dpns->planstate; + in_same_plan_level = true; + + foreach(lc, dpns->ancestors) + { + PlanState *ps = (PlanState *) lfirst(lc); + ListCell *lc2; + + /* + * NestLoops transmit params to their inner child only; also, once + * we've crawled up out of a subplan, this couldn't possibly be + * the right match. 
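+ * This is what lets EXPLAIN show a parameterized inner qual as, say,
+ * "Index Cond: (t2.x = t1.x)" rather than a bare "$0" reference.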
+ */ + if (IsA(ps, NestLoopState) && + child_ps == innerPlanState(ps) && + in_same_plan_level) + { + NestLoop *nl = (NestLoop *) ps->plan; + + foreach(lc2, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc2); + + if (nlp->paramno == param->paramid) + { + /* Found a match, so return it */ + *dpns_p = dpns; + *ancestor_cell_p = lc; + return (Node *) nlp->paramval; + } + } + } + + /* + * Check to see if we're crawling up from a subplan. + */ + foreach(lc2, ps->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(lc2); + SubPlan *subplan = sstate->subplan; + ListCell *lc3; + ListCell *lc4; + + if (child_ps != sstate->planstate) + continue; + + /* Matched subplan, so check its arguments */ + forboth(lc3, subplan->parParam, lc4, subplan->args) + { + int paramid = lfirst_int(lc3); + Node *arg = (Node *) lfirst(lc4); + + if (paramid == param->paramid) + { + /* Found a match, so return it */ + *dpns_p = dpns; + *ancestor_cell_p = lc; + return arg; + } + } + + /* Keep looking, but we are emerging from a subplan. */ + in_same_plan_level = false; + break; + } + + /* + * Likewise check to see if we're emerging from an initplan. + * Initplans never have any parParams, so no need to search that + * list, but we need to know if we should reset + * in_same_plan_level. + */ + foreach(lc2, ps->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(lc2); + + if (child_ps != sstate->planstate) + continue; + + /* No parameters to be had here. */ + Assert(sstate->subplan->parParam == NIL); + + /* Keep looking, but we are emerging from an initplan. */ + in_same_plan_level = false; + break; + } + + /* No luck, crawl up to next ancestor */ + child_ps = ps; + } + } + + /* No referent found */ + return NULL; +} + +/* + * Display a Param appropriately. + */ +static void +get_parameter(Param *param, deparse_context *context) +{// #lizard forgives + Node *expr; + deparse_namespace *dpns; + ListCell *ancestor_cell; + + /* + * If it's a PARAM_EXEC parameter, try to locate the expression from which + * the parameter was computed. Note that failing to find a referent isn't + * an error, since the Param might well be a subplan output rather than an + * input. + */ + expr = find_param_referent(param, context, &dpns, &ancestor_cell); + if (expr) + { + /* Found a match, so print it */ + deparse_namespace save_dpns; + bool save_varprefix; + bool need_paren; + + /* Switch attention to the ancestor plan node */ + push_ancestor_plan(dpns, ancestor_cell, &save_dpns); + + /* + * Force prefixing of Vars, since they won't belong to the relation + * being scanned in the original plan node. + */ + save_varprefix = context->varprefix; + context->varprefix = true; + + /* + * A Param's expansion is typically a Var, Aggref, or upper-level + * Param, which wouldn't need extra parentheses. Otherwise, insert + * parens to ensure the expression looks atomic. + */ + need_paren = !(IsA(expr, Var) || + IsA(expr, Aggref) || + IsA(expr, Param)); + if (need_paren) + appendStringInfoChar(context->buf, '('); + + get_rule_expr(expr, context, false); + + if (need_paren) + appendStringInfoChar(context->buf, ')'); + + context->varprefix = save_varprefix; + + pop_ancestor_plan(dpns, &save_dpns); + + return; + } + + /* + * Not PARAM_EXEC, or couldn't find referent: just print $N. 
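+ * An external parameter thus comes out simply as "$1"; the __TBASE__
+ * case below may additionally append an explicit cast, e.g. "$1::integer".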
+ */ + appendStringInfo(context->buf, "$%d", param->paramid); + +#ifdef __TBASE__ + /* param need explicit cast */ + if (param->explicit_cast) + { + appendStringInfo(context->buf, "::%s", + format_type_with_typemod(param->paramtype, param->paramtypmod)); + } +#endif +} + +/* + * get_simple_binary_op_name + * + * helper function for isSimpleNode + * will return single char binary operator name, or NULL if it's not + */ +static const char * +get_simple_binary_op_name(OpExpr *expr) +{ + List *args = expr->args; + + if (list_length(args) == 2) + { + /* binary operator */ + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + const char *op; + + op = generate_operator_name(expr->opno, exprType(arg1), exprType(arg2)); + if (strlen(op) == 1) + return op; + } + return NULL; +} + + +/* + * isSimpleNode - check if given node is simple (doesn't need parenthesizing) + * + * true : simple in the context of parent node's type + * false : not simple + */ +static bool +isSimpleNode(Node *node, Node *parentNode, int prettyFlags) +{// #lizard forgives + if (!node) + return false; + + switch (nodeTag(node)) + { + case T_Var: + case T_Const: + case T_Param: + case T_CoerceToDomainValue: + case T_SetToDefault: + case T_CurrentOfExpr: + /* single words: always simple */ + return true; + + case T_ArrayRef: + case T_ArrayExpr: + case T_RowExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_SQLValueFunction: + case T_XmlExpr: + case T_NextValueExpr: + case T_NullIfExpr: + case T_Aggref: + case T_WindowFunc: + case T_FuncExpr: + /* function-like: name(..) or name[..] */ + return true; + + /* CASE keywords act as parentheses */ + case T_CaseExpr: + return true; + + case T_FieldSelect: + + /* + * appears simple since . has top precedence, unless parent is + * T_FieldSelect itself! + */ + return (IsA(parentNode, FieldSelect) ? false : true); + + case T_FieldStore: + + /* + * treat like FieldSelect (probably doesn't matter) + */ + return (IsA(parentNode, FieldStore) ? 
false : true); + + case T_CoerceToDomain: + /* maybe simple, check args */ + return isSimpleNode((Node *) ((CoerceToDomain *) node)->arg, + node, prettyFlags); + case T_RelabelType: + return isSimpleNode((Node *) ((RelabelType *) node)->arg, + node, prettyFlags); + case T_CoerceViaIO: + return isSimpleNode((Node *) ((CoerceViaIO *) node)->arg, + node, prettyFlags); + case T_ArrayCoerceExpr: + return isSimpleNode((Node *) ((ArrayCoerceExpr *) node)->arg, + node, prettyFlags); + case T_ConvertRowtypeExpr: + return isSimpleNode((Node *) ((ConvertRowtypeExpr *) node)->arg, + node, prettyFlags); + + case T_OpExpr: + { + /* depends on parent node type; needs further checking */ + if (prettyFlags & PRETTYFLAG_PAREN && IsA(parentNode, OpExpr)) + { + const char *op; + const char *parentOp; + bool is_lopriop; + bool is_hipriop; + bool is_lopriparent; + bool is_hipriparent; + + op = get_simple_binary_op_name((OpExpr *) node); + if (!op) + return false; + + /* We know only the basic operators + - and * / % */ + is_lopriop = (strchr("+-", *op) != NULL); + is_hipriop = (strchr("*/%", *op) != NULL); + if (!(is_lopriop || is_hipriop)) + return false; + + parentOp = get_simple_binary_op_name((OpExpr *) parentNode); + if (!parentOp) + return false; + + is_lopriparent = (strchr("+-", *parentOp) != NULL); + is_hipriparent = (strchr("*/%", *parentOp) != NULL); + if (!(is_lopriparent || is_hipriparent)) + return false; + + if (is_hipriop && is_lopriparent) + return true; /* op binds tighter than parent */ + + if (is_lopriop && is_hipriparent) + return false; + + /* + * Operators are same priority --- can skip parens only if + * we have (a - b) - c, not a - (b - c). + */ + if (node == (Node *) linitial(((OpExpr *) parentNode)->args)) + return true; + + return false; + } + /* else do the same stuff as for T_SubLink et al. 
*/ + /* FALL THROUGH */ + } + + case T_SubLink: + case T_NullTest: + case T_BooleanTest: + case T_DistinctExpr: + switch (nodeTag(parentNode)) + { + case T_FuncExpr: + { + /* special handling for casts */ + CoercionForm type = ((FuncExpr *) parentNode)->funcformat; + + if (type == COERCE_EXPLICIT_CAST || + type == COERCE_IMPLICIT_CAST) + return false; + return true; /* own parentheses */ + } + case T_BoolExpr: /* lower precedence */ + case T_ArrayRef: /* other separators */ + case T_ArrayExpr: /* other separators */ + case T_RowExpr: /* other separators */ + case T_CoalesceExpr: /* own parentheses */ + case T_MinMaxExpr: /* own parentheses */ + case T_XmlExpr: /* own parentheses */ + case T_NullIfExpr: /* other separators */ + case T_Aggref: /* own parentheses */ + case T_WindowFunc: /* own parentheses */ + case T_CaseExpr: /* other separators */ + return true; + default: + return false; + } + + case T_BoolExpr: + switch (nodeTag(parentNode)) + { + case T_BoolExpr: + if (prettyFlags & PRETTYFLAG_PAREN) + { + BoolExprType type; + BoolExprType parentType; + + type = ((BoolExpr *) node)->boolop; + parentType = ((BoolExpr *) parentNode)->boolop; + switch (type) + { + case NOT_EXPR: + case AND_EXPR: + if (parentType == AND_EXPR || parentType == OR_EXPR) + return true; + break; + case OR_EXPR: + if (parentType == OR_EXPR) + return true; + break; + } + } + return false; + case T_FuncExpr: + { + /* special handling for casts */ + CoercionForm type = ((FuncExpr *) parentNode)->funcformat; + + if (type == COERCE_EXPLICIT_CAST || + type == COERCE_IMPLICIT_CAST) + return false; + return true; /* own parentheses */ + } + case T_ArrayRef: /* other separators */ + case T_ArrayExpr: /* other separators */ + case T_RowExpr: /* other separators */ + case T_CoalesceExpr: /* own parentheses */ + case T_MinMaxExpr: /* own parentheses */ + case T_XmlExpr: /* own parentheses */ + case T_NullIfExpr: /* other separators */ + case T_Aggref: /* own parentheses */ + case T_WindowFunc: /* own parentheses */ + case T_CaseExpr: /* other separators */ + return true; + default: + return false; + } + + default: + break; + } + /* those we don't know: in dubio complexo */ + return false; +} + + +/* + * appendContextKeyword - append a keyword to buffer + * + * If prettyPrint is enabled, perform a line break, and adjust indentation. + * Otherwise, just append the keyword. + */ +static void +appendContextKeyword(deparse_context *context, const char *str, + int indentBefore, int indentAfter, int indentPlus) +{ + StringInfo buf = context->buf; + + if (PRETTY_INDENT(context)) + { + int indentAmount; + + context->indentLevel += indentBefore; + + /* remove any trailing spaces currently in the buffer ... */ + removeStringInfoSpaces(buf); + /* ... then add a newline and some spaces */ + appendStringInfoChar(buf, '\n'); + + if (context->indentLevel < PRETTYINDENT_LIMIT) + indentAmount = Max(context->indentLevel, 0) + indentPlus; + else + { + /* + * If we're indented more than PRETTYINDENT_LIMIT characters, try + * to conserve horizontal space by reducing the per-level + * indentation. For best results the scale factor here should + * divide all the indent amounts that get added to indentLevel + * (PRETTYINDENT_STD, etc). It's important that the indentation + * not grow unboundedly, else deeply-nested trees use O(N^2) + * whitespace; so we also wrap modulo PRETTYINDENT_LIMIT. 
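+ * As a worked example, assuming the usual PRETTYINDENT_STD = 8 and
+ * PRETTYINDENT_LIMIT = 40, an indentLevel of 120 gives
+ * 40 + (120 - 40) / 4 = 60, which wraps to 20 columns before indentPlus
+ * is added.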
+ */ + indentAmount = PRETTYINDENT_LIMIT + + (context->indentLevel - PRETTYINDENT_LIMIT) / + (PRETTYINDENT_STD / 2); + indentAmount %= PRETTYINDENT_LIMIT; + /* scale/wrap logic affects indentLevel, but not indentPlus */ + indentAmount += indentPlus; + } + appendStringInfoSpaces(buf, indentAmount); + + appendStringInfoString(buf, str); + + context->indentLevel += indentAfter; + if (context->indentLevel < 0) + context->indentLevel = 0; + } + else + appendStringInfoString(buf, str); +} + +/* + * removeStringInfoSpaces - delete trailing spaces from a buffer. + * + * Possibly this should move to stringinfo.c at some point. + */ +static void +removeStringInfoSpaces(StringInfo str) +{ + while (str->len > 0 && str->data[str->len - 1] == ' ') + str->data[--(str->len)] = '\0'; +} + + +/* + * get_rule_expr_paren - deparse expr using get_rule_expr, + * embracing the string with parentheses if necessary for prettyPrint. + * + * Never embrace if prettyFlags=0, because it's done in the calling node. + * + * Any node that does *not* embrace its argument node by sql syntax (with + * parentheses, non-operator keywords like CASE/WHEN/ON, or comma etc) should + * use get_rule_expr_paren instead of get_rule_expr so parentheses can be + * added. + */ +static void +get_rule_expr_paren(Node *node, deparse_context *context, + bool showimplicit, Node *parentNode) +{ + bool need_paren; + + need_paren = PRETTY_PAREN(context) && + !isSimpleNode(node, parentNode, context->prettyFlags); + + if (need_paren) + appendStringInfoChar(context->buf, '('); + + get_rule_expr(node, context, showimplicit); + + if (need_paren) + appendStringInfoChar(context->buf, ')'); +} + + +/* ---------- + * get_rule_expr - Parse back an expression + * + * Note: showimplicit determines whether we display any implicit cast that + * is present at the top of the expression tree. It is a passed argument, + * not a field of the context struct, because we change the value as we + * recurse down into the expression. In general we suppress implicit casts + * when the result type is known with certainty (eg, the arguments of an + * OR must be boolean). We display implicit casts for arguments of functions + * and operators, since this is needed to be certain that the same function + * or operator will be chosen when the expression is re-parsed. + * ---------- + */ +static void +get_rule_expr(Node *node, deparse_context *context, + bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + + if (node == NULL) + return; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + /* + * Each level of get_rule_expr must emit an indivisible term + * (parenthesized if necessary) to ensure result is reparsed into the same + * expression tree. The only exception is that when the input is a List, + * we emit the component items comma-separated with no surrounding + * decoration; this is convenient for most callers. 
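+ * The GroupingFunc case below relies on this: it hands gexpr->args (a
+ * List) straight to get_rule_expr, so "GROUPING(a, b)" comes out with a
+ * plain comma-separated argument list.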
+ */ + switch (nodeTag(node)) + { + case T_Var: + (void) get_variable((Var *) node, 0, false, context); + break; + + case T_Const: + get_const_expr((Const *) node, context, 0); + break; + + case T_Param: + get_parameter((Param *) node, context); + break; + + case T_Aggref: + get_agg_expr((Aggref *) node, context, (Aggref *) node); + break; + + case T_GroupingFunc: + { + GroupingFunc *gexpr = (GroupingFunc *) node; + + appendStringInfoString(buf, "GROUPING("); + get_rule_expr((Node *) gexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_WindowFunc: + get_windowfunc_expr((WindowFunc *) node, context); + break; + + case T_ArrayRef: + { + ArrayRef *aref = (ArrayRef *) node; + bool need_parens; + + /* + * If the argument is a CaseTestExpr, we must be inside a + * FieldStore, ie, we are assigning to an element of an array + * within a composite column. Since we already punted on + * displaying the FieldStore's target information, just punt + * here too, and display only the assignment source + * expression. + */ + if (IsA(aref->refexpr, CaseTestExpr)) + { + Assert(aref->refassgnexpr); + get_rule_expr((Node *) aref->refassgnexpr, + context, showimplicit); + break; + } + + /* + * Parenthesize the argument unless it's a simple Var or a + * FieldSelect. (In particular, if it's another ArrayRef, we + * *must* parenthesize to avoid confusion.) + */ + need_parens = !IsA(aref->refexpr, Var) && + !IsA(aref->refexpr, FieldSelect); + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) aref->refexpr, context, showimplicit); + if (need_parens) + appendStringInfoChar(buf, ')'); + + /* + * If there's a refassgnexpr, we want to print the node in the + * format "array[subscripts] := refassgnexpr". This is not + * legal SQL, so decompilation of INSERT or UPDATE statements + * should always use processIndirection as part of the + * statement-level syntax. We should only see this when + * EXPLAIN tries to print the targetlist of a plan resulting + * from such a statement. + */ + if (aref->refassgnexpr) + { + Node *refassgnexpr; + + /* + * Use processIndirection to print this node's subscripts + * as well as any additional field selections or + * subscripting in immediate descendants. It returns the + * RHS expr that is actually being "assigned". 
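+ * In EXPLAIN output for such a plan this looks like, e.g.,
+ * "arr[2] := 42".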
+ */ + refassgnexpr = processIndirection(node, context); + appendStringInfoString(buf, " := "); + get_rule_expr(refassgnexpr, context, showimplicit); + } + else + { + /* Just an ordinary array fetch, so print subscripts */ + printSubscripts(aref, context); + } + } + break; + + case T_FuncExpr: + get_func_expr((FuncExpr *) node, context, showimplicit); + break; + + case T_NamedArgExpr: + { + NamedArgExpr *na = (NamedArgExpr *) node; + + appendStringInfo(buf, "%s => ", quote_identifier(na->name)); + get_rule_expr((Node *) na->arg, context, showimplicit); + } + break; + + case T_OpExpr: + get_oper_expr((OpExpr *) node, context); + break; + + case T_DistinctExpr: + { + DistinctExpr *expr = (DistinctExpr *) node; + List *args = expr->args; + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg1, context, true, node); + appendStringInfoString(buf, " IS DISTINCT FROM "); + get_rule_expr_paren(arg2, context, true, node); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_NullIfExpr: + { + NullIfExpr *nullifexpr = (NullIfExpr *) node; + + appendStringInfoString(buf, "NULLIF("); + get_rule_expr((Node *) nullifexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node; + List *args = expr->args; + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg1, context, true, node); + appendStringInfo(buf, " %s %s (", + generate_operator_name(expr->opno, + exprType(arg1), + get_base_element_type(exprType(arg2))), + expr->useOr ? "ANY" : "ALL"); + get_rule_expr_paren(arg2, context, true, node); + + /* + * There's inherent ambiguity in "x op ANY/ALL (y)" when y is + * a bare sub-SELECT. Since we're here, the sub-SELECT must + * be meant as a scalar sub-SELECT yielding an array value to + * be used in ScalarArrayOpExpr; but the grammar will + * preferentially interpret such a construct as an ANY/ALL + * SubLink. To prevent misparsing the output that way, insert + * a dummy coercion (which will be stripped by parse analysis, + * so no inefficiency is added in dump and reload). This is + * indeed most likely what the user wrote to get the construct + * accepted in the first place. 
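+ * The deparsed form therefore reads like
+ * "x = ANY ((SELECT ...)::integer[])" rather than the ambiguous
+ * "x = ANY (SELECT ...)".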
+ */ + if (IsA(arg2, SubLink) && + ((SubLink *) arg2)->subLinkType == EXPR_SUBLINK) + appendStringInfo(buf, "::%s", + format_type_with_typemod(exprType(arg2), + exprTypmod(arg2))); + appendStringInfoChar(buf, ')'); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_BoolExpr: + { + BoolExpr *expr = (BoolExpr *) node; + Node *first_arg = linitial(expr->args); + ListCell *arg = lnext(list_head(expr->args)); + + switch (expr->boolop) + { + case AND_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(first_arg, context, + false, node); + while (arg) + { + appendStringInfoString(buf, " AND "); + get_rule_expr_paren((Node *) lfirst(arg), context, + false, node); + arg = lnext(arg); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + case OR_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(first_arg, context, + false, node); + while (arg) + { + appendStringInfoString(buf, " OR "); + get_rule_expr_paren((Node *) lfirst(arg), context, + false, node); + arg = lnext(arg); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + case NOT_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + appendStringInfoString(buf, "NOT "); + get_rule_expr_paren(first_arg, context, + false, node); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + default: + elog(ERROR, "unrecognized boolop: %d", + (int) expr->boolop); + } + } + break; + + case T_SubLink: + get_sublink_expr((SubLink *) node, context); + break; + + case T_SubPlan: + { + SubPlan *subplan = (SubPlan *) node; + + /* + * We cannot see an already-planned subplan in rule deparsing, + * only while EXPLAINing a query plan. We don't try to + * reconstruct the original SQL, just reference the subplan + * that appears elsewhere in EXPLAIN's result. + */ + if (subplan->useHashTable) + appendStringInfo(buf, "(hashed %s)", subplan->plan_name); + else + appendStringInfo(buf, "(%s)", subplan->plan_name); + } + break; + + case T_AlternativeSubPlan: + { + AlternativeSubPlan *asplan = (AlternativeSubPlan *) node; + ListCell *lc; + + /* As above, this can only happen during EXPLAIN */ + appendStringInfoString(buf, "(alternatives: "); + foreach(lc, asplan->subplans) + { + SubPlan *splan = lfirst_node(SubPlan, lc); + + if (splan->useHashTable) + appendStringInfo(buf, "hashed %s", splan->plan_name); + else + appendStringInfoString(buf, splan->plan_name); + if (lnext(lc)) + appendStringInfoString(buf, " or "); + } + appendStringInfoChar(buf, ')'); + } + break; + + case T_FieldSelect: + { + FieldSelect *fselect = (FieldSelect *) node; + Node *arg = (Node *) fselect->arg; + int fno = fselect->fieldnum; + const char *fieldname; + bool need_parens; + + /* + * Parenthesize the argument unless it's an ArrayRef or + * another FieldSelect. Note in particular that it would be + * WRONG to not parenthesize a Var argument; simplicity is not + * the issue here, having the right number of names is. + */ + need_parens = !IsA(arg, ArrayRef) &&!IsA(arg, FieldSelect); + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr(arg, context, true); + if (need_parens) + appendStringInfoChar(buf, ')'); + + /* + * Get and print the field name. 
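/*
 * Illustrative sketch only (hypothetical helper, plain C strings): the
 * AND/OR branches above walk the flattened argument list and join the
 * already-deparsed operands with the boolean keyword, wrapping the result
 * in parentheses unless pretty-printing suppresses them.  The join loop in
 * isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_join_bool_args(char *out, size_t outsz,
                                  const char *const *args, int nargs,
                                  const char *keyword, int add_parens)
{
    out[0] = '\0';
    if (add_parens)
        strncat(out, "(", outsz - strlen(out) - 1);
    for (int i = 0; i < nargs; i++)
    {
        if (i > 0)
        {
            strncat(out, " ", outsz - strlen(out) - 1);
            strncat(out, keyword, outsz - strlen(out) - 1);
            strncat(out, " ", outsz - strlen(out) - 1);
        }
        strncat(out, args[i], outsz - strlen(out) - 1);
    }
    if (add_parens)
        strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "a > 1", "b IS NOT NULL", "c = 'x'" };
    char buf[256];

    sketch_join_bool_args(buf, sizeof(buf), args, 3, "AND", 1);
    printf("%s\n", buf);    /* (a > 1 AND b IS NOT NULL AND c = 'x') */
    return 0;
}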
+ */ + fieldname = get_name_for_var_field((Var *) arg, fno, + 0, context); + appendStringInfo(buf, ".%s", quote_identifier(fieldname)); + } + break; + + case T_FieldStore: + { + FieldStore *fstore = (FieldStore *) node; + bool need_parens; + + /* + * There is no good way to represent a FieldStore as real SQL, + * so decompilation of INSERT or UPDATE statements should + * always use processIndirection as part of the + * statement-level syntax. We should only get here when + * EXPLAIN tries to print the targetlist of a plan resulting + * from such a statement. The plan case is even harder than + * ordinary rules would be, because the planner tries to + * collapse multiple assignments to the same field or subfield + * into one FieldStore; so we can see a list of target fields + * not just one, and the arguments could be FieldStores + * themselves. We don't bother to try to print the target + * field names; we just print the source arguments, with a + * ROW() around them if there's more than one. This isn't + * terribly complete, but it's probably good enough for + * EXPLAIN's purposes; especially since anything more would be + * either hopelessly confusing or an even poorer + * representation of what the plan is actually doing. + */ + need_parens = (list_length(fstore->newvals) != 1); + if (need_parens) + appendStringInfoString(buf, "ROW("); + get_rule_expr((Node *) fstore->newvals, context, showimplicit); + if (need_parens) + appendStringInfoChar(buf, ')'); + } + break; + + case T_RelabelType: + { + RelabelType *relabel = (RelabelType *) node; + Node *arg = (Node *) relabel->arg; + + if (relabel->relabelformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + relabel->resulttype, + relabel->resulttypmod, + node); + } + } + break; + + case T_CoerceViaIO: + { + CoerceViaIO *iocoerce = (CoerceViaIO *) node; + Node *arg = (Node *) iocoerce->arg; + + if (iocoerce->coerceformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + iocoerce->resulttype, + -1, + node); + } + } + break; + + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; + Node *arg = (Node *) acoerce->arg; + + if (acoerce->coerceformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + acoerce->resulttype, + acoerce->resulttypmod, + node); + } + } + break; + + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; + Node *arg = (Node *) convert->arg; + + if (convert->convertformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + convert->resulttype, -1, + node); + } + } + break; + + case T_CollateExpr: + { + CollateExpr *collate = (CollateExpr *) node; + Node *arg = (Node *) collate->arg; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg, context, showimplicit, node); + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(collate->collOid)); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_CaseExpr: + { + CaseExpr *caseexpr = (CaseExpr *) 
node; + ListCell *temp; + + appendContextKeyword(context, "CASE", + 0, PRETTYINDENT_VAR, 0); + if (caseexpr->arg) + { + appendStringInfoChar(buf, ' '); + get_rule_expr((Node *) caseexpr->arg, context, true); + } + foreach(temp, caseexpr->args) + { + CaseWhen *when = (CaseWhen *) lfirst(temp); + Node *w = (Node *) when->expr; + + if (caseexpr->arg) + { + /* + * The parser should have produced WHEN clauses of the + * form "CaseTestExpr = RHS", possibly with an + * implicit coercion inserted above the CaseTestExpr. + * For accurate decompilation of rules it's essential + * that we show just the RHS. However in an + * expression that's been through the optimizer, the + * WHEN clause could be almost anything (since the + * equality operator could have been expanded into an + * inline function). If we don't recognize the form + * of the WHEN clause, just punt and display it as-is. + */ + if (IsA(w, OpExpr)) + { + List *args = ((OpExpr *) w)->args; + + if (list_length(args) == 2 && + IsA(strip_implicit_coercions(linitial(args)), + CaseTestExpr)) + w = (Node *) lsecond(args); + } + } + + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "WHEN ", + 0, 0, 0); + get_rule_expr(w, context, false); + appendStringInfoString(buf, " THEN "); + get_rule_expr((Node *) when->result, context, true); + } + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "ELSE ", + 0, 0, 0); + get_rule_expr((Node *) caseexpr->defresult, context, true); + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "END", + -PRETTYINDENT_VAR, 0, 0); + } + break; + + case T_CaseTestExpr: + { + /* + * Normally we should never get here, since for expressions + * that can contain this node type we attempt to avoid + * recursing to it. But in an optimized expression we might + * be unable to avoid that (see comments for CaseExpr). If we + * do see one, print it as CASE_TEST_EXPR. + */ + appendStringInfoString(buf, "CASE_TEST_EXPR"); + } + break; + + case T_ArrayExpr: + { + ArrayExpr *arrayexpr = (ArrayExpr *) node; + + appendStringInfoString(buf, "ARRAY["); + get_rule_expr((Node *) arrayexpr->elements, context, true); + appendStringInfoChar(buf, ']'); + + /* + * If the array isn't empty, we assume its elements are + * coerced to the desired type. If it's empty, though, we + * need an explicit coercion to the array type. + */ + if (arrayexpr->elements == NIL) + appendStringInfo(buf, "::%s", + format_type_with_typemod(arrayexpr->array_typeid, -1)); + } + break; + + case T_RowExpr: + { + RowExpr *rowexpr = (RowExpr *) node; + TupleDesc tupdesc = NULL; + ListCell *arg; + int i; + char *sep; + + /* + * If it's a named type and not RECORD, we may have to skip + * dropped columns and/or claim there are NULLs for added + * columns. + */ + if (rowexpr->row_typeid != RECORDOID) + { + tupdesc = lookup_rowtype_tupdesc(rowexpr->row_typeid, -1); + Assert(list_length(rowexpr->args) <= tupdesc->natts); + } + + /* + * SQL99 allows "ROW" to be omitted when there is more than + * one column, but for simplicity we always print it. 
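/*
 * Illustrative sketch only (hypothetical names, already-deparsed strings):
 * the CaseExpr branch above prints "CASE [arg] WHEN ... THEN ... ELSE ...
 * END", and when a test argument is present it shows only the RHS of the
 * parser-generated "CaseTestExpr = RHS" comparison.  String assembly in
 * isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_case(char *out, size_t outsz, const char *arg,
                        const char *const *whens, const char *const *thens,
                        int nclauses, const char *elseval)
{
    snprintf(out, outsz, "CASE%s%s", arg ? " " : "", arg ? arg : "");
    for (int i = 0; i < nclauses; i++)
    {
        size_t used = strlen(out);

        snprintf(out + used, outsz - used, " WHEN %s THEN %s",
                 whens[i], thens[i]);
    }
    size_t used = strlen(out);

    snprintf(out + used, outsz - used, " ELSE %s END", elseval);
}

int main(void)
{
    const char *whens[] = { "1", "2" };
    const char *thens[] = { "'one'", "'two'" };
    char buf[256];

    sketch_case(buf, sizeof(buf), "x", whens, thens, 2, "'many'");
    /* CASE x WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'many' END */
    printf("%s\n", buf);
    return 0;
}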
+ */ + appendStringInfoString(buf, "ROW("); + sep = ""; + i = 0; + foreach(arg, rowexpr->args) + { + Node *e = (Node *) lfirst(arg); + + if (tupdesc == NULL || + !tupdesc->attrs[i]->attisdropped) + { + appendStringInfoString(buf, sep); + /* Whole-row Vars need special treatment here */ + get_rule_expr_toplevel(e, context, true); + sep = ", "; + } + i++; + } + if (tupdesc != NULL) + { + while (i < tupdesc->natts) + { + if (!tupdesc->attrs[i]->attisdropped) + { + appendStringInfoString(buf, sep); + appendStringInfoString(buf, "NULL"); + sep = ", "; + } + i++; + } + + ReleaseTupleDesc(tupdesc); + } + appendStringInfoChar(buf, ')'); + if (rowexpr->row_format == COERCE_EXPLICIT_CAST) + appendStringInfo(buf, "::%s", + format_type_with_typemod(rowexpr->row_typeid, -1)); + } + break; + + case T_RowCompareExpr: + { + RowCompareExpr *rcexpr = (RowCompareExpr *) node; + ListCell *arg; + char *sep; + + /* + * SQL99 allows "ROW" to be omitted when there is more than + * one column, but for simplicity we always print it. + */ + appendStringInfoString(buf, "(ROW("); + sep = ""; + foreach(arg, rcexpr->largs) + { + Node *e = (Node *) lfirst(arg); + + appendStringInfoString(buf, sep); + get_rule_expr(e, context, true); + sep = ", "; + } + + /* + * We assume that the name of the first-column operator will + * do for all the rest too. This is definitely open to + * failure, eg if some but not all operators were renamed + * since the construct was parsed, but there seems no way to + * be perfect. + */ + appendStringInfo(buf, ") %s ROW(", + generate_operator_name(linitial_oid(rcexpr->opnos), + exprType(linitial(rcexpr->largs)), + exprType(linitial(rcexpr->rargs)))); + sep = ""; + foreach(arg, rcexpr->rargs) + { + Node *e = (Node *) lfirst(arg); + + appendStringInfoString(buf, sep); + get_rule_expr(e, context, true); + sep = ", "; + } + appendStringInfoString(buf, "))"); + } + break; + + case T_CoalesceExpr: + { + CoalesceExpr *coalesceexpr = (CoalesceExpr *) node; + + appendStringInfoString(buf, "COALESCE("); + get_rule_expr((Node *) coalesceexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + + switch (minmaxexpr->op) + { + case IS_GREATEST: + appendStringInfoString(buf, "GREATEST("); + break; + case IS_LEAST: + appendStringInfoString(buf, "LEAST("); + break; + } + get_rule_expr((Node *) minmaxexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_SQLValueFunction: + { + SQLValueFunction *svf = (SQLValueFunction *) node; + + /* + * Note: this code knows that typmod for time, timestamp, and + * timestamptz just prints as integer. 
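/*
 * Illustrative sketch only (hypothetical struct standing in for the tuple
 * descriptor): the RowExpr branch above skips arguments whose attributes
 * have been dropped from the named row type and prints NULL for attributes
 * added after the expression was stored, so the printed column count
 * matches the current rowtype.
 */
#include <stdio.h>
#include <string.h>

struct sketch_att { int isdropped; };

static void sketch_row(char *out, size_t outsz,
                       const char *const *args, int nargs,
                       const struct sketch_att *atts, int natts)
{
    const char *sep = "";
    int i = 0;

    snprintf(out, outsz, "ROW(");
    for (; i < nargs; i++)
    {
        if (!atts[i].isdropped)
        {
            size_t used = strlen(out);

            snprintf(out + used, outsz - used, "%s%s", sep, args[i]);
            sep = ", ";
        }
    }
    for (; i < natts; i++)          /* attributes added since args was built */
    {
        if (!atts[i].isdropped)
        {
            size_t used = strlen(out);

            snprintf(out + used, outsz - used, "%sNULL", sep);
            sep = ", ";
        }
    }
    strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "1", "'dropped'", "t.x" };
    struct sketch_att atts[] = { {0}, {1}, {0}, {0} };   /* 2nd dropped, 4th new */
    char buf[128];

    sketch_row(buf, sizeof(buf), args, 3, atts, 4);
    printf("%s\n", buf);            /* ROW(1, t.x, NULL) */
    return 0;
}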
+ */ + switch (svf->op) + { + case SVFOP_CURRENT_DATE: + appendStringInfoString(buf, "CURRENT_DATE"); + break; + case SVFOP_CURRENT_TIME: + appendStringInfoString(buf, "CURRENT_TIME"); + break; + case SVFOP_CURRENT_TIME_N: + appendStringInfo(buf, "CURRENT_TIME(%d)", svf->typmod); + break; + case SVFOP_CURRENT_TIMESTAMP: + appendStringInfoString(buf, "CURRENT_TIMESTAMP"); + break; + case SVFOP_CURRENT_TIMESTAMP_N: + appendStringInfo(buf, "CURRENT_TIMESTAMP(%d)", + svf->typmod); + break; + case SVFOP_LOCALTIME: + appendStringInfoString(buf, "LOCALTIME"); + break; + case SVFOP_LOCALTIME_N: + appendStringInfo(buf, "LOCALTIME(%d)", svf->typmod); + break; + case SVFOP_LOCALTIMESTAMP: + appendStringInfoString(buf, "LOCALTIMESTAMP"); + break; + case SVFOP_LOCALTIMESTAMP_N: + appendStringInfo(buf, "LOCALTIMESTAMP(%d)", + svf->typmod); + break; + case SVFOP_CURRENT_ROLE: + appendStringInfoString(buf, "CURRENT_ROLE"); + break; + case SVFOP_CURRENT_USER: + appendStringInfoString(buf, "CURRENT_USER"); + break; + case SVFOP_USER: + appendStringInfoString(buf, "USER"); + break; + case SVFOP_SESSION_USER: + appendStringInfoString(buf, "SESSION_USER"); + break; + case SVFOP_CURRENT_CATALOG: + appendStringInfoString(buf, "CURRENT_CATALOG"); + break; + case SVFOP_CURRENT_SCHEMA: + appendStringInfoString(buf, "CURRENT_SCHEMA"); + break; + } + } + break; + + case T_NextValueExpr: + { + /* + * This gets invoked by Fast Query Shipping code to deparse a + * query. It seems enough to just generate a "DEFAULT" clause + * and let the remote datanode handle finding the correct + * sequence for replica identity. + * + * XXX PG10MERGE: If we do see issues with this, it might be + * worthwhile to consider generating an expression such as, + * nextval('sequence_name'::regclass) + */ + appendStringInfoString(buf, "DEFAULT"); + } + break; + + case T_XmlExpr: + { + XmlExpr *xexpr = (XmlExpr *) node; + bool needcomma = false; + ListCell *arg; + ListCell *narg; + Const *con; + + switch (xexpr->op) + { + case IS_XMLCONCAT: + appendStringInfoString(buf, "XMLCONCAT("); + break; + case IS_XMLELEMENT: + appendStringInfoString(buf, "XMLELEMENT("); + break; + case IS_XMLFOREST: + appendStringInfoString(buf, "XMLFOREST("); + break; + case IS_XMLPARSE: + appendStringInfoString(buf, "XMLPARSE("); + break; + case IS_XMLPI: + appendStringInfoString(buf, "XMLPI("); + break; + case IS_XMLROOT: + appendStringInfoString(buf, "XMLROOT("); + break; + case IS_XMLSERIALIZE: + appendStringInfoString(buf, "XMLSERIALIZE("); + break; + case IS_DOCUMENT: + break; + } + if (xexpr->op == IS_XMLPARSE || xexpr->op == IS_XMLSERIALIZE) + { + if (xexpr->xmloption == XMLOPTION_DOCUMENT) + appendStringInfoString(buf, "DOCUMENT "); + else + appendStringInfoString(buf, "CONTENT "); + } + if (xexpr->name) + { + appendStringInfo(buf, "NAME %s", + quote_identifier(map_xml_name_to_sql_identifier(xexpr->name))); + needcomma = true; + } + if (xexpr->named_args) + { + if (xexpr->op != IS_XMLFOREST) + { + if (needcomma) + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, "XMLATTRIBUTES("); + needcomma = false; + } + forboth(arg, xexpr->named_args, narg, xexpr->arg_names) + { + Node *e = (Node *) lfirst(arg); + char *argname = strVal(lfirst(narg)); + + if (needcomma) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) e, context, true); + appendStringInfo(buf, " AS %s", + quote_identifier(map_xml_name_to_sql_identifier(argname))); + needcomma = true; + } + if (xexpr->op != IS_XMLFOREST) + appendStringInfoChar(buf, ')'); + } + if (xexpr->args) + 
{ + if (needcomma) + appendStringInfoString(buf, ", "); + switch (xexpr->op) + { + case IS_XMLCONCAT: + case IS_XMLELEMENT: + case IS_XMLFOREST: + case IS_XMLPI: + case IS_XMLSERIALIZE: + /* no extra decoration needed */ + get_rule_expr((Node *) xexpr->args, context, true); + break; + case IS_XMLPARSE: + Assert(list_length(xexpr->args) == 2); + + get_rule_expr((Node *) linitial(xexpr->args), + context, true); + + con = lsecond_node(Const, xexpr->args); + Assert(!con->constisnull); + if (DatumGetBool(con->constvalue)) + appendStringInfoString(buf, + " PRESERVE WHITESPACE"); + else + appendStringInfoString(buf, + " STRIP WHITESPACE"); + break; + case IS_XMLROOT: + Assert(list_length(xexpr->args) == 3); + + get_rule_expr((Node *) linitial(xexpr->args), + context, true); + + appendStringInfoString(buf, ", VERSION "); + con = (Const *) lsecond(xexpr->args); + if (IsA(con, Const) && + con->constisnull) + appendStringInfoString(buf, "NO VALUE"); + else + get_rule_expr((Node *) con, context, false); + + con = lthird_node(Const, xexpr->args); + if (con->constisnull) + /* suppress STANDALONE NO VALUE */ ; + else + { + switch (DatumGetInt32(con->constvalue)) + { + case XML_STANDALONE_YES: + appendStringInfoString(buf, + ", STANDALONE YES"); + break; + case XML_STANDALONE_NO: + appendStringInfoString(buf, + ", STANDALONE NO"); + break; + case XML_STANDALONE_NO_VALUE: + appendStringInfoString(buf, + ", STANDALONE NO VALUE"); + break; + default: + break; + } + } + break; + case IS_DOCUMENT: + get_rule_expr_paren((Node *) xexpr->args, context, false, node); + break; + } + + } + if (xexpr->op == IS_XMLSERIALIZE) + appendStringInfo(buf, " AS %s", + format_type_with_typemod(xexpr->type, + xexpr->typmod)); + if (xexpr->op == IS_DOCUMENT) + appendStringInfoString(buf, " IS DOCUMENT"); + else + appendStringInfoChar(buf, ')'); + } + break; + + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren((Node *) ntest->arg, context, true, node); + + /* + * For scalar inputs, we prefer to print as IS [NOT] NULL, + * which is shorter and traditional. If it's a rowtype input + * but we're applying a scalar test, must print IS [NOT] + * DISTINCT FROM NULL to be semantically correct. 
+ */ + if (ntest->argisrow || + !type_is_rowtype(exprType((Node *) ntest->arg))) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + appendStringInfoString(buf, " IS NULL"); + break; + case IS_NOT_NULL: + appendStringInfoString(buf, " IS NOT NULL"); + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + } + else + { + switch (ntest->nulltesttype) + { + case IS_NULL: + appendStringInfoString(buf, " IS NOT DISTINCT FROM NULL"); + break; + case IS_NOT_NULL: + appendStringInfoString(buf, " IS DISTINCT FROM NULL"); + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_BooleanTest: + { + BooleanTest *btest = (BooleanTest *) node; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren((Node *) btest->arg, context, false, node); + switch (btest->booltesttype) + { + case IS_TRUE: + appendStringInfoString(buf, " IS TRUE"); + break; + case IS_NOT_TRUE: + appendStringInfoString(buf, " IS NOT TRUE"); + break; + case IS_FALSE: + appendStringInfoString(buf, " IS FALSE"); + break; + case IS_NOT_FALSE: + appendStringInfoString(buf, " IS NOT FALSE"); + break; + case IS_UNKNOWN: + appendStringInfoString(buf, " IS UNKNOWN"); + break; + case IS_NOT_UNKNOWN: + appendStringInfoString(buf, " IS NOT UNKNOWN"); + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_CoerceToDomain: + { + CoerceToDomain *ctest = (CoerceToDomain *) node; + Node *arg = (Node *) ctest->arg; + + if (ctest->coercionformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr(arg, context, false); + } + else + { + get_coercion_expr(arg, context, + ctest->resulttype, + ctest->resulttypmod, + node); + } + } + break; + + case T_CoerceToDomainValue: + appendStringInfoString(buf, "VALUE"); + break; + + case T_SetToDefault: + appendStringInfoString(buf, "DEFAULT"); + break; + + case T_CurrentOfExpr: + { + CurrentOfExpr *cexpr = (CurrentOfExpr *) node; + + if (cexpr->cursor_name) + appendStringInfo(buf, "CURRENT OF %s", + quote_identifier(cexpr->cursor_name)); + else + appendStringInfo(buf, "CURRENT OF $%d", + cexpr->cursor_param); + } + break; + + case T_InferenceElem: + { + InferenceElem *iexpr = (InferenceElem *) node; + bool save_varprefix; + bool need_parens; + + /* + * InferenceElem can only refer to target relation, so a + * prefix is not useful, and indeed would cause parse errors. + */ + save_varprefix = context->varprefix; + context->varprefix = false; + + /* + * Parenthesize the element unless it's a simple Var or a bare + * function call. Follows pg_get_indexdef_worker(). 
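/*
 * Illustrative sketch only (plain C, hypothetical helper): the NullTest
 * branch above prints IS [NOT] NULL for scalar inputs, but when a scalar
 * test is applied to a row-typed input it must print IS [NOT] DISTINCT
 * FROM NULL to keep the same semantics, since a row with some NULL fields
 * is neither IS NULL nor IS NOT NULL.
 */
#include <stdio.h>

static const char *sketch_nulltest_suffix(int test_is_null,
                                          int argisrow, int arg_is_rowtype)
{
    if (argisrow || !arg_is_rowtype)
        return test_is_null ? " IS NULL" : " IS NOT NULL";
    return test_is_null ? " IS NOT DISTINCT FROM NULL"
                        : " IS DISTINCT FROM NULL";
}

int main(void)
{
    printf("scalar: x%s\n", sketch_nulltest_suffix(1, 0, 0));
    printf("rowtype, scalar test: r%s\n", sketch_nulltest_suffix(1, 0, 1));
    return 0;
}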
+ */ + need_parens = !IsA(iexpr->expr, Var); + if (IsA(iexpr->expr, FuncExpr) && + ((FuncExpr *) iexpr->expr)->funcformat == + COERCE_EXPLICIT_CALL) + need_parens = false; + + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) iexpr->expr, + context, false); + if (need_parens) + appendStringInfoChar(buf, ')'); + + context->varprefix = save_varprefix; + + if (iexpr->infercollid) + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(iexpr->infercollid)); + + /* Add the operator class name, if not default */ + if (iexpr->inferopclass) + { + Oid inferopclass = iexpr->inferopclass; + Oid inferopcinputtype = get_opclass_input_type(iexpr->inferopclass); + + get_opclass_name(inferopclass, inferopcinputtype, buf); + } + } + break; + + case T_PartitionBoundSpec: + { + PartitionBoundSpec *spec = (PartitionBoundSpec *) node; + ListCell *cell; + char *sep; + + switch (spec->strategy) + { + case PARTITION_STRATEGY_LIST: + Assert(spec->listdatums != NIL); + + appendStringInfoString(buf, "FOR VALUES IN ("); + sep = ""; + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + appendStringInfoString(buf, sep); + get_const_expr(val, context, -1); + sep = ", "; + } + + appendStringInfoString(buf, ")"); + break; + + case PARTITION_STRATEGY_RANGE: + Assert(spec->lowerdatums != NIL && + spec->upperdatums != NIL && + list_length(spec->lowerdatums) == + list_length(spec->upperdatums)); + + appendStringInfo(buf, "FOR VALUES FROM %s TO %s", + get_range_partbound_string(spec->lowerdatums), + get_range_partbound_string(spec->upperdatums)); + break; + + default: + elog(ERROR, "unrecognized partition strategy: %d", + (int) spec->strategy); + break; + } + } + break; + + case T_List: + { + char *sep; + ListCell *l; + + sep = ""; + foreach(l, (List *) node) + { + appendStringInfoString(buf, sep); + get_rule_expr((Node *) lfirst(l), context, showimplicit); + sep = ", "; + } + } + break; + + case T_TableFunc: + get_tablefunc((TableFunc *) node, context, showimplicit); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * get_rule_expr_toplevel - Parse back a toplevel expression + * + * Same as get_rule_expr(), except that if the expr is just a Var, we pass + * istoplevel = true not false to get_variable(). This causes whole-row Vars + * to get printed with decoration that will prevent expansion of "*". + * We need to use this in contexts such as ROW() and VALUES(), where the + * parser would expand "foo.*" appearing at top level. (In principle we'd + * use this in get_target_list() too, but that has additional worries about + * whether to print AS, so it needs to invoke get_variable() directly anyway.) + */ +static void +get_rule_expr_toplevel(Node *node, deparse_context *context, + bool showimplicit) +{ + if (node && IsA(node, Var)) + (void) get_variable((Var *) node, 0, true, context); + else + get_rule_expr(node, context, showimplicit); +} + +/* + * get_rule_expr_funccall - Parse back a function-call expression + * + * Same as get_rule_expr(), except that we guarantee that the output will + * look like a function call, or like one of the things the grammar treats as + * equivalent to a function call (see the func_expr_windowless production). + * This is needed in places where the grammar uses func_expr_windowless and + * you can't substitute a parenthesized a_expr. 
If what we have isn't going + * to look like a function call, wrap it in a dummy CAST() expression, which + * will satisfy the grammar --- and, indeed, is likely what the user wrote to + * produce such a thing. + */ +static void +get_rule_expr_funccall(Node *node, deparse_context *context, + bool showimplicit) +{ + if (looks_like_function(node)) + get_rule_expr(node, context, showimplicit); + else + { + StringInfo buf = context->buf; + + appendStringInfoString(buf, "CAST("); + /* no point in showing any top-level implicit cast */ + get_rule_expr(node, context, false); + appendStringInfo(buf, " AS %s)", + format_type_with_typemod(exprType(node), + exprTypmod(node))); + } +} + +/* + * Helper function to identify node types that satisfy func_expr_windowless. + * If in doubt, "false" is always a safe answer. + */ +static bool +looks_like_function(Node *node) +{// #lizard forgives + if (node == NULL) + return false; /* probably shouldn't happen */ + switch (nodeTag(node)) + { + case T_FuncExpr: + /* OK, unless it's going to deparse as a cast */ + return (((FuncExpr *) node)->funcformat == COERCE_EXPLICIT_CALL); + case T_NullIfExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_SQLValueFunction: + case T_XmlExpr: + /* these are all accepted by func_expr_common_subexpr */ + return true; + default: + break; + } + return false; +} + + +/* + * get_oper_expr - Parse back an OpExpr node + */ +static void +get_oper_expr(OpExpr *expr, deparse_context *context) +{ + StringInfo buf = context->buf; + Oid opno = expr->opno; + List *args = expr->args; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + if (list_length(args) == 2) + { + /* binary operator */ + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + get_rule_expr_paren(arg1, context, true, (Node *) expr); + appendStringInfo(buf, " %s ", + generate_operator_name(opno, + exprType(arg1), + exprType(arg2))); + get_rule_expr_paren(arg2, context, true, (Node *) expr); + } + else + { + /* unary operator --- but which side? */ + Node *arg = (Node *) linitial(args); + HeapTuple tp; + Form_pg_operator optup; + + tp = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for operator %u", opno); + optup = (Form_pg_operator) GETSTRUCT(tp); + switch (optup->oprkind) + { + case 'l': + appendStringInfo(buf, "%s ", + generate_operator_name(opno, + InvalidOid, + exprType(arg))); + get_rule_expr_paren(arg, context, true, (Node *) expr); + break; + case 'r': + get_rule_expr_paren(arg, context, true, (Node *) expr); + appendStringInfo(buf, " %s", + generate_operator_name(opno, + exprType(arg), + InvalidOid)); + break; + default: + elog(ERROR, "bogus oprkind: %d", optup->oprkind); + } + ReleaseSysCache(tp); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); +} + +/* + * get_func_expr - Parse back a FuncExpr node + */ +static void +get_func_expr(FuncExpr *expr, deparse_context *context, + bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + Oid funcoid = expr->funcid; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; + List *argnames; + bool use_variadic; + ListCell *l; + + /* + * If the function call came from an implicit coercion, then just show the + * first argument --- unless caller wants to see implicit coercions. 
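/*
 * Illustrative sketch only: get_rule_expr_funccall() above wraps anything
 * that does not already look like a function call in CAST(expr AS type) so
 * the output still satisfies the func_expr_windowless grammar rule.  The
 * boolean below is a hypothetical stand-in for looks_like_function(),
 * since this sketch works on already-deparsed text rather than parse nodes.
 */
#include <stdio.h>

static void sketch_funccall_form(char *out, size_t outsz,
                                 const char *expr, const char *typname,
                                 int looks_like_function_call)
{
    if (looks_like_function_call)
        snprintf(out, outsz, "%s", expr);
    else
        snprintf(out, outsz, "CAST(%s AS %s)", expr, typname);
}

int main(void)
{
    char buf[128];

    sketch_funccall_form(buf, sizeof(buf), "a + b", "integer", 0);
    printf("%s\n", buf);            /* CAST(a + b AS integer) */
    return 0;
}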
+ */ + if (expr->funcformat == COERCE_IMPLICIT_CAST && !showimplicit) + { + get_rule_expr_paren((Node *) linitial(expr->args), context, + false, (Node *) expr); + return; + } + + /* + * If the function call came from a cast, then show the first argument + * plus an explicit cast operation. + */ + if (expr->funcformat == COERCE_EXPLICIT_CAST || + expr->funcformat == COERCE_IMPLICIT_CAST) + { + Node *arg = linitial(expr->args); + Oid rettype = expr->funcresulttype; + int32 coercedTypmod; + + /* Get the typmod if this is a length-coercion function */ + (void) exprIsLengthCoercion((Node *) expr, &coercedTypmod); + + get_coercion_expr(arg, context, + rettype, coercedTypmod, + (Node *) expr); + + return; + } + + /* + * Normal function: display as proname(args). First we need to extract + * the argument datatypes. + */ + if (list_length(expr->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg("too many arguments"))); + nargs = 0; + argnames = NIL; + foreach(l, expr->args) + { + Node *arg = (Node *) lfirst(l); + + if (IsA(arg, NamedArgExpr)) + argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); + argtypes[nargs] = exprType(arg); + nargs++; + } + + appendStringInfo(buf, "%s(", + generate_function_name(funcoid, nargs, + argnames, argtypes, + expr->funcvariadic, + &use_variadic, + context->special_exprkind)); + nargs = 0; + foreach(l, expr->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + if (use_variadic && lnext(l) == NULL) + appendStringInfoString(buf, "VARIADIC "); + get_rule_expr((Node *) lfirst(l), context, true); + } + appendStringInfoChar(buf, ')'); +} + +/* + * get_agg_expr - Parse back an Aggref node + */ +static void +get_agg_expr(Aggref *aggref, deparse_context *context, + Aggref *original_aggref) +{// #lizard forgives + StringInfo buf = context->buf; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; +#ifdef PGXC +// bool added_finalfn = false; +#endif /* PGXC */ + + bool use_variadic; + + /* + * For a combining aggregate, we look up and deparse the corresponding + * partial aggregate instead. This is necessary because our input + * argument list has been replaced; the new argument list always has just + * one element, which will point to a partial Aggref that supplies us with + * transition states to combine. + */ + if (DO_AGGSPLIT_COMBINE(aggref->aggsplit)) + { + TargetEntry *tle = linitial_node(TargetEntry, aggref->args); + + Assert(list_length(aggref->args) == 1); + resolve_special_varno((Node *) tle->expr, context, original_aggref, + get_agg_combine_expr); + return; + } + + /* + * Mark as PARTIAL, if appropriate. We look to the original aggref so as + * to avoid printing this when recursing from the code just above. + */ + if (DO_AGGSPLIT_SKIPFINAL(original_aggref->aggsplit)) + appendStringInfoString(buf, "PARTIAL "); + + /* Extract the argument types as seen by the parser */ + nargs = get_aggregate_argtypes(aggref, argtypes); + + /* Print the aggregate name, schema-qualified if needed */ + appendStringInfo(buf, "%s(%s", + generate_function_name(aggref->aggfnoid, nargs, + NIL, argtypes, + aggref->aggvariadic, + &use_variadic, + context->special_exprkind), + (aggref->aggdistinct != NIL) ? "DISTINCT " : ""); + + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + { + /* + * Ordered-set aggregates do not use "*" syntax. Also, we needn't + * worry about inserting VARIADIC. So we can just dump the direct + * args as-is. 
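/*
 * Illustrative sketch only (plain strings, hypothetical helper): the
 * normal-function path above prints "name(arg, arg, ...)" and, when the
 * call used an explicit VARIADIC array, prefixes the last argument with
 * the VARIADIC keyword.  The argument-join loop in isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_func_call(char *out, size_t outsz, const char *name,
                             const char *const *args, int nargs,
                             int use_variadic)
{
    snprintf(out, outsz, "%s(", name);
    for (int i = 0; i < nargs; i++)
    {
        size_t used = strlen(out);

        snprintf(out + used, outsz - used, "%s%s%s",
                 i > 0 ? ", " : "",
                 (use_variadic && i == nargs - 1) ? "VARIADIC " : "",
                 args[i]);
    }
    strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "fmt", "arr" };
    char buf[128];

    sketch_func_call(buf, sizeof(buf), "format", args, 2, 1);
    printf("%s\n", buf);            /* format(fmt, VARIADIC arr) */
    return 0;
}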
+ */ + Assert(!aggref->aggvariadic); + get_rule_expr((Node *) aggref->aggdirectargs, context, true); + Assert(aggref->aggorder != NIL); + appendStringInfoString(buf, ") WITHIN GROUP (ORDER BY "); + get_rule_orderby(aggref->aggorder, aggref->args, false, context); + } + else + { + /* aggstar can be set only in zero-argument aggregates */ + if (aggref->aggstar) + appendStringInfoChar(buf, '*'); + else + { + ListCell *l; + int i; + + i = 0; + foreach(l, aggref->args) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + Node *arg = (Node *) tle->expr; + + Assert(!IsA(arg, NamedArgExpr)); + if (tle->resjunk) + continue; + if (i++ > 0) + appendStringInfoString(buf, ", "); + if (use_variadic && i == nargs) + appendStringInfoString(buf, "VARIADIC "); + get_rule_expr(arg, context, true); + } + } + + if (aggref->aggorder != NIL) + { + appendStringInfoString(buf, " ORDER BY "); + get_rule_orderby(aggref->aggorder, aggref->args, false, context); + } + } + + if (aggref->aggfilter != NULL) + { + appendStringInfoString(buf, ") FILTER (WHERE "); + get_rule_expr((Node *) aggref->aggfilter, context, false); + } + + appendStringInfoChar(buf, ')'); + +} + +/* + * This is a helper function for get_agg_expr(). It's used when we deparse + * a combining Aggref; resolve_special_varno locates the corresponding partial + * Aggref and then calls this. + */ +static void +get_agg_combine_expr(Node *node, deparse_context *context, void *private) +{ + Aggref *aggref; + Aggref *original_aggref = private; + + if (!IsA(node, Aggref)) + elog(ERROR, "combining Aggref does not point to an Aggref"); + + aggref = (Aggref *) node; + get_agg_expr(aggref, context, original_aggref); +} + +/* + * get_windowfunc_expr - Parse back a WindowFunc node + */ +static void +get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; + List *argnames; + ListCell *l; + + if (list_length(wfunc->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg("too many arguments"))); + nargs = 0; + argnames = NIL; + foreach(l, wfunc->args) + { + Node *arg = (Node *) lfirst(l); + + if (IsA(arg, NamedArgExpr)) + argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); + argtypes[nargs] = exprType(arg); + nargs++; + } + + appendStringInfo(buf, "%s(", + generate_function_name(wfunc->winfnoid, nargs, + argnames, argtypes, + false, NULL, + context->special_exprkind)); + /* winstar can be set only in zero-argument aggregates */ + if (wfunc->winstar) + appendStringInfoChar(buf, '*'); + else + get_rule_expr((Node *) wfunc->args, context, true); + + if (wfunc->aggfilter != NULL) + { + appendStringInfoString(buf, ") FILTER (WHERE "); + get_rule_expr((Node *) wfunc->aggfilter, context, false); + } + + appendStringInfoString(buf, ") OVER "); + + foreach(l, context->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); + + if (wc->winref == wfunc->winref) + { + if (wc->name) + appendStringInfoString(buf, quote_identifier(wc->name)); + else + get_rule_windowspec(wc, context->windowTList, context); + break; + } + } + if (l == NULL) + { + if (context->windowClause) + elog(ERROR, "could not find window clause for winref %u", + wfunc->winref); + + /* + * In EXPLAIN, we don't have window context information available, so + * we have to settle for this: + */ + appendStringInfoString(buf, "(?)"); + } +} + +/* ---------- + * get_coercion_expr + * + * Make a string representation of a value coerced to a specific type + * 
---------- + */ +static void +get_coercion_expr(Node *arg, deparse_context *context, + Oid resulttype, int32 resulttypmod, + Node *parentNode) +{ + StringInfo buf = context->buf; + + /* + * Since parse_coerce.c doesn't immediately collapse application of + * length-coercion functions to constants, what we'll typically see in + * such cases is a Const with typmod -1 and a length-coercion function + * right above it. Avoid generating redundant output. However, beware of + * suppressing casts when the user actually wrote something like + * 'foo'::text::char(3). + * + * Note: it might seem that we are missing the possibility of needing to + * print a COLLATE clause for such a Const. However, a Const could only + * have nondefault collation in a post-constant-folding tree, in which the + * length coercion would have been folded too. See also the special + * handling of CollateExpr in coerce_to_target_type(): any collation + * marking will be above the coercion node, not below it. + */ + if (arg && IsA(arg, Const) && + ((Const *) arg)->consttype == resulttype && + ((Const *) arg)->consttypmod == -1) + { + /* Show the constant without normal ::typename decoration */ + get_const_expr((Const *) arg, context, -1); + } + else + { + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg, context, false, parentNode); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + appendStringInfo(buf, "::%s", + format_type_with_typemod(resulttype, resulttypmod)); +} + +/* ---------- + * get_const_expr + * + * Make a string representation of a Const + * + * showtype can be -1 to never show "::typename" decoration, or +1 to always + * show it, or 0 to show it only if the constant wouldn't be assumed to be + * the right type by default. + * + * If the Const's collation isn't default for its type, show that too. + * We mustn't do this when showtype is -1 (since that means the caller will + * print "::typename", and we can't put a COLLATE clause in between). It's + * caller's responsibility that collation isn't missed in such cases. + * ---------- + */ +static void +get_const_expr(Const *constval, deparse_context *context, int showtype) +{// #lizard forgives + StringInfo buf = context->buf; + Oid typoutput; + bool typIsVarlena; + char *extval; + bool needlabel = false; + + if (constval->constisnull) + { + /* + * Always label the type of a NULL constant to prevent misdecisions + * about type when reparsing. + */ + appendStringInfoString(buf, "NULL"); + if (showtype >= 0) + { + appendStringInfo(buf, "::%s", + format_type_with_typemod(constval->consttype, + constval->consttypmod)); + get_const_collation(constval, context); + } + return; + } + + getTypeOutputInfo(constval->consttype, + &typoutput, &typIsVarlena); + + extval = OidOutputFunctionCall(typoutput, constval->constvalue); + + switch (constval->consttype) + { + case INT4OID: + + /* + * INT4 can be printed without any decoration, unless it is + * negative; in that case print it as '-nnn'::integer to ensure + * that the output will re-parse as a constant, not as a constant + * plus operator. In most cases we could get away with printing + * (-nnn) instead, because of the way that gram.y handles negative + * literals; but that doesn't work for INT_MIN, and it doesn't + * seem that much prettier anyway. 
+ */ + if (extval[0] != '-') + appendStringInfoString(buf, extval); + else + { + appendStringInfo(buf, "'%s'", extval); + needlabel = true; /* we must attach a cast */ + } + break; + + case NUMERICOID: + + /* + * NUMERIC can be printed without quotes if it looks like a float + * constant (not an integer, and not Infinity or NaN) and doesn't + * have a leading sign (for the same reason as for INT4). + */ + if (isdigit((unsigned char) extval[0]) && + strcspn(extval, "eE.") != strlen(extval)) + { + appendStringInfoString(buf, extval); + } + else + { + appendStringInfo(buf, "'%s'", extval); + needlabel = true; /* we must attach a cast */ + } + break; + + case BITOID: + case VARBITOID: + appendStringInfo(buf, "B'%s'", extval); + break; + + case BOOLOID: + if (strcmp(extval, "t") == 0) + appendStringInfoString(buf, "true"); + else + appendStringInfoString(buf, "false"); + break; + + default: + simple_quote_literal(buf, extval); + break; + } + + pfree(extval); + + if (showtype < 0) + return; + + /* + * For showtype == 0, append ::typename unless the constant will be + * implicitly typed as the right type when it is read in. + * + * XXX this code has to be kept in sync with the behavior of the parser, + * especially make_const. + */ + switch (constval->consttype) + { + case BOOLOID: + case UNKNOWNOID: + /* These types can be left unlabeled */ + needlabel = false; + break; + case INT4OID: + /* We determined above whether a label is needed */ + break; + case NUMERICOID: + + /* + * Float-looking constants will be typed as numeric, which we + * checked above; but if there's a nondefault typmod we need to + * show it. + */ + needlabel |= (constval->consttypmod >= 0); + break; + default: + needlabel = true; + break; + } + if (needlabel || showtype > 0) + appendStringInfo(buf, "::%s", + format_type_with_typemod(constval->consttype, + constval->consttypmod)); + + get_const_collation(constval, context); +} + +/* + * helper for get_const_expr: append COLLATE if needed + */ +static void +get_const_collation(Const *constval, deparse_context *context) +{ + StringInfo buf = context->buf; + + if (OidIsValid(constval->constcollid)) + { + Oid typcollation = get_typcollation(constval->consttype); + + if (constval->constcollid != typcollation) + { + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(constval->constcollid)); + } + } +} + +/* + * simple_quote_literal - Format a string as a SQL literal, append to buf + */ +static void +simple_quote_literal(StringInfo buf, const char *val) +{ + const char *valptr; + + /* + * We form the string literal according to the prevailing setting of + * standard_conforming_strings; we never use E''. User is responsible for + * making sure result is used correctly. + */ + appendStringInfoChar(buf, '\''); + for (valptr = val; *valptr; valptr++) + { + char ch = *valptr; + + if (SQL_STR_DOUBLE(ch, !standard_conforming_strings)) + appendStringInfoChar(buf, ch); + appendStringInfoChar(buf, ch); + } + appendStringInfoChar(buf, '\''); +} + + +/* ---------- + * get_sublink_expr - Parse back a sublink + * ---------- + */ +static void +get_sublink_expr(SubLink *sublink, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + Query *query = (Query *) (sublink->subselect); + char *opname = NULL; + bool need_paren; + + if (sublink->subLinkType == ARRAY_SUBLINK) + appendStringInfoString(buf, "ARRAY("); + else + appendStringInfoChar(buf, '('); + + /* + * Note that we print the name of only the first operator, when there are + * multiple combining operators. 
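/*
 * Illustrative sketch only: simple_quote_literal() above wraps a value in
 * single quotes and doubles any character that needs escaping; when
 * standard_conforming_strings is off, backslashes must be doubled as well.
 * This standalone version takes that setting as an explicit flag instead
 * of reading the GUC.
 */
#include <stdio.h>

static void sketch_quote_literal(const char *val, int std_strings,
                                 char *out, size_t outsz)
{
    size_t n = 0;

    if (n < outsz - 1)
        out[n++] = '\'';
    for (const char *p = val; *p && n + 2 < outsz; p++)
    {
        if (*p == '\'' || (!std_strings && *p == '\\'))
            out[n++] = *p;          /* double the character */
        out[n++] = *p;
    }
    if (n < outsz - 1)
        out[n++] = '\'';
    out[n] = '\0';
}

int main(void)
{
    char buf[64];

    sketch_quote_literal("it's", 1, buf, sizeof(buf));
    printf("%s\n", buf);            /* 'it''s' */
    return 0;
}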
This is an approximation that could go + * wrong in various scenarios (operators in different schemas, renamed + * operators, etc) but there is not a whole lot we can do about it, since + * the syntax allows only one operator to be shown. + */ + if (sublink->testexpr) + { + if (IsA(sublink->testexpr, OpExpr)) + { + /* single combining operator */ + OpExpr *opexpr = (OpExpr *) sublink->testexpr; + + get_rule_expr(linitial(opexpr->args), context, true); + opname = generate_operator_name(opexpr->opno, + exprType(linitial(opexpr->args)), + exprType(lsecond(opexpr->args))); + } + else if (IsA(sublink->testexpr, BoolExpr)) + { + /* multiple combining operators, = or <> cases */ + char *sep; + ListCell *l; + + appendStringInfoChar(buf, '('); + sep = ""; + foreach(l, ((BoolExpr *) sublink->testexpr)->args) + { + OpExpr *opexpr = lfirst_node(OpExpr, l); + + appendStringInfoString(buf, sep); + get_rule_expr(linitial(opexpr->args), context, true); + if (!opname) + opname = generate_operator_name(opexpr->opno, + exprType(linitial(opexpr->args)), + exprType(lsecond(opexpr->args))); + sep = ", "; + } + appendStringInfoChar(buf, ')'); + } + else if (IsA(sublink->testexpr, RowCompareExpr)) + { + /* multiple combining operators, < <= > >= cases */ + RowCompareExpr *rcexpr = (RowCompareExpr *) sublink->testexpr; + + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) rcexpr->largs, context, true); + opname = generate_operator_name(linitial_oid(rcexpr->opnos), + exprType(linitial(rcexpr->largs)), + exprType(linitial(rcexpr->rargs))); + appendStringInfoChar(buf, ')'); + } + else + elog(ERROR, "unrecognized testexpr type: %d", + (int) nodeTag(sublink->testexpr)); + } + + need_paren = true; + + switch (sublink->subLinkType) + { + case EXISTS_SUBLINK: + appendStringInfoString(buf, "EXISTS "); + break; + + case ANY_SUBLINK: + if (strcmp(opname, "=") == 0) /* Represent = ANY as IN */ + appendStringInfoString(buf, " IN "); + else + appendStringInfo(buf, " %s ANY ", opname); + break; + + case ALL_SUBLINK: + appendStringInfo(buf, " %s ALL ", opname); + break; + + case ROWCOMPARE_SUBLINK: + appendStringInfo(buf, " %s ", opname); + break; + + case EXPR_SUBLINK: + case MULTIEXPR_SUBLINK: + case ARRAY_SUBLINK: + need_paren = false; + break; + + case CTE_SUBLINK: /* shouldn't occur in a SubLink */ + default: + elog(ERROR, "unrecognized sublink type: %d", + (int) sublink->subLinkType); + break; + } + + if (need_paren) + appendStringInfoChar(buf, '('); + + get_query_def(query, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + + if (need_paren) + appendStringInfoString(buf, "))"); + else + appendStringInfoChar(buf, ')'); +} + + +/* ---------- + * get_tablefunc - Parse back a table function + * ---------- + */ +static void +get_tablefunc(TableFunc *tf, deparse_context *context, bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + + /* XMLTABLE is the only existing implementation. 
*/ + + appendStringInfoString(buf, "XMLTABLE("); + + if (tf->ns_uris != NIL) + { + ListCell *lc1, + *lc2; + bool first = true; + + appendStringInfoString(buf, "XMLNAMESPACES ("); + forboth(lc1, tf->ns_uris, lc2, tf->ns_names) + { + Node *expr = (Node *) lfirst(lc1); + char *name = strVal(lfirst(lc2)); + + if (!first) + appendStringInfoString(buf, ", "); + else + first = false; + + if (name != NULL) + { + get_rule_expr(expr, context, showimplicit); + appendStringInfo(buf, " AS %s", name); + } + else + { + appendStringInfoString(buf, "DEFAULT "); + get_rule_expr(expr, context, showimplicit); + } + } + appendStringInfoString(buf, "), "); + } + + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tf->rowexpr, context, showimplicit); + appendStringInfoString(buf, ") PASSING ("); + get_rule_expr((Node *) tf->docexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + + if (tf->colexprs != NIL) + { + ListCell *l1; + ListCell *l2; + ListCell *l3; + ListCell *l4; + ListCell *l5; + int colnum = 0; + + l2 = list_head(tf->coltypes); + l3 = list_head(tf->coltypmods); + l4 = list_head(tf->colexprs); + l5 = list_head(tf->coldefexprs); + + appendStringInfoString(buf, " COLUMNS "); + foreach(l1, tf->colnames) + { + char *colname = strVal(lfirst(l1)); + Oid typid; + int32 typmod; + Node *colexpr; + Node *coldefexpr; + bool ordinality = tf->ordinalitycol == colnum; + bool notnull = bms_is_member(colnum, tf->notnulls); + + typid = lfirst_oid(l2); + l2 = lnext(l2); + typmod = lfirst_int(l3); + l3 = lnext(l3); + colexpr = (Node *) lfirst(l4); + l4 = lnext(l4); + coldefexpr = (Node *) lfirst(l5); + l5 = lnext(l5); + + if (colnum > 0) + appendStringInfoString(buf, ", "); + colnum++; + + appendStringInfo(buf, "%s %s", quote_identifier(colname), + ordinality ? "FOR ORDINALITY" : + format_type_with_typemod(typid, typmod)); + if (ordinality) + continue; + + if (coldefexpr != NULL) + { + appendStringInfoString(buf, " DEFAULT ("); + get_rule_expr((Node *) coldefexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + } + if (colexpr != NULL) + { + appendStringInfoString(buf, " PATH ("); + get_rule_expr((Node *) colexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + } + if (notnull) + appendStringInfoString(buf, " NOT NULL"); + } + } + + appendStringInfoChar(buf, ')'); +} + +/* ---------- + * get_from_clause - Parse back a FROM clause + * + * "prefix" is the keyword that denotes the start of the list of FROM + * elements. It is FROM when used to parse back SELECT and UPDATE, but + * is USING when parsing back DELETE. + * ---------- + */ +static void +get_from_clause(Query *query, const char *prefix, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + bool first = true; + ListCell *l; + + /* + * We use the query's jointree as a guide to what to print. However, we + * must ignore auto-added RTEs that are marked not inFromCl. (These can + * only appear at the top level of the jointree, so it's sufficient to + * check here.) This check also ensures we ignore the rule pseudo-RTEs + * for NEW and OLD. 
+ */ + foreach(l, query->jointree->fromlist) + { + Node *jtnode = (Node *) lfirst(l); + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, query->rtable); + + if (!rte->inFromCl) + continue; + } + + if (first) + { + appendContextKeyword(context, prefix, + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + first = false; + + get_from_clause_item(jtnode, query, context); + } + else + { + StringInfoData itembuf; + + appendStringInfoString(buf, ", "); + + /* + * Put the new FROM item's text into itembuf so we can decide + * after we've got it whether or not it needs to go on a new line. + */ + initStringInfo(&itembuf); + context->buf = &itembuf; + + get_from_clause_item(jtnode, query, context); + + /* Restore context's output buffer */ + context->buf = buf; + + /* Consider line-wrapping if enabled */ + if (PRETTY_INDENT(context) && context->wrapColumn >= 0) + { + /* Does the new item start with a new line? */ + if (itembuf.len > 0 && itembuf.data[0] == '\n') + { + /* If so, we shouldn't add anything */ + /* instead, remove any trailing spaces currently in buf */ + removeStringInfoSpaces(buf); + } + else + { + char *trailing_nl; + + /* Locate the start of the current line in the buffer */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * Add a newline, plus some indentation, if the new item + * would cause an overflow. + */ + if (strlen(trailing_nl) + itembuf.len > context->wrapColumn) + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_VAR); + } + } + + /* Add the new item */ + appendStringInfoString(buf, itembuf.data); + + /* clean up */ + pfree(itembuf.data); + } + } +} + +static void +get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, query->rtable); + char *refname = get_rtable_name(varno, context); + deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); + RangeTblFunction *rtfunc1 = NULL; + bool printalias; + + if (rte->lateral) + appendStringInfoString(buf, "LATERAL "); + + /* Print the FROM item proper */ + switch (rte->rtekind) + { + case RTE_RELATION: + /* Normal relation RTE */ + appendStringInfo(buf, "%s%s", + only_marker(rte), + generate_relation_name(rte->relid, + context->namespaces)); +#ifdef __TBASE__ + /* print for default partition */ + if (rte->intervalparent && rte->isdefault) + { + appendStringInfoString(buf, " PARTITION For Default "); + } +#endif + break; + case RTE_SUBQUERY: + /* Subquery RTE */ + appendStringInfoChar(buf, '('); + get_query_def(rte->subquery, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + appendStringInfoChar(buf, ')'); + break; + case RTE_FUNCTION: + /* Function RTE */ + rtfunc1 = (RangeTblFunction *) linitial(rte->functions); + + /* + * Omit ROWS FROM() syntax for just one function, unless it + * has both a coldeflist and WITH ORDINALITY. If it has both, + * we must use ROWS FROM() syntax to avoid ambiguity about + * whether the coldeflist includes the ordinality column. 
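/*
 * Illustrative sketch only: when pretty-printing, get_from_clause() above
 * deparses each FROM item into a scratch buffer first and then checks
 * whether appending it to the current output line would exceed the wrap
 * column, measuring from the last newline already in the output.  The
 * length check on its own:
 */
#include <stdio.h>
#include <string.h>

static int sketch_needs_wrap(const char *output_so_far,
                             const char *next_item, int wrap_column)
{
    const char *trailing_nl = strrchr(output_so_far, '\n');

    /* locate the start of the current (last) line */
    trailing_nl = trailing_nl ? trailing_nl + 1 : output_so_far;

    return strlen(trailing_nl) + strlen(next_item) > (size_t) wrap_column;
}

int main(void)
{
    const char *buf = "SELECT *\n  FROM some_fairly_long_relation_name r";

    /* prints 1 (wrap) then 0 (fits) */
    printf("%d\n", sketch_needs_wrap(buf, " JOIN other_table o ON r.id = o.id", 60));
    printf("%d\n", sketch_needs_wrap(buf, ", t2", 60));
    return 0;
}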
+ */ + if (list_length(rte->functions) == 1 && + (rtfunc1->funccolnames == NIL || !rte->funcordinality)) + { + get_rule_expr_funccall(rtfunc1->funcexpr, context, true); + /* we'll print the coldeflist below, if it has one */ + } + else + { + bool all_unnest; + ListCell *lc; + + /* + * If all the function calls in the list are to unnest, + * and none need a coldeflist, then collapse the list back + * down to UNNEST(args). (If we had more than one + * built-in unnest function, this would get more + * difficult.) + * + * XXX This is pretty ugly, since it makes not-terribly- + * future-proof assumptions about what the parser would do + * with the output; but the alternative is to emit our + * nonstandard ROWS FROM() notation for what might have + * been a perfectly spec-compliant multi-argument + * UNNEST(). + */ + all_unnest = true; + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (!IsA(rtfunc->funcexpr, FuncExpr) || + ((FuncExpr *) rtfunc->funcexpr)->funcid != F_ARRAY_UNNEST || + rtfunc->funccolnames != NIL) + { + all_unnest = false; + break; + } + } + + if (all_unnest) + { + List *allargs = NIL; + + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + List *args = ((FuncExpr *) rtfunc->funcexpr)->args; + + allargs = list_concat(allargs, list_copy(args)); + } + + appendStringInfoString(buf, "UNNEST("); + get_rule_expr((Node *) allargs, context, true); + appendStringInfoChar(buf, ')'); + } + else + { + int funcno = 0; + + appendStringInfoString(buf, "ROWS FROM("); + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (funcno > 0) + appendStringInfoString(buf, ", "); + get_rule_expr_funccall(rtfunc->funcexpr, context, true); + if (rtfunc->funccolnames != NIL) + { + /* Reconstruct the column definition list */ + appendStringInfoString(buf, " AS "); + get_from_clause_coldeflist(rtfunc, + NULL, + context); + } + funcno++; + } + appendStringInfoChar(buf, ')'); + } + /* prevent printing duplicate coldeflist below */ + rtfunc1 = NULL; + } + if (rte->funcordinality) + appendStringInfoString(buf, " WITH ORDINALITY"); + break; + case RTE_TABLEFUNC: + get_tablefunc(rte->tablefunc, context, true); + break; + case RTE_VALUES: + /* Values list RTE */ + appendStringInfoChar(buf, '('); + get_values_def(rte->values_lists, context); + appendStringInfoChar(buf, ')'); + break; + case RTE_CTE: + appendStringInfoString(buf, quote_identifier(rte->ctename)); + break; + default: + elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind); + break; + } + + /* Print the relation alias, if needed */ + printalias = false; + if (rte->alias != NULL) + { + /* Always print alias if user provided one */ + printalias = true; + } + else if (colinfo->printaliases) + { + /* Always print alias if we need to print column aliases */ + printalias = true; + } + else if (rte->rtekind == RTE_RELATION) + { + /* + * No need to print alias if it's same as relation name (this + * would normally be the case, but not if set_rtable_names had to + * resolve a conflict). + */ + if (strcmp(refname, get_relation_name(rte->relid)) != 0) + printalias = true; + } +#ifdef PGXC + else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) + { + /* + * + * This condition arises when the from clause is a view. The + * corresponding subquery RTE has its eref set to view name. 
+ * The remote query generated has this subquery of which the + * columns can be referred to as view_name.col1, so it should + * be possible to refer to this subquery object. + */ + appendStringInfo(buf, " %s", + quote_identifier(rte->eref->aliasname)); + printalias = true; + } +#endif + else if (rte->rtekind == RTE_FUNCTION) + { + /* + * For a function RTE, always print alias. This covers possible + * renaming of the function and/or instability of the + * FigureColname rules for things that aren't simple functions. + * Note we'd need to force it anyway for the columndef list case. + */ + printalias = true; + } + else if (rte->rtekind == RTE_VALUES) + { + /* Alias is syntactically required for VALUES */ + printalias = true; + } + else if (rte->rtekind == RTE_CTE) + { + /* + * No need to print alias if it's same as CTE name (this would + * normally be the case, but not if set_rtable_names had to + * resolve a conflict). + */ + if (strcmp(refname, rte->ctename) != 0) + printalias = true; + } + if (printalias) + appendStringInfo(buf, " %s", quote_identifier(refname)); + + /* Print the column definitions or aliases, if needed */ + if (rtfunc1 && rtfunc1->funccolnames != NIL) + { + /* Reconstruct the columndef list, which is also the aliases */ + get_from_clause_coldeflist(rtfunc1, colinfo, context); + } + else + { + /* Else print column aliases as needed */ + get_column_alias_list(colinfo, context); + } + + /* Tablesample clause must go after any alias */ + if (rte->rtekind == RTE_RELATION && rte->tablesample) + get_tablesample_def(rte->tablesample, context); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); + bool need_paren_on_right; + + need_paren_on_right = PRETTY_PAREN(context) && + !IsA(j->rarg, RangeTblRef) && + !(IsA(j->rarg, JoinExpr) &&((JoinExpr *) j->rarg)->alias != NULL); + + if (!PRETTY_PAREN(context) || j->alias != NULL) + appendStringInfoChar(buf, '('); + + get_from_clause_item(j->larg, query, context); + + switch (j->jointype) + { + case JOIN_INNER: + if (j->quals) + appendContextKeyword(context, " JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + else + appendContextKeyword(context, " CROSS JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_LEFT: + appendContextKeyword(context, " LEFT JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_FULL: + appendContextKeyword(context, " FULL JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_RIGHT: + appendContextKeyword(context, " RIGHT JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + } + + if (need_paren_on_right) + appendStringInfoChar(buf, '('); + get_from_clause_item(j->rarg, query, context); + if (need_paren_on_right) + appendStringInfoChar(buf, ')'); + + if (j->usingClause) + { + ListCell *lc; + bool first = true; + + appendStringInfoString(buf, " USING ("); + /* Use the assigned names, not what's in usingClause */ + foreach(lc, colinfo->usingNames) + { + char *colname = (char *) lfirst(lc); + + if (first) + first = false; + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, quote_identifier(colname)); + } + appendStringInfoChar(buf, ')'); + } + else if (j->quals) + { + appendStringInfoString(buf, " ON "); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, 
'('); + get_rule_expr(j->quals, context, false); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + else if (j->jointype != JOIN_INNER) + { + /* If we didn't say CROSS JOIN above, we must provide an ON */ + appendStringInfoString(buf, " ON TRUE"); + } + + if (!PRETTY_PAREN(context) || j->alias != NULL) + appendStringInfoChar(buf, ')'); + + /* Yes, it's correct to put alias after the right paren ... */ + if (j->alias != NULL) + { + appendStringInfo(buf, " %s", + quote_identifier(j->alias->aliasname)); + get_column_alias_list(colinfo, context); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); +} + +/* + * get_column_alias_list - print column alias list for an RTE + * + * Caller must already have printed the relation's alias name. + */ +static void +get_column_alias_list(deparse_columns *colinfo, deparse_context *context) +{ + StringInfo buf = context->buf; + int i; + bool first = true; + + /* Don't print aliases if not needed */ + if (!colinfo->printaliases) + return; + + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *colname = colinfo->new_colnames[i]; + + if (first) + { + appendStringInfoChar(buf, '('); + first = false; + } + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, quote_identifier(colname)); + } + if (!first) + appendStringInfoChar(buf, ')'); +} + +/* + * get_from_clause_coldeflist - reproduce FROM clause coldeflist + * + * When printing a top-level coldeflist (which is syntactically also the + * relation's column alias list), use column names from colinfo. But when + * printing a coldeflist embedded inside ROWS FROM(), we prefer to use the + * original coldeflist's names, which are available in rtfunc->funccolnames. + * Pass NULL for colinfo to select the latter behavior. + * + * The coldeflist is appended immediately (no space) to buf. Caller is + * responsible for ensuring that an alias or AS is present before it. + */ +static void +get_from_clause_coldeflist(RangeTblFunction *rtfunc, + deparse_columns *colinfo, + deparse_context *context) +{ + StringInfo buf = context->buf; + ListCell *l1; + ListCell *l2; + ListCell *l3; + ListCell *l4; + int i; + + appendStringInfoChar(buf, '('); + + /* there's no forfour(), so must chase one list the hard way */ + i = 0; + l4 = list_head(rtfunc->funccolnames); + forthree(l1, rtfunc->funccoltypes, + l2, rtfunc->funccoltypmods, + l3, rtfunc->funccolcollations) + { + Oid atttypid = lfirst_oid(l1); + int32 atttypmod = lfirst_int(l2); + Oid attcollation = lfirst_oid(l3); + char *attname; + + if (colinfo) + attname = colinfo->colnames[i]; + else + attname = strVal(lfirst(l4)); + + Assert(attname); /* shouldn't be any dropped columns here */ + + if (i > 0) + appendStringInfoString(buf, ", "); + appendStringInfo(buf, "%s %s", + quote_identifier(attname), + format_type_with_typemod(atttypid, atttypmod)); + if (OidIsValid(attcollation) && + attcollation != get_typcollation(atttypid)) + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(attcollation)); + + l4 = lnext(l4); + i++; + } + + appendStringInfoChar(buf, ')'); +} + +/* + * get_tablesample_def - print a TableSampleClause + */ +static void +get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) +{ + StringInfo buf = context->buf; + Oid argtypes[1]; + int nargs; + ListCell *l; + + /* + * We should qualify the handler's function name if it wouldn't be + * resolved by lookup in the current search path. 
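+	 * Tablesample handler functions are declared to take a single INTERNAL
+	 * argument, so that is the signature we look up here (hence the
+	 * one-element argtypes array set up just below).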
+ */ + argtypes[0] = INTERNALOID; + appendStringInfo(buf, " TABLESAMPLE %s (", + generate_function_name(tablesample->tsmhandler, 1, + NIL, argtypes, + false, NULL, EXPR_KIND_NONE)); + + nargs = 0; + foreach(l, tablesample->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) lfirst(l), context, false); + } + appendStringInfoChar(buf, ')'); + + if (tablesample->repeatable != NULL) + { + appendStringInfoString(buf, " REPEATABLE ("); + get_rule_expr((Node *) tablesample->repeatable, context, false); + appendStringInfoChar(buf, ')'); + } +} + +/* + * get_opclass_name - fetch name of an index operator class + * + * The opclass name is appended (after a space) to buf. + * + * Output is suppressed if the opclass is the default for the given + * actual_datatype. (If you don't want this behavior, just pass + * InvalidOid for actual_datatype.) + */ +static void +get_opclass_name(Oid opclass, Oid actual_datatype, + StringInfo buf) +{ + HeapTuple ht_opc; + Form_pg_opclass opcrec; + char *opcname; + char *nspname; + + ht_opc = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); + if (!HeapTupleIsValid(ht_opc)) + elog(ERROR, "cache lookup failed for opclass %u", opclass); + opcrec = (Form_pg_opclass) GETSTRUCT(ht_opc); + + if (!OidIsValid(actual_datatype) || + GetDefaultOpClass(actual_datatype, opcrec->opcmethod) != opclass) + { + /* Okay, we need the opclass name. Do we need to qualify it? */ + opcname = NameStr(opcrec->opcname); + if (OpclassIsVisible(opclass)) + appendStringInfo(buf, " %s", quote_identifier(opcname)); + else + { + nspname = get_namespace_name(opcrec->opcnamespace); + appendStringInfo(buf, " %s.%s", + quote_identifier(nspname), + quote_identifier(opcname)); + } + } + ReleaseSysCache(ht_opc); +} + +/* + * processIndirection - take care of array and subfield assignment + * + * We strip any top-level FieldStore or assignment ArrayRef nodes that + * appear in the input, printing them as decoration for the base column + * name (which we assume the caller just printed). We might also need to + * strip CoerceToDomain nodes, but only ones that appear above assignment + * nodes. + * + * Returns the subexpression that's to be assigned. + */ +static Node * +processIndirection(Node *node, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + CoerceToDomain *cdomain = NULL; + + for (;;) + { + if (node == NULL) + break; + if (IsA(node, FieldStore)) + { + FieldStore *fstore = (FieldStore *) node; + Oid typrelid; + char *fieldname; + + /* lookup tuple type */ + typrelid = get_typ_typrelid(fstore->resulttype); + if (!OidIsValid(typrelid)) + elog(ERROR, "argument type %s of FieldStore is not a tuple type", + format_type_be(fstore->resulttype)); + + /* + * Print the field name. There should only be one target field in + * stored rules. There could be more than that in executable + * target lists, but this function cannot be used for that case. + */ + Assert(list_length(fstore->fieldnums) == 1); + fieldname = get_relid_attribute_name(typrelid, + linitial_int(fstore->fieldnums)); + appendStringInfo(buf, ".%s", quote_identifier(fieldname)); + + /* + * We ignore arg since it should be an uninteresting reference to + * the target column or subcolumn. 
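+			 * Instead, descend into the value being assigned, which (after
+			 * any further indirection) is what gets returned to the caller.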
+ */ + node = (Node *) linitial(fstore->newvals); + } + else if (IsA(node, ArrayRef)) + { + ArrayRef *aref = (ArrayRef *) node; + + if (aref->refassgnexpr == NULL) + break; + printSubscripts(aref, context); + + /* + * We ignore refexpr since it should be an uninteresting reference + * to the target column or subcolumn. + */ + node = (Node *) aref->refassgnexpr; + } + else if (IsA(node, CoerceToDomain)) + { + cdomain = (CoerceToDomain *) node; + /* If it's an explicit domain coercion, we're done */ + if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) + break; + /* Tentatively descend past the CoerceToDomain */ + node = (Node *) cdomain->arg; + } + else + break; + } + + /* + * If we descended past a CoerceToDomain whose argument turned out not to + * be a FieldStore or array assignment, back up to the CoerceToDomain. + * (This is not enough to be fully correct if there are nested implicit + * CoerceToDomains, but such cases shouldn't ever occur.) + */ + if (cdomain && node == (Node *) cdomain->arg) + node = (Node *) cdomain; + + return node; +} + +static void +printSubscripts(ArrayRef *aref, deparse_context *context) +{ + StringInfo buf = context->buf; + ListCell *lowlist_item; + ListCell *uplist_item; + + lowlist_item = list_head(aref->reflowerindexpr); /* could be NULL */ + foreach(uplist_item, aref->refupperindexpr) + { + appendStringInfoChar(buf, '['); + if (lowlist_item) + { + /* If subexpression is NULL, get_rule_expr prints nothing */ + get_rule_expr((Node *) lfirst(lowlist_item), context, false); + appendStringInfoChar(buf, ':'); + lowlist_item = lnext(lowlist_item); + } + /* If subexpression is NULL, get_rule_expr prints nothing */ + get_rule_expr((Node *) lfirst(uplist_item), context, false); + appendStringInfoChar(buf, ']'); + } +} + +/* + * quote_identifier - Quote an identifier only if needed + * + * When quotes are needed, we palloc the required space; slightly + * space-wasteful but well worth it for notational simplicity. + */ +const char * +quote_identifier(const char *ident) +{// #lizard forgives + /* + * Can avoid quoting if ident starts with a lowercase letter or underscore + * and contains only lowercase letters, digits, and underscores, *and* is + * not any SQL keyword. Otherwise, supply quotes. + */ + int nquotes = 0; + bool safe; + const char *ptr; + char *result; + char *optr; + + /* + * would like to use macros here, but they might yield unwanted + * locale-specific results... + */ + safe = ((ident[0] >= 'a' && ident[0] <= 'z') || ident[0] == '_'); + + for (ptr = ident; *ptr; ptr++) + { + char ch = *ptr; + + if ((ch >= 'a' && ch <= 'z') || + (ch >= '0' && ch <= '9') || + (ch == '_')) + { + /* okay */ + } + else + { + safe = false; + if (ch == '"') + nquotes++; + } + } + + if (quote_all_identifiers) + safe = false; + + if (safe) + { + /* + * Check for keyword. We quote keywords except for unreserved ones. + * (In some cases we could avoid quoting a col_name or type_func_name + * keyword, but it seems much harder than it's worth to tell that.) + * + * Note: ScanKeywordLookup() does case-insensitive comparison, but + * that's fine, since we already know we have all-lower-case. 
+ */ + const ScanKeyword *keyword = ScanKeywordLookup(ident, + ScanKeywords, + NumScanKeywords); + + if (keyword != NULL && keyword->category != UNRESERVED_KEYWORD) + safe = false; + } + + if (safe) + return ident; /* no change needed */ + + result = (char *) palloc(strlen(ident) + nquotes + 2 + 1); + + optr = result; + *optr++ = '"'; + for (ptr = ident; *ptr; ptr++) + { + char ch = *ptr; + + if (ch == '"') + *optr++ = '"'; + *optr++ = ch; + } + *optr++ = '"'; + *optr = '\0'; + + return result; +} + +/* + * quote_qualified_identifier - Quote a possibly-qualified identifier + * + * Return a name of the form qualifier.ident, or just ident if qualifier + * is NULL, quoting each component if necessary. The result is palloc'd. + */ +char * +quote_qualified_identifier(const char *qualifier, + const char *ident) +{ + StringInfoData buf; + + initStringInfo(&buf); + if (qualifier) + appendStringInfo(&buf, "%s.", quote_identifier(qualifier)); + appendStringInfoString(&buf, quote_identifier(ident)); + return buf.data; +} + +/* + * get_relation_name + * Get the unqualified name of a relation specified by OID + * + * This differs from the underlying get_rel_name() function in that it will + * throw error instead of silently returning NULL if the OID is bad. + */ +static char * +get_relation_name(Oid relid) +{ + char *relname = get_rel_name(relid); + + if (!relname) + elog(ERROR, "cache lookup failed for relation %u", relid); + return relname; +} + +/* + * generate_relation_name + * Compute the name to display for a relation specified by OID + * + * The result includes all necessary quoting and schema-prefixing. + * + * If namespaces isn't NIL, it must be a list of deparse_namespace nodes. + * We will forcibly qualify the relation name if it equals any CTE name + * visible in the namespace list. + */ +static char * +generate_relation_name(Oid relid, List *namespaces) +{ + HeapTuple tp; + Form_pg_class reltup; + bool need_qual; + ListCell *nslist; + char *relname; + char *nspname; + char *result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + relname = NameStr(reltup->relname); + + /* Check for conflicting CTE name */ + need_qual = false; + foreach(nslist, namespaces) + { + deparse_namespace *dpns = (deparse_namespace *) lfirst(nslist); + ListCell *ctlist; + + foreach(ctlist, dpns->ctes) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(ctlist); + + if (strcmp(cte->ctename, relname) == 0) + { + need_qual = true; + break; + } + } + if (need_qual) + break; + } + + /* Otherwise, qualify the name if not visible in search path */ + if (!need_qual) + need_qual = !RelationIsVisible(relid); + + if (need_qual) + nspname = get_namespace_name(reltup->relnamespace); + else + nspname = NULL; + + result = quote_qualified_identifier(nspname, relname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * generate_qualified_relation_name + * Compute the name to display for a relation specified by OID + * + * As above, but unconditionally schema-qualify the name. 
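+ * This makes the result independent of the current search_path setting.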
+ */ +static char * +generate_qualified_relation_name(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + char *relname; + char *nspname; + char *result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + relname = NameStr(reltup->relname); + + nspname = get_namespace_name(reltup->relnamespace); + if (!nspname) + elog(ERROR, "cache lookup failed for namespace %u", + reltup->relnamespace); + + result = quote_qualified_identifier(nspname, relname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * generate_function_name + * Compute the name to display for a function specified by OID, + * given that it is being called with the specified actual arg names and + * types. (Those matter because of ambiguous-function resolution rules.) + * + * If we're dealing with a potentially variadic function (in practice, this + * means a FuncExpr or Aggref, not some other way of calling a function), then + * has_variadic must specify whether variadic arguments have been merged, + * and *use_variadic_p will be set to indicate whether to print VARIADIC in + * the output. For non-FuncExpr cases, has_variadic should be FALSE and + * use_variadic_p can be NULL. + * + * The result includes all necessary quoting and schema-prefixing. + */ +static char * +generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, + bool has_variadic, bool *use_variadic_p, + ParseExprKind special_exprkind) +{// #lizard forgives + char *result; + HeapTuple proctup; + Form_pg_proc procform; + char *proname; + bool use_variadic; + char *nspname; + FuncDetailCode p_result; + Oid p_funcid; + Oid p_rettype; + bool p_retset; + int p_nvargs; + Oid p_vatype; + Oid *p_true_typeids; + bool force_qualify = false; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + elog(ERROR, "cache lookup failed for function %u", funcid); + procform = (Form_pg_proc) GETSTRUCT(proctup); + proname = NameStr(procform->proname); + + /* + * Due to parser hacks to avoid needing to reserve CUBE, we need to force + * qualification in some special cases. + */ + if (special_exprkind == EXPR_KIND_GROUP_BY) + { + if (strcmp(proname, "cube") == 0 || strcmp(proname, "rollup") == 0) + force_qualify = true; + } + + /* + * Determine whether VARIADIC should be printed. We must do this first + * since it affects the lookup rules in func_get_detail(). + * + * Currently, we always print VARIADIC if the function has a merged + * variadic-array argument. Note that this is always the case for + * functions taking a VARIADIC argument type other than VARIADIC ANY. + * + * In principle, if VARIADIC wasn't originally specified and the array + * actual argument is deconstructable, we could print the array elements + * separately and not print VARIADIC, thus more nearly reproducing the + * original input. For the moment that seems like too much complication + * for the benefit, and anyway we do not know whether VARIADIC was + * originally specified if it's a non-ANY type. 
+ */ + if (use_variadic_p) + { + /* Parser should not have set funcvariadic unless fn is variadic */ + Assert(!has_variadic || OidIsValid(procform->provariadic)); + use_variadic = has_variadic; + *use_variadic_p = use_variadic; + } + else + { + Assert(!has_variadic); + use_variadic = false; + } + + /* + * The idea here is to schema-qualify only if the parser would fail to + * resolve the correct function given the unqualified func name with the + * specified argtypes and VARIADIC flag. But if we already decided to + * force qualification, then we can skip the lookup and pretend we didn't + * find it. + */ + if (!force_qualify) + p_result = func_get_detail(list_make1(makeString(proname)), + NIL, argnames, nargs, argtypes, + !use_variadic, true, + &p_funcid, &p_rettype, + &p_retset, &p_nvargs, &p_vatype, + &p_true_typeids, NULL); + else + { + p_result = FUNCDETAIL_NOTFOUND; + p_funcid = InvalidOid; + } + + if ((p_result == FUNCDETAIL_NORMAL || + p_result == FUNCDETAIL_AGGREGATE || + p_result == FUNCDETAIL_WINDOWFUNC) && + p_funcid == funcid) + nspname = NULL; + else + nspname = get_namespace_name(procform->pronamespace); + + result = quote_qualified_identifier(nspname, proname); + + ReleaseSysCache(proctup); + + return result; +} + +/* + * generate_operator_name + * Compute the name to display for an operator specified by OID, + * given that it is being called with the specified actual arg types. + * (Arg types matter because of ambiguous-operator resolution rules. + * Pass InvalidOid for unused arg of a unary operator.) + * + * The result includes all necessary quoting and schema-prefixing, + * plus the OPERATOR() decoration needed to use a qualified operator name + * in an expression. + */ +static char * +generate_operator_name(Oid operid, Oid arg1, Oid arg2) +{// #lizard forgives + StringInfoData buf; + HeapTuple opertup; + Form_pg_operator operform; + char *oprname; + char *nspname; + Operator p_result; + + initStringInfo(&buf); + + opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(operid)); + if (!HeapTupleIsValid(opertup)) + elog(ERROR, "cache lookup failed for operator %u", operid); + operform = (Form_pg_operator) GETSTRUCT(opertup); + oprname = NameStr(operform->oprname); + + /* + * The idea here is to schema-qualify only if the parser would fail to + * resolve the correct operator given the unqualified op name with the + * specified argtypes. + */ + switch (operform->oprkind) + { + case 'b': + p_result = oper(NULL, list_make1(makeString(oprname)), arg1, arg2, + true, -1); + break; + case 'l': + p_result = left_oper(NULL, list_make1(makeString(oprname)), arg2, + true, -1); + break; + case 'r': + p_result = right_oper(NULL, list_make1(makeString(oprname)), arg1, + true, -1); + break; + default: + elog(ERROR, "unrecognized oprkind: %d", operform->oprkind); + p_result = NULL; /* keep compiler quiet */ + break; + } + + if (p_result != NULL && oprid(p_result) == operid) + nspname = NULL; + else + { + nspname = get_namespace_name(operform->oprnamespace); + appendStringInfo(&buf, "OPERATOR(%s.", quote_identifier(nspname)); + } + + appendStringInfoString(&buf, oprname); + + if (nspname) + appendStringInfoChar(&buf, ')'); + + if (p_result != NULL) + ReleaseSysCache(p_result); + + ReleaseSysCache(opertup); + + return buf.data; +} + +/* + * generate_collation_name + * Compute the name to display for a collation specified by OID + * + * The result includes all necessary quoting and schema-prefixing. 
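+ * (The schema prefix is added only when the collation is not visible in
+ * the current search path.)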
+ */ +char * +generate_collation_name(Oid collid) +{ + HeapTuple tp; + Form_pg_collation colltup; + char *collname; + char *nspname; + char *result; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for collation %u", collid); + colltup = (Form_pg_collation) GETSTRUCT(tp); + collname = NameStr(colltup->collname); + + if (!CollationIsVisible(collid)) + nspname = get_namespace_name(colltup->collnamespace); + else + nspname = NULL; + + result = quote_qualified_identifier(nspname, collname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * Given a C string, produce a TEXT datum. + * + * We assume that the input was palloc'd and may be freed. + */ +static text * +string_to_text(char *str) +{ + text *result; + + result = cstring_to_text(str); + pfree(str); + return result; +} + +/* + * Generate a C string representing a relation's reloptions, or NULL if none. + */ +static char * +flatten_reloptions(Oid relid) +{ + char *result = NULL; + HeapTuple tuple; + Datum reloptions; + bool isnull; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); + + reloptions = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_reloptions, &isnull); + if (!isnull) + { + StringInfoData buf; + Datum *options; + int noptions; + int i; + + initStringInfo(&buf); + + deconstruct_array(DatumGetArrayTypeP(reloptions), + TEXTOID, -1, false, 'i', + &options, NULL, &noptions); + + for (i = 0; i < noptions; i++) + { + char *option = TextDatumGetCString(options[i]); + char *name; + char *separator; + char *value; + + /* + * Each array element should have the form name=value. If the "=" + * is missing for some reason, treat it like an empty value. + */ + name = option; + separator = strchr(option, '='); + if (separator) + { + *separator = '\0'; + value = separator + 1; + } + else + value = ""; + + if (i > 0) + appendStringInfoString(&buf, ", "); + appendStringInfo(&buf, "%s=", quote_identifier(name)); + + /* + * In general we need to quote the value; but to avoid unnecessary + * clutter, do not quote if it is an identifier that would not + * need quoting. (We could also allow numbers, but that is a bit + * trickier than it looks --- for example, are leading zeroes + * significant? We don't want to assume very much here about what + * custom reloptions might mean.) 
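+			 * Note that quote_identifier() returns its argument unchanged
+			 * when no quoting is needed, which is what the pointer-equality
+			 * test below relies on.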
+ */ + if (quote_identifier(value) == value) + appendStringInfoString(&buf, value); + else + simple_quote_literal(&buf, value); + + pfree(option); + } + + result = buf.data; + } + + ReleaseSysCache(tuple); + + return result; +} + +/* + * get_one_range_partition_bound_string + * A C string representation of one range partition bound + */ +char * +get_range_partbound_string(List *bound_datums) +{ + deparse_context context; + StringInfo buf = makeStringInfo(); + ListCell *cell; + char *sep; + + memset(&context, 0, sizeof(deparse_context)); + context.buf = buf; + + appendStringInfoString(buf, "("); + sep = ""; + foreach(cell, bound_datums) + { + PartitionRangeDatum *datum = + castNode(PartitionRangeDatum, lfirst(cell)); + + appendStringInfoString(buf, sep); + if (datum->kind == PARTITION_RANGE_DATUM_MINVALUE) + appendStringInfoString(buf, "MINVALUE"); + else if (datum->kind == PARTITION_RANGE_DATUM_MAXVALUE) + appendStringInfoString(buf, "MAXVALUE"); + else + { + Const *val = castNode(Const, datum->value); + + get_const_expr(val, &context, -1); + } + sep = ", "; + } + appendStringInfoString(buf, ")"); + + return buf->data; +} + +#ifdef __TBASE__ +/* form interval partition child table/index name */ +char * +GetPartitionName(Oid parentrelid, int partidx, bool isindex) +{ + char *partname; + char relname[NAMEDATALEN]; + char *parentname = get_rel_name(parentrelid); + + StrNCpy(relname, parentname, NAMEDATALEN - 12); + + partname = (char *)palloc0(NAMEDATALEN); + + snprintf(partname, NAMEDATALEN, + "%s_part_%d", relname, partidx); + +#if 0 + if(!isindex) + snprintf(partname, NAMEDATALEN, + "part_%d_%d", parentrelid, partidx); + else + snprintf(partname, NAMEDATALEN, + "idx_%d_%d", parentrelid, partidx); +#endif + + return partname; +} + +static int +find_partidx_by_int(int64 start, int step, int partitions, + int64 value, QulificationType qualtype) +{// #lizard forgives + int partidx = -1; + int gap = -1; + int align = -1; + + if(value < start || value >= start + step*partitions) + { + return PARTITION_ROUTER_RESULT_NULL; + } + + gap = (int32)((value - start)/step); + + align = (int32)((value - start)%step); + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + if(align == 0) gap--; + case QULIFICATION_TYPE_LE: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_FULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_EQUAL: + { + if(gap >= partitions || gap < 0 ) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_NULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_FULL; + else + partidx = gap; + } + break; + default: + elog(ERROR, "not supported Qulification Type[%d]", qualtype); + } + + return partidx; +} + +static int get_daysofyear(int startyear, int startmonth, int startday, + int endyear, int endmonth, int endday) +{// #lizard forgives + int result; + + result = 0; + + if(startyear > endyear + || (startyear == endyear && startmonth > endmonth) + || (startyear == endyear && startmonth == endmonth && startday > endday)) + return -1; + + if(startyear == endyear) + { + result = get_daysofmonth(startmonth, startday, endmonth, endday); + } + else + { + result += get_daysofmonth(startmonth,startday, 12, 31); + result += (endyear - startyear - 1)*366; + result += get_daysofmonth(1, 1, endmonth, endday); + } + + return result; +} + +static int 
get_daysofmonth(int startmonth, int startday, + int endmonth, int endday) +{// #lizard forgives + int result; + + if(startmonth <=0 || startmonth > 12 + || startday <= 0 || startday > 31 + || endmonth <=0 || endmonth > 12 + || endday <= 0 || endday > 31) + { + elog(ERROR, "internal error: getdaysofmonth: parameters is invalid"); + } + + result = 0; + + if(startmonth > endmonth || (startmonth == endmonth && startday > endday)) + return -1; + + if(startmonth == endmonth) + { + result = endday - startday; + } + else + { + int monidx = 0; + + result += daysofmonth[startmonth] - startday; + + monidx = startmonth + 1; + while(monidx < endmonth) + result += daysofmonth[monidx++]; + + result += endday; + } + + return result; +} + +static int get_monthesofyear(int startyear, int startmonth, + int endyear, int endmonth) +{ + int32 gap; + if(endyear < startyear || (endyear == startyear && endmonth < startmonth)) + { + gap = -1; + } + else + { + gap = (endyear - startyear) * 12 + (endmonth - startmonth); + } + return gap; +} + + +static int +find_partidx_by_timestamp(TimestampTz start, int step, int steptype, int partitions, + TimestampTz value, QulificationType qualtype) +{// #lizard forgives + int partidx = -1; + int gap; + struct pg_tm start_time; + fsec_t start_sec; + struct pg_tm current_time; + fsec_t current_sec; + bool isalign = false; + + + /* timestamp convert to posix struct */ + if(timestamp2tm(start, NULL, &start_time, &start_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + if(timestamp2tm(value, NULL, ¤t_time, ¤t_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + if(current_time.tm_hour == 0 && current_time.tm_min == 0 && current_time.tm_sec == 0 && current_sec == 0) + { + isalign = true; + } + + if(isalign && steptype == IntervalType_Month) + { + isalign = (current_time.tm_mday == 1); + } + + /* computer gap*/ + if(steptype == IntervalType_Month) + { + if(current_time.tm_year < start_time.tm_year + || (current_time.tm_year == start_time.tm_year && current_time.tm_mon < start_time.tm_mon)) + { + gap = -1; + } + else + { + gap = (current_time.tm_year - start_time.tm_year)*12 + (current_time.tm_mon - start_time.tm_mon); + } + } + else if(steptype == IntervalType_Day) + { + gap = get_daysofyear(start_time.tm_year, start_time.tm_mon, start_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + } + else + { + elog(ERROR,"step type[%d] is invalid", steptype); + } + + if(gap >= 0) + { + if(isalign) + isalign = (gap % step == 0); + gap = gap/step; + } + else + { + gap = -1; + isalign = false; + } + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + if(isalign) + { + if (!(is_first_day_from_start(step, steptype, &start_time, ¤t_time))) + { + gap--; + } + } + case QULIFICATION_TYPE_LE: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_FULL; /* all partitions*/ + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_EQUAL: + { + if(gap >= partitions || gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_NULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_FULL; + else + partidx = gap; + } + break; + default: + elog(ERROR, "not supported Qulification Type[%d]", 
qualtype); + } + + return partidx; +} + +int +RelationGetPartitionIdxByValue(Relation rel, Datum value) +{ + int partidx = -1; + Form_pg_partition_interval routerinfo = NULL; + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table.", RelationGetRelationName(rel)); + } + + switch(routerinfo->partdatatype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(value); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int16, QULIFICATION_TYPE_EQUAL); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(value); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int32, QULIFICATION_TYPE_EQUAL); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(value), QULIFICATION_TYPE_EQUAL); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(value), QULIFICATION_TYPE_EQUAL); + break; + default: + elog(ERROR, "unsupported interval type:[%d]", routerinfo->partinterval_type); + } + + return partidx; + +} + +Bitmapset * +RelationGetPartitionByValue(Relation rel, Const *value) +{ + //TODO: + int partidx = -1; + AttrNumber partkey = InvalidAttrNumber; + Form_pg_attribute attr = NULL; + Bitmapset * bms = NULL; + char *partname = NULL; + Oid partoid = InvalidOid; + + partkey = RelationGetPartitionColumnIndex(rel); + attr = rel->rd_att->attrs[partkey-1]; + + if(attr->atttypid != value->consttype) + { + elog(ERROR, "internal error: RelationGetPartitionByValue: data type of parameter is not same as relation definition"); + } + + partidx = RelationGetPartitionIdxByValue(rel,value->constvalue); + + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partidx >= 0 && partoid) + bms = bms_make_singleton(partidx); + else + bms = NULL; + + return bms; +} + +List * +RelationGetAllPartitions(Relation rel) +{ + int nparts = 0; + char *partname = NULL; + Oid partoid = InvalidOid; + int partidx = 0; + List * result = NULL; + + nparts = RelationGetNParts(rel); + + for(partidx = 0; partidx < nparts; partidx++) + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partname) + pfree(partname); + partname = NULL; + + if (InvalidOid == partoid) + { + continue; + } + + result = lappend_oid(result, partoid); + } + + return result; +} + +int +RelationGetChildIndex(Relation rel, Oid childoid) +{ + int nparts = 0; + char *partname = NULL; + Oid partoid = InvalidOid; + int partidx = 0; + int result = -1; + + if (childoid) + { + nparts = RelationGetNParts(rel); + + for(partidx = 0; partidx < nparts; partidx++) + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if (partoid == childoid) + { + result = partidx; + + if(partname) + pfree(partname); + partname = NULL; + + break; + } + + if(partname) + pfree(partname); + partname = NULL; + } + } + + return result; +} + +Oid 
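+/*
+ * Look up the partidx'th child of interval-partition index indexOid by its
+ * generated name; returns InvalidOid if no such child index exists.
+ */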
+RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx) +{ + char *partidxname = NULL; + Oid partidxoid = InvalidOid; + partidxname = GetPartitionName(indexOid,partidx,true); + partidxoid = get_relname_relid(partidxname,RelationGetNamespace(rel)); + + pfree(partidxname); + partidxname = NULL; + return partidxoid; +} + +Oid +RelationGetPartition(Relation rel, int partidx, bool isindex) +{ + char *partname = NULL; + Oid partoid = InvalidOid; + + partname = GetPartitionName(RelationGetRelid(rel), partidx, isindex); + + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partname) + pfree(partname); + partname = NULL; + return partoid; +} + +Bitmapset * +RelationGetPartitionsByQuals(Relation rel, List *strictinfos) +{ + Bitmapset * result; + Bitmapset * temp_bms; + Bitmapset * temp_result; + + ListCell *cell; + RestrictInfo *ele; + result = NULL; + temp_bms = NULL; + temp_result = NULL; + + if(list_length(strictinfos) == 0) + return get_full_pruning_result(rel); + + foreach(cell, strictinfos) + { + ele = (RestrictInfo*)lfirst(cell); + temp_bms = pruning_walker(rel,(Node*)ele); + if(result) + temp_result = bms_intersect(result, temp_bms); + else + temp_result = bms_copy(temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + + return result; +} + +static Bitmapset * +pruning_walker(Relation rel, Node *expr) +{ + Bitmapset * result; + result = NULL; + + switch(nodeTag(expr)) + { + case T_OpExpr: + { + result = pruning_opexpr(rel,(OpExpr*)expr); + } + break; + case T_RestrictInfo: + { + RestrictInfo *restricted = (RestrictInfo *)expr; + result = pruning_walker(rel, (Node *)restricted->clause); + } + break; + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr*)expr; + switch(boolexpr->boolop) + { + ListCell * cell; + Bitmapset * temp_bms; + Bitmapset * temp_result; + Node *ele; + + temp_bms = NULL; + temp_result = NULL; + case AND_EXPR: + { + foreach(cell,boolexpr->args) + { + ele = (Node*)lfirst(cell); + temp_bms = pruning_walker(rel,ele); + if(result) + temp_result = bms_intersect(result, temp_bms); + else + temp_result = bms_copy(temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + } + break; + case OR_EXPR: + { + foreach(cell,boolexpr->args) + { + ele = (Node*)lfirst(cell); + temp_bms = pruning_walker(rel,ele); + temp_result = bms_union(result, temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + } + break; + case NOT_EXPR: + default: + result = get_full_pruning_result(rel); + break; + } + } + break; + default: + result = get_full_pruning_result(rel); + break; + } + + return result; +} + +static Bitmapset * +pruning_opexpr(Relation rel, OpExpr *expr) +{// #lizard forgives + Bitmapset *result = NULL; + char *opname = NULL; + Node *leftarg = NULL; + Node *rightarg = NULL; + Var *arg_var = NULL; + Const *arg_const = NULL; + bool isswap = false; + int npart; + int partidx; + AttrNumber partkey; + //Oid parttype; + QulificationType qualtype = QULIFICATION_TYPE_EQUAL; + Form_pg_partition_interval routerinfo; + + partkey = RelationGetPartitionColumnIndex(rel); + + //parttype = rel->rd_att->attrs[partkey - 1]->atttypid; + + if(list_length(expr->args) != 2) + return get_full_pruning_result(rel); + + leftarg = (Node *)list_nth(expr->args,0); + rightarg = (Node *)list_nth(expr->args,1); + + if(IsA(leftarg,Var) && IsA(rightarg,Const)) + { + arg_var = (Var *)leftarg; + arg_const = (Const *)rightarg; + } + else 
if(IsA(leftarg,Const) && IsA(rightarg,Var)) + { + arg_var = (Var *)rightarg; + arg_const = (Const *)leftarg; + isswap = true; + } + else + { + return get_full_pruning_result(rel); + } + + if(arg_var->varattno != partkey) + { + return get_full_pruning_result(rel); + } + + opname = get_opname(expr->opno); + + if(strcmp("<",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LS; + else + qualtype = QULIFICATION_TYPE_GT; + } + else if(strcmp("<=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LE; + else + qualtype = QULIFICATION_TYPE_GE; + } + else if(strcmp("=",opname) == 0) + { + qualtype = QULIFICATION_TYPE_EQUAL; + } + else if(strcmp(">=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GE; + else + qualtype = QULIFICATION_TYPE_LE; + } + else if(strcmp(">",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GT; + else + qualtype = QULIFICATION_TYPE_LS; + } + else + { + /* any other case, get full partitions */ + return get_full_pruning_result(rel); + } + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); + } + + switch(arg_const->consttype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(arg_const->constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int16, qualtype); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(arg_const->constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int32, qualtype); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); + break; + default: + elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + } + + npart = RelationGetNParts(rel); + if(npart <= 0) + { + elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); + } + + if(partidx == PARTITION_ROUTER_RESULT_FULL) + return get_full_pruning_result(rel); + else if(partidx == PARTITION_ROUTER_RESULT_NULL) + return NULL; + else if(partidx >= 0) + { + char *partname = NULL; + Oid partoid = InvalidOid; + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + case QULIFICATION_TYPE_LE: + { + int i; + for(i = 0; i <= partidx; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } + } + break; + case QULIFICATION_TYPE_EQUAL: + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_make_singleton(partidx); + } + } + break; + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + int i; + for(i = partidx; i < npart; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = 
bms_add_member(result, i); + } + } + } + break; + default: + //nerver occur + elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); + } + } + + return result; +} + +static Bitmapset * +get_full_pruning_result(Relation rel) +{ + Bitmapset *result = NULL; + int i = 0; + int nparts = RelationGetNParts(rel); + char *partname = NULL; + Oid partoid = InvalidOid; + + Assert(nparts > 0); + + for(i=0; ibitmapplans; + replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); + } + break; + case T_BitmapOr: + { + List *planlist; + planlist = ((BitmapOr*)node)->bitmapplans; + replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); + } + break; + + /* + * scan nodes + */ + case T_TidScan: + case T_SeqScan: + { + SeqScan *seqscan; + seqscan = (SeqScan*)node; + + if(seqscan->ispartchild) + break; + if(seqscan->scanrelid != targetrel) + break; + seqscan->ispartchild = true; + seqscan->childidx = partidx; + } + break; + + case T_IndexScan: + { + IndexScan *indexscan; + indexscan = (IndexScan*)node; + + if(indexscan->scan.ispartchild) + break; + if(indexscan->scan.scanrelid != targetrel) + break; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_IndexOnlyScan: + { + IndexOnlyScan *indexscan; + indexscan = (IndexOnlyScan*)node; + + if(indexscan->scan.ispartchild) + return; + if(indexscan->scan.scanrelid != targetrel) + return; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_BitmapIndexScan: + { + BitmapIndexScan *indexscan; + indexscan = (BitmapIndexScan*)node; + + if(indexscan->scan.ispartchild) + break; + if(indexscan->scan.scanrelid != targetrel) + break; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_BitmapHeapScan: + { + Scan *scan; + scan = (Scan*)node; + + if(scan->ispartchild) + break; + if(scan->scanrelid != targetrel) + break; + + scan->ispartchild = true; + scan->childidx = partidx; + replace_partidx_bitmapheapscan(partitionparent,(Node*)scan->plan.lefttree,partidx); + //replace_target_relation((Node*)scan->scan.plan.lefttree,targetrel,partitionparent,partidx); + } + break; + + case T_SubqueryScan: + break; + + case T_FunctionScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + case T_ForeignScan: + break; + + /* + * join nodes + */ + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + Plan *join; + join = (Plan*)node; + replace_target_relation((Node*)join->lefttree,targetrel,partitionparent,partidx); + replace_target_relation((Node*)join->righttree,targetrel,partitionparent,partidx); + } + break; + + /* + * materialization nodes + */ + case T_Material: + case T_Sort: + case T_Hash: + { + Plan *mat = (Plan*)node; + replace_target_relation((Node*)mat->lefttree,targetrel,partitionparent,partidx); + } + break; + case T_Group: + case T_Agg: + case T_WindowAgg: + case T_Unique: + case T_SetOp: + case T_LockRows: + case T_Limit: + break; + case T_List: + { + List * list; + ListCell *cell; + Node *element; + + list = (List *)node; + foreach(cell,list) + { + element = (Node*)lfirst(cell); + replace_target_relation(element,targetrel,partitionparent,partidx); + } + } + break; + 
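+		/*
+		 * RemoteSubplan: assign a new internal cursor for the re-targeted
+		 * plan, then recurse into its subtree as for other single-child
+		 * nodes.
+		 */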
case T_RemoteSubplan: + { + RemoteSubplan *plan = (RemoteSubplan *)node; + + plan->cursor = get_internal_cursor(); + + replace_target_relation((Node*)((Plan *)plan)->lefttree,targetrel,partitionparent,partidx); + } + break; + case T_RemoteQuery: + elog(ERROR,"internal error: update partitioned parent table is forbidden in coordinator"); + break; + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +void +replace_partidx_bitmapheapscan(Relation relation, Node *plan, int partidx) +{ + switch(nodeTag(plan)) + { + case T_BitmapAnd: + { + List *planlist; + planlist = ((BitmapAnd*)plan)->bitmapplans; + replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); + } + break; + case T_BitmapOr: + { + List *planlist; + planlist = ((BitmapOr*)plan)->bitmapplans; + replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); + } + break; + case T_BitmapIndexScan: + { + Scan *sscan; + BitmapIndexScan *idxscan_child; + + sscan = (Scan *)plan; + sscan->ispartchild = true; + sscan->childidx = partidx; + + idxscan_child = (BitmapIndexScan *)plan; + idxscan_child->indexid = RelationGetPartitionIndex(relation,idxscan_child->indexid,partidx); + } + break; + case T_List: + { + List * list; + ListCell *cell; + Node *scan; + + list = (List *)plan; + foreach(cell,list) + { + scan = (Node*)lfirst(cell); + replace_partidx_bitmapheapscan(relation, scan, partidx); + } + } + break; + default: + elog(ERROR, "internal error: BitmapHeapScan cannot have this subplan[%d]", nodeTag(plan)); + break; + } +} + +int32 +get_timestamptz_gap(TimestampTz value, int32 interval) +{ + int32 gap; + fsec_t fsec; + struct pg_tm user_time; + + if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + + switch (interval) + { + case IntervalType_Year: + { + gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, 1); + break; + } + + case IntervalType_Month: + { + gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, user_time.tm_mon); + break; + } + + case IntervalType_Day: + { + gap = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time.tm_year, user_time.tm_mon, user_time.tm_mday); + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + return gap; +} + +int32 +get_timestamptz_diff(TimestampTz value, int32 interval) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + struct pg_tm user_time; + + if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + switch (interval) + { + case IntervalType_Month: + { + gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + current_time.tm_year, current_time.tm_mon); + + gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, user_time.tm_mon); + break; 
+ } + + case IntervalType_Day: + { + gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time.tm_year, user_time.tm_mon, user_time.tm_mday); + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + + + return gap1 - gap2; +} + +int32 +date_diff(struct pg_tm *user_time) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + current_time.tm_year, current_time.tm_mon); + + gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time->tm_year, user_time->tm_mon); + + + + return gap1 - gap2; +} + +int32 +date_diff_indays(struct pg_tm *user_time) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + + + gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time->tm_year, user_time->tm_mon, user_time->tm_mday); + + return gap1 - gap2; +} + +int get_months_away_from_base(struct pg_tm * user_tm) +{ + return get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_tm->tm_year, user_tm->tm_mon); +} + +int get_days_away_from_base(struct pg_tm * user_tm) +{ + return get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_tm->tm_year, user_tm->tm_mon, user_tm->tm_mday); +} + +bool is_sec_meet_temp_cold_date(TimestampTz secvalue, int32 interval, int step, TimestampTz startValue) +{// #lizard forgives + bool ret; + fsec_t fsec; + struct pg_tm sec_time; + + if(timestamp2tm(secvalue, NULL, &sec_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + switch (interval) + { + case IntervalType_Year: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year); + break; + } + case IntervalType_Month: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) + && (g_TempColdDataTime.tm_mon == sec_time.tm_mon); + break; + } + + case IntervalType_Day: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) + && (g_TempColdDataTime.tm_mon == sec_time.tm_mon) + && (g_TempColdDataTime.tm_mday == sec_time.tm_mday); + if (!ret) + { + struct pg_tm start_time; + + if(timestamp2tm(startValue, NULL, &start_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out 
of range"))); + } + + ret = is_first_day_from_start(step, interval, &start_time, &sec_time); + if (ret) + { + if (g_TempColdDataTime.tm_year + 1 == sec_time.tm_year && + g_TempColdDataTime.tm_mon == 12 && + g_TempColdDataTime.tm_mday == 31) + { + ret = true; + } + else + { + ret = false; + } + } + } + + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + + return ret; +} + +int32 GetPartitionIndex(TimestampTz start, int step, int steptype, int partitions, TimestampTz value) +{ + return find_partidx_by_timestamp(start, step, steptype, partitions, value, QULIFICATION_TYPE_EQUAL); +} + +/* is the first day of next year from start year */ +bool +is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct pg_tm *current_time) +{ + bool result = false; + + /* partition by one day */ + if (step == 1 && steptype == IntervalType_Day) + { + if (current_time->tm_year == start_time->tm_year + 1 && current_time->tm_mon == 1 && + current_time->tm_mday == 1) + { + result = true; + } + } + + return result; +} +#endif diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 71e853ba..527cb80d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1456,22 +1456,30 @@ typedef struct OnConflictClause * * We don't currently support the SEARCH or CYCLE clause. */ +typedef enum CTEMaterialize +{ + CTEMaterializeDefault, /* no option specified */ + CTEMaterializeAlways, /* MATERIALIZED */ + CTEMaterializeNever /* NOT MATERIALIZED */ +} CTEMaterialize; + typedef struct CommonTableExpr { - NodeTag type; - char *ctename; /* query name (never qualified) */ - List *aliascolnames; /* optional list of column names */ - /* SelectStmt/InsertStmt/etc before parse analysis, Query afterwards: */ - Node *ctequery; /* the CTE's subquery */ - int location; /* token location, or -1 if unknown */ - /* These fields are set during parse analysis: */ - bool cterecursive; /* is this CTE actually recursive? */ - int cterefcount; /* number of RTEs referencing this CTE - * (excluding internal self-references) */ - List *ctecolnames; /* list of output column names */ - List *ctecoltypes; /* OID list of output column type OIDs */ - List *ctecoltypmods; /* integer list of output column typmods */ - List *ctecolcollations; /* OID list of column collation OIDs */ + NodeTag type; + char *ctename; /* query name (never qualified) */ + List *aliascolnames; /* optional list of column names */ + CTEMaterialize ctematerialized; /* is this an optimization fence? */ + /* SelectStmt/InsertStmt/etc before parse analysis, Query afterwards: */ + Node *ctequery; /* the CTE's subquery */ + int location; /* token location, or -1 if unknown */ + /* These fields are set during parse analysis: */ + bool cterecursive; /* is this CTE actually recursive? 
*/ + int cterefcount; /* number of RTEs referencing this CTE + * (excluding internal self-references) */ + List *ctecolnames; /* list of output column names */ + List *ctecoltypes; /* OID list of output column type OIDs */ + List *ctecoltypmods; /* integer list of output column typmods */ + List *ctecolcollations; /* OID list of column collation OIDs */ } CommonTableExpr; /* Convenience macro to get the output tlist of a CTE's query */ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 8b8ac8ac..ec92a35b 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,24 +1373,23 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Remote Subquery Scan on all (datanode_1) -> Delete on t2 -> Nested Loop + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) -> Seq Scan on t2 Filter: (b = 1) - -> Materialize - -> Remote Subquery Scan on all (datanode_1) - -> Index Scan using t1_pkey on t1 - Index Cond: (a = 1) Remote Fast Query Execution Node/s: datanode_1 -> Delete on t1 -> Index Scan using t1_pkey on t1 Index Cond: (a = 1) -(15 rows) +(14 rows) delete from t1 where a = 1; drop rule r1 on t1; diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 18675344..bfeeedbe 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -1934,7 +1934,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test; Filter: (((a % 2) = 0) AND f_leak(b)) (3 rows) -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; QUERY PLAN ----------------------------------------------------------------- @@ -1949,7 +1949,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -> Seq Scan on z2 (9 rows) -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ----------------------------------------------------------------- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 670e9a06..60160a5a 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2044,7 +2044,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test; Filter: (((a % 2) = 0) AND f_leak(b)) (3 rows) -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; QUERY PLAN ----------------------------------------------------------------- @@ -2059,7 +2059,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -> Seq Scan on z2 (9 rows) -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 
WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ----------------------------------------------------------------- @@ -2643,7 +2643,7 @@ ALTER TABLE t1 ENABLE ROW LEVEL SECURITY; GRANT ALL ON t1 TO regress_rls_bob; INSERT INTO t1 (SELECT x, md5(x::text) FROM generate_series(0,20) x); SET SESSION AUTHORIZATION regress_rls_bob; -WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; a | b ----+---------------------------------- 0 | cfcd208495d565ef66e7dff9f98764da @@ -2659,7 +2659,8 @@ WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; 20 | 98f13708210194c475687be6106a3b84 (11 rows) -EXPLAIN (COSTS OFF) WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; +EXPLAIN (COSTS OFF) +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; QUERY PLAN ------------------------------------------------------------- CTE Scan on cte1 diff --git a/src/test/regress/expected/rowtypes.out b/src/test/regress/expected/rowtypes.out index 86df2bcc..c1e107f2 100644 --- a/src/test/regress/expected/rowtypes.out +++ b/src/test/regress/expected/rowtypes.out @@ -696,7 +696,7 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (6 rows) explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; @@ -709,7 +709,7 @@ select r, r is null as isnull, r is not null as isnotnull from r; Output: "*VALUES*".column1, "*VALUES*".column2 (5 rows) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/expected/rowtypes_1.out b/src/test/regress/expected/rowtypes_1.out index 57671100..b22e63f1 100644 --- a/src/test/regress/expected/rowtypes_1.out +++ b/src/test/regress/expected/rowtypes_1.out @@ -700,7 +700,7 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (6 rows) explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; @@ -713,7 +713,7 @@ select r, r is null as isnull, r is not null as isnotnull from r; Output: "*VALUES*".column1, "*VALUES*".column2 (5 rows) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index a169bf4a..0d96dff4 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3168,7 +3168,7 @@ explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; (6 rows) -- ensure upserting into a rule, with a CTE (different offsets!) 
works -WITH data(hat_name, hat_color) AS ( +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') @@ -3182,7 +3182,8 @@ RETURNING *; h9 | blue (2 rows) -EXPLAIN (nodes off, costs off) WITH data(hat_name, hat_color) AS ( +EXPLAIN (nodes off, costs off) +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 802e760d..29096c74 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -530,12 +530,10 @@ update shipped_view set value = 11 from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) where ordnum = a.f1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from shipped_view; - ttype | ordnum | partnum | value --------+--------+---------+--------- - wt | 0 | 1 | 1234.56 + ttype | ordnum | partnum | value +-------+--------+---------+------- + wt | 0 | 1 | 11 (1 row) select f1, ss1 as relabel from @@ -1158,3 +1156,265 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); + +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> 
LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (x_1.a || x_2.a) + Join Filter: (length((x_1.a || x_2.a)) < 5) + -> WorkTable Scan on x x_1 + Output: x_1.a + -> WorkTable Scan on x x_2 + Output: x_2.a +(13 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + ab + abab +(4 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where 
length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index 69bd0130..b9de2234 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -390,29 +390,23 @@ ERROR: FOR SHARE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1,t2 FOR UPDATE; ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1) SELECT * FROM q1; - QUERY PLAN ------------------------------------------ - CTE Scan on q1 - Output: q1.val, q1.val2 - CTE q1 - -> Remote Subquery Scan on all - Output: t1.val, t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 -(7 rows) + QUERY PLAN +--------------------------------- + Remote Subquery Scan on all + Output: t1.val, t1.val2 + -> Seq Scan on public.t1 + Output: t1.val, t1.val2 +(4 rows) -- make sure row marks are no ops for queries on WITH tables explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1) SELECT * FROM q1 FOR UPDATE; - QUERY PLAN ------------------------------------------ - CTE Scan on q1 - Output: q1.val, q1.val2 - CTE q1 - -> Remote Subquery Scan on all - Output: t1.val, t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 -(7 rows) + QUERY PLAN +--------------------------------- + Remote Subquery Scan on all + Output: t1.val, t1.val2 + -> Seq Scan on public.t1 + Output: t1.val, t1.val2 +(4 rows) explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1 FOR UPDATE; QUERY PLAN diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index a010dc72..bd588af8 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -847,10 +847,10 @@ EXPLAIN (COSTS OFF) SELECT * FROM z1 WHERE f_leak(b); PREPARE 
plancache_test AS SELECT * FROM z1 WHERE f_leak(b); EXPLAIN (COSTS OFF) EXECUTE plancache_test; -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; SET ROLE regress_rls_group1; @@ -1078,8 +1078,9 @@ INSERT INTO t1 (SELECT x, md5(x::text) FROM generate_series(0,20) x); SET SESSION AUTHORIZATION regress_rls_bob; -WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; -EXPLAIN (COSTS OFF) WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; +EXPLAIN (COSTS OFF) +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; WITH cte1 AS (UPDATE t1 SET a = a + 1 RETURNING *) SELECT * FROM cte1; --fail WITH cte1 AS (UPDATE t1 SET a = a RETURNING *) SELECT * FROM cte1; --ok diff --git a/src/test/regress/sql/rowtypes.sql b/src/test/regress/sql/rowtypes.sql index 4a046c2b..ab7e1488 100644 --- a/src/test/regress/sql/rowtypes.sql +++ b/src/test/regress/sql/rowtypes.sql @@ -306,12 +306,12 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) r(a,b); explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 96115bbe..6ebb4cec 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1131,7 +1131,7 @@ SELECT tablename, rulename, definition FROM pg_rules explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; -- ensure upserting into a rule, with a CTE (different offsets!) 
works -WITH data(hat_name, hat_color) AS ( +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') @@ -1139,7 +1139,8 @@ WITH data(hat_name, hat_color) AS ( INSERT INTO hats SELECT * FROM data RETURNING *; -EXPLAIN (nodes off, costs off) WITH data(hat_name, hat_color) AS ( +EXPLAIN (nodes off, costs off) +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 11b365fe..9b3f974f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -598,3 +598,96 @@ select * from where tattle(x, u); drop function tattle(x int, y int); + +-- +-- Tests for CTE inlining behavior +-- + +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + +-- Row marks are not 
pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; From 037c2e524d68a3aa6fdf77ec54e9bee8de6a6928 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 11 Aug 2020 21:20:27 +0800 Subject: [PATCH 023/578] Prevent inlining of multiply-referenced CTEs with outer recursive refs. This has to be prevented because inlining would result in multiple self-references, which we don't support (and in fact that's disallowed by the SQL spec, see statements about linearly vs. nonlinearly recursive queries). Bug fix for commit 608b167f9. Per report from Yaroslav Schekin (via Andrew Gierth) Discussion: https://postgr.es/m/87wolmg60q.fsf@news-spur.riddles.org.uk --- src/backend/optimizer/plan/subselect.c | 66 +++++++++++++++++++++++++ src/test/regress/expected/subselect.out | 43 ++++++++++++---- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3a7f8ccf..1ebdfefc 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -154,6 +154,8 @@ static bool testexpr_is_hashable(Node *testexpr); static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); +static bool contain_outer_selfref(Node *node); +static bool contain_outer_selfref_walker(Node *node, Index *depth); static void inline_cte(PlannerInfo *root, CommonTableExpr *cte); static bool inline_cte_walker(Node *node, inline_cte_walker_context *context); static bool simplify_EXISTS_query(PlannerInfo *root, Query *query); @@ -1298,6 +1300,10 @@ SS_process_ctes(PlannerInfo *root) * SELECT, or containing volatile functions. Inlining might change * the side-effects, which would be bad. * + * 4. The CTE is multiply-referenced and contains a self-reference to + * a recursive CTE outside itself. Inlining would result in multiple + * recursive self-references, which we don't support. + * * Otherwise, we have an option whether to inline or not. That should * always be a win if there's just a single reference, but if the CTE * is multiply-referenced then it's unclear: inlining adds duplicate @@ -1307,6 +1313,9 @@ SS_process_ctes(PlannerInfo *root) * the user express a preference. Our default behavior is to inline * only singly-referenced CTEs, but a CTE marked CTEMaterializeNever * will be inlined even if multiply referenced. + * + * Note: we check for volatile functions last, because that's more + * expensive than the other tests needed. */ if ((cte->ctematerialized == CTEMaterializeNever || (cte->ctematerialized == CTEMaterializeDefault && @@ -1314,6 +1323,8 @@ SS_process_ctes(PlannerInfo *root) !cte->cterecursive && cmdType == CMD_SELECT && !contain_dml(cte->ctequery) && + (cte->cterefcount <= 1 || + !contain_outer_selfref(cte->ctequery)) && !contain_volatile_functions(cte->ctequery)) { inline_cte(root, cte); @@ -1733,6 +1744,61 @@ contain_dml_walker(Node *node, void *context) return expression_tree_walker(node, contain_dml_walker, context); } +/* + * contain_outer_selfref: is there an external recursive self-reference? + */ +static bool +contain_outer_selfref(Node *node) +{ + Index depth = 0; + + /* + * We should be starting with a Query, so that depth will be 1 while + * examining its immediate contents. 
+ */ + Assert(IsA(node, Query)); + + return contain_outer_selfref_walker(node, &depth); +} + +static bool +contain_outer_selfref_walker(Node *node, Index *depth) +{ + if (node == NULL) + return false; + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + /* + * Check for a self-reference to a CTE that's above the Query that our + * search started at. + */ + if (rte->rtekind == RTE_CTE && + rte->self_reference && + rte->ctelevelsup >= *depth) + return true; + return false; /* allow range_table_walker to continue */ + } + if (IsA(node, Query)) + { + /* Recurse into subquery, tracking nesting depth properly */ + Query *query = (Query *) node; + bool result; + + (*depth)++; + + result = query_tree_walker(query, contain_outer_selfref_walker, + (void *) depth, QTW_EXAMINE_RTES_BEFORE); + + (*depth)--; + + return result; + } + return expression_tree_walker(node, contain_outer_selfref_walker, + (void *) depth); +} + /* * inline_cte: convert RTE_CTE references to given CTE into RTE_SUBQUERYs */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 29096c74..c480d768 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1299,8 +1299,8 @@ with recursive x(a) as select z.a || z1.a as a from z cross join z as z1 where length(z.a || z1.a) < 5)) select * from x; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------- CTE Scan on x Output: x.a CTE x @@ -1308,13 +1308,18 @@ select * from x; -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -> Nested Loop - Output: (x_1.a || x_2.a) - Join Filter: (length((x_1.a || x_2.a)) < 5) - -> WorkTable Scan on x x_1 - Output: x_1.a - -> WorkTable Scan on x x_2 - Output: x_2.a -(13 rows) + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) with recursive x(a) as ((values ('a'), ('b')) @@ -1327,9 +1332,27 @@ select * from x; ------ a b + aa ab + ba + bb + aaaa + aaab + aaba + aabb + abaa abab -(4 rows) + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) explain (verbose, costs off) with recursive x(a) as From dac9ce9ea7da01d083cf86dfe78954d05303abff Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 13 Aug 2020 12:55:02 +0800 Subject: [PATCH 024/578] Refine UPDATE/DELETE join distribution rules Remove the replication_level restriction since we have to do the replicate for UPDATE/DELETE anyway --- src/backend/optimizer/util/pathnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9bbf6040..ad966eb2 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2679,7 +2679,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - nRemotePlans_outer < replication_level && !pathnode->inner_unique) + !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2697,7 +2697,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } else if (resultRelLoc == 
RESULT_REL_OUTER &&
             pathnode->jointype != JOIN_RIGHT &&
             pathnode->jointype != JOIN_FULL &&
-            nRemotePlans_outer < replication_level && !pathnode->inner_unique)
+            !pathnode->inner_unique)
     {
         /* Replicate inner */
         pathnode->innerjoinpath = redistribute_path(

From cb79f573feb76dc0dc4e045cbe1f74f40a851e96 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Tue, 18 Aug 2020 19:05:02 +0800
Subject: [PATCH 025/578] Add GUC setting enable_sampling_analyze to choose the analyze method

---
 src/backend/utils/misc/guc.c  | 11 +++++++++++
 src/include/commands/vacuum.h |  6 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5ee53910..ad023691 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2097,6 +2097,17 @@ static struct config_bool ConfigureNamesBool[] =
         NULL, NULL, NULL
     },

+    {
+        {
+            "enable_sampling_analyze", PGC_SIGHUP, STATS_COLLECTOR,
+            gettext_noop("use sampling method to do analyze on coordinator."),
+            NULL
+        },
+        &enable_sampling_analyze,
+        false,
+        NULL, NULL, NULL
+    },
+
     {
         {
             "enable_pgbouncer", PGC_SIGHUP, STATS_COLLECTOR,
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 9ceed0ce..356efa52 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -218,9 +218,9 @@ extern int vacuum_freeze_table_age;
 extern int vacuum_multixact_freeze_min_age;
 extern int vacuum_multixact_freeze_table_age;
 #ifdef __TBASE__
-extern bool enable_sampling_analyze;
-extern bool distributed_query_analyze;
-extern bool explain_query_analyze;
+extern bool enable_sampling_analyze;
+extern bool distributed_query_analyze;
+extern bool explain_query_analyze;
 /* max number of queries collected */
 #define MAX_DISTRIBUTED_QUERIES 512

From 228f4c4d77698d82e285b3792f5cac9e4cfd3b69 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Mon, 20 Jul 2020 20:56:18 +0800
Subject: [PATCH 026/578] Add subset extended statistics

Some of our customers have data in which two columns are strongly
correlated. Postgres extended statistics do support functional
dependencies, but that machinery only applies to '=' clauses and is
derived from per-column stats, so it does not cover these cases well.
It is quite common for users to build their primary key or distribution
key by a simple transformation of a business data column; as a result,
forcing query push-down by the distribution key can make the combined
predicate look far more selective than it really is. Thus we introduce
the 'subset' multi-column statistics kind as a hint to the optimizer.
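
For illustration, a minimal sketch of the intended usage. The table and
column names are hypothetical; only the 'subset' statistics kind, the
exactly-two-columns restriction, and the fact that the declared column
order is preserved and meaningful come from this patch, and the
particular order shown below is just an example:

    -- dist_key is derived from business_id by a simple transformation,
    -- so the two equality clauses below are redundant rather than
    -- independent; without a hint the planner multiplies their
    -- selectivities and under-estimates the resulting row count.
    CREATE TABLE orders (dist_key bigint, business_id bigint, payload text)
        DISTRIBUTE BY HASH (dist_key);
    CREATE STATISTICS orders_subset (subset) ON business_id, dist_key FROM orders;
    SELECT * FROM orders WHERE business_id = 42 AND dist_key = 4200;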
--- src/backend/commands/statscmds.c | 638 ++++++++++-------- src/backend/optimizer/path/clausesel.c | 462 +++++++------ src/backend/optimizer/util/plancat.c | 120 ++-- src/backend/statistics/Makefile | 2 +- src/backend/statistics/dependencies.c | 317 ++++----- src/backend/statistics/extended_stats.c | 438 +++++++----- src/backend/statistics/subset.c | 360 ++++++++++ src/include/catalog/pg_statistic_ext.h | 43 +- .../statistics/extended_stats_internal.h | 5 +- src/include/statistics/statistics.h | 24 +- src/test/regress/expected/stats_ext_2.out | 95 +++ src/test/regress/expected/stats_ext_3.out | 95 +++ src/test/regress/sql/stats_ext.sql | 47 ++ 13 files changed, 1710 insertions(+), 936 deletions(-) create mode 100644 src/backend/statistics/subset.c diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 6ea6a323..8fefe73b 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -47,301 +47,349 @@ compare_int16(const void *a, const void *b) */ ObjectAddress CreateStatistics(CreateStatsStmt *stmt) -{// #lizard forgives - int16 attnums[STATS_MAX_DIMENSIONS]; - int numcols = 0; - char *namestr; - NameData stxname; - Oid statoid; - Oid namespaceId; - Oid stxowner = GetUserId(); - HeapTuple htup; - Datum values[Natts_pg_statistic_ext]; - bool nulls[Natts_pg_statistic_ext]; - int2vector *stxkeys; - Relation statrel; - Relation rel = NULL; - Oid relid; - ObjectAddress parentobject, - myself; - Datum types[2]; /* one for each possible type of statistic */ - int ntypes; - ArrayType *stxkind; - bool build_ndistinct; - bool build_dependencies; - bool requested_type = false; - int i; - ListCell *cell; - - Assert(IsA(stmt, CreateStatsStmt)); - - /* resolve the pieces of the name (namespace etc.) */ - namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); - namestrcpy(&stxname, namestr); - - /* - * Deal with the possibility that the statistics object already exists. - */ - if (SearchSysCacheExists2(STATEXTNAMENSP, - NameGetDatum(&stxname), - ObjectIdGetDatum(namespaceId))) - { - if (stmt->if_not_exists) - { - ereport(NOTICE, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("statistics object \"%s\" already exists, skipping", - namestr))); - return InvalidObjectAddress; - } - - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("statistics object \"%s\" already exists", namestr))); - } - - /* - * Examine the FROM clause. Currently, we only allow it to be a single - * simple table, but later we'll probably allow multiple tables and JOIN - * syntax. The grammar is already prepared for that, so we have to check - * here that what we got is what we can support. - */ - if (list_length(stmt->relations) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only a single relation is allowed in CREATE STATISTICS"))); - - foreach(cell, stmt->relations) - { - Node *rln = (Node *) lfirst(cell); - - if (!IsA(rln, RangeVar)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only a single relation is allowed in CREATE STATISTICS"))); - - /* - * CREATE STATISTICS will influence future execution plans but does - * not interfere with currently executing plans. So it should be - * enough to take only ShareUpdateExclusiveLock on relation, - * conflicting with ANALYZE and other DDL that sets statistical - * information, but not with normal queries. 
- */ - rel = relation_openrv((RangeVar *) rln, ShareUpdateExclusiveLock); - - /* Restrict to allowed relation types */ - if (rel->rd_rel->relkind != RELKIND_RELATION && - rel->rd_rel->relkind != RELKIND_MATVIEW && - rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && - rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("relation \"%s\" is not a table, foreign table, or materialized view", - RelationGetRelationName(rel)))); - - /* You must own the relation to create stats on it */ - if (!pg_class_ownercheck(RelationGetRelid(rel), stxowner)) - aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, - RelationGetRelationName(rel)); - } - - Assert(rel); - relid = RelationGetRelid(rel); - - /* - * Currently, we only allow simple column references in the expression - * list. That will change someday, and again the grammar already supports - * it so we have to enforce restrictions here. For now, we can convert - * the expression list to a simple array of attnums. While at it, enforce - * some constraints. - */ - foreach(cell, stmt->exprs) - { - Node *expr = (Node *) lfirst(cell); - ColumnRef *cref; - char *attname; - HeapTuple atttuple; - Form_pg_attribute attForm; - TypeCacheEntry *type; - - if (!IsA(expr, ColumnRef)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - cref = (ColumnRef *) expr; - - if (list_length(cref->fields) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - attname = strVal((Value *) linitial(cref->fields)); - - atttuple = SearchSysCacheAttName(relid, attname); - if (!HeapTupleIsValid(atttuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" referenced in statistics does not exist", - attname))); - attForm = (Form_pg_attribute) GETSTRUCT(atttuple); - - /* Disallow use of system attributes in extended stats */ - if (attForm->attnum <= 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("statistics creation on system columns is not supported"))); - - /* Disallow data types without a less-than operator */ - type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); - if (type->lt_opr == InvalidOid) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in statistics because its type has no default btree operator class", - attname))); - - /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ - if (numcols >= STATS_MAX_DIMENSIONS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_COLUMNS), - errmsg("cannot have more than %d columns in statistics", - STATS_MAX_DIMENSIONS))); - - attnums[numcols] = attForm->attnum; - numcols++; - ReleaseSysCache(atttuple); - } - - /* - * Check that at least two columns were specified in the statement. The - * upper bound was already checked in the loop above. - */ - if (numcols < 2) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("extended statistics require at least 2 columns"))); - - /* - * Sort the attnums, which makes detecting duplicates somewhat easier, and - * it does not hurt (it does not affect the efficiency, unlike for - * indexes, for example). - */ - qsort(attnums, numcols, sizeof(int16), compare_int16); - - /* - * Check for duplicates in the list of columns. The attnums are sorted so - * just check consecutive elements. 
- */ - for (i = 1; i < numcols; i++) - { - if (attnums[i] == attnums[i - 1]) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_COLUMN), - errmsg("duplicate column name in statistics definition"))); - } - - /* Form an int2vector representation of the sorted column list */ - stxkeys = buildint2vector(attnums, numcols); - - /* - * Parse the statistics types. - */ - build_ndistinct = false; - build_dependencies = false; - foreach(cell, stmt->stat_types) - { - char *type = strVal((Value *) lfirst(cell)); - - if (strcmp(type, "ndistinct") == 0) - { - build_ndistinct = true; - requested_type = true; - } - else if (strcmp(type, "dependencies") == 0) - { - build_dependencies = true; - requested_type = true; - } - else - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized statistic type \"%s\"", - type))); - } - /* If no statistic type was specified, build them all. */ - if (!requested_type) - { - build_ndistinct = true; - build_dependencies = true; - } - - /* construct the char array of enabled statistic types */ - ntypes = 0; - if (build_ndistinct) - types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT); - if (build_dependencies) - types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); - Assert(ntypes > 0 && ntypes <= lengthof(types)); - stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c'); - - /* - * Everything seems fine, so let's build the pg_statistic_ext tuple. - */ - memset(values, 0, sizeof(values)); - memset(nulls, false, sizeof(nulls)); - values[Anum_pg_statistic_ext_stxrelid - 1] = ObjectIdGetDatum(relid); - values[Anum_pg_statistic_ext_stxname - 1] = NameGetDatum(&stxname); - values[Anum_pg_statistic_ext_stxnamespace - 1] = ObjectIdGetDatum(namespaceId); - values[Anum_pg_statistic_ext_stxowner - 1] = ObjectIdGetDatum(stxowner); - values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); - values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); - - /* no statistics built yet */ - nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true; - nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true; - - /* insert it into pg_statistic_ext */ - statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); - htup = heap_form_tuple(statrel->rd_att, values, nulls); - statoid = CatalogTupleInsert(statrel, htup); - heap_freetuple(htup); - relation_close(statrel, RowExclusiveLock); - - /* - * Invalidate relcache so that others see the new statistics object. - */ - CacheInvalidateRelcache(rel); - - relation_close(rel, NoLock); - - /* - * Add an AUTO dependency on each column used in the stats, so that the - * stats object goes away if any or all of them get dropped. - */ - ObjectAddressSet(myself, StatisticExtRelationId, statoid); - - for (i = 0; i < numcols; i++) - { - ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); - recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); - } - - /* - * Also add dependencies on namespace and owner. These are required - * because the stats object might have a different namespace and/or owner - * than the underlying table(s). - */ - ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); - recordDependencyOn(&myself, &parentobject, DEPENDENCY_NORMAL); - - recordDependencyOnOwner(StatisticExtRelationId, statoid, stxowner); - - /* - * XXX probably there should be a recordDependencyOnCurrentExtension call - * here too, but we'd have to add support for ALTER EXTENSION ADD/DROP - * STATISTICS, which is more work than it seems worth. 
- */ - - /* Return stats object's address */ - return myself; +{ + int16 attnums[STATS_MAX_DIMENSIONS]; +#ifdef __TBASE__ + int16 attnums_ori[STATS_MAX_DIMENSIONS]; +#endif + int numcols = 0; + char *namestr; + NameData stxname; + Oid statoid; + Oid namespaceId; + Oid stxowner = GetUserId(); + HeapTuple htup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + int2vector *stxkeys; + Relation statrel; + Relation rel = NULL; + Oid relid; + ObjectAddress parentobject, + myself; + Datum types[2]; /* one for each possible type of statistic */ + int ntypes; + ArrayType *stxkind; + bool build_ndistinct; + bool build_dependencies; +#ifdef __TBASE__ + bool build_subset; +#endif + bool requested_type = false; + int i; + ListCell *cell; + + Assert(IsA(stmt, CreateStatsStmt)); + + /* resolve the pieces of the name (namespace etc.) */ + namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); + namestrcpy(&stxname, namestr); + + /* + * Deal with the possibility that the statistics object already exists. + */ + if (SearchSysCacheExists2(STATEXTNAMENSP, + NameGetDatum(&stxname), + ObjectIdGetDatum(namespaceId))) + { + if (stmt->if_not_exists) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics object \"%s\" already exists, skipping", + namestr))); + return InvalidObjectAddress; + } + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics object \"%s\" already exists", namestr))); + } + + /* + * Examine the FROM clause. Currently, we only allow it to be a single + * simple table, but later we'll probably allow multiple tables and JOIN + * syntax. The grammar is already prepared for that, so we have to check + * here that what we got is what we can support. + */ + if (list_length(stmt->relations) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only a single relation is allowed in CREATE STATISTICS"))); + + foreach(cell, stmt->relations) + { + Node *rln = (Node *) lfirst(cell); + + if (!IsA(rln, RangeVar)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only a single relation is allowed in CREATE STATISTICS"))); + + /* + * CREATE STATISTICS will influence future execution plans but does + * not interfere with currently executing plans. So it should be + * enough to take only ShareUpdateExclusiveLock on relation, + * conflicting with ANALYZE and other DDL that sets statistical + * information, but not with normal queries. + */ + rel = relation_openrv((RangeVar *) rln, ShareUpdateExclusiveLock); + + /* Restrict to allowed relation types */ + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_MATVIEW && + rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" is not a table, foreign table, or materialized view", + RelationGetRelationName(rel)))); + + /* You must own the relation to create stats on it */ + if (!pg_class_ownercheck(RelationGetRelid(rel), stxowner)) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, + RelationGetRelationName(rel)); + } + + Assert(rel); + relid = RelationGetRelid(rel); + + /* + * Currently, we only allow simple column references in the expression + * list. That will change someday, and again the grammar already supports + * it so we have to enforce restrictions here. For now, we can convert + * the expression list to a simple array of attnums. 
While at it, enforce + * some constraints. + */ + foreach(cell, stmt->exprs) + { + Node *expr = (Node *) lfirst(cell); + ColumnRef *cref; + char *attname; + HeapTuple atttuple; + Form_pg_attribute attForm; + TypeCacheEntry *type; + + if (!IsA(expr, ColumnRef)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only simple column references are allowed in CREATE STATISTICS"))); + cref = (ColumnRef *) expr; + + if (list_length(cref->fields) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only simple column references are allowed in CREATE STATISTICS"))); + attname = strVal((Value *) linitial(cref->fields)); + + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" referenced in statistics does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum <= 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistics creation on system columns is not supported"))); + + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("column \"%s\" cannot be used in statistics because its type has no default btree operator class", + attname))); + + /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + attnums[numcols] = attForm->attnum; +#ifdef __TBASE__ + attnums_ori[numcols] = attForm->attnum; +#endif + numcols++; + ReleaseSysCache(atttuple); + } + + /* + * Check that at least two columns were specified in the statement. The + * upper bound was already checked in the loop above. + */ + if (numcols < 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("extended statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicates somewhat easier, and + * it does not hurt (it does not affect the efficiency, unlike for + * indexes, for example). + */ + qsort(attnums, numcols, sizeof(int16), compare_int16); + + /* + * Check for duplicates in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < numcols; i++) + { + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate column name in statistics definition"))); + } + + /* Form an int2vector representation of the sorted column list */ + stxkeys = buildint2vector(attnums, numcols); + + /* + * Parse the statistics types. 
+ */ + build_ndistinct = false; + build_dependencies = false; +#ifdef __TBASE__ + build_subset = false; +#endif + foreach(cell, stmt->stat_types) + { + char *type = strVal((Value *) lfirst(cell)); + + if (strcmp(type, "ndistinct") == 0) + { + build_ndistinct = true; + requested_type = true; + } + else if (strcmp(type, "dependencies") == 0) + { + build_dependencies = true; + requested_type = true; + } +#ifdef __TBASE__ + else if (strcmp(type, "subset") == 0) + { + if (list_length(stmt->exprs) != 2) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("subset statistics require exactly 2 columns"))); + } + + build_subset = true; + requested_type = true; + + /* + * The original stmt expr order implies the relation between them, + * thus we need to keep the original order stored. + */ + stxkeys = buildint2vector(attnums_ori, numcols); + } +#endif + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized statistic type \"%s\"", + type))); + } + /* If no statistic type was specified, build them all. */ + if (!requested_type) + { + build_ndistinct = true; + build_dependencies = true; +#ifdef __TBASE__ + /* No need to build user defined knowledge */ + build_subset = false; +#endif + } + + /* construct the char array of enabled statistic types */ + ntypes = 0; + if (build_ndistinct) + types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT); + if (build_dependencies) + types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); +#ifdef __TBASE__ + /* + * User defined subset hint should not coexists with other + * types. Thus we don't need to extend the size of 'types' + * array. + */ + if (build_subset) + types[ntypes++] = CharGetDatum(STATS_EXT_SUBSET); +#endif + Assert(ntypes > 0 && ntypes <= lengthof(types)); + stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c'); + + /* + * Everything seems fine, so let's build the pg_statistic_ext tuple. + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_statistic_ext_stxrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_statistic_ext_stxname - 1] = NameGetDatum(&stxname); + values[Anum_pg_statistic_ext_stxnamespace - 1] = ObjectIdGetDatum(namespaceId); + values[Anum_pg_statistic_ext_stxowner - 1] = ObjectIdGetDatum(stxowner); + values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); + values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + + /* no statistics built yet */ + nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true; + nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true; +#ifdef __TBASE__ + nulls[Anum_pg_statistic_ext_stxsubset - 1] = true; +#endif + + /* insert it into pg_statistic_ext */ + statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); + htup = heap_form_tuple(statrel->rd_att, values, nulls); + statoid = CatalogTupleInsert(statrel, htup); + heap_freetuple(htup); + relation_close(statrel, RowExclusiveLock); + + /* + * Invalidate relcache so that others see the new statistics object. + */ + CacheInvalidateRelcache(rel); + + relation_close(rel, NoLock); + + /* + * Add an AUTO dependency on each column used in the stats, so that the + * stats object goes away if any or all of them get dropped. + */ + ObjectAddressSet(myself, StatisticExtRelationId, statoid); + + for (i = 0; i < numcols; i++) + { + ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + } + + /* + * Also add dependencies on namespace and owner. 
These are required + * because the stats object might have a different namespace and/or owner + * than the underlying table(s). + */ + ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_NORMAL); + + recordDependencyOnOwner(StatisticExtRelationId, statoid, stxowner); + + /* + * XXX probably there should be a recordDependencyOnCurrentExtension call + * here too, but we'd have to add support for ALTER EXTENSION ADD/DROP + * STATISTICS, which is more work than it seems worth. + */ + + /* Return stats object's address */ + return myself; } /* diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index e0b06c13..8e6e1670 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -97,230 +97,244 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root, */ Selectivity clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo) -{// #lizard forgives - Selectivity s1 = 1.0; - RelOptInfo *rel; - Bitmapset *estimatedclauses = NULL; - RangeQueryClause *rqlist = NULL; - ListCell *l; - int listidx; - - /* - * If there's exactly one clause, just go directly to - * clause_selectivity(). None of what we might do below is relevant. - */ - if (list_length(clauses) == 1) - return clause_selectivity(root, (Node *) linitial(clauses), - varRelid, jointype, sjinfo); - - /* - * Determine if these clauses reference a single relation. If so, and if - * it has extended statistics, try to apply those. - */ - rel = find_single_rel_for_clauses(root, clauses); - if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) - { - /* - * Perform selectivity estimations on any clauses found applicable by - * dependencies_clauselist_selectivity. 'estimatedclauses' will be - * filled with the 0-based list positions of clauses used that way, so - * that we can ignore them below. - */ - s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid, - jointype, sjinfo, rel, - &estimatedclauses); - - /* - * This would be the place to apply any other types of extended - * statistics selectivity estimations for remaining clauses. - */ - } - - /* - * Apply normal selectivity estimates for remaining clauses. We'll be - * careful to skip any clauses which were already estimated above. - * - * Anything that doesn't look like a potential rangequery clause gets - * multiplied into s1 and forgotten. Anything that does gets inserted into - * an rqlist entry. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause = (Node *) lfirst(l); - RestrictInfo *rinfo; - Selectivity s2; - - listidx++; - - /* - * Skip this clause if it's already been estimated by some other - * statistics above. - */ - if (bms_is_member(listidx, estimatedclauses)) - continue; - - /* Always compute the selectivity using clause_selectivity */ - s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); - - /* - * Check for being passed a RestrictInfo. - * - * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or - * 0.0; just use that rather than looking for range pairs. - */ - if (IsA(clause, RestrictInfo)) - { - rinfo = (RestrictInfo *) clause; - if (rinfo->pseudoconstant) - { - s1 = s1 * s2; - continue; - } - clause = (Node *) rinfo->clause; - } - else - rinfo = NULL; - - /* - * See if it looks like a restriction clause with a pseudoconstant on - * one side. 
(Anything more complicated than that might not behave in - * the simple way we are expecting.) Most of the tests here can be - * done more efficiently with rinfo than without. - */ - if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2) - { - OpExpr *expr = (OpExpr *) clause; - bool varonleft = true; - bool ok; - - if (rinfo) - { - ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) && - (is_pseudo_constant_clause_relids(lsecond(expr->args), - rinfo->right_relids) || - (varonleft = false, - is_pseudo_constant_clause_relids(linitial(expr->args), - rinfo->left_relids))); - } - else - { - ok = (NumRelids(clause) == 1) && - (is_pseudo_constant_clause(lsecond(expr->args)) || - (varonleft = false, - is_pseudo_constant_clause(linitial(expr->args)))); - } - - if (ok) - { - /* - * If it's not a "<" or ">" operator, just merge the - * selectivity in generically. But if it's the right oprrest, - * add the clause to rqlist for later processing. - */ - switch (get_oprrest(expr->opno)) - { - case F_SCALARLTSEL: - addRangeClause(&rqlist, clause, - varonleft, true, s2); - break; - case F_SCALARGTSEL: - addRangeClause(&rqlist, clause, - varonleft, false, s2); - break; - default: - /* Just merge the selectivity in generically */ - s1 = s1 * s2; - break; - } - continue; /* drop to loop bottom */ - } - } - - /* Not the right form, so treat it generically. */ - s1 = s1 * s2; - } - - /* - * Now scan the rangequery pair list. - */ - while (rqlist != NULL) - { - RangeQueryClause *rqnext; - - if (rqlist->have_lobound && rqlist->have_hibound) - { - /* Successfully matched a pair of range clauses */ - Selectivity s2; - - /* - * Exact equality to the default value probably means the - * selectivity function punted. This is not airtight but should - * be good enough. - */ - if (rqlist->hibound == DEFAULT_INEQ_SEL || - rqlist->lobound == DEFAULT_INEQ_SEL) - { - s2 = DEFAULT_RANGE_INEQ_SEL; - } - else - { - s2 = rqlist->hibound + rqlist->lobound - 1.0; - - /* Adjust for double-exclusion of NULLs */ - s2 += nulltestsel(root, IS_NULL, rqlist->var, - varRelid, jointype, sjinfo); - - /* - * A zero or slightly negative s2 should be converted into a - * small positive value; we probably are dealing with a very - * tight range and got a bogus result due to roundoff errors. - * However, if s2 is very negative, then we probably have - * default selectivity estimates on one or both sides of the - * range that we failed to recognize above for some reason. - */ - if (s2 <= 0.0) - { - if (s2 < -0.01) - { - /* - * No data available --- use a default estimate that - * is small, but not real small. - */ - s2 = DEFAULT_RANGE_INEQ_SEL; - } - else - { - /* - * It's just roundoff error; use a small positive - * value - */ - s2 = 1.0e-10; - } - } - } - /* Merge in the selectivity of the pair of clauses */ - s1 *= s2; - } - else - { - /* Only found one of a pair, merge it in generically */ - if (rqlist->have_lobound) - s1 *= rqlist->lobound; - else - s1 *= rqlist->hibound; - } - /* release storage and advance */ - rqnext = rqlist->next; - pfree(rqlist); - rqlist = rqnext; - } - - return s1; + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + Selectivity s1 = 1.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; + RangeQueryClause *rqlist = NULL; + ListCell *l; + int listidx; + + /* + * If there's exactly one clause, just go directly to + * clause_selectivity(). None of what we might do below is relevant. 
+ */ + if (list_length(clauses) == 1) + return clause_selectivity(root, (Node *) linitial(clauses), + varRelid, jointype, sjinfo); + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { +#ifdef __TBASE__ + /* + * Perform subset eliminations on any clauses found applicable by + * subset_clauselist_selectivity. Subset dependencies got higher + * priority over statistic-based dependencies. 'estimatedclauses' + * will be filled with the 0-based list positions of clauses used + * that way, so that we can ignore them below in both dependencies + * selectivity calculation and independent basic selectivity + * calculation. + */ + s1 *= subset_clauselist_selectivity(root, clauses, varRelid, jointype, + sjinfo, rel, &estimatedclauses); +#endif + + /* + * Perform selectivity estimations on any clauses found applicable by + * dependencies_clauselist_selectivity. 'estimatedclauses' will be + * filled with the 0-based list positions of clauses used that way, so + * that we can ignore them below. + */ + s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses); + + /* + * This would be the place to apply any other types of extended + * statistics selectivity estimations for remaining clauses. + */ + } + + /* + * Apply normal selectivity estimates for remaining clauses. We'll be + * careful to skip any clauses which were already estimated above. + * + * Anything that doesn't look like a potential rangequery clause gets + * multiplied into s1 and forgotten. Anything that does gets inserted into + * an rqlist entry. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + RestrictInfo *rinfo; + Selectivity s2; + + listidx++; + + /* + * Skip this clause if it's already been estimated by some other + * statistics above. + */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + /* Always compute the selectivity using clause_selectivity */ + s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); + + /* + * Check for being passed a RestrictInfo. + * + * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or + * 0.0; just use that rather than looking for range pairs. + */ + if (IsA(clause, RestrictInfo)) + { + rinfo = (RestrictInfo *) clause; + if (rinfo->pseudoconstant) + { + s1 = s1 * s2; + continue; + } + clause = (Node *) rinfo->clause; + } + else + rinfo = NULL; + + /* + * See if it looks like a restriction clause with a pseudoconstant on + * one side. (Anything more complicated than that might not behave in + * the simple way we are expecting.) Most of the tests here can be + * done more efficiently with rinfo than without. 
+ */ + if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2) + { + OpExpr *expr = (OpExpr *) clause; + bool varonleft = true; + bool ok; + + if (rinfo) + { + ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) && + (is_pseudo_constant_clause_relids(lsecond(expr->args), + rinfo->right_relids) || + (varonleft = false, + is_pseudo_constant_clause_relids(linitial(expr->args), + rinfo->left_relids))); + } + else + { + ok = (NumRelids(clause) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + } + + if (ok) + { + /* + * If it's not a "<" or ">" operator, just merge the + * selectivity in generically. But if it's the right oprrest, + * add the clause to rqlist for later processing. + */ + switch (get_oprrest(expr->opno)) + { + case F_SCALARLTSEL: + addRangeClause(&rqlist, clause, + varonleft, true, s2); + break; + case F_SCALARGTSEL: + addRangeClause(&rqlist, clause, + varonleft, false, s2); + break; + default: + /* Just merge the selectivity in generically */ + s1 = s1 * s2; + break; + } + continue; /* drop to loop bottom */ + } + } + + /* Not the right form, so treat it generically. */ + s1 = s1 * s2; + } + + /* + * Now scan the rangequery pair list. + */ + while (rqlist != NULL) + { + RangeQueryClause *rqnext; + + if (rqlist->have_lobound && rqlist->have_hibound) + { + /* Successfully matched a pair of range clauses */ + Selectivity s2; + + /* + * Exact equality to the default value probably means the + * selectivity function punted. This is not airtight but should + * be good enough. + */ + if (rqlist->hibound == DEFAULT_INEQ_SEL || + rqlist->lobound == DEFAULT_INEQ_SEL) + { + s2 = DEFAULT_RANGE_INEQ_SEL; + } + else + { + s2 = rqlist->hibound + rqlist->lobound - 1.0; + + /* Adjust for double-exclusion of NULLs */ + s2 += nulltestsel(root, IS_NULL, rqlist->var, + varRelid, jointype, sjinfo); + + /* + * A zero or slightly negative s2 should be converted into a + * small positive value; we probably are dealing with a very + * tight range and got a bogus result due to roundoff errors. + * However, if s2 is very negative, then we probably have + * default selectivity estimates on one or both sides of the + * range that we failed to recognize above for some reason. + */ + if (s2 <= 0.0) + { + if (s2 < -0.01) + { + /* + * No data available --- use a default estimate that + * is small, but not real small. 
+ */ + s2 = DEFAULT_RANGE_INEQ_SEL; + } + else + { + /* + * It's just roundoff error; use a small positive + * value + */ + s2 = 1.0e-10; + } + } + } + /* Merge in the selectivity of the pair of clauses */ + s1 *= s2; + } + else + { + /* Only found one of a pair, merge it in generically */ + if (rqlist->have_lobound) + s1 *= rqlist->lobound; + else + s1 *= rqlist->hibound; + } + /* release storage and advance */ + rqnext = rqlist->next; + pfree(rqlist); + rqlist = rqnext; + } + + return s1; } /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d831e03d..a1248e65 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1405,65 +1405,79 @@ get_relation_constraints(PlannerInfo *root, static List * get_relation_statistics(RelOptInfo *rel, Relation relation) { - List *statoidlist; - List *stainfos = NIL; - ListCell *l; - - statoidlist = RelationGetStatExtList(relation); - - foreach(l, statoidlist) - { - Oid statOid = lfirst_oid(l); - Form_pg_statistic_ext staForm; - HeapTuple htup; - Bitmapset *keys = NULL; - int i; - - htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); - if (!htup) - elog(ERROR, "cache lookup failed for statistics object %u", statOid); - staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); - - /* - * First, build the array of columns covered. This is ultimately - * wasted if no stats within the object have actually been built, but - * it doesn't seem worth troubling over that case. - */ - for (i = 0; i < staForm->stxkeys.dim1; i++) - keys = bms_add_member(keys, staForm->stxkeys.values[i]); - - /* add one StatisticExtInfo for each kind built */ - if (statext_is_kind_built(htup, STATS_EXT_NDISTINCT)) - { - StatisticExtInfo *info = makeNode(StatisticExtInfo); + List *statoidlist; + List *stainfos = NIL; + ListCell *l; + + statoidlist = RelationGetStatExtList(relation); + + foreach(l, statoidlist) + { + Oid statOid = lfirst_oid(l); + Form_pg_statistic_ext staForm; + HeapTuple htup; + Bitmapset *keys = NULL; + int i; + + htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!htup) + elog(ERROR, "cache lookup failed for statistics object %u", statOid); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + + /* + * First, build the array of columns covered. This is ultimately + * wasted if no stats within the object have actually been built, but + * it doesn't seem worth troubling over that case. 
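
The range-pair merge in clauselist_selectivity above can be checked with a small standalone computation. The input selectivities and the NULL fraction below are assumptions, and 0.005 stands in for DEFAULT_RANGE_INEQ_SEL.

    #include <stdio.h>

    /* Minimal sketch of the range-pair merge (constants are assumptions, not
     * taken from the patch): lobound is the selectivity of "x > 10", hibound
     * of "x < 100", nullfrac is the column's NULL fraction. */
    static double
    range_pair_selectivity(double lobound, double hibound, double nullfrac)
    {
        double s2 = hibound + lobound - 1.0;

        s2 += nullfrac;          /* undo the double exclusion of NULLs */
        if (s2 <= 0.0)
            s2 = (s2 < -0.01) ? 0.005 : 1.0e-10;   /* 0.005 ~ DEFAULT_RANGE_INEQ_SEL */
        return s2;
    }

    int main(void)
    {
        printf("%g\n", range_pair_selectivity(0.7, 0.45, 0.10));   /* 0.25 */
        return 0;
    }
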
+ */ + for (i = 0; i < staForm->stxkeys.dim1; i++) + keys = bms_add_member(keys, staForm->stxkeys.values[i]); + + /* add one StatisticExtInfo for each kind built */ + if (statext_is_kind_built(htup, STATS_EXT_NDISTINCT)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_NDISTINCT; + info->keys = bms_copy(keys); + + stainfos = lcons(info, stainfos); + } + + if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_DEPENDENCIES; + info->keys = bms_copy(keys); + + stainfos = lcons(info, stainfos); + } - info->statOid = statOid; - info->rel = rel; - info->kind = STATS_EXT_NDISTINCT; - info->keys = bms_copy(keys); - - stainfos = lcons(info, stainfos); - } - - if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES)) - { - StatisticExtInfo *info = makeNode(StatisticExtInfo); +#ifdef __TBASE__ + if (statext_is_kind_built(htup, STATS_EXT_SUBSET)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); - info->statOid = statOid; - info->rel = rel; - info->kind = STATS_EXT_DEPENDENCIES; - info->keys = bms_copy(keys); + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_SUBSET; + info->keys = bms_copy(keys); - stainfos = lcons(info, stainfos); - } + stainfos = lcons(info, stainfos); + } +#endif - ReleaseSysCache(htup); - bms_free(keys); - } + ReleaseSysCache(htup); + bms_free(keys); + } - list_free(statoidlist); + list_free(statoidlist); - return stainfos; + return stainfos; } /* diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile index 3404e455..b9cc0290 100644 --- a/src/backend/statistics/Makefile +++ b/src/backend/statistics/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/statistics top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = extended_stats.o dependencies.o mvdistinct.o +OBJS = extended_stats.o dependencies.o mvdistinct.o subset.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index c828e935..2863517d 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -909,159 +909,166 @@ find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies, */ Selectivity dependencies_clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - RelOptInfo *rel, - Bitmapset **estimatedclauses) -{// #lizard forgives - Selectivity s1 = 1.0; - ListCell *l; - Bitmapset *clauses_attnums = NULL; - StatisticExtInfo *stat; - MVDependencies *dependencies; - AttrNumber *list_attnums; - int listidx; - - /* check if there's any stats that might be useful for us. */ - if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) - return 1.0; - - list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * - list_length(clauses)); - - /* - * Pre-process the clauses list to extract the attnums seen in each item. - * We need to determine if there's any clauses which will be useful for - * dependency selectivity estimations. Along the way we'll record all of - * the attnums for each clause in a list which we'll reference later so we - * don't need to repeat the same work again. We'll also keep track of all - * attnums seen. 
- */ - listidx = 0; - foreach(l, clauses) - { - Node *clause = (Node *) lfirst(l); - AttrNumber attnum; - - if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) - { - list_attnums[listidx] = attnum; - clauses_attnums = bms_add_member(clauses_attnums, attnum); - } - else - list_attnums[listidx] = InvalidAttrNumber; - - listidx++; - } - - /* - * If there's not at least two distinct attnums then reject the whole list - * of clauses. We must return 1.0 so the calling function's selectivity is - * unaffected. - */ - if (bms_num_members(clauses_attnums) < 2) - { - pfree(list_attnums); - return 1.0; - } - - /* find the best suited statistics object for these attnums */ - stat = choose_best_statistics(rel->statlist, clauses_attnums, - STATS_EXT_DEPENDENCIES); - - /* if no matching stats could be found then we've nothing to do */ - if (!stat) - { - pfree(list_attnums); - return 1.0; - } - - /* load the dependency items stored in the statistics object */ - dependencies = statext_dependencies_load(stat->statOid); - - /* - * Apply the dependencies recursively, starting with the widest/strongest - * ones, and proceeding to the smaller/weaker ones. At the end of each - * round we factor in the selectivity of clauses on the implied attribute, - * and remove the clauses from the list. - */ - while (true) - { - Selectivity s2 = 1.0; - MVDependency *dependency; - - /* the widest/strongest dependency, fully matched by clauses */ - dependency = find_strongest_dependency(stat, dependencies, - clauses_attnums); - - /* if no suitable dependency was found, we're done */ - if (!dependency) - break; - - /* - * We found an applicable dependency, so find all the clauses on the - * implied attribute - with dependency (a,b => c) we look for clauses - * on 'c'. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause; - - listidx++; - - /* - * Skip incompatible clauses, and ones we've already estimated on. - */ - if (list_attnums[listidx] == InvalidAttrNumber || - bms_is_member(listidx, *estimatedclauses)) - continue; - - /* - * Technically we could find more than one clause for a given - * attnum. Since these clauses must be equality clauses, we choose - * to only take the selectivity estimate from the final clause in - * the list for this attnum. If the attnum happens to be compared - * to a different Const in another clause then no rows will match - * anyway. If it happens to be compared to the same Const, then - * ignoring the additional clause is just the thing to do. - */ - if (dependency_implies_attribute(dependency, - list_attnums[listidx])) - { - clause = (Node *) lfirst(l); - - s2 = clause_selectivity(root, clause, varRelid, jointype, - sjinfo); - - /* mark this one as done, so we don't touch it again. */ - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); - - /* - * Mark that we've got and used the dependency on this clause. - * We'll want to ignore this when looking for the next - * strongest dependency above. - */ - clauses_attnums = bms_del_member(clauses_attnums, - list_attnums[listidx]); - } - } - - /* - * Now factor in the selectivity for all the "implied" clauses into - * the final one, using this formula: - * - * P(a,b) = P(a) * (f + (1-f) * P(b)) - * - * where 'f' is the degree of validity of the dependency. 
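
Plugging in some assumed numbers makes the behaviour of this formula at the extremes visible: with per-clause selectivities of 0.02 each, a degree of 0 reproduces the independence assumption, while a degree of 1 collapses the estimate to the implying clause alone. None of the figures below come from the patch.

    #include <stdio.h>

    int main(void)
    {
        /* Assumed per-clause selectivities (1 in 50 distinct values each) and
         * assumed dependency degrees. */
        double p_a = 0.02, p_b = 0.02;
        double f;

        for (f = 0.0; f <= 1.0; f += 0.5)
            printf("degree %.1f -> P(a,b) = %g\n", f, p_a * (f + (1.0 - f) * p_b));
        /* degree 0.0 -> 0.0004 (independent), 0.5 -> 0.0102, 1.0 -> 0.02 (fully implied) */
        return 0;
    }
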
- */ - s1 *= (dependency->degree + (1 - dependency->degree) * s2); - } - - pfree(dependencies); - pfree(list_attnums); - - return s1; + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses) +{ + Selectivity s1 = 1.0; + ListCell *l; + Bitmapset *clauses_attnums = NULL; + StatisticExtInfo *stat; + MVDependencies *dependencies; + AttrNumber *list_attnums; + int listidx; + + /* check if there's any stats that might be useful for us. */ + if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) + return 1.0; + + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * + list_length(clauses)); + + /* + * Pre-process the clauses list to extract the attnums seen in each item. + * We need to determine if there's any clauses which will be useful for + * dependency selectivity estimations. Along the way we'll record all of + * the attnums for each clause in a list which we'll reference later so we + * don't need to repeat the same work again. We'll also keep track of all + * attnums seen. + */ + listidx = 0; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + AttrNumber attnum; + +#ifdef __TBASE__ + /* Could eliminated by the prior subset dependency */ + if (bms_is_member(listidx, *estimatedclauses)) + { + list_attnums[listidx] = InvalidAttrNumber; + } +#endif + else if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + else + list_attnums[listidx] = InvalidAttrNumber; + + listidx++; + } + + /* + * If there's not at least two distinct attnums then reject the whole list + * of clauses. We must return 1.0 so the calling function's selectivity is + * unaffected. + */ + if (bms_num_members(clauses_attnums) < 2) + { + pfree(list_attnums); + return 1.0; + } + + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, + STATS_EXT_DEPENDENCIES); + + /* if no matching stats could be found then we've nothing to do */ + if (!stat) + { + pfree(list_attnums); + return 1.0; + } + + /* load the dependency items stored in the statistics object */ + dependencies = statext_dependencies_load(stat->statOid); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + */ + while (true) + { + Selectivity s2 = 1.0; + MVDependency *dependency; + + /* the widest/strongest dependency, fully matched by clauses */ + dependency = find_strongest_dependency(stat, dependencies, + clauses_attnums); + + /* if no suitable dependency was found, we're done */ + if (!dependency) + break; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute - with dependency (a,b => c) we look for clauses + * on 'c'. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause; + + listidx++; + + /* + * Skip incompatible clauses, and ones we've already estimated on. + */ + if (list_attnums[listidx] == InvalidAttrNumber || + bms_is_member(listidx, *estimatedclauses)) + continue; + + /* + * Technically we could find more than one clause for a given + * attnum. Since these clauses must be equality clauses, we choose + * to only take the selectivity estimate from the final clause in + * the list for this attnum. 
If the attnum happens to be compared + * to a different Const in another clause then no rows will match + * anyway. If it happens to be compared to the same Const, then + * ignoring the additional clause is just the thing to do. + */ + if (dependency_implies_attribute(dependency, + list_attnums[listidx])) + { + clause = (Node *) lfirst(l); + + s2 = clause_selectivity(root, clause, varRelid, jointype, + sjinfo); + + /* mark this one as done, so we don't touch it again. */ + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + /* + * Mark that we've got and used the dependency on this clause. + * We'll want to ignore this when looking for the next + * strongest dependency above. + */ + clauses_attnums = bms_del_member(clauses_attnums, + list_attnums[listidx]); + } + } + + /* + * Now factor in the selectivity for all the "implied" clauses into + * the final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. + */ + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + } + + pfree(dependencies); + pfree(list_attnums); + + return s1; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index f0b11dd1..f1346f64 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -40,11 +40,14 @@ */ typedef struct StatExtEntry { - Oid statOid; /* OID of pg_statistic_ext entry */ - char *schema; /* statistics object's schema */ - char *name; /* statistics object's name */ - Bitmapset *columns; /* attribute numbers covered by the object */ - List *types; /* 'char' list of enabled statistic kinds */ + Oid statOid; /* OID of pg_statistic_ext entry */ + char *schema; /* statistics object's schema */ + char *name; /* statistics object's name */ + Bitmapset *columns; /* attribute numbers covered by the object */ + List *types; /* 'char' list of enabled statistic kinds */ +#ifdef __TBASE__ + List *orderedColumns; /* attribute numbers in order of dependency */ +#endif } StatExtEntry; @@ -52,8 +55,11 @@ static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int nvacatts, VacAttrStats **vacatts); static void statext_store(Relation pg_stext, Oid relid, - MVNDistinct *ndistinct, MVDependencies *dependencies, - VacAttrStats **stats); + MVNDistinct *ndistinct, MVDependencies *dependencies, +#ifdef __TBASE__ + MVDependencies *subset, +#endif + VacAttrStats **stats); /* @@ -68,70 +74,83 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, int numrows, HeapTuple *rows, int natts, VacAttrStats **vacattrstats) { - Relation pg_stext; - ListCell *lc; - List *stats; - MemoryContext cxt; - MemoryContext oldcxt; - - cxt = AllocSetContextCreate(CurrentMemoryContext, "stats ext", - ALLOCSET_DEFAULT_SIZES); - oldcxt = MemoryContextSwitchTo(cxt); - - pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock); - stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); - - foreach(lc, stats) - { - StatExtEntry *stat = (StatExtEntry *) lfirst(lc); - MVNDistinct *ndistinct = NULL; - MVDependencies *dependencies = NULL; - VacAttrStats **stats; - ListCell *lc2; - - /* - * Check if we can build these stats based on the column analyzed. If - * not, report this fact (except in autovacuum) and move on. 
- */ - stats = lookup_var_attr_stats(onerel, stat->columns, - natts, vacattrstats); - if (!stats && !IsAutoVacuumWorkerProcess()) - { - ereport(WARNING, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"", - stat->schema, stat->name, - get_namespace_name(onerel->rd_rel->relnamespace), - RelationGetRelationName(onerel)), - errtable(onerel))); - continue; - } - - /* check allowed number of dimensions */ - Assert(bms_num_members(stat->columns) >= 2 && - bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); - - /* compute statistic of each requested type */ - foreach(lc2, stat->types) - { - char t = (char) lfirst_int(lc2); - - if (t == STATS_EXT_NDISTINCT) - ndistinct = statext_ndistinct_build(totalrows, numrows, rows, - stat->columns, stats); - else if (t == STATS_EXT_DEPENDENCIES) - dependencies = statext_dependencies_build(numrows, rows, - stat->columns, stats); - } - - /* store the statistics in the catalog */ - statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats); - } - - heap_close(pg_stext, RowExclusiveLock); - - MemoryContextSwitchTo(oldcxt); - MemoryContextDelete(cxt); + Relation pg_stext; + ListCell *lc; + List *stats; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, "stats ext", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + + pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock); + stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); + + foreach(lc, stats) + { + StatExtEntry *stat = (StatExtEntry *) lfirst(lc); + MVNDistinct *ndistinct = NULL; + MVDependencies *dependencies = NULL; +#ifdef __TBASE__ + MVDependencies *subset = NULL; +#endif + VacAttrStats **stats; + ListCell *lc2; + + /* + * Check if we can build these stats based on the column analyzed. If + * not, report this fact (except in autovacuum) and move on. 
+ */ + stats = lookup_var_attr_stats(onerel, stat->columns, + natts, vacattrstats); + if (!stats && !IsAutoVacuumWorkerProcess()) + { + ereport(WARNING, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"", + stat->schema, stat->name, + get_namespace_name(onerel->rd_rel->relnamespace), + RelationGetRelationName(onerel)), + errtable(onerel))); + continue; + } + + /* check allowed number of dimensions */ + Assert(bms_num_members(stat->columns) >= 2 && + bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); + + /* compute statistic of each requested type */ + foreach(lc2, stat->types) + { + char t = (char) lfirst_int(lc2); + + if (t == STATS_EXT_NDISTINCT) + ndistinct = statext_ndistinct_build(totalrows, numrows, rows, + stat->columns, stats); + else if (t == STATS_EXT_DEPENDENCIES) + dependencies = statext_dependencies_build(numrows, rows, + stat->columns, stats); +#ifdef __TBASE__ + else if (t == STATS_EXT_SUBSET) + subset = statext_subset_build(numrows, stat->orderedColumns); +#endif + } + + /* store the statistics in the catalog */ +#ifdef __TBASE__ + statext_store(pg_stext, stat->statOid, + ndistinct, dependencies, + subset, stats); +#else + statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats); +#endif + } + + heap_close(pg_stext, RowExclusiveLock); + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(cxt); } /* @@ -153,9 +172,15 @@ statext_is_kind_built(HeapTuple htup, char type) attnum = Anum_pg_statistic_ext_stxdependencies; break; - default: - elog(ERROR, "unexpected statistics type requested: %d", type); - } +#ifdef __TBASE__ + case STATS_EXT_SUBSET: + attnum = Anum_pg_statistic_ext_stxsubset; + break; +#endif + + default: + elog(ERROR, "unexpected statistics type requested: %d", type); + } return !heap_attisnull(htup, attnum, NULL); } @@ -165,68 +190,93 @@ statext_is_kind_built(HeapTuple htup, char type) */ static List * fetch_statentries_for_relation(Relation pg_statext, Oid relid) -{// #lizard forgives - SysScanDesc scan; - ScanKeyData skey; - HeapTuple htup; - List *result = NIL; - - /* - * Prepare to scan pg_statistic_ext for entries having stxrelid = this - * rel. 
- */ - ScanKeyInit(&skey, - Anum_pg_statistic_ext_stxrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(relid)); - - scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true, - NULL, 1, &skey); - - while (HeapTupleIsValid(htup = systable_getnext(scan))) - { - StatExtEntry *entry; - Datum datum; - bool isnull; - int i; - ArrayType *arr; - char *enabled; - Form_pg_statistic_ext staForm; - - entry = palloc0(sizeof(StatExtEntry)); - entry->statOid = HeapTupleGetOid(htup); - staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); - entry->schema = get_namespace_name(staForm->stxnamespace); - entry->name = pstrdup(NameStr(staForm->stxname)); - for (i = 0; i < staForm->stxkeys.dim1; i++) - { - entry->columns = bms_add_member(entry->columns, - staForm->stxkeys.values[i]); - } - - /* decode the stxkind char array into a list of chars */ - datum = SysCacheGetAttr(STATEXTOID, htup, - Anum_pg_statistic_ext_stxkind, &isnull); - Assert(!isnull); - arr = DatumGetArrayTypeP(datum); - if (ARR_NDIM(arr) != 1 || - ARR_HASNULL(arr) || - ARR_ELEMTYPE(arr) != CHAROID) - elog(ERROR, "stxkind is not a 1-D char array"); - enabled = (char *) ARR_DATA_PTR(arr); - for (i = 0; i < ARR_DIMS(arr)[0]; i++) - { - Assert((enabled[i] == STATS_EXT_NDISTINCT) || - (enabled[i] == STATS_EXT_DEPENDENCIES)); - entry->types = lappend_int(entry->types, (int) enabled[i]); - } - - result = lappend(result, entry); - } - - systable_endscan(scan); - - return result; +{ + SysScanDesc scan; + ScanKeyData skey; + HeapTuple htup; + List *result = NIL; + + /* + * Prepare to scan pg_statistic_ext for entries having stxrelid = this + * rel. + */ + ScanKeyInit(&skey, + Anum_pg_statistic_ext_stxrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + StatExtEntry *entry; + Datum datum; + bool isnull; + int i; + ArrayType *arr; + char *enabled; + Form_pg_statistic_ext staForm; +#ifdef __TBASE__ + bool need_column_order = false; +#endif + + entry = palloc0(sizeof(StatExtEntry)); + entry->statOid = HeapTupleGetOid(htup); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + entry->schema = get_namespace_name(staForm->stxnamespace); + entry->name = pstrdup(NameStr(staForm->stxname)); + for (i = 0; i < staForm->stxkeys.dim1; i++) + { + entry->columns = bms_add_member(entry->columns, + staForm->stxkeys.values[i]); + } + + /* decode the stxkind char array into a list of chars */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxkind, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "stxkind is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + Assert((enabled[i] == STATS_EXT_NDISTINCT) || + (enabled[i] == STATS_EXT_DEPENDENCIES) || + (enabled[i] == STATS_EXT_SUBSET)); + entry->types = lappend_int(entry->types, (int) enabled[i]); +#ifdef __TBASE__ + + if (enabled[i] == STATS_EXT_SUBSET) + { + /* Currently we only support subset of two columns */ + Assert(staForm->stxkeys.dim1 == 2); + + /* Order of column defined indicates the subset relation */ + need_column_order = true; + } + } + + /* Build the list of columns with the original order */ + if (need_column_order) + { + for (i = 0; i < staForm->stxkeys.dim1; i++) + { + entry->orderedColumns = lappend_int(entry->orderedColumns, + 
staForm->stxkeys.values[i]); + } +#endif + } + + result = lappend(result, entry); + } + + systable_endscan(scan); + + return result; } /* @@ -291,57 +341,73 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, */ static void statext_store(Relation pg_stext, Oid statOid, - MVNDistinct *ndistinct, MVDependencies *dependencies, - VacAttrStats **stats) + MVNDistinct *ndistinct, MVDependencies *dependencies, +#ifdef __TBASE__ + MVDependencies *subset, +#endif + VacAttrStats **stats) { - HeapTuple stup, - oldtup; - Datum values[Natts_pg_statistic_ext]; - bool nulls[Natts_pg_statistic_ext]; - bool replaces[Natts_pg_statistic_ext]; - - memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); - memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); - memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); - - /* - * Construct a new pg_statistic_ext tuple, replacing the calculated stats. - */ - if (ndistinct != NULL) - { - bytea *data = statext_ndistinct_serialize(ndistinct); - - nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL); - values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data); - } - - if (dependencies != NULL) - { - bytea *data = statext_dependencies_serialize(dependencies); - - nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL); - values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data); - } - - /* always replace the value (either by bytea or NULL) */ - replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true; - replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true; - - /* there should already be a pg_statistic_ext tuple */ - oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); - if (!HeapTupleIsValid(oldtup)) - elog(ERROR, "cache lookup failed for statistics object %u", statOid); - - /* replace it */ - stup = heap_modify_tuple(oldtup, - RelationGetDescr(pg_stext), - values, - nulls, - replaces); - ReleaseSysCache(oldtup); - CatalogTupleUpdate(pg_stext, &stup->t_self, stup); - - heap_freetuple(stup); + HeapTuple stup, + oldtup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + bool replaces[Natts_pg_statistic_ext]; + + memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); + memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); + memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); + + /* + * Construct a new pg_statistic_ext tuple, replacing the calculated stats. 
+ */ + if (ndistinct != NULL) + { + bytea *data = statext_ndistinct_serialize(ndistinct); + + nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data); + } + + if (dependencies != NULL) + { + bytea *data = statext_dependencies_serialize(dependencies); + + nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data); + } + +#ifdef __TBASE__ + if (subset != NULL) + { + bytea *data = statext_dependencies_serialize(subset); + + nulls[Anum_pg_statistic_ext_stxsubset - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxsubset - 1] = PointerGetDatum(data); + } +#endif + + /* always replace the value (either by bytea or NULL) */ + replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true; + replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true; +#ifdef __TBASE__ + replaces[Anum_pg_statistic_ext_stxsubset - 1] = true; +#endif + + /* there should already be a pg_statistic_ext tuple */ + oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!HeapTupleIsValid(oldtup)) + elog(ERROR, "cache lookup failed for statistics object %u", statOid); + + /* replace it */ + stup = heap_modify_tuple(oldtup, + RelationGetDescr(pg_stext), + values, + nulls, + replaces); + ReleaseSysCache(oldtup); + CatalogTupleUpdate(pg_stext, &stup->t_self, stup); + + heap_freetuple(stup); } /* initialize multi-dimensional sort */ diff --git a/src/backend/statistics/subset.c b/src/backend/statistics/subset.c new file mode 100644 index 00000000..1bac5b9a --- /dev/null +++ b/src/backend/statistics/subset.c @@ -0,0 +1,360 @@ +/*------------------------------------------------------------------------- + * + * subset.c + * POSTGRES user defined column correlationship + * + * Portions Copyright (c) 2020-Present, TBase Development Team, Tencent + * + * IDENTIFICATION + * src/backend/statistics/knowledge.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_statistic_ext.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +static bool subset_is_compatible_clause(Node *clause, Index relid, + AttrNumber *attnum); +static bool subset_implies_attribute(MVDependency *dependency, + AttrNumber attnum); + +/* + * Build subset dependencies between groups of columns + */ +MVDependencies * +statext_subset_build(int numrows, List *columns) +{ + int i; + int k; + + /* result */ + MVDependencies *dependencies = NULL; + MVDependency *d; + + /* Currently, we only support subset defined with 2 columns */ + Assert(list_length(columns) == 2); + k = list_length(columns); + + /* initialize the list of dependencies */ + dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); + + dependencies->magic = STATS_DEPS_MAGIC; + dependencies->type = STATS_DEPS_TYPE_BASIC; + dependencies->ndeps = 1; + + dependencies = (MVDependencies *) repalloc(dependencies, + offsetof(MVDependencies, deps) + + dependencies->ndeps * sizeof(MVDependency)); + + d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) + + k * sizeof(AttrNumber)); + d->degree = 1.0; + d->nattributes = k; + for (i = 0; i < k; i++) + { + d->attributes[i] = list_nth_int(columns, i); + } + + dependencies->deps[0] = d; + 
+ return dependencies; +} + +/* + * statext_subset_load + * Load the subset dependency for the indicated pg_statistic_ext tuple + */ +MVDependencies * +statext_subset_load(Oid mvoid) +{ + bool isnull; + Datum deps; + HeapTuple htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); + + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", mvoid); + + deps = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxsubset, &isnull); + Assert(!isnull); + + ReleaseSysCache(htup); + + /* Reuse the functional dependencies deserialize function */ + return statext_dependencies_deserialize(DatumGetByteaP(deps)); +} + +/* + * subset_is_compatible_clause + * Determines if the clause is compatible with subset dependencies + * + * When returning True attnum is set to the attribute number of the Var within + * the supported clause. Comparing to dependencies compatibility check, subset + * is less restrictive. + */ +static bool +subset_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) +{ + RestrictInfo *rinfo = (RestrictInfo *) clause; + + if (!IsA(rinfo, RestrictInfo)) + return false; + + /* Pseudoconstants are not really interesting here. */ + if (rinfo->pseudoconstant) + return false; + + /* clauses referencing multiple varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + if (is_opclause(rinfo->clause)) + { + OpExpr *expr = (OpExpr *) rinfo->clause; + Var *var; + bool varonleft = true; + bool ok; + + /* Only expressions with two arguments are considered compatible. */ + if (list_length(expr->args) != 2) + return false; + + /* see if it actually has the right */ + ok = (NumRelids((Node *) expr) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + + /* unsupported structure (two variables or so) */ + if (!ok) + return false; + + var = (varonleft) ? linitial(expr->args) : lsecond(expr->args); + + /* in case it's a T_RelableType */ + if (IsA(var, RelabelType)) + var = (Var *) ((RelabelType *) var)->arg; + + /* We only support plain Vars for now */ + if (!IsA(var, Var)) + return false; + + /* Ensure var is from the correct relation */ + if (var->varno != relid) + return false; + + /* we also better ensure the Var is from the current level */ + if (var->varlevelsup > 0) + return false; + + /* Also skip system attributes (we don't allow stats on those). */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + + *attnum = var->varattno; + return true; + } + + return false; +} + +/* + * subset_eliminate_attribute + * check that the attnum matches is implied by the subset dependency + */ +static bool +subset_implies_attribute(MVDependency *dependency, AttrNumber attnum) +{ + if (attnum == dependency->attributes[dependency->nattributes - 1]) + return true; + + return false; +} + +/* + * subset_clauselist_selectivity + * Return the estimated selectivity of the given clauses using + * functional dependency statistics, or 1.0 if no useful functional + * dependency statistic exists. + * + * 'estimatedclauses' is an output argument that gets a bit set corresponding + * to the (zero-based) list index of clauses that are included in the + * estimated selectivity. + * + * Given equality clauses on attributes (a,b) we find the strongest dependency + * between them, i.e. either (a=>b) or (b=>a). 
Assuming (a=>b) is the selected + * dependency, we then combine the per-clause selectivities using the formula + */ +Selectivity +subset_clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses) +{ + Selectivity s1 = 1.0; + ListCell *l; + Bitmapset *clauses_attnums = NULL; + StatisticExtInfo *stat; + MVDependencies *dependencies; + AttrNumber *list_attnums; + int listidx; + + /* check if there's any stats that might be useful for us. */ + if (!has_stats_of_kind(rel->statlist, STATS_EXT_SUBSET)) + return 1.0; + + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * + list_length(clauses)); + + /* + * Pre-process the clauses list to extract the attnums seen in each item. + * We need to determine if there's any clauses which will be useful for + * subset selectivity elimination. Along the way we'll record all of + * the attnums for each clause in a list which we'll reference later so we + * don't need to repeat the same work again. We'll also keep track of all + * attnums seen. + */ + listidx = 0; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + AttrNumber attnum; + + if (subset_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + else + list_attnums[listidx] = InvalidAttrNumber; + + listidx++; + } + + /* + * If there's not at least two distinct attnums then reject the whole list + * of clauses. We must return 1.0 so the calling function's selectivity is + * unaffected. + */ + if (bms_num_members(clauses_attnums) < 2) + { + pfree(list_attnums); + return 1.0; + } + + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, + STATS_EXT_SUBSET); + + /* if no matching stats could be found then we've nothing to do */ + if (!stat) + { + pfree(list_attnums); + return 1.0; + } + + /* + * Load the dependency items stored in the statistics object. + */ + dependencies = statext_subset_load(stat->statOid); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + * + * Actually, for subset dependency, there should be only one dependency + * entry. But we still keep the while loop style align with normal + * dependency selectivity calculation does, to get better support for + * possible future enhancements. + */ + do + { + Selectivity s2 = 1.0; + MVDependency *dependency; + + /* There is only one dependency to indicate the subset relation */ + Assert(dependencies->ndeps == 1); + dependency = dependencies->deps[0]; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute - with dependency (a,b => c) we look for clauses + * on 'c'. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause; + + listidx++; + + /* + * Skip incompatible clauses, and ones we've already estimated on. + */ + if (list_attnums[listidx] == InvalidAttrNumber || + bms_is_member(listidx, *estimatedclauses)) + continue; + + /* + * Technically we could find more than one clause for a given + * attnum. Since these clauses must be equality clauses, we choose + * to only take the selectivity estimate from the final clause in + * the list for this attnum. 
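
To make the elimination concrete: the subset hint is stored as a single MVDependency whose attributes keep the declared column order and whose degree is pinned to 1.0, so the clause on the implied (last) column contributes a factor of exactly 1.0. A cut-down sketch with numbers matching the regression test added below; the LIKE selectivity is an assumption.

    #include <stdio.h>

    typedef struct
    {
        double degree;
        int    nattributes;
        int    attributes[2];
    } MiniDependency;            /* cut-down stand-in for MVDependency */

    int main(void)
    {
        /* CREATE STATISTICS ... (subset) ON c, b keeps the declared order, so
         * the last attribute (b) is the implied one and the degree is 1.0.
         * Attnums 6 and 4 correspond to c and b in the test table. */
        MiniDependency dep = { 1.0, 2, { 6 /* c */, 4 /* b */ } };
        double s1 = 1.0;
        double sel_c = 1.0 / 50.0;   /* c = 1 */
        double sel_b = 1.0 / 10.0;   /* b LIKE '%_1', assumed estimate */
        int    clause_attnum = 4;    /* the clause on b */

        if (clause_attnum == dep.attributes[dep.nattributes - 1])
        {
            /* implied clause: degree + (1 - degree) * sel_b == 1.0, i.e. dropped */
            s1 *= dep.degree + (1.0 - dep.degree) * sel_b;
        }
        s1 *= sel_c;                 /* the implying clause is estimated normally */

        printf("selectivity = %g\n", s1);   /* 0.02 -> ~100 of 5000 rows */
        return 0;
    }

With the clause on b absorbed, the estimate is driven by c = 1 alone, which is why the plans in the new stats_ext expected output report 100 rows instead of 10.
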
If the attnum happens to be compared + * to a different Const in another clause then no rows will match + * anyway. If it happens to be compared to the same Const, then + * ignoring the additional clause is just the thing to do. + */ + if (subset_implies_attribute(dependency, list_attnums[listidx])) + { + clause = (Node *) lfirst(l); + + s2 = clause_selectivity(root, clause, varRelid, jointype, + sjinfo); + + /* mark this one as done, so we don't touch it again. */ + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + /* + * Mark that we've got and used the dependency on this clause. + * We'll want to ignore this when looking for the next + * strongest dependency above. + */ + clauses_attnums = bms_del_member(clauses_attnums, + list_attnums[listidx]); + } + } + + /* + * Now factor in the selectivity for all the "implied" clauses into + * the final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. + * + * Currently, the subset statistic can only eliminate the implied + * clause by forcing dependency degree to 1.0. + */ + Assert(dependency->degree == 1.0); + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + } while(0); + + pfree(dependencies); + pfree(list_attnums); + + return s1; +} diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h index 108944f7..7dc60359 100644 --- a/src/include/catalog/pg_statistic_ext.h +++ b/src/include/catalog/pg_statistic_ext.h @@ -45,10 +45,13 @@ CATALOG(pg_statistic_ext,3381) int2vector stxkeys; /* array of column keys */ #ifdef CATALOG_VARLEN - char stxkind[1] BKI_FORCE_NOT_NULL; /* statistic types requested - * to build */ - pg_ndistinct stxndistinct; /* ndistinct coefficients (serialized) */ - pg_dependencies stxdependencies; /* dependencies (serialized) */ + char stxkind[1] BKI_FORCE_NOT_NULL; /* statistic types requested + * to build */ + pg_ndistinct stxndistinct; /* ndistinct coefficients (serialized) */ + pg_dependencies stxdependencies; /* dependencies (serialized) */ +#ifdef __TBASE__ + pg_dependencies stxsubset; /* subset (serialized) */ +#endif #endif } FormData_pg_statistic_ext; @@ -64,17 +67,27 @@ typedef FormData_pg_statistic_ext *Form_pg_statistic_ext; * compiler constants for pg_statistic_ext * ---------------- */ -#define Natts_pg_statistic_ext 8 -#define Anum_pg_statistic_ext_stxrelid 1 -#define Anum_pg_statistic_ext_stxname 2 -#define Anum_pg_statistic_ext_stxnamespace 3 -#define Anum_pg_statistic_ext_stxowner 4 -#define Anum_pg_statistic_ext_stxkeys 5 -#define Anum_pg_statistic_ext_stxkind 6 -#define Anum_pg_statistic_ext_stxndistinct 7 -#define Anum_pg_statistic_ext_stxdependencies 8 +#ifdef __TBASE__ +#define Natts_pg_statistic_ext 9 +#else +#define Natts_pg_statistic_ext 8 +#endif +#define Anum_pg_statistic_ext_stxrelid 1 +#define Anum_pg_statistic_ext_stxname 2 +#define Anum_pg_statistic_ext_stxnamespace 3 +#define Anum_pg_statistic_ext_stxowner 4 +#define Anum_pg_statistic_ext_stxkeys 5 +#define Anum_pg_statistic_ext_stxkind 6 +#define Anum_pg_statistic_ext_stxndistinct 7 +#define Anum_pg_statistic_ext_stxdependencies 8 +#ifdef __TBASE__ +#define Anum_pg_statistic_ext_stxsubset 9 +#endif -#define STATS_EXT_NDISTINCT 'd' -#define STATS_EXT_DEPENDENCIES 'f' +#define STATS_EXT_NDISTINCT 'd' +#define STATS_EXT_DEPENDENCIES 'f' +#ifdef __TBASE__ +#define STATS_EXT_SUBSET 's' +#endif #endif /* PG_STATISTIC_EXT_H */ diff --git a/src/include/statistics/extended_stats_internal.h 
b/src/include/statistics/extended_stats_internal.h index ad0d6872..3dbf9e9e 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -53,7 +53,10 @@ extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); extern MVNDistinct *statext_ndistinct_deserialize(bytea *data); extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); + Bitmapset *attrs, VacAttrStats **stats); +#ifdef __TBASE__ +extern MVDependencies *statext_subset_build(int numrows, List *columns); +#endif extern bytea *statext_dependencies_serialize(MVDependencies *dependencies); extern MVDependencies *statext_dependencies_deserialize(bytea *data); diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index 47b59887..e6923113 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -80,18 +80,30 @@ typedef struct MVDependencies extern MVNDistinct *statext_ndistinct_load(Oid mvoid); extern MVDependencies *statext_dependencies_load(Oid mvoid); +#ifdef __TBASE__ +extern MVDependencies *statext_subset_load(Oid mvoid); +#endif extern void BuildRelationExtStatistics(Relation onerel, double totalrows, int numrows, HeapTuple *rows, int natts, VacAttrStats **vacattrstats); extern bool statext_is_kind_built(HeapTuple htup, char kind); extern Selectivity dependencies_clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - RelOptInfo *rel, - Bitmapset **estimatedclauses); + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses); +#ifdef __TBASE__ +extern Selectivity subset_clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses); +#endif extern bool has_stats_of_kind(List *stats, char requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind); diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 3581037d..ca7aba0a 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -656,4 +656,99 @@ EXPLAIN (COSTS OFF) Index Cond: ((a = 1) AND (b = '1'::text)) (7 rows) +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); +ANALYZE subset; +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; +-- the 
selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +ERROR: subset statistics require exactly 2 columns +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + RESET random_page_cost; diff --git a/src/test/regress/expected/stats_ext_3.out b/src/test/regress/expected/stats_ext_3.out index e69852b6..b0f2e1c4 100644 --- a/src/test/regress/expected/stats_ext_3.out +++ b/src/test/regress/expected/stats_ext_3.out @@ -668,4 +668,99 @@ EXPLAIN (COSTS OFF) Index Cond: ((a = 1) AND (b = '1'::text)) (7 rows) +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); +ANALYZE subset; +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + 
count +------- + 100 +(1 row) + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; +-- the selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +ERROR: subset statistics require exactly 2 columns +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + RESET random_page_cost; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 221db426..a4b91e0c 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -297,4 +297,51 @@ ANALYZE functional_dependencies; EXPLAIN (COSTS OFF) SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); + +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); + +ANALYZE subset; + +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; + +-- the selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; +SELECT count(*) FROM 
subset WHERE b = 'prefix_1' and c = 1; + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; + +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + RESET random_page_cost; From 92aba29af4c8dc7ce7d6a45a25f00bbd5151b071 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 26 Aug 2020 14:56:05 +0800 Subject: [PATCH 027/578] Set keepalive, user_timeout, and connect_timeout in pooler --- src/backend/libpq/pqcomm.c | 58 ++++++++++ src/backend/pgxc/pool/pgxcnode.c | 21 ++-- src/backend/pgxc/pool/poolmgr.c | 192 ++++++++++++++++--------------- src/include/libpq/libpq-be.h | 4 +- 4 files changed, 171 insertions(+), 104 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 6834a8c0..ca926c8c 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2009,3 +2009,61 @@ pq_setkeepalivescount(int count, Port *port) return STATUS_OK; } + +/* + * Set socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. + */ +void +SetSockKeepAlive(int sock) +{ + int keepalive = 1; + /* user_timeout in ms */ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? + 0 : tcp_keepalives_idle * (uint32)1000; + struct tcp_info info; + int len = sizeof(info); + /* check sock */ + getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (info.tcpi_state != TCP_ESTABLISHED) + { + return; + } + + /* set keepalive */ + if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + } + if (tcp_keepalives_idle > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&tcp_keepalives_idle, + sizeof(tcp_keepalives_idle)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + } + if (tcp_keepalives_interval > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&tcp_keepalives_interval, + sizeof(tcp_keepalives_interval)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + } + if (tcp_keepalives_count > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&tcp_keepalives_count, + sizeof(tcp_keepalives_count)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + } + + /* set user_timeout */ + if (user_timeout > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, + sizeof(user_timeout)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + } +} \ No newline at end of file diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 4279a325..7bea908f 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -489,14 +489,19 @@ PGXCNodeConnStr(char *host, int port, char *dbname, #ifdef _MLS_ } #endif - /* Check for overflow */ - if (num > 0 && num < sizeof(connstr)) - { - /* Output result */ - out = (char *) palloc(num + 1); - strcpy(out, connstr); - return out; - } + if (tcp_keepalives_idle > 0) + { + num 
+= snprintf(connstr + num, sizeof(connstr) - num, + " connect_timeout=%d", tcp_keepalives_idle); + } + /* Check for overflow */ + if (num > 0 && num < sizeof(connstr)) + { + /* Output result */ + out = (char *) palloc(num + 1); + strcpy(out, connstr); + return out; + } /* return NULL if we have problem */ return NULL; diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 1f4bb7b9..dd575f76 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -6986,24 +6986,24 @@ preconnect_and_warm(DatabasePool *dbPool) return false; } - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - - - /* Increase count of pool size */ - nodePool->slot[nodePool->freeSize] = slot; - - /* Insert at the end of the pool */ - IncreasePoolerSize(nodePool, __FILE__, __LINE__); - IncreasePoolerFreesize(nodePool,__FILE__,__LINE__); - slot->released = time(NULL); - slot->checked = slot->released; - slot->created = slot->released; - slot->node_name = nodePool->node_name; - slot->backend_pid = ((PGconn *) slot->conn)->be_pid; - if (dbPool->oldest_idle == (time_t) 0) - { - dbPool->oldest_idle = slot->released; - } + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + + /* Increase count of pool size */ + nodePool->slot[nodePool->freeSize] = slot; + + /* Insert at the end of the pool */ + IncreasePoolerSize(nodePool, __FILE__, __LINE__); + IncreasePoolerFreesize(nodePool,__FILE__,__LINE__); + slot->released = time(NULL); + slot->checked = slot->released; + slot->created = slot->released; + slot->node_name = nodePool->node_name; + slot->backend_pid = ((PGconn *) slot->conn)->be_pid; + if (dbPool->oldest_idle == (time_t) 0) + { + dbPool->oldest_idle = slot->released; + } if (PoolConnectDebugPrint) { @@ -7062,55 +7062,56 @@ void *pooler_async_connection_management_thread(void *arg) PGXCPoolConnectReq *request = NULL; PGXCNodePoolSlot *slot = NULL; - threadIndex = ((PGXCPoolConnThreadParam*)arg)->threadIndex; - while (1) - { - /* wait for signal */ - ThreadSemaDown(&g_PoolConnControl.sem[threadIndex]); - - /* create connect as needed */ - request = (PGXCPoolConnectReq*)PipeGet(g_PoolConnControl.request[threadIndex]); - if (request) - { - /* record status of the task */ - pooler_async_task_start(&g_PoolConnControl, threadIndex, request->nodeindex, NULL, InvalidOid, request->cmd); - - switch (request->cmd) - { - case COMMAND_CONNECTION_BUILD: - { - for (i = 0; i < request->size; i++, request->validSize++) - { - slot = &request->slot[i]; - /* If connection fails, be sure that slot is destroyed cleanly */ - slot->xc_cancelConn = NULL; - - /* Establish connection */ - slot->conn = PGXCNodeConnectBarely(request->connstr); - if (!PGXCNodeConnected(slot->conn)) - { - request->failed = true; - break; - } - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - slot->bwarmed = false; - } - break; - } - - case COMMAND_CONNECTION_CLOSE: - { - PQfreeCancel((PGcancel *)request->slot[0].xc_cancelConn); - PGXCNodeClose(request->slot[0].conn); - break; - } - - default: - { - /* should never happen */ - abort(); - } - } + threadIndex = ((PGXCPoolConnThreadParam*)arg)->threadIndex; + while (1) + { + /* wait for signal */ + ThreadSemaDown(&g_PoolConnControl.sem[threadIndex]); + + /* create connect as needed */ + request = (PGXCPoolConnectReq*)PipeGet(g_PoolConnControl.request[threadIndex]); + if (request) + { + /* record status of the task */ + 
pooler_async_task_start(&g_PoolConnControl, threadIndex, request->nodeindex, NULL, InvalidOid, request->cmd); + + switch (request->cmd) + { + case COMMAND_CONNECTION_BUILD: + { + for (i = 0; i < request->size; i++, request->validSize++) + { + slot = &request->slot[i]; + /* If connection fails, be sure that slot is destroyed cleanly */ + slot->xc_cancelConn = NULL; + + /* Establish connection */ + slot->conn = PGXCNodeConnectBarely(request->connstr); + if (!PGXCNodeConnected(slot->conn)) + { + request->failed = true; + break; + } + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + slot->bwarmed = false; + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + } + break; + } + + case COMMAND_CONNECTION_CLOSE: + { + PQfreeCancel((PGcancel *)request->slot[0].xc_cancelConn); + PGXCNodeClose(request->slot[0].conn); + break; + } + + default: + { + /* should never happen */ + abort(); + } + } /* clear the work status */ pooler_async_task_done(&g_PoolConnControl, threadIndex); @@ -7357,34 +7358,35 @@ void *pooler_sync_remote_operator_thread(void *arg) request->nodepool->connstr); SpinLockRelease(&request->agent->port.lock); #endif - set_task_status(request->taskControl, PoolAyncCtlStaus_error); - finish_task_request(request->taskControl); - break; - } - - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - slot->bwarmed = false; - - /* set the time flags */ - slot->released = time(NULL); - slot->checked = slot->released; - slot->created = slot->released; - - /* increase usecount */ - slot->usecount++; - slot->node_name = request->nodepool->node_name; - slot->backend_pid = ((PGconn *) slot->conn)->be_pid; - if (request->bCoord) - { - request->agent->coord_connections[request->nodeindex] = slot; - } - else - { - request->agent->dn_connections[request->nodeindex] = slot; - } - request->current_status = PoolConnectStaus_connected; -#ifdef _POOLER_CHECK_ - snprintf(request->errmsg, POOLER_ERROR_MSG_LEN, "parallel connect thread build connection to node:%s backend_pid:%d nodeidx:%d succeed", slot->node_name, slot->backend_pid, request->nodeindex); + set_task_status(request->taskControl, PoolAyncCtlStaus_error); + finish_task_request(request->taskControl); + break; + } + + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + slot->bwarmed = false; + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + + /* set the time flags */ + slot->released = time(NULL); + slot->checked = slot->released; + slot->created = slot->released; + + /* increase usecount */ + slot->usecount++; + slot->node_name = request->nodepool->node_name; + slot->backend_pid = ((PGconn *) slot->conn)->be_pid; + if (request->bCoord) + { + request->agent->coord_connections[request->nodeindex] = slot; + } + else + { + request->agent->dn_connections[request->nodeindex] = slot; + } + request->current_status = PoolConnectStaus_connected; +#ifdef _POOLER_CHECK_ + snprintf(request->errmsg, POOLER_ERROR_MSG_LEN, "parallel connect thread build connection to node:%s backend_pid:%d nodeidx:%d succeed", slot->node_name, slot->backend_pid, request->nodeindex); #endif continue; } diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 42b96486..474d9690 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -287,4 +287,6 @@ extern int pq_setkeepalivesidle(int idle, Port *port); extern int pq_setkeepalivesinterval(int interval, Port *port); extern int pq_setkeepalivescount(int count, Port *port); -#endif /* LIBPQ_BE_H */ +extern void 
SetSockKeepAlive(int sock);
+
+#endif							/* LIBPQ_BE_H */

From 5ab4c78bd9f89ce11b7d740ea44f83dc1d375a8b Mon Sep 17 00:00:00 2001
From: qiannzhang
Date: Fri, 28 Aug 2020 15:46:11 +0800
Subject: [PATCH 028/578] ID81500043: also check xmin if tmin is invalid

---
 src/backend/access/transam/twophase.c | 46 +++++++++++++++------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 204e9edd..28197d77 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -436,27 +436,31 @@ EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit)
 }
 
 #ifdef __TBASE_DEBUG__
-    if(enable_distri_print)
-    {
-        InsertPreparedXid(pgxact->xid, GetGlobalPrepareTimestamp());
-    }
-#endif
-
-    // SetGlobalPrepareTimestamp(InvalidGlobalTimestamp);
-    /*
-     * Transfer the tmin to the prepared proc without locking.
-     * As the prepare xact procs lie behind the normal procs in proc array,
-     * Get Snapshot would not miss the tmin even when it is being transferred.
-     */
-    pg_atomic_write_u64(&pgxact->tmin, pg_atomic_read_u64(&MyPgXact->tmin));
-    if(!GlobalTimestampIsValid(pg_atomic_read_u64(&MyPgXact->tmin)))
-    {
-        elog(LOG,
-             "prepare transaction %d does not have valid tmin. autovacuum %d",
-             MyPgXact->xid, IsAutoVacuumWorkerProcess());
-    }
-
-
+    if(enable_distri_print)
+    {
+        InsertPreparedXid(pgxact->xid, GetGlobalPrepareTimestamp());
+    }
+#endif
+
+    // SetGlobalPrepareTimestamp(InvalidGlobalTimestamp);
+    /*
+     * Transfer the tmin to the prepared proc without locking.
+     * As the prepare xact procs lie behind the normal procs in proc array,
+     * Get Snapshot would not miss the tmin even when it is being transferred.
+     *
+     * According to PortalRunUtility, we do not set a snapshot if the
+     * transaction only contains utilities that do not need one. In that
+     * case xmin and tmin are both invalid, since both are set from the
+     * snapshot. So if xmin is valid, tmin should also be valid.
+     */
+    pg_atomic_write_u64(&pgxact->tmin, pg_atomic_read_u64(&MyPgXact->tmin));
+    if(!GlobalTimestampIsValid(pg_atomic_read_u64(&MyPgXact->tmin)) &&
+       TransactionIdIsValid(MyPgXact->xmin))
+    {
+        elog(LOG,
+             "prepare transaction %d does not have valid tmin. autovacuum %d",
+             MyPgXact->xid, IsAutoVacuumWorkerProcess());
+    }
 }

From be971c2fa1438411e14910ed1346014a4d1d18f0 Mon Sep 17 00:00:00 2001
From: youngxie
Date: Fri, 7 Aug 2020 15:46:31 +0800
Subject: [PATCH 029/578] Support sublink pull-up in the targetlist. A
 subquery in the targetlist has scalar semantics, so a normal join would
 simply generate duplicated result tuples. We therefore add a new join type,
 JOIN_LEFT_SCALAR, which acts like a left join but reports an error when the
 scalar semantics are violated.
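
For illustration only (this note is not part of the original commit), a minimal
sketch of the behavior this join type enforces, assuming hypothetical tables
t1(a) and t2(a, b):

    -- The scalar sublink in the targetlist can now be pulled up into a
    -- JOIN_LEFT_SCALAR join instead of being executed as a correlated subplan:
    SELECT t1.a,
           (SELECT t2.b FROM t2 WHERE t2.a = t1.a) AS b
    FROM t1;
    -- If more than one t2 row matches a given t1 row, the executor raises:
    --   ERROR:  more than one row returned by a subquery used as an expression

Unmatched outer rows still produce a NULL on the inner side, as with a plain
left join.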
--- src/backend/commands/explain.c | 501 +++--- src/backend/executor/nodeHashjoin.c | 501 +++--- src/backend/executor/nodeMergejoin.c | 1803 +++++++++---------- src/backend/executor/nodeNestloop.c | 373 ++-- src/backend/optimizer/path/allpaths.c | 60 +- src/backend/optimizer/path/costsize.c | 1968 +++++++++++---------- src/backend/optimizer/path/indxpath.c | 46 +- src/backend/optimizer/path/joinpath.c | 733 ++++---- src/backend/optimizer/path/joinrels.c | 933 +++++----- src/backend/optimizer/plan/initsplan.c | 1451 +++++++-------- src/backend/optimizer/plan/setrefs.c | 225 +-- src/backend/optimizer/plan/subselect.c | 127 +- src/backend/optimizer/prep/prepjointree.c | 1118 ++++++------ src/backend/optimizer/util/pathnode.c | 176 +- src/backend/utils/adt/network_selfuncs.c | 91 +- src/backend/utils/adt/selfuncs.c | 101 +- src/include/nodes/nodes.h | 83 +- src/include/optimizer/subselect.h | 3 +- src/test/regress/expected/subselect.out | 334 +++- src/test/regress/sql/subselect.sql | 80 + 20 files changed, 5729 insertions(+), 4978 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0c22d92b..d49eebc8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1312,255 +1312,260 @@ ExplainNode(PlanState *planstate, List *ancestors, } break; #endif /* XCP */ - case T_IndexScan: - { - IndexScan *indexscan = (IndexScan *) plan; - - ExplainIndexScanDetails(indexscan->indexid, - indexscan->indexorderdir, - es); - ExplainScanTarget((Scan *) indexscan, es); - } - break; - case T_IndexOnlyScan: - { - IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; - - ExplainIndexScanDetails(indexonlyscan->indexid, - indexonlyscan->indexorderdir, - es); - ExplainScanTarget((Scan *) indexonlyscan, es); - } - break; - case T_BitmapIndexScan: - { - BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; - const char *indexname = - explain_get_index_name(bitmapindexscan->indexid); - - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfo(es->str, " on %s", indexname); - else - ExplainPropertyText("Index Name", indexname, es); - } - break; - case T_ModifyTable: - ExplainModifyTarget((ModifyTable *) plan, es); - break; - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - { - const char *jointype; - - switch (((Join *) plan)->jointype) - { - case JOIN_INNER: - jointype = "Inner"; - break; - case JOIN_LEFT: - jointype = "Left"; - break; - case JOIN_FULL: - jointype = "Full"; - break; - case JOIN_RIGHT: - jointype = "Right"; - break; - case JOIN_SEMI: - jointype = "Semi"; - break; - case JOIN_ANTI: - jointype = "Anti"; - break; - default: - jointype = "???"; - break; - } - if (es->format == EXPLAIN_FORMAT_TEXT) - { - /* - * For historical reasons, the join type is interpolated - * into the node type name... 
- */ - if (((Join *) plan)->jointype != JOIN_INNER) - appendStringInfo(es->str, " %s Join", jointype); - else if (!IsA(plan, NestLoop)) - appendStringInfoString(es->str, " Join"); - } - else - ExplainPropertyText("Join Type", jointype, es); - } - break; - case T_SetOp: - { - const char *setopcmd; - - switch (((SetOp *) plan)->cmd) - { - case SETOPCMD_INTERSECT: - setopcmd = "Intersect"; - break; - case SETOPCMD_INTERSECT_ALL: - setopcmd = "Intersect All"; - break; - case SETOPCMD_EXCEPT: - setopcmd = "Except"; - break; - case SETOPCMD_EXCEPT_ALL: - setopcmd = "Except All"; - break; - default: - setopcmd = "???"; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexscan, es); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexonlyscan, es); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = + explain_get_index_name(bitmapindexscan->indexid); + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " on %s", indexname); + else + ExplainPropertyText("Index Name", indexname, es); + } + break; + case T_ModifyTable: + ExplainModifyTarget((ModifyTable *) plan, es); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: + jointype = "Left Scalar"; break; - } - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfo(es->str, " %s", setopcmd); - else - ExplainPropertyText("Command", setopcmd, es); - } - break; - default: - break; - } - - if (es->costs) - { - if (es->format == EXPLAIN_FORMAT_TEXT) - { - appendStringInfo(es->str, " (cost=%.2f..%.2f rows=%.0f width=%d)", - plan->startup_cost, plan->total_cost, - plan->plan_rows, plan->plan_width); - } - else - { - ExplainPropertyFloat("Startup Cost", plan->startup_cost, 2, es); - ExplainPropertyFloat("Total Cost", plan->total_cost, 2, es); - ExplainPropertyFloat("Plan Rows", plan->plan_rows, 0, es); - ExplainPropertyInteger("Plan Width", plan->plan_width, es); - } - } - - /* - * We have to forcibly clean up the instrumentation state because we - * haven't done ExecutorEnd yet. This is pretty grotty ... - * - * Note: contrib/auto_explain could cause instrumentation to be set up - * even though we didn't ask for it here. Be careful not to print any - * instrumentation results the user didn't ask for. But we do the - * InstrEndLoop call anyway, if possible, to reduce the number of cases - * auto_explain has to contend with. 
- */ - if (planstate->instrument) - InstrEndLoop(planstate->instrument); - - if (es->analyze && - planstate->instrument && planstate->instrument->nloops > 0) - { - double nloops = planstate->instrument->nloops; - double startup_sec = 1000.0 * planstate->instrument->startup / nloops; - double total_sec = 1000.0 * planstate->instrument->total / nloops; - double rows = planstate->instrument->ntuples / nloops; - - if (es->format == EXPLAIN_FORMAT_TEXT) - { - if (es->timing) - appendStringInfo(es->str, - " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", - startup_sec, total_sec, rows, nloops); - else - appendStringInfo(es->str, - " (actual rows=%.0f loops=%.0f)", - rows, nloops); - } - else - { - if (es->timing) - { - ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); - ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); - } - ExplainPropertyFloat("Actual Rows", rows, 0, es); - ExplainPropertyFloat("Actual Loops", nloops, 0, es); - } - } - else if (es->analyze) - { - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfoString(es->str, " (never executed)"); - else - { - if (es->timing) - { - ExplainPropertyFloat("Actual Startup Time", 0.0, 3, es); - ExplainPropertyFloat("Actual Total Time", 0.0, 3, es); - } - ExplainPropertyFloat("Actual Rows", 0.0, 0, es); - ExplainPropertyFloat("Actual Loops", 0.0, 0, es); - } - } - - /* in text format, first line ends here */ - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfoChar(es->str, '\n'); - - /* target list */ - if (es->verbose) - show_plan_tlist(planstate, ancestors, es); - - /* unique join */ - switch (nodeTag(plan)) - { - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - /* try not to be too chatty about this in text mode */ - if (es->format != EXPLAIN_FORMAT_TEXT || - (es->verbose && ((Join *) plan)->inner_unique)) - ExplainPropertyBool("Inner Unique", - ((Join *) plan)->inner_unique, - es); - break; - default: - break; - } - - /* quals, sort keys, etc */ - switch (nodeTag(plan)) - { - case T_IndexScan: - show_scan_qual(((IndexScan *) plan)->indexqualorig, - "Index Cond", planstate, ancestors, es); - if (((IndexScan *) plan)->indexqualorig) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexScan *) plan)->indexorderbyorig, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, - planstate, es); - break; - case T_IndexOnlyScan: - show_scan_qual(((IndexOnlyScan *) plan)->indexqual, - "Index Cond", planstate, ancestors, es); - if (((IndexOnlyScan *) plan)->indexqual) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, - planstate, es); - if (es->analyze) - ExplainPropertyLong("Heap Fetches", - ((IndexOnlyScanState *) planstate)->ioss_HeapFetches, es); - break; - case T_BitmapIndexScan: - show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, - "Index Cond", planstate, ancestors, es); - break; +#endif + default: + jointype = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* + * For historical reasons, the join type is interpolated + * into the node type name... 
+ */ + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(es->str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(es->str, " Join"); + } + else + ExplainPropertyText("Join Type", jointype, es); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " %s", setopcmd); + else + ExplainPropertyText("Command", setopcmd, es); + } + break; + default: + break; + } + + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " (cost=%.2f..%.2f rows=%.0f width=%d)", + plan->startup_cost, plan->total_cost, + plan->plan_rows, plan->plan_width); + } + else + { + ExplainPropertyFloat("Startup Cost", plan->startup_cost, 2, es); + ExplainPropertyFloat("Total Cost", plan->total_cost, 2, es); + ExplainPropertyFloat("Plan Rows", plan->plan_rows, 0, es); + ExplainPropertyInteger("Plan Width", plan->plan_width, es); + } + } + + /* + * We have to forcibly clean up the instrumentation state because we + * haven't done ExecutorEnd yet. This is pretty grotty ... + * + * Note: contrib/auto_explain could cause instrumentation to be set up + * even though we didn't ask for it here. Be careful not to print any + * instrumentation results the user didn't ask for. But we do the + * InstrEndLoop call anyway, if possible, to reduce the number of cases + * auto_explain has to contend with. + */ + if (planstate->instrument) + InstrEndLoop(planstate->instrument); + + if (es->analyze && + planstate->instrument && planstate->instrument->nloops > 0) + { + double nloops = planstate->instrument->nloops; + double startup_sec = 1000.0 * planstate->instrument->startup / nloops; + double total_sec = 1000.0 * planstate->instrument->total / nloops; + double rows = planstate->instrument->ntuples / nloops; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + appendStringInfo(es->str, + " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + startup_sec, total_sec, rows, nloops); + else + appendStringInfo(es->str, + " (actual rows=%.0f loops=%.0f)", + rows, nloops); + } + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", rows, 0, es); + ExplainPropertyFloat("Actual Loops", nloops, 0, es); + } + } + else if (es->analyze) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoString(es->str, " (never executed)"); + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", 0.0, 3, es); + ExplainPropertyFloat("Actual Total Time", 0.0, 3, es); + } + ExplainPropertyFloat("Actual Rows", 0.0, 0, es); + ExplainPropertyFloat("Actual Loops", 0.0, 0, es); + } + } + + /* in text format, first line ends here */ + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + + /* target list */ + if (es->verbose) + show_plan_tlist(planstate, ancestors, es); + + /* unique join */ + switch (nodeTag(plan)) + { + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + /* try not to be too chatty about this in text mode */ + if (es->format != EXPLAIN_FORMAT_TEXT || + 
(es->verbose && ((Join *) plan)->inner_unique)) + ExplainPropertyBool("Inner Unique", + ((Join *) plan)->inner_unique, + es); + break; + default: + break; + } + + /* quals, sort keys, etc */ + switch (nodeTag(plan)) + { + case T_IndexScan: + show_scan_qual(((IndexScan *) plan)->indexqualorig, + "Index Cond", planstate, ancestors, es); + if (((IndexScan *) plan)->indexqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexScan *) plan)->indexorderbyorig, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_IndexOnlyScan: + show_scan_qual(((IndexOnlyScan *) plan)->indexqual, + "Index Cond", planstate, ancestors, es); + if (((IndexOnlyScan *) plan)->indexqual) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (es->analyze) + ExplainPropertyLong("Heap Fetches", + ((IndexOnlyScanState *) planstate)->ioss_HeapFetches, es); + break; + case T_BitmapIndexScan: + show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, + "Index Cond", planstate, ancestors, es); + break; #ifdef PGXC case T_RemoteQuery: /* Remote query */ diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 87fde8bd..9f1b7b90 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -410,98 +410,104 @@ ExecHashJoin(PlanState *pstate) } } #endif - /* set up to scan for unmatched inner tuples */ - ExecPrepHashTableForUnmatched(node); - node->hj_JoinState = HJ_FILL_INNER_TUPLES; - } - else - node->hj_JoinState = HJ_NEED_NEW_BATCH; - continue; - } - - econtext->ecxt_outertuple = outerTupleSlot; - node->hj_MatchedOuter = false; - - /* - * Find the corresponding bucket for this tuple in the main - * hash table or skew hash table. - */ - node->hj_CurHashValue = hashvalue; - ExecHashGetBucketAndBatch(hashtable, hashvalue, - &node->hj_CurBucketNo, &batchno); - node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, - hashvalue); - node->hj_CurTuple = NULL; - - /* - * The tuple might not belong to the current batch (where - * "current batch" includes the skew buckets if any). - */ - if (batchno != hashtable->curbatch && - node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) - { - /* - * Need to postpone this outer tuple to a later batch. - * Save it in the corresponding outer-batch file. - */ - Assert(batchno > hashtable->curbatch); - ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot), - hashvalue, - &hashtable->outerBatchFile[batchno]); - /* Loop around, staying in HJ_NEED_NEW_OUTER state */ - continue; - } - - /* OK, let's scan the bucket for matches */ - node->hj_JoinState = HJ_SCAN_BUCKET; - - /* FALL THRU */ - - case HJ_SCAN_BUCKET: - - /* - * Scan the selected hash bucket for matches to current outer - */ - if (!ExecScanHashBucket(node, econtext)) - { - /* out of matches; check for possible outer-join fill */ - node->hj_JoinState = HJ_FILL_OUTER_TUPLE; - continue; - } - - /* - * We've got a match, but still need to test non-hashed quals. - * ExecScanHashBucket already set up all the state needed to - * call ExecQual. 
- * - * If we pass the qual, then save state for next call and have - * ExecProject form the projection, store it in the tuple - * table, and return the slot. - * - * Only the joinquals determine tuple match status, but all - * quals must pass to actually return the tuple. - */ - if (joinqual == NULL || ExecQual(joinqual, econtext)) - { - node->hj_MatchedOuter = true; - HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->hj_JoinState = HJ_NEED_NEW_OUTER; - continue; - } - - /* - * If we only need to join to the first matching inner - * tuple, then consider returning this one, but after that - * continue with next outer tuple. - */ - if (node->js.single_match) - node->hj_JoinState = HJ_NEED_NEW_OUTER; - - if (otherqual == NULL || ExecQual(otherqual, econtext)) + /* set up to scan for unmatched inner tuples */ + ExecPrepHashTableForUnmatched(node); + node->hj_JoinState = HJ_FILL_INNER_TUPLES; + } + else + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + econtext->ecxt_outertuple = outerTupleSlot; + node->hj_MatchedOuter = false; + + /* + * Find the corresponding bucket for this tuple in the main + * hash table or skew hash table. + */ + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); + node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, + hashvalue); + node->hj_CurTuple = NULL; + + /* + * The tuple might not belong to the current batch (where + * "current batch" includes the skew buckets if any). + */ + if (batchno != hashtable->curbatch && + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. + */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot), + hashvalue, + &hashtable->outerBatchFile[batchno]); + /* Loop around, staying in HJ_NEED_NEW_OUTER state */ + continue; + } + + /* OK, let's scan the bucket for matches */ + node->hj_JoinState = HJ_SCAN_BUCKET; + + /* FALL THRU */ + + case HJ_SCAN_BUCKET: + + /* + * Scan the selected hash bucket for matches to current outer + */ + if (!ExecScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + + /* + * We've got a match, but still need to test non-hashed quals. + * ExecScanHashBucket already set up all the state needed to + * call ExecQual. + * + * If we pass the qual, then save state for next call and have + * ExecProject form the projection, store it in the tuple + * table, and return the slot. + * + * Only the joinquals determine tuple match status, but all + * quals must pass to actually return the tuple. 
+ */ + if (joinqual == NULL || ExecQual(joinqual, econtext)) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->hj_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->hj_MatchedOuter = true; + HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) #ifdef __TBASE__ { node->matched_tuples++; @@ -593,161 +599,168 @@ ExecHashJoin(PlanState *pstate) */ HashJoinState * ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) -{// #lizard forgives - HashJoinState *hjstate; - Plan *outerNode; - Hash *hashNode; - List *lclauses; - List *rclauses; - List *hoperators; - ListCell *l; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - /* - * create state structure - */ - hjstate = makeNode(HashJoinState); - hjstate->js.ps.plan = (Plan *) node; - hjstate->js.ps.state = estate; - hjstate->js.ps.ExecProcNode = ExecHashJoin; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &hjstate->js.ps); - - /* - * initialize child expressions - */ - hjstate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); - hjstate->js.jointype = node->join.jointype; - hjstate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) hjstate); - hjstate->hashclauses = - ExecInitQual(node->hashclauses, (PlanState *) hjstate); - - /* - * initialize child nodes - * - * Note: we could suppress the REWIND flag for the inner input, which - * would amount to betting that the hash will be a single batch. Not - * clear if this would be a win or not. 
- */ - outerNode = outerPlan(node); - hashNode = (Hash *) innerPlan(node); - - outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); - innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &hjstate->js.ps); - hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); - - /* - * detect whether we need only consider the first matching inner tuple - */ - hjstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + HashJoinState *hjstate; + Plan *outerNode; + Hash *hashNode; + List *lclauses; + List *rclauses; + List *hoperators; + ListCell *l; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hjstate = makeNode(HashJoinState); + hjstate->js.ps.plan = (Plan *) node; + hjstate->js.ps.state = estate; + hjstate->js.ps.ExecProcNode = ExecHashJoin; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hjstate->js.ps); + + /* + * initialize child expressions + */ + hjstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); + hjstate->js.jointype = node->join.jointype; + hjstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) hjstate); + hjstate->hashclauses = + ExecInitQual(node->hashclauses, (PlanState *) hjstate); + + /* + * initialize child nodes + * + * Note: we could suppress the REWIND flag for the inner input, which + * would amount to betting that the hash will be a single batch. Not + * clear if this would be a win or not. + */ + outerNode = outerPlan(node); + hashNode = (Hash *) innerPlan(node); + + outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); + innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &hjstate->js.ps); + hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); + + /* + * detect whether we need only consider the first matching inner tuple + */ + hjstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: - break; - case JOIN_LEFT: - case JOIN_ANTI: - hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); - break; - case JOIN_RIGHT: - hjstate->hj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(hjstate))); - break; - case JOIN_FULL: - hjstate->hj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(hjstate))); + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * now for some voodoo. our temporary tuple slot is actually the result - * tuple slot of the Hash node (which is our inner plan). 
we can do this - * because Hash nodes don't return tuples via ExecProcNode() -- instead - * the hash join node uses ExecScanHashBucket() to get at the contents of - * the hash table. -cim 6/9/91 - */ - { - HashState *hashstate = (HashState *) innerPlanState(hjstate); - TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; - - hjstate->hj_HashTupleSlot = slot; - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&hjstate->js.ps); - ExecAssignProjectionInfo(&hjstate->js.ps, NULL); - - ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, - ExecGetResultType(outerPlanState(hjstate))); - - /* - * initialize hash-specific info - */ - hjstate->hj_HashTable = NULL; - hjstate->hj_FirstOuterTupleSlot = NULL; - - hjstate->hj_CurHashValue = 0; - hjstate->hj_CurBucketNo = 0; - hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; - hjstate->hj_CurTuple = NULL; - - /* - * Deconstruct the hash clauses into outer and inner argument values, so - * that we can evaluate those subexpressions separately. Also make a list - * of the hash operator OIDs, in preparation for looking up the hash - * functions to use. - */ - lclauses = NIL; - rclauses = NIL; - hoperators = NIL; - foreach(l, node->hashclauses) - { - OpExpr *hclause = lfirst_node(OpExpr, l); - - lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args), - (PlanState *) hjstate)); - rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args), - (PlanState *) hjstate)); - hoperators = lappend_oid(hoperators, hclause->opno); - } - hjstate->hj_OuterHashKeys = lclauses; - hjstate->hj_InnerHashKeys = rclauses; - hjstate->hj_HashOperators = hoperators; - /* child Hash node needs to evaluate inner hash keys, too */ - ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses; - - hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; - hjstate->hj_MatchedOuter = false; - hjstate->hj_OuterNotEmpty = false; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); + break; + case JOIN_RIGHT: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(hjstate))); + break; + case JOIN_FULL: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(hjstate))); + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * now for some voodoo. our temporary tuple slot is actually the result + * tuple slot of the Hash node (which is our inner plan). we can do this + * because Hash nodes don't return tuples via ExecProcNode() -- instead + * the hash join node uses ExecScanHashBucket() to get at the contents of + * the hash table. 
-cim 6/9/91 + */ + { + HashState *hashstate = (HashState *) innerPlanState(hjstate); + TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; + + hjstate->hj_HashTupleSlot = slot; + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&hjstate->js.ps); + ExecAssignProjectionInfo(&hjstate->js.ps, NULL); + + ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, + ExecGetResultType(outerPlanState(hjstate))); + + /* + * initialize hash-specific info + */ + hjstate->hj_HashTable = NULL; + hjstate->hj_FirstOuterTupleSlot = NULL; + + hjstate->hj_CurHashValue = 0; + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + hjstate->hj_CurTuple = NULL; + + /* + * Deconstruct the hash clauses into outer and inner argument values, so + * that we can evaluate those subexpressions separately. Also make a list + * of the hash operator OIDs, in preparation for looking up the hash + * functions to use. + */ + lclauses = NIL; + rclauses = NIL; + hoperators = NIL; + foreach(l, node->hashclauses) + { + OpExpr *hclause = lfirst_node(OpExpr, l); + + lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args), + (PlanState *) hjstate)); + rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args), + (PlanState *) hjstate)); + hoperators = lappend_oid(hoperators, hclause->opno); + } + hjstate->hj_OuterHashKeys = lclauses; + hjstate->hj_InnerHashKeys = rclauses; + hjstate->hj_HashOperators = hoperators; + /* child Hash node needs to evaluate inner hash keys, too */ + ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses; + + hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; + hjstate->hj_MatchedOuter = false; + hjstate->hj_OuterNotEmpty = false; #ifdef __TBASE__ hjstate->hj_OuterInited = false; hjstate->hj_InnerInited = false; diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index 985362fe..d10b74b0 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -714,731 +714,737 @@ ExecMergeJoin(PlanState *pstate) #ifdef __TBASE__ node->mj_InnerInited = true; #endif - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - - /* - * OK, we have the initial tuples. Begin by skipping - * non-matching tuples. - */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - /* Stay in same state to fetch next inner tuple */ - if (doFillInner) - { - /* - * Generate a fake join tuple with nulls for the - * outer tuple, and return it if it passes the - * non-join quals. - */ - TupleTableSlot *result; - - result = MJFillInner(node); - if (result) - return result; - } - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); - if (doFillOuter) - { - /* - * Need to emit left-join tuples for all outer - * tuples, including the one we just fetched. We - * set MatchedOuter = false to force the ENDINNER - * state to emit first tuple before advancing - * outer. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - node->mj_MatchedOuter = false; - break; - } - /* Otherwise we're done. 
*/ - return NULL; - } - break; - - /* - * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied - * the merge clause so we join them and then proceed to get - * the next inner tuple (EXEC_MJ_NEXTINNER). - */ - case EXEC_MJ_JOINTUPLES: - MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); - - /* - * Set the next state machine state. The right things will - * happen whether we return this join tuple or just fall - * through to continue the state machine execution. - */ - node->mj_JoinState = EXEC_MJ_NEXTINNER; - - /* - * Check the extra qual conditions to see if we actually want - * to return this join tuple. If not, can proceed with merge. - * We must distinguish the additional joinquals (which must - * pass to consider the tuples "matched" for outer-join logic) - * from the otherquals (which must pass before we actually - * return the tuple). - * - * We don't bother with a ResetExprContext here, on the - * assumption that we just did one while checking the merge - * qual. One per tuple should be sufficient. We do have to - * set up the econtext links to the tuples for ExecQual to - * use. - */ - outerTupleSlot = node->mj_OuterTupleSlot; - econtext->ecxt_outertuple = outerTupleSlot; - innerTupleSlot = node->mj_InnerTupleSlot; - econtext->ecxt_innertuple = innerTupleSlot; - - qualResult = (joinqual == NULL || - ExecQual(joinqual, econtext)); - MJ_DEBUG_QUAL(joinqual, qualResult); - - if (qualResult) - { - node->mj_MatchedOuter = true; - node->mj_MatchedInner = true; - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - } - - /* - * If we only need to join to the first matching inner - * tuple, then consider returning this one, but after that - * continue with next outer tuple. - */ - if (node->js.single_match) - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - - qualResult = (otherqual == NULL || - ExecQual(otherqual, econtext)); - MJ_DEBUG_QUAL(otherqual, qualResult); - - if (qualResult) - { - /* - * qualification succeeded. now form the desired - * projection tuple and return the slot containing it. - */ - MJ_printf("ExecMergeJoin: returning tuple\n"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - else - InstrCountFiltered1(node, 1); - break; - - /* - * EXEC_MJ_NEXTINNER means advance the inner scan to the next - * tuple. If the tuple is not nil, we then proceed to test it - * against the join qualification. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this inner tuple. - */ - case EXEC_MJ_NEXTINNER: - MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); - - if (doFillInner && !node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* - * now we get the next inner tuple, if any. If there's none, - * advance to next outer tuple (which may be able to join to - * previously marked tuples). - * - * NB: must NOT do "extraMarks" here, since we may need to - * return to previously marked tuples. 
- */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - - /* - * Test the new inner tuple to see if it matches - * outer. - * - * If they do match, then we join them and move on to - * the next inner tuple (EXEC_MJ_JOINTUPLES). - * - * If they do not match then advance to next outer - * tuple. - */ - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - else - { - Assert(compareResult < 0); - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - } - break; - case MJEVAL_NONMATCHABLE: - - /* - * It contains a NULL and hence can't match any outer - * tuple, so we can skip the comparison and assume the - * new tuple is greater than current outer. - */ - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - case MJEVAL_ENDOFJOIN: - - /* - * No more inner tuples. However, this might be only - * effective and not physical end of inner plan, so - * force mj_InnerTupleSlot to null to make sure we - * don't fetch more inner tuples. (We need this hack - * because we are not transiting to a state where the - * inner plan is assumed to be exhausted.) - */ - node->mj_InnerTupleSlot = NULL; - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - } - break; - - /*------------------------------------------- - * EXEC_MJ_NEXTOUTER means - * - * outer inner - * outer tuple - 5 5 - marked tuple - * 5 5 - * 6 6 - inner tuple - * 7 7 - * - * we know we just bumped into the - * first inner tuple > current outer tuple (or possibly - * the end of the inner stream) - * so get a new outer tuple and then - * proceed to test it against the marked tuple - * (EXEC_MJ_TESTOUTER) - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this outer tuple. - *------------------------------------------------ - */ - case EXEC_MJ_NEXTOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); - - if (doFillOuter && !node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalOuterValues(node)) - { - case MJEVAL_MATCHABLE: - /* Go test the new tuple against the marked tuple */ - node->mj_JoinState = EXEC_MJ_TESTOUTER; - break; - case MJEVAL_NONMATCHABLE: - /* Can't match, so fetch next outer tuple */ - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - case MJEVAL_ENDOFJOIN: - /* No more outer tuples */ - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - innerTupleSlot = node->mj_InnerTupleSlot; - if (doFillInner && !TupIsNull(innerTupleSlot)) - { - /* - * Need to emit right-join tuples for remaining - * inner tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDOUTER; - break; - } - /* Otherwise we're done. 
*/ - return NULL; - } - break; - - /*-------------------------------------------------------- - * EXEC_MJ_TESTOUTER If the new outer tuple and the marked - * tuple satisfy the merge clause then we know we have - * duplicates in the outer scan so we have to restore the - * inner scan to the marked tuple and proceed to join the - * new outer tuple with the inner tuples. - * - * This is the case when - * outer inner - * 4 5 - marked tuple - * outer tuple - 5 5 - * new outer tuple - 5 5 - * 6 8 - inner tuple - * 7 12 - * - * new outer tuple == marked tuple - * - * If the outer tuple fails the test, then we are done - * with the marked tuples, and we have to look for a - * match to the current inner tuple. So we will - * proceed to skip outer tuples until outer >= inner - * (EXEC_MJ_SKIP_TEST). - * - * This is the case when - * - * outer inner - * 5 5 - marked tuple - * outer tuple - 5 5 - * new outer tuple - 6 8 - inner tuple - * 7 12 - * - * new outer tuple > marked tuple - * - *--------------------------------------------------------- - */ - case EXEC_MJ_TESTOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); - - /* - * Here we must compare the outer tuple with the marked inner - * tuple. (We can ignore the result of MJEvalInnerValues, - * since the marked inner tuple is certainly matchable.) - */ - innerTupleSlot = node->mj_MarkedTupleSlot; - (void) MJEvalInnerValues(node, innerTupleSlot); - - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - { - /* - * the merge clause matched so now we restore the inner - * scan position to the first mark, and go join that tuple - * (and any following ones) to the new outer. - * - * If we were able to determine mark and restore are not - * needed, then we don't have to back up; the current - * inner is already the first possible match. - * - * NOTE: we do not need to worry about the MatchedInner - * state for the rescanned inner tuples. We know all of - * them will match this new outer tuple and therefore - * won't be emitted as fill tuples. This works *only* - * because we require the extra joinquals to be constant - * when doing a right or full join --- otherwise some of - * the rescanned tuples might fail the extra joinquals. - * This obviously won't happen for a constant-true extra - * joinqual, while the constant-false case is handled by - * forcing the merge clause to never match, so we never - * get here. - */ - if (!node->mj_SkipMarkRestore) - { - ExecRestrPos(innerPlan); - - /* - * ExecRestrPos probably should give us back a new - * Slot, but since it doesn't, use the marked slot. - * (The previously returned mj_InnerTupleSlot cannot - * be assumed to hold the required tuple.) - */ - node->mj_InnerTupleSlot = innerTupleSlot; - /* we need not do MJEvalInnerValues again */ - } - - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - } - else - { - /* ---------------- - * if the new outer tuple didn't match the marked inner - * tuple then we have a case like: - * - * outer inner - * 4 4 - marked tuple - * new outer - 5 4 - * 6 5 - inner tuple - * 7 - * - * which means that all subsequent outer tuples will be - * larger than our marked inner tuples. So we need not - * revisit any of the marked tuples but can proceed to - * look for a match to the current inner. If there's - * no more inners, no more matches are possible. 
- * ---------------- - */ - Assert(compareResult > 0); - innerTupleSlot = node->mj_InnerTupleSlot; - - /* reload comparison data for current inner */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - /* proceed to compare it to the current outer */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - - /* - * current inner can't possibly match any outer; - * better to advance the inner scan than the - * outer. - */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - if (doFillOuter) - { - /* - * Need to emit left-join tuples for remaining - * outer tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - } - break; - - /*---------------------------------------------------------- - * EXEC_MJ_SKIP means compare tuples and if they do not - * match, skip whichever is lesser. - * - * For example: - * - * outer inner - * 5 5 - * 5 5 - * outer tuple - 6 8 - inner tuple - * 7 12 - * 8 14 - * - * we have to advance the outer scan - * until we find the outer 8. - * - * On the other hand: - * - * outer inner - * 5 5 - * 5 5 - * outer tuple - 12 8 - inner tuple - * 14 10 - * 17 12 - * - * we have to advance the inner scan - * until we find the inner 12. - *---------------------------------------------------------- - */ - case EXEC_MJ_SKIP_TEST: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); - - /* - * before we advance, make sure the current tuples do not - * satisfy the mergeclauses. If they do, then we update the - * marked tuple position and go join them. - */ - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - { - if (!node->mj_SkipMarkRestore) - ExecMarkPos(innerPlan); - - MarkInnerTuple(node->mj_InnerTupleSlot, node); - - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - } - else if (compareResult < 0) - node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; - else - /* compareResult > 0 */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - - /* - * SKIPOUTER_ADVANCE: advance over an outer tuple that is - * known not to join to any inner tuple. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this outer tuple. - */ - case EXEC_MJ_SKIPOUTER_ADVANCE: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); - - if (doFillOuter && !node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. 
- */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalOuterValues(node)) - { - case MJEVAL_MATCHABLE: - /* Go test the new tuple against the current inner */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - /* Can't match, so fetch next outer tuple */ - node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more outer tuples */ - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - innerTupleSlot = node->mj_InnerTupleSlot; - if (doFillInner && !TupIsNull(innerTupleSlot)) - { - /* - * Need to emit right-join tuples for remaining - * inner tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDOUTER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - break; - - /* - * SKIPINNER_ADVANCE: advance over an inner tuple that is - * known not to join to any outer tuple. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this inner tuple. - */ - case EXEC_MJ_SKIPINNER_ADVANCE: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); - - if (doFillInner && !node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - - /* - * now we get the next inner tuple, if any - */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - /* proceed to compare it to the current outer */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - - /* - * current inner can't possibly match any outer; - * better to advance the inner scan than the outer. - */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - MJ_printf("ExecMergeJoin: end of inner subplan\n"); - outerTupleSlot = node->mj_OuterTupleSlot; - if (doFillOuter && !TupIsNull(outerTupleSlot)) - { - /* - * Need to emit left-join tuples for remaining - * outer tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - break; - - /* - * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but - * are doing a right/full join and therefore must null-fill - * any remaining unmatched inner tuples. - */ - case EXEC_MJ_ENDOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); - - Assert(doFillInner); - - if (!node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. 
- */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - - /* - * now we get the next inner tuple, if any - */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - if (TupIsNull(innerTupleSlot)) - { - MJ_printf("ExecMergeJoin: end of inner subplan\n"); - return NULL; - } - - /* Else remain in ENDOUTER state and process next tuple. */ - break; - - /* - * EXEC_MJ_ENDINNER means we have run out of inner tuples, but - * are doing a left/full join and therefore must null- fill - * any remaining unmatched outer tuples. - */ - case EXEC_MJ_ENDINNER: - MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); - - Assert(doFillOuter); - - if (!node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - if (TupIsNull(outerTupleSlot)) - { - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - return NULL; - } - - /* Else remain in ENDINNER state and process next tuple. */ - break; - - /* - * broken state value? - */ - default: - elog(ERROR, "unrecognized mergejoin state: %d", - (int) node->mj_JoinState); - } - } + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * OK, we have the initial tuples. Begin by skipping + * non-matching tuples. + */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + /* Stay in same state to fetch next inner tuple */ + if (doFillInner) + { + /* + * Generate a fake join tuple with nulls for the + * outer tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillInner(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); + if (doFillOuter) + { + /* + * Need to emit left-join tuples for all outer + * tuples, including the one we just fetched. We + * set MatchedOuter = false to force the ENDINNER + * state to emit first tuple before advancing + * outer. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + node->mj_MatchedOuter = false; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied + * the merge clause so we join them and then proceed to get + * the next inner tuple (EXEC_MJ_NEXTINNER). + */ + case EXEC_MJ_JOINTUPLES: + MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); + + /* + * Set the next state machine state. The right things will + * happen whether we return this join tuple or just fall + * through to continue the state machine execution. 
+ */ + node->mj_JoinState = EXEC_MJ_NEXTINNER; + + /* + * Check the extra qual conditions to see if we actually want + * to return this join tuple. If not, can proceed with merge. + * We must distinguish the additional joinquals (which must + * pass to consider the tuples "matched" for outer-join logic) + * from the otherquals (which must pass before we actually + * return the tuple). + * + * We don't bother with a ResetExprContext here, on the + * assumption that we just did one while checking the merge + * qual. One per tuple should be sufficient. We do have to + * set up the econtext links to the tuples for ExecQual to + * use. + */ + outerTupleSlot = node->mj_OuterTupleSlot; + econtext->ecxt_outertuple = outerTupleSlot; + innerTupleSlot = node->mj_InnerTupleSlot; + econtext->ecxt_innertuple = innerTupleSlot; + + qualResult = (joinqual == NULL || + ExecQual(joinqual, econtext)); + MJ_DEBUG_QUAL(joinqual, qualResult); + + if (qualResult) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->mj_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->mj_MatchedOuter = true; + node->mj_MatchedInner = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + + qualResult = (otherqual == NULL || + ExecQual(otherqual, econtext)); + MJ_DEBUG_QUAL(otherqual, qualResult); + + if (qualResult) + { + /* + * qualification succeeded. now form the desired + * projection tuple and return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + /* + * EXEC_MJ_NEXTINNER means advance the inner scan to the next + * tuple. If the tuple is not nil, we then proceed to test it + * against the join qualification. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_NEXTINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* + * now we get the next inner tuple, if any. If there's none, + * advance to next outer tuple (which may be able to join to + * previously marked tuples). + * + * NB: must NOT do "extraMarks" here, since we may need to + * return to previously marked tuples. + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * Test the new inner tuple to see if it matches + * outer. + * + * If they do match, then we join them and move on to + * the next inner tuple (EXEC_MJ_JOINTUPLES). 
+ * + * If they do not match then advance to next outer + * tuple. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + else + { + Assert(compareResult < 0); + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + } + break; + case MJEVAL_NONMATCHABLE: + + /* + * It contains a NULL and hence can't match any outer + * tuple, so we can skip the comparison and assume the + * new tuple is greater than current outer. + */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + + /* + * No more inner tuples. However, this might be only + * effective and not physical end of inner plan, so + * force mj_InnerTupleSlot to null to make sure we + * don't fetch more inner tuples. (We need this hack + * because we are not transiting to a state where the + * inner plan is assumed to be exhausted.) + */ + node->mj_InnerTupleSlot = NULL; + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + break; + + /*------------------------------------------- + * EXEC_MJ_NEXTOUTER means + * + * outer inner + * outer tuple - 5 5 - marked tuple + * 5 5 + * 6 6 - inner tuple + * 7 7 + * + * we know we just bumped into the + * first inner tuple > current outer tuple (or possibly + * the end of the inner stream) + * so get a new outer tuple and then + * proceed to test it against the marked tuple + * (EXEC_MJ_TESTOUTER) + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + *------------------------------------------------ + */ + case EXEC_MJ_NEXTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the marked tuple */ + node->mj_JoinState = EXEC_MJ_TESTOUTER; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /*-------------------------------------------------------- + * EXEC_MJ_TESTOUTER If the new outer tuple and the marked + * tuple satisfy the merge clause then we know we have + * duplicates in the outer scan so we have to restore the + * inner scan to the marked tuple and proceed to join the + * new outer tuple with the inner tuples. 
+ * + * This is the case when + * outer inner + * 4 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 5 5 + * 6 8 - inner tuple + * 7 12 + * + * new outer tuple == marked tuple + * + * If the outer tuple fails the test, then we are done + * with the marked tuples, and we have to look for a + * match to the current inner tuple. So we will + * proceed to skip outer tuples until outer >= inner + * (EXEC_MJ_SKIP_TEST). + * + * This is the case when + * + * outer inner + * 5 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 6 8 - inner tuple + * 7 12 + * + * new outer tuple > marked tuple + * + *--------------------------------------------------------- + */ + case EXEC_MJ_TESTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); + + /* + * Here we must compare the outer tuple with the marked inner + * tuple. (We can ignore the result of MJEvalInnerValues, + * since the marked inner tuple is certainly matchable.) + */ + innerTupleSlot = node->mj_MarkedTupleSlot; + (void) MJEvalInnerValues(node, innerTupleSlot); + + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + /* + * the merge clause matched so now we restore the inner + * scan position to the first mark, and go join that tuple + * (and any following ones) to the new outer. + * + * If we were able to determine mark and restore are not + * needed, then we don't have to back up; the current + * inner is already the first possible match. + * + * NOTE: we do not need to worry about the MatchedInner + * state for the rescanned inner tuples. We know all of + * them will match this new outer tuple and therefore + * won't be emitted as fill tuples. This works *only* + * because we require the extra joinquals to be constant + * when doing a right or full join --- otherwise some of + * the rescanned tuples might fail the extra joinquals. + * This obviously won't happen for a constant-true extra + * joinqual, while the constant-false case is handled by + * forcing the merge clause to never match, so we never + * get here. + */ + if (!node->mj_SkipMarkRestore) + { + ExecRestrPos(innerPlan); + + /* + * ExecRestrPos probably should give us back a new + * Slot, but since it doesn't, use the marked slot. + * (The previously returned mj_InnerTupleSlot cannot + * be assumed to hold the required tuple.) + */ + node->mj_InnerTupleSlot = innerTupleSlot; + /* we need not do MJEvalInnerValues again */ + } + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else + { + /* ---------------- + * if the new outer tuple didn't match the marked inner + * tuple then we have a case like: + * + * outer inner + * 4 4 - marked tuple + * new outer - 5 4 + * 6 5 - inner tuple + * 7 + * + * which means that all subsequent outer tuples will be + * larger than our marked inner tuples. So we need not + * revisit any of the marked tuples but can proceed to + * look for a match to the current inner. If there's + * no more inners, no more matches are possible. + * ---------------- + */ + Assert(compareResult > 0); + innerTupleSlot = node->mj_InnerTupleSlot; + + /* reload comparison data for current inner */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the + * outer. 
+ */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + if (doFillOuter) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + } + break; + + /*---------------------------------------------------------- + * EXEC_MJ_SKIP means compare tuples and if they do not + * match, skip whichever is lesser. + * + * For example: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 6 8 - inner tuple + * 7 12 + * 8 14 + * + * we have to advance the outer scan + * until we find the outer 8. + * + * On the other hand: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 12 8 - inner tuple + * 14 10 + * 17 12 + * + * we have to advance the inner scan + * until we find the inner 12. + *---------------------------------------------------------- + */ + case EXEC_MJ_SKIP_TEST: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); + + /* + * before we advance, make sure the current tuples do not + * satisfy the mergeclauses. If they do, then we update the + * marked tuple position and go join them. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + if (!node->mj_SkipMarkRestore) + ExecMarkPos(innerPlan); + + MarkInnerTuple(node->mj_InnerTupleSlot, node); + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + else + /* compareResult > 0 */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + + /* + * SKIPOUTER_ADVANCE: advance over an outer tuple that is + * known not to join to any inner tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + */ + case EXEC_MJ_SKIPOUTER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the current inner */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * SKIPINNER_ADVANCE: advance over an inner tuple that is + * known not to join to any outer tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. 
+ */ + case EXEC_MJ_SKIPINNER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + outerTupleSlot = node->mj_OuterTupleSlot; + if (doFillOuter && !TupIsNull(outerTupleSlot)) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but + * are doing a right/full join and therefore must null-fill + * any remaining unmatched inner tuples. + */ + case EXEC_MJ_ENDOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); + + Assert(doFillInner); + + if (!node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + if (TupIsNull(innerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + return NULL; + } + + /* Else remain in ENDOUTER state and process next tuple. */ + break; + + /* + * EXEC_MJ_ENDINNER means we have run out of inner tuples, but + * are doing a left/full join and therefore must null- fill + * any remaining unmatched outer tuples. + */ + case EXEC_MJ_ENDINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); + + Assert(doFillOuter); + + if (!node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. 
+ */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + if (TupIsNull(outerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + return NULL; + } + + /* Else remain in ENDINNER state and process next tuple. */ + break; + + /* + * broken state value? + */ + default: + elog(ERROR, "unrecognized mergejoin state: %d", + (int) node->mj_JoinState); + } + } } /* ---------------------------------------------------------------- @@ -1447,180 +1453,189 @@ ExecMergeJoin(PlanState *pstate) */ MergeJoinState * ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) -{// #lizard forgives - MergeJoinState *mergestate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - MJ1_printf("ExecInitMergeJoin: %s\n", - "initializing node"); - - /* - * create state structure - */ - mergestate = makeNode(MergeJoinState); - mergestate->js.ps.plan = (Plan *) node; - mergestate->js.ps.state = estate; - mergestate->js.ps.ExecProcNode = ExecMergeJoin; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &mergestate->js.ps); - - /* - * we need two additional econtexts in which we can compute the join - * expressions from the left and right input tuples. The node's regular - * econtext won't do because it gets reset too often. - */ - mergestate->mj_OuterEContext = CreateExprContext(estate); - mergestate->mj_InnerEContext = CreateExprContext(estate); - - /* - * initialize child expressions - */ - mergestate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); - mergestate->js.jointype = node->join.jointype; - mergestate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) mergestate); - mergestate->mj_ConstFalseJoin = false; - /* mergeclauses are handled below */ - - /* - * initialize child nodes - * - * inner child must support MARK/RESTORE, unless we have detected that we - * don't need that. Note that skip_mark_restore must never be set if - * there are non-mergeclause joinquals, since the logic wouldn't work. - */ - Assert(node->join.joinqual == NIL || !node->skip_mark_restore); - mergestate->mj_SkipMarkRestore = node->skip_mark_restore; - - outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); - innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, - mergestate->mj_SkipMarkRestore ? - eflags : - (eflags | EXEC_FLAG_MARK)); - - /* - * For certain types of inner child nodes, it is advantageous to issue - * MARK every time we advance past an inner tuple we will never return to. - * For other types, MARK on a tuple we cannot return to is a waste of - * cycles. Detect which case applies and set mj_ExtraMarks if we want to - * issue "unnecessary" MARK calls. - * - * Currently, only Material wants the extra MARKs, and it will be helpful - * only if eflags doesn't specify REWIND. 
- */ - if (IsA(innerPlan(node), Material) && - (eflags & EXEC_FLAG_REWIND) == 0 && - !mergestate->mj_SkipMarkRestore) - mergestate->mj_ExtraMarks = true; - else - mergestate->mj_ExtraMarks = false; - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &mergestate->js.ps); - - mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate); - ExecSetSlotDescriptor(mergestate->mj_MarkedTupleSlot, - ExecGetResultType(innerPlanState(mergestate))); - - /* - * detect whether we need only consider the first matching inner tuple - */ - mergestate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + MergeJoinState *mergestate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + MJ1_printf("ExecInitMergeJoin: %s\n", + "initializing node"); + + /* + * create state structure + */ + mergestate = makeNode(MergeJoinState); + mergestate->js.ps.plan = (Plan *) node; + mergestate->js.ps.state = estate; + mergestate->js.ps.ExecProcNode = ExecMergeJoin; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mergestate->js.ps); + + /* + * we need two additional econtexts in which we can compute the join + * expressions from the left and right input tuples. The node's regular + * econtext won't do because it gets reset too often. + */ + mergestate->mj_OuterEContext = CreateExprContext(estate); + mergestate->mj_InnerEContext = CreateExprContext(estate); + + /* + * initialize child expressions + */ + mergestate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); + mergestate->js.jointype = node->join.jointype; + mergestate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) mergestate); + mergestate->mj_ConstFalseJoin = false; + /* mergeclauses are handled below */ + + /* + * initialize child nodes + * + * inner child must support MARK/RESTORE, unless we have detected that we + * don't need that. Note that skip_mark_restore must never be set if + * there are non-mergeclause joinquals, since the logic wouldn't work. + */ + Assert(node->join.joinqual == NIL || !node->skip_mark_restore); + mergestate->mj_SkipMarkRestore = node->skip_mark_restore; + + outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); + innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, + mergestate->mj_SkipMarkRestore ? + eflags : + (eflags | EXEC_FLAG_MARK)); + + /* + * For certain types of inner child nodes, it is advantageous to issue + * MARK every time we advance past an inner tuple we will never return to. + * For other types, MARK on a tuple we cannot return to is a waste of + * cycles. Detect which case applies and set mj_ExtraMarks if we want to + * issue "unnecessary" MARK calls. + * + * Currently, only Material wants the extra MARKs, and it will be helpful + * only if eflags doesn't specify REWIND. 
+ */ + if (IsA(innerPlan(node), Material) && + (eflags & EXEC_FLAG_REWIND) == 0 && + !mergestate->mj_SkipMarkRestore) + mergestate->mj_ExtraMarks = true; + else + mergestate->mj_ExtraMarks = false; + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &mergestate->js.ps); + + mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate); + ExecSetSlotDescriptor(mergestate->mj_MarkedTupleSlot, + ExecGetResultType(innerPlanState(mergestate))); + + /* + * detect whether we need only consider the first matching inner tuple + */ + mergestate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: - mergestate->mj_FillOuter = false; - mergestate->mj_FillInner = false; - break; - case JOIN_LEFT: - case JOIN_ANTI: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = false; + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: mergestate->mj_FillOuter = true; mergestate->mj_FillInner = false; mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - break; - case JOIN_RIGHT: - mergestate->mj_FillOuter = false; - mergestate->mj_FillInner = true; - mergestate->mj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(mergestate))); - - /* - * Can't handle right or full join with non-constant extra - * joinclauses. This should have been caught by planner. - */ - if (!check_constant_qual(node->join.joinqual, - &mergestate->mj_ConstFalseJoin)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); - break; - case JOIN_FULL: - mergestate->mj_FillOuter = true; - mergestate->mj_FillInner = true; - mergestate->mj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(mergestate))); - mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - - /* - * Can't handle right or full join with non-constant extra - * joinclauses. This should have been caught by planner. 
- */ - if (!check_constant_qual(node->join.joinqual, - &mergestate->mj_ConstFalseJoin)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&mergestate->js.ps); - ExecAssignProjectionInfo(&mergestate->js.ps, NULL); - - /* - * preprocess the merge clauses - */ - mergestate->mj_NumClauses = list_length(node->mergeclauses); - mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, - node->mergeFamilies, - node->mergeCollations, - node->mergeStrategies, - node->mergeNullsFirst, - (PlanState *) mergestate); - - /* - * initialize join state - */ - mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; - mergestate->mj_MatchedOuter = false; - mergestate->mj_MatchedInner = false; - mergestate->mj_OuterTupleSlot = NULL; - mergestate->mj_InnerTupleSlot = NULL; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = false; + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); + break; + case JOIN_RIGHT: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(mergestate))); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); + break; + case JOIN_FULL: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(mergestate))); + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. 
+ */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&mergestate->js.ps); + ExecAssignProjectionInfo(&mergestate->js.ps, NULL); + + /* + * preprocess the merge clauses + */ + mergestate->mj_NumClauses = list_length(node->mergeclauses); + mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, + node->mergeFamilies, + node->mergeCollations, + node->mergeStrategies, + node->mergeNullsFirst, + (PlanState *) mergestate); + + /* + * initialize join state + */ + mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + mergestate->mj_MatchedOuter = false; + mergestate->mj_MatchedInner = false; + mergestate->mj_OuterTupleSlot = NULL; + mergestate->mj_InnerTupleSlot = NULL; #ifdef __TBASE__ mergestate->mj_InnerInited = false; #endif diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index 45b0023b..9a9ec8d4 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -167,102 +167,114 @@ ExecNestLoop(PlanState *pstate) #ifdef __TBASE__ node->nl_InnerInited = true; #endif - innerTupleSlot = ExecProcNode(innerPlan); - econtext->ecxt_innertuple = innerTupleSlot; + innerTupleSlot = ExecProcNode(innerPlan); + econtext->ecxt_innertuple = innerTupleSlot; - if (TupIsNull(innerTupleSlot)) - { - ENL1_printf("no inner tuple, need new outer tuple"); - - node->nl_NeedNewOuter = true; + if (TupIsNull(innerTupleSlot)) + { + ENL1_printf("no inner tuple, need new outer tuple"); + node->nl_NeedNewOuter = true; +#ifdef __TBASE__ if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || - node->js.jointype == JOIN_ANTI)) - { - /* - * We are doing an outer join and there were no join matches - * for this outer tuple. Generate a fake join tuple with - * nulls for the inner tuple, and return it if it passes the - * non-join quals. - */ - econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; - - ENL1_printf("testing qualification for outer-join tuple"); - - if (otherqual == NULL || ExecQual(otherqual, econtext)) - { - /* - * qualification was satisfied so we project and return - * the slot containing the result tuple using - * ExecProject(). - */ - ENL1_printf("qualification succeeded, projecting tuple"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - - /* - * Otherwise just return to top of loop for a new outer tuple. - */ - continue; - } - - /* - * at this point we have a new pair of inner and outer tuples so we - * test the inner and outer tuples to see if they satisfy the node's - * qualification. - * - * Only the joinquals determine MatchedOuter status, but all quals - * must pass to actually return the tuple. - */ - ENL1_printf("testing qualification"); - - if (ExecQual(joinqual, econtext)) - { - node->nl_MatchedOuter = true; - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->nl_NeedNewOuter = true; - continue; /* return to top of loop */ - } - - /* - * If we only need to join to the first matching inner tuple, then - * consider returning this one, but after that continue with next - * outer tuple. 
- */ - if (node->js.single_match) - node->nl_NeedNewOuter = true; - - if (otherqual == NULL || ExecQual(otherqual, econtext)) - { - /* - * qualification was satisfied so we project and return the - * slot containing the result tuple using ExecProject(). - */ - ENL1_printf("qualification succeeded, projecting tuple"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - else - InstrCountFiltered1(node, 1); - - /* - * Tuple fails qual, so free per-tuple memory and try again. - */ - ResetExprContext(econtext); - - ENL1_printf("qualification failed, looping"); - } + node->js.jointype == JOIN_ANTI || + node->js.jointype == JOIN_LEFT_SCALAR)) +#else + if (!node->nl_MatchedOuter && + (node->js.jointype == JOIN_LEFT || + node->js.jointype == JOIN_ANTI)) +#endif + { + /* + * We are doing an outer join and there were no join matches + * for this outer tuple. Generate a fake join tuple with + * nulls for the inner tuple, and return it if it passes the + * non-join quals. + */ + econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; + + ENL1_printf("testing qualification for outer-join tuple"); + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return + * the slot containing the result tuple using + * ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + + /* + * Otherwise just return to top of loop for a new outer tuple. + */ + continue; + } + + /* + * at this point we have a new pair of inner and outer tuples so we + * test the inner and outer tuples to see if they satisfy the node's + * qualification. + * + * Only the joinquals determine MatchedOuter status, but all quals + * must pass to actually return the tuple. + */ + ENL1_printf("testing qualification"); + + if (ExecQual(joinqual, econtext)) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->nl_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->nl_MatchedOuter = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->nl_NeedNewOuter = true; + continue; /* return to top of loop */ + } + + /* + * If we only need to join to the first matching inner tuple, then + * consider returning this one, but after that continue with next + * outer tuple. + */ + if (node->js.single_match) + node->nl_NeedNewOuter = true; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return the + * slot containing the result tuple using ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. 
+ */ + ResetExprContext(econtext); + + ENL1_printf("qualification failed, looping"); + } } /* ---------------------------------------------------------------- @@ -271,94 +283,101 @@ ExecNestLoop(PlanState *pstate) */ NestLoopState * ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) -{// #lizard forgives - NestLoopState *nlstate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - NL1_printf("ExecInitNestLoop: %s\n", - "initializing node"); - - /* - * create state structure - */ - nlstate = makeNode(NestLoopState); - nlstate->js.ps.plan = (Plan *) node; - nlstate->js.ps.state = estate; - nlstate->js.ps.ExecProcNode = ExecNestLoop; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &nlstate->js.ps); - - /* - * initialize child expressions - */ - nlstate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); - nlstate->js.jointype = node->join.jointype; - nlstate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) nlstate); - - /* - * initialize child nodes - * - * If we have no parameters to pass into the inner rel from the outer, - * tell the inner child that cheap rescans would be good. If we do have - * such parameters, then there is no point in REWIND support at all in the - * inner child, because it will always be rescanned with fresh parameter - * values. - */ - outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); - if (node->nestParams == NIL) - eflags |= EXEC_FLAG_REWIND; - else - eflags &= ~EXEC_FLAG_REWIND; - innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &nlstate->js.ps); - - /* - * detect whether we need only consider the first matching inner tuple - */ - nlstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + NestLoopState *nlstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + NL1_printf("ExecInitNestLoop: %s\n", + "initializing node"); + + /* + * create state structure + */ + nlstate = makeNode(NestLoopState); + nlstate->js.ps.plan = (Plan *) node; + nlstate->js.ps.state = estate; + nlstate->js.ps.ExecProcNode = ExecNestLoop; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &nlstate->js.ps); + + /* + * initialize child expressions + */ + nlstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->js.jointype = node->join.jointype; + nlstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + + /* + * initialize child nodes + * + * If we have no parameters to pass into the inner rel from the outer, + * tell the inner child that cheap rescans would be good. If we do have + * such parameters, then there is no point in REWIND support at all in the + * inner child, because it will always be rescanned with fresh parameter + * values. 
+ */ + outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); + if (node->nestParams == NIL) + eflags |= EXEC_FLAG_REWIND; + else + eflags &= ~EXEC_FLAG_REWIND; + innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &nlstate->js.ps); + + /* + * detect whether we need only consider the first matching inner tuple + */ + nlstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: break; - case JOIN_LEFT: - case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: nlstate->nl_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(nlstate))); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&nlstate->js.ps); - ExecAssignProjectionInfo(&nlstate->js.ps, NULL); - - /* - * finally, wipe the current outer tuple clean. - */ - nlstate->nl_NeedNewOuter = true; - nlstate->nl_MatchedOuter = false; + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate))); + break; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + nlstate->nl_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&nlstate->js.ps); + ExecAssignProjectionInfo(&nlstate->js.ps, NULL); + + /* + * finally, wipe the current outer tuple clean. + */ + nlstate->nl_NeedNewOuter = true; + nlstate->nl_MatchedOuter = false; #ifdef __TBASE__ nlstate->nl_InnerInited = false; #endif diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index d629ec67..a4fec879 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -210,34 +210,40 @@ make_one_rel(PlannerInfo *root, List *joinlist) static void set_base_rel_consider_startup(PlannerInfo *root) { - /* - * Since parameterized paths can only be used on the inside of a nestloop - * join plan, there is usually little value in considering fast-start - * plans for them. However, for relations that are on the RHS of a SEMI - * or ANTI join, a fast-start plan can be useful because we're only going - * to care about fetching one tuple anyway. - * - * To minimize growth of planning time, we currently restrict this to - * cases where the RHS is a single base relation, not a join; there is no - * provision for consider_param_startup to get set at all on joinrels. - * Also we don't worry about appendrels. costsize.c's costing rules for - * nestloop semi/antijoins don't consider such cases either. - */ - ListCell *lc; - - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); - int varno; - - if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && - bms_get_singleton_member(sjinfo->syn_righthand, &varno)) - { - RelOptInfo *rel = find_base_rel(root, varno); + /* + * Since parameterized paths can only be used on the inside of a nestloop + * join plan, there is usually little value in considering fast-start + * plans for them. 
However, for relations that are on the RHS of a SEMI + * or ANTI join, a fast-start plan can be useful because we're only going + * to care about fetching one tuple anyway. + * + * To minimize growth of planning time, we currently restrict this to + * cases where the RHS is a single base relation, not a join; there is no + * provision for consider_param_startup to get set at all on joinrels. + * Also we don't worry about appendrels. costsize.c's costing rules for + * nestloop semi/antijoins don't consider such cases either. + */ + ListCell *lc; + + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); + int varno; + +#ifdef __TBASE__ + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI || + sjinfo->jointype == JOIN_LEFT_SCALAR) && + bms_get_singleton_member(sjinfo->syn_righthand, &varno)) +#else + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && + bms_get_singleton_member(sjinfo->syn_righthand, &varno)) +#endif + { + RelOptInfo *rel = find_base_rel(root, varno); - rel->consider_param_startup = true; - } - } + rel->consider_param_startup = true; + } + } } /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 7de5eaa4..a6bba0cf 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2103,65 +2103,72 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, Path *outer_path, Path *inner_path, JoinPathExtraData *extra) { - Cost startup_cost = 0; - Cost run_cost = 0; - double outer_path_rows = outer_path->rows; - Cost inner_rescan_start_cost; - Cost inner_rescan_total_cost; - Cost inner_run_cost; - Cost inner_rescan_run_cost; + Cost startup_cost = 0; + Cost run_cost = 0; + double outer_path_rows = outer_path->rows; + Cost inner_rescan_start_cost; + Cost inner_rescan_total_cost; + Cost inner_run_cost; + Cost inner_rescan_run_cost; - /* estimate costs to rescan the inner relation */ - cost_rescan(root, inner_path, - &inner_rescan_start_cost, - &inner_rescan_total_cost); + /* estimate costs to rescan the inner relation */ + cost_rescan(root, inner_path, + &inner_rescan_start_cost, + &inner_rescan_total_cost); - /* cost of source data */ + /* cost of source data */ - /* - * NOTE: clearly, we must pay both outer and inner paths' startup_cost - * before we can start returning tuples, so the join's startup cost is - * their sum. We'll also pay the inner path's rescan startup cost - * multiple times. - */ - startup_cost += outer_path->startup_cost + inner_path->startup_cost; - run_cost += outer_path->total_cost - outer_path->startup_cost; - if (outer_path_rows > 1) - run_cost += (outer_path_rows - 1) * inner_rescan_start_cost; + /* + * NOTE: clearly, we must pay both outer and inner paths' startup_cost + * before we can start returning tuples, so the join's startup cost is + * their sum. We'll also pay the inner path's rescan startup cost + * multiple times. 
+ */ + startup_cost += outer_path->startup_cost + inner_path->startup_cost; + run_cost += outer_path->total_cost - outer_path->startup_cost; + if (outer_path_rows > 1) + run_cost += (outer_path_rows - 1) * inner_rescan_start_cost; - inner_run_cost = inner_path->total_cost - inner_path->startup_cost; - inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost; + inner_run_cost = inner_path->total_cost - inner_path->startup_cost; + inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost; - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || - extra->inner_unique) - { - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - * - * Getting decent estimates requires inspection of the join quals, - * which we choose to postpone to final_cost_nestloop. - */ - - /* Save private data for final_cost_nestloop */ - workspace->inner_run_cost = inner_run_cost; - workspace->inner_rescan_run_cost = inner_rescan_run_cost; - } - else - { - /* Normal case; we'll scan whole input rel for each outer row */ - run_cost += inner_run_cost; - if (outer_path_rows > 1) - run_cost += (outer_path_rows - 1) * inner_rescan_run_cost; - } +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. + * + * Getting decent estimates requires inspection of the join quals, + * which we choose to postpone to final_cost_nestloop. + */ + + /* Save private data for final_cost_nestloop */ + workspace->inner_run_cost = inner_run_cost; + workspace->inner_rescan_run_cost = inner_rescan_run_cost; + } + else + { + /* Normal case; we'll scan whole input rel for each outer row */ + run_cost += inner_run_cost; + if (outer_path_rows > 1) + run_cost += (outer_path_rows - 1) * inner_rescan_run_cost; + } - /* CPU costs left for later */ + /* CPU costs left for later */ - /* Public result fields */ - workspace->startup_cost = startup_cost; - workspace->total_cost = startup_cost + run_cost; - /* Save private data for final_cost_nestloop */ - workspace->run_cost = run_cost; + /* Public result fields */ + workspace->startup_cost = startup_cost; + workspace->total_cost = startup_cost + run_cost; + /* Save private data for final_cost_nestloop */ + workspace->run_cost = run_cost; } /* @@ -2174,176 +2181,183 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_nestloop(PlannerInfo *root, NestPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->outerjoinpath; - Path *inner_path = path->innerjoinpath; - double outer_path_rows = outer_path->rows; - double inner_path_rows = inner_path->rows; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - Cost cpu_per_tuple; - QualCost restrict_qual_cost; - double ntuples; - - /* Protect some assumptions below that rowcounts aren't zero or NaN */ - if (outer_path_rows <= 0 || isnan(outer_path_rows)) - outer_path_rows = 1; - if (inner_path_rows <= 0 || isnan(inner_path_rows)) - inner_path_rows = 1; - - /* Mark the path with the correct row estimate */ - if (path->path.param_info) - path->path.rows = path->path.param_info->ppi_rows; - else - path->path.rows = path->path.parent->rows; - - /* For 
partial paths, scale row estimate. */ - if (path->path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->path); - - path->path.rows = - clamp_row_est(path->path.rows / parallel_divisor); - } - - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. - */ - if (!enable_nestloop) - startup_cost += disable_cost; - - /* cost of inner-relation source data (we already dealt with outer rel) */ - - if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || - extra->inner_unique) - { - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - */ - Cost inner_run_cost = workspace->inner_run_cost; - Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost; - double outer_matched_rows; - double outer_unmatched_rows; - Selectivity inner_scan_frac; - - /* - * For an outer-rel row that has at least one match, we can expect the - * inner scan to stop after a fraction 1/(match_count+1) of the inner - * rows, if the matches are evenly distributed. Since they probably - * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to - * that fraction. (If we used a larger fuzz factor, we'd have to - * clamp inner_scan_frac to at most 1.0; but since match_count is at - * least 1, no such clamp is needed now.) - */ - outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); - outer_unmatched_rows = outer_path_rows - outer_matched_rows; - inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); - - /* - * Compute number of tuples processed (not number emitted!). First, - * account for successfully-matched outer rows. - */ - ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac; - - /* - * Now we need to estimate the actual costs of scanning the inner - * relation, which may be quite a bit less than N times inner_run_cost - * due to early scan stops. We consider two cases. If the inner path - * is an indexscan using all the joinquals as indexquals, then an - * unmatched outer row results in an indexscan returning no rows, - * which is probably quite cheap. Otherwise, the executor will have - * to scan the whole inner rel for an unmatched row; not so cheap. - */ - if (has_indexed_join_quals(path)) - { - /* - * Successfully-matched outer rows will only require scanning - * inner_scan_frac of the inner relation. In this case, we don't - * need to charge the full inner_run_cost even when that's more - * than inner_rescan_run_cost, because we can assume that none of - * the inner scans ever scan the whole inner relation. So it's - * okay to assume that all the inner scan executions can be - * fractions of the full cost, even if materialization is reducing - * the rescan cost. At this writing, it's impossible to get here - * for a materialized inner scan, so inner_run_cost and - * inner_rescan_run_cost will be the same anyway; but just in - * case, use inner_run_cost for the first matched tuple and - * inner_rescan_run_cost for additional ones. - */ - run_cost += inner_run_cost * inner_scan_frac; - if (outer_matched_rows > 1) - run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac; - - /* - * Add the cost of inner-scan executions for unmatched outer rows. - * We estimate this as the same cost as returning the first tuple - * of a nonempty scan. 
We consider that these are all rescans, - * since we used inner_run_cost once already. - */ - run_cost += outer_unmatched_rows * - inner_rescan_run_cost / inner_path_rows; + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->outerjoinpath; + Path *inner_path = path->innerjoinpath; + double outer_path_rows = outer_path->rows; + double inner_path_rows = inner_path->rows; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + Cost cpu_per_tuple; + QualCost restrict_qual_cost; + double ntuples; + + /* Protect some assumptions below that rowcounts aren't zero or NaN */ + if (outer_path_rows <= 0 || isnan(outer_path_rows)) + outer_path_rows = 1; + if (inner_path_rows <= 0 || isnan(inner_path_rows)) + inner_path_rows = 1; + + /* Mark the path with the correct row estimate */ + if (path->path.param_info) + path->path.rows = path->path.param_info->ppi_rows; + else + path->path.rows = path->path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->path); - /* - * We won't be evaluating any quals at all for unmatched rows, so - * don't add them to ntuples. - */ - } - else - { - /* - * Here, a complicating factor is that rescans may be cheaper than - * first scans. If we never scan all the way to the end of the - * inner rel, it might be (depending on the plan type) that we'd - * never pay the whole inner first-scan run cost. However it is - * difficult to estimate whether that will happen (and it could - * not happen if there are any unmatched outer rows!), so be - * conservative and always charge the whole first-scan cost once. - * We consider this charge to correspond to the first unmatched - * outer row, unless there isn't one in our estimate, in which - * case blame it on the first matched row. - */ + path->path.rows = + clamp_row_est(path->path.rows / parallel_divisor); + } - /* First, count all unmatched join tuples as being processed */ - ntuples += outer_unmatched_rows * inner_path_rows; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_nestloop) + startup_cost += disable_cost; - /* Now add the forced full scan, and decrement appropriate count */ - run_cost += inner_run_cost; - if (outer_unmatched_rows >= 1) - outer_unmatched_rows -= 1; - else - outer_matched_rows -= 1; + /* cost of inner-relation source data (we already dealt with outer rel) */ - /* Add inner run cost for additional outer tuples having matches */ - if (outer_matched_rows > 0) - run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac; +#ifdef __TBASE__ + if (path->jointype == JOIN_SEMI || + path->jointype == JOIN_ANTI || + path->jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) +#else + if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. 
+ */ + Cost inner_run_cost = workspace->inner_run_cost; + Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost; + double outer_matched_rows; + double outer_unmatched_rows; + Selectivity inner_scan_frac; + + /* + * For an outer-rel row that has at least one match, we can expect the + * inner scan to stop after a fraction 1/(match_count+1) of the inner + * rows, if the matches are evenly distributed. Since they probably + * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to + * that fraction. (If we used a larger fuzz factor, we'd have to + * clamp inner_scan_frac to at most 1.0; but since match_count is at + * least 1, no such clamp is needed now.) + */ + outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); + outer_unmatched_rows = outer_path_rows - outer_matched_rows; + inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); + + /* + * Compute number of tuples processed (not number emitted!). First, + * account for successfully-matched outer rows. + */ + ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac; + + /* + * Now we need to estimate the actual costs of scanning the inner + * relation, which may be quite a bit less than N times inner_run_cost + * due to early scan stops. We consider two cases. If the inner path + * is an indexscan using all the joinquals as indexquals, then an + * unmatched outer row results in an indexscan returning no rows, + * which is probably quite cheap. Otherwise, the executor will have + * to scan the whole inner rel for an unmatched row; not so cheap. + */ + if (has_indexed_join_quals(path)) + { + /* + * Successfully-matched outer rows will only require scanning + * inner_scan_frac of the inner relation. In this case, we don't + * need to charge the full inner_run_cost even when that's more + * than inner_rescan_run_cost, because we can assume that none of + * the inner scans ever scan the whole inner relation. So it's + * okay to assume that all the inner scan executions can be + * fractions of the full cost, even if materialization is reducing + * the rescan cost. At this writing, it's impossible to get here + * for a materialized inner scan, so inner_run_cost and + * inner_rescan_run_cost will be the same anyway; but just in + * case, use inner_run_cost for the first matched tuple and + * inner_rescan_run_cost for additional ones. + */ + run_cost += inner_run_cost * inner_scan_frac; + if (outer_matched_rows > 1) + run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac; + + /* + * Add the cost of inner-scan executions for unmatched outer rows. + * We estimate this as the same cost as returning the first tuple + * of a nonempty scan. We consider that these are all rescans, + * since we used inner_run_cost once already. + */ + run_cost += outer_unmatched_rows * + inner_rescan_run_cost / inner_path_rows; + + /* + * We won't be evaluating any quals at all for unmatched rows, so + * don't add them to ntuples. + */ + } + else + { + /* + * Here, a complicating factor is that rescans may be cheaper than + * first scans. If we never scan all the way to the end of the + * inner rel, it might be (depending on the plan type) that we'd + * never pay the whole inner first-scan run cost. However it is + * difficult to estimate whether that will happen (and it could + * not happen if there are any unmatched outer rows!), so be + * conservative and always charge the whole first-scan cost once. 
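For reference, a rough standalone sketch of the early-stop arithmetic described in the comment above (outer_matched_rows, inner_scan_frac, and the tuples-processed count). This is illustrative C only, not part of the patch; every numeric input is an invented assumption.

#include <math.h>
#include <stdio.h>

int
main(void)
{
    double outer_path_rows = 1000.0;   /* assumed outer row count */
    double inner_path_rows = 500.0;    /* assumed inner row count */
    double outer_match_frac = 0.3;     /* assumed semifactors.outer_match_frac */
    double match_count = 4.0;          /* assumed semifactors.match_count */
    double outer_matched_rows;
    double outer_unmatched_rows;
    double inner_scan_frac;
    double ntuples;

    outer_matched_rows = rint(outer_path_rows * outer_match_frac);
    outer_unmatched_rows = outer_path_rows - outer_matched_rows;

    /* fuzz factor 2.0 applied to the 1/(match_count+1) stopping fraction */
    inner_scan_frac = 2.0 / (match_count + 1.0);

    /* tuples processed (not emitted) for the matched outer rows */
    ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;

    printf("matched=%.0f unmatched=%.0f scan_frac=%.3f ntuples=%.0f\n",
           outer_matched_rows, outer_unmatched_rows,
           inner_scan_frac, ntuples);
    return 0;
}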
+ * We consider this charge to correspond to the first unmatched + * outer row, unless there isn't one in our estimate, in which + * case blame it on the first matched row. + */ + + /* First, count all unmatched join tuples as being processed */ + ntuples += outer_unmatched_rows * inner_path_rows; + + /* Now add the forced full scan, and decrement appropriate count */ + run_cost += inner_run_cost; + if (outer_unmatched_rows >= 1) + outer_unmatched_rows -= 1; + else + outer_matched_rows -= 1; + + /* Add inner run cost for additional outer tuples having matches */ + if (outer_matched_rows > 0) + run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac; + + /* Add inner run cost for additional unmatched outer tuples */ + if (outer_unmatched_rows > 0) + run_cost += outer_unmatched_rows * inner_rescan_run_cost; + } + } + else + { + /* Normal-case source costs were included in preliminary estimate */ - /* Add inner run cost for additional unmatched outer tuples */ - if (outer_unmatched_rows > 0) - run_cost += outer_unmatched_rows * inner_rescan_run_cost; - } - } - else - { - /* Normal-case source costs were included in preliminary estimate */ + /* Compute number of tuples processed (not number emitted!) */ + ntuples = outer_path_rows * inner_path_rows; + } - /* Compute number of tuples processed (not number emitted!) */ - ntuples = outer_path_rows * inner_path_rows; - } + /* CPU costs */ + cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); + startup_cost += restrict_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; + run_cost += cpu_per_tuple * ntuples; - /* CPU costs */ - cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); - startup_cost += restrict_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; - run_cost += cpu_per_tuple * ntuples; - - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->path.pathtarget->cost.startup; - run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->path.pathtarget->cost.startup; + run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; #ifdef __TBASE__ /* @@ -2630,252 +2644,261 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_mergejoin(PlannerInfo *root, MergePath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->jpath.outerjoinpath; - Path *inner_path = path->jpath.innerjoinpath; - double inner_path_rows = inner_path->rows; - List *mergeclauses = path->path_mergeclauses; - List *innersortkeys = path->innersortkeys; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - Cost inner_run_cost = workspace->inner_run_cost; - double outer_rows = workspace->outer_rows; - double inner_rows = workspace->inner_rows; - double outer_skip_rows = workspace->outer_skip_rows; - double inner_skip_rows = workspace->inner_skip_rows; - Cost cpu_per_tuple, - bare_inner_cost, - mat_inner_cost; - QualCost merge_qual_cost; - QualCost qp_qual_cost; - double mergejointuples, - rescannedtuples; - double rescanratio; - - /* Protect some assumptions below that rowcounts aren't zero or NaN */ - if (inner_path_rows <= 0 || isnan(inner_path_rows)) - inner_path_rows = 1; - - /* Mark the path with the correct row estimate */ - if (path->jpath.path.param_info) - path->jpath.path.rows 
= path->jpath.path.param_info->ppi_rows; - else - path->jpath.path.rows = path->jpath.path.parent->rows; - - /* For partial paths, scale row estimate. */ - if (path->jpath.path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->jpath.path); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->jpath.outerjoinpath; + Path *inner_path = path->jpath.innerjoinpath; + double inner_path_rows = inner_path->rows; + List *mergeclauses = path->path_mergeclauses; + List *innersortkeys = path->innersortkeys; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + Cost inner_run_cost = workspace->inner_run_cost; + double outer_rows = workspace->outer_rows; + double inner_rows = workspace->inner_rows; + double outer_skip_rows = workspace->outer_skip_rows; + double inner_skip_rows = workspace->inner_skip_rows; + Cost cpu_per_tuple, + bare_inner_cost, + mat_inner_cost; + QualCost merge_qual_cost; + QualCost qp_qual_cost; + double mergejointuples, + rescannedtuples; + double rescanratio; + + /* Protect some assumptions below that rowcounts aren't zero or NaN */ + if (inner_path_rows <= 0 || isnan(inner_path_rows)) + inner_path_rows = 1; + + /* Mark the path with the correct row estimate */ + if (path->jpath.path.param_info) + path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; + else + path->jpath.path.rows = path->jpath.path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->jpath.path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->jpath.path); - path->jpath.path.rows = - clamp_row_est(path->jpath.path.rows / parallel_divisor); - } + path->jpath.path.rows = + clamp_row_est(path->jpath.path.rows / parallel_divisor); + } - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. - */ - if (!enable_mergejoin) - startup_cost += disable_cost; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_mergejoin) + startup_cost += disable_cost; - /* - * Compute cost of the mergequals and qpquals (other restriction clauses) - * separately. - */ - cost_qual_eval(&merge_qual_cost, mergeclauses, root); - cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); - qp_qual_cost.startup -= merge_qual_cost.startup; - qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; + /* + * Compute cost of the mergequals and qpquals (other restriction clauses) + * separately. + */ + cost_qual_eval(&merge_qual_cost, mergeclauses, root); + cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); + qp_qual_cost.startup -= merge_qual_cost.startup; + qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop scanning for matches after the first match. When - * all the joinclauses are merge clauses, this means we don't ever need to - * back up the merge, and so we can skip mark/restore overhead. 
- */ - if ((path->jpath.jointype == JOIN_SEMI || - path->jpath.jointype == JOIN_ANTI || - extra->inner_unique) && - (list_length(path->jpath.joinrestrictinfo) == - list_length(path->path_mergeclauses))) - path->skip_mark_restore = true; - else - path->skip_mark_restore = false; + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop scanning for matches after the first match. When + * all the joinclauses are merge clauses, this means we don't ever need to + * back up the merge, and so we can skip mark/restore overhead. + */ +#ifdef __TBASE__ + if ((path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + path->jpath.jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) && + (list_length(path->jpath.joinrestrictinfo) == + list_length(path->path_mergeclauses))) +#else + if ((path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + extra->inner_unique) && + (list_length(path->jpath.joinrestrictinfo) == + list_length(path->path_mergeclauses))) +#endif + path->skip_mark_restore = true; + else + path->skip_mark_restore = false; - /* - * Get approx # tuples passing the mergequals. We use approx_tuple_count - * here because we need an estimate done with JOIN_INNER semantics. - */ - mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); + /* + * Get approx # tuples passing the mergequals. We use approx_tuple_count + * here because we need an estimate done with JOIN_INNER semantics. + */ + mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); - /* - * When there are equal merge keys in the outer relation, the mergejoin - * must rescan any matching tuples in the inner relation. This means - * re-fetching inner tuples; we have to estimate how often that happens. - * - * For regular inner and outer joins, the number of re-fetches can be - * estimated approximately as size of merge join output minus size of - * inner relation. Assume that the distinct key values are 1, 2, ..., and - * denote the number of values of each key in the outer relation as m1, - * m2, ...; in the inner relation, n1, n2, ... Then we have - * - * size of join = m1 * n1 + m2 * n2 + ... - * - * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 * - * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner - * relation - * - * This equation works correctly for outer tuples having no inner match - * (nk = 0), but not for inner tuples having no outer match (mk = 0); we - * are effectively subtracting those from the number of rescanned tuples, - * when we should not. Can we do better without expensive selectivity - * computations? - * - * The whole issue is moot if we are working from a unique-ified outer - * input, or if we know we don't need to mark/restore at all. - */ - if (IsA(outer_path, UniquePath) ||path->skip_mark_restore) - rescannedtuples = 0; - else - { - rescannedtuples = mergejointuples - inner_path_rows; - /* Must clamp because of possible underestimate */ - if (rescannedtuples < 0) - rescannedtuples = 0; - } - /* We'll inflate various costs this much to account for rescanning */ - rescanratio = 1.0 + (rescannedtuples / inner_path_rows); + /* + * When there are equal merge keys in the outer relation, the mergejoin + * must rescan any matching tuples in the inner relation. This means + * re-fetching inner tuples; we have to estimate how often that happens. 
+ * + * For regular inner and outer joins, the number of re-fetches can be + * estimated approximately as size of merge join output minus size of + * inner relation. Assume that the distinct key values are 1, 2, ..., and + * denote the number of values of each key in the outer relation as m1, + * m2, ...; in the inner relation, n1, n2, ... Then we have + * + * size of join = m1 * n1 + m2 * n2 + ... + * + * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 * + * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner + * relation + * + * This equation works correctly for outer tuples having no inner match + * (nk = 0), but not for inner tuples having no outer match (mk = 0); we + * are effectively subtracting those from the number of rescanned tuples, + * when we should not. Can we do better without expensive selectivity + * computations? + * + * The whole issue is moot if we are working from a unique-ified outer + * input, or if we know we don't need to mark/restore at all. + */ + if (IsA(outer_path, UniquePath) ||path->skip_mark_restore) + rescannedtuples = 0; + else + { + rescannedtuples = mergejointuples - inner_path_rows; + /* Must clamp because of possible underestimate */ + if (rescannedtuples < 0) + rescannedtuples = 0; + } + /* We'll inflate various costs this much to account for rescanning */ + rescanratio = 1.0 + (rescannedtuples / inner_path_rows); - /* - * Decide whether we want to materialize the inner input to shield it from - * mark/restore and performing re-fetches. Our cost model for regular - * re-fetches is that a re-fetch costs the same as an original fetch, - * which is probably an overestimate; but on the other hand we ignore the - * bookkeeping costs of mark/restore. Not clear if it's worth developing - * a more refined model. So we just need to inflate the inner run cost by - * rescanratio. - */ - bare_inner_cost = inner_run_cost * rescanratio; + /* + * Decide whether we want to materialize the inner input to shield it from + * mark/restore and performing re-fetches. Our cost model for regular + * re-fetches is that a re-fetch costs the same as an original fetch, + * which is probably an overestimate; but on the other hand we ignore the + * bookkeeping costs of mark/restore. Not clear if it's worth developing + * a more refined model. So we just need to inflate the inner run cost by + * rescanratio. + */ + bare_inner_cost = inner_run_cost * rescanratio; - /* - * When we interpose a Material node the re-fetch cost is assumed to be - * just cpu_operator_cost per tuple, independently of the underlying - * plan's cost; and we charge an extra cpu_operator_cost per original - * fetch as well. Note that we're assuming the materialize node will - * never spill to disk, since it only has to remember tuples back to the - * last mark. (If there are a huge number of duplicates, our other cost - * factors will make the path so expensive that it probably won't get - * chosen anyway.) So we don't use cost_rescan here. - * - * Note: keep this estimate in sync with create_mergejoin_plan's labeling - * of the generated Material node. - */ - mat_inner_cost = inner_run_cost + - cpu_operator_cost * inner_path_rows * rescanratio; + /* + * When we interpose a Material node the re-fetch cost is assumed to be + * just cpu_operator_cost per tuple, independently of the underlying + * plan's cost; and we charge an extra cpu_operator_cost per original + * fetch as well. 
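For reference, a rough standalone sketch of the rescanned-tuples estimate described above (rescanned = size of join minus size of inner relation, clamped at zero, and the resulting rescanratio). Illustrative C only, with invented numbers; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double mergejointuples = 12000.0;  /* assumed rows passing the mergequals */
    double inner_path_rows = 10000.0;  /* assumed inner relation size */
    double rescannedtuples;
    double rescanratio;

    /* size of join minus size of inner relation, clamped at zero */
    rescannedtuples = mergejointuples - inner_path_rows;
    if (rescannedtuples < 0)
        rescannedtuples = 0;

    /* inflate inner-side costs by this much to account for rescanning */
    rescanratio = 1.0 + (rescannedtuples / inner_path_rows);

    printf("rescanned=%.0f rescanratio=%.2f\n", rescannedtuples, rescanratio);
    return 0;
}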
Note that we're assuming the materialize node will + * never spill to disk, since it only has to remember tuples back to the + * last mark. (If there are a huge number of duplicates, our other cost + * factors will make the path so expensive that it probably won't get + * chosen anyway.) So we don't use cost_rescan here. + * + * Note: keep this estimate in sync with create_mergejoin_plan's labeling + * of the generated Material node. + */ + mat_inner_cost = inner_run_cost + + cpu_operator_cost * inner_path_rows * rescanratio; - /* - * If we don't need mark/restore at all, we don't need materialization. - */ - if (path->skip_mark_restore) - path->materialize_inner = false; + /* + * If we don't need mark/restore at all, we don't need materialization. + */ + if (path->skip_mark_restore) + path->materialize_inner = false; - /* - * Prefer materializing if it looks cheaper, unless the user has asked to - * suppress materialization. - */ - else if (enable_material && mat_inner_cost < bare_inner_cost) - path->materialize_inner = true; + /* + * Prefer materializing if it looks cheaper, unless the user has asked to + * suppress materialization. + */ + else if (enable_material && mat_inner_cost < bare_inner_cost) + path->materialize_inner = true; - /* - * Even if materializing doesn't look cheaper, we *must* do it if the - * inner path is to be used directly (without sorting) and it doesn't - * support mark/restore. - * - * Since the inner side must be ordered, and only Sorts and IndexScans can - * create order to begin with, and they both support mark/restore, you - * might think there's no problem --- but you'd be wrong. Nestloop and - * merge joins can *preserve* the order of their inputs, so they can be - * selected as the input of a mergejoin, and they don't support - * mark/restore at present. - * - * We don't test the value of enable_material here, because - * materialization is required for correctness in this case, and turning - * it off does not entitle us to deliver an invalid plan. - */ - else if (innersortkeys == NIL && - !ExecSupportsMarkRestore(inner_path)) - path->materialize_inner = true; + /* + * Even if materializing doesn't look cheaper, we *must* do it if the + * inner path is to be used directly (without sorting) and it doesn't + * support mark/restore. + * + * Since the inner side must be ordered, and only Sorts and IndexScans can + * create order to begin with, and they both support mark/restore, you + * might think there's no problem --- but you'd be wrong. Nestloop and + * merge joins can *preserve* the order of their inputs, so they can be + * selected as the input of a mergejoin, and they don't support + * mark/restore at present. + * + * We don't test the value of enable_material here, because + * materialization is required for correctness in this case, and turning + * it off does not entitle us to deliver an invalid plan. + */ + else if (innersortkeys == NIL && + !ExecSupportsMarkRestore(inner_path)) + path->materialize_inner = true; - /* - * Also, force materializing if the inner path is to be sorted and the - * sort is expected to spill to disk. This is because the final merge - * pass can be done on-the-fly if it doesn't have to support mark/restore. - * We don't try to adjust the cost estimates for this consideration, - * though. - * - * Since materialization is a performance optimization in this case, - * rather than necessary for correctness, we skip it if enable_material is - * off. 
- */ - else if (enable_material && innersortkeys != NIL && - relation_byte_size(inner_path_rows, - inner_path->pathtarget->width) > - (work_mem * 1024L)) - path->materialize_inner = true; + /* + * Also, force materializing if the inner path is to be sorted and the + * sort is expected to spill to disk. This is because the final merge + * pass can be done on-the-fly if it doesn't have to support mark/restore. + * We don't try to adjust the cost estimates for this consideration, + * though. + * + * Since materialization is a performance optimization in this case, + * rather than necessary for correctness, we skip it if enable_material is + * off. + */ + else if (enable_material && innersortkeys != NIL && + relation_byte_size(inner_path_rows, + inner_path->pathtarget->width) > + (work_mem * 1024L)) + path->materialize_inner = true; #ifdef XCP - /* - * Even if innersortkeys are specified, we never add the Sort node on top - * of RemoteSubplan, instead we set up internal sorter. - * Since RemoteSubplan does not support mark/restore we must materialize it - */ - else if (inner_path->pathtype == T_RemoteSubplan) - path->materialize_inner = true; + /* + * Even if innersortkeys are specified, we never add the Sort node on top + * of RemoteSubplan, instead we set up internal sorter. + * Since RemoteSubplan does not support mark/restore we must materialize it + */ + else if (inner_path->pathtype == T_RemoteSubplan) + path->materialize_inner = true; #endif - else - path->materialize_inner = false; + else + path->materialize_inner = false; - /* Charge the right incremental cost for the chosen case */ - if (path->materialize_inner) - run_cost += mat_inner_cost; - else - run_cost += bare_inner_cost; + /* Charge the right incremental cost for the chosen case */ + if (path->materialize_inner) + run_cost += mat_inner_cost; + else + run_cost += bare_inner_cost; - /* CPU costs */ + /* CPU costs */ - /* - * The number of tuple comparisons needed is approximately number of outer - * rows plus number of inner rows plus number of rescanned tuples (can we - * refine this?). At each one, we need to evaluate the mergejoin quals. - */ - startup_cost += merge_qual_cost.startup; - startup_cost += merge_qual_cost.per_tuple * - (outer_skip_rows + inner_skip_rows * rescanratio); - run_cost += merge_qual_cost.per_tuple * - ((outer_rows - outer_skip_rows) + - (inner_rows - inner_skip_rows) * rescanratio); + /* + * The number of tuple comparisons needed is approximately number of outer + * rows plus number of inner rows plus number of rescanned tuples (can we + * refine this?). At each one, we need to evaluate the mergejoin quals. + */ + startup_cost += merge_qual_cost.startup; + startup_cost += merge_qual_cost.per_tuple * + (outer_skip_rows + inner_skip_rows * rescanratio); + run_cost += merge_qual_cost.per_tuple * + ((outer_rows - outer_skip_rows) + + (inner_rows - inner_skip_rows) * rescanratio); - /* - * For each tuple that gets through the mergejoin proper, we charge - * cpu_tuple_cost plus the cost of evaluating additional restriction - * clauses that are to be applied at the join. (This is pessimistic since - * not all of the quals may get evaluated at each tuple.) - * - * Note: we could adjust for SEMI/ANTI joins skipping some qual - * evaluations here, but it's probably not worth the trouble. 
- */ - startup_cost += qp_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * mergejointuples; + /* + * For each tuple that gets through the mergejoin proper, we charge + * cpu_tuple_cost plus the cost of evaluating additional restriction + * clauses that are to be applied at the join. (This is pessimistic since + * not all of the quals may get evaluated at each tuple.) + * + * Note: we could adjust for SEMI/ANTI joins skipping some qual + * evaluations here, but it's probably not worth the trouble. + */ + startup_cost += qp_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; + run_cost += cpu_per_tuple * mergejointuples; - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->jpath.path.pathtarget->cost.startup; - run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->jpath.path.pathtarget->cost.startup; + run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; - path->jpath.path.startup_cost = startup_cost; - path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.startup_cost = startup_cost; + path->jpath.path.total_cost = startup_cost + run_cost; } /* @@ -3071,220 +3094,231 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_hashjoin(PlannerInfo *root, HashPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->jpath.outerjoinpath; - Path *inner_path = path->jpath.innerjoinpath; - double outer_path_rows = outer_path->rows; - double inner_path_rows = inner_path->rows; - List *hashclauses = path->path_hashclauses; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - int numbuckets = workspace->numbuckets; - int numbatches = workspace->numbatches; - Cost cpu_per_tuple; - QualCost hash_qual_cost; - QualCost qp_qual_cost; - double hashjointuples; - double virtualbuckets; - Selectivity innerbucketsize; - ListCell *hcl; - - /* Mark the path with the correct row estimate */ - if (path->jpath.path.param_info) - path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; - else - path->jpath.path.rows = path->jpath.path.parent->rows; - - /* For partial paths, scale row estimate. */ - if (path->jpath.path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->jpath.path); - - path->jpath.path.rows = - clamp_row_est(path->jpath.path.rows / parallel_divisor); - } - - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. 
- */ - if (!enable_hashjoin) - startup_cost += disable_cost; + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->jpath.outerjoinpath; + Path *inner_path = path->jpath.innerjoinpath; + double outer_path_rows = outer_path->rows; + double inner_path_rows = inner_path->rows; + List *hashclauses = path->path_hashclauses; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + int numbuckets = workspace->numbuckets; + int numbatches = workspace->numbatches; + Cost cpu_per_tuple; + QualCost hash_qual_cost; + QualCost qp_qual_cost; + double hashjointuples; + double virtualbuckets; + Selectivity innerbucketsize; + ListCell *hcl; + + /* Mark the path with the correct row estimate */ + if (path->jpath.path.param_info) + path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; + else + path->jpath.path.rows = path->jpath.path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->jpath.path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->jpath.path); - /* mark the path with estimated # of batches */ - path->num_batches = numbatches; + path->jpath.path.rows = + clamp_row_est(path->jpath.path.rows / parallel_divisor); + } - /* and compute the number of "virtual" buckets in the whole join */ - virtualbuckets = (double) numbuckets * (double) numbatches; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_hashjoin) + startup_cost += disable_cost; - /* - * Determine bucketsize fraction for inner relation. We use the smallest - * bucketsize estimated for any individual hashclause; this is undoubtedly - * conservative. - * - * BUT: if inner relation has been unique-ified, we can assume it's good - * for hashing. This is important both because it's the right answer, and - * because we avoid contaminating the cache with a value that's wrong for - * non-unique-ified paths. - */ - if (IsA(inner_path, UniquePath)) - innerbucketsize = 1.0 / virtualbuckets; - else - { - innerbucketsize = 1.0; - foreach(hcl, hashclauses) - { - RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); - Selectivity thisbucketsize; + /* mark the path with estimated # of batches */ + path->num_batches = numbatches; - /* - * First we have to figure out which side of the hashjoin clause - * is the inner side. - * - * Since we tend to visit the same clauses over and over when - * planning a large query, we cache the bucketsize estimate in the - * RestrictInfo node to avoid repeated lookups of statistics. 
- */ - if (bms_is_subset(restrictinfo->right_relids, - inner_path->parent->relids)) - { - /* righthand side is inner */ - thisbucketsize = restrictinfo->right_bucketsize; - if (thisbucketsize < 0) - { - /* not cached yet */ - thisbucketsize = - estimate_hash_bucketsize(root, - get_rightop(restrictinfo->clause), - virtualbuckets); - restrictinfo->right_bucketsize = thisbucketsize; - } - } - else - { - Assert(bms_is_subset(restrictinfo->left_relids, - inner_path->parent->relids)); - /* lefthand side is inner */ - thisbucketsize = restrictinfo->left_bucketsize; - if (thisbucketsize < 0) - { - /* not cached yet */ - thisbucketsize = - estimate_hash_bucketsize(root, - get_leftop(restrictinfo->clause), - virtualbuckets); - restrictinfo->left_bucketsize = thisbucketsize; - } - } + /* and compute the number of "virtual" buckets in the whole join */ + virtualbuckets = (double) numbuckets * (double) numbatches; - if (innerbucketsize > thisbucketsize) - innerbucketsize = thisbucketsize; - } - } + /* + * Determine bucketsize fraction for inner relation. We use the smallest + * bucketsize estimated for any individual hashclause; this is undoubtedly + * conservative. + * + * BUT: if inner relation has been unique-ified, we can assume it's good + * for hashing. This is important both because it's the right answer, and + * because we avoid contaminating the cache with a value that's wrong for + * non-unique-ified paths. + */ + if (IsA(inner_path, UniquePath)) + innerbucketsize = 1.0 / virtualbuckets; + else + { + innerbucketsize = 1.0; + foreach(hcl, hashclauses) + { + RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); + Selectivity thisbucketsize; + + /* + * First we have to figure out which side of the hashjoin clause + * is the inner side. + * + * Since we tend to visit the same clauses over and over when + * planning a large query, we cache the bucketsize estimate in the + * RestrictInfo node to avoid repeated lookups of statistics. + */ + if (bms_is_subset(restrictinfo->right_relids, + inner_path->parent->relids)) + { + /* righthand side is inner */ + thisbucketsize = restrictinfo->right_bucketsize; + if (thisbucketsize < 0) + { + /* not cached yet */ + thisbucketsize = + estimate_hash_bucketsize(root, + get_rightop(restrictinfo->clause), + virtualbuckets); + restrictinfo->right_bucketsize = thisbucketsize; + } + } + else + { + Assert(bms_is_subset(restrictinfo->left_relids, + inner_path->parent->relids)); + /* lefthand side is inner */ + thisbucketsize = restrictinfo->left_bucketsize; + if (thisbucketsize < 0) + { + /* not cached yet */ + thisbucketsize = + estimate_hash_bucketsize(root, + get_leftop(restrictinfo->clause), + virtualbuckets); + restrictinfo->left_bucketsize = thisbucketsize; + } + } + + if (innerbucketsize > thisbucketsize) + innerbucketsize = thisbucketsize; + } + } - /* - * Compute cost of the hashquals and qpquals (other restriction clauses) - * separately. - */ - cost_qual_eval(&hash_qual_cost, hashclauses, root); - cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); - qp_qual_cost.startup -= hash_qual_cost.startup; - qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; + /* + * Compute cost of the hashquals and qpquals (other restriction clauses) + * separately. 
+ */ + cost_qual_eval(&hash_qual_cost, hashclauses, root); + cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); + qp_qual_cost.startup -= hash_qual_cost.startup; + qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; - /* CPU costs */ + /* CPU costs */ +#ifdef __TBASE__ if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || + path->jpath.jointype == JOIN_LEFT_SCALAR || extra->inner_unique) - { - double outer_matched_rows; - Selectivity inner_scan_frac; - - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - * - * For an outer-rel row that has at least one match, we can expect the - * bucket scan to stop after a fraction 1/(match_count+1) of the - * bucket's rows, if the matches are evenly distributed. Since they - * probably aren't quite evenly distributed, we apply a fuzz factor of - * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have - * to clamp inner_scan_frac to at most 1.0; but since match_count is - * at least 1, no such clamp is needed now.) - */ - outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); - inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); - - startup_cost += hash_qual_cost.startup; - run_cost += hash_qual_cost.per_tuple * outer_matched_rows * - clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; - - /* - * For unmatched outer-rel rows, the picture is quite a lot different. - * In the first place, there is no reason to assume that these rows - * preferentially hit heavily-populated buckets; instead assume they - * are uncorrelated with the inner distribution and so they see an - * average bucket size of inner_path_rows / virtualbuckets. In the - * second place, it seems likely that they will have few if any exact - * hash-code matches and so very few of the tuples in the bucket will - * actually require eval of the hash quals. We don't have any good - * way to estimate how many will, but for the moment assume that the - * effective cost per bucket entry is one-tenth what it is for - * matchable tuples. - */ - run_cost += hash_qual_cost.per_tuple * - (outer_path_rows - outer_matched_rows) * - clamp_row_est(inner_path_rows / virtualbuckets) * 0.05; - - /* Get # of tuples that will pass the basic join */ - if (path->jpath.jointype == JOIN_SEMI) - hashjointuples = outer_matched_rows; - else - hashjointuples = outer_path_rows - outer_matched_rows; - } - else - { - /* - * The number of tuple comparisons needed is the number of outer - * tuples times the typical number of tuples in a hash bucket, which - * is the inner relation size times its bucketsize fraction. At each - * one, we need to evaluate the hashjoin quals. But actually, - * charging the full qual eval cost at each tuple is pessimistic, - * since we don't evaluate the quals unless the hash values match - * exactly. For lack of a better idea, halve the cost estimate to - * allow for that. - */ - startup_cost += hash_qual_cost.startup; - run_cost += hash_qual_cost.per_tuple * outer_path_rows * - clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; - - /* - * Get approx # tuples passing the hashquals. We use - * approx_tuple_count here because we need an estimate done with - * JOIN_INNER semantics. 
- */ - hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); - } +#else + if (path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + double outer_matched_rows; + Selectivity inner_scan_frac; + + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. + * + * For an outer-rel row that has at least one match, we can expect the + * bucket scan to stop after a fraction 1/(match_count+1) of the + * bucket's rows, if the matches are evenly distributed. Since they + * probably aren't quite evenly distributed, we apply a fuzz factor of + * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have + * to clamp inner_scan_frac to at most 1.0; but since match_count is + * at least 1, no such clamp is needed now.) + */ + outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); + inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); + + startup_cost += hash_qual_cost.startup; + run_cost += hash_qual_cost.per_tuple * outer_matched_rows * + clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; + + /* + * For unmatched outer-rel rows, the picture is quite a lot different. + * In the first place, there is no reason to assume that these rows + * preferentially hit heavily-populated buckets; instead assume they + * are uncorrelated with the inner distribution and so they see an + * average bucket size of inner_path_rows / virtualbuckets. In the + * second place, it seems likely that they will have few if any exact + * hash-code matches and so very few of the tuples in the bucket will + * actually require eval of the hash quals. We don't have any good + * way to estimate how many will, but for the moment assume that the + * effective cost per bucket entry is one-tenth what it is for + * matchable tuples. + */ + run_cost += hash_qual_cost.per_tuple * + (outer_path_rows - outer_matched_rows) * + clamp_row_est(inner_path_rows / virtualbuckets) * 0.05; + + /* Get # of tuples that will pass the basic join */ +#ifdef __TBASE__ + if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_LEFT_SCALAR) +#else + if (path->jpath.jointype == JOIN_SEMI) +#endif + hashjointuples = outer_matched_rows; + else + hashjointuples = outer_path_rows - outer_matched_rows; + } + else + { + /* + * The number of tuple comparisons needed is the number of outer + * tuples times the typical number of tuples in a hash bucket, which + * is the inner relation size times its bucketsize fraction. At each + * one, we need to evaluate the hashjoin quals. But actually, + * charging the full qual eval cost at each tuple is pessimistic, + * since we don't evaluate the quals unless the hash values match + * exactly. For lack of a better idea, halve the cost estimate to + * allow for that. + */ + startup_cost += hash_qual_cost.startup; + run_cost += hash_qual_cost.per_tuple * outer_path_rows * + clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; + + /* + * Get approx # tuples passing the hashquals. We use + * approx_tuple_count here because we need an estimate done with + * JOIN_INNER semantics. + */ + hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); + } - /* - * For each tuple that gets through the hashjoin proper, we charge - * cpu_tuple_cost plus the cost of evaluating additional restriction - * clauses that are to be applied at the join. 
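For reference, a rough standalone sketch of the comparison count used in the plain inner-join branch above: each outer tuple probes one bucket whose expected size is the inner row count times the bucket-size fraction, and the qual cost is halved because most probes fail on the hash value alone. Illustrative C only, with invented numbers; clamp_row_est is omitted for brevity; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double outer_path_rows = 10000.0;       /* assumed */
    double inner_path_rows = 5000.0;        /* assumed */
    double numbuckets = 1024.0;             /* assumed, from initial costing */
    double numbatches = 2.0;                /* assumed, from initial costing */
    double innerbucketsize = 1.0 / 2048.0;  /* assumed smallest per-clause estimate */
    double hash_qual_per_tuple = 0.0025;    /* assumed per-tuple qual eval cost */
    double virtualbuckets;
    double bucket_rows;
    double comparisons;

    virtualbuckets = numbuckets * numbatches;

    /* expected rows in the probed bucket (clamp_row_est omitted) */
    bucket_rows = inner_path_rows * innerbucketsize;

    /* halve the qual cost: most probes fail on the hash value alone */
    comparisons = outer_path_rows * bucket_rows * 0.5;

    printf("virtualbuckets=%.0f bucket_rows=%.2f qual_run_cost=%.2f\n",
           virtualbuckets, bucket_rows, hash_qual_per_tuple * comparisons);
    return 0;
}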
(This is pessimistic since - * not all of the quals may get evaluated at each tuple.) - */ - startup_cost += qp_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * hashjointuples; + /* + * For each tuple that gets through the hashjoin proper, we charge + * cpu_tuple_cost plus the cost of evaluating additional restriction + * clauses that are to be applied at the join. (This is pessimistic since + * not all of the quals may get evaluated at each tuple.) + */ + startup_cost += qp_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; + run_cost += cpu_per_tuple * hashjointuples; - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->jpath.path.pathtarget->cost.startup; - run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->jpath.path.pathtarget->cost.startup; + run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; - path->jpath.path.startup_cost = startup_cost; - path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.startup_cost = startup_cost; + path->jpath.path.total_cost = startup_cost + run_cost; } @@ -4277,138 +4311,141 @@ get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel, */ static double calc_joinrel_size_estimate(PlannerInfo *root, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - double outer_rows, - double inner_rows, - SpecialJoinInfo *sjinfo, - List *restrictlist_in) -{// #lizard forgives - /* This apparently-useless variable dodges a compiler bug in VS2013: */ - List *restrictlist = restrictlist_in; - JoinType jointype = sjinfo->jointype; - Selectivity fkselec; - Selectivity jselec; - Selectivity pselec; - double nrows; - - /* - * Compute joinclause selectivity. Note that we are only considering - * clauses that become restriction clauses at this join level; we are not - * double-counting them because they were not considered in estimating the - * sizes of the component rels. - * - * First, see whether any of the joinclauses can be matched to known FK - * constraints. If so, drop those clauses from the restrictlist, and - * instead estimate their selectivity using FK semantics. (We do this - * without regard to whether said clauses are local or "pushed down". - * Probably, an FK-matching clause could never be seen as pushed down at - * an outer join, since it would be strict and hence would be grounds for - * join strength reduction.) fkselec gets the net selectivity for - * FK-matching clauses, or 1.0 if there are none. - */ - fkselec = get_foreign_key_join_selectivity(root, - outer_rel->relids, - inner_rel->relids, - sjinfo, - &restrictlist); - - /* - * For an outer join, we have to distinguish the selectivity of the join's - * own clauses (JOIN/ON conditions) from any clauses that were "pushed - * down". For inner joins we just count them all as joinclauses. 
- */ - if (IS_OUTER_JOIN(jointype)) - { - List *joinquals = NIL; - List *pushedquals = NIL; - ListCell *l; - - /* Grovel through the clauses to separate into two lists */ - foreach(l, restrictlist) - { - RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + double outer_rows, + double inner_rows, + SpecialJoinInfo *sjinfo, + List *restrictlist_in) +{ + /* This apparently-useless variable dodges a compiler bug in VS2013: */ + List *restrictlist = restrictlist_in; + JoinType jointype = sjinfo->jointype; + Selectivity fkselec; + Selectivity jselec; + Selectivity pselec; + double nrows; - if (rinfo->is_pushed_down) - pushedquals = lappend(pushedquals, rinfo); - else - joinquals = lappend(joinquals, rinfo); - } + /* + * Compute joinclause selectivity. Note that we are only considering + * clauses that become restriction clauses at this join level; we are not + * double-counting them because they were not considered in estimating the + * sizes of the component rels. + * + * First, see whether any of the joinclauses can be matched to known FK + * constraints. If so, drop those clauses from the restrictlist, and + * instead estimate their selectivity using FK semantics. (We do this + * without regard to whether said clauses are local or "pushed down". + * Probably, an FK-matching clause could never be seen as pushed down at + * an outer join, since it would be strict and hence would be grounds for + * join strength reduction.) fkselec gets the net selectivity for + * FK-matching clauses, or 1.0 if there are none. + */ + fkselec = get_foreign_key_join_selectivity(root, + outer_rel->relids, + inner_rel->relids, + sjinfo, + &restrictlist); - /* Get the separate selectivities */ - jselec = clauselist_selectivity(root, - joinquals, - 0, - jointype, - sjinfo); - pselec = clauselist_selectivity(root, - pushedquals, - 0, - jointype, - sjinfo); - - /* Avoid leaking a lot of ListCells */ - list_free(joinquals); - list_free(pushedquals); - } - else - { - jselec = clauselist_selectivity(root, - restrictlist, - 0, - jointype, - sjinfo); - pselec = 0.0; /* not used, keep compiler quiet */ - } + /* + * For an outer join, we have to distinguish the selectivity of the join's + * own clauses (JOIN/ON conditions) from any clauses that were "pushed + * down". For inner joins we just count them all as joinclauses. + */ + if (IS_OUTER_JOIN(jointype)) + { + List *joinquals = NIL; + List *pushedquals = NIL; + ListCell *l; + + /* Grovel through the clauses to separate into two lists */ + foreach(l, restrictlist) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + + if (rinfo->is_pushed_down) + pushedquals = lappend(pushedquals, rinfo); + else + joinquals = lappend(joinquals, rinfo); + } + + /* Get the separate selectivities */ + jselec = clauselist_selectivity(root, + joinquals, + 0, + jointype, + sjinfo); + pselec = clauselist_selectivity(root, + pushedquals, + 0, + jointype, + sjinfo); + + /* Avoid leaking a lot of ListCells */ + list_free(joinquals); + list_free(pushedquals); + } + else + { + jselec = clauselist_selectivity(root, + restrictlist, + 0, + jointype, + sjinfo); + pselec = 0.0; /* not used, keep compiler quiet */ + } - /* - * Basically, we multiply size of Cartesian product by selectivity. - * - * If we are doing an outer join, take that into account: the joinqual - * selectivity has to be clamped using the knowledge that the output must - * be at least as large as the non-nullable input. 
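For reference, a rough standalone sketch of the size arithmetic described above for the JOIN_LEFT case: Cartesian product times selectivity, clamped so the estimate is never smaller than the outer (non-nullable) side, then scaled by any pushed-down-qual selectivity. Illustrative C only, with invented numbers; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double outer_rows = 1000.0;  /* assumed */
    double inner_rows = 200.0;   /* assumed */
    double fkselec = 1.0;        /* assumed: no FK-matched clauses */
    double jselec = 0.002;       /* assumed joinclause selectivity */
    double pselec = 0.5;         /* assumed pushed-down qual selectivity */
    double nrows;

    /* JOIN_LEFT: clamp to outer_rows, then apply pushed-down quals */
    nrows = outer_rows * inner_rows * fkselec * jselec;
    if (nrows < outer_rows)
        nrows = outer_rows;
    nrows *= pselec;

    printf("estimated LEFT join rows = %.0f\n", nrows);
    return 0;
}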
However, any - * pushed-down quals are applied after the outer join, so their - * selectivity applies fully. - * - * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction - * of LHS rows that have matches, and we apply that straightforwardly. - */ - switch (jointype) - { - case JOIN_INNER: - nrows = outer_rows * inner_rows * fkselec * jselec; - /* pselec not used */ - break; - case JOIN_LEFT: - nrows = outer_rows * inner_rows * fkselec * jselec; - if (nrows < outer_rows) - nrows = outer_rows; - nrows *= pselec; - break; - case JOIN_FULL: - nrows = outer_rows * inner_rows * fkselec * jselec; - if (nrows < outer_rows) - nrows = outer_rows; - if (nrows < inner_rows) - nrows = inner_rows; - nrows *= pselec; - break; + /* + * Basically, we multiply size of Cartesian product by selectivity. + * + * If we are doing an outer join, take that into account: the joinqual + * selectivity has to be clamped using the knowledge that the output must + * be at least as large as the non-nullable input. However, any + * pushed-down quals are applied after the outer join, so their + * selectivity applies fully. + * + * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction + * of LHS rows that have matches, and we apply that straightforwardly. + */ + switch (jointype) + { + case JOIN_INNER: + nrows = outer_rows * inner_rows * fkselec * jselec; + /* pselec not used */ + break; + case JOIN_LEFT: + nrows = outer_rows * inner_rows * fkselec * jselec; + if (nrows < outer_rows) + nrows = outer_rows; + nrows *= pselec; + break; + case JOIN_FULL: + nrows = outer_rows * inner_rows * fkselec * jselec; + if (nrows < outer_rows) + nrows = outer_rows; + if (nrows < inner_rows) + nrows = inner_rows; + nrows *= pselec; + break; case JOIN_SEMI: - nrows = outer_rows * fkselec * jselec; - /* pselec not used */ - break; - case JOIN_ANTI: - nrows = outer_rows * (1.0 - fkselec * jselec); - nrows *= pselec; - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", (int) jointype); - nrows = 0; /* keep compiler quiet */ - break; - } +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + nrows = outer_rows * fkselec * jselec; + /* pselec not used */ + break; + case JOIN_ANTI: + nrows = outer_rows * (1.0 - fkselec * jselec); + nrows *= pselec; + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", (int) jointype); + nrows = 0; /* keep compiler quiet */ + break; + } - return clamp_row_est(nrows); + return clamp_row_est(nrows); } /* @@ -4428,202 +4465,211 @@ calc_joinrel_size_estimate(PlannerInfo *root, */ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root, - Relids outer_relids, - Relids inner_relids, - SpecialJoinInfo *sjinfo, - List **restrictlist) -{// #lizard forgives - Selectivity fkselec = 1.0; - JoinType jointype = sjinfo->jointype; - List *worklist = *restrictlist; - ListCell *lc; - - /* Consider each FK constraint that is known to match the query */ - foreach(lc, root->fkey_list) - { - ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc); - bool ref_is_outer; - List *removedlist; - ListCell *cell; - ListCell *prev; - ListCell *next; - - /* - * This FK is not relevant unless it connects a baserel on one side of - * this join to a baserel on the other side. 
- */ - if (bms_is_member(fkinfo->con_relid, outer_relids) && - bms_is_member(fkinfo->ref_relid, inner_relids)) - ref_is_outer = false; - else if (bms_is_member(fkinfo->ref_relid, outer_relids) && - bms_is_member(fkinfo->con_relid, inner_relids)) - ref_is_outer = true; - else - continue; - - /* - * If we're dealing with a semi/anti join, and the FK's referenced - * relation is on the outside, then knowledge of the FK doesn't help - * us figure out what we need to know (which is the fraction of outer - * rows that have matches). On the other hand, if the referenced rel - * is on the inside, then all outer rows must have matches in the - * referenced table (ignoring nulls). But any restriction or join - * clauses that filter that table will reduce the fraction of matches. - * We can account for restriction clauses, but it's too hard to guess - * how many table rows would get through a join that's inside the RHS. - * Hence, if either case applies, punt and ignore the FK. - */ - if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && - (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) - continue; - - /* - * Modify the restrictlist by removing clauses that match the FK (and - * putting them into removedlist instead). It seems unsafe to modify - * the originally-passed List structure, so we make a shallow copy the - * first time through. - */ - if (worklist == *restrictlist) - worklist = list_copy(worklist); - - removedlist = NIL; - prev = NULL; - for (cell = list_head(worklist); cell; cell = next) - { - RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell); - bool remove_it = false; - int i; - - next = lnext(cell); - /* Drop this clause if it matches any column of the FK */ - for (i = 0; i < fkinfo->nkeys; i++) - { - if (rinfo->parent_ec) - { - /* - * EC-derived clauses can only match by EC. It is okay to - * consider any clause derived from the same EC as - * matching the FK: even if equivclass.c chose to generate - * a clause equating some other pair of Vars, it could - * have generated one equating the FK's Vars. So for - * purposes of estimation, we can act as though it did so. - * - * Note: checking parent_ec is a bit of a cheat because - * there are EC-derived clauses that don't have parent_ec - * set; but such clauses must compare expressions that - * aren't just Vars, so they cannot match the FK anyway. - */ - if (fkinfo->eclass[i] == rinfo->parent_ec) - { - remove_it = true; - break; - } - } - else - { - /* - * Otherwise, see if rinfo was previously matched to FK as - * a "loose" clause. - */ - if (list_member_ptr(fkinfo->rinfos[i], rinfo)) - { - remove_it = true; - break; - } - } - } - if (remove_it) - { - worklist = list_delete_cell(worklist, cell, prev); - removedlist = lappend(removedlist, rinfo); - } - else - prev = cell; - } + Relids outer_relids, + Relids inner_relids, + SpecialJoinInfo *sjinfo, + List **restrictlist) +{ + Selectivity fkselec = 1.0; + JoinType jointype = sjinfo->jointype; + List *worklist = *restrictlist; + ListCell *lc; - /* - * If we failed to remove all the matching clauses we expected to - * find, chicken out and ignore this FK; applying its selectivity - * might result in double-counting. Put any clauses we did manage to - * remove back into the worklist. - * - * Since the matching clauses are known not outerjoin-delayed, they - * should certainly have appeared in the initial joinclause list. If - * we didn't find them, they must have been matched to, and removed - * by, some other FK in a previous iteration of this loop. 
(A likely - * case is that two FKs are matched to the same EC; there will be only - * one EC-derived clause in the initial list, so the first FK will - * consume it.) Applying both FKs' selectivity independently risks - * underestimating the join size; in particular, this would undo one - * of the main things that ECs were invented for, namely to avoid - * double-counting the selectivity of redundant equality conditions. - * Later we might think of a reasonable way to combine the estimates, - * but for now, just punt, since this is a fairly uncommon situation. - */ - if (list_length(removedlist) != - (fkinfo->nmatched_ec + fkinfo->nmatched_ri)) - { - worklist = list_concat(worklist, removedlist); + /* Consider each FK constraint that is known to match the query */ + foreach(lc, root->fkey_list) + { + ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc); + bool ref_is_outer; + List *removedlist; + ListCell *cell; + ListCell *prev; + ListCell *next; + + /* + * This FK is not relevant unless it connects a baserel on one side of + * this join to a baserel on the other side. + */ + if (bms_is_member(fkinfo->con_relid, outer_relids) && + bms_is_member(fkinfo->ref_relid, inner_relids)) + ref_is_outer = false; + else if (bms_is_member(fkinfo->ref_relid, outer_relids) && + bms_is_member(fkinfo->con_relid, inner_relids)) + ref_is_outer = true; + else + continue; + + /* + * If we're dealing with a semi/anti join, and the FK's referenced + * relation is on the outside, then knowledge of the FK doesn't help + * us figure out what we need to know (which is the fraction of outer + * rows that have matches). On the other hand, if the referenced rel + * is on the inside, then all outer rows must have matches in the + * referenced table (ignoring nulls). But any restriction or join + * clauses that filter that table will reduce the fraction of matches. + * We can account for restriction clauses, but it's too hard to guess + * how many table rows would get through a join that's inside the RHS. + * Hence, if either case applies, punt and ignore the FK. + */ +#ifdef __TBASE__ + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) && + (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) continue; - } - - /* - * Finally we get to the payoff: estimate selectivity using the - * knowledge that each referencing row will match exactly one row in - * the referenced table. - * - * XXX that's not true in the presence of nulls in the referencing - * column(s), so in principle we should derate the estimate for those. - * However (1) if there are any strict restriction clauses for the - * referencing column(s) elsewhere in the query, derating here would - * be double-counting the null fraction, and (2) it's not very clear - * how to combine null fractions for multiple referencing columns. So - * we do nothing for now about correcting for nulls. - * - * XXX another point here is that if either side of an FK constraint - * is an inheritance parent, we estimate as though the constraint - * covers all its children as well. This is not an unreasonable - * assumption for a referencing table, ie the user probably applied - * identical constraints to all child tables (though perhaps we ought - * to check that). But it's not possible to have done that for a - * referenced table. Fortunately, precisely because that doesn't - * work, it is uncommon in practice to have an FK referencing a parent - * table. So, at least for now, disregard inheritance here. 
- */ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) - { - /* - * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's - * referenced table is exactly the inside of the join. The join - * selectivity is defined as the fraction of LHS rows that have - * matches. The FK implies that every LHS row has a match *in the - * referenced table*; but any restriction clauses on it will - * reduce the number of matches. Hence we take the join - * selectivity as equal to the selectivity of the table's - * restriction clauses, which is rows / tuples; but we must guard - * against tuples == 0. - */ - RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); - double ref_tuples = Max(ref_rel->tuples, 1.0); - - fkselec *= ref_rel->rows / ref_tuples; - } - else - { - /* - * Otherwise, selectivity is exactly 1/referenced-table-size; but - * guard against tuples == 0. Note we should use the raw table - * tuple count, not any estimate of its filtered or joined size. - */ - RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); - double ref_tuples = Max(ref_rel->tuples, 1.0); - - fkselec *= 1.0 / ref_tuples; - } - } +#else + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && + (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) + continue; +#endif + /* + * Modify the restrictlist by removing clauses that match the FK (and + * putting them into removedlist instead). It seems unsafe to modify + * the originally-passed List structure, so we make a shallow copy the + * first time through. + */ + if (worklist == *restrictlist) + worklist = list_copy(worklist); + + removedlist = NIL; + prev = NULL; + for (cell = list_head(worklist); cell; cell = next) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell); + bool remove_it = false; + int i; + + next = lnext(cell); + /* Drop this clause if it matches any column of the FK */ + for (i = 0; i < fkinfo->nkeys; i++) + { + if (rinfo->parent_ec) + { + /* + * EC-derived clauses can only match by EC. It is okay to + * consider any clause derived from the same EC as + * matching the FK: even if equivclass.c chose to generate + * a clause equating some other pair of Vars, it could + * have generated one equating the FK's Vars. So for + * purposes of estimation, we can act as though it did so. + * + * Note: checking parent_ec is a bit of a cheat because + * there are EC-derived clauses that don't have parent_ec + * set; but such clauses must compare expressions that + * aren't just Vars, so they cannot match the FK anyway. + */ + if (fkinfo->eclass[i] == rinfo->parent_ec) + { + remove_it = true; + break; + } + } + else + { + /* + * Otherwise, see if rinfo was previously matched to FK as + * a "loose" clause. + */ + if (list_member_ptr(fkinfo->rinfos[i], rinfo)) + { + remove_it = true; + break; + } + } + } + if (remove_it) + { + worklist = list_delete_cell(worklist, cell, prev); + removedlist = lappend(removedlist, rinfo); + } + else + prev = cell; + } + + /* + * If we failed to remove all the matching clauses we expected to + * find, chicken out and ignore this FK; applying its selectivity + * might result in double-counting. Put any clauses we did manage to + * remove back into the worklist. + * + * Since the matching clauses are known not outerjoin-delayed, they + * should certainly have appeared in the initial joinclause list. If + * we didn't find them, they must have been matched to, and removed + * by, some other FK in a previous iteration of this loop. 
(A likely + * case is that two FKs are matched to the same EC; there will be only + * one EC-derived clause in the initial list, so the first FK will + * consume it.) Applying both FKs' selectivity independently risks + * underestimating the join size; in particular, this would undo one + * of the main things that ECs were invented for, namely to avoid + * double-counting the selectivity of redundant equality conditions. + * Later we might think of a reasonable way to combine the estimates, + * but for now, just punt, since this is a fairly uncommon situation. + */ + if (list_length(removedlist) != + (fkinfo->nmatched_ec + fkinfo->nmatched_ri)) + { + worklist = list_concat(worklist, removedlist); + continue; + } + + /* + * Finally we get to the payoff: estimate selectivity using the + * knowledge that each referencing row will match exactly one row in + * the referenced table. + * + * XXX that's not true in the presence of nulls in the referencing + * column(s), so in principle we should derate the estimate for those. + * However (1) if there are any strict restriction clauses for the + * referencing column(s) elsewhere in the query, derating here would + * be double-counting the null fraction, and (2) it's not very clear + * how to combine null fractions for multiple referencing columns. So + * we do nothing for now about correcting for nulls. + * + * XXX another point here is that if either side of an FK constraint + * is an inheritance parent, we estimate as though the constraint + * covers all its children as well. This is not an unreasonable + * assumption for a referencing table, ie the user probably applied + * identical constraints to all child tables (though perhaps we ought + * to check that). But it's not possible to have done that for a + * referenced table. Fortunately, precisely because that doesn't + * work, it is uncommon in practice to have an FK referencing a parent + * table. So, at least for now, disregard inheritance here. + */ +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) +#endif + { + /* + * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's + * referenced table is exactly the inside of the join. The join + * selectivity is defined as the fraction of LHS rows that have + * matches. The FK implies that every LHS row has a match *in the + * referenced table*; but any restriction clauses on it will + * reduce the number of matches. Hence we take the join + * selectivity as equal to the selectivity of the table's + * restriction clauses, which is rows / tuples; but we must guard + * against tuples == 0. + */ + RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); + double ref_tuples = Max(ref_rel->tuples, 1.0); + + fkselec *= ref_rel->rows / ref_tuples; + } + else + { + /* + * Otherwise, selectivity is exactly 1/referenced-table-size; but + * guard against tuples == 0. Note we should use the raw table + * tuple count, not any estimate of its filtered or joined size. 
+ */ + RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); + double ref_tuples = Max(ref_rel->tuples, 1.0); + + fkselec *= 1.0 / ref_tuples; + } + } - *restrictlist = worklist; - return fkselec; + *restrictlist = worklist; + return fkselec; } /* diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index a738fc02..b377f0d6 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -2018,30 +2018,36 @@ adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount) { - ListCell *lc; + ListCell *lc; - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); - if (sjinfo->jointype == JOIN_SEMI && +#ifdef __TBASE__ + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR ) && bms_is_member(cur_relid, sjinfo->syn_lefthand) && bms_is_member(outer_relid, sjinfo->syn_righthand)) - { - /* Estimate number of unique-ified rows */ - double nraw; - double nunique; - - nraw = approximate_joinrel_size(root, sjinfo->syn_righthand); - nunique = estimate_num_groups(root, - sjinfo->semi_rhs_exprs, - nraw, - NULL); - if (rowcount > nunique) - rowcount = nunique; - } - } - return rowcount; +#else + if (sjinfo->jointype == JOIN_SEMI && + bms_is_member(cur_relid, sjinfo->syn_lefthand) && + bms_is_member(outer_relid, sjinfo->syn_righthand)) +#endif + { + /* Estimate number of unique-ified rows */ + double nraw; + double nunique; + + nraw = approximate_joinrel_size(root, sjinfo->syn_righthand); + nunique = estimate_num_groups(root, + sjinfo->semi_rhs_exprs, + nraw, + NULL); + if (rowcount > nunique) + rowcount = nunique; + } + } + return rowcount; } /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 0b01ed12..c832b9d8 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -108,155 +108,163 @@ static void generate_mergejoin_paths(PlannerInfo *root, */ void add_paths_to_joinrel(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - SpecialJoinInfo *sjinfo, - List *restrictlist) -{// #lizard forgives - JoinPathExtraData extra; - bool mergejoin_allowed = true; - ListCell *lc; - - extra.restrictlist = restrictlist; - extra.mergeclause_list = NIL; - extra.sjinfo = sjinfo; - extra.param_source_rels = NULL; - - /* - * See if the inner relation is provably unique for this outer rel. - * - * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't - * matter since the executor can make the equivalent optimization anyway; - * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we - * must be considering a semijoin whose inner side is not provably unique - * (else reduce_unique_semijoins would've simplified it), so there's no - * point in calling innerrel_is_unique. However, if the LHS covers all of - * the semijoin's min_lefthand, then it's appropriate to set inner_unique - * because the path produced by create_unique_path will be unique relative - * to the LHS. (If we have an LHS that's only part of the min_lefthand, - * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid - * letting that value escape this module. 
- */ - switch (jointype) - { - case JOIN_SEMI: - case JOIN_ANTI: - extra.inner_unique = false; /* well, unproven */ - break; - case JOIN_UNIQUE_INNER: - extra.inner_unique = bms_is_subset(sjinfo->min_lefthand, - outerrel->relids); - break; - case JOIN_UNIQUE_OUTER: - extra.inner_unique = innerrel_is_unique(root, - outerrel->relids, - innerrel, - JOIN_INNER, - restrictlist, - false); - break; - default: - extra.inner_unique = innerrel_is_unique(root, - outerrel->relids, - innerrel, - jointype, - restrictlist, - false); - break; - } - - /* - * Find potential mergejoin clauses. We can skip this if we are not - * interested in doing a mergejoin. However, mergejoin may be our only - * way of implementing a full outer join, so override enable_mergejoin if - * it's a full join. - */ - if (enable_mergejoin || jointype == JOIN_FULL) - extra.mergeclause_list = select_mergejoin_clauses(root, - joinrel, - outerrel, - innerrel, - restrictlist, - jointype, - &mergejoin_allowed); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + SpecialJoinInfo *sjinfo, + List *restrictlist) +{ + JoinPathExtraData extra; + bool mergejoin_allowed = true; + ListCell *lc; + + extra.restrictlist = restrictlist; + extra.mergeclause_list = NIL; + extra.sjinfo = sjinfo; + extra.param_source_rels = NULL; + + /* + * See if the inner relation is provably unique for this outer rel. + * + * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't + * matter since the executor can make the equivalent optimization anyway; + * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we + * must be considering a semijoin whose inner side is not provably unique + * (else reduce_unique_semijoins would've simplified it), so there's no + * point in calling innerrel_is_unique. However, if the LHS covers all of + * the semijoin's min_lefthand, then it's appropriate to set inner_unique + * because the path produced by create_unique_path will be unique relative + * to the LHS. (If we have an LHS that's only part of the min_lefthand, + * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid + * letting that value escape this module. + */ + switch (jointype) + { + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + extra.inner_unique = false; /* well, unproven */ + break; + case JOIN_UNIQUE_INNER: + extra.inner_unique = bms_is_subset(sjinfo->min_lefthand, + outerrel->relids); + break; + case JOIN_UNIQUE_OUTER: + extra.inner_unique = innerrel_is_unique(root, + outerrel->relids, + innerrel, + JOIN_INNER, + restrictlist, + false); + break; + default: + extra.inner_unique = innerrel_is_unique(root, + outerrel->relids, + innerrel, + jointype, + restrictlist, + false); + break; + } - /* - * If it's SEMI, ANTI, or inner_unique join, compute correction factors - * for cost estimation. These will be the same for all paths. - */ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) + /* + * Find potential mergejoin clauses. We can skip this if we are not + * interested in doing a mergejoin. However, mergejoin may be our only + * way of implementing a full outer join, so override enable_mergejoin if + * it's a full join. + */ + if (enable_mergejoin || jointype == JOIN_FULL) + extra.mergeclause_list = select_mergejoin_clauses(root, + joinrel, + outerrel, + innerrel, + restrictlist, + jointype, + &mergejoin_allowed); + + /* + * If it's SEMI, ANTI, or inner_unique join, compute correction factors + * for cost estimation. 
These will be the same for all paths. + */ +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || extra.inner_unique) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) +#endif compute_semi_anti_join_factors(root, outerrel, innerrel, - jointype, sjinfo, restrictlist, - &extra.semifactors); - - /* - * Decide whether it's sensible to generate parameterized paths for this - * joinrel, and if so, which relations such paths should require. There - * is usually no need to create a parameterized result path unless there - * is a join order restriction that prevents joining one of our input rels - * directly to the parameter source rel instead of joining to the other - * input rel. (But see allow_star_schema_join().) This restriction - * reduces the number of parameterized paths we have to deal with at - * higher join levels, without compromising the quality of the resulting - * plan. We express the restriction as a Relids set that must overlap the - * parameterization of any proposed join path. - */ - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo2 = (SpecialJoinInfo *) lfirst(lc); - - /* - * SJ is relevant to this join if we have some part of its RHS - * (possibly not all of it), and haven't yet joined to its LHS. (This - * test is pretty simplistic, but should be sufficient considering the - * join has already been proven legal.) If the SJ is relevant, it - * presents constraints for joining to anything not in its RHS. - */ - if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) - extra.param_source_rels = bms_join(extra.param_source_rels, - bms_difference(root->all_baserels, - sjinfo2->min_righthand)); - - /* full joins constrain both sides symmetrically */ - if (sjinfo2->jointype == JOIN_FULL && - bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) - extra.param_source_rels = bms_join(extra.param_source_rels, - bms_difference(root->all_baserels, - sjinfo2->min_lefthand)); - } - - /* - * However, when a LATERAL subquery is involved, there will simply not be - * any paths for the joinrel that aren't parameterized by whatever the - * subquery is parameterized by, unless its parameterization is resolved - * within the joinrel. So we might as well allow additional dependencies - * on whatever residual lateral dependencies the joinrel will have. - */ - extra.param_source_rels = bms_add_members(extra.param_source_rels, - joinrel->lateral_relids); - - /* - * 1. Consider mergejoin paths where both relations must be explicitly - * sorted. Skip this if we can't mergejoin. - */ - if (mergejoin_allowed) - sort_inner_and_outer(root, joinrel, outerrel, innerrel, - jointype, &extra); + jointype, sjinfo, restrictlist, + &extra.semifactors); + + /* + * Decide whether it's sensible to generate parameterized paths for this + * joinrel, and if so, which relations such paths should require. There + * is usually no need to create a parameterized result path unless there + * is a join order restriction that prevents joining one of our input rels + * directly to the parameter source rel instead of joining to the other + * input rel. (But see allow_star_schema_join().) This restriction + * reduces the number of parameterized paths we have to deal with at + * higher join levels, without compromising the quality of the resulting + * plan. 
We express the restriction as a Relids set that must overlap the + * parameterization of any proposed join path. + */ + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo2 = (SpecialJoinInfo *) lfirst(lc); + + /* + * SJ is relevant to this join if we have some part of its RHS + * (possibly not all of it), and haven't yet joined to its LHS. (This + * test is pretty simplistic, but should be sufficient considering the + * join has already been proven legal.) If the SJ is relevant, it + * presents constraints for joining to anything not in its RHS. + */ + if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && + !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) + extra.param_source_rels = bms_join(extra.param_source_rels, + bms_difference(root->all_baserels, + sjinfo2->min_righthand)); + + /* full joins constrain both sides symmetrically */ + if (sjinfo2->jointype == JOIN_FULL && + bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && + !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) + extra.param_source_rels = bms_join(extra.param_source_rels, + bms_difference(root->all_baserels, + sjinfo2->min_lefthand)); + } - /* - * 2. Consider paths where the outer relation need not be explicitly - * sorted. This includes both nestloops and mergejoins where the outer - * path is already ordered. Again, skip this if we can't mergejoin. - * (That's okay because we know that nestloop can't handle right/full - * joins at all, so it wouldn't work in the prohibited cases either.) - */ - if (mergejoin_allowed) - match_unsorted_outer(root, joinrel, outerrel, innerrel, - jointype, &extra); + /* + * However, when a LATERAL subquery is involved, there will simply not be + * any paths for the joinrel that aren't parameterized by whatever the + * subquery is parameterized by, unless its parameterization is resolved + * within the joinrel. So we might as well allow additional dependencies + * on whatever residual lateral dependencies the joinrel will have. + */ + extra.param_source_rels = bms_add_members(extra.param_source_rels, + joinrel->lateral_relids); + + /* + * 1. Consider mergejoin paths where both relations must be explicitly + * sorted. Skip this if we can't mergejoin. + */ + if (mergejoin_allowed) + sort_inner_and_outer(root, joinrel, outerrel, innerrel, + jointype, &extra); + + /* + * 2. Consider paths where the outer relation need not be explicitly + * sorted. This includes both nestloops and mergejoins where the outer + * path is already ordered. Again, skip this if we can't mergejoin. + * (That's okay because we know that nestloop can't handle right/full + * joins at all, so it wouldn't work in the prohibited cases either.) + */ + if (mergejoin_allowed) + match_unsorted_outer(root, joinrel, outerrel, innerrel, + jointype, &extra); #ifdef NOT_USED @@ -1264,224 +1272,227 @@ generate_mergejoin_paths(PlannerInfo *root, */ static void match_unsorted_outer(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra) -{// #lizard forgives - JoinType save_jointype = jointype; - bool nestjoinOK; - bool useallclauses; - Path *inner_cheapest_total = innerrel->cheapest_total_path; - Path *matpath = NULL; - ListCell *lc1; - - /* - * Nestloop only supports inner, left, semi, and anti joins. Also, if we - * are doing a right or full mergejoin, we must use *all* the mergeclauses - * as join clauses, else we will not have a valid plan. 
(Although these - * two flags are currently inverses, keep them separate for clarity and - * possible future changes.) - */ - switch (jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - nestjoinOK = true; - useallclauses = false; - break; - case JOIN_RIGHT: - case JOIN_FULL: - nestjoinOK = false; - useallclauses = true; - break; - case JOIN_UNIQUE_OUTER: - case JOIN_UNIQUE_INNER: - jointype = JOIN_INNER; - nestjoinOK = true; - useallclauses = false; - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) jointype); - nestjoinOK = false; /* keep compiler quiet */ - useallclauses = false; - break; - } - - /* - * If inner_cheapest_total is parameterized by the outer rel, ignore it; - * we will consider it below as a member of cheapest_parameterized_paths, - * but the other possibilities considered in this routine aren't usable. - */ - if (PATH_PARAM_BY_REL(inner_cheapest_total, outerrel)) - inner_cheapest_total = NULL; - - /* - * If we need to unique-ify the inner path, we will consider only the - * cheapest-total inner. - */ - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* No way to do this with an inner path parameterized by outer rel */ - if (inner_cheapest_total == NULL) - return; - inner_cheapest_total = (Path *) - create_unique_path(root, innerrel, inner_cheapest_total, extra->sjinfo); - Assert(inner_cheapest_total); - } - else if (nestjoinOK) - { - /* - * Consider materializing the cheapest inner path, unless - * enable_material is off or the path in question materializes its - * output anyway. - */ - if (enable_material && inner_cheapest_total != NULL && - !ExecMaterializesOutput(inner_cheapest_total->pathtype)) - matpath = (Path *) - create_material_path(innerrel, inner_cheapest_total); - } - - foreach(lc1, outerrel->pathlist) - { - Path *outerpath = (Path *) lfirst(lc1); - List *merge_pathkeys; - - /* - * We cannot use an outer path that is parameterized by the inner rel. - */ - if (PATH_PARAM_BY_REL(outerpath, innerrel)) - continue; - - /* - * If we need to unique-ify the outer path, it's pointless to consider - * any but the cheapest outer. (XXX we don't consider parameterized - * outers, nor inners, for unique-ified cases. Should we?) - */ - if (save_jointype == JOIN_UNIQUE_OUTER) - { - if (outerpath != outerrel->cheapest_total_path) - continue; - outerpath = (Path *) create_unique_path(root, outerrel, - outerpath, extra->sjinfo); - Assert(outerpath); - } - - /* - * The result will have this sort order (even if it is implemented as - * a nestloop, and even if some of the mergeclauses are implemented by - * qpquals rather than as true mergeclauses): - */ - merge_pathkeys = build_join_pathkeys(root, joinrel, jointype, - outerpath->pathkeys); - - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* - * Consider nestloop join, but only with the unique-ified cheapest - * inner path - */ - try_nestloop_path(root, - joinrel, - outerpath, - inner_cheapest_total, - merge_pathkeys, - jointype, - extra); - } - else if (nestjoinOK) - { - /* - * Consider nestloop joins using this outer path and various - * available paths for the inner relation. We consider the - * cheapest-total paths for each available parameterization of the - * inner relation, including the unparameterized case. 
- */ - ListCell *lc2; - - foreach(lc2, innerrel->cheapest_parameterized_paths) - { - Path *innerpath = (Path *) lfirst(lc2); - - try_nestloop_path(root, - joinrel, - outerpath, - innerpath, - merge_pathkeys, - jointype, - extra); - } - - /* Also consider materialized form of the cheapest inner path */ - if (matpath != NULL) - try_nestloop_path(root, - joinrel, - outerpath, - matpath, - merge_pathkeys, - jointype, - extra); - } - - /* Can't do anything else if outer path needs to be unique'd */ - if (save_jointype == JOIN_UNIQUE_OUTER) - continue; - - /* Can't do anything else if inner rel is parameterized by outer */ - if (inner_cheapest_total == NULL) - continue; - - /* Generate merge join paths */ - generate_mergejoin_paths(root, joinrel, innerrel, outerpath, - save_jointype, extra, useallclauses, - inner_cheapest_total, merge_pathkeys, - false); - } - - /* - * Consider partial nestloop and mergejoin plan if outerrel has any - * partial path and the joinrel is parallel-safe. However, we can't - * handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and - * therefore we won't be able to properly guarantee uniqueness. Nor can - * we handle extra_lateral_rels, since partial paths must not be - * parameterized. Similarly, we can't handle JOIN_FULL and JOIN_RIGHT, - * because they can produce false null extended rows. - */ - if (joinrel->consider_parallel && - save_jointype != JOIN_UNIQUE_OUTER && - save_jointype != JOIN_FULL && - save_jointype != JOIN_RIGHT && - outerrel->partial_pathlist != NIL && - bms_is_empty(joinrel->lateral_relids)) - { - if (nestjoinOK) - consider_parallel_nestloop(root, joinrel, outerrel, innerrel, - save_jointype, extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra) +{ + JoinType save_jointype = jointype; + bool nestjoinOK; + bool useallclauses; + Path *inner_cheapest_total = innerrel->cheapest_total_path; + Path *matpath = NULL; + ListCell *lc1; + + /* + * Nestloop only supports inner, left, semi, and anti joins. Also, if we + * are doing a right or full mergejoin, we must use *all* the mergeclauses + * as join clauses, else we will not have a valid plan. (Although these + * two flags are currently inverses, keep them separate for clarity and + * possible future changes.) + */ + switch (jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + nestjoinOK = true; + useallclauses = false; + break; + case JOIN_RIGHT: + case JOIN_FULL: + nestjoinOK = false; + useallclauses = true; + break; + case JOIN_UNIQUE_OUTER: + case JOIN_UNIQUE_INNER: + jointype = JOIN_INNER; + nestjoinOK = true; + useallclauses = false; + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) jointype); + nestjoinOK = false; /* keep compiler quiet */ + useallclauses = false; + break; + } - /* - * If inner_cheapest_total is NULL or non parallel-safe then find the - * cheapest total parallel safe path. If doing JOIN_UNIQUE_INNER, we - * can't use any alternative inner path. - */ - if (inner_cheapest_total == NULL || - !inner_cheapest_total->parallel_safe) - { - if (save_jointype == JOIN_UNIQUE_INNER) - return; + /* + * If inner_cheapest_total is parameterized by the outer rel, ignore it; + * we will consider it below as a member of cheapest_parameterized_paths, + * but the other possibilities considered in this routine aren't usable. 
+ */ + if (PATH_PARAM_BY_REL(inner_cheapest_total, outerrel)) + inner_cheapest_total = NULL; + + /* + * If we need to unique-ify the inner path, we will consider only the + * cheapest-total inner. + */ + if (save_jointype == JOIN_UNIQUE_INNER) + { + /* No way to do this with an inner path parameterized by outer rel */ + if (inner_cheapest_total == NULL) + return; + inner_cheapest_total = (Path *) + create_unique_path(root, innerrel, inner_cheapest_total, extra->sjinfo); + Assert(inner_cheapest_total); + } + else if (nestjoinOK) + { + /* + * Consider materializing the cheapest inner path, unless + * enable_material is off or the path in question materializes its + * output anyway. + */ + if (enable_material && inner_cheapest_total != NULL && + !ExecMaterializesOutput(inner_cheapest_total->pathtype)) + matpath = (Path *) + create_material_path(innerrel, inner_cheapest_total); + } - inner_cheapest_total = get_cheapest_parallel_safe_total_inner( - innerrel->pathlist); - } + foreach(lc1, outerrel->pathlist) + { + Path *outerpath = (Path *) lfirst(lc1); + List *merge_pathkeys; + + /* + * We cannot use an outer path that is parameterized by the inner rel. + */ + if (PATH_PARAM_BY_REL(outerpath, innerrel)) + continue; + + /* + * If we need to unique-ify the outer path, it's pointless to consider + * any but the cheapest outer. (XXX we don't consider parameterized + * outers, nor inners, for unique-ified cases. Should we?) + */ + if (save_jointype == JOIN_UNIQUE_OUTER) + { + if (outerpath != outerrel->cheapest_total_path) + continue; + outerpath = (Path *) create_unique_path(root, outerrel, + outerpath, extra->sjinfo); + Assert(outerpath); + } + + /* + * The result will have this sort order (even if it is implemented as + * a nestloop, and even if some of the mergeclauses are implemented by + * qpquals rather than as true mergeclauses): + */ + merge_pathkeys = build_join_pathkeys(root, joinrel, jointype, + outerpath->pathkeys); + + if (save_jointype == JOIN_UNIQUE_INNER) + { + /* + * Consider nestloop join, but only with the unique-ified cheapest + * inner path + */ + try_nestloop_path(root, + joinrel, + outerpath, + inner_cheapest_total, + merge_pathkeys, + jointype, + extra); + } + else if (nestjoinOK) + { + /* + * Consider nestloop joins using this outer path and various + * available paths for the inner relation. We consider the + * cheapest-total paths for each available parameterization of the + * inner relation, including the unparameterized case. 
+ */ + ListCell *lc2; + + foreach(lc2, innerrel->cheapest_parameterized_paths) + { + Path *innerpath = (Path *) lfirst(lc2); + + try_nestloop_path(root, + joinrel, + outerpath, + innerpath, + merge_pathkeys, + jointype, + extra); + } + + /* Also consider materialized form of the cheapest inner path */ + if (matpath != NULL) + try_nestloop_path(root, + joinrel, + outerpath, + matpath, + merge_pathkeys, + jointype, + extra); + } + + /* Can't do anything else if outer path needs to be unique'd */ + if (save_jointype == JOIN_UNIQUE_OUTER) + continue; + + /* Can't do anything else if inner rel is parameterized by outer */ + if (inner_cheapest_total == NULL) + continue; + + /* Generate merge join paths */ + generate_mergejoin_paths(root, joinrel, innerrel, outerpath, + save_jointype, extra, useallclauses, + inner_cheapest_total, merge_pathkeys, + false); + } - if (inner_cheapest_total) - consider_parallel_mergejoin(root, joinrel, outerrel, innerrel, - save_jointype, extra, - inner_cheapest_total); - } + /* + * Consider partial nestloop and mergejoin plan if outerrel has any + * partial path and the joinrel is parallel-safe. However, we can't + * handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and + * therefore we won't be able to properly guarantee uniqueness. Nor can + * we handle extra_lateral_rels, since partial paths must not be + * parameterized. Similarly, we can't handle JOIN_FULL and JOIN_RIGHT, + * because they can produce false null extended rows. + */ + if (joinrel->consider_parallel && + save_jointype != JOIN_UNIQUE_OUTER && + save_jointype != JOIN_FULL && + save_jointype != JOIN_RIGHT && + outerrel->partial_pathlist != NIL && + bms_is_empty(joinrel->lateral_relids)) + { + if (nestjoinOK) + consider_parallel_nestloop(root, joinrel, outerrel, innerrel, + save_jointype, extra); + + /* + * If inner_cheapest_total is NULL or non parallel-safe then find the + * cheapest total parallel safe path. If doing JOIN_UNIQUE_INNER, we + * can't use any alternative inner path. + */ + if (inner_cheapest_total == NULL || + !inner_cheapest_total->parallel_safe) + { + if (save_jointype == JOIN_UNIQUE_INNER) + return; + + inner_cheapest_total = get_cheapest_parallel_safe_total_inner( + innerrel->pathlist); + } + + if (inner_cheapest_total) + consider_parallel_mergejoin(root, joinrel, outerrel, innerrel, + save_jointype, extra, + inner_cheapest_total); + } } /* diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d3022390..659a8494 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -328,316 +328,320 @@ make_rels_by_clauseless_joins(PlannerInfo *root, */ static bool join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, - Relids joinrelids, - SpecialJoinInfo **sjinfo_p, bool *reversed_p) -{// #lizard forgives - SpecialJoinInfo *match_sjinfo; - bool reversed; - bool unique_ified; - bool must_be_leftjoin; - ListCell *l; - - /* - * Ensure output params are set on failure return. This is just to - * suppress uninitialized-variable warnings from overly anal compilers. - */ - *sjinfo_p = NULL; - *reversed_p = false; - - /* - * If we have any special joins, the proposed join might be illegal; and - * in any case we have to determine its join type. Scan the join info - * list for matches and conflicts. 
- */ - match_sjinfo = NULL; - reversed = false; - unique_ified = false; - must_be_leftjoin = false; - - foreach(l, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); - - /* - * This special join is not relevant unless its RHS overlaps the - * proposed join. (Check this first as a fast path for dismissing - * most irrelevant SJs quickly.) - */ - if (!bms_overlap(sjinfo->min_righthand, joinrelids)) - continue; - - /* - * Also, not relevant if proposed join is fully contained within RHS - * (ie, we're still building up the RHS). - */ - if (bms_is_subset(joinrelids, sjinfo->min_righthand)) - continue; - - /* - * Also, not relevant if SJ is already done within either input. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel1->relids)) - continue; - if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - continue; - - /* - * If it's a semijoin and we already joined the RHS to any other rels - * within either input, then we must have unique-ified the RHS at that - * point (see below). Therefore the semijoin is no longer relevant in - * this join path. - */ - if (sjinfo->jointype == JOIN_SEMI) - { - if (bms_is_subset(sjinfo->syn_righthand, rel1->relids) && - !bms_equal(sjinfo->syn_righthand, rel1->relids)) - continue; - if (bms_is_subset(sjinfo->syn_righthand, rel2->relids) && - !bms_equal(sjinfo->syn_righthand, rel2->relids)) - continue; - } - - /* - * If one input contains min_lefthand and the other contains - * min_righthand, then we can perform the SJ at this join. - * - * Reject if we get matches to more than one SJ; that implies we're - * considering something that's not really valid. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - { - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = false; - } - else if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && - bms_is_subset(sjinfo->min_righthand, rel1->relids)) - { - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = true; - } - else if (sjinfo->jointype == JOIN_SEMI && - bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) - { - /*---------- - * For a semijoin, we can join the RHS to anything else by - * unique-ifying the RHS (if the RHS can be unique-ified). - * We will only get here if we have the full RHS but less - * than min_lefthand on the LHS. - * - * The reason to consider such a join path is exemplified by - * SELECT ... FROM a,b WHERE (a.x,b.y) IN (SELECT c1,c2 FROM c) - * If we insist on doing this as a semijoin we will first have - * to form the cartesian product of A*B. But if we unique-ify - * C then the semijoin becomes a plain innerjoin and we can join - * in any order, eg C to A and then to B. When C is much smaller - * than A and B this can be a huge win. So we allow C to be - * joined to just A or just B here, and then make_join_rel has - * to handle the case properly. - * - * Note that actually we'll allow unique-ified C to be joined to - * some other relation D here, too. That is legal, if usually not - * very sane, and this routine is only concerned with legality not - * with whether the join is good strategy. 
- *---------- - */ - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = false; - unique_ified = true; - } - else if (sjinfo->jointype == JOIN_SEMI && - bms_equal(sjinfo->syn_righthand, rel1->relids) && - create_unique_path(root, rel1, rel1->cheapest_total_path, - sjinfo) != NULL) - { - /* Reversed semijoin case */ - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = true; - unique_ified = true; - } - else - { - /* - * Otherwise, the proposed join overlaps the RHS but isn't a valid - * implementation of this SJ. But don't panic quite yet: the RHS - * violation might have occurred previously, in one or both input - * relations, in which case we must have previously decided that - * it was OK to commute some other SJ with this one. If we need - * to perform this join to finish building up the RHS, rejecting - * it could lead to not finding any plan at all. (This can occur - * because of the heuristics elsewhere in this file that postpone - * clauseless joins: we might not consider doing a clauseless join - * within the RHS until after we've performed other, validly - * commutable SJs with one or both sides of the clauseless join.) - * This consideration boils down to the rule that if both inputs - * overlap the RHS, we can allow the join --- they are either - * fully within the RHS, or represent previously-allowed joins to - * rels outside it. - */ - if (bms_overlap(rel1->relids, sjinfo->min_righthand) && - bms_overlap(rel2->relids, sjinfo->min_righthand)) - continue; /* assume valid previous violation of RHS */ - - /* - * The proposed join could still be legal, but only if we're - * allowed to associate it into the RHS of this SJ. That means - * this SJ must be a LEFT join (not SEMI or ANTI, and certainly - * not FULL) and the proposed join must not overlap the LHS. - */ - if (sjinfo->jointype != JOIN_LEFT || - bms_overlap(joinrelids, sjinfo->min_lefthand)) - return false; /* invalid join path */ - - /* - * To be valid, the proposed join must be a LEFT join; otherwise - * it can't associate into this SJ's RHS. But we may not yet have - * found the SpecialJoinInfo matching the proposed join, so we - * can't test that yet. Remember the requirement for later. - */ - must_be_leftjoin = true; - } - } - - /* - * Fail if violated any SJ's RHS and didn't match to a LEFT SJ: the - * proposed join can't associate into an SJ's RHS. - * - * Also, fail if the proposed join's predicate isn't strict; we're - * essentially checking to see if we can apply outer-join identity 3, and - * that's a requirement. (This check may be redundant with checks in - * make_outerjoininfo, but I'm not quite sure, and it's cheap to test.) - */ - if (must_be_leftjoin && - (match_sjinfo == NULL || - match_sjinfo->jointype != JOIN_LEFT || - !match_sjinfo->lhs_strict)) - return false; /* invalid join path */ - - /* - * We also have to check for constraints imposed by LATERAL references. - */ - if (root->hasLateralRTEs) - { - bool lateral_fwd; - bool lateral_rev; - Relids join_lateral_rels; - - /* - * The proposed rels could each contain lateral references to the - * other, in which case the join is impossible. If there are lateral - * references in just one direction, then the join has to be done with - * a nestloop with the lateral referencer on the inside. If the join - * matches an SJ that cannot be implemented by such a nestloop, the - * join is impossible. 
- * - * Also, if the lateral reference is only indirect, we should reject - * the join; whatever rel(s) the reference chain goes through must be - * joined to first. - * - * Another case that might keep us from building a valid plan is the - * implementation restriction described by have_dangerous_phv(). - */ - lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); - lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); - if (lateral_fwd && lateral_rev) - return false; /* have lateral refs in both directions */ - if (lateral_fwd) - { - /* has to be implemented as nestloop with rel1 on left */ - if (match_sjinfo && - (reversed || - unique_ified || - match_sjinfo->jointype == JOIN_FULL)) - return false; /* not implementable as nestloop */ - /* check there is a direct reference from rel2 to rel1 */ - if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) - return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) - return false; /* might be unable to handle required PHV */ - } - else if (lateral_rev) - { - /* has to be implemented as nestloop with rel2 on left */ - if (match_sjinfo && - (!reversed || - unique_ified || - match_sjinfo->jointype == JOIN_FULL)) - return false; /* not implementable as nestloop */ - /* check there is a direct reference from rel1 to rel2 */ - if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) - return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) - return false; /* might be unable to handle required PHV */ - } - - /* - * LATERAL references could also cause problems later on if we accept - * this join: if the join's minimum parameterization includes any rels - * that would have to be on the inside of an outer join with this join - * rel, then it's never going to be possible to build the complete - * query using this join. We should reject this join not only because - * it'll save work, but because if we don't, the clauseless-join - * heuristics might think that legality of this join means that some - * other join rel need not be formed, and that could lead to failure - * to find any plan at all. We have to consider not only rels that - * are directly on the inner side of an OJ with the joinrel, but also - * ones that are indirectly so, so search to find all such rels. 
- */ - join_lateral_rels = min_join_parameterization(root, joinrelids, - rel1, rel2); - if (join_lateral_rels) - { - Relids join_plus_rhs = bms_copy(joinrelids); - bool more; - - do - { - more = false; - foreach(l, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); - - if (bms_overlap(sjinfo->min_lefthand, join_plus_rhs) && - !bms_is_subset(sjinfo->min_righthand, join_plus_rhs)) - { - join_plus_rhs = bms_add_members(join_plus_rhs, - sjinfo->min_righthand); - more = true; - } - /* full joins constrain both sides symmetrically */ - if (sjinfo->jointype == JOIN_FULL && - bms_overlap(sjinfo->min_righthand, join_plus_rhs) && - !bms_is_subset(sjinfo->min_lefthand, join_plus_rhs)) - { - join_plus_rhs = bms_add_members(join_plus_rhs, - sjinfo->min_lefthand); - more = true; - } - } - } while (more); - if (bms_overlap(join_plus_rhs, join_lateral_rels)) - return false; /* will not be able to join to some RHS rel */ - } - } - - /* Otherwise, it's a valid join */ - *sjinfo_p = match_sjinfo; - *reversed_p = reversed; - return true; + Relids joinrelids, + SpecialJoinInfo **sjinfo_p, bool *reversed_p) +{ + SpecialJoinInfo *match_sjinfo; + bool reversed; + bool unique_ified; + bool must_be_leftjoin; + ListCell *l; + + /* + * Ensure output params are set on failure return. This is just to + * suppress uninitialized-variable warnings from overly anal compilers. + */ + *sjinfo_p = NULL; + *reversed_p = false; + + /* + * If we have any special joins, the proposed join might be illegal; and + * in any case we have to determine its join type. Scan the join info + * list for matches and conflicts. + */ + match_sjinfo = NULL; + reversed = false; + unique_ified = false; + must_be_leftjoin = false; + + foreach(l, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); + + /* + * This special join is not relevant unless its RHS overlaps the + * proposed join. (Check this first as a fast path for dismissing + * most irrelevant SJs quickly.) + */ + if (!bms_overlap(sjinfo->min_righthand, joinrelids)) + continue; + + /* + * Also, not relevant if proposed join is fully contained within RHS + * (ie, we're still building up the RHS). + */ + if (bms_is_subset(joinrelids, sjinfo->min_righthand)) + continue; + + /* + * Also, not relevant if SJ is already done within either input. + */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel1->relids)) + continue; + if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + continue; + + /* + * If it's a semijoin and we already joined the RHS to any other rels + * within either input, then we must have unique-ified the RHS at that + * point (see below). Therefore the semijoin is no longer relevant in + * this join path. + */ +#ifdef __TBASE__ + if (sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR) +#else + if (sjinfo->jointype == JOIN_SEMI) +#endif + { + if (bms_is_subset(sjinfo->syn_righthand, rel1->relids) && + !bms_equal(sjinfo->syn_righthand, rel1->relids)) + continue; + if (bms_is_subset(sjinfo->syn_righthand, rel2->relids) && + !bms_equal(sjinfo->syn_righthand, rel2->relids)) + continue; + } + + /* + * If one input contains min_lefthand and the other contains + * min_righthand, then we can perform the SJ at this join. + * + * Reject if we get matches to more than one SJ; that implies we're + * considering something that's not really valid. 
+ */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + { + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = false; + } + else if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && + bms_is_subset(sjinfo->min_righthand, rel1->relids)) + { + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = true; + } + else if (sjinfo->jointype == JOIN_SEMI && + bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) + { + /*---------- + * For a semijoin, we can join the RHS to anything else by + * unique-ifying the RHS (if the RHS can be unique-ified). + * We will only get here if we have the full RHS but less + * than min_lefthand on the LHS. + * + * The reason to consider such a join path is exemplified by + * SELECT ... FROM a,b WHERE (a.x,b.y) IN (SELECT c1,c2 FROM c) + * If we insist on doing this as a semijoin we will first have + * to form the cartesian product of A*B. But if we unique-ify + * C then the semijoin becomes a plain innerjoin and we can join + * in any order, eg C to A and then to B. When C is much smaller + * than A and B this can be a huge win. So we allow C to be + * joined to just A or just B here, and then make_join_rel has + * to handle the case properly. + * + * Note that actually we'll allow unique-ified C to be joined to + * some other relation D here, too. That is legal, if usually not + * very sane, and this routine is only concerned with legality not + * with whether the join is good strategy. + *---------- + */ + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = false; + unique_ified = true; + } + else if (sjinfo->jointype == JOIN_SEMI && + bms_equal(sjinfo->syn_righthand, rel1->relids) && + create_unique_path(root, rel1, rel1->cheapest_total_path, + sjinfo) != NULL) + { + /* Reversed semijoin case */ + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = true; + unique_ified = true; + } + else + { + /* + * Otherwise, the proposed join overlaps the RHS but isn't a valid + * implementation of this SJ. But don't panic quite yet: the RHS + * violation might have occurred previously, in one or both input + * relations, in which case we must have previously decided that + * it was OK to commute some other SJ with this one. If we need + * to perform this join to finish building up the RHS, rejecting + * it could lead to not finding any plan at all. (This can occur + * because of the heuristics elsewhere in this file that postpone + * clauseless joins: we might not consider doing a clauseless join + * within the RHS until after we've performed other, validly + * commutable SJs with one or both sides of the clauseless join.) + * This consideration boils down to the rule that if both inputs + * overlap the RHS, we can allow the join --- they are either + * fully within the RHS, or represent previously-allowed joins to + * rels outside it. + */ + if (bms_overlap(rel1->relids, sjinfo->min_righthand) && + bms_overlap(rel2->relids, sjinfo->min_righthand)) + continue; /* assume valid previous violation of RHS */ + + /* + * The proposed join could still be legal, but only if we're + * allowed to associate it into the RHS of this SJ. That means + * this SJ must be a LEFT join (not SEMI or ANTI, and certainly + * not FULL) and the proposed join must not overlap the LHS. 
+ */ + if (sjinfo->jointype != JOIN_LEFT || + bms_overlap(joinrelids, sjinfo->min_lefthand)) + return false; /* invalid join path */ + + /* + * To be valid, the proposed join must be a LEFT join; otherwise + * it can't associate into this SJ's RHS. But we may not yet have + * found the SpecialJoinInfo matching the proposed join, so we + * can't test that yet. Remember the requirement for later. + */ + must_be_leftjoin = true; + } + } + + /* + * Fail if violated any SJ's RHS and didn't match to a LEFT SJ: the + * proposed join can't associate into an SJ's RHS. + * + * Also, fail if the proposed join's predicate isn't strict; we're + * essentially checking to see if we can apply outer-join identity 3, and + * that's a requirement. (This check may be redundant with checks in + * make_outerjoininfo, but I'm not quite sure, and it's cheap to test.) + */ + if (must_be_leftjoin && + (match_sjinfo == NULL || + match_sjinfo->jointype != JOIN_LEFT || + !match_sjinfo->lhs_strict)) + return false; /* invalid join path */ + + /* + * We also have to check for constraints imposed by LATERAL references. + */ + if (root->hasLateralRTEs) + { + bool lateral_fwd; + bool lateral_rev; + Relids join_lateral_rels; + + /* + * The proposed rels could each contain lateral references to the + * other, in which case the join is impossible. If there are lateral + * references in just one direction, then the join has to be done with + * a nestloop with the lateral referencer on the inside. If the join + * matches an SJ that cannot be implemented by such a nestloop, the + * join is impossible. + * + * Also, if the lateral reference is only indirect, we should reject + * the join; whatever rel(s) the reference chain goes through must be + * joined to first. + * + * Another case that might keep us from building a valid plan is the + * implementation restriction described by have_dangerous_phv(). 
+ */ + lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); + lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); + if (lateral_fwd && lateral_rev) + return false; /* have lateral refs in both directions */ + if (lateral_fwd) + { + /* has to be implemented as nestloop with rel1 on left */ + if (match_sjinfo && + (reversed || + unique_ified || + match_sjinfo->jointype == JOIN_FULL)) + return false; /* not implementable as nestloop */ + /* check there is a direct reference from rel2 to rel1 */ + if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) + return false; /* only indirect refs, so reject */ + /* check we won't have a dangerous PHV */ + if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) + return false; /* might be unable to handle required PHV */ + } + else if (lateral_rev) + { + /* has to be implemented as nestloop with rel2 on left */ + if (match_sjinfo && + (!reversed || + unique_ified || + match_sjinfo->jointype == JOIN_FULL)) + return false; /* not implementable as nestloop */ + /* check there is a direct reference from rel1 to rel2 */ + if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) + return false; /* only indirect refs, so reject */ + /* check we won't have a dangerous PHV */ + if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) + return false; /* might be unable to handle required PHV */ + } + + /* + * LATERAL references could also cause problems later on if we accept + * this join: if the join's minimum parameterization includes any rels + * that would have to be on the inside of an outer join with this join + * rel, then it's never going to be possible to build the complete + * query using this join. We should reject this join not only because + * it'll save work, but because if we don't, the clauseless-join + * heuristics might think that legality of this join means that some + * other join rel need not be formed, and that could lead to failure + * to find any plan at all. We have to consider not only rels that + * are directly on the inner side of an OJ with the joinrel, but also + * ones that are indirectly so, so search to find all such rels. 
+ */ + join_lateral_rels = min_join_parameterization(root, joinrelids, + rel1, rel2); + if (join_lateral_rels) + { + Relids join_plus_rhs = bms_copy(joinrelids); + bool more; + + do + { + more = false; + foreach(l, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); + + if (bms_overlap(sjinfo->min_lefthand, join_plus_rhs) && + !bms_is_subset(sjinfo->min_righthand, join_plus_rhs)) + { + join_plus_rhs = bms_add_members(join_plus_rhs, + sjinfo->min_righthand); + more = true; + } + /* full joins constrain both sides symmetrically */ + if (sjinfo->jointype == JOIN_FULL && + bms_overlap(sjinfo->min_righthand, join_plus_rhs) && + !bms_is_subset(sjinfo->min_lefthand, join_plus_rhs)) + { + join_plus_rhs = bms_add_members(join_plus_rhs, + sjinfo->min_lefthand); + more = true; + } + } + } while (more); + if (bms_overlap(join_plus_rhs, join_lateral_rels)) + return false; /* will not be able to join to some RHS rel */ + } + } + + /* Otherwise, it's a valid join */ + *sjinfo_p = match_sjinfo; + *reversed_p = reversed; + return true; } @@ -745,153 +749,168 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) */ static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, - RelOptInfo *rel2, RelOptInfo *joinrel, - SpecialJoinInfo *sjinfo, List *restrictlist) -{// #lizard forgives - /* - * Consider paths using each rel as both outer and inner. Depending on - * the join type, a provably empty outer or inner rel might mean the join - * is provably empty too; in which case throw away any previously computed - * paths and mark the join as dummy. (We do it this way since it's - * conceivable that dummy-ness of a multi-element join might only be - * noticeable for certain construction paths.) - * - * Also, a provably constant-false join restriction typically means that - * we can skip evaluating one or both sides of the join. We do this by - * marking the appropriate rel as dummy. For outer joins, a - * constant-false restriction that is pushed down still means the whole - * join is dummy, while a non-pushed-down one means that no inner rows - * will join so we can treat the inner rel as dummy. - * - * We need only consider the jointypes that appear in join_info_list, plus - * JOIN_INNER. 
- */ - switch (sjinfo->jointype) - { - case JOIN_INNER: - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_INNER, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_INNER, sjinfo, - restrictlist); - break; - case JOIN_LEFT: - if (is_dummy_rel(rel1) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - if (restriction_is_constant_false(restrictlist, false) && - bms_is_subset(rel2->relids, sjinfo->syn_righthand)) - mark_dummy_rel(rel2); - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_LEFT, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_RIGHT, sjinfo, - restrictlist); - break; - case JOIN_FULL: - if ((is_dummy_rel(rel1) && is_dummy_rel(rel2)) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_FULL, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_FULL, sjinfo, - restrictlist); - - /* - * If there are join quals that aren't mergeable or hashable, we - * may not be able to build any valid plan. Complain here so that - * we can give a somewhat-useful error message. (Since we have no - * flexibility of planning for a full join, there's no chance of - * succeeding later with another pair of input rels.) - */ - if (joinrel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("FULL JOIN is only supported with merge-joinable or hash-joinable join conditions"))); - break; - case JOIN_SEMI: - - /* - * We might have a normal semijoin, or a case where we don't have - * enough rels to do the semijoin but can unique-ify the RHS and - * then do an innerjoin (see comments in join_is_legal). In the - * latter case we can't apply JOIN_SEMI joining. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - { - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_SEMI, sjinfo, - restrictlist); - } - - /* - * If we know how to unique-ify the RHS and one input rel is - * exactly the RHS (not a superset) we can consider unique-ifying - * it and then doing a regular join. (The create_unique_path - * check here is probably redundant with what join_is_legal did, - * but if so the check is cheap because it's cached. So test - * anyway to be sure.) 
- */ - if (bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) - { - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_UNIQUE_INNER, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_UNIQUE_OUTER, sjinfo, - restrictlist); - } - break; - case JOIN_ANTI: - if (is_dummy_rel(rel1) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - if (restriction_is_constant_false(restrictlist, false) && - bms_is_subset(rel2->relids, sjinfo->syn_righthand)) - mark_dummy_rel(rel2); - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_ANTI, sjinfo, - restrictlist); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); - break; - } + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *sjinfo, List *restrictlist) +{ + /* + * Consider paths using each rel as both outer and inner. Depending on + * the join type, a provably empty outer or inner rel might mean the join + * is provably empty too; in which case throw away any previously computed + * paths and mark the join as dummy. (We do it this way since it's + * conceivable that dummy-ness of a multi-element join might only be + * noticeable for certain construction paths.) + * + * Also, a provably constant-false join restriction typically means that + * we can skip evaluating one or both sides of the join. We do this by + * marking the appropriate rel as dummy. For outer joins, a + * constant-false restriction that is pushed down still means the whole + * join is dummy, while a non-pushed-down one means that no inner rows + * will join so we can treat the inner rel as dummy. + * + * We need only consider the jointypes that appear in join_info_list, plus + * JOIN_INNER. + */ + switch (sjinfo->jointype) + { + case JOIN_INNER: + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_INNER, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_INNER, sjinfo, + restrictlist); + break; + case JOIN_LEFT: + if (is_dummy_rel(rel1) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + if (restriction_is_constant_false(restrictlist, false) && + bms_is_subset(rel2->relids, sjinfo->syn_righthand)) + mark_dummy_rel(rel2); + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_LEFT, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_RIGHT, sjinfo, + restrictlist); + break; + case JOIN_FULL: + if ((is_dummy_rel(rel1) && is_dummy_rel(rel2)) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_FULL, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_FULL, sjinfo, + restrictlist); + + /* + * If there are join quals that aren't mergeable or hashable, we + * may not be able to build any valid plan. Complain here so that + * we can give a somewhat-useful error message. (Since we have no + * flexibility of planning for a full join, there's no chance of + * succeeding later with another pair of input rels.) 
+ */ + if (joinrel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable or hash-joinable join conditions"))); + break; + case JOIN_SEMI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + + /* + * We might have a normal semijoin, or a case where we don't have + * enough rels to do the semijoin but can unique-ify the RHS and + * then do an innerjoin (see comments in join_is_legal). In the + * latter case we can't apply JOIN_SEMI joining. + */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + { + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } +#ifdef __TBASE__ + add_paths_to_joinrel(root, joinrel, rel1, rel2, + sjinfo->jointype, sjinfo, + restrictlist); +#else + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_SEMI, sjinfo, + restrictlist); +#endif + } + + /* + * If we know how to unique-ify the RHS and one input rel is + * exactly the RHS (not a superset) we can consider unique-ifying + * it and then doing a regular join. (The create_unique_path + * check here is probably redundant with what join_is_legal did, + * but if so the check is cheap because it's cached. So test + * anyway to be sure.) + */ +#ifdef __TBASE__ + if (sjinfo->jointype == JOIN_SEMI && bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) +#else + if (bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) +#endif + { + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_UNIQUE_INNER, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_UNIQUE_OUTER, sjinfo, + restrictlist); + } + break; + case JOIN_ANTI: + if (is_dummy_rel(rel1) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + if (restriction_is_constant_false(restrictlist, false) && + bms_is_subset(rel2->relids, sjinfo->syn_righthand)) + mark_dummy_rel(rel2); + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_ANTI, sjinfo, + restrictlist); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); + break; + } } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 5cdbdec7..100d9db5 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -775,318 +775,321 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, } #endif - /* A single baserel does not create an inner join */ - *inner_join_rels = NULL; - joinlist = list_make1(jtnode); - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - List *child_postponed_quals = NIL; - int remaining; - ListCell *l; - - /* - * First, recurse to handle child joins. We collapse subproblems into - * a single joinlist whenever the resulting joinlist wouldn't exceed - * from_collapse_limit members. Also, always collapse one-element - * subproblems, since that won't lengthen the joinlist anyway. 
- */ - *qualscope = NULL; - *inner_join_rels = NULL; - joinlist = NIL; - remaining = list_length(f->fromlist); - foreach(l, f->fromlist) - { - Relids sub_qualscope; - List *sub_joinlist; - int sub_members; - - sub_joinlist = deconstruct_recurse(root, lfirst(l), - below_outer_join, - &sub_qualscope, - inner_join_rels, - &child_postponed_quals); - *qualscope = bms_add_members(*qualscope, sub_qualscope); - sub_members = list_length(sub_joinlist); - remaining--; - if (sub_members <= 1 || - list_length(joinlist) + sub_members + remaining <= from_collapse_limit) - joinlist = list_concat(joinlist, sub_joinlist); - else - joinlist = lappend(joinlist, sub_joinlist); - } - - /* - * A FROM with more than one list element is an inner join subsuming - * all below it, so we should report inner_join_rels = qualscope. If - * there was exactly one element, we should (and already did) report - * whatever its inner_join_rels were. If there were no elements (is - * that possible?) the initialization before the loop fixed it. - */ - if (list_length(f->fromlist) > 1) - *inner_join_rels = *qualscope; - - /* - * Try to process any quals postponed by children. If they need - * further postponement, add them to my output postponed_qual_list. - */ - foreach(l, child_postponed_quals) - { - PostponedQual *pq = (PostponedQual *) lfirst(l); - - if (bms_is_subset(pq->relids, *qualscope)) - distribute_qual_to_rels(root, pq->qual, - false, below_outer_join, JOIN_INNER, - root->qual_security_level, - *qualscope, NULL, NULL, NULL, - NULL); - else - *postponed_qual_list = lappend(*postponed_qual_list, pq); - } - - /* - * Now process the top-level quals. - */ - foreach(l, (List *) f->quals) - { - Node *qual = (Node *) lfirst(l); - - distribute_qual_to_rels(root, qual, - false, below_outer_join, JOIN_INNER, - root->qual_security_level, - *qualscope, NULL, NULL, NULL, - postponed_qual_list); - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - List *child_postponed_quals = NIL; - Relids leftids, - rightids, - left_inners, - right_inners, - nonnullable_rels, - nullable_rels, - ojscope; - List *leftjoinlist, - *rightjoinlist; - List *my_quals; - SpecialJoinInfo *sjinfo; - ListCell *l; - - /* - * Order of operations here is subtle and critical. First we recurse - * to handle sub-JOINs. Their join quals will be placed without - * regard for whether this level is an outer join, which is correct. - * Then we place our own join quals, which are restricted by lower - * outer joins in any case, and are forced to this level if this is an - * outer join and they mention the outer side. Finally, if this is an - * outer join, we create a join_info_list entry for the join. This - * will prevent quals above us in the join tree that use those rels - * from being pushed down below this level. (It's okay for upper - * quals to be pushed down to the outer side, however.) 
- */ - switch (j->jointype) - { - case JOIN_INNER: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - below_outer_join, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = *qualscope; - /* Inner join adds no restrictions for quals */ - nonnullable_rels = NULL; - /* and it doesn't force anything to null, either */ - nullable_rels = NULL; - break; - case JOIN_LEFT: - case JOIN_ANTI: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - true, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - nonnullable_rels = leftids; - nullable_rels = rightids; - break; - case JOIN_SEMI: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - below_outer_join, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - /* Semi join adds no restrictions for quals */ - nonnullable_rels = NULL; - - /* - * Theoretically, a semijoin would null the RHS; but since the - * RHS can't be accessed above the join, this is immaterial - * and we needn't account for it. - */ - nullable_rels = NULL; - break; - case JOIN_FULL: - leftjoinlist = deconstruct_recurse(root, j->larg, - true, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - true, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - /* each side is both outer and inner */ - nonnullable_rels = *qualscope; - nullable_rels = *qualscope; - break; - default: - /* JOIN_RIGHT was eliminated during reduce_outer_joins() */ - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - nonnullable_rels = NULL; /* keep compiler quiet */ - nullable_rels = NULL; - leftjoinlist = rightjoinlist = NIL; - break; - } - - /* Report all rels that will be nulled anywhere in the jointree */ - root->nullable_baserels = bms_add_members(root->nullable_baserels, - nullable_rels); - - /* - * Try to process any quals postponed by children. If they need - * further postponement, add them to my output postponed_qual_list. - * Quals that can be processed now must be included in my_quals, so - * that they'll be handled properly in make_outerjoininfo. - */ - my_quals = NIL; - foreach(l, child_postponed_quals) - { - PostponedQual *pq = (PostponedQual *) lfirst(l); - - if (bms_is_subset(pq->relids, *qualscope)) - my_quals = lappend(my_quals, pq->qual); - else - { - /* - * We should not be postponing any quals past an outer join. - * If this Assert fires, pull_up_subqueries() messed up. - */ - Assert(j->jointype == JOIN_INNER); - *postponed_qual_list = lappend(*postponed_qual_list, pq); - } - } - /* list_concat is nondestructive of its second argument */ - my_quals = list_concat(my_quals, (List *) j->quals); - - /* - * For an OJ, form the SpecialJoinInfo now, because we need the OJ's - * semantic scope (ojscope) to pass to distribute_qual_to_rels. 
But - * we mustn't add it to join_info_list just yet, because we don't want - * distribute_qual_to_rels to think it is an outer join below us. - * - * Semijoins are a bit of a hybrid: we build a SpecialJoinInfo, but we - * want ojscope = NULL for distribute_qual_to_rels. - */ - if (j->jointype != JOIN_INNER) - { - sjinfo = make_outerjoininfo(root, - leftids, rightids, - *inner_join_rels, - j->jointype, - my_quals); - if (j->jointype == JOIN_SEMI) - ojscope = NULL; - else - ojscope = bms_union(sjinfo->min_lefthand, - sjinfo->min_righthand); - } - else - { - sjinfo = NULL; - ojscope = NULL; - } - - /* Process the JOIN's qual clauses */ - foreach(l, my_quals) - { - Node *qual = (Node *) lfirst(l); - - distribute_qual_to_rels(root, qual, - false, below_outer_join, j->jointype, - root->qual_security_level, - *qualscope, - ojscope, nonnullable_rels, NULL, - postponed_qual_list); - } - - /* Now we can add the SpecialJoinInfo to join_info_list */ - if (sjinfo) - { - root->join_info_list = lappend(root->join_info_list, sjinfo); - /* Each time we do that, recheck placeholder eval levels */ - update_placeholder_eval_levels(root, sjinfo); - } - - /* - * Finally, compute the output joinlist. We fold subproblems together - * except at a FULL JOIN or where join_collapse_limit would be - * exceeded. - */ - if (j->jointype == JOIN_FULL) - { - /* force the join order exactly at this node */ - joinlist = list_make1(list_make2(leftjoinlist, rightjoinlist)); - } - else if (list_length(leftjoinlist) + list_length(rightjoinlist) <= - join_collapse_limit) - { - /* OK to combine subproblems */ - joinlist = list_concat(leftjoinlist, rightjoinlist); - } - else - { - /* can't combine, but needn't force join order above here */ - Node *leftpart, - *rightpart; - - /* avoid creating useless 1-element sublists */ - if (list_length(leftjoinlist) == 1) - leftpart = (Node *) linitial(leftjoinlist); - else - leftpart = (Node *) leftjoinlist; - if (list_length(rightjoinlist) == 1) - rightpart = (Node *) linitial(rightjoinlist); - else - rightpart = (Node *) rightjoinlist; - joinlist = list_make2(leftpart, rightpart); - } - } - else - { - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - joinlist = NIL; /* keep compiler quiet */ - } - return joinlist; + /* A single baserel does not create an inner join */ + *inner_join_rels = NULL; + joinlist = list_make1(jtnode); + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + List *child_postponed_quals = NIL; + int remaining; + ListCell *l; + + /* + * First, recurse to handle child joins. We collapse subproblems into + * a single joinlist whenever the resulting joinlist wouldn't exceed + * from_collapse_limit members. Also, always collapse one-element + * subproblems, since that won't lengthen the joinlist anyway. 
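+		 *
+		 * For example, if a simple sub-select has been flattened into the
+		 * FROM list (tables a, b, c are hypothetical):
+		 *
+		 *	SELECT *
+		 *	  FROM a,
+		 *	       (SELECT * FROM b, c WHERE b.id = c.id) AS bc
+		 *	 WHERE a.id = bc.id;
+		 *
+		 * its {b, c} sub-joinlist is merged into the parent joinlist when
+		 * that stays within from_collapse_limit, letting the join order of
+		 * a, b and c be searched as one problem; otherwise b and c remain
+		 * a separate subproblem that is joined internally first.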
+ */ + *qualscope = NULL; + *inner_join_rels = NULL; + joinlist = NIL; + remaining = list_length(f->fromlist); + foreach(l, f->fromlist) + { + Relids sub_qualscope; + List *sub_joinlist; + int sub_members; + + sub_joinlist = deconstruct_recurse(root, lfirst(l), + below_outer_join, + &sub_qualscope, + inner_join_rels, + &child_postponed_quals); + *qualscope = bms_add_members(*qualscope, sub_qualscope); + sub_members = list_length(sub_joinlist); + remaining--; + if (sub_members <= 1 || + list_length(joinlist) + sub_members + remaining <= from_collapse_limit) + joinlist = list_concat(joinlist, sub_joinlist); + else + joinlist = lappend(joinlist, sub_joinlist); + } + + /* + * A FROM with more than one list element is an inner join subsuming + * all below it, so we should report inner_join_rels = qualscope. If + * there was exactly one element, we should (and already did) report + * whatever its inner_join_rels were. If there were no elements (is + * that possible?) the initialization before the loop fixed it. + */ + if (list_length(f->fromlist) > 1) + *inner_join_rels = *qualscope; + + /* + * Try to process any quals postponed by children. If they need + * further postponement, add them to my output postponed_qual_list. + */ + foreach(l, child_postponed_quals) + { + PostponedQual *pq = (PostponedQual *) lfirst(l); + + if (bms_is_subset(pq->relids, *qualscope)) + distribute_qual_to_rels(root, pq->qual, + false, below_outer_join, JOIN_INNER, + root->qual_security_level, + *qualscope, NULL, NULL, NULL, + NULL); + else + *postponed_qual_list = lappend(*postponed_qual_list, pq); + } + + /* + * Now process the top-level quals. + */ + foreach(l, (List *) f->quals) + { + Node *qual = (Node *) lfirst(l); + + distribute_qual_to_rels(root, qual, + false, below_outer_join, JOIN_INNER, + root->qual_security_level, + *qualscope, NULL, NULL, NULL, + postponed_qual_list); + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + List *child_postponed_quals = NIL; + Relids leftids, + rightids, + left_inners, + right_inners, + nonnullable_rels, + nullable_rels, + ojscope; + List *leftjoinlist, + *rightjoinlist; + List *my_quals; + SpecialJoinInfo *sjinfo; + ListCell *l; + + /* + * Order of operations here is subtle and critical. First we recurse + * to handle sub-JOINs. Their join quals will be placed without + * regard for whether this level is an outer join, which is correct. + * Then we place our own join quals, which are restricted by lower + * outer joins in any case, and are forced to this level if this is an + * outer join and they mention the outer side. Finally, if this is an + * outer join, we create a join_info_list entry for the join. This + * will prevent quals above us in the join tree that use those rels + * from being pushed down below this level. (It's okay for upper + * quals to be pushed down to the outer side, however.) 
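+		 *
+		 * For instance, given the hypothetical
+		 *
+		 *	SELECT *
+		 *	  FROM a LEFT JOIN b ON a.id = b.id
+		 *	 WHERE a.flag AND COALESCE(b.val, 0) = 0;
+		 *
+		 * the upper qual on a alone may drop to a's scan below the join,
+		 * while the COALESCE qual references the nullable side and has to
+		 * stay above the outer join.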
+ */ + switch (j->jointype) + { + case JOIN_INNER: + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + below_outer_join, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = *qualscope; + /* Inner join adds no restrictions for quals */ + nonnullable_rels = NULL; + /* and it doesn't force anything to null, either */ + nullable_rels = NULL; + break; + case JOIN_LEFT: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + true, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + nonnullable_rels = leftids; + nullable_rels = rightids; + break; + case JOIN_SEMI: + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + below_outer_join, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + /* Semi join adds no restrictions for quals */ + nonnullable_rels = NULL; + + /* + * Theoretically, a semijoin would null the RHS; but since the + * RHS can't be accessed above the join, this is immaterial + * and we needn't account for it. + */ + nullable_rels = NULL; + break; + case JOIN_FULL: + leftjoinlist = deconstruct_recurse(root, j->larg, + true, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + true, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + /* each side is both outer and inner */ + nonnullable_rels = *qualscope; + nullable_rels = *qualscope; + break; + default: + /* JOIN_RIGHT was eliminated during reduce_outer_joins() */ + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + nonnullable_rels = NULL; /* keep compiler quiet */ + nullable_rels = NULL; + leftjoinlist = rightjoinlist = NIL; + break; + } + + /* Report all rels that will be nulled anywhere in the jointree */ + root->nullable_baserels = bms_add_members(root->nullable_baserels, + nullable_rels); + + /* + * Try to process any quals postponed by children. If they need + * further postponement, add them to my output postponed_qual_list. + * Quals that can be processed now must be included in my_quals, so + * that they'll be handled properly in make_outerjoininfo. + */ + my_quals = NIL; + foreach(l, child_postponed_quals) + { + PostponedQual *pq = (PostponedQual *) lfirst(l); + + if (bms_is_subset(pq->relids, *qualscope)) + my_quals = lappend(my_quals, pq->qual); + else + { + /* + * We should not be postponing any quals past an outer join. + * If this Assert fires, pull_up_subqueries() messed up. 
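+				 *
+				 * (Postponed quals come from lateral references, e.g. in
+				 * the hypothetical
+				 *
+				 *	SELECT *
+				 *	  FROM a,
+				 *	       LATERAL (SELECT * FROM b WHERE b.id = a.id) s;
+				 *
+				 * the flattened b.id = a.id clause mentions a rel outside
+				 * the sub-select's own scope and has to wait until both a
+				 * and b are in scope, which should only ever postpone it
+				 * across inner joins.)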
+ */ + Assert(j->jointype == JOIN_INNER); + *postponed_qual_list = lappend(*postponed_qual_list, pq); + } + } + /* list_concat is nondestructive of its second argument */ + my_quals = list_concat(my_quals, (List *) j->quals); + + /* + * For an OJ, form the SpecialJoinInfo now, because we need the OJ's + * semantic scope (ojscope) to pass to distribute_qual_to_rels. But + * we mustn't add it to join_info_list just yet, because we don't want + * distribute_qual_to_rels to think it is an outer join below us. + * + * Semijoins are a bit of a hybrid: we build a SpecialJoinInfo, but we + * want ojscope = NULL for distribute_qual_to_rels. + */ + if (j->jointype != JOIN_INNER) + { + sjinfo = make_outerjoininfo(root, + leftids, rightids, + *inner_join_rels, + j->jointype, + my_quals); + if (j->jointype == JOIN_SEMI) + ojscope = NULL; + else + ojscope = bms_union(sjinfo->min_lefthand, + sjinfo->min_righthand); + } + else + { + sjinfo = NULL; + ojscope = NULL; + } + + /* Process the JOIN's qual clauses */ + foreach(l, my_quals) + { + Node *qual = (Node *) lfirst(l); + + distribute_qual_to_rels(root, qual, + false, below_outer_join, j->jointype, + root->qual_security_level, + *qualscope, + ojscope, nonnullable_rels, NULL, + postponed_qual_list); + } + + /* Now we can add the SpecialJoinInfo to join_info_list */ + if (sjinfo) + { + root->join_info_list = lappend(root->join_info_list, sjinfo); + /* Each time we do that, recheck placeholder eval levels */ + update_placeholder_eval_levels(root, sjinfo); + } + + /* + * Finally, compute the output joinlist. We fold subproblems together + * except at a FULL JOIN or where join_collapse_limit would be + * exceeded. + */ + if (j->jointype == JOIN_FULL) + { + /* force the join order exactly at this node */ + joinlist = list_make1(list_make2(leftjoinlist, rightjoinlist)); + } + else if (list_length(leftjoinlist) + list_length(rightjoinlist) <= + join_collapse_limit) + { + /* OK to combine subproblems */ + joinlist = list_concat(leftjoinlist, rightjoinlist); + } + else + { + /* can't combine, but needn't force join order above here */ + Node *leftpart, + *rightpart; + + /* avoid creating useless 1-element sublists */ + if (list_length(leftjoinlist) == 1) + leftpart = (Node *) linitial(leftjoinlist); + else + leftpart = (Node *) leftjoinlist; + if (list_length(rightjoinlist) == 1) + rightpart = (Node *) linitial(rightjoinlist); + else + rightpart = (Node *) rightjoinlist; + joinlist = list_make2(leftpart, rightpart); + } + } + else + { + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + joinlist = NIL; /* keep compiler quiet */ + } + return joinlist; } /* @@ -1205,243 +1208,263 @@ static void mls_process_cls_quals(PlannerInfo *root, */ static SpecialJoinInfo * make_outerjoininfo(PlannerInfo *root, - Relids left_rels, Relids right_rels, - Relids inner_join_rels, - JoinType jointype, List *clause) -{// #lizard forgives - SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); - Relids clause_relids; - Relids strict_relids; - Relids min_lefthand; - Relids min_righthand; - ListCell *l; - - /* - * We should not see RIGHT JOIN here because left/right were switched - * earlier - */ - Assert(jointype != JOIN_INNER); - Assert(jointype != JOIN_RIGHT); - - /* - * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of - * rels appearing on the nullable side of an outer join. (It's somewhat - * unclear what that would mean, anyway: what should we mark when a result - * row is generated from no element of the nullable relation?) 
So, - * complain if any nullable rel is FOR [KEY] UPDATE/SHARE. - * - * You might be wondering why this test isn't made far upstream in the - * parser. It's because the parser hasn't got enough info --- consider - * FOR UPDATE applied to a view. Only after rewriting and flattening do - * we know whether the view contains an outer join. - * - * We use the original RowMarkClause list here; the PlanRowMark list would - * list everything. - */ - foreach(l, root->parse->rowMarks) - { - RowMarkClause *rc = (RowMarkClause *) lfirst(l); - - if (bms_is_member(rc->rti, right_rels) || - (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - /*------ - translator: %s is a SQL row locking clause such as FOR UPDATE */ - errmsg("%s cannot be applied to the nullable side of an outer join", - LCS_asString(rc->strength)))); - } - - sjinfo->syn_lefthand = left_rels; - sjinfo->syn_righthand = right_rels; - sjinfo->jointype = jointype; - /* this always starts out false */ - sjinfo->delay_upper_joins = false; - - compute_semijoin_info(sjinfo, clause); - - /* If it's a full join, no need to be very smart */ - if (jointype == JOIN_FULL) - { - sjinfo->min_lefthand = bms_copy(left_rels); - sjinfo->min_righthand = bms_copy(right_rels); - sjinfo->lhs_strict = false; /* don't care about this */ - return sjinfo; - } - - /* - * Retrieve all relids mentioned within the join clause. - */ - clause_relids = pull_varnos((Node *) clause); - - /* - * For which relids is the clause strict, ie, it cannot succeed if the - * rel's columns are all NULL? - */ - strict_relids = find_nonnullable_rels((Node *) clause); - - /* Remember whether the clause is strict for any LHS relations */ - sjinfo->lhs_strict = bms_overlap(strict_relids, left_rels); - - /* - * Required LHS always includes the LHS rels mentioned in the clause. We - * may have to add more rels based on lower outer joins; see below. - */ - min_lefthand = bms_intersect(clause_relids, left_rels); - - /* - * Similarly for required RHS. But here, we must also include any lower - * inner joins, to ensure we don't try to commute with any of them. - */ - min_righthand = bms_int_members(bms_union(clause_relids, inner_join_rels), - right_rels); - - /* - * Now check previous outer joins for ordering restrictions. - */ - foreach(l, root->join_info_list) - { - SpecialJoinInfo *otherinfo = (SpecialJoinInfo *) lfirst(l); - - /* - * A full join is an optimization barrier: we can't associate into or - * out of it. Hence, if it overlaps either LHS or RHS of the current - * rel, expand that side's min relset to cover the whole full join. - */ - if (otherinfo->jointype == JOIN_FULL) - { - if (bms_overlap(left_rels, otherinfo->syn_lefthand) || - bms_overlap(left_rels, otherinfo->syn_righthand)) - { - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_lefthand); - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_righthand); - } - if (bms_overlap(right_rels, otherinfo->syn_lefthand) || - bms_overlap(right_rels, otherinfo->syn_righthand)) - { - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_lefthand); - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_righthand); - } - /* Needn't do anything else with the full join */ - continue; - } - - /* - * For a lower OJ in our LHS, if our join condition uses the lower - * join's RHS and is not strict for that rel, we must preserve the - * ordering of the two OJs, so add lower OJ's full syntactic relset to - * min_lefthand. 
(We must use its full syntactic relset, not just its - * min_lefthand + min_righthand. This is because there might be other - * OJs below this one that this one can commute with, but we cannot - * commute with them if we don't with this one.) Also, if the current - * join is a semijoin or antijoin, we must preserve ordering - * regardless of strictness. - * - * Note: I believe we have to insist on being strict for at least one - * rel in the lower OJ's min_righthand, not its whole syn_righthand. - */ - if (bms_overlap(left_rels, otherinfo->syn_righthand)) - { + Relids left_rels, Relids right_rels, + Relids inner_join_rels, + JoinType jointype, List *clause) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + Relids clause_relids; + Relids strict_relids; + Relids min_lefthand; + Relids min_righthand; + ListCell *l; + + /* + * We should not see RIGHT JOIN here because left/right were switched + * earlier + */ + Assert(jointype != JOIN_INNER); + Assert(jointype != JOIN_RIGHT); + + /* + * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of + * rels appearing on the nullable side of an outer join. (It's somewhat + * unclear what that would mean, anyway: what should we mark when a result + * row is generated from no element of the nullable relation?) So, + * complain if any nullable rel is FOR [KEY] UPDATE/SHARE. + * + * You might be wondering why this test isn't made far upstream in the + * parser. It's because the parser hasn't got enough info --- consider + * FOR UPDATE applied to a view. Only after rewriting and flattening do + * we know whether the view contains an outer join. + * + * We use the original RowMarkClause list here; the PlanRowMark list would + * list everything. + */ + foreach(l, root->parse->rowMarks) + { + RowMarkClause *rc = (RowMarkClause *) lfirst(l); + + if (bms_is_member(rc->rti, right_rels) || + (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*------ + translator: %s is a SQL row locking clause such as FOR UPDATE */ + errmsg("%s cannot be applied to the nullable side of an outer join", + LCS_asString(rc->strength)))); + } + + sjinfo->syn_lefthand = left_rels; + sjinfo->syn_righthand = right_rels; + sjinfo->jointype = jointype; + /* this always starts out false */ + sjinfo->delay_upper_joins = false; + + compute_semijoin_info(sjinfo, clause); + + /* If it's a full join, no need to be very smart */ + if (jointype == JOIN_FULL) + { + sjinfo->min_lefthand = bms_copy(left_rels); + sjinfo->min_righthand = bms_copy(right_rels); + sjinfo->lhs_strict = false; /* don't care about this */ + return sjinfo; + } + + /* + * Retrieve all relids mentioned within the join clause. + */ + clause_relids = pull_varnos((Node *) clause); + + /* + * For which relids is the clause strict, ie, it cannot succeed if the + * rel's columns are all NULL? + */ + strict_relids = find_nonnullable_rels((Node *) clause); + + /* Remember whether the clause is strict for any LHS relations */ + sjinfo->lhs_strict = bms_overlap(strict_relids, left_rels); + + /* + * Required LHS always includes the LHS rels mentioned in the clause. We + * may have to add more rels based on lower outer joins; see below. + */ + min_lefthand = bms_intersect(clause_relids, left_rels); + + /* + * Similarly for required RHS. But here, we must also include any lower + * inner joins, to ensure we don't try to commute with any of them. 
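+	 *
+	 * For example, in the hypothetical
+	 *
+	 *	SELECT *
+	 *	  FROM a
+	 *	  LEFT JOIN (b JOIN c ON b.id = c.id) ON a.id = b.id;
+	 *
+	 * min_righthand becomes {b, c}: doing the outer join against b alone
+	 * and joining c afterwards could drop a-rows whose b side was nulled,
+	 * so the lower inner join has to stay inside the outer join's RHS.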
+ */ + min_righthand = bms_int_members(bms_union(clause_relids, inner_join_rels), + right_rels); + + /* + * Now check previous outer joins for ordering restrictions. + */ + foreach(l, root->join_info_list) + { + SpecialJoinInfo *otherinfo = (SpecialJoinInfo *) lfirst(l); + + /* + * A full join is an optimization barrier: we can't associate into or + * out of it. Hence, if it overlaps either LHS or RHS of the current + * rel, expand that side's min relset to cover the whole full join. + */ + if (otherinfo->jointype == JOIN_FULL) + { + if (bms_overlap(left_rels, otherinfo->syn_lefthand) || + bms_overlap(left_rels, otherinfo->syn_righthand)) + { + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_lefthand); + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_righthand); + } + if (bms_overlap(right_rels, otherinfo->syn_lefthand) || + bms_overlap(right_rels, otherinfo->syn_righthand)) + { + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_lefthand); + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_righthand); + } + /* Needn't do anything else with the full join */ + continue; + } + + /* + * For a lower OJ in our LHS, if our join condition uses the lower + * join's RHS and is not strict for that rel, we must preserve the + * ordering of the two OJs, so add lower OJ's full syntactic relset to + * min_lefthand. (We must use its full syntactic relset, not just its + * min_lefthand + min_righthand. This is because there might be other + * OJs below this one that this one can commute with, but we cannot + * commute with them if we don't with this one.) Also, if the current + * join is a semijoin or antijoin, we must preserve ordering + * regardless of strictness. + * + * Note: I believe we have to insist on being strict for at least one + * rel in the lower OJ's min_righthand, not its whole syn_righthand. + */ + if (bms_overlap(left_rels, otherinfo->syn_righthand)) + { +#ifdef __TBASE__ if (bms_overlap(clause_relids, otherinfo->syn_righthand) && - (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + (jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || !bms_overlap(strict_relids, otherinfo->min_righthand))) - { - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_lefthand); - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_righthand); - } - } - - /* - * For a lower OJ in our RHS, if our join condition does not use the - * lower join's RHS and the lower OJ's join condition is strict, we - * can interchange the ordering of the two OJs; otherwise we must add - * the lower OJ's full syntactic relset to min_righthand. - * - * Also, if our join condition does not use the lower join's LHS - * either, force the ordering to be preserved. Otherwise we can end - * up with SpecialJoinInfos with identical min_righthands, which can - * confuse join_is_legal (see discussion in backend/optimizer/README). - * - * Also, we must preserve ordering anyway if either the current join - * or the lower OJ is either a semijoin or an antijoin. - * - * Here, we have to consider that "our join condition" includes any - * clauses that syntactically appeared above the lower OJ and below - * ours; those are equivalent to degenerate clauses in our OJ and must - * be treated as such. Such clauses obviously can't reference our - * LHS, and they must be non-strict for the lower OJ's RHS (else - * reduce_outer_joins would have reduced the lower OJ to a plain - * join). 
Hence the other ways in which we handle clauses within our - * join condition are not affected by them. The net effect is - * therefore sufficiently represented by the delay_upper_joins flag - * saved for us by check_outerjoin_delay. - */ - if (bms_overlap(right_rels, otherinfo->syn_righthand)) - { - if (bms_overlap(clause_relids, otherinfo->syn_righthand) || - !bms_overlap(clause_relids, otherinfo->min_lefthand) || - jointype == JOIN_SEMI || - jointype == JOIN_ANTI || - otherinfo->jointype == JOIN_SEMI || - otherinfo->jointype == JOIN_ANTI || - !otherinfo->lhs_strict || otherinfo->delay_upper_joins) - { - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_lefthand); - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_righthand); - } - } - } - - /* - * Examine PlaceHolderVars. If a PHV is supposed to be evaluated within - * this join's nullable side, then ensure that min_righthand contains the - * full eval_at set of the PHV. This ensures that the PHV actually can be - * evaluated within the RHS. Note that this works only because we should - * already have determined the final eval_at level for any PHV - * syntactically within this join. - */ - foreach(l, root->placeholder_list) - { - PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l); - Relids ph_syn_level = phinfo->ph_var->phrels; - - /* Ignore placeholder if it didn't syntactically come from RHS */ - if (!bms_is_subset(ph_syn_level, right_rels)) - continue; - - /* Else, prevent join from being formed before we eval the PHV */ - min_righthand = bms_add_members(min_righthand, phinfo->ph_eval_at); - } - - /* - * If we found nothing to put in min_lefthand, punt and make it the full - * LHS, to avoid having an empty min_lefthand which will confuse later - * processing. (We don't try to be smart about such cases, just correct.) - * Likewise for min_righthand. - */ - if (bms_is_empty(min_lefthand)) - min_lefthand = bms_copy(left_rels); - if (bms_is_empty(min_righthand)) - min_righthand = bms_copy(right_rels); - - /* Now they'd better be nonempty */ - Assert(!bms_is_empty(min_lefthand)); - Assert(!bms_is_empty(min_righthand)); - /* Shouldn't overlap either */ - Assert(!bms_overlap(min_lefthand, min_righthand)); - - sjinfo->min_lefthand = min_lefthand; - sjinfo->min_righthand = min_righthand; - - return sjinfo; +#else + if (bms_overlap(clause_relids, otherinfo->syn_righthand) && + (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + !bms_overlap(strict_relids, otherinfo->min_righthand))) +#endif + { + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_lefthand); + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_righthand); + } + } + + /* + * For a lower OJ in our RHS, if our join condition does not use the + * lower join's RHS and the lower OJ's join condition is strict, we + * can interchange the ordering of the two OJs; otherwise we must add + * the lower OJ's full syntactic relset to min_righthand. + * + * Also, if our join condition does not use the lower join's LHS + * either, force the ordering to be preserved. Otherwise we can end + * up with SpecialJoinInfos with identical min_righthands, which can + * confuse join_is_legal (see discussion in backend/optimizer/README). + * + * Also, we must preserve ordering anyway if either the current join + * or the lower OJ is either a semijoin or an antijoin. 
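+		 *
+		 * For instance, in the hypothetical
+		 *
+		 *	SELECT *
+		 *	  FROM a
+		 *	  LEFT JOIN (b LEFT JOIN c ON b.y = c.y) ON a.x = b.x;
+		 *
+		 * the upper condition never mentions c and b.y = c.y is strict,
+		 * so (a LEFT JOIN b) LEFT JOIN c is an equally valid order; with
+		 * a semijoin or antijoin on top that freedom goes away.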
+ * + * Here, we have to consider that "our join condition" includes any + * clauses that syntactically appeared above the lower OJ and below + * ours; those are equivalent to degenerate clauses in our OJ and must + * be treated as such. Such clauses obviously can't reference our + * LHS, and they must be non-strict for the lower OJ's RHS (else + * reduce_outer_joins would have reduced the lower OJ to a plain + * join). Hence the other ways in which we handle clauses within our + * join condition are not affected by them. The net effect is + * therefore sufficiently represented by the delay_upper_joins flag + * saved for us by check_outerjoin_delay. + */ + if (bms_overlap(right_rels, otherinfo->syn_righthand)) + { +#ifdef __TBASE__ + if (bms_overlap(clause_relids, otherinfo->syn_righthand) || + !bms_overlap(clause_relids, otherinfo->min_lefthand) || + jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || + otherinfo->jointype == JOIN_SEMI || + otherinfo->jointype == JOIN_ANTI || + otherinfo->jointype == JOIN_LEFT_SCALAR || + !otherinfo->lhs_strict || otherinfo->delay_upper_joins) +#else + if (bms_overlap(clause_relids, otherinfo->syn_righthand) || + !bms_overlap(clause_relids, otherinfo->min_lefthand) || + jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + otherinfo->jointype == JOIN_SEMI || + otherinfo->jointype == JOIN_ANTI || + !otherinfo->lhs_strict || otherinfo->delay_upper_joins) +#endif + { + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_lefthand); + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_righthand); + } + } + } + + /* + * Examine PlaceHolderVars. If a PHV is supposed to be evaluated within + * this join's nullable side, then ensure that min_righthand contains the + * full eval_at set of the PHV. This ensures that the PHV actually can be + * evaluated within the RHS. Note that this works only because we should + * already have determined the final eval_at level for any PHV + * syntactically within this join. + */ + foreach(l, root->placeholder_list) + { + PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l); + Relids ph_syn_level = phinfo->ph_var->phrels; + + /* Ignore placeholder if it didn't syntactically come from RHS */ + if (!bms_is_subset(ph_syn_level, right_rels)) + continue; + + /* Else, prevent join from being formed before we eval the PHV */ + min_righthand = bms_add_members(min_righthand, phinfo->ph_eval_at); + } + + /* + * If we found nothing to put in min_lefthand, punt and make it the full + * LHS, to avoid having an empty min_lefthand which will confuse later + * processing. (We don't try to be smart about such cases, just correct.) + * Likewise for min_righthand. 
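+	 *
+	 * (A degenerate join clause can make that happen, e.g. the
+	 * hypothetical
+	 *
+	 *	SELECT * FROM a LEFT JOIN b ON b.flag;
+	 *
+	 * mentions no LHS rel at all, so min_lefthand would otherwise come
+	 * out empty.)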
+ */ + if (bms_is_empty(min_lefthand)) + min_lefthand = bms_copy(left_rels); + if (bms_is_empty(min_righthand)) + min_righthand = bms_copy(right_rels); + + /* Now they'd better be nonempty */ + Assert(!bms_is_empty(min_lefthand)); + Assert(!bms_is_empty(min_righthand)); + /* Shouldn't overlap either */ + Assert(!bms_overlap(min_lefthand, min_righthand)); + + sjinfo->min_lefthand = min_lefthand; + sjinfo->min_righthand = min_righthand; + + return sjinfo; } /* @@ -1453,171 +1476,175 @@ make_outerjoininfo(PlannerInfo *root, */ static void compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause) -{// #lizard forgives - List *semi_operators; - List *semi_rhs_exprs; - bool all_btree; - bool all_hash; - ListCell *lc; - - /* Initialize semijoin-related fields in case we can't unique-ify */ - sjinfo->semi_can_btree = false; - sjinfo->semi_can_hash = false; - sjinfo->semi_operators = NIL; - sjinfo->semi_rhs_exprs = NIL; - - /* Nothing more to do if it's not a semijoin */ - if (sjinfo->jointype != JOIN_SEMI) - return; - - /* - * Look to see whether the semijoin's join quals consist of AND'ed - * equality operators, with (only) RHS variables on only one side of each - * one. If so, we can figure out how to enforce uniqueness for the RHS. - * - * Note that the input clause list is the list of quals that are - * *syntactically* associated with the semijoin, which in practice means - * the synthesized comparison list for an IN or the WHERE of an EXISTS. - * Particularly in the latter case, it might contain clauses that aren't - * *semantically* associated with the join, but refer to just one side or - * the other. We can ignore such clauses here, as they will just drop - * down to be processed within one side or the other. (It is okay to - * consider only the syntactically-associated clauses here because for a - * semijoin, no higher-level quals could refer to the RHS, and so there - * can be no other quals that are semantically associated with this join. - * We do things this way because it is useful to have the set of potential - * unique-ification expressions before we can extract the list of quals - * that are actually semantically associated with the particular join.) - * - * Note that the semi_operators list consists of the joinqual operators - * themselves (but commuted if needed to put the RHS value on the right). - * These could be cross-type operators, in which case the operator - * actually needed for uniqueness is a related single-type operator. We - * assume here that that operator will be available from the btree or hash - * opclass when the time comes ... if not, create_unique_plan() will fail. - */ - semi_operators = NIL; - semi_rhs_exprs = NIL; - all_btree = true; - all_hash = enable_hashagg; /* don't consider hash if not enabled */ - foreach(lc, clause) - { - OpExpr *op = (OpExpr *) lfirst(lc); - Oid opno; - Node *left_expr; - Node *right_expr; - Relids left_varnos; - Relids right_varnos; - Relids all_varnos; - Oid opinputtype; - - /* Is it a binary opclause? */ - if (!IsA(op, OpExpr) || - list_length(op->args) != 2) - { - /* No, but does it reference both sides? */ - all_varnos = pull_varnos((Node *) op); - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better - * punt. 
- */ - if (contain_volatile_functions((Node *) op)) - return; - continue; - } - /* Non-operator clause referencing both sides, must punt */ - return; - } - - /* Extract data from binary opclause */ - opno = op->opno; - left_expr = linitial(op->args); - right_expr = lsecond(op->args); - left_varnos = pull_varnos(left_expr); - right_varnos = pull_varnos(right_expr); - all_varnos = bms_union(left_varnos, right_varnos); - opinputtype = exprType(left_expr); - - /* Does it reference both sides? */ - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better punt. - */ - if (contain_volatile_functions((Node *) op)) - return; - continue; - } - - /* check rel membership of arguments */ - if (!bms_is_empty(right_varnos) && - bms_is_subset(right_varnos, sjinfo->syn_righthand) && - !bms_overlap(left_varnos, sjinfo->syn_righthand)) - { - /* typical case, right_expr is RHS variable */ - } - else if (!bms_is_empty(left_varnos) && - bms_is_subset(left_varnos, sjinfo->syn_righthand) && - !bms_overlap(right_varnos, sjinfo->syn_righthand)) - { - /* flipped case, left_expr is RHS variable */ - opno = get_commutator(opno); - if (!OidIsValid(opno)) - return; - right_expr = left_expr; - } - else - { - /* mixed membership of args, punt */ - return; - } - - /* all operators must be btree equality or hash equality */ - if (all_btree) - { - /* oprcanmerge is considered a hint... */ - if (!op_mergejoinable(opno, opinputtype) || - get_mergejoin_opfamilies(opno) == NIL) - all_btree = false; - } - if (all_hash) - { - /* ... but oprcanhash had better be correct */ - if (!op_hashjoinable(opno, opinputtype)) - all_hash = false; - } - if (!(all_btree || all_hash)) - return; - - /* so far so good, keep building lists */ - semi_operators = lappend_oid(semi_operators, opno); - semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr)); - } - - /* Punt if we didn't find at least one column to unique-ify */ - if (semi_rhs_exprs == NIL) - return; - - /* - * The expressions we'd need to unique-ify mustn't be volatile. - */ - if (contain_volatile_functions((Node *) semi_rhs_exprs)) - return; - - /* - * If we get here, we can unique-ify the semijoin's RHS using at least one - * of sorting and hashing. Save the information about how to do that. - */ - sjinfo->semi_can_btree = all_btree; - sjinfo->semi_can_hash = all_hash; - sjinfo->semi_operators = semi_operators; - sjinfo->semi_rhs_exprs = semi_rhs_exprs; +{ + List *semi_operators; + List *semi_rhs_exprs; + bool all_btree; + bool all_hash; + ListCell *lc; + + /* Initialize semijoin-related fields in case we can't unique-ify */ + sjinfo->semi_can_btree = false; + sjinfo->semi_can_hash = false; + sjinfo->semi_operators = NIL; + sjinfo->semi_rhs_exprs = NIL; + + /* Nothing more to do if it's not a semijoin */ +#ifdef __TBASE__ + if (sjinfo->jointype != JOIN_SEMI && sjinfo->jointype != JOIN_LEFT_SCALAR) +#else + if (sjinfo->jointype != JOIN_SEMI) +#endif + return; + + /* + * Look to see whether the semijoin's join quals consist of AND'ed + * equality operators, with (only) RHS variables on only one side of each + * one. If so, we can figure out how to enforce uniqueness for the RHS. + * + * Note that the input clause list is the list of quals that are + * *syntactically* associated with the semijoin, which in practice means + * the synthesized comparison list for an IN or the WHERE of an EXISTS. 
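+	 *
+	 * (For instance, a hypothetical "o.cid IN (SELECT c.id FROM c)"
+	 * arrives here as the synthesized clause o.cid = c.id, while
+	 * "EXISTS (SELECT 1 FROM c WHERE c.id = o.cid)" arrives as the quals
+	 * of that WHERE clause.)
+	 *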
+ * Particularly in the latter case, it might contain clauses that aren't + * *semantically* associated with the join, but refer to just one side or + * the other. We can ignore such clauses here, as they will just drop + * down to be processed within one side or the other. (It is okay to + * consider only the syntactically-associated clauses here because for a + * semijoin, no higher-level quals could refer to the RHS, and so there + * can be no other quals that are semantically associated with this join. + * We do things this way because it is useful to have the set of potential + * unique-ification expressions before we can extract the list of quals + * that are actually semantically associated with the particular join.) + * + * Note that the semi_operators list consists of the joinqual operators + * themselves (but commuted if needed to put the RHS value on the right). + * These could be cross-type operators, in which case the operator + * actually needed for uniqueness is a related single-type operator. We + * assume here that that operator will be available from the btree or hash + * opclass when the time comes ... if not, create_unique_plan() will fail. + */ + semi_operators = NIL; + semi_rhs_exprs = NIL; + all_btree = true; + all_hash = enable_hashagg; /* don't consider hash if not enabled */ + foreach(lc, clause) + { + OpExpr *op = (OpExpr *) lfirst(lc); + Oid opno; + Node *left_expr; + Node *right_expr; + Relids left_varnos; + Relids right_varnos; + Relids all_varnos; + Oid opinputtype; + + /* Is it a binary opclause? */ + if (!IsA(op, OpExpr) || + list_length(op->args) != 2) + { + /* No, but does it reference both sides? */ + all_varnos = pull_varnos((Node *) op); + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better + * punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + /* Non-operator clause referencing both sides, must punt */ + return; + } + + /* Extract data from binary opclause */ + opno = op->opno; + left_expr = linitial(op->args); + right_expr = lsecond(op->args); + left_varnos = pull_varnos(left_expr); + right_varnos = pull_varnos(right_expr); + all_varnos = bms_union(left_varnos, right_varnos); + opinputtype = exprType(left_expr); + + /* Does it reference both sides? */ + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + + /* check rel membership of arguments */ + if (!bms_is_empty(right_varnos) && + bms_is_subset(right_varnos, sjinfo->syn_righthand) && + !bms_overlap(left_varnos, sjinfo->syn_righthand)) + { + /* typical case, right_expr is RHS variable */ + } + else if (!bms_is_empty(left_varnos) && + bms_is_subset(left_varnos, sjinfo->syn_righthand) && + !bms_overlap(right_varnos, sjinfo->syn_righthand)) + { + /* flipped case, left_expr is RHS variable */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) + return; + right_expr = left_expr; + } + else + { + /* mixed membership of args, punt */ + return; + } + + /* all operators must be btree equality or hash equality */ + if (all_btree) + { + /* oprcanmerge is considered a hint... 
*/ + if (!op_mergejoinable(opno, opinputtype) || + get_mergejoin_opfamilies(opno) == NIL) + all_btree = false; + } + if (all_hash) + { + /* ... but oprcanhash had better be correct */ + if (!op_hashjoinable(opno, opinputtype)) + all_hash = false; + } + if (!(all_btree || all_hash)) + return; + + /* so far so good, keep building lists */ + semi_operators = lappend_oid(semi_operators, opno); + semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr)); + } + + /* Punt if we didn't find at least one column to unique-ify */ + if (semi_rhs_exprs == NIL) + return; + + /* + * The expressions we'd need to unique-ify mustn't be volatile. + */ + if (contain_volatile_functions((Node *) semi_rhs_exprs)) + return; + + /* + * If we get here, we can unique-ify the semijoin's RHS using at least one + * of sorting and hashing. Save the information about how to do that. + */ + sjinfo->semi_can_btree = all_btree; + sjinfo->semi_can_hash = all_hash; + sjinfo->semi_operators = semi_operators; + sjinfo->semi_rhs_exprs = semi_rhs_exprs; } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 175058f9..9f1be6e4 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1653,119 +1653,122 @@ fix_scan_expr_walker(Node *node, fix_scan_expr_context *context) */ static void set_join_references(PlannerInfo *root, Join *join, int rtoffset) -{// #lizard forgives - Plan *outer_plan = join->plan.lefttree; - Plan *inner_plan = join->plan.righttree; - indexed_tlist *outer_itlist; - indexed_tlist *inner_itlist; - - outer_itlist = build_tlist_index(outer_plan->targetlist); - inner_itlist = build_tlist_index(inner_plan->targetlist); - - /* - * First process the joinquals (including merge or hash clauses). These - * are logically below the join so they can always use all values - * available from the input tlists. It's okay to also handle - * NestLoopParams now, because those couldn't refer to nullable - * subexpressions. - */ - join->joinqual = fix_join_expr(root, - join->joinqual, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - - /* Now do join-type-specific stuff */ - if (IsA(join, NestLoop)) - { - NestLoop *nl = (NestLoop *) join; - ListCell *lc; - - foreach(lc, nl->nestParams) - { - NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); - - nlp->paramval = (Var *) fix_upper_expr(root, - (Node *) nlp->paramval, - outer_itlist, - OUTER_VAR, - rtoffset); - - /* Check we replaced any PlaceHolderVar with simple Var */ - if (!(IsA(nlp->paramval, Var) && - nlp->paramval->varno == OUTER_VAR)) - elog(ERROR, "NestLoopParam was not reduced to a simple Var"); - } - } - else if (IsA(join, MergeJoin)) - { - MergeJoin *mj = (MergeJoin *) join; - - mj->mergeclauses = fix_join_expr(root, - mj->mergeclauses, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - } - else if (IsA(join, HashJoin)) - { - HashJoin *hj = (HashJoin *) join; - - hj->hashclauses = fix_join_expr(root, - hj->hashclauses, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - } +{ + Plan *outer_plan = join->plan.lefttree; + Plan *inner_plan = join->plan.righttree; + indexed_tlist *outer_itlist; + indexed_tlist *inner_itlist; + + outer_itlist = build_tlist_index(outer_plan->targetlist); + inner_itlist = build_tlist_index(inner_plan->targetlist); + + /* + * First process the joinquals (including merge or hash clauses). These + * are logically below the join so they can always use all values + * available from the input tlists. 
It's okay to also handle + * NestLoopParams now, because those couldn't refer to nullable + * subexpressions. + */ + join->joinqual = fix_join_expr(root, + join->joinqual, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + + /* Now do join-type-specific stuff */ + if (IsA(join, NestLoop)) + { + NestLoop *nl = (NestLoop *) join; + ListCell *lc; + + foreach(lc, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + + nlp->paramval = (Var *) fix_upper_expr(root, + (Node *) nlp->paramval, + outer_itlist, + OUTER_VAR, + rtoffset); + + /* Check we replaced any PlaceHolderVar with simple Var */ + if (!(IsA(nlp->paramval, Var) && + nlp->paramval->varno == OUTER_VAR)) + elog(ERROR, "NestLoopParam was not reduced to a simple Var"); + } + } + else if (IsA(join, MergeJoin)) + { + MergeJoin *mj = (MergeJoin *) join; + + mj->mergeclauses = fix_join_expr(root, + mj->mergeclauses, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + } + else if (IsA(join, HashJoin)) + { + HashJoin *hj = (HashJoin *) join; + + hj->hashclauses = fix_join_expr(root, + hj->hashclauses, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + } - /* - * Now we need to fix up the targetlist and qpqual, which are logically - * above the join. This means they should not re-use any input expression - * that was computed in the nullable side of an outer join. Vars and - * PlaceHolderVars are fine, so we can implement this restriction just by - * clearing has_non_vars in the indexed_tlist structs. - * - * XXX This is a grotty workaround for the fact that we don't clearly - * distinguish between a Var appearing below an outer join and the "same" - * Var appearing above it. If we did, we'd not need to hack the matching - * rules this way. - */ - switch (join->jointype) - { - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - inner_itlist->has_non_vars = false; - break; - case JOIN_RIGHT: - outer_itlist->has_non_vars = false; - break; - case JOIN_FULL: - outer_itlist->has_non_vars = false; - inner_itlist->has_non_vars = false; - break; - default: - break; - } + /* + * Now we need to fix up the targetlist and qpqual, which are logically + * above the join. This means they should not re-use any input expression + * that was computed in the nullable side of an outer join. Vars and + * PlaceHolderVars are fine, so we can implement this restriction just by + * clearing has_non_vars in the indexed_tlist structs. + * + * XXX This is a grotty workaround for the fact that we don't clearly + * distinguish between a Var appearing below an outer join and the "same" + * Var appearing above it. If we did, we'd not need to hack the matching + * rules this way. 
+ */ + switch (join->jointype) + { + case JOIN_LEFT: + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + inner_itlist->has_non_vars = false; + break; + case JOIN_RIGHT: + outer_itlist->has_non_vars = false; + break; + case JOIN_FULL: + outer_itlist->has_non_vars = false; + inner_itlist->has_non_vars = false; + break; + default: + break; + } - join->plan.targetlist = fix_join_expr(root, - join->plan.targetlist, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - join->plan.qual = fix_join_expr(root, - join->plan.qual, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - - pfree(outer_itlist); - pfree(inner_itlist); + join->plan.targetlist = fix_join_expr(root, + join->plan.targetlist, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + join->plan.qual = fix_join_expr(root, + join->plan.qual, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + + pfree(outer_itlist); + pfree(inner_itlist); } /* diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 1ebdfefc..bff6e3fd 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1641,7 +1641,9 @@ append_var_to_subquery_targetlist(Var *var, List *targetList, TargetEntry **targ ent->resno = varno; var->varattno = var->varoattno = varno; - *target = ent; + + if(target != NULL) + *target = ent; return targetList; } @@ -2605,8 +2607,6 @@ convert_EXPR_sublink_to_join(PlannerInfo *root, OpExpr *expr, ent->resno = varno; - //var->varattno = var->varoattno = varno; - /* determine the eqop and optional sortop */ get_sort_group_operators(restype, false, true, false, @@ -2771,6 +2771,123 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis return node; } +#ifdef __TBASE__ +/* + * convert_TargetList_sublink_to_join : + * try to convert an EXISTS SubLink in targetlist to a join + * On success, it returns not NULL. + */ +TargetEntry * +convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) +{ + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; + RangeTblEntry *rte = NULL; + Var *var = NULL; + + /* Sanity check */ + if (!IsA(entry->expr, SubLink)) + return NULL; + + sublink = (SubLink *) entry->expr; + if (sublink->subLinkType != EXPR_SUBLINK) + return NULL; + + /* + * Copy object so that we can modify it. + */ + subselect = copyObject((Query *) sublink->subselect); + whereClause = subselect->jointree->quals; + + /* + * Only one targetEntry can be handled. + */ + if (list_length(subselect->targetList) > 1) + return NULL; + + /* + * The subquery must have a nonempty jointree, else we won't have a join. + */ + if (subselect->jointree->fromlist == NIL) + return NULL; + + /* + * What we can not optimize. + */ + if (subselect->commandType != CMD_SELECT || + subselect->hasAggs || subselect->hasDistinctOn || + subselect->setOperations || subselect->groupingSets || + subselect->groupClause || subselect->hasWindowFuncs || + subselect->hasTargetSRFs || subselect->hasModifyingCTE || + subselect->havingQual || subselect->limitOffset || + subselect->limitCount || subselect->rowMarks || + subselect->cteList || subselect->sortClause) + { + return NULL; + } + + /* + * On one hand, the WHERE clause must contain some Vars of the + * parent query, else it's not gonna be a join. 
+ */ + if (!contain_vars_of_level(whereClause, 1)) + return NULL; + + /* + * We don't risk optimizing if the WHERE clause is volatile, either. + */ + if (contain_volatile_functions(whereClause)) + return NULL; + + /* + * The rest of the sub-select must not refer to any Vars of the parent + * query. (Vars of higher levels should be okay, though.) + */ + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; + + /* + * Move sub-select to the parent query. + */ + pstate = make_parsestate(NULL); + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("TARGETLIST_subquery", NIL), + true, + false); + parse->rtable = lappend(parse->rtable, rte); + + rtr = makeNode(RangeTblRef); + rtr->rtindex = list_length(parse->rtable); + + /* + * Form join node. + */ + joinExpr = makeNode(JoinExpr); + joinExpr->jointype = JOIN_LEFT_SCALAR; + joinExpr->isNatural = false; + joinExpr->larg = (Node *) root->parse->jointree; + joinExpr->rarg = (Node *) rtr; + joinExpr->usingClause = NIL; + joinExpr->alias = NULL; + joinExpr->rtindex = 0; /* we don't need an RTE for it */ + joinExpr->quals = NULL; + + /* Wrap join node in FromExpr as required. */ + parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); + + /* Replace sublink node with Var. */ + var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + entry->expr = (Expr *) var; + return entry; +} +#endif + static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtlink) { @@ -2825,7 +2942,9 @@ convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtl Oid restype; SortGroupClause *grpcl; TargetEntry *entry; - subselect->targetList = append_var_to_subquery_targetlist((Var *)lfirst(cell), subselect->targetList, &entry); + + subselect->targetList = append_var_to_subquery_targetlist((Var *)lfirst(cell), + subselect->targetList, &entry); restype = exprType((Node *)entry->expr); get_sort_group_operators(restype, false, true, false, diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 824d6e0a..b7cbd5c0 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -216,22 +216,48 @@ static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, Node *node void pull_up_sublinks(PlannerInfo *root) { - Node *jtnode; - Relids relids; - - /* Begin recursion through the jointree */ - jtnode = pull_up_sublinks_jointree_recurse(root, - (Node *) root->parse->jointree, - &relids); + Node *jtnode; + Relids relids; +#ifdef __TBASE__ /* - * root->parse->jointree must always be a FromExpr, so insert a dummy one - * if we got a bare RangeTblRef or JoinExpr out of the recursion. + * Look for SubLinks in targetlist, and try to transform them into joins. 
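+	 * As an illustrative note (the sample query is the one exercised by the
+	 * subselect regression test later in this diff), a scalar sublink in the
+	 * targetlist such as
+	 *
+	 *     select a.a, (select b.a from tbl_b b where b.a = a.a) from tbl_a a;
+	 *
+	 * is converted into a JOIN_LEFT_SCALAR join between tbl_a and the
+	 * sub-select: unmatched outer rows are kept (NULL-extended), and an
+	 * error is still raised if more than one inner row matches a given
+	 * outer row.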
*/ - if (IsA(jtnode, FromExpr)) - root->parse->jointree = (FromExpr *) jtnode; - else - root->parse->jointree = makeFromExpr(list_make1(jtnode), NULL); + if(enable_pullup_subquery) + { + List *new_targetList = NIL; + ListCell *lc = NULL; + TargetEntry *entry = NULL; + TargetEntry *new_entry = NULL; + + foreach(lc, root->parse->targetList) + { + entry = (TargetEntry *) lfirst(lc); + + new_entry = convert_TargetList_sublink_to_join(root, entry); + if (new_entry) + new_targetList = lappend(new_targetList, new_entry); + else + new_targetList = lappend(new_targetList, entry); + } + + root->parse->targetList = new_targetList; + } +#endif + + /* Begin recursion through the jointree */ + jtnode = pull_up_sublinks_jointree_recurse(root, + (Node *) root->parse->jointree, + &relids); + + /* + * root->parse->jointree must always be a FromExpr, so insert a dummy one + * if we got a bare RangeTblRef or JoinExpr out of the recursion. + */ + if (IsA(jtnode, FromExpr)) + root->parse->jointree = (FromExpr *) jtnode; + else + root->parse->jointree = makeFromExpr(list_make1(jtnode), NULL); } /* @@ -296,98 +322,101 @@ pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, } #endif - /* - * Note that the result will be either newf, or a stack of JoinExprs - * with newf at the base. We rely on subsequent optimization steps to - * flatten this and rearrange the joins as needed. - * - * Although we could include the pulled-up subqueries in the returned - * relids, there's no need since upper quals couldn't refer to their - * outputs anyway. - */ - *relids = frelids; - jtnode = jtlink; - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j; - Relids leftrelids; - Relids rightrelids; - Node *jtlink; - - /* - * Make a modifiable copy of join node, but don't bother copying its - * subnodes (yet). - */ - j = (JoinExpr *) palloc(sizeof(JoinExpr)); - memcpy(j, jtnode, sizeof(JoinExpr)); - jtlink = (Node *) j; - - /* Recurse to process children and collect their relids */ - j->larg = pull_up_sublinks_jointree_recurse(root, j->larg, - &leftrelids); - j->rarg = pull_up_sublinks_jointree_recurse(root, j->rarg, - &rightrelids); - - /* - * Now process qual, showing appropriate child relids as available, - * and attach any pulled-up jointree items at the right place. In the - * inner-join case we put new JoinExprs above the existing one (much - * as for a FromExpr-style join). In outer-join cases the new - * JoinExprs must go into the nullable side of the outer join. The - * point of the available_rels machinations is to ensure that we only - * pull up quals for which that's okay. - * - * We don't expect to see any pre-existing JOIN_SEMI or JOIN_ANTI - * nodes here. - */ - switch (j->jointype) - { - case JOIN_INNER: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &jtlink, - bms_union(leftrelids, - rightrelids), - NULL, NULL); - break; - case JOIN_LEFT: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &j->rarg, - rightrelids, - NULL, NULL); - break; - case JOIN_FULL: - /* can't do anything with full-join quals */ - break; - case JOIN_RIGHT: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &j->larg, - leftrelids, - NULL, NULL); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - break; - } + /* + * Note that the result will be either newf, or a stack of JoinExprs + * with newf at the base. We rely on subsequent optimization steps to + * flatten this and rearrange the joins as needed. 
+ * + * Although we could include the pulled-up subqueries in the returned + * relids, there's no need since upper quals couldn't refer to their + * outputs anyway. + */ + *relids = frelids; + jtnode = jtlink; + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j; + Relids leftrelids; + Relids rightrelids; + Node *jtlink; + + /* + * Make a modifiable copy of join node, but don't bother copying its + * subnodes (yet). + */ + j = (JoinExpr *) palloc(sizeof(JoinExpr)); + memcpy(j, jtnode, sizeof(JoinExpr)); + jtlink = (Node *) j; + + /* Recurse to process children and collect their relids */ + j->larg = pull_up_sublinks_jointree_recurse(root, j->larg, + &leftrelids); + j->rarg = pull_up_sublinks_jointree_recurse(root, j->rarg, + &rightrelids); + + /* + * Now process qual, showing appropriate child relids as available, + * and attach any pulled-up jointree items at the right place. In the + * inner-join case we put new JoinExprs above the existing one (much + * as for a FromExpr-style join). In outer-join cases the new + * JoinExprs must go into the nullable side of the outer join. The + * point of the available_rels machinations is to ensure that we only + * pull up quals for which that's okay. + * + * We don't expect to see any pre-existing JOIN_SEMI or JOIN_ANTI + * nodes here. + */ + switch (j->jointype) + { + case JOIN_INNER: + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &jtlink, + bms_union(leftrelids, + rightrelids), + NULL, NULL); + break; + case JOIN_LEFT: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &j->rarg, + rightrelids, + NULL, NULL); + break; + case JOIN_FULL: + /* can't do anything with full-join quals */ + break; + case JOIN_RIGHT: + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &j->larg, + leftrelids, + NULL, NULL); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + break; + } - /* - * Although we could include the pulled-up subqueries in the returned - * relids, there's no need since upper quals couldn't refer to their - * outputs anyway. But we *do* need to include the join's own rtindex - * because we haven't yet collapsed join alias variables, so upper - * levels would mistakenly think they couldn't use references to this - * join. - */ - *relids = bms_join(leftrelids, rightrelids); - if (j->rtindex) - *relids = bms_add_member(*relids, j->rtindex); - jtnode = jtlink; - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return jtnode; + /* + * Although we could include the pulled-up subqueries in the returned + * relids, there's no need since upper quals couldn't refer to their + * outputs anyway. But we *do* need to include the join's own rtindex + * because we haven't yet collapsed join alias variables, so upper + * levels would mistakenly think they couldn't use references to this + * join. 
+ */ + *relids = bms_join(leftrelids, rightrelids); + if (j->rtindex) + *relids = bms_add_member(*relids, j->rtindex); + jtnode = jtlink; + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return jtnode; } #ifdef __TBASE__ @@ -1073,185 +1102,188 @@ pull_up_subqueries(PlannerInfo *root) */ static Node * pull_up_subqueries_recurse(PlannerInfo *root, Node *jtnode, - JoinExpr *lowest_outer_join, - JoinExpr *lowest_nulling_outer_join, - AppendRelInfo *containing_appendrel, - bool deletion_ok) -{// #lizard forgives - Assert(jtnode != NULL); - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, root->parse->rtable); - - /* - * Is this a subquery RTE, and if so, is the subquery simple enough to - * pull up? - * - * If we are looking at an append-relation member, we can't pull it up - * unless is_safe_append_member says so. - */ - if (rte->rtekind == RTE_SUBQUERY && - is_simple_subquery(rte->subquery, rte, - lowest_outer_join, deletion_ok) && - (containing_appendrel == NULL || - is_safe_append_member(rte->subquery))) - return pull_up_simple_subquery(root, jtnode, rte, - lowest_outer_join, - lowest_nulling_outer_join, - containing_appendrel, - deletion_ok); - - /* - * Alternatively, is it a simple UNION ALL subquery? If so, flatten - * into an "append relation". - * - * It's safe to do this regardless of whether this query is itself an - * appendrel member. (If you're thinking we should try to flatten the - * two levels of appendrel together, you're right; but we handle that - * in set_append_rel_pathlist, not here.) - */ - if (rte->rtekind == RTE_SUBQUERY && - is_simple_union_all(rte->subquery)) - return pull_up_simple_union_all(root, jtnode, rte); - - /* - * Or perhaps it's a simple VALUES RTE? - * - * We don't allow VALUES pullup below an outer join nor into an - * appendrel (such cases are impossible anyway at the moment). - */ - if (rte->rtekind == RTE_VALUES && - lowest_outer_join == NULL && - containing_appendrel == NULL && - is_simple_values(root, rte, deletion_ok)) - return pull_up_simple_values(root, jtnode, rte); - - /* Otherwise, do nothing at this node. */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - bool have_undeleted_child = false; - ListCell *l; - - Assert(containing_appendrel == NULL); + JoinExpr *lowest_outer_join, + JoinExpr *lowest_nulling_outer_join, + AppendRelInfo *containing_appendrel, + bool deletion_ok) +{ + Assert(jtnode != NULL); + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, root->parse->rtable); + + /* + * Is this a subquery RTE, and if so, is the subquery simple enough to + * pull up? + * + * If we are looking at an append-relation member, we can't pull it up + * unless is_safe_append_member says so. + */ + if (rte->rtekind == RTE_SUBQUERY && + is_simple_subquery(rte->subquery, rte, + lowest_outer_join, deletion_ok) && + (containing_appendrel == NULL || + is_safe_append_member(rte->subquery))) + return pull_up_simple_subquery(root, jtnode, rte, + lowest_outer_join, + lowest_nulling_outer_join, + containing_appendrel, + deletion_ok); + + /* + * Alternatively, is it a simple UNION ALL subquery? If so, flatten + * into an "append relation". + * + * It's safe to do this regardless of whether this query is itself an + * appendrel member. 
(If you're thinking we should try to flatten the + * two levels of appendrel together, you're right; but we handle that + * in set_append_rel_pathlist, not here.) + */ + if (rte->rtekind == RTE_SUBQUERY && + is_simple_union_all(rte->subquery)) + return pull_up_simple_union_all(root, jtnode, rte); + + /* + * Or perhaps it's a simple VALUES RTE? + * + * We don't allow VALUES pullup below an outer join nor into an + * appendrel (such cases are impossible anyway at the moment). + */ + if (rte->rtekind == RTE_VALUES && + lowest_outer_join == NULL && + containing_appendrel == NULL && + is_simple_values(root, rte, deletion_ok)) + return pull_up_simple_values(root, jtnode, rte); + + /* Otherwise, do nothing at this node. */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + bool have_undeleted_child = false; + ListCell *l; - /* - * If the FromExpr has quals, it's not deletable even if its parent - * would allow deletion. - */ - if (f->quals) - deletion_ok = false; + Assert(containing_appendrel == NULL); - foreach(l, f->fromlist) - { - /* - * In a non-deletable FromExpr, we can allow deletion of child - * nodes so long as at least one child remains; so it's okay - * either if any previous child survives, or if there's more to - * come. If all children are deletable in themselves, we'll force - * the last one to remain unflattened. - * - * As a separate matter, we can allow deletion of all children of - * the top-level FromExpr in a query, since that's a special case - * anyway. - */ - bool sub_deletion_ok = (deletion_ok || - have_undeleted_child || - lnext(l) != NULL || - f == root->parse->jointree); - - lfirst(l) = pull_up_subqueries_recurse(root, lfirst(l), - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - sub_deletion_ok); - if (lfirst(l) != NULL) - have_undeleted_child = true; - } + /* + * If the FromExpr has quals, it's not deletable even if its parent + * would allow deletion. + */ + if (f->quals) + deletion_ok = false; - if (deletion_ok && !have_undeleted_child) - { - /* OK to delete this FromExpr entirely */ - root->hasDeletedRTEs = true; /* probably is set already */ - return NULL; - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; + foreach(l, f->fromlist) + { + /* + * In a non-deletable FromExpr, we can allow deletion of child + * nodes so long as at least one child remains; so it's okay + * either if any previous child survives, or if there's more to + * come. If all children are deletable in themselves, we'll force + * the last one to remain unflattened. + * + * As a separate matter, we can allow deletion of all children of + * the top-level FromExpr in a query, since that's a special case + * anyway. + */ + bool sub_deletion_ok = (deletion_ok || + have_undeleted_child || + lnext(l) != NULL || + f == root->parse->jointree); + + lfirst(l) = pull_up_subqueries_recurse(root, lfirst(l), + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + sub_deletion_ok); + if (lfirst(l) != NULL) + have_undeleted_child = true; + } - Assert(containing_appendrel == NULL); - /* Recurse, being careful to tell myself when inside outer join */ - switch (j->jointype) - { - case JOIN_INNER: + if (deletion_ok && !have_undeleted_child) + { + /* OK to delete this FromExpr entirely */ + root->hasDeletedRTEs = true; /* probably is set already */ + return NULL; + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; - /* - * INNER JOIN can allow deletion of either child node, but not - * both. 
So right child gets permission to delete only if - * left child didn't get removed. - */ - j->larg = pull_up_subqueries_recurse(root, j->larg, - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - true); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - j->larg != NULL); - break; - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - lowest_nulling_outer_join, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - j, - NULL, - false); - break; - case JOIN_FULL: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - j, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - j, - NULL, - false); - break; - case JOIN_RIGHT: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - j, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - lowest_nulling_outer_join, - NULL, - false); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - break; - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return jtnode; + Assert(containing_appendrel == NULL); + /* Recurse, being careful to tell myself when inside outer join */ + switch (j->jointype) + { + case JOIN_INNER: + + /* + * INNER JOIN can allow deletion of either child node, but not + * both. So right child gets permission to delete only if + * left child didn't get removed. + */ + j->larg = pull_up_subqueries_recurse(root, j->larg, + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + true); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + j->larg != NULL); + break; + case JOIN_LEFT: + case JOIN_SEMI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + case JOIN_ANTI: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + lowest_nulling_outer_join, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + j, + NULL, + false); + break; + case JOIN_FULL: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + j, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + j, + NULL, + false); + break; + case JOIN_RIGHT: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + j, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + lowest_nulling_outer_join, + NULL, + false); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + break; + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return jtnode; } /* @@ -2957,277 +2989,281 @@ reduce_outer_joins_pass1(Node *jtnode) */ static void reduce_outer_joins_pass2(Node *jtnode, - reduce_outer_joins_state *state, - PlannerInfo *root, - Relids nonnullable_rels, - List *nonnullable_vars, - List *forced_null_vars) -{// #lizard forgives - /* - * pass 2 should never descend as far as an empty subnode or base rel, - * because it's only called on subtrees marked as contains_outer. 
- */ - if (jtnode == NULL) - elog(ERROR, "reached empty jointree"); - if (IsA(jtnode, RangeTblRef)) - elog(ERROR, "reached base rel"); - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *l; - ListCell *s; - Relids pass_nonnullable_rels; - List *pass_nonnullable_vars; - List *pass_forced_null_vars; - - /* Scan quals to see if we can add any constraints */ - pass_nonnullable_rels = find_nonnullable_rels(f->quals); - pass_nonnullable_rels = bms_add_members(pass_nonnullable_rels, - nonnullable_rels); - /* NB: we rely on list_concat to not damage its second argument */ - pass_nonnullable_vars = find_nonnullable_vars(f->quals); - pass_nonnullable_vars = list_concat(pass_nonnullable_vars, - nonnullable_vars); - pass_forced_null_vars = find_forced_null_vars(f->quals); - pass_forced_null_vars = list_concat(pass_forced_null_vars, - forced_null_vars); - /* And recurse --- but only into interesting subtrees */ - Assert(list_length(f->fromlist) == list_length(state->sub_states)); - forboth(l, f->fromlist, s, state->sub_states) - { - reduce_outer_joins_state *sub_state = lfirst(s); - - if (sub_state->contains_outer) - reduce_outer_joins_pass2(lfirst(l), sub_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } - bms_free(pass_nonnullable_rels); - /* can't so easily clean up var lists, unfortunately */ - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - int rtindex = j->rtindex; - JoinType jointype = j->jointype; - reduce_outer_joins_state *left_state = linitial(state->sub_states); - reduce_outer_joins_state *right_state = lsecond(state->sub_states); - List *local_nonnullable_vars = NIL; - bool computed_local_nonnullable_vars = false; - - /* Can we simplify this join? */ - switch (jointype) - { - case JOIN_INNER: - break; - case JOIN_LEFT: - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_INNER; - break; - case JOIN_RIGHT: - if (bms_overlap(nonnullable_rels, left_state->relids)) - jointype = JOIN_INNER; - break; - case JOIN_FULL: - if (bms_overlap(nonnullable_rels, left_state->relids)) - { - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_INNER; - else - jointype = JOIN_LEFT; - } - else - { - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_RIGHT; - } - break; - case JOIN_SEMI: - case JOIN_ANTI: - - /* - * These could only have been introduced by pull_up_sublinks, - * so there's no way that upper quals could refer to their - * righthand sides, and no point in checking. - */ - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) jointype); - break; - } - - /* - * Convert JOIN_RIGHT to JOIN_LEFT. Note that in the case where we - * reduced JOIN_FULL to JOIN_RIGHT, this will mean the JoinExpr no - * longer matches the internal ordering of any CoalesceExpr's built to - * represent merged join variables. We don't care about that at - * present, but be wary of it ... - */ - if (jointype == JOIN_RIGHT) - { - Node *tmparg; - - tmparg = j->larg; - j->larg = j->rarg; - j->rarg = tmparg; - jointype = JOIN_LEFT; - right_state = linitial(state->sub_states); - left_state = lsecond(state->sub_states); - } - - /* - * See if we can reduce JOIN_LEFT to JOIN_ANTI. This is the case if - * the join's own quals are strict for any var that was forced null by - * higher qual levels. 
NOTE: there are other ways that we could - * detect an anti-join, in particular if we were to check whether Vars - * coming from the RHS must be non-null because of table constraints. - * That seems complicated and expensive though (in particular, one - * would have to be wary of lower outer joins). For the moment this - * seems sufficient. - */ - if (jointype == JOIN_LEFT) - { - List *overlap; + reduce_outer_joins_state *state, + PlannerInfo *root, + Relids nonnullable_rels, + List *nonnullable_vars, + List *forced_null_vars) +{ + /* + * pass 2 should never descend as far as an empty subnode or base rel, + * because it's only called on subtrees marked as contains_outer. + */ + if (jtnode == NULL) + elog(ERROR, "reached empty jointree"); + if (IsA(jtnode, RangeTblRef)) + elog(ERROR, "reached base rel"); + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *l; + ListCell *s; + Relids pass_nonnullable_rels; + List *pass_nonnullable_vars; + List *pass_forced_null_vars; + + /* Scan quals to see if we can add any constraints */ + pass_nonnullable_rels = find_nonnullable_rels(f->quals); + pass_nonnullable_rels = bms_add_members(pass_nonnullable_rels, + nonnullable_rels); + /* NB: we rely on list_concat to not damage its second argument */ + pass_nonnullable_vars = find_nonnullable_vars(f->quals); + pass_nonnullable_vars = list_concat(pass_nonnullable_vars, + nonnullable_vars); + pass_forced_null_vars = find_forced_null_vars(f->quals); + pass_forced_null_vars = list_concat(pass_forced_null_vars, + forced_null_vars); + /* And recurse --- but only into interesting subtrees */ + Assert(list_length(f->fromlist) == list_length(state->sub_states)); + forboth(l, f->fromlist, s, state->sub_states) + { + reduce_outer_joins_state *sub_state = lfirst(s); - local_nonnullable_vars = find_nonnullable_vars(j->quals); - computed_local_nonnullable_vars = true; + if (sub_state->contains_outer) + reduce_outer_joins_pass2(lfirst(l), sub_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } + bms_free(pass_nonnullable_rels); + /* can't so easily clean up var lists, unfortunately */ + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + int rtindex = j->rtindex; + JoinType jointype = j->jointype; + reduce_outer_joins_state *left_state = linitial(state->sub_states); + reduce_outer_joins_state *right_state = lsecond(state->sub_states); + List *local_nonnullable_vars = NIL; + bool computed_local_nonnullable_vars = false; + + /* Can we simplify this join? */ + switch (jointype) + { + case JOIN_INNER: + break; + case JOIN_LEFT: + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_INNER; + break; + case JOIN_RIGHT: + if (bms_overlap(nonnullable_rels, left_state->relids)) + jointype = JOIN_INNER; + break; + case JOIN_FULL: + if (bms_overlap(nonnullable_rels, left_state->relids)) + { + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_INNER; + else + jointype = JOIN_LEFT; + } + else + { + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_RIGHT; + } + break; + case JOIN_SEMI: + case JOIN_ANTI: + + /* + * These could only have been introduced by pull_up_sublinks, + * so there's no way that upper quals could refer to their + * righthand sides, and no point in checking. 
+ */ + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) jointype); + break; + } - /* - * It's not sufficient to check whether local_nonnullable_vars and - * forced_null_vars overlap: we need to know if the overlap - * includes any RHS variables. - */ - overlap = list_intersection(local_nonnullable_vars, - forced_null_vars); - if (overlap != NIL && - bms_overlap(pull_varnos((Node *) overlap), - right_state->relids)) - jointype = JOIN_ANTI; - } + /* + * Convert JOIN_RIGHT to JOIN_LEFT. Note that in the case where we + * reduced JOIN_FULL to JOIN_RIGHT, this will mean the JoinExpr no + * longer matches the internal ordering of any CoalesceExpr's built to + * represent merged join variables. We don't care about that at + * present, but be wary of it ... + */ + if (jointype == JOIN_RIGHT) + { + Node *tmparg; + + tmparg = j->larg; + j->larg = j->rarg; + j->rarg = tmparg; + jointype = JOIN_LEFT; + right_state = linitial(state->sub_states); + left_state = lsecond(state->sub_states); + } - /* Apply the jointype change, if any, to both jointree node and RTE */ - if (rtindex && jointype != j->jointype) - { - RangeTblEntry *rte = rt_fetch(rtindex, root->parse->rtable); + /* + * See if we can reduce JOIN_LEFT to JOIN_ANTI. This is the case if + * the join's own quals are strict for any var that was forced null by + * higher qual levels. NOTE: there are other ways that we could + * detect an anti-join, in particular if we were to check whether Vars + * coming from the RHS must be non-null because of table constraints. + * That seems complicated and expensive though (in particular, one + * would have to be wary of lower outer joins). For the moment this + * seems sufficient. + */ + if (jointype == JOIN_LEFT) + { + List *overlap; + + local_nonnullable_vars = find_nonnullable_vars(j->quals); + computed_local_nonnullable_vars = true; + + /* + * It's not sufficient to check whether local_nonnullable_vars and + * forced_null_vars overlap: we need to know if the overlap + * includes any RHS variables. + */ + overlap = list_intersection(local_nonnullable_vars, + forced_null_vars); + if (overlap != NIL && + bms_overlap(pull_varnos((Node *) overlap), + right_state->relids)) + jointype = JOIN_ANTI; + } - Assert(rte->rtekind == RTE_JOIN); - Assert(rte->jointype == j->jointype); - rte->jointype = jointype; - } - j->jointype = jointype; + /* Apply the jointype change, if any, to both jointree node and RTE */ + if (rtindex && jointype != j->jointype) + { + RangeTblEntry *rte = rt_fetch(rtindex, root->parse->rtable); - /* Only recurse if there's more to do below here */ - if (left_state->contains_outer || right_state->contains_outer) - { - Relids local_nonnullable_rels; - List *local_forced_null_vars; - Relids pass_nonnullable_rels; - List *pass_nonnullable_vars; - List *pass_forced_null_vars; + Assert(rte->rtekind == RTE_JOIN); + Assert(rte->jointype == j->jointype); + rte->jointype = jointype; + } + j->jointype = jointype; - /* - * If this join is (now) inner, we can add any constraints its - * quals provide to those we got from above. But if it is outer, - * we can pass down the local constraints only into the nullable - * side, because an outer join never eliminates any rows from its - * non-nullable side. Also, there is no point in passing upper - * constraints into the nullable side, since if there were any - * we'd have been able to reduce the join. (In the case of upper - * forced-null constraints, we *must not* pass them into the - * nullable side --- they either applied here, or not.) 
The upshot - * is that we pass either the local or the upper constraints, - * never both, to the children of an outer join. - * - * Note that a SEMI join works like an inner join here: it's okay - * to pass down both local and upper constraints. (There can't be - * any upper constraints affecting its inner side, but it's not - * worth having a separate code path to avoid passing them.) - * - * At a FULL join we just punt and pass nothing down --- is it - * possible to be smarter? - */ - if (jointype != JOIN_FULL) - { - local_nonnullable_rels = find_nonnullable_rels(j->quals); - if (!computed_local_nonnullable_vars) - local_nonnullable_vars = find_nonnullable_vars(j->quals); - local_forced_null_vars = find_forced_null_vars(j->quals); - if (jointype == JOIN_INNER || jointype == JOIN_SEMI) - { - /* OK to merge upper and local constraints */ - local_nonnullable_rels = bms_add_members(local_nonnullable_rels, - nonnullable_rels); - local_nonnullable_vars = list_concat(local_nonnullable_vars, - nonnullable_vars); - local_forced_null_vars = list_concat(local_forced_null_vars, - forced_null_vars); - } - } - else - { - /* no use in calculating these */ - local_nonnullable_rels = NULL; - local_forced_null_vars = NIL; - } + /* Only recurse if there's more to do below here */ + if (left_state->contains_outer || right_state->contains_outer) + { + Relids local_nonnullable_rels; + List *local_forced_null_vars; + Relids pass_nonnullable_rels; + List *pass_nonnullable_vars; + List *pass_forced_null_vars; + + /* + * If this join is (now) inner, we can add any constraints its + * quals provide to those we got from above. But if it is outer, + * we can pass down the local constraints only into the nullable + * side, because an outer join never eliminates any rows from its + * non-nullable side. Also, there is no point in passing upper + * constraints into the nullable side, since if there were any + * we'd have been able to reduce the join. (In the case of upper + * forced-null constraints, we *must not* pass them into the + * nullable side --- they either applied here, or not.) The upshot + * is that we pass either the local or the upper constraints, + * never both, to the children of an outer join. + * + * Note that a SEMI join works like an inner join here: it's okay + * to pass down both local and upper constraints. (There can't be + * any upper constraints affecting its inner side, but it's not + * worth having a separate code path to avoid passing them.) + * + * At a FULL join we just punt and pass nothing down --- is it + * possible to be smarter? 
+ */ + if (jointype != JOIN_FULL) + { + local_nonnullable_rels = find_nonnullable_rels(j->quals); + if (!computed_local_nonnullable_vars) + local_nonnullable_vars = find_nonnullable_vars(j->quals); + local_forced_null_vars = find_forced_null_vars(j->quals); +#ifdef __TBASE__ + if (jointype == JOIN_INNER || jointype == JOIN_SEMI || jointype == JOIN_LEFT_SCALAR) +#else + if (jointype == JOIN_INNER || jointype == JOIN_SEMI) +#endif + { + /* OK to merge upper and local constraints */ + local_nonnullable_rels = bms_add_members(local_nonnullable_rels, + nonnullable_rels); + local_nonnullable_vars = list_concat(local_nonnullable_vars, + nonnullable_vars); + local_forced_null_vars = list_concat(local_forced_null_vars, + forced_null_vars); + } + } + else + { + /* no use in calculating these */ + local_nonnullable_rels = NULL; + local_forced_null_vars = NIL; + } - if (left_state->contains_outer) - { - if (jointype == JOIN_INNER || jointype == JOIN_SEMI) - { - /* pass union of local and upper constraints */ - pass_nonnullable_rels = local_nonnullable_rels; - pass_nonnullable_vars = local_nonnullable_vars; - pass_forced_null_vars = local_forced_null_vars; - } - else if (jointype != JOIN_FULL) /* ie, LEFT or ANTI */ - { - /* can't pass local constraints to non-nullable side */ - pass_nonnullable_rels = nonnullable_rels; - pass_nonnullable_vars = nonnullable_vars; - pass_forced_null_vars = forced_null_vars; - } - else - { - /* no constraints pass through JOIN_FULL */ - pass_nonnullable_rels = NULL; - pass_nonnullable_vars = NIL; - pass_forced_null_vars = NIL; - } - reduce_outer_joins_pass2(j->larg, left_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } + if (left_state->contains_outer) + { + if (jointype == JOIN_INNER || jointype == JOIN_SEMI) + { + /* pass union of local and upper constraints */ + pass_nonnullable_rels = local_nonnullable_rels; + pass_nonnullable_vars = local_nonnullable_vars; + pass_forced_null_vars = local_forced_null_vars; + } + else if (jointype != JOIN_FULL) /* ie, LEFT or ANTI */ + { + /* can't pass local constraints to non-nullable side */ + pass_nonnullable_rels = nonnullable_rels; + pass_nonnullable_vars = nonnullable_vars; + pass_forced_null_vars = forced_null_vars; + } + else + { + /* no constraints pass through JOIN_FULL */ + pass_nonnullable_rels = NULL; + pass_nonnullable_vars = NIL; + pass_forced_null_vars = NIL; + } + reduce_outer_joins_pass2(j->larg, left_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } - if (right_state->contains_outer) - { - if (jointype != JOIN_FULL) /* ie, INNER/LEFT/SEMI/ANTI */ - { - /* pass appropriate constraints, per comment above */ - pass_nonnullable_rels = local_nonnullable_rels; - pass_nonnullable_vars = local_nonnullable_vars; - pass_forced_null_vars = local_forced_null_vars; - } - else - { - /* no constraints pass through JOIN_FULL */ - pass_nonnullable_rels = NULL; - pass_nonnullable_vars = NIL; - pass_forced_null_vars = NIL; - } - reduce_outer_joins_pass2(j->rarg, right_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } - bms_free(local_nonnullable_rels); - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); + if (right_state->contains_outer) + { + if (jointype != JOIN_FULL) /* ie, INNER/LEFT/SEMI/ANTI */ + { + /* pass appropriate constraints, per comment above */ + pass_nonnullable_rels = local_nonnullable_rels; + pass_nonnullable_vars = local_nonnullable_vars; + 
pass_forced_null_vars = local_forced_null_vars; + } + else + { + /* no constraints pass through JOIN_FULL */ + pass_nonnullable_rels = NULL; + pass_nonnullable_vars = NIL; + pass_forced_null_vars = NIL; + } + reduce_outer_joins_pass2(j->rarg, right_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } + bms_free(local_nonnullable_rels); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); } /* diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ad966eb2..9b89cb4a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1721,32 +1721,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } - /* - * Check if we have inner replicated - * The "both replicated" case is already checked, so if innerd - * is replicated, then outerd is not replicated and it is not NULL. - * This case is not acceptable for some join types. If outer relation is - * nullable data nodes will produce joined rows with NULLs for cases when - * matching row exists, but on other data node. - */ - if ((innerd && IsLocatorReplicated(innerd->distributionType)) && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) - { - /* We need inner relation is defined on all nodes where outer is */ - if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) - goto not_allowed_join; - - targetd = makeNode(Distribution); - targetd->distributionType = outerd->distributionType; - targetd->nodes = bms_copy(outerd->nodes); - targetd->restrictNodes = bms_copy(outerd->restrictNodes); - targetd->distributionExpr = outerd->distributionExpr; - pathnode->path.distribution = targetd; - return alternate; - } + /* + * Check if we have inner replicated + * The "both replicated" case is already checked, so if innerd + * is replicated, then outerd is not replicated and it is not NULL. + * This case is not acceptable for some join types. If outer relation is + * nullable data nodes will produce joined rows with NULLs for cases when + * matching row exists, but on other data node. + */ +#ifdef __TBASE__ + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_ANTI)) +#else + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) +#endif + { + /* We need inner relation is defined on all nodes where outer is */ + if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) + goto not_allowed_join; + + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + pathnode->path.distribution = targetd; + return alternate; + } /* @@ -2046,43 +2055,52 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * If we could not determine the distribution redistribute the subpathes. */ not_allowed_join: - /* - * If redistribution is required, sometimes the cheapest path would be if - * one of the subplan is replicated. 
If replication of any or all subplans - * is possible, return resulting plans as alternates. Try to distribute all - * by has as main variant. - */ + /* + * If redistribution is required, sometimes the cheapest path would be if + * one of the subplan is replicated. If replication of any or all subplans + * is possible, return resulting plans as alternates. Try to distribute all + * by has as main variant. + */ -#ifdef NOT_USED - /* These join types allow replicated inner */ - if (outerd && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) - { - /* - * Since we discard all alternate pathes except one it is OK if all they - * reference the same objects - */ - JoinPath *altpath = flatCopyJoinPath(pathnode); - /* Redistribute inner subquery */ - altpath->innerjoinpath = redistribute_path( - root, - altpath->innerjoinpath, - innerpathkeys, - LOCATOR_TYPE_REPLICATED, - NULL, - bms_copy(outerd->nodes), - bms_copy(outerd->restrictNodes)); - targetd = makeNode(Distribution); - targetd->distributionType = outerd->distributionType; - targetd->nodes = bms_copy(outerd->nodes); - targetd->restrictNodes = bms_copy(outerd->restrictNodes); - targetd->distributionExpr = outerd->distributionExpr; - altpath->path.distribution = targetd; - alternate = lappend(alternate, altpath); - } +#ifdef NOT_USED + /* These join types allow replicated inner */ +#ifdef __TBASE__ + if (outerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_ANTI)) +#else + if (outerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) +#endif + { + /* + * Since we discard all alternate pathes except one it is OK if all they + * reference the same objects + */ + JoinPath *altpath = flatCopyJoinPath(pathnode); + /* Redistribute inner subquery */ + altpath->innerjoinpath = redistribute_path( + root, + altpath->innerjoinpath, + innerpathkeys, + LOCATOR_TYPE_REPLICATED, + NULL, + bms_copy(outerd->nodes), + bms_copy(outerd->restrictNodes)); + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + altpath->path.distribution = targetd; + alternate = lappend(alternate, altpath); + } /* These join types allow replicated outer */ if (innerd && @@ -2161,7 +2179,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) Expr *right_expr = right; #endif Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); - Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); +#ifndef __TBASE__ + Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); +#endif Relids inner_rels = pathnode->innerjoinpath->parent->relids; Relids outer_rels = pathnode->outerjoinpath->parent->relids; QualCost cost; @@ -2456,17 +2476,21 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_copy(outerd->nodes); } - if(outer_size * inner_nodes < inner_size + outer_size && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && - innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && - 
get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && + if(outer_size * inner_nodes < inner_size + outer_size && + (pathnode->jointype != JOIN_LEFT && + pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && + pathnode->jointype != JOIN_ANTI) && + innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && + get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !replicate_inner && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) - { - replicate_outer = true; + { + replicate_outer = true; - nodes = bms_copy(innerd->nodes); - } + nodes = bms_copy(innerd->nodes); + } #endif } /* @@ -2483,8 +2507,12 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * replicate outer as an optimization to save network costs. */ if(inner_size > outer_size * inner_nodes && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && + (pathnode->jointype != JOIN_LEFT && + pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && + pathnode->jointype != JOIN_ANTI) && innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 76db9c01..beb0e76a 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -200,50 +200,53 @@ networkjoinsel(PG_FUNCTION_ARGS) #ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); #endif - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); - double selec; - VariableStatData vardata1; - VariableStatData vardata2; - bool join_is_reversed; - - get_join_variables(root, args, sjinfo, - &vardata1, &vardata2, &join_is_reversed); - - switch (sjinfo->jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_FULL: - - /* - * Selectivity for left/full join is not exactly the same as inner - * join, but we neglect the difference, as eqjoinsel does. - */ - selec = networkjoinsel_inner(operator, &vardata1, &vardata2); - break; - case JOIN_SEMI: - case JOIN_ANTI: - /* Here, it's important that we pass the outer var on the left. 
*/ - if (!join_is_reversed) - selec = networkjoinsel_semi(operator, &vardata1, &vardata2); - else - selec = networkjoinsel_semi(get_commutator(operator), - &vardata2, &vardata1); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", - (int) sjinfo->jointype); - selec = 0; /* keep compiler quiet */ - break; - } - - ReleaseVariableStats(vardata1); - ReleaseVariableStats(vardata2); - - CLAMP_PROBABILITY(selec); - - PG_RETURN_FLOAT8((float8) selec); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + double selec; + VariableStatData vardata1; + VariableStatData vardata2; + bool join_is_reversed; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + + /* + * Selectivity for left/full join is not exactly the same as inner + * join, but we neglect the difference, as eqjoinsel does. + */ + selec = networkjoinsel_inner(operator, &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + /* Here, it's important that we pass the outer var on the left. */ + if (!join_is_reversed) + selec = networkjoinsel_semi(operator, &vardata1, &vardata2); + else + selec = networkjoinsel_semi(get_commutator(operator), + &vardata2, &vardata1); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); } /* diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 3b1ff05f..134664f8 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2265,56 +2265,59 @@ eqjoinsel(PG_FUNCTION_ARGS) #ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); #endif - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); - double selec; - VariableStatData vardata1; - VariableStatData vardata2; - bool join_is_reversed; - RelOptInfo *inner_rel; - - get_join_variables(root, args, sjinfo, - &vardata1, &vardata2, &join_is_reversed); - - switch (sjinfo->jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_FULL: - selec = eqjoinsel_inner(operator, &vardata1, &vardata2); - break; - case JOIN_SEMI: - case JOIN_ANTI: - - /* - * Look up the join's inner relation. min_righthand is sufficient - * information because neither SEMI nor ANTI joins permit any - * reassociation into or out of their RHS, so the righthand will - * always be exactly that set of rels. 
- */ - inner_rel = find_join_input_rel(root, sjinfo->min_righthand); - - if (!join_is_reversed) - selec = eqjoinsel_semi(operator, &vardata1, &vardata2, - inner_rel); - else - selec = eqjoinsel_semi(get_commutator(operator), - &vardata2, &vardata1, - inner_rel); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", - (int) sjinfo->jointype); - selec = 0; /* keep compiler quiet */ - break; - } - - ReleaseVariableStats(vardata1); - ReleaseVariableStats(vardata2); - - CLAMP_PROBABILITY(selec); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + double selec; + VariableStatData vardata1; + VariableStatData vardata2; + bool join_is_reversed; + RelOptInfo *inner_rel; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + selec = eqjoinsel_inner(operator, &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif - PG_RETURN_FLOAT8((float8) selec); + /* + * Look up the join's inner relation. min_righthand is sufficient + * information because neither SEMI nor ANTI joins permit any + * reassociation into or out of their RHS, so the righthand will + * always be exactly that set of rels. + */ + inner_rel = find_join_input_rel(root, sjinfo->min_righthand); + + if (!join_is_reversed) + selec = eqjoinsel_semi(operator, &vardata1, &vardata2, + inner_rel); + else + selec = eqjoinsel_semi(get_commutator(operator), + &vardata2, &vardata1, + inner_rel); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); } /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 9df9532f..9f974c28 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -765,37 +765,42 @@ typedef enum CmdType */ typedef enum JoinType { - /* - * The canonical kinds of joins according to the SQL JOIN syntax. Only - * these codes can appear in parser output (e.g., JoinExpr nodes). - */ - JOIN_INNER, /* matching tuple pairs only */ - JOIN_LEFT, /* pairs + unmatched LHS tuples */ - JOIN_FULL, /* pairs + unmatched LHS + unmatched RHS */ - JOIN_RIGHT, /* pairs + unmatched RHS tuples */ - - /* - * Semijoins and anti-semijoins (as defined in relational theory) do not - * appear in the SQL JOIN syntax, but there are standard idioms for - * representing them (e.g., using EXISTS). The planner recognizes these - * cases and converts them to joins. So the planner and executor must - * support these codes. NOTE: in JOIN_SEMI output, it is unspecified - * which matching RHS row is joined to. In JOIN_ANTI output, the row is - * guaranteed to be null-extended. - */ - JOIN_SEMI, /* 1 copy of each LHS row that has match(es) */ - JOIN_ANTI, /* 1 copy of each LHS row that has no match */ + /* + * The canonical kinds of joins according to the SQL JOIN syntax. Only + * these codes can appear in parser output (e.g., JoinExpr nodes). 
+ */ + JOIN_INNER, /* matching tuple pairs only */ + JOIN_LEFT, /* pairs + unmatched LHS tuples */ + JOIN_FULL, /* pairs + unmatched LHS + unmatched RHS */ + JOIN_RIGHT, /* pairs + unmatched RHS tuples */ + + /* + * Semijoins and anti-semijoins (as defined in relational theory) do not + * appear in the SQL JOIN syntax, but there are standard idioms for + * representing them (e.g., using EXISTS). The planner recognizes these + * cases and converts them to joins. So the planner and executor must + * support these codes. NOTE: in JOIN_SEMI output, it is unspecified + * which matching RHS row is joined to. In JOIN_ANTI output, the row is + * guaranteed to be null-extended. + */ + JOIN_SEMI, /* 1 copy of each LHS row that has match(es) */ + JOIN_ANTI, /* 1 copy of each LHS row that has no match */ + + /* + * These codes are used internally in the planner, but are not supported + * by the executor (nor, indeed, by most of the planner). + */ + JOIN_UNIQUE_OUTER, /* LHS path must be made unique */ + JOIN_UNIQUE_INNER, /* RHS path must be made unique */ - /* - * These codes are used internally in the planner, but are not supported - * by the executor (nor, indeed, by most of the planner). - */ - JOIN_UNIQUE_OUTER, /* LHS path must be made unique */ - JOIN_UNIQUE_INNER /* RHS path must be made unique */ +#ifdef __TBASE__ + JOIN_LEFT_SCALAR /* pairs + unmatched LHS tuples */ + /* only 1 copy of echo LHS row else report error. */ +#endif - /* - * We might need additional join types someday. - */ + /* + * We might need additional join types someday. + */ } JoinType; /* @@ -812,12 +817,22 @@ typedef enum JoinType * pushed-down quals. This is convenient because for almost all purposes, * quals attached to a semijoin can be treated the same as innerjoin quals. */ +#ifdef __TBASE__ #define IS_OUTER_JOIN(jointype) \ - (((1 << (jointype)) & \ - ((1 << JOIN_LEFT) | \ - (1 << JOIN_FULL) | \ - (1 << JOIN_RIGHT) | \ - (1 << JOIN_ANTI))) != 0) + (((1 << (jointype)) & \ + ((1 << JOIN_LEFT) | \ + (1 << JOIN_LEFT_SCALAR) | \ + (1 << JOIN_FULL) | \ + (1 << JOIN_RIGHT) | \ + (1 << JOIN_ANTI))) != 0) +#else +#define IS_OUTER_JOIN(jointype) \ + (((1 << (jointype)) & \ + ((1 << JOIN_LEFT) | \ + (1 << JOIN_FULL) | \ + (1 << JOIN_RIGHT) | \ + (1 << JOIN_ANTI))) != 0) +#endif /* * AggStrategy - diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index d303ff05..ec687d2f 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -99,8 +99,9 @@ extern JoinExpr *convert_ALL_sublink_to_join(PlannerInfo *root, SubLink *sublink Relids available_rels); extern bool check_or_exist_sublink_pullupable(PlannerInfo *root,Node *node); extern bool check_or_exist_qual_pullupable(PlannerInfo *root, Node *node); -extern List * convert_OR_EXIST_sublink_to_join_recurse(PlannerInfo *root, Node *node, +extern List *convert_OR_EXIST_sublink_to_join_recurse(PlannerInfo *root, Node *node, Node **jtlink); +extern TargetEntry *convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry); #endif extern Node *SS_replace_correlation_vars(PlannerInfo *root, Node *expr); extern Node *SS_process_sublinks(PlannerInfo *root, Node *expr, bool isQual); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c480d768..5d770f1d 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -851,31 +851,35 @@ explain (verbose, costs off) select * from int4_tbl where (case when f1 in (select unique1 from 
tenk1 a) then f1 else null end) in (select ten from tenk1 b); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Hash Join Output: int4_tbl.f1 - Join Filter: ((CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END) = b.ten) - -> Remote Subquery Scan on all (datanode_1) - Output: int4_tbl.f1, CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END - Distribute results by H: CASE WHEN (hashed SubPlan 1) THEN f1 ELSE NULL::integer END - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1, CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: a.unique1 - -> Seq Scan on public.tenk1 a - Output: a.unique1 - -> Materialize + Inner Unique: true + Hash Cond: (CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END = b.ten) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 + -> Hash Output: b.ten - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> HashAggregate Output: b.ten - Distribute results by H: ten - -> Seq Scan on public.tenk1 b + Group Key: b.ten + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: b.ten -(22 rows) + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Seq Scan on public.tenk1 a + Output: a.unique1 +(26 rows) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in @@ -1441,3 +1445,295 @@ select * from x for update; Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 (4 rows) +-- +-- Tests for pulling up more sublinks +-- + +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36374.94..36378.32 rows=1350 width=8) + -> Sort (cost=36374.94..36378.32 rows=1350 width=8) + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a (cost=0.00..36304.75 rows=1350 width=8) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=0.00..26.88 rows=7 width=4) + -> Seq Scan on tbl_b b (cost=0.00..26.88 rows=7 width=4) + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. 
+insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1) + -> Result + One-Time Filter: (a.a = 5) + -> Seq Scan on tbl_b b + Filter: (a = 5) +(10 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + a | q +----+--- + 1 | + 2 | + 3 | + 4 | + 5 | 5 + 6 | + 7 | + 8 | + 9 | + 10 | +(10 rows) + +-- check distinct scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 9b3f974f..2ebadb13 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -691,3 +691,83 @@ select * from (with x as (select 2 as y) select * from x) ss; explain (verbose, costs off) with x as (select * from subselect_tbl) select * from x for update; + +-- +-- Tests for pulling up more sublinks +-- + +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; + +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +-- check non-scalar scenario. +insert into tbl_b values(2,2); + +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + +-- check distinct scenario. +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; + +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; From 3ae0235f304583029688a1846157455b4e740e47 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 12 Aug 2020 15:29:39 +0800 Subject: [PATCH 030/578] Refine UPDATE/DELETE join distribution rules 1. Do not replicate outer path if join type is JOIN_LEFT_SCALAR 2. 
Remove the replication_level restriction since we have to do the replicate for UPDATE/DELETE anyway --- src/backend/optimizer/util/pathnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9b89cb4a..6fc4ae2c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2707,7 +2707,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2755,7 +2755,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd &&resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) { pathnode->path.distribution = innerd; return alternate; From 9653f0d0d6fa8bbd91e4cc60290562347e73c8c0 Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 17 Aug 2020 14:58:51 +0800 Subject: [PATCH 031/578] Support agg optimize for targetlist subquery --- src/backend/optimizer/plan/subselect.c | 100 ++++++++++++++++++++++++- src/test/regress/sql/subselect.sql | 8 ++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index bff6e3fd..a0f01e9b 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2819,8 +2819,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* * What we can not optimize. */ - if (subselect->commandType != CMD_SELECT || - subselect->hasAggs || subselect->hasDistinctOn || + if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || subselect->setOperations || subselect->groupingSets || subselect->groupClause || subselect->hasWindowFuncs || subselect->hasTargetSRFs || subselect->hasModifyingCTE || @@ -2848,8 +2847,105 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * The rest of the sub-select must not refer to any Vars of the parent * query. (Vars of higher levels should be okay, though.) 
*/ + subselect->jointree->quals = NULL; if (contain_vars_of_level((Node *) subselect, 1)) return NULL; + subselect->jointree->quals = whereClause; + + if (subselect->hasAggs) + { + List *joinquals = NULL; + List *vars = NULL; + TargetEntry *ent = NULL; + ListCell *cell = NULL; + int ressortgroupref = 0; + int varno = 0; + + /* process 'op' and 'bool' expr only */ + if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) + return NULL; + + vars = pull_vars_of_level((Node *) joinquals, 0); + + /* construct groupby clause */ + foreach (cell, vars) + { + Oid sortop; + Oid eqop; + bool hashable; + Oid restype; + SortGroupClause *grpcl; + Var *var = (Var *) lfirst(cell); + RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); + + if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) + return NULL; + + restype = exprType((Node *) var); + + grpcl = makeNode(SortGroupClause); + + ressortgroupref++; + + if (tbl->rtekind == RTE_RELATION) + { + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, + get_relid_attribute_name(tbl->relid, var->varoattno), false); + } + else + { + int plan_id; + int ndx; + ListCell *lc; + Plan *cte_plan; + TargetEntry *cte_ent = NULL; + + /* + * Note: cte_plan_ids can be shorter than cteList, if we are still working + * on planning the CTEs (ie, this is a side-reference from another CTE). + * So we mustn't use forboth here. + */ + ndx = 0; + foreach (lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + + if (strcmp(cte->ctename, tbl->ctename) == 0) + break; + ndx++; + } + if (lc == NULL) /* shouldn't happen */ + elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); + if (ndx >= list_length(root->cte_plan_ids)) + elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); + plan_id = list_nth_int(root->cte_plan_ids, ndx); + cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); + cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); + } + + ent->ressortgroupref = ressortgroupref; + + subselect->targetList = lappend(subselect->targetList, ent); + + varno = list_length(subselect->targetList); + ent->resno = varno; + + /* determine the eqop and optional sortop */ + get_sort_group_operators(restype, + false, true, false, + &sortop, &eqop, NULL, + &hashable); + + grpcl->tleSortGroupRef = ressortgroupref; + grpcl->eqop = eqop; + grpcl->sortop = sortop; + grpcl->nulls_first = false; /* OK with or without sortop */ + grpcl->hashable = hashable; + + subselect->groupClause = lappend(subselect->groupClause, grpcl); + } + } /* * Move sub-select to the parent query. 
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 2ebadb13..8d1cdebd 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -768,6 +768,14 @@ set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; +-- agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From ebf254f372c7e7a7eac1256bf4a46755f31215de Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 18 Aug 2020 10:30:31 +0800 Subject: [PATCH 032/578] Refine some code of JOIN_LEFT_SCALAR 1. Add missing switch case in reduce_outer_joins_pass2() 2. Refine some pre-compile definitions in set_joinpath_distribution() --- src/backend/optimizer/prep/prepjointree.c | 1 + src/backend/optimizer/util/pathnode.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index b7cbd5c0..d54e097a 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -3077,6 +3077,7 @@ reduce_outer_joins_pass2(Node *jtnode, break; case JOIN_SEMI: case JOIN_ANTI: + case JOIN_LEFT_SCALAR: /* * These could only have been introduced by pull_up_sublinks, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 6fc4ae2c..1eefd477 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1729,20 +1729,15 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * nullable data nodes will produce joined rows with NULLs for cases when * matching row exists, but on other data node. */ -#ifdef __TBASE__ + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && (pathnode->jointype == JOIN_INNER || pathnode->jointype == JOIN_LEFT || pathnode->jointype == JOIN_SEMI || +#ifdef __TBASE__ pathnode->jointype == JOIN_LEFT_SCALAR || - pathnode->jointype == JOIN_ANTI)) -#else - if ((innerd && IsLocatorReplicated(innerd->distributionType)) && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) #endif + pathnode->jointype == JOIN_ANTI)) { /* We need inner relation is defined on all nodes where outer is */ if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) From 7a33f7f382aa4d7a1ccba1186125f7607e4c8b9a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 27 Aug 2020 20:15:39 +0800 Subject: [PATCH 033/578] Pullup targetlist sublink that wrapped in expression Also improved the targetlist join type selection. We don't need JOIN_LEFT_SCALAR if sublink with agg, since we've create groupby for them. 
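For illustration, one of the regression tests added by this patch wraps a
correlated scalar sublink inside a CASE expression in the targetlist:

    select (case when a.b = 1
                 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b)
                 else 0 end)
    from tbl_a a order by 1;

Because the sublink contains an aggregate, the pullup builds a GROUP BY on the
correlated columns (b.a, b.b) and joins with plain JOIN_LEFT; JOIN_LEFT_SCALAR
is only needed when there is no aggregate to guarantee at most one matching row
per outer tuple.
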
--- src/backend/optimizer/plan/subselect.c | 55 ++++++++++++++--------- src/backend/optimizer/prep/prepjointree.c | 18 +++++--- src/test/regress/sql/subselect.sql | 8 +++- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index a0f01e9b..05a21a79 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2773,28 +2773,38 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis #ifdef __TBASE__ /* - * convert_TargetList_sublink_to_join : - * try to convert an EXISTS SubLink in targetlist to a join - * On success, it returns not NULL. + * Try to convert an SubLink in targetlist to a join + * + * The sublink in targetlist has the semantic of SCALAR. Normal joins will join + * simply generate repeated tuples. So we add a new join type JOIN_LEFT_SCALAR + * which acts like left join and reports error when scalar semantics is broken. + * + * On success, it converts sublink to subquery to parent jointree and returns + * the converted new targetlist entry. Otherwise, it returnes NULL. */ TargetEntry * convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) { - Query *parse = root->parse; - Node *whereClause = NULL; - Query *subselect = NULL; - JoinExpr *joinExpr = NULL; - ParseState *pstate = NULL; - SubLink *sublink = NULL; - RangeTblRef *rtr = NULL; + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Var *var = NULL; + Var *var = NULL; + List *sublinks = NIL; - /* Sanity check */ - if (!IsA(entry->expr, SubLink)) - return NULL; + /* Find sublinks in the targetlist entry */ + find_sublink_walker((Node *)entry->expr, &sublinks); + + /* Only one sublink can be handled */ + if (list_length(sublinks) != 1) + return NULL; + + sublink = linitial(sublinks); - sublink = (SubLink *) entry->expr; if (sublink->subLinkType != EXPR_SUBLINK) return NULL; @@ -2811,7 +2821,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) return NULL; /* - * The subquery must have a nonempty jointree, else we won't have a join. + * The SubQuery must have a non-empty JoinTree, else we won't have a join. */ if (subselect->jointree->fromlist == NIL) return NULL; @@ -2845,7 +2855,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) + * query. (Vars of higher levels should be okay, though.) */ subselect->jointree->quals = NULL; if (contain_vars_of_level((Node *) subselect, 1)) @@ -2884,7 +2894,6 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) restype = exprType((Node *) var); grpcl = makeNode(SortGroupClause); - ressortgroupref++; if (tbl->rtekind == RTE_RELATION) @@ -2965,7 +2974,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * Form join node. */ joinExpr = makeNode(JoinExpr); - joinExpr->jointype = JOIN_LEFT_SCALAR; + joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; joinExpr->isNatural = false; joinExpr->larg = (Node *) root->parse->jointree; joinExpr->rarg = (Node *) rtr; @@ -2977,9 +2986,13 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* Wrap join node in FromExpr as required. 
*/ parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - /* Replace sublink node with Var. */ + /* Build a Var pointing to the subquery */ var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - entry->expr = (Expr *) var; + + /* Replace sublink node with Var. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, + (Node *)var); return entry; } #endif diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index d54e097a..a94388d4 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -181,8 +181,11 @@ static void fix_append_rel_relids(List *append_rel_list, int varno, static Node *find_jointree_node_for_rel(Node *jtnode, int relid); #ifdef __TBASE__ -static Node * pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, Node **jtlink,Node **orclauses); -static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, Node *node); +static Node *pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, + Node **jtlink,Node **orclauses); +static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, + Node *node); + #endif /* @@ -225,16 +228,19 @@ pull_up_sublinks(PlannerInfo *root) */ if(enable_pullup_subquery) { - List *new_targetList = NIL; - ListCell *lc = NULL; - TargetEntry *entry = NULL; - TargetEntry *new_entry = NULL; + List *new_targetList = NIL; + ListCell *lc = NULL; + TargetEntry *entry = NULL; + TargetEntry *new_entry = NULL; + /* Iterate through out the target list */ foreach(lc, root->parse->targetList) { entry = (TargetEntry *) lfirst(lc); + /* Try to convert sublink in targetlist entry to join */ new_entry = convert_TargetList_sublink_to_join(root, entry); + if (new_entry) new_targetList = lappend(new_targetList, new_entry); else diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 8d1cdebd..a7ba5190 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -768,7 +768,7 @@ set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; --- agg +-- targetlist sublink with agg explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; @@ -776,6 +776,12 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 1e8fd4c631aaa2b98d291c546df8db98a34a6032 Mon 
Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 28 Aug 2020 17:05:02 +0800 Subject: [PATCH 034/578] Minor fix after merged all sublink pullup enhancements --- src/include/optimizer/clauses.h | 8 +- src/test/regress/expected/subselect.out | 391 +++++++++++++++++------- 2 files changed, 280 insertions(+), 119 deletions(-) diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index ac066a6a..3e7b9e4c 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -14,11 +14,11 @@ #ifndef CLAUSES_H #define CLAUSES_H +#include "access/htup.h" #include "nodes/relation.h" - -#define is_opclause(clause) ((clause) != NULL && IsA(clause, OpExpr)) -#define is_funcclause(clause) ((clause) != NULL && IsA(clause, FuncExpr)) +#define is_opclause(clause) ((clause) != NULL && IsA(clause, OpExpr)) +#define is_funcclause(clause) ((clause) != NULL && IsA(clause, FuncExpr)) typedef struct { @@ -85,7 +85,7 @@ extern Node *eval_const_expressions(PlannerInfo *root, Node *node); extern Node *estimate_expression_value(PlannerInfo *root, Node *node); extern Query *inline_set_returning_function(PlannerInfo *root, - RangeTblEntry *rte); + RangeTblEntry *rte); extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, Node *node); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 5d770f1d..79708c41 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1160,7 +1160,6 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); - -- -- Tests for CTE inlining behavior -- @@ -1448,7 +1447,6 @@ select * from x for update; -- -- Tests for pulling up more sublinks -- - set enable_pullup_subquery to true; create table tbl_a(a int,b int); create table tbl_b(a int,b int); @@ -1459,16 +1457,16 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36374.94..36378.32 rows=1350 width=8) - -> Sort (cost=36374.94..36378.32 rows=1350 width=8) - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a (cost=0.00..36304.75 rows=1350 width=8) - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=0.00..26.88 rows=7 width=4) - -> Seq Scan on tbl_b b (cost=0.00..26.88 rows=7 width=4) - Filter: (a = a.a) + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1490,16 +1488,16 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN 
+----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1521,17 +1519,20 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(8 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1554,16 +1555,16 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a + -> Materialize + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1572,16 +1573,16 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1590,67 +1591,69 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - 
SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(8 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; ERROR: more than one row returned by a subquery used as an expression explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; - QUERY PLAN --------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all (datanode_1) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1) - -> Result - One-Time Filter: (a.a = 5) - -> Seq Scan on tbl_b b - Filter: (a = 5) -(10 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = 5) +(12 rows) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; - a | q -----+--- - 1 | - 2 | - 3 | - 4 | - 5 | 5 - 6 | - 7 | - 8 | - 9 | - 10 | -(10 rows) + a | q +---+--- + 1 | + 2 | + 5 | 5 + 6 | + 8 | + 9 | +(6 rows) -- check distinct scenario. set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1671,18 +1674,19 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; 
a | q @@ -1703,18 +1707,19 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1734,6 +1739,162 @@ select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a ord set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; +-- targetlist sublink with agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".sum + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".count + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(12 rows) + +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 + +(10 rows) + +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Seq Scan on tbl_b b + Filter: (((a = a.a) AND (b = a.b)) OR (a = 1)) +(10 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from 
tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (CASE WHEN (a.b = 1) THEN b.a ELSE 0 END) + -> Hash Left Scalar Join + Hash Cond: ((a.a = b.a) AND (a.b = b.b)) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 21ab7f55f152ebb5a82d74dcfc1da53aec712ae1 Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 28 Aug 2020 18:03:43 +0800 Subject: [PATCH 035/578] Fix DDL deadlock and meta inconsistency. 1. Forward DDL to ascii minimized coordinator to serialize. 2. DDL execution will be executed on all the coordinators then datanodes. 3. DDL commit will be executed on all the datanodes then coordinators. --- src/backend/commands/tablecmds.c | 221 +++++++++++----------- src/backend/pgxc/pool/execRemote.c | 275 ++++++++++++++++++++++------ src/backend/pgxc/pool/pgxcnode.c | 155 ++++++++++++++-- src/backend/postmaster/postmaster.c | 4 + src/backend/tcop/postgres.c | 119 ++++++------ src/backend/tcop/pquery.c | 15 +- src/backend/tcop/utility.c | 34 ++++ src/backend/utils/misc/guc.c | 38 +++- src/include/pgxc/pgxc.h | 13 +- src/include/pgxc/pgxcnode.h | 12 +- src/include/tcop/utility.h | 3 + 11 files changed, 637 insertions(+), 252 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 66273eb5..9e3b3f14 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -632,118 +632,123 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * namespace is selected. */ #ifdef __TBASE__ - if (stmt->interval_child) + if (stmt->interval_child) + { + /* interval partition child's namespace is same as parent. */ + namespaceId = get_rel_namespace(stmt->interval_parentId); + } + else { - /* interval partition child's namespace is same as parent. */ - namespaceId = get_rel_namespace(stmt->interval_parentId); + namespaceId = + RangeVarGetAndCheckCreationNamespace(stmt->relation, ExclusiveLock, NULL); } - else +#else + namespaceId = + RangeVarGetAndCheckCreationNamespace(stmt->relation, NoLock, NULL); #endif - namespaceId = - RangeVarGetAndCheckCreationNamespace(stmt->relation, NoLock, NULL); - - /* - * Security check: disallow creating temp tables from security-restricted - * code. 
This is needed because calling code might not expect untrusted - * tables to appear in pg_temp at the front of its search path. - */ - if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP - && InSecurityRestrictedOperation()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("cannot create temporary table within security-restricted operation"))); - /* - * Select tablespace to use. If not specified, use default tablespace - * (which may in turn default to database's default). - */ - if (stmt->tablespacename) - { - tablespaceId = get_tablespace_oid(stmt->tablespacename, false); - } - else - { - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - /* note InvalidOid is OK in this case */ - } - - /* Check permissions except when using database's default */ - if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) - { - AclResult aclresult; - - aclresult = pg_tablespace_aclcheck(tablespaceId, GetUserId(), - ACL_CREATE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_TABLESPACE, - get_tablespace_name(tablespaceId)); - } - - /* In all cases disallow placing user relations in pg_global */ - if (tablespaceId == GLOBALTABLESPACE_OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("only shared relations can be placed in pg_global tablespace"))); - - /* Identify user ID that will own the table */ - if (!OidIsValid(ownerId)) - ownerId = GetUserId(); - - /* - * Parse and validate reloptions, if any. - */ - reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, - true, false); - - if (relkind == RELKIND_VIEW) - (void) view_reloptions(reloptions, true); - else - (void) heap_reloptions(relkind, reloptions, true); - - if (stmt->ofTypename) - { - AclResult aclresult; - - ofTypeId = typenameTypeId(NULL, stmt->ofTypename); - - aclresult = pg_type_aclcheck(ofTypeId, GetUserId(), ACL_USAGE); - if (aclresult != ACLCHECK_OK) - aclcheck_error_type(aclresult, ofTypeId); - } - else - ofTypeId = InvalidOid; - - /* - * Look up inheritance ancestors and generate relation schema, including - * inherited attributes. (Note that stmt->tableElts is destructively - * modified by MergeAttributes.) - */ - stmt->tableElts = - MergeAttributes(stmt->tableElts, stmt->inhRelations, - stmt->relation->relpersistence, - stmt->partbound != NULL, - &inheritOids, &old_constraints, &parentOidCount); - - /* - * Create a tuple descriptor from the relation schema. Note that this - * deals with column names, types, and NOT NULL constraints, but not - * default values or CHECK constraints; we handle those below. - */ - descriptor = BuildDescForRelation(stmt->tableElts); - - /* - * Notice that we allow OIDs here only for plain tables and partitioned - * tables, even though some other relkinds can support them. This is - * necessary because the default_with_oids GUC must apply only to plain - * tables and not any other relkind; doing otherwise would break existing - * pg_dump files. We could allow explicit "WITH OIDS" while not allowing - * default_with_oids to affect other relkinds, but it would complicate - * interpretOidsOption(). - */ - localHasOids = interpretOidsOption(stmt->options, - (relkind == RELKIND_RELATION || - relkind == RELKIND_PARTITIONED_TABLE)); - descriptor->tdhasoid = (localHasOids || parentOidCount > 0); + /* + * Security check: disallow creating temp tables from security-restricted + * code. 
This is needed because calling code might not expect untrusted + * tables to appear in pg_temp at the front of its search path. + */ + if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP + && InSecurityRestrictedOperation()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("cannot create temporary table within security-restricted operation"))); + + /* + * Select tablespace to use. If not specified, use default tablespace + * (which may in turn default to database's default). + */ + if (stmt->tablespacename) + { + tablespaceId = get_tablespace_oid(stmt->tablespacename, false); + } + else + { + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); + /* note InvalidOid is OK in this case */ + } + + /* Check permissions except when using database's default */ + if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) + { + AclResult aclresult; + + aclresult = pg_tablespace_aclcheck(tablespaceId, GetUserId(), + ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_TABLESPACE, + get_tablespace_name(tablespaceId)); + } + + /* In all cases disallow placing user relations in pg_global */ + if (tablespaceId == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); + + /* Identify user ID that will own the table */ + if (!OidIsValid(ownerId)) + ownerId = GetUserId(); + + /* + * Parse and validate reloptions, if any. + */ + reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, + true, false); + + if (relkind == RELKIND_VIEW) + (void) view_reloptions(reloptions, true); + else + (void) heap_reloptions(relkind, reloptions, true); + + if (stmt->ofTypename) + { + AclResult aclresult; + + ofTypeId = typenameTypeId(NULL, stmt->ofTypename); + + aclresult = pg_type_aclcheck(ofTypeId, GetUserId(), ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error_type(aclresult, ofTypeId); + } + else + ofTypeId = InvalidOid; + + /* + * Look up inheritance ancestors and generate relation schema, including + * inherited attributes. (Note that stmt->tableElts is destructively + * modified by MergeAttributes.) + */ + stmt->tableElts = + MergeAttributes(stmt->tableElts, stmt->inhRelations, + stmt->relation->relpersistence, + stmt->partbound != NULL, + &inheritOids, &old_constraints, &parentOidCount); + + /* + * Create a tuple descriptor from the relation schema. Note that this + * deals with column names, types, and NOT NULL constraints, but not + * default values or CHECK constraints; we handle those below. + */ + descriptor = BuildDescForRelation(stmt->tableElts); + + /* + * Notice that we allow OIDs here only for plain tables and partitioned + * tables, even though some other relkinds can support them. This is + * necessary because the default_with_oids GUC must apply only to plain + * tables and not any other relkind; doing otherwise would break existing + * pg_dump files. We could allow explicit "WITH OIDS" while not allowing + * default_with_oids to affect other relkinds, but it would complicate + * interpretOidsOption(). 
+ */ + localHasOids = interpretOidsOption(stmt->options, + (relkind == RELKIND_RELATION || + relkind == RELKIND_PARTITIONED_TABLE)); + descriptor->tdhasoid = (localHasOids || parentOidCount > 0); #ifdef _SHARDING_ if(IS_PGXC_DATANODE) has_extent = interpretExtentOption(stmt->options, diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 72aa55f1..452577db 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -51,6 +51,7 @@ #include "utils/tuplesort.h" #include "utils/snapmgr.h" #include "utils/builtins.h" +#include "tcop/utility.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" #include "parser/parse_type.h" @@ -148,6 +149,7 @@ static void pgxc_abort_connections(PGXCNodeAllHandles *all_handles); static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle); static void pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle); static bool SetSnapshot(EState *state); +static int pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type); #endif static void pgxc_connections_cleanup(ResponseCombiner *combiner); @@ -4653,22 +4655,75 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) return NULL; } +#ifdef __TBASE__ +/* + * Commit transactions on remote nodes. + * If barrier lock is set wait while it is released. + * Release remote connection after completion. + * + * For DDL, DN will commit before CN does. + * Because DDLs normally have exclusive locks, then when CN gets committed, + * blocked user transactions will see DNs in a consistent state. + */ +static void +pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) +{ + int conn_count = 0; + + if (!enable_parallel_ddl || !has_ddl) + { + /* normal cases */ + conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); + } + else + { + /* make sure first DN then CN */ + conn_count = pgxc_node_remote_commit_internal(get_current_dn_handles(), txn_type); + conn_count += pgxc_node_remote_commit_internal(get_current_cn_handles(), txn_type); + } + + stat_transaction(conn_count); + + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } + } + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); + } + } + + clear_handles(); +} /* * Commit transactions on remote nodes. * If barrier lock is set wait while it is released. * Release remote connection after completion. 
*/ +static int +pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type) +#else static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) -{// #lizard forgives - int result = 0; - char *commitCmd = NULL; - int i; - ResponseCombiner combiner; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - PGXCNodeAllHandles *handles = get_current_handles(); +#endif +{ + int result = 0; + char *commitCmd = NULL; + int i; + ResponseCombiner combiner; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; #ifdef __TBASE__ switch (txn_type) @@ -4843,53 +4898,59 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) result = EOF; } - if (result) - { - if (combiner.errorMessage) - { - pgxc_node_report_error(&combiner); - } - else - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to COMMIT the transaction on one or more nodes"))); - } - } - CloseCombiner(&combiner); - } - - stat_transaction(conn_count); + if (result) + { + if (combiner.errorMessage) + { + pgxc_node_report_error(&combiner); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + } + CloseCombiner(&combiner); + } - - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); - } - } - - clear_handles(); +#ifndef __TBASE__ + stat_transaction(conn_count); + + + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } + } + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); + } + } + + clear_handles(); +#endif pfree_pgxc_all_handles(handles); - if (connections) - { - pfree(connections); - connections = NULL; - } + if (connections) + { + pfree(connections); + connections = NULL; + } + +#ifdef __TBASE__ + return conn_count; +#endif } /* @@ -6765,6 +6826,118 @@ ExecRemoteUtility(RemoteQuery *node) } } + /* + * DDL will firstly be executed on coordinators then datanodes + * which will avoid deadlocks in cluster. + * Let us assume that user sql and ddl hold conflict locks, + * then there will be two situations: + * 1. The coordinator is not locked, user sql will see datanodes with no lock. + * 2. The coordinator is locked, user sql will wait for ddl to complete. 
+ * + * Send BEGIN control command to all coordinator nodes + */ + if (pgxc_node_begin(co_conn_count, + pgxc_connections->coord_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_COORDINATOR)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on coordinators"))); + } + + /* Send other txn related messages to coordinator nodes */ + for (i = 0; i < co_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + } + + /* Make the same for Coordinators */ + while (co_conn_count > 0) + { + int i = 0; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(co_conn_count, + pgxc_connections->coord_handles, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? + */ + break; + } + + while (i < co_conn_count) + { + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + int res = handle_response(conn, combiner); + + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Coordinator %s pid %d", + pgxc_connections->coord_handles[i]->nodename, + pgxc_connections->coord_handles[i]->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --co_conn_count) + pgxc_connections->coord_handles[i] = + pgxc_connections->coord_handles[co_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + } + } + /* * Send BEGIN control command to all data nodes */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7bea908f..8aa01b1a 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -161,6 +161,9 @@ static int get_char(PGXCNodeHandle * conn, char *out); static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name); static ParamEntry * paramentry_copy(ParamEntry * src_entry); static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len); +static PGXCNodeAllHandles * make_PGXCNodeAllHandles(); +static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); +static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); #endif /* @@ -3908,34 +3911,62 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool return result; } -PGXCNodeAllHandles * -get_current_handles(void) -{// #lizard forgives +#ifdef __TBASE__ +static PGXCNodeAllHandles * +make_PGXCNodeAllHandles() +{ PGXCNodeAllHandles *result; - PGXCNodeHandle *node_handle; - int i; - result = 
(PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); if (!result) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory"))); } result->primary_handle = NULL; result->co_conn_count = 0; result->dn_conn_count = 0; + result->coord_handles = NULL; + result->datanode_handles = NULL; - result->datanode_handles = (PGXCNodeHandle **) - palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); - if (!result->datanode_handles) + return result; +} +#endif + +PGXCNodeAllHandles * +get_current_handles(void) +{ +#ifndef __TBASE__ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); +#else + PGXCNodeAllHandles *result; + PGXCNodeHandle *node_handle; + int i; + + result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); + if (!result) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } + result->primary_handle = NULL; + result->co_conn_count = 0; + result->dn_conn_count = 0; +#endif + +#ifdef __TBASE__ + result->datanode_handles = (PGXCNodeHandle **) + palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + for (i = 0; i < NumDataNodes; i++) { node_handle = &dn_handles[i]; @@ -3952,17 +3983,87 @@ get_current_handles(void) errmsg("out of memory"))); } + for (i = 0; i < NumCoords; i++) + { + node_handle = &co_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->coord_handles[result->co_conn_count++] = node_handle; + } +#else + get_current_cn_handles_internal(result); + get_current_dn_handles_internal(result); +#endif + + return result; +} + +#ifdef __TBASE__ + + +PGXCNodeAllHandles * +get_current_cn_handles(void) +{ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + + get_current_cn_handles_internal(result); + return result; +} + +PGXCNodeAllHandles * +get_current_dn_handles(void) +{ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + + get_current_dn_handles_internal(result); + return result; +} + +static void +get_current_dn_handles_internal(PGXCNodeAllHandles *result) +{ + PGXCNodeHandle *node_handle; + int i; + + result->datanode_handles = (PGXCNodeHandle **) + palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + for (i = 0; i < NumDataNodes; i++) + { + node_handle = &dn_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->datanode_handles[result->dn_conn_count++] = node_handle; + } +} + +static void +get_current_cn_handles_internal(PGXCNodeAllHandles *result) +{ + PGXCNodeHandle *node_handle; + int i; + + result->coord_handles = (PGXCNodeHandle **) + palloc(NumCoords * sizeof(PGXCNodeHandle *)); + if (!result->coord_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + for (i = 0; i < NumCoords; i++) { node_handle = &co_handles[i]; if (node_handle->sock != NO_SOCKET) result->coord_handles[result->co_conn_count++] = node_handle; } - - return result; } -#ifdef __TBASE__ PGXCNodeAllHandles * get_sock_fatal_handles(void) { @@ -5455,4 +5556,32 @@ void PGXCGetAllDnOid(Oid *nodelist) } +#ifdef __TBASE__ +/* + * Return the name of ascii-minimized coordinator + */ +char* find_first_exec_cn(void) +{ + int i = 0; + char* result = co_handles[0].nodename; + + for (i = 1; i < NumCoords; i++) + { + result = (strcmp(co_handles[i].nodename, result) < 0) ? 
+ co_handles[i].nodename : + result; + } + + return result; +} + +/* + * Return whether I am the ascii-minimized coordinator + */ +bool is_first_exec_cn(char *first_cn) +{ + return strcmp(first_cn, PGXCNodeName) == 0; +} +#endif + #endif diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7af36bb3..8d5dc71a 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -671,6 +671,10 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; +#ifdef __TBASE__ +bool is_forward = false; +#endif + /* key pair to be used as object id while using advisory lock for backup */ Datum xc_lockForBackupKey1; Datum xc_lockForBackupKey2; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index ce95d95d..6103517e 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3468,9 +3468,11 @@ finish_xact_command(void) MemoryContextStats(TopMemoryContext); #endif - xact_started = false; - - } + xact_started = false; +#ifdef __TBASE__ + has_ddl = false; +#endif + } } @@ -5103,61 +5105,66 @@ PostgresMain(int argc, char *argv[], AuditProcessResultInfo(false); } #endif - /* - * Abort the current transaction in order to recover. - */ - AbortCurrentTransaction(); - - if (am_walsender) - WalSndErrorCleanup(); + /* + * Abort the current transaction in order to recover. + */ + AbortCurrentTransaction(); + + if (am_walsender) + WalSndErrorCleanup(); + + /* + * We can't release replication slots inside AbortTransaction() as we + * need to be able to start and abort transactions while having a slot + * acquired. But we never need to hold them across top level errors, + * so releasing here is fine. There's another cleanup in ProcKill() + * ensuring we'll correctly cleanup on FATAL errors as well. + */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + /* We also want to cleanup temporary slots on error. */ + ReplicationSlotCleanup(); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + + /* + * If we were handling an extended-query-protocol message, initiate + * skip till next Sync. This also causes us not to issue + * ReadyForQuery (until we get Sync). + */ + if (doing_extended_query_message) + ignore_till_sync = true; + + /* We don't have a transaction command open anymore */ + xact_started = false; - /* - * We can't release replication slots inside AbortTransaction() as we - * need to be able to start and abort transactions while having a slot - * acquired. But we never need to hold them across top level errors, - * so releasing here is fine. There's another cleanup in ProcKill() - * ensuring we'll correctly cleanup on FATAL errors as well. - */ - if (MyReplicationSlot != NULL) - ReplicationSlotRelease(); - - /* We also want to cleanup temporary slots on error. */ - ReplicationSlotCleanup(); - - /* - * Now return to normal top-level context and clear ErrorContext for - * next time. - */ - MemoryContextSwitchTo(TopMemoryContext); - FlushErrorState(); - - /* - * If we were handling an extended-query-protocol message, initiate - * skip till next Sync. This also causes us not to issue - * ReadyForQuery (until we get Sync). 
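A detail that is easy to miss in the reindented error path: the new per-backend DDL flag has to be cleared both when a transaction finishes normally and in the top-level error handler, otherwise an aborted DDL would leak the flag into the next transaction. A minimal model of that lifecycle, assuming hypothetical run_statement, finish_xact and recover_from_error helpers rather than the real PostgresMain loop:

#include <stdbool.h>
#include <stdio.h>

static bool has_ddl = false;   /* per-backend flag, as in the patch */

/* Hypothetical statement executor: DDL raises the flag and may fail. */
static int run_statement(const char *sql, bool is_ddl, bool fail)
{
    (void) sql;
    if (is_ddl)
        has_ddl = true;
    return fail ? -1 : 0;
}

static void finish_xact(void)         { has_ddl = false; }  /* normal commit path   */
static void recover_from_error(void)  { has_ddl = false; }  /* top-level error path */

int main(void)
{
    if (run_statement("CREATE TABLE t(a int)", true, true) != 0)
        recover_from_error();      /* error path must also clear the flag */
    else
        finish_xact();

    printf("has_ddl after recovery: %d\n", has_ddl);  /* 0: next txn starts clean */
    return 0;
}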
- */ - if (doing_extended_query_message) - ignore_till_sync = true; - - /* We don't have a transaction command open anymore */ - xact_started = false; - - /* - * If an error occurred while we were reading a message from the - * client, we have potentially lost track of where the previous - * message ends and the next one begins. Even though we have - * otherwise recovered from the error, we cannot safely read any more - * messages from the client, so there isn't much we can do with the - * connection anymore. - */ - if (pq_is_reading_msg()) - ereport(FATAL, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("terminating connection because protocol synchronization was lost"))); +#ifdef __TBASE__ + /* Clear DDL flag */ + has_ddl = false; +#endif - /* Now we can allow interrupts again */ - RESUME_INTERRUPTS(); - } + /* + * If an error occurred while we were reading a message from the + * client, we have potentially lost track of where the previous + * message ends and the next one begins. Even though we have + * otherwise recovered from the error, we cannot safely read any more + * messages from the client, so there isn't much we can do with the + * connection anymore. + */ + if (pq_is_reading_msg()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("terminating connection because protocol synchronization was lost"))); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + } #ifdef __TBASE__ /* for error code contrib */ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 983d04cc..b2b2b678 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1894,12 +1894,15 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, #endif { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - /* Avoid the start timestamp to be too old to execute on DNs */ - if(IsA(utilityStmt, VacuumStmt) || IsA(utilityStmt, AlterNodeStmt)) - { - snapshot = GetLocalTransactionSnapshot(); - }else - snapshot = GetTransactionSnapshot(); + /* Avoid the start timestamp to be too old to execute on DNs */ + if(IsA(utilityStmt, VacuumStmt) || IsA(utilityStmt, AlterNodeStmt)) + { + snapshot = GetLocalTransactionSnapshot(); + } + else + snapshot = GetTransactionSnapshot(); + + has_ddl = true; #else snapshot = GetTransactionSnapshot(); #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a7746d53..bad502d5 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,8 +146,12 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); +static bool forward_ddl(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; + +bool has_ddl; +bool enable_parallel_ddl; #endif #endif @@ -1730,6 +1734,36 @@ ProcessUtilityPost(PlannedStmt *pstmt, ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); } + +#ifdef __TBASE__ +static bool forward_ddl(Node *node, const char *queryString) +{ + Oid *oid_list = NULL; + char *first_cn = NULL; + + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) + return false; + + if (IsA(node,IndexStmt) && + castNode(IndexStmt,node)->concurrent) + return false; + + first_cn = find_first_exec_cn(); + if(is_first_exec_cn(first_cn)) + return false; + + oid_list = (Oid *) palloc0(sizeof(Oid)); + oid_list[0] = get_pgxc_nodeoid(first_cn); + + PGXCNodeSetParam(false, 
"is_forward", "true", 0); + pgxc_execute_on_nodes(1, oid_list, strdup(queryString)); + PGXCNodeSetParam(false, "is_forward", "false", 0); + + pfree(oid_list); + return true; +} +#endif + /* * standard_ProcessUtility itself deals only with utility commands for * which we do not provide event trigger support. Commands that do have diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ad023691..b78542f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -198,6 +198,7 @@ extern BackendId CoordSessionBackendId; extern bool PlpgsqlDebugPrint; /* used for get total size of session */ static int32 g_TotalMemorySize = 0; +extern bool enable_parallel_ddl; #endif static int GUC_check_errcode_value; @@ -2669,21 +2670,39 @@ static struct config_bool ConfigureNamesBool[] = #endif #ifdef __TBASE__ + { + {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("Enable lock account when login fail serval times."), + NULL + }, + &enable_lock_account, + false, + NULL, NULL, NULL + }, + { + {"lock_account_print", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("Enable print log in lock account procedure."), + NULL + }, + &lock_account_print, + false, + NULL, NULL, NULL + }, { - {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, - gettext_noop("Enable lock account when login fail serval times."), - NULL + {"enable_parallel_ddl", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable parallel DDL with no dead lock."), + NULL }, - &enable_lock_account, - false, + &enable_parallel_ddl, + true, NULL, NULL, NULL }, { - {"lock_account_print", PGC_SUSET, CUSTOM_OPTIONS, - gettext_noop("Enable print log in lock account procedure."), + {"is_forward", PGC_INTERNAL, CUSTOM_OPTIONS, + gettext_noop("Whether DDL is forwarded from another coordinator."), NULL }, - &lock_account_print, + &is_forward, false, NULL, NULL, NULL }, @@ -8180,7 +8199,8 @@ set_config_option(const char *name, const char *value, */ if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) - && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0)) + && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && + strcmp(name,"is_forward") != 0)) send_to_nodes = true; #endif diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 770b213c..475f117d 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -95,7 +95,10 @@ typedef enum } RemoteConnTypes; /* Determine remote connection type for a PGXC backend */ -extern int remoteConnType; +extern int remoteConnType; +#ifdef __TBASE__ +extern bool is_forward; +#endif /* Local node name and numer */ extern char *PGXCNodeName; @@ -123,10 +126,10 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE) -#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward == true) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward == false) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward == false) +#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) /* key pair to be used as object 
id while using advisory lock for backup */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 7dcb5bf7..fcd765c0 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -175,6 +175,8 @@ extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ +extern PGXCNodeAllHandles *get_current_cn_handles(void); +extern PGXCNodeAllHandles *get_current_dn_handles(void); extern PGXCNodeAllHandles * get_sock_fatal_handles(void); #endif extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); @@ -282,10 +284,12 @@ extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); #ifdef __TBASE__ void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input); -extern inline void pgxc_set_coordinator_proc_pid(int proc_pid); -extern inline int pgxc_get_coordinator_proc_pid(void); -extern inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); -extern inline TransactionId pgxc_get_coordinator_proc_vxid(void); +inline void pgxc_set_coordinator_proc_pid(int proc_pid); +inline int pgxc_get_coordinator_proc_pid(void); +inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); +inline TransactionId pgxc_get_coordinator_proc_vxid(void); +char* find_first_exec_cn(void); +bool is_first_exec_cn(char *first_cn); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index b1c3c0f4..1df0be74 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -66,5 +66,8 @@ extern bool pgxc_lock_for_utility_stmt(Node *parsetree); #ifdef __TBASE__ typedef void (*ErrcodeHookType) (ErrorData *edata, StringInfo buff); extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; + +extern bool has_ddl; +extern bool enable_parallel_ddl; #endif #endif /* UTILITY_H */ From 15465a3c04ed909089c8c58a010821c68ad7c11b Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 31 Aug 2020 20:11:19 +0800 Subject: [PATCH 036/578] Perfects comments ,names and format. --- src/backend/pgxc/pool/execRemote.c | 11 +++-- src/backend/pgxc/pool/pgxcnode.c | 69 ++++++++++++++--------------- src/backend/postmaster/postmaster.c | 2 +- src/backend/tcop/postgres.c | 35 ++++++++------- src/backend/tcop/pquery.c | 6 +-- src/backend/tcop/utility.c | 48 +++++++++++++------- src/backend/utils/misc/guc.c | 11 ++--- src/include/pgxc/pgxc.h | 9 ++-- src/include/pgxc/pgxcnode.h | 4 +- src/include/tcop/utility.h | 4 +- 10 files changed, 113 insertions(+), 86 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 452577db..36bdbed3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4662,15 +4662,15 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Release remote connection after completion. * * For DDL, DN will commit before CN does. - * Because DDLs normally have exclusive locks, then when CN gets committed, - * blocked user transactions will see DNs in a consistent state. + * Because DDLs normally have conflict locks, when CN gets committed, + * DNs will be in a consistent state for blocked user transactions. 
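The revised connection-classification macros hinge on the forwarding flag: a DDL forwarded from a peer coordinator arrives on a coordinator-to-coordinator connection, but must be planned and re-distributed as if it came from a client, so IsConnFromApp() becomes true and IsConnFromCoord() false while the flag is set. The stand-alone restatement below uses illustrative constants and function names (CONN_APP, conn_from_app, ...), not the real enum or macros:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative connection-origin values; the real ones live in pgxc.h. */
enum { CONN_APP, CONN_COORD, CONN_DATANODE };

static int  remote_conn_type   = CONN_COORD;  /* forwarded DDL arrives CN->CN          */
static bool is_forward_request = true;        /* set via the new GUC before forwarding */

static bool conn_from_app(void)   { return remote_conn_type == CONN_APP || is_forward_request; }
static bool conn_from_coord(void) { return remote_conn_type == CONN_COORD && !is_forward_request; }

int main(void)
{
    /* Treated as a client statement: the leader CN plans and distributes it. */
    printf("from app:   %d\n", conn_from_app());    /* 1 */
    printf("from coord: %d\n", conn_from_coord());  /* 0 */
    return 0;
}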
*/ static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) { int conn_count = 0; - if (!enable_parallel_ddl || !has_ddl) + if (!enable_parallel_ddl || !is_txn_has_parallel_ddl) { /* normal cases */ conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); @@ -6874,7 +6874,10 @@ ExecRemoteUtility(RemoteQuery *node) } } - /* Make the same for Coordinators */ + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ while (co_conn_count > 0) { int i = 0; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8aa01b1a..812502d6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -161,7 +161,7 @@ static int get_char(PGXCNodeHandle * conn, char *out); static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name); static ParamEntry * paramentry_copy(ParamEntry * src_entry); static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len); -static PGXCNodeAllHandles * make_PGXCNodeAllHandles(); +static PGXCNodeAllHandles * get_empty_handles(void); static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); #endif @@ -3913,10 +3913,10 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool #ifdef __TBASE__ static PGXCNodeAllHandles * -make_PGXCNodeAllHandles() +get_empty_handles(void) { PGXCNodeAllHandles *result; - result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); + result = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); if (!result) { ereport(ERROR, @@ -3924,12 +3924,6 @@ make_PGXCNodeAllHandles() errmsg("out of memory"))); } - result->primary_handle = NULL; - result->co_conn_count = 0; - result->dn_conn_count = 0; - result->coord_handles = NULL; - result->datanode_handles = NULL; - return result; } #endif @@ -3937,27 +3931,23 @@ make_PGXCNodeAllHandles() PGXCNodeAllHandles * get_current_handles(void) { -#ifndef __TBASE__ - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); +#ifdef __TBASE__ + PGXCNodeAllHandles *result = get_empty_handles(); #else PGXCNodeAllHandles *result; PGXCNodeHandle *node_handle; int i; - result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); - if (!result) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - result->primary_handle = NULL; - result->co_conn_count = 0; - result->dn_conn_count = 0; + result = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); + if (!result) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } #endif -#ifdef __TBASE__ +#ifndef __TBASE__ result->datanode_handles = (PGXCNodeHandle **) palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); if (!result->datanode_handles) @@ -4003,7 +3993,7 @@ get_current_handles(void) PGXCNodeAllHandles * get_current_cn_handles(void) { - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + PGXCNodeAllHandles *result = get_empty_handles(); get_current_cn_handles_internal(result); return result; @@ -4012,7 +4002,7 @@ get_current_cn_handles(void) PGXCNodeAllHandles * get_current_dn_handles(void) { - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + PGXCNodeAllHandles *result = get_empty_handles(); get_current_dn_handles_internal(result); return result; @@ -4033,11 +4023,14 @@ get_current_dn_handles_internal(PGXCNodeAllHandles *result) 
errmsg("out of memory"))); } + result->dn_conn_count = 0; for (i = 0; i < NumDataNodes; i++) { node_handle = &dn_handles[i]; if (node_handle->sock != NO_SOCKET) + { result->datanode_handles[result->dn_conn_count++] = node_handle; + } } } @@ -4056,11 +4049,14 @@ get_current_cn_handles_internal(PGXCNodeAllHandles *result) errmsg("out of memory"))); } + result->co_conn_count = 0; for (i = 0; i < NumCoords; i++) { node_handle = &co_handles[i]; if (node_handle->sock != NO_SOCKET) + { result->coord_handles[result->co_conn_count++] = node_handle; + } } } @@ -5558,27 +5554,30 @@ void PGXCGetAllDnOid(Oid *nodelist) #ifdef __TBASE__ /* - * Return the name of ascii-minimized coordinator + * Return the name of ascii-minimized coordinator as ddl leader cn */ -char* find_first_exec_cn(void) +inline char* +find_ddl_leader_cn(void) { int i = 0; - char* result = co_handles[0].nodename; + char* result = NULL; - for (i = 1; i < NumCoords; i++) + for (i = 0; i < NumCoords; i++) { - result = (strcmp(co_handles[i].nodename, result) < 0) ? - co_handles[i].nodename : - result; + if(result == NULL || strcmp(co_handles[i].nodename, result) < 0) + { + result = co_handles[i].nodename; + } } - return result; + return pstrdup(result); } /* - * Return whether I am the ascii-minimized coordinator + * Return whether I am the leader cn */ -bool is_first_exec_cn(char *first_cn) +inline bool +is_ddl_leader_cn(char *first_cn) { return strcmp(first_cn, PGXCNodeName) == 0; } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 8d5dc71a..1f8c33a5 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -672,7 +672,7 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; #ifdef __TBASE__ -bool is_forward = false; +bool is_forward_request = false; #endif /* key pair to be used as object id while using advisory lock for backup */ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 6103517e..80b2fc4f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -975,8 +975,9 @@ pg_analyze_and_rewrite_params(RawStmt *parsetree, */ static List * pg_rewrite_query(Query *query) -{// #lizard forgives - List *querytree_list; +{ + List *querytree_list; + char *leader_cn = NULL; if (Debug_print_parse) elog_node_display(LOG, "parse tree", query, @@ -986,17 +987,21 @@ pg_rewrite_query(Query *query) ResetUsage(); #ifdef PGXC + /* directly forward the request */ + leader_cn = find_ddl_leader_cn(); + if (query->commandType == CMD_UTILITY && - IsA(query->utilityStmt, CreateTableAsStmt)) - { - /* - * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the - * target table is created first. The SELECT query is then transformed - * into an INSERT INTO statement - */ - querytree_list = QueryRewriteCTAS(query); - } - else + IsA(query->utilityStmt, CreateTableAsStmt) && + (enable_parallel_ddl && is_ddl_leader_cn(leader_cn))) + { + /* + * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the + * target table is created first. 
The SELECT query is then transformed + * into an INSERT INTO statement + */ + querytree_list = QueryRewriteCTAS(query); + } + else #endif if (query->commandType == CMD_UTILITY) { @@ -3470,7 +3475,7 @@ finish_xact_command(void) xact_started = false; #ifdef __TBASE__ - has_ddl = false; + is_txn_has_parallel_ddl = false; #endif } } @@ -5145,8 +5150,8 @@ PostgresMain(int argc, char *argv[], xact_started = false; #ifdef __TBASE__ - /* Clear DDL flag */ - has_ddl = false; + /* Clear parallel DDL flag */ + is_txn_has_parallel_ddl = false; #endif /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index b2b2b678..67cf48ae 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1900,9 +1900,9 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, snapshot = GetLocalTransactionSnapshot(); } else - snapshot = GetTransactionSnapshot(); - - has_ddl = true; + { + snapshot = GetTransactionSnapshot(); + } #else snapshot = GetTransactionSnapshot(); #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bad502d5..cd0252f9 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,11 +146,11 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); -static bool forward_ddl(Node *node, const char *queryString); +static bool forward_ddl_to_leader_cn(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; -bool has_ddl; +bool is_txn_has_parallel_ddl; bool enable_parallel_ddl; #endif @@ -1736,30 +1736,46 @@ ProcessUtilityPost(PlannedStmt *pstmt, } #ifdef __TBASE__ -static bool forward_ddl(Node *node, const char *queryString) +/* + * Forward specific DDLs request to leader cn + * on success return true else false + */ +static bool forward_ddl_to_leader_cn(Node *node, const char *queryString) { - Oid *oid_list = NULL; - char *first_cn = NULL; + Oid leader_cn = InvalidOid; + char *leader_name = NULL; - if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) + /* avoid forward recurse */ + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR || is_forward_request) + { return false; + } - if (IsA(node,IndexStmt) && - castNode(IndexStmt,node)->concurrent) + /* CONCURRENT INDEX is not supported */ + if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) + { return false; + } - first_cn = find_first_exec_cn(); - if(is_first_exec_cn(first_cn)) + /* Set parallel ddl flag */ + is_txn_has_parallel_ddl = true; + + leader_name = find_ddl_leader_cn(); + if(is_ddl_leader_cn(leader_name)) + { return false; + } + + leader_cn = get_pgxc_nodeoid(leader_name); + + /* Set flag to indicate forwarded request */ + PGXCNodeSetParam(false, "is_forward_request", "true", 0); - oid_list = (Oid *) palloc0(sizeof(Oid)); - oid_list[0] = get_pgxc_nodeoid(first_cn); + pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); - PGXCNodeSetParam(false, "is_forward", "true", 0); - pgxc_execute_on_nodes(1, oid_list, strdup(queryString)); - PGXCNodeSetParam(false, "is_forward", "false", 0); + /* Cancel forwarded flag for subsequent requests */ + PGXCNodeSetParam(false, "is_forward_request", "false", 0); - pfree(oid_list); return true; } #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b78542f6..93dc7020 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2690,7 +2690,7 @@ static struct 
config_bool ConfigureNamesBool[] = }, { {"enable_parallel_ddl", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Enable parallel DDL with no dead lock."), + gettext_noop("Enable parallel DDL with no deadlock."), NULL }, &enable_parallel_ddl, @@ -2698,11 +2698,12 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"is_forward", PGC_INTERNAL, CUSTOM_OPTIONS, + {"is_forward_request", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether DDL is forwarded from another coordinator."), - NULL + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_AUTO_FILE | GUC_DISALLOW_IN_FILE | GUC_NO_SHOW_ALL }, - &is_forward, + &is_forward_request, false, NULL, NULL, NULL }, @@ -8200,7 +8201,7 @@ set_config_option(const char *name, const char *value, if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && - strcmp(name,"is_forward") != 0)) + strcmp(name,"is_forward_request") != 0)) send_to_nodes = true; #endif diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 475f117d..9f3ed6f5 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -97,7 +97,8 @@ typedef enum /* Determine remote connection type for a PGXC backend */ extern int remoteConnType; #ifdef __TBASE__ -extern bool is_forward; +/* Is request forwarded another coordinator */ +extern bool is_forward_request; #endif /* Local node name and numer */ @@ -126,9 +127,9 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward == true) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward == false) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward == false) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward_request == true) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward_request == false) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward_request == false) #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index fcd765c0..872536fc 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -288,8 +288,8 @@ inline void pgxc_set_coordinator_proc_pid(int proc_pid); inline int pgxc_get_coordinator_proc_pid(void); inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); inline TransactionId pgxc_get_coordinator_proc_vxid(void); -char* find_first_exec_cn(void); -bool is_first_exec_cn(char *first_cn); +inline char* find_ddl_leader_cn(void); +inline bool is_ddl_leader_cn(char *leader_cn); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 1df0be74..92605dff 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -67,7 +67,9 @@ extern bool pgxc_lock_for_utility_stmt(Node *parsetree); typedef void (*ErrcodeHookType) (ErrorData *edata, StringInfo buff); extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; -extern bool has_ddl; +/* Does txn include parallel DDLs */ +extern bool is_txn_has_parallel_ddl; +/* Parallel DDL switch */ extern bool enable_parallel_ddl; #endif #endif /* UTILITY_H */ From 3837cd66de9e0c7b41a0dd41448920ef54a378a8 Mon Sep 17 00:00:00 2001 From: 
youngxie Date: Mon, 31 Aug 2020 20:27:13 +0800 Subject: [PATCH 037/578] Fix under single node mode. --- src/backend/pgxc/pool/pgxcnode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 812502d6..f2886833 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5570,7 +5570,10 @@ find_ddl_leader_cn(void) } } - return pstrdup(result); + if(result) + result = pstrdup(result); + + return result; } /* @@ -5579,6 +5582,9 @@ find_ddl_leader_cn(void) inline bool is_ddl_leader_cn(char *first_cn) { + if(first_cn == NULL) + return false; + return strcmp(first_cn, PGXCNodeName) == 0; } #endif From 3a33e59fa7f8bba084f6b4617ea4472006e36ea6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 1 Sep 2020 10:48:45 +0800 Subject: [PATCH 038/578] regress fix --- src/test/regress/expected/sysviews.out | 3 ++- src/test/regress/expected/xc_misc.out | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ded66084..e2765dd9 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -111,6 +111,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop | on enable_null_string | off enable_oracle_compatible | off + enable_parallel_ddl | on enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on @@ -124,7 +125,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(52 rows) +(53 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/expected/xc_misc.out b/src/test/regress/expected/xc_misc.out index 75d207cc..0c894fdd 100644 --- a/src/test/regress/expected/xc_misc.out +++ b/src/test/regress/expected/xc_misc.out @@ -55,7 +55,7 @@ SET check_function_bodies = false; create function f1 () returns setof my_tab1 as $$ create table my_tab2 (a int); select * from my_tab1; $$ language sql; ERROR: function "f1" already exists with same argument types select f1(); -ERROR: Unexpected response from Datanode +ERROR: Unexpected response from coordinator CONTEXT: SQL function "f1" statement 1 SET check_function_bodies = true; drop function f1(); From fe33aaa6cf030b59021cabaf473159e6ea876d9a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 1 Sep 2020 14:53:43 +0800 Subject: [PATCH 039/578] Support pullup agg sublink with ScalarArrayOpExpr qual --- src/backend/optimizer/plan/subselect.c | 456 +++++++------ src/test/regress/expected/subselect.out | 34 + src/test/regress/expected/subselect_1.out | 772 ++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 188 +++--- 4 files changed, 1158 insertions(+), 292 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 05a21a79..8bb67513 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1516,9 +1516,8 @@ simplify_ALL_query(PlannerInfo *root, Query *query) } /* - * if whereclause contains 'not' boolexpr or not equal opexpr, - * return true. - */ + * If where clause contains 'not' BoolExpr or not-equal OpExpr, return true. 
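Leader election for parallel DDL is deliberately trivial: every coordinator independently derives the same leader by taking the lexicographically smallest coordinator name, and a node knows it is the leader when that name matches its own PGXCNodeName; the single-node fix above simply makes the lookup NULL-safe. A self-contained sketch of the same rule over a plain string array (choose_leader and local_name are hypothetical names, and strdup stands in for pstrdup):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pick the lexicographically smallest name; NULL when the list is empty. */
static char *choose_leader(const char **names, int n)
{
    const char *best = NULL;
    int i;

    for (i = 0; i < n; i++)
        if (best == NULL || strcmp(names[i], best) < 0)
            best = names[i];

    return best ? strdup(best) : NULL;   /* the patch uses pstrdup() */
}

int main(void)
{
    const char *coords[] = { "coord2", "coord1", "coord3" };
    const char *local_name = "coord1";   /* stands in for PGXCNodeName */
    char *leader = choose_leader(coords, 3);

    /* NULL-safe leader test, mirroring is_ddl_leader_cn() after the fix. */
    int am_leader = (leader != NULL && strcmp(leader, local_name) == 0);

    printf("leader=%s, am_leader=%d\n", leader ? leader : "(none)", am_leader);
    free(leader);
    return 0;
}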
+ */ static bool contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) {// #lizard forgives @@ -1536,7 +1535,7 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) if(!check_or) return true; - /* look for common expr */ + /* Look for common EXPR */ foreach(cell, expr->args) { List *cur = NIL; @@ -1561,11 +1560,11 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) return false; } - /* and expr */ - foreach(cell, expr->args) - { - bool result; - Node *arg = lfirst(cell); + /* AND EXPR */ + foreach(cell, expr->args) + { + bool result; + Node *arg = lfirst(cell); result = contain_notexpr_or_neopexpr(arg, check_or, joinquals); @@ -1588,16 +1587,17 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) *joinquals = lappend(*joinquals, expr); - + /* Make sure the operator is hashjoinable */ if (!op_hashjoinable(expr->opno, exprType((Node *)lexpr))) { return true; - } - - foreach(cell, expr->args) - { - bool result; - Node *arg = lfirst(cell); + } + + /* Check the operands of the OpExpr */ + foreach(cell, expr->args) + { + bool result; + Node *arg = lfirst(cell); result = contain_notexpr_or_neopexpr(arg, check_or, joinquals); @@ -1616,11 +1616,69 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) bool result; RelabelType *label = (RelabelType *)whereclause; - result = contain_notexpr_or_neopexpr((Node *)label->arg, check_or, joinquals); - if (result) - return true; - return false; - } + result = contain_notexpr_or_neopexpr((Node *)label->arg, + check_or, + joinquals); + if (result) + return true; + return false; + } + /* In case the where clause is "tbl.col_a IN ('0','1')" */ + else if (IsA(whereclause, ScalarArrayOpExpr)) + { + ListCell *lc = NULL; + ScalarArrayOpExpr *scalarArray = (ScalarArrayOpExpr*)whereclause; + Expr *lexpr = linitial(scalarArray->args); + + if (!op_hashjoinable(scalarArray->opno, exprType((Node *)lexpr))) + { + return true; + } + + foreach(lc, scalarArray->args) + { + if (contain_notexpr_or_neopexpr((Node *)lfirst(lc), + check_or, + joinquals)) + { + return true; + } + } + + return false; + } + /* + * The right operand of ScalarArrayOpExpr, we only support array of + * constant values + */ + else if (IsA(whereclause, ArrayExpr)) + { + ListCell *lc = NULL; + ArrayExpr *arrayExpr = (ArrayExpr*)whereclause; + + foreach(lc, arrayExpr->elements) + { + if (!IsA((Node *)lfirst(lc), Const)) + { + return true; + } + } + + return false; + } + /* In case the where clause is "tbl.col_a is(is not) NULL" */ + else if (IsA(whereclause, NullTest)) + { + NullTest *nullTestExpr = (NullTest *)whereclause; + + if (contain_notexpr_or_neopexpr((Node *)nullTestExpr->arg, + check_or, + joinquals)) + { + return true; + } + return false; + } return true; } @@ -2785,15 +2843,15 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis TargetEntry * convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) { - Query *parse = root->parse; - Node *whereClause = NULL; - Query *subselect = NULL; - JoinExpr *joinExpr = NULL; - ParseState *pstate = NULL; - SubLink *sublink = NULL; - RangeTblRef *rtr = NULL; - RangeTblEntry *rte = NULL; - Var *var = NULL; + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; + RangeTblEntry *rte = NULL; + Var *var = NULL; List *sublinks = NIL; /* 
Find sublinks in the targetlist entry */ @@ -2805,195 +2863,195 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) sublink = linitial(sublinks); - if (sublink->subLinkType != EXPR_SUBLINK) - return NULL; - - /* - * Copy object so that we can modify it. - */ - subselect = copyObject((Query *) sublink->subselect); - whereClause = subselect->jointree->quals; + if (sublink->subLinkType != EXPR_SUBLINK) + return NULL; /* - * Only one targetEntry can be handled. - */ - if (list_length(subselect->targetList) > 1) - return NULL; + * Copy object so that we can modify it. + */ + subselect = copyObject((Query *) sublink->subselect); + whereClause = subselect->jointree->quals; - /* - * The SubQuery must have a non-empty JoinTree, else we won't have a join. - */ - if (subselect->jointree->fromlist == NIL) - return NULL; + /* + * Only one targetEntry can be handled. + */ + if (list_length(subselect->targetList) > 1) + return NULL; - /* - * What we can not optimize. - */ - if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || - subselect->setOperations || subselect->groupingSets || - subselect->groupClause || subselect->hasWindowFuncs || - subselect->hasTargetSRFs || subselect->hasModifyingCTE || - subselect->havingQual || subselect->limitOffset || - subselect->limitCount || subselect->rowMarks || - subselect->cteList || subselect->sortClause) - { - return NULL; - } + /* + * The SubQuery must have a non-empty JoinTree, else we won't have a join. + */ + if (subselect->jointree->fromlist == NIL) + return NULL; - /* - * On one hand, the WHERE clause must contain some Vars of the - * parent query, else it's not gonna be a join. - */ - if (!contain_vars_of_level(whereClause, 1)) - return NULL; + /* + * What we can not optimize. + */ + if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || + subselect->setOperations || subselect->groupingSets || + subselect->groupClause || subselect->hasWindowFuncs || + subselect->hasTargetSRFs || subselect->hasModifyingCTE || + subselect->havingQual || subselect->limitOffset || + subselect->limitCount || subselect->rowMarks || + subselect->cteList || subselect->sortClause) + { + return NULL; + } - /* - * We don't risk optimizing if the WHERE clause is volatile, either. - */ - if (contain_volatile_functions(whereClause)) - return NULL; + /* + * On one hand, the WHERE clause must contain some Vars of the + * parent query, else it's not gonna be a join. + */ + if (!contain_vars_of_level(whereClause, 1)) + return NULL; - /* - * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) - */ - subselect->jointree->quals = NULL; - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - subselect->jointree->quals = whereClause; + /* + * We don't risk optimizing if the WHERE clause is volatile, either. + */ + if (contain_volatile_functions(whereClause)) + return NULL; - if (subselect->hasAggs) - { - List *joinquals = NULL; - List *vars = NULL; - TargetEntry *ent = NULL; - ListCell *cell = NULL; - int ressortgroupref = 0; - int varno = 0; + /* + * The rest of the sub-select must not refer to any Vars of the parent + * query. (Vars of higher levels should be okay, though.) 
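The eligibility test extended in this patch amounts to a recursive walk that accepts only AND/OR trees of hash-joinable equality quals, IN lists whose elements are all constants (a ScalarArrayOpExpr over an ArrayExpr of Const nodes), and IS [NOT] NULL tests; anything else keeps the sublink from being pulled up. The sketch below models that walk over a drastically simplified node type rather than the real PostgreSQL Expr nodes, and inverts the sense of the real helper (which returns true when it finds something disqualifying):

#include <stdbool.h>
#include <stdio.h>

/* Drastically simplified expression nodes; the real code walks Expr trees. */
typedef enum
{
    N_AND, N_OR,            /* BoolExpr                                             */
    N_EQ_OP,                /* hash-joinable equality OpExpr, e.g. b.a = a.a        */
    N_NE_OP,                /* not-equal OpExpr, disqualifying                      */
    N_IN_CONST_LIST,        /* ScalarArrayOpExpr over constants, e.g. a.b IN (1,2)  */
    N_NULLTEST              /* a.b IS [NOT] NULL                                    */
} NodeKind;

typedef struct Node
{
    NodeKind     kind;
    struct Node *left;      /* children, used by N_AND / N_OR */
    struct Node *right;
} Node;

/* True when the WHERE clause is safe to turn into join/group-by quals. */
static bool clause_is_pullupable(const Node *n)
{
    if (n == NULL)
        return true;

    switch (n->kind)
    {
        case N_AND:
        case N_OR:
            return clause_is_pullupable(n->left) &&
                   clause_is_pullupable(n->right);
        case N_EQ_OP:
        case N_IN_CONST_LIST:
        case N_NULLTEST:
            return true;
        default:            /* <>, NOT, volatile funcs, non-constant IN lists, ... */
            return false;
    }
}

int main(void)
{
    Node eq   = { N_EQ_OP, NULL, NULL };
    Node in   = { N_IN_CONST_LIST, NULL, NULL };
    Node ne   = { N_NE_OP, NULL, NULL };
    Node good = { N_AND, &eq, &in };
    Node bad  = { N_AND, &eq, &ne };

    printf("b.a = a.a AND a.b IN (1,2): %d\n", clause_is_pullupable(&good));  /* 1 */
    printf("b.a = a.a AND b.b <> 1:     %d\n", clause_is_pullupable(&bad));   /* 0 */
    return 0;
}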
+ */ + subselect->jointree->quals = NULL; + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; + subselect->jointree->quals = whereClause; - /* process 'op' and 'bool' expr only */ - if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) - return NULL; + if (subselect->hasAggs) + { + List *joinquals = NULL; + List *vars = NULL; + TargetEntry *ent = NULL; + ListCell *cell = NULL; + int ressortgroupref = 0; + int varno = 0; + + /* process 'op' and 'bool' expr only */ + if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) + return NULL; - vars = pull_vars_of_level((Node *) joinquals, 0); + vars = pull_vars_of_level((Node *) joinquals, 0); - /* construct groupby clause */ + /* construct groupby clause */ foreach (cell, vars) { - Oid sortop; - Oid eqop; - bool hashable; - Oid restype; - SortGroupClause *grpcl; - Var *var = (Var *) lfirst(cell); - RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); - - if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) - return NULL; - - restype = exprType((Node *) var); - - grpcl = makeNode(SortGroupClause); - ressortgroupref++; - - if (tbl->rtekind == RTE_RELATION) - { - ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, - get_relid_attribute_name(tbl->relid, var->varoattno), false); - } - else - { - int plan_id; - int ndx; - ListCell *lc; - Plan *cte_plan; - TargetEntry *cte_ent = NULL; - - /* - * Note: cte_plan_ids can be shorter than cteList, if we are still working - * on planning the CTEs (ie, this is a side-reference from another CTE). - * So we mustn't use forboth here. - */ - ndx = 0; - foreach (lc, root->parse->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); - - if (strcmp(cte->ctename, tbl->ctename) == 0) - break; - ndx++; - } - if (lc == NULL) /* shouldn't happen */ - elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); - if (ndx >= list_length(root->cte_plan_ids)) - elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); - plan_id = list_nth_int(root->cte_plan_ids, ndx); - cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); - cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); - ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); - } - - ent->ressortgroupref = ressortgroupref; - - subselect->targetList = lappend(subselect->targetList, ent); - - varno = list_length(subselect->targetList); - ent->resno = varno; + Oid sortop; + Oid eqop; + bool hashable; + Oid restype; + SortGroupClause *grpcl; + Var *var = (Var *) lfirst(cell); + RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); + + if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) + return NULL; + + restype = exprType((Node *) var); + + grpcl = makeNode(SortGroupClause); + ressortgroupref++; + + if (tbl->rtekind == RTE_RELATION) + { + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, + get_relid_attribute_name(tbl->relid, var->varoattno), false); + } + else + { + int plan_id; + int ndx; + ListCell *lc; + Plan *cte_plan; + TargetEntry *cte_ent = NULL; + + /* + * Note: cte_plan_ids can be shorter than cteList, if we are still working + * on planning the CTEs (ie, this is a side-reference from another CTE). + * So we mustn't use forboth here. 
+ */ + ndx = 0; + foreach (lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + + if (strcmp(cte->ctename, tbl->ctename) == 0) + break; + ndx++; + } + if (lc == NULL) /* shouldn't happen */ + elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); + if (ndx >= list_length(root->cte_plan_ids)) + elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); + plan_id = list_nth_int(root->cte_plan_ids, ndx); + cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); + cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); + } + + ent->ressortgroupref = ressortgroupref; + + subselect->targetList = lappend(subselect->targetList, ent); + + varno = list_length(subselect->targetList); + ent->resno = varno; + + /* determine the eqop and optional sortop */ + get_sort_group_operators(restype, + false, true, false, + &sortop, &eqop, NULL, + &hashable); + + grpcl->tleSortGroupRef = ressortgroupref; + grpcl->eqop = eqop; + grpcl->sortop = sortop; + grpcl->nulls_first = false; /* OK with or without sortop */ + grpcl->hashable = hashable; + + subselect->groupClause = lappend(subselect->groupClause, grpcl); + } + } - /* determine the eqop and optional sortop */ - get_sort_group_operators(restype, - false, true, false, - &sortop, &eqop, NULL, - &hashable); + /* + * Move sub-select to the parent query. + */ + pstate = make_parsestate(NULL); + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("TARGETLIST_subquery", NIL), + true, + false); + parse->rtable = lappend(parse->rtable, rte); - grpcl->tleSortGroupRef = ressortgroupref; - grpcl->eqop = eqop; - grpcl->sortop = sortop; - grpcl->nulls_first = false; /* OK with or without sortop */ - grpcl->hashable = hashable; + rtr = makeNode(RangeTblRef); + rtr->rtindex = list_length(parse->rtable); - subselect->groupClause = lappend(subselect->groupClause, grpcl); - } - } + /* + * Form join node. + */ + joinExpr = makeNode(JoinExpr); + joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; + joinExpr->isNatural = false; + joinExpr->larg = (Node *) root->parse->jointree; + joinExpr->rarg = (Node *) rtr; + joinExpr->usingClause = NIL; + joinExpr->alias = NULL; + joinExpr->rtindex = 0; /* we don't need an RTE for it */ + joinExpr->quals = NULL; - /* - * Move sub-select to the parent query. - */ - pstate = make_parsestate(NULL); - rte = addRangeTableEntryForSubquery(pstate, - subselect, - makeAlias("TARGETLIST_subquery", NIL), - true, - false); - parse->rtable = lappend(parse->rtable, rte); + /* Wrap join node in FromExpr as required. */ + parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - rtr = makeNode(RangeTblRef); - rtr->rtindex = list_length(parse->rtable); + /* Build a Var pointing to the subquery */ + var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - /* - * Form join node. - */ - joinExpr = makeNode(JoinExpr); - joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; - joinExpr->isNatural = false; - joinExpr->larg = (Node *) root->parse->jointree; - joinExpr->rarg = (Node *) rtr; - joinExpr->usingClause = NIL; - joinExpr->alias = NULL; - joinExpr->rtindex = 0; /* we don't need an RTE for it */ - joinExpr->quals = NULL; - - /* Wrap join node in FromExpr as required. 
*/ - parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - - /* Build a Var pointing to the subquery */ - var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - - /* Replace sublink node with Var. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, - sublink, + /* Replace sublink node with Var. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, (Node *)var); - return entry; + return entry; } #endif diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 79708c41..432740af 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1895,6 +1895,40 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b = ANY ('{1,2}'::integer[])) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index e8cd553a..79ebb522 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -1167,3 +1167,775 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); +-- +-- Tests for pulling up more sublinks +-- +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. 
+insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a + -> Materialize + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = 5) +(12 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + a | q +---+--- + 1 | + 2 | + 5 | 5 + 6 | + 8 | + 9 | +(6 rows) + +-- check distinct scenario. 
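The "Left Scalar Join" plans above preserve scalar-subquery semantics after pull-up: when the inner side yields a second match for the same outer row, execution must raise the usual "more than one row returned by a subquery used as an expression" error instead of emitting extra rows, while unmatched outer rows still produce NULL. A toy nested-loop version of that check, with plain arrays standing in for plan nodes:

#include <stdio.h>

/* Toy tables: outer a(a) and inner b(a); the inner side duplicates key 2. */
static const int outer_rows[] = { 1, 2, 3 };
static const int inner_rows[] = { 2, 2, 3 };

int main(void)
{
    size_t i, j;

    for (i = 0; i < sizeof(outer_rows) / sizeof(outer_rows[0]); i++)
    {
        int matches = 0;
        int value = 0;

        for (j = 0; j < sizeof(inner_rows) / sizeof(inner_rows[0]); j++)
        {
            if (inner_rows[j] == outer_rows[i])
            {
                if (++matches > 1)
                {
                    /* Same error the regression test expects in the non-scalar case. */
                    fprintf(stderr, "more than one row returned by a subquery used as an expression\n");
                    return 1;
                }
                value = inner_rows[j];
            }
        }

        /* Left-join semantics: unmatched outer rows still come out, with NULL. */
        if (matches == 0)
            printf("%d | (null)\n", outer_rows[i]);
        else
            printf("%d | %d\n", outer_rows[i], value);
    }
    return 0;
}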
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; +-- targetlist sublink with agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".sum + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".count + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on 
tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(12 rows) + +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 + +(10 rows) + +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Seq Scan on tbl_b b + Filter: (((a = a.a) AND (b = a.b)) OR (a = 1)) +(10 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (CASE WHEN (a.b = 1) THEN b.a ELSE 0 END) + -> Hash Left Scalar Join + Hash Cond: ((a.a = b.a) AND (a.b = b.b)) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b = ANY ('{1,2}'::integer[])) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in 
(1,2)) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN 
+-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +---------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + aa + ab + ba + bb + aaaa + aaab + aaba + aabb + abaa + abab + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect 
CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index a7ba5190..82efdc9c 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -599,99 +599,6 @@ select * from drop function tattle(x int, y int); --- --- Tests for CTE inlining behavior --- - --- Basic subquery that can be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - --- Explicitly request materialization -explain (verbose, costs off) -with x as materialized (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - --- Stable functions are safe to inline -explain (verbose, costs off) -with x as (select * from (select f1, now() from subselect_tbl) ss) -select * from x where f1 = 1; - --- Volatile functions prevent inlining -explain (verbose, costs off) -with x as (select * from (select f1, random() from subselect_tbl) ss) -select * from x where f1 = 1; - --- SELECT FOR UPDATE cannot be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl for update) ss) -select * from x where f1 = 1; - --- Multiply-referenced CTEs are inlined only when requested -explain (verbose, costs off) -with x as (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - -explain (verbose, costs off) -with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - --- Multiply-referenced CTEs can't be inlined if they contain outer self-refs -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - --- Check handling of outer references -explain (verbose, costs off) -with x as (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - -explain (verbose, costs off) -with x as materialized (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - --- Ensure that we inline the currect CTE 
when there are --- multiple CTEs with the same name -explain (verbose, costs off) -with x as (select 1 as y) -select * from (with x as (select 2 as y) select * from x) ss; - --- Row marks are not pushed into CTEs -explain (verbose, costs off) -with x as (select * from subselect_tbl) -select * from x for update; - -- -- Tests for pulling up more sublinks -- @@ -781,7 +688,102 @@ explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; + +-- +-- Tests for CTE inlining behavior +-- + +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +-- Check handling of outer references +explain (verbose, costs off) +with x as 
(select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; From 7902557b5849c93352454a486fad80c42bcaadc2 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 1 Sep 2020 17:00:20 +0800 Subject: [PATCH 040/578] Support pullup agg sublink with NullTest qual --- src/backend/optimizer/plan/subselect.c | 4 +-- src/test/regress/expected/subselect.out | 34 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 3 ++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 8bb67513..7d799073 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1654,7 +1654,7 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) else if (IsA(whereclause, ArrayExpr)) { ListCell *lc = NULL; - ArrayExpr *arrayExpr = (ArrayExpr*)whereclause; + ArrayExpr *arrayExpr = (ArrayExpr *)whereclause; foreach(lc, arrayExpr->elements) { @@ -2866,7 +2866,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (sublink->subLinkType != EXPR_SUBLINK) return NULL; - /* + /* * Copy object so that we can modify it. */ subselect = copyObject((Query *) sublink->subselect); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 432740af..e31f5f94 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1929,6 +1929,40 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and (10 rows) +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b IS NOT NULL) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 82efdc9c..68fa867e 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -690,7 +690,8 @@ explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b wher select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from 
tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 41f2a048e1c58bb5b1038c5ca984318c3715e22e Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 2 Sep 2020 14:50:50 +0800 Subject: [PATCH 041/578] Fix pullup count agg subquery in targetlist --- src/backend/optimizer/plan/subselect.c | 59 +++++++++++++++++++++---- src/test/regress/expected/subselect.out | 12 ++--- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 7d799073..483fdedb 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2851,8 +2851,9 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) SubLink *sublink = NULL; RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Var *var = NULL; - List *sublinks = NIL; + Node *target = NULL; + List *sublinks = NIL; + bool count_agg = false; /* Find sublinks in the targetlist entry */ find_sublink_walker((Node *)entry->expr, &sublinks); @@ -2922,17 +2923,49 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (subselect->hasAggs) { + int ressortgroupref = 0; + int varno = 0; List *joinquals = NULL; List *vars = NULL; TargetEntry *ent = NULL; ListCell *cell = NULL; - int ressortgroupref = 0; - int varno = 0; + char *name = NULL; + Aggref *agg = NULL; + Node *expr = linitial(subselect->targetList); /* process 'op' and 'bool' expr only */ if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) return NULL; + expr = (Node *)((TargetEntry *)expr)->expr; + /* + * First node must be Agg. + * we optimize subquery only like "SELECT agg()", + * others will not be optimized for now. 
+ */ + if (!IsA(expr, Aggref)) + return NULL; + + agg = (Aggref *)expr; + name = get_func_name(agg->aggfnoid); + if(!name) + { + return NULL; + } + + /* count agg */ + if (pg_strcasecmp(name, "count") == 0) + { + count_agg = true; + } + /* strict aggs are allowed */ + else if (pg_strcasecmp(name, "max") != 0 && pg_strcasecmp(name, "min") != 0 && + pg_strcasecmp(name, "stddev") != 0 && pg_strcasecmp(name, "sum") != 0 && + pg_strcasecmp(name, "avg") != 0 && pg_strcasecmp(name, "variance") != 0) + { + return NULL; + } + vars = pull_vars_of_level((Node *) joinquals, 0); /* construct groupby clause */ @@ -3045,12 +3078,20 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); /* Build a Var pointing to the subquery */ - var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + target = (Node *)makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + + /* Add Coalesce(count,0) */ + if (count_agg) + { + CoalesceExpr *coalesce = makeNode(CoalesceExpr); + coalesce->args = list_make2(target, + makeConst(INT8OID, -1, InvalidOid, sizeof(int64), Int64GetDatum(0), false, true)); + coalesce->coalescetype = INT8OID; + target = (Node *) coalesce; + } - /* Replace sublink node with Var. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, - sublink, - (Node *)var); + /* Replace sublink node with Result. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, sublink, target); return entry; } #endif diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index e31f5f94..4b7dda27 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1776,7 +1776,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro QUERY PLAN ----------------------------------------------------------------------- Sort - Sort Key: "TARGETLIST_subquery".count + Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1792,6 +1792,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; count ------- + 0 1 1 1 @@ -1801,7 +1802,6 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 1 1 2 - (10 rows) explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; @@ -1896,10 +1896,10 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1917,6 +1917,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from 
tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; case ------ + 0 1 1 1 @@ -1926,7 +1927,6 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; From bf4fd50c7846ef4683e796f78a8bbdbc769f8e4a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 2 Sep 2020 15:31:01 +0800 Subject: [PATCH 042/578] update subselect test expect file --- src/test/regress/expected/subselect_1.out | 46 ++++++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index 79ebb522..cf0f6db9 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -1499,7 +1499,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro QUERY PLAN ----------------------------------------------------------------------- Sort - Sort Key: "TARGETLIST_subquery".count + Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1515,6 +1515,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; count ------- + 0 1 1 1 @@ -1524,7 +1525,6 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 1 1 2 - (10 rows) explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; @@ -1619,10 +1619,10 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1640,6 +1640,41 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; case ------ + 0 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b IS NOT NULL) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + case +------ + 0 1 1 1 @@ -1649,7 +1684,6 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) drop table tbl_a; From 7935999fdb5efc755acc532b97017750a3e774b3 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 2 Sep 2020 15:35:55 +0800 Subject: [PATCH 043/578] Fix coredump when alter gtm node --- src/backend/access/transam/gtm.c | 201 +++++++++++++++++-------------- src/backend/pgxc/pool/pgxcnode.c | 94 +++++++-------- 2 files changed, 155 insertions(+), 140 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 4545592b..267c5b88 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1081,100 +1081,105 @@ IsGTMConnected() } #ifdef __TBASE__ +/* + * Set gtm info with GtmHost and GtmPort. + * + * There are three cases: + * 1.New gtm info from create/alter gtm node command + * 2.Gtm info from pgxc_node + * 3.Gtm info from recovery gtm host + */ static void GetMasterGtmInfo(void) -{// #lizard forgives - /* Check gtm host and port info */ - Relation rel; - HeapScanDesc scan; - HeapTuple gtmtup; - Form_pgxc_node nodeForm; - bool found = false; - - /* reset gtm info */ - ResetGtmInfo(); - - /* we have no recovery gtm host info, just read from heap. */ - if (!g_recovery_gtm_host->need_read) - { - rel = heap_open(PgxcNodeRelationId, AccessShareLock); - scan = heap_beginscan_catalog(rel, 0, NULL); - - /* Only one record will match */ - while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) - { - nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); - if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) - { - GtmHost = strdup(NameStr(nodeForm->node_host)); - GtmPort = nodeForm->node_port; - found = true; - break; - } - } - - heap_endscan(scan); - heap_close(rel, AccessShareLock); - } - else - { - /* get the gtm host info */ - GtmHost = strdup(NameStr(g_recovery_gtm_host->hostdata)); - GtmPort = g_recovery_gtm_host->port; - found = true; - } - - if (!found) - { - if (NewGtmHost && NewGtmPort != 0) - { - elog(LOG, "GetMasterGtmInfo: can not get master gtm info from pgxc_node, try use NewGtmHost:%s NewGtmPort:%d", - NewGtmHost, NewGtmPort); - } - else - { - elog(LOG, "GetMasterGtmInfo: can not get master gtm info from pgxc_node"); - } - } +{ + /* Check gtm host and port info */ + Relation rel; + HeapScanDesc scan; + HeapTuple gtmtup; + Form_pgxc_node nodeForm; + bool found = false; + + /* reset gtm info */ + ResetGtmInfo(); + + /* If NewGtmHost and NewGtmPort, just use it. */ + if (NewGtmHost && NewGtmPort != 0) + { + GtmHost = strdup(NewGtmHost); + GtmPort = NewGtmPort; + + free(NewGtmHost); + NewGtmHost = NULL; + NewGtmPort = 0; + + elog(LOG, + "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", + NewGtmHost, NewGtmPort); + return; + } + + /* we have no recovery gtm host info, just read from heap. */ + if (!g_recovery_gtm_host->need_read) + { + /* + * We must be sure there is no error report, because we may be + * in AbortTransaction now. + * 1.If we are not in a transaction, we should not open relation. 
+ * 2.If we do not get lock, it is ok to try it next time. + */ + if (IsTransactionState() && + ConditionalLockRelationOid(PgxcNodeRelationId, AccessShareLock)) + { + rel = relation_open(PgxcNodeRelationId, NoLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + /* Only one record will match */ + while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) + { + nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); + if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) + { + GtmHost = strdup(NameStr(nodeForm->node_host)); + GtmPort = nodeForm->node_port; + found = true; + break; + } + } + heap_endscan(scan); + relation_close(rel, AccessShareLock); + } + } + else + { + /* get the gtm host info */ + GtmHost = strdup(NameStr(g_recovery_gtm_host->hostdata)); + GtmPort = g_recovery_gtm_host->port; + found = true; + } + + if (!found) + { + elog(LOG, + "GetMasterGtmInfo: can not get master gtm info from pgxc_node"); + } } #endif static void CheckConnection(void) -{// #lizard forgives -#ifdef __TBASE__ - /* First time try connect to gtm, get gtm info from syscache first */ - if (NULL == GtmHost && 0 == GtmPort) - { - GetMasterGtmInfo(); - } - - /* If NewGtmHost and NewGtmPort were set, we are in create/alter gtm node command */ - if (NewGtmHost && NewGtmPort != 0) - { - ResetGtmInfo(); - - GtmHost = strdup(NewGtmHost); - GtmPort = NewGtmPort; - - free(NewGtmHost); - NewGtmHost = NULL; - NewGtmPort = 0; - - /* Close old gtm connection */ - CloseGTM(); - } -#endif - - /* Be sure that a backend does not use a postmaster connection */ - if (IsUnderPostmaster && GTMPQispostmaster(conn) == 1) - { - InitGTM(); - return; - } - - if (GTMPQstatus(conn) != CONNECTION_OK) - InitGTM(); +{ + /* Be sure that a backend does not use a postmaster connection */ + if (IsUnderPostmaster && GTMPQispostmaster(conn) == 1) + { + CloseGTM(); + InitGTM(); + return; + } + + if (GTMPQstatus(conn) != CONNECTION_OK) + { + CloseGTM(); + InitGTM(); + } } void @@ -1183,8 +1188,26 @@ InitGTM(void) #define CONNECT_STR_LEN 256 /* 256 bytes should be enough */ char conn_str[CONNECT_STR_LEN]; #ifdef __TBASE__ - int try_cnt = 0; - const int max_try_cnt = 1; + int try_cnt = 0; + const int max_try_cnt = 1; + + /* + * Only re-set gtm info in two cases: + * 1.No gtm info + * 2.New gtm info by create/alter gtm node command + */ + if ((GtmHost == NULL && GtmPort == 0) || + (NewGtmHost != NULL && NewGtmPort != 0)) + { + GetMasterGtmInfo(); + } + if (GtmHost == NULL && GtmPort == 0) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("GtmHost and GtmPort are not set"))); + return; + } #endif try_connect_gtm: diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index f2886833..775b703d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1365,23 +1365,17 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg) */ void release_handles(bool force) -{// #lizard forgives - bool destroy = false; - int i; - int nbytes = 0; - if (!force) - { - if (HandlesInvalidatePending) - { - DoInvalidateRemoteHandles(); - return; - } - - /* don't free connection if holding a cluster lock */ - if (cluster_ex_lock_held) - { - return; - } +{ + bool destroy = false; + int i; + int nbytes = 0; + if (!force) + { + /* don't free connection if holding a cluster lock */ + if (cluster_ex_lock_held) + { + return; + } if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) { @@ -1425,34 +1419,33 @@ release_handles(bool force) #ifndef __USE_GLOBAL_SNAPSHOT__ 
handle->sendGxidVersion = 0; #endif - nbytes = pgxc_node_is_data_enqueued(handle); - if (nbytes) - { - elog(PANIC, "Connection to Datanode %s has data %d pending", - handle->nodename, nbytes); - } - } - - - for (i = 0; i < NumSlaveDataNodes; i++) - { - PGXCNodeHandle *handle = &sdn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - /* - * Connections at this point should be completely inactive, - * otherwise abaandon them. We can not allow not cleaned up - * connection is returned to pool. - */ - if (handle->state != DN_CONNECTION_STATE_IDLE || - handle->transaction_status != 'I') - { - destroy = true; - elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", - handle->nodeoid, handle->state); - } - + nbytes = pgxc_node_is_data_enqueued(handle); + if (nbytes) + { + elog(PANIC, "Connection to Datanode %s has data %d pending", + handle->nodename, nbytes); + } + } + + for (i = 0; i < NumSlaveDataNodes; i++) + { + PGXCNodeHandle *handle = &sdn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + /* + * Connections at this point should be completely inactive, + * otherwise abaandon them. We can not allow not cleaned up + * connection is returned to pool. + */ + if (handle->state != DN_CONNECTION_STATE_IDLE || + handle->transaction_status != 'I') + { + destroy = true; + elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", + handle->nodeoid, handle->state); + } + #ifdef _PG_REGRESS_ elog(LOG, "release_handles release a connection with datanode %s" "remote backend PID %d", @@ -1513,8 +1506,7 @@ release_handles(bool force) } } - //destroy = true; - /* And finally release all the connections on pooler */ + /* And finally release all the connections on pooler */ PoolManagerReleaseConnections(destroy); datanode_count = 0; @@ -4795,12 +4787,12 @@ DoInvalidateRemoteHandles(void) { bool result = false; - HandlesInvalidatePending = false; - HandlesRefreshPending = false; + InitMultinodeExecutor(true); - InitMultinodeExecutor(true); + HandlesInvalidatePending = false; + HandlesRefreshPending = false; - return result; + return result; } /* From f0a41293dedaa1988c21aeed0f7903ea734c91c8 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 3 Sep 2020 16:39:33 +0800 Subject: [PATCH 044/578] Support converting correlated ANY sublink to lateral subquery We support both 1-level upper or above 1-level cases. Lateral flag of subquery should be set when pulling up those sublinks with exactly one upper level correlations. If the correlation is above one upper level, the pullup will not have any side effect, thus we can threat them as normal pullup. 
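As an illustrative sketch only (not part of the patch diff below), the kind of query this transformation targets is an ANY/IN sublink whose subquery references a Var from exactly one query level up; using the tbl_a/tbl_b regression tables created earlier in these tests:

    -- b.b = a.b is a level-1 correlation, so the pulled-up subquery
    -- must be flagged LATERAL when it becomes a range table entry
    select a.a
    from tbl_a a
    where a.a in (select b.a from tbl_b b where b.b = a.b);

With enable_pullup_subquery enabled, such a sublink can be planned as a semi-join against the lateral subquery rather than re-executing a subplan for every outer row; deeper (above one level) correlations are pulled up without setting the lateral flag, as described above.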
--- src/backend/optimizer/plan/subselect.c | 427 +++++++------ src/backend/optimizer/util/var.c | 9 + src/include/optimizer/subselect.h | 6 +- src/include/optimizer/var.h | 1 - src/test/regress/expected/subselect.out | 816 ++++++++++++------------ src/test/regress/sql/subselect.sql | 9 +- 6 files changed, 663 insertions(+), 605 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 483fdedb..f7832d4d 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -172,8 +172,12 @@ static bool finalize_primnode(Node *node, finalize_primnode_context *context); static bool finalize_agg_primnode(Node *node, finalize_primnode_context *context); #ifdef __TBASE__ -static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtlink); -static Node * get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node,List **targetList, List **joinClause, int *next_attno); +static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, + SubLink *sublink, Node **jtlink); +static Node * get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, + List **targetList, List **joinClause, int *next_attno); +static bool is_simple_subquery(Query *subquery, JoinExpr *lowest_outer_join, + bool deletion_ok); #endif /* * Select a PARAM_EXEC number to identify the given Var as a parameter for @@ -538,6 +542,140 @@ get_first_col_type(Plan *plan, Oid *coltype, int32 *coltypmod, *colcollation = InvalidOid; } +#ifdef __TBASE__ +/* + * Check if there is a range table entry of type func expr whose arguments + * are correlated + */ +bool +has_correlation_in_funcexpr_rte(List *rtable) +{ + /* + * check if correlation occurs in a func expr in the from clause of the + * subselect + */ + ListCell *lc_rte; + + foreach(lc_rte, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc_rte); + + if (rte->functions && contain_vars_upper_level((Node *) rte->functions, 1)) + { + return true; + } + } + return false; +} + +/* + * is_simple_subquery + * Check a subquery in the range table to see if it's simple enough + * to pull up into the parent query. + * + * rte is the RTE_SUBQUERY RangeTblEntry that contained the subquery. + * (Note subquery is not necessarily equal to rte->subquery; it could be a + * processed copy of that.) + * lowest_outer_join is the lowest outer join above the subquery, or NULL. + * deletion_ok is TRUE if it'd be okay to delete the subquery entirely. + */ +static bool +is_simple_subquery(Query *subquery, + JoinExpr *lowest_outer_join, + bool deletion_ok) +{ + /* + * Let's just make sure it's a valid subselect ... + */ + if (!IsA(subquery, Query) || + subquery->commandType != CMD_SELECT) + elog(ERROR, "subquery is bogus"); + + /* + * Can't currently pull up a query with setops (unless it's simple UNION + * ALL, which is handled by a different code path). Maybe after querytree + * redesign... + */ + if (subquery->setOperations) + return false; + + /* + * Can't pull up a subquery involving grouping, aggregation, SRFs, + * sorting, limiting, or WITH. (XXX WITH could possibly be allowed later) + * + * We also don't pull up a subquery that has explicit FOR UPDATE/SHARE + * clauses, because pullup would cause the locking to occur semantically + * higher than it should. Implicit FOR UPDATE/SHARE is okay because in + * that case the locking was originally declared in the upper query + * anyway. 
+ */ + if (subquery->hasAggs || + subquery->hasWindowFuncs || + subquery->hasTargetSRFs || + subquery->groupClause || + subquery->groupingSets || + subquery->havingQual || + subquery->sortClause || + subquery->distinctClause || + subquery->limitOffset || + subquery->limitCount || + subquery->hasForUpdate || + subquery->cteList) + return false; + + /* + * Don't pull up a subquery with an empty jointree, unless it has no quals + * and deletion_ok is TRUE and we're not underneath an outer join. + * + * query_planner() will correctly generate a Result plan for a jointree + * that's totally empty, but we can't cope with an empty FromExpr + * appearing lower down in a jointree: we identify join rels via baserelid + * sets, so we couldn't distinguish a join containing such a FromExpr from + * one without it. We can only handle such cases if the place where the + * subquery is linked is a FromExpr or inner JOIN that would still be + * nonempty after removal of the subquery, so that it's still identifiable + * via its contained baserelids. Safe contexts are signaled by + * deletion_ok. + * + * But even in a safe context, we must keep the subquery if it has any + * quals, because it's unclear where to put them in the upper query. + * + * Also, we must forbid pullup if such a subquery is underneath an outer + * join, because then we might need to wrap its output columns with + * PlaceHolderVars, and the PHVs would then have empty relid sets meaning + * we couldn't tell where to evaluate them. (This test is separate from + * the deletion_ok flag for possible future expansion: deletion_ok tells + * whether the immediate parent site in the jointree could cope, not + * whether we'd have PHV issues. It's possible this restriction could be + * fixed by letting the PHVs use the relids of the parent jointree item, + * but that complication is for another day.) + * + * Note that deletion of a subquery is also dependent on the check below + * that its targetlist contains no set-returning functions. Deletion from + * a FROM list or inner JOIN is okay only if the subquery must return + * exactly one row. + */ + if (subquery->jointree->fromlist == NIL && + (subquery->jointree->quals != NULL || + !deletion_ok || + lowest_outer_join != NULL)) + return false; + + /* + * Don't pull up a subquery that has any volatile functions in its + * targetlist. Otherwise we might introduce multiple evaluations of these + * functions, if they get copied to multiple places in the upper query, + * leading to surprising results. (Note: the PlaceHolderVar mechanism + * doesn't quite guarantee single evaluation; else we could pull up anyway + * and just wrap such items in PlaceHolderVars ...) + */ + if (contain_volatile_functions((Node *) subquery->targetList)) + return false; + + return true; +} +#endif + /* * Convert a SubLink (as created by the parser) into a SubPlan. * @@ -1453,12 +1591,6 @@ SS_process_ctes(PlannerInfo *root) } #ifdef __TBASE__ -static bool -simplify_ANY_query(PlannerInfo *root, Query *query) -{ - return false; -} - static bool simplify_EXPR_query(PlannerInfo *root, Query *query) {// #lizard forgives @@ -2005,82 +2137,73 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node *quals; ParseState *pstate; #ifdef __TBASE__ - int offset = 0; - Node *whereClause = NULL; + bool correlated = false; #endif Assert(sublink->subLinkType == ANY_SUBLINK); #ifdef __TBASE__ - /* - * handle correlated subquery here. - * simple case: select * from a where a.X in (select b.X from b where a.Xx ? 
b.Xx.......); - */ - if (simplify_ANY_query(root, subselect)) - { - subselect = copyObject(subselect); - whereClause = subselect->jointree->quals; - subselect->jointree->quals = NULL; - - /* - * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) - */ - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; + if (enable_pullup_subquery) + { + /* + * If there are CTEs, then the transformation does not work. Don't attempt + * to pullup. + */ + if (parse->cteList) + return NULL; - if (whereClause) - { + /* + * If uncorrelated, and no Var nodes on lhs, the subquery will be executed + * only once. It should become an InitPlan, but make_subplan() doesn't + * handle that case, so just flatten it for now. + * TODO: Let it become an InitPlan, so its QEs can be recycled. + * + * We only handle level 1 correlated cases. The sub-select must not refer + * to any Vars of the parent query. (Vars of higher levels should be okay, + * though.) + */ + correlated = contain_vars_of_level((Node *) subselect, 1); - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - /* - * the WHERE clause may contain some Vars of the - * parent query. - */ - upper_varnos = pull_varnos_of_level(whereClause, 1); + if (correlated) + { + /* + * If deeply(>1) correlated, then don't pull it up + */ + if (contain_vars_upper_level(sublink->subselect, 1)) + return NULL; - if (upper_varnos) - { - /* whereclause contains vars from different parent query */ - if (bms_num_members(upper_varnos) > 1) - { - return NULL; - } - - if (!bms_is_subset(upper_varnos, available_rels)) - { - return NULL; - } - } + /* + * Under certain conditions, we cannot pull up the subquery as a join. + */ + if (!is_simple_subquery(subselect, NULL, false)) + return NULL; - /* - * We don't risk optimizing if the WHERE clause is volatile, either. - */ - if (contain_volatile_functions(whereClause)) - return NULL; - } - } - else - { - whereClause = NULL; + /* + * Do not pull subqueries with correlation in a func expr in the from + * clause of the subselect + */ + if (has_correlation_in_funcexpr_rte(subselect->rtable)) + return NULL; - if (under_not) - { - return NULL; - } + if (contain_subplans(subselect->jointree->quals)) + return NULL; + } + } + else + { +#endif + /* + * The sub-select must not refer to any Vars of the parent query. (Vars of + * higher levels should be okay, though.) + */ + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; +#ifdef __TBASE__ + } - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - } - -#else - /* - * The sub-select must not refer to any Vars of the parent query. (Vars of - * higher levels should be okay, though.) - */ - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; + /* TODO: Currently we do not pullup under_not */ + if (under_not) + return NULL; #endif /* @@ -2107,50 +2230,28 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, /* Create a dummy ParseState for addRangeTableEntryForSubquery */ pstate = make_parsestate(NULL); + /* + * Okay, pull up the sub-select into upper range table. + * + * We rely here on the assumption that the outer query has no references + * to the inner (necessarily true, other than the Vars that we build + * below). Therefore this is a lot easier than what pull_up_subqueries has + * to go through. + * + * If the subquery is correlated, i.e. it refers to any Vars of the + * parent query, mark it as lateral. 
+ */ + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("ANY_subquery", NIL), #ifdef __TBASE__ - if (whereClause) - { - rtindex = list_length(parse->rtable); - - offset = rtindex; - - OffsetVarNodes(whereClause, rtindex, 0); - - IncrementVarSublevelsUp(whereClause, -1, 1); - } -#endif - - /* - * Okay, pull up the sub-select into upper range table. - * - * We rely here on the assumption that the outer query has no references - * to the inner (necessarily true, other than the Vars that we build - * below). Therefore this is a lot easier than what pull_up_subqueries has - * to go through. - */ -#ifdef __TBASE__ - if (whereClause) - { -#endif - rte = addRangeTableEntryForSubquery(pstate, - subselect, - makeAlias("ANY_subquery", NIL), - false, - false); -#ifdef __TBASE__ - } - else - { - rte = addRangeTableEntryForSubquery(pstate, - (Query *) sublink->subselect, - makeAlias("ANY_subquery", NIL), - false, - false); - } + correlated, /* lateral */ +#else + false, #endif - - parse->rtable = lappend(parse->rtable, rte); - rtindex = list_length(parse->rtable); + false); + parse->rtable = lappend(parse->rtable, rte); + rtindex = list_length(parse->rtable); /* * Form a RangeTblRef for the pulled-up sub-select. @@ -2165,95 +2266,15 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, subselect->targetList, rtindex); -#ifdef __TBASE__ - /* add vars from subquery in whereclause into targetlist */ - if (whereClause) - { - ListCell *cell; - List *vars = pull_vars_of_level((Node *)whereClause, 0); - - foreach(cell, vars) - { - Var *var = lfirst(cell); - - if (var->varno == rtindex) - { - bool match = false; - ListCell *lc; - Var *temp_var = NULL; - TargetEntry *ent = NULL; - int varno = 0; - int varlevelsup = 0; - - if (var->varlevelsup >= 1) - { - varlevelsup = var->varlevelsup; - var->varlevelsup = 0; - } - - temp_var = copyObject(var); - temp_var->varno -= offset; - temp_var->varnoold -= offset; - - match = false; - foreach(lc, subselect->targetList) - { - TargetEntry *tent = (TargetEntry *) lfirst(lc); - - if (IsA(tent->expr, Var)) - { - if (equal(temp_var, tent->expr)) - { - match = true; - - var->varattno = var->varoattno = tent->resno; - - break; - } - } - } - - if (!match) - { - ent = makeTargetEntry((Expr *)temp_var, temp_var->varoattno, NULL, false); - - subselect->targetList = lappend(subselect->targetList, ent); - - varno = list_length(subselect->targetList); - - ent->resno = varno; - - var->varattno = var->varoattno = varno; - } - - if (varlevelsup) - { - var->varlevelsup = varlevelsup; - } - } - } - } -#endif - - /* - * Build the new join's qual expression, replacing Params with these Vars. - */ - quals = convert_testexpr(root, sublink->testexpr, subquery_vars); - -#ifdef __TBASE__ - /* make join quals with whereclause */ - if (whereClause) - { - Expr *expr = makeBoolExpr(AND_EXPR, list_make2(quals, whereClause), 0); - - quals = (Node *)expr; - } -#endif + /* + * Build the new join's qual expression, replacing Params with these Vars. + */ + quals = convert_testexpr(root, sublink->testexpr, subquery_vars); - /* - * And finally, build the JoinExpr node. - */ - result = makeNode(JoinExpr); + /* + * And finally, build the JoinExpr node. + */ + result = makeNode(JoinExpr); #ifdef __TBASE__ result->jointype = under_not ? 
JOIN_ANTI : JOIN_SEMI; #else diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index a6bc46f1..228adc19 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -844,6 +844,15 @@ alias_relid_set(PlannerInfo *root, Relids relids) } #ifdef __TBASE__ +/* + * contain_vars_upper_level + * Recursively scan a clause to discover whether it contains any Var nodes + * of/above the specified query level. + * + * Returns true if any such Var found. + * + * Will recurse into sublinks. Also, may be invoked directly on a Query. + */ bool contain_vars_upper_level(Node *node, int levelsup) { diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index ec687d2f..47ba77f5 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -120,4 +120,8 @@ extern Param *assign_nestloop_param_placeholdervar(PlannerInfo *root, PlaceHolderVar *phv); extern int SS_assign_special_param(PlannerInfo *root); -#endif /* SUBSELECT_H */ +#ifdef __TBASE__ +extern bool has_correlation_in_funcexpr_rte(List *rtable); +#endif + +#endif /* SUBSELECT_H */ diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 968da4b2..5cbb90d2 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -98,7 +98,6 @@ extern List *pull_var_clause(Node *node, int flags); extern Node *flatten_join_alias_vars(PlannerInfo *root, Node *node); #ifdef __TBASE__ extern bool contain_vars_upper_level(Node *node, int levelsup); - #endif #endif /* VAR_H */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 4b7dda27..096bb24f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1161,411 +1161,127 @@ NOTICE: x = 9, y = 13 drop function tattle(x int, y int); -- --- Tests for CTE inlining behavior +-- Tests for pulling up more sublinks -- --- Basic subquery that can be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: subselect_tbl.f1 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1 - Filter: (subselect_tbl.f1 = 1) -(5 rows) - --- Explicitly request materialization -explain (verbose, costs off) -with x as materialized (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN -------------------------------------------------------------- - CTE Scan on x - Output: x.f1 - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1 +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; +-- check targetlist subquery scenario. 
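For reference, a minimal sketch (not part of the patch, using the tbl_a/tbl_b tables created just above) of what the targetlist pullup exercised below amounts to: with enable_pullup_subquery on, the correlated scalar sublink in the target list is planned roughly as an outer join on the correlation qual, with the "Left Scalar Join" variant still enforcing the at-most-one-matching-row rule at run time.

-- scalar sublink form exercised by the tests that follow
select a.a, (select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1, 2;
-- roughly equivalent explicit-join form (ignoring the more-than-one-row check)
select a.a, b.a as q from tbl_a a left join tbl_b b on b.a = a.a order by 1, 2;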
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) (8 rows) --- Stable functions are safe to inline -explain (verbose, costs off) -with x as (select * from (select f1, now() from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: subselect_tbl.f1, now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - Filter: (subselect_tbl.f1 = 1) -(5 rows) +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) --- Volatile functions prevent inlining -explain (verbose, costs off) -with x as (select * from (select f1, random() from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN -------------------------------------------------------------- - CTE Scan on x - Output: x.f1, x.random - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: f1, random - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, random() +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) --- SELECT FOR UPDATE cannot be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl for update) ss) -select * from x where f1 = 1; - QUERY PLAN --------------------------------------------------------------------------- - CTE Scan on x - Output: x.f1 - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: ss.f1 - -> Subquery Scan on ss - Output: ss.f1 - -> LockRows - Output: subselect_tbl.f1, subselect_tbl.ctid - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.ctid -(12 rows) +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) --- Multiply-referenced CTEs are inlined only when requested -explain (verbose, costs off) -with x as (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - QUERY PLAN -------------------------------------------------------------- - Merge Join - Output: x.f1, x.n, x2.f1, x2.n - Merge Cond: (x.n = x2.n) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - -> Seq 
Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - -> Sort - Output: x.f1, x.n - Sort Key: x.n - -> CTE Scan on x - Output: x.f1, x.n +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Output: x2.f1, x2.n - Sort Key: x2.n - -> CTE Scan on x x2 - Output: x2.f1, x2.n -(18 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) -explain (verbose, costs off) -with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - QUERY PLAN --------------------------------------------------------------------------------- +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. +insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() - -> Result - Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) - One-Time Filter: (now() = now()) - -> Nested Loop - Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - Distribute results by H: now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() + -> Sort + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a -> Materialize - Output: subselect_tbl_1.f1, (now()) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl_1.f1, now() - Distribute results by H: now() - -> Seq Scan on public.subselect_tbl subselect_tbl_1 - Output: subselect_tbl_1.f1, now() -(19 rows) - --- Multiply-referenced CTEs can't be inlined if they contain outer self-refs -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - QUERY PLAN ----------------------------------------------------------- - CTE Scan on x - Output: x.a - CTE x - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Nested Loop - Output: (z.a || z1.a) - Join Filter: (length((z.a || z1.a)) < 5) - CTE z - -> WorkTable Scan on x x_1 - Output: x_1.a - -> CTE Scan on z - Output: z.a - -> Materialize - Output: z1.a - -> CTE Scan on z z1 - Output: z1.a -(18 rows) - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - a ------- - a - b - aa - ab - ba - bb - aaaa - aaab - aaba - aabb - abaa - abab - abba - abbb - baaa - baab - baba - babb - bbaa - bbab - 
bbba - bbbb -(22 rows) - -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - QUERY PLAN --------------------------------------------------------- - CTE Scan on x - Output: x.a - CTE x - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> WorkTable Scan on x x_1 - Output: (x_1.a || x_1.a) - Filter: (length((x_1.a || x_1.a)) < 5) -(9 rows) - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - a ------- - a - b - aa - bb - aaaa - bbbb -(6 rows) - --- Check handling of outer references -explain (verbose, costs off) -with x as (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 -(4 rows) - -explain (verbose, costs off) -with x as materialized (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - QUERY PLAN --------------------------------------------------- - CTE Scan on x - Output: x.f1 - CTE x - -> Remote Subquery Scan on all (datanode_1) - Output: int4_tbl.f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 -(7 rows) - --- Ensure that we inline the currect CTE when there are --- multiple CTEs with the same name -explain (verbose, costs off) -with x as (select 1 as y) -select * from (with x as (select 2 as y) select * from x) ss; - QUERY PLAN -------------- - Result - Output: 2 -(2 rows) - --- Row marks are not pushed into CTEs -explain (verbose, costs off) -with x as (select * from subselect_tbl) -select * from x for update; - QUERY PLAN ----------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 -(4 rows) - --- --- Tests for pulling up more sublinks --- -set enable_pullup_subquery to true; -create table tbl_a(a int,b int); -create table tbl_b(a int,b int); -insert into tbl_a select generate_series(1,10),1 ; -insert into tbl_b select generate_series(2,11),1 ; --- check targetlist subquery scenario. 
-set enable_nestloop to true; -set enable_hashjoin to false; -set enable_mergejoin to false; -explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) - -> Sort (cost=15636.19..15637.88 rows=675 width=8) - Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) - Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) - -> Materialize (cost=0.00..30.25 rows=1350 width=4) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) -(8 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - -set enable_nestloop to false; -set enable_hashjoin to true; -set enable_mergejoin to false; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Hash Left Scalar Join - Hash Cond: (a.a = b.a) - -> Seq Scan on tbl_a a - -> Hash - -> Seq Scan on tbl_b b -(8 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - -set enable_nestloop to false; -set enable_hashjoin to false; -set enable_mergejoin to true; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Merge Left Scalar Join - Merge Cond: (a.a = b.a) - -> Sort - Sort Key: a.a - -> Seq Scan on tbl_a a - -> Sort - Sort Key: b.a - -> Seq Scan on tbl_b b -(11 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - --- check non-scalar scenario. 
-insert into tbl_b values(2,2); -set enable_nestloop to true; -set enable_hashjoin to false; -set enable_mergejoin to false; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join - Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a - -> Materialize - -> Seq Scan on tbl_b b -(8 rows) + -> Seq Scan on tbl_b b +(8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; ERROR: more than one row returned by a subquery used as an expression @@ -1930,10 +1646,10 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1951,6 +1667,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; case ------ + 0 1 1 1 @@ -1960,9 +1677,312 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) +-- support pullup lateral ANY_SUBLINK +explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=140.38..193.10 rows=225 width=8) + -> Hash Semi Join (cost=140.38..193.10 rows=225 width=8) + Hash Cond: (a.b = b.a) + Join Filter: (b.b > a.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.05 rows=1350 width=8) + Distribute results by H: b + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=8) + -> Hash (cost=23.50..23.50 rows=1350 width=8) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=8) +(9 rows) + +select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + a | b +---+--- +(0 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- 
+ CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer 
self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +---------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + aa + ab + ba + bb + aaaa + aaab + aaba + aabb + abaa + abab + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 68fa867e..01926d80 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -606,8 +606,8 @@ drop function tattle(x int, y 
int); set enable_pullup_subquery to true; create table tbl_a(a int,b int); create table tbl_b(a int,b int); -insert into tbl_a select generate_series(1,10),1 ; -insert into tbl_b select generate_series(2,11),1 ; +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; -- check targetlist subquery scenario. set enable_nestloop to true; @@ -692,6 +692,11 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + +-- support pullup lateral ANY_SUBLINK +explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 60423cc3247048893d688f8d77eb25fee58fa180 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 3 Sep 2020 20:54:28 +0800 Subject: [PATCH 045/578] Fix PortalDrop core when commandTag is NULL --- src/backend/utils/mmgr/portalmem.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index d8d6079e..deb2b8d6 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -660,15 +660,18 @@ PortalDrop(Portal portal, bool isTopCommit) { #ifdef __TBASE__ - /* - * when dn recv rollback_subtxn, the resource already release by AbortSubTransaction, - * and the memory delete by CleanupSubTransaction (delete parent memory context op will delete child) - */ - if (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0) - { - elog(LOG, "skip delete portal resowner"); - } - else + /* + * When CN/DN received rollback_subtxn, the resource already been + * released by AbortSubTransaction, and the memory delete by + * CleanupSubTransaction (delete parent memory context operation + * will delete child) + */ + if (portal->commandTag && + strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0) + { + elog(LOG, "skip delete portal resowner"); + } + else #endif { bool isCommit = (portal->status != PORTAL_FAILED); From 57c842e57fa83040d6d8f5581368cc6ac8ccca0f Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Sep 2020 10:25:36 +0800 Subject: [PATCH 046/578] Fix format --- src/backend/tcop/utility.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index cd0252f9..f331a920 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1737,10 +1737,12 @@ ProcessUtilityPost(PlannedStmt *pstmt, #ifdef __TBASE__ /* - * Forward specific DDLs request to leader cn - * on success return true else false + * Forward specific DDLs request to leader cn. + * + * On success return true else false. 
*/ -static bool forward_ddl_to_leader_cn(Node *node, const char *queryString) +static bool +forward_ddl_to_leader_cn(Node *node, const char *queryString) { Oid leader_cn = InvalidOid; char *leader_name = NULL; From 1dfdfcca27a32c4dcdf7bbc2fa4dae56cbd60cab Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 4 Sep 2020 11:42:20 +0800 Subject: [PATCH 047/578] bugfix: select not committed when persistent_datanode_connections = on, https://git.code.oa.com/jasonysli/PG-XL-v10/merge_requests/5 --- src/backend/pgxc/pool/execRemote.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 36bdbed3..22c3fade 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8874,16 +8874,17 @@ ExecRemoteQuery(PlanState *pstate) */ combiner->node_count = regular_conn_count; - /* - * Start transaction on data nodes if we are in explicit transaction - * or going to use extended query protocol or write to multiple nodes - */ - if (step->force_autocommit) - need_tran_block = false; - else - need_tran_block = step->cursor || - (!step->read_only && total_conn_count > 1) || - (TransactionBlockStatusCode() == 'T'); + /* + * Start transaction on data nodes if we are in explicit transaction + * or going to use extended query protocol or write to multiple nodes + */ + if (step->force_autocommit) + need_tran_block = false; + else + need_tran_block = step->cursor || + step->statement || node->rqs_num_params || + (!step->read_only && total_conn_count > 1) || + (TransactionBlockStatusCode() == 'T'); #ifdef __TBASE__ /* Set plpgsql transaction begin for all connections */ From f09fd4a587be84e40f8ca4233b5ad652c0268f94 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 8 Aug 2020 11:37:18 +0800 Subject: [PATCH 048/578] fix bug: when exec with+subquery sql ,report prepared statement already exists : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696081159381 --- src/backend/commands/prepare.c | 86 ++++++++++++++----------- src/test/regress/expected/prepare.out | 2 +- src/test/regress/expected/prepare_1.out | 2 +- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index e77df1ad..5a46fa7f 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -586,43 +586,55 @@ StorePreparedStatement(const char *stmt_name, bool from_sql, bool use_resowner) { - PreparedStatement *entry; - TimestampTz cur_ts = GetCurrentStatementStartTimestamp(); - bool found; - - /* Initialize the hash table, if necessary */ - if (!prepared_queries) - InitQueryHashTable(); - - /* Add entry to hash table */ - entry = (PreparedStatement *) hash_search(prepared_queries, - stmt_name, - HASH_ENTER, - &found); - - /* Shouldn't get a duplicate entry */ - if (found) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_PSTATEMENT), - errmsg("prepared statement \"%s\" already exists", - stmt_name))); - - /* Fill in the hash table entry */ - entry->plansource = plansource; - entry->from_sql = from_sql; - entry->prepare_time = cur_ts; - entry->use_resowner = use_resowner; - - /* Now it's safe to move the CachedPlanSource to permanent memory */ - SaveCachedPlan(plansource); -#ifdef XCP - if (use_resowner) - { - ResourceOwnerEnlargePreparedStmts(CurTransactionResourceOwner); - ResourceOwnerRememberPreparedStmt(CurTransactionResourceOwner, - entry->stmt_name); - } -#endif + PreparedStatement *entry; + TimestampTz 
cur_ts = GetCurrentStatementStartTimestamp(); + bool found; + + /* Initialize the hash table, if necessary */ + if (!prepared_queries) + InitQueryHashTable(); + + /* Add entry to hash table */ + entry = (PreparedStatement *) hash_search(prepared_queries, + stmt_name, + HASH_ENTER, + &found); + + /* Shouldn't get a duplicate entry */ + if (found) + { + if (!(plansource->commandTag == entry->plansource->commandTag && + strcmp(plansource->query_string, entry->plansource->query_string) == 0)) + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_PSTATEMENT), + errmsg("prepared statement \"%s\" already exists, and plansource is not the same.", + stmt_name))); + } + else + { + elog(LOG, " \"%s\" already exists in prepared_queries, skip it.", stmt_name); + return ; + } + } + + /* Fill in the hash table entry */ + entry->plansource = plansource; + entry->from_sql = from_sql; + entry->prepare_time = cur_ts; + entry->use_resowner = use_resowner; + + /* Now it's safe to move the CachedPlanSource to permanent memory */ + SaveCachedPlan(plansource); + +#ifdef XCP + if (use_resowner) + { + ResourceOwnerEnlargePreparedStmts(CurTransactionResourceOwner); + ResourceOwnerRememberPreparedStmt(CurTransactionResourceOwner, + entry->stmt_name); + } +#endif } /* diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 0b810146..787b242c 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -21,7 +21,7 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements; -- should fail PREPARE q1 AS SELECT 2; -ERROR: prepared statement "q1" already exists +ERROR: prepared statement "q1" already exists, and plansource is not the same. -- should succeed DEALLOCATE q1; PREPARE q1 AS SELECT 2; diff --git a/src/test/regress/expected/prepare_1.out b/src/test/regress/expected/prepare_1.out index c1c15864..db1e190b 100644 --- a/src/test/regress/expected/prepare_1.out +++ b/src/test/regress/expected/prepare_1.out @@ -21,7 +21,7 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements; -- should fail PREPARE q1 AS SELECT 2; -ERROR: prepared statement "q1" already exists +ERROR: prepared statement "q1" already exists, and plansource is not the same. -- should succeed DEALLOCATE q1; PREPARE q1 AS SELECT 2; From 47f7719806bd4404056feacdc112d7545c06e733 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 9 Sep 2020 15:17:46 +0800 Subject: [PATCH 049/578] Support Fast Shipping Query if the subquery only contains constant value 1. Enable such optimization to avoid remote distribution the other hand relation rte. This will pushdown more select cases to datanode. 2. Add GUC enable_subquery_shipping for more potential subquery optimizations. 3. 
Fix deparse_query() for dual RTE --- src/backend/optimizer/util/pathnode.c | 34 +- src/backend/optimizer/util/pgxcship.c | 616 ++++++++++++++++--------- src/backend/utils/adt/ruleutils.c | 33 +- src/backend/utils/misc/guc.c | 10 + src/include/optimizer/pathnode.h | 1 + src/include/pgxc/locator.h | 19 +- src/test/regress/expected/sysviews.out | 3 +- src/test/regress/expected/xc_FQS_2.out | 30 ++ src/test/regress/sql/xc_FQS.sql | 9 + 9 files changed, 486 insertions(+), 269 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 1eefd477..eee0c16b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -56,10 +56,12 @@ #ifdef __TBASE__ /*GUC parameter */ bool prefer_olap; - +/* Max replication level on join to make Query more efficient */ int replication_level; - +/* Restrict query to involved node as possible */ bool restrict_query = false; +/* Support fast query shipping for subquery */ +bool enable_subquery_shipping = false; #define REPLICATION_FACTOR 0.8 #endif @@ -2173,24 +2175,24 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) Expr *left_expr = left; Expr *right_expr = right; #endif - Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); + Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); #ifndef __TBASE__ Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); #endif - Relids inner_rels = pathnode->innerjoinpath->parent->relids; - Relids outer_rels = pathnode->outerjoinpath->parent->relids; - QualCost cost; + Relids inner_rels = pathnode->innerjoinpath->parent->relids; + Relids outer_rels = pathnode->outerjoinpath->parent->relids; + QualCost cost; -#ifndef __TBASE__ - /* - * Check if both parts are of the same data type and choose - * distribution type to redistribute. - * XXX We may want more sophisticated algorithm to choose - * the best condition to redistribute parts along. - * For now use simple but reliable approach. - */ - if (leftType != rightType) - continue; +#ifndef __TBASE__ + /* + * Check if both parts are of the same data type and choose + * distribution type to redistribute. + * XXX We may want more sophisticated algorithm to choose + * the best condition to redistribute parts along. + * For now use simple but reliable approach. + */ + if (leftType != rightType) + continue; #endif #ifndef _PG_REGRESS_ { diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index f79eb3bd..ae1ca9d6 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -153,6 +153,7 @@ static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query); #ifdef __TBASE__ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_context *sc_context); static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); +static bool pgxc_FQS_check_subquery_const(Query *query); #endif /* * Set the given reason in Shippability_context indicating why the query can not be @@ -205,6 +206,124 @@ pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context) pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_TYPE); } +#ifdef __TBASE__ +/* + * pgxc_FQS_check_const_recurse + * Recursively check the query node to see if it only contains constant values. + * We only support all constant values in same leaf nodes, correlated cases are + * not supported. 
+ */ +static bool +pgxc_FQS_check_const_recurse(Node *node, Query *query) +{ + if (!node) + return false; + + switch(nodeTag(node)) + { + case T_FromExpr: + { + FromExpr *from_expr = (FromExpr *)node; + ListCell *lcell; + bool result = true; + + /* + * Only support SELECT for now + */ + if (query->commandType != CMD_SELECT) + return false; + + /* + * Check the SetOperation to cover the case of + * '(const subquery) UNION (const subquery)...' + */ + if (!from_expr->fromlist) + { + if (query->setOperations && + IsA(query->setOperations, SetOperationStmt)) + { + return pgxc_FQS_check_const_recurse(query->setOperations, query); + } + return false; + } + + /* Check if all RTEs contains only constant values */ + foreach (lcell, from_expr->fromlist) + { + Node *fromlist_entry = lfirst(lcell); + + if (!pgxc_FQS_check_const_recurse(fromlist_entry, query)) + { + result = false; + } + } + return result; + } + case T_RangeTblRef: + { + RangeTblRef *rtr = (RangeTblRef *)node; + RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); + + if (rte->rtekind == RTE_SUBQUERY) + { + return pgxc_FQS_check_subquery_const(rte->subquery); + } + return false; + } + case T_JoinExpr: + { + /* TODO: Not supported yet */ + return false; + } + case T_SetOperationStmt: + { + SetOperationStmt *setOp = (SetOperationStmt *)node; + + /* Only handle UNION cases */ + if (setOp->op == SETOP_UNION && + pgxc_FQS_check_const_recurse(setOp->larg, query) && + pgxc_FQS_check_const_recurse(setOp->rarg, query)) + { + return true; + } + return false; + } + default: + return false; + } + /* Keep compiler happy */ + return false; +} + +/* + * pgxc_FQS_check_subquery_const + * Check the query node to see if it only contains constant values, we could + * provide more shipping optimizations based on this hint. + */ +static bool +pgxc_FQS_check_subquery_const(Query *query) +{ + ListCell *lc; + bool result = true; + + /* If all target list entries are T_Const, then we are done. */ + foreach(lc, query->targetList) + { + TargetEntry *tle = lfirst(lc); + if (!IsA(tle->expr, Const)) + { + result = false; + } + } + + if (result == true) + return true; + + /* Otherwise, check if all RTEs are const */ + return pgxc_FQS_check_const_recurse((Node *)query->jointree, query); +} +#endif + /* * pgxc_FQS_datanodes_for_rtr * For a given RangeTblRef find the datanodes where corresponding data is @@ -254,19 +373,40 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) return NULL; #endif - return pgxc_FQS_get_relation_nodes(rte, varno, query); - } - break; + return pgxc_FQS_get_relation_nodes(rte, varno, query); + } + break; + case RTE_SUBQUERY: +#ifdef __TBASE__ + { + Query *subquery = rte->subquery; - /* For any other type of RTE, we return NULL for now */ - case RTE_JOIN: - case RTE_CTE: - case RTE_SUBQUERY: - case RTE_FUNCTION: - case RTE_VALUES: - default: - return NULL; - } + /* + * Current we only consider the case if subquery only contains + * constant values. If so, we can treat them as replicated RTE. 
+ */ + if (enable_subquery_shipping && + pgxc_FQS_check_subquery_const(subquery)) + { + ExecNodes *exec_nodes = makeNode(ExecNodes); + exec_nodes->baselocatortype = LOCATOR_TYPE_REPLICATED; + /* No locate info stored for such subquery RTEs, we use this + * flag to force using the other hand locate info */ + exec_nodes->const_subquery = true; + + return exec_nodes; + } + return NULL; + } +#endif + /* For any other type of RTE, we return NULL for now */ + case RTE_JOIN: + case RTE_CTE: + case RTE_FUNCTION: + case RTE_VALUES: + default: + return NULL; + } } /* @@ -276,85 +416,85 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) */ static ExecNodes * pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, Bitmapset **relids) -{// #lizard forgives - List *query_rtable = query->rtable; - - if (!node) - return NULL; - - switch(nodeTag(node)) - { - case T_FromExpr: - { - FromExpr *from_expr = (FromExpr *)node; - ListCell *lcell; - bool first; - Bitmapset *from_relids; - ExecNodes *result_en; - - /* - * For INSERT commands, we won't have any entries in the from list. - * Get the datanodes using the resultRelation index. - */ - if (query->commandType != CMD_SELECT && !from_expr->fromlist) - { - *relids = bms_make_singleton(query->resultRelation); - return pgxc_FQS_datanodes_for_rtr(query->resultRelation, - query); - } - - /* - * All the entries in the From list are considered to be INNER - * joined with the quals as the JOIN condition. Get the datanodes - * for the first entry in the From list. For every subsequent entry - * determine whether the join between the relation in that entry and - * the cumulative JOIN of previous entries can be pushed down to the - * datanodes and the corresponding set of datanodes where the join - * can be pushed down. - */ - first = true; - result_en = NULL; - from_relids = NULL; - foreach (lcell, from_expr->fromlist) - { - Node *fromlist_entry = lfirst(lcell); - Bitmapset *fle_relids = NULL; - ExecNodes *tmp_en; - ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry, - query, &fle_relids); - /* - * If any entry in fromlist is not shippable, jointree is not - * shippable - */ - if (!en) - { - FreeExecNodes(&result_en); - return NULL; - } - - /* FQS does't ship a DML with more than one relation involved */ - if (!first && query->commandType != CMD_SELECT) - { - FreeExecNodes(&result_en); - return NULL; - } - - if (first) - { - first = false; - result_en = en; - from_relids = fle_relids; - continue; - } +{ + List *query_rtable = query->rtable; + + if (!node) + return NULL; + + switch(nodeTag(node)) + { + case T_FromExpr: + { + FromExpr *from_expr = (FromExpr *)node; + ListCell *lcell; + bool first; + Bitmapset *from_relids; + ExecNodes *result_en; + + /* + * For INSERT commands, we won't have any entries in the from list. + * Get the datanodes using the resultRelation index. + */ + if (query->commandType != CMD_SELECT && !from_expr->fromlist) + { + *relids = bms_make_singleton(query->resultRelation); + return pgxc_FQS_datanodes_for_rtr(query->resultRelation, + query); + } - tmp_en = result_en; - /* - * Check whether the JOIN is pushable to the datanodes and - * find the datanodes where the JOIN can be pushed to - */ - result_en = pgxc_is_join_shippable(result_en, en, from_relids, - fle_relids, JOIN_INNER, - make_ands_implicit((Expr *)from_expr->quals), + /* + * All the entries in the From list are considered to be INNER + * joined with the quals as the JOIN condition. Get the datanodes + * for the first entry in the From list. 
For every subsequent entry + * determine whether the join between the relation in that entry and + * the cumulative JOIN of previous entries can be pushed down to the + * datanodes and the corresponding set of datanodes where the join + * can be pushed down. + */ + first = true; + result_en = NULL; + from_relids = NULL; + foreach (lcell, from_expr->fromlist) + { + Node *fromlist_entry = lfirst(lcell); + Bitmapset *fle_relids = NULL; + ExecNodes *tmp_en; + ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry, + query, &fle_relids); + /* + * If any entry in fromlist is not shippable, jointree is not + * shippable + */ + if (!en) + { + FreeExecNodes(&result_en); + return NULL; + } + + /* FQS does't ship a DML with more than one relation involved */ + if (!first && query->commandType != CMD_SELECT) + { + FreeExecNodes(&result_en); + return NULL; + } + + if (first) + { + first = false; + result_en = en; + from_relids = fle_relids; + continue; + } + + tmp_en = result_en; + /* + * Check whether the JOIN is pushable to the datanodes and + * find the datanodes where the JOIN can be pushed to + */ + result_en = pgxc_is_join_shippable(result_en, en, from_relids, + fle_relids, JOIN_INNER, + make_ands_implicit((Expr *)from_expr->quals), #ifdef __TBASE__ query, #endif @@ -2343,48 +2483,52 @@ pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relid #ifdef __TBASE__ Query *query, #endif - List *rtables) -{// #lizard forgives - bool merge_nodes = false; - - /* - * If either of inner_en or outer_en is NULL, return NULL. We can't ship the - * join when either of the sides do not have datanodes to ship to. - */ - if (!outer_en || !inner_en) - return NULL; - /* - * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins. - * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree - * deconstruction. - */ - if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL) - return NULL; - - /* If both sides are replicated or have single node each, we ship any kind of JOIN */ - if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en)) || - (list_length(inner_en->nodeList) == 1 && - list_length(outer_en->nodeList) == 1)) - merge_nodes = true; - - /* If both sides are distributed, ... */ - else if (IsExecNodesColumnDistributed(inner_en) && - IsExecNodesColumnDistributed(outer_en)) - { - /* - * If two sides are distributed in the same manner by a value, with an - * equi-join on the distribution column and that condition - * is shippable, ship the join if node lists from both sides can be - * merged. - */ - if (inner_en->baselocatortype == outer_en->baselocatortype && - IsExecNodesDistributedByValue(inner_en)) - { - Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL)) - merge_nodes = true; + List *rtables) +{ + bool merge_nodes = false; + + /* + * If either of inner_en or outer_en is NULL, return NULL. We can't ship the + * join when either of the sides do not have datanodes to ship to. + */ + if (!outer_en || !inner_en) + return NULL; + /* + * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins. + * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree + * deconstruction. 
+ */ + if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL) + return NULL; + + /* + * If both sides are replicated or have single node each, we ship any kind + * of JOIN + */ + if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en) && + !inner_en->const_subquery && !outer_en->const_subquery) || + (list_length(inner_en->nodeList) == 1 && + list_length(outer_en->nodeList) == 1)) + merge_nodes = true; + + /* If both sides are distributed, ... */ + else if (IsExecNodesColumnDistributed(inner_en) && + IsExecNodesColumnDistributed(outer_en)) + { + /* + * If two sides are distributed in the same manner by a value, with an + * equi-join on the distribution column and that condition + * is shippable, ship the join if node lists from both sides can be + * merged. + */ + if (inner_en->baselocatortype == outer_en->baselocatortype && + IsExecNodesDistributedByValue(inner_en)) + { + Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL)) + merge_nodes = true; #ifdef __TBASE__ if (merge_nodes && restrict_query && query->commandType == CMD_SELECT) { @@ -2479,108 +2623,122 @@ pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relid } } #endif - } - } - /* - * If outer side is distributed and inner side is replicated, we can ship - * LEFT OUTER and INNER join. - */ - else if (IsExecNodesColumnDistributed(outer_en) && - IsExecNodesReplicated(inner_en) && - (jointype == JOIN_INNER || jointype == JOIN_LEFT)) - { - merge_nodes = true; + } + } + /* + * If outer side is distributed and inner side is replicated, we can ship + * LEFT OUTER and INNER join. + */ + else if (IsExecNodesColumnDistributed(outer_en) && + IsExecNodesReplicated(inner_en) && + (jointype == JOIN_INNER || jointype == JOIN_LEFT)) + { + merge_nodes = true; #ifdef __TBASE__ - if (restrict_query) - { - if (query->commandType == CMD_SELECT) - { - if (!outer_en->restrict_shippable) - { - List *nodelist = NULL; - - if (jointype == JOIN_INNER) - { - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = outer_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } - - if (jointype == JOIN_INNER || jointype == JOIN_LEFT) - { - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); - if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = outer_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } - } + /* + * Push down to restrict datanodes based if join is on distributed + * column or related qual + */ + if (restrict_query && + query->commandType == CMD_SELECT && + !outer_en->restrict_shippable) + { + List *nodelist = NULL; + + if (jointype == JOIN_INNER) + { + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + 
merged_en->baselocatortype = outer_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } - return pgxc_merge_exec_nodes(inner_en, outer_en); - } - } + if (jointype == JOIN_INNER || jointype == JOIN_LEFT) + { + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); + if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = outer_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } + } + + /* Inner side is constant subquery */ + if (enable_subquery_shipping && inner_en->const_subquery) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = list_copy(outer_en->nodeList); + merged_en->baselocatortype = outer_en->baselocatortype; + return merged_en; + } #endif - } - /* - * If outer side is replicated and inner side is distributed, we can ship - * only for INNER join. - */ - else if (IsExecNodesReplicated(outer_en) && - IsExecNodesColumnDistributed(inner_en) && - jointype == JOIN_INNER) - { - merge_nodes = true; + } + /* + * If outer side is replicated and inner side is distributed, we can ship + * only for INNER join. + */ + else if (IsExecNodesReplicated(outer_en) && + IsExecNodesColumnDistributed(inner_en) && + jointype == JOIN_INNER) + { + merge_nodes = true; #ifdef __TBASE__ - if (restrict_query) - { - if (query->commandType == CMD_SELECT) - { - if (!inner_en->restrict_shippable) - { - List *nodelist = NULL; - - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = inner_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); - if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = inner_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } + /* + * Push down to restrict datanodes based if join is on distributed + * column or related qual + */ + if (restrict_query && + query->commandType == CMD_SELECT && + !inner_en->restrict_shippable) + { + List *nodelist = NULL; + + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = inner_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } - return pgxc_merge_exec_nodes(inner_en, outer_en); - } - } + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); + if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = inner_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } 
+ + /* Outer side is constant subquery */ + if (enable_subquery_shipping && outer_en->const_subquery) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = list_copy(inner_en->nodeList); + merged_en->baselocatortype = inner_en->baselocatortype; + return merged_en; + } #endif } /* diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index feb22b86..dac8f826 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -10606,20 +10606,25 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) printalias = true; } #ifdef PGXC - else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) - { - /* - * - * This condition arises when the from clause is a view. The - * corresponding subquery RTE has its eref set to view name. - * The remote query generated has this subquery of which the - * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ - appendStringInfo(buf, " %s", - quote_identifier(rte->eref->aliasname)); - printalias = true; - } + else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) + { + /* + * This condition arises when the from clause is a view. The + * corresponding subquery RTE has its eref set to view name. + * The remote query generated has this subquery of which the + * columns can be referred to as view_name.col1, so it should + * be possible to refer to this subquery object. + */ + appendStringInfo(buf, " %s", + quote_identifier(rte->eref->aliasname)); + + /* + * For 'dual' rte, the aliasname is also 'dual', print alias will + * lead to syntax error. + */ + if (strcmp(rte->eref->aliasname, "dual") != 0) + printalias = true; + } #endif else if (rte->rtekind == RTE_FUNCTION) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 93dc7020..01405c31 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2312,6 +2312,16 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + + { + {"enable_subquery_shipping", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("support fast query shipping for subquery"), + NULL + }, + &enable_subquery_shipping, + true, + NULL, NULL, NULL + }, #endif #ifdef _MIGRATE_ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e27c80ff..f1ff4710 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -358,6 +358,7 @@ extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); extern int replication_level; extern bool restrict_query; +extern bool enable_subquery_shipping; #endif #endif /* PATHNODE_H */ diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 26209a93..c6218522 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -96,21 +96,22 @@ typedef struct */ typedef struct { - NodeTag type; - List *primarynodelist; - List *nodeList; - char baselocatortype; - Expr *en_expr; /* expression to evaluate at execution time if planner - * can not determine execution nodes */ + NodeTag type; + List *primarynodelist; + List *nodeList; + char baselocatortype; + Expr *en_expr; /* expression to evaluate at execution time if planner + * can not determine execution nodes */ #ifdef __COLD_HOT__ Expr *sec_en_expr; /* Sec Expression to evaluate at execution time * if planner can not determine execution * nodes */ #endif - Oid en_relid; /* Relation to determine execution nodes */ - RelationAccessType 
accesstype; /* Access type to determine execution nodes */ + Oid en_relid; /* Relation to determine execution nodes */ + RelationAccessType accesstype; /* Access type to determine execution nodes */ #ifdef __TBASE__ - bool restrict_shippable; + bool restrict_shippable; /* The ExecNode is choose by join qual on distribute column */ + bool const_subquery; /* The subquery rte only got constant values */ #endif } ExecNodes; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index e2765dd9..06796d76 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -122,10 +122,11 @@ select name, setting from pg_settings where name like 'enable%'; enable_shard_statistic | on enable_sort | on enable_statistic | on + enable_subquery_shipping | on enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(53 rows) +(55 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 9f1e51d1..e3e73168 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1607,6 +1607,36 @@ select * from tab1_replicated where val = 7; -----+------ (0 rows) +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int); +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + QUERY PLAN +------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1, datanode_2 + -> Hash Join (cost=0.19..25.60 rows=1 width=80) + Hash Cond: ((t.id = (1)) AND ((t.a)::text = ('gd'::text))) + -> Seq Scan on subquery_fqs t (cost=0.00..18.80 rows=880 width=40) + -> Hash (cost=0.14..0.14 rows=3 width=40) + -> HashAggregate (cost=0.08..0.11 rows=3 width=40) + Group Key: (1), ('gd'::text), (2) + -> Append (cost=0.00..0.06 rows=3 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) +(12 rows) + +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + id | a | c | id | a | c +----+----+---+----+----+--- + 1 | gd | 2 | 1 | gd | 2 + 1 | zj | 2 | 1 | zj | 2 + 1 | sz | 2 | 1 | sz | 2 +(3 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index a6d6f15f..bdb9c02a 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -275,6 +275,15 @@ delete from tab1_replicated where val = 7; explain (verbose on, costs off) delete from tab1_replicated where val = 7; select * from tab1_replicated where val = 7; +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int); +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 
id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; From fa58aa7a75035b8dbf8fba1735744847bd39eb32 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 9 Sep 2020 19:10:36 +0800 Subject: [PATCH 050/578] Fix subquery's pathkey ID81711417. The fix is from the second issue of pg commit 24c19e9f668. convert_subquery_pathkeys would create pathkeys for subquery output values if they match any EquivalenceClass known in the outer query and are available in the subquery's syntactic targetlist. However, the second part of that condition is wrong, because such values might not appear in the subquery relation's reltarget list, which would mean that they couldn't be accessed above the level of the subquery scan. We must check that they appear in the reltarget list, instead. This can lead to dropping knowledge about the subquery's sort ordering, but I believe it's okay, because any sort key that the outer query actually has any interest in would appear in the reltarget list. --- src/backend/access/transam/gtm.c | 1 + src/backend/optimizer/path/pathkeys.c | 424 +++++++++++++----------- src/backend/pgxc/pool/poolmgr.c | 1 + src/test/regress/expected/subselect.out | 31 ++ src/test/regress/sql/subselect.sql | 28 ++ 5 files changed, 289 insertions(+), 196 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 267c5b88..d2557802 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -40,6 +40,7 @@ #include "utils/tqual.h" #include "pgxc/nodemgr.h" #include "access/xlog.h" +#include "storage/lmgr.h" #endif /* To access sequences */ diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 804c58a8..8587420d 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -29,6 +29,7 @@ static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys); +static Var *find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle); static bool right_merge_direction(PlannerInfo *root, PathKey *pathkey); @@ -599,206 +600,237 @@ build_expression_pathkey(PlannerInfo *root, * 'subquery_pathkeys': the subquery's output pathkeys, in its terms. * 'subquery_tlist': the subquery's output targetlist, in its terms. * - * It is not necessary for caller to do truncate_useless_pathkeys(), - * because we select keys in a way that takes usefulness of the keys into - * account. + * We intentionally don't do truncate_useless_pathkeys() here, because there + * are situations where seeing the raw ordering of the subquery is helpful. + * For example, if it returns ORDER BY x DESC, that may prompt us to + * construct a mergejoin using DESC order rather than ASC order; but the + * right_merge_direction heuristic would have us throw the knowledge away. 
*/ List * convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo *rel, - List *subquery_pathkeys, - List *subquery_tlist) -{// #lizard forgives - List *retval = NIL; - int retvallen = 0; - int outer_query_keys = list_length(root->query_pathkeys); - ListCell *i; - - foreach(i, subquery_pathkeys) - { - PathKey *sub_pathkey = (PathKey *) lfirst(i); - EquivalenceClass *sub_eclass = sub_pathkey->pk_eclass; - PathKey *best_pathkey = NULL; - - if (sub_eclass->ec_has_volatile) - { - /* - * If the sub_pathkey's EquivalenceClass is volatile, then it must - * have come from an ORDER BY clause, and we have to match it to - * that same targetlist entry. - */ - TargetEntry *tle; - - if (sub_eclass->ec_sortref == 0) /* can't happen */ - elog(ERROR, "volatile EquivalenceClass has no sortref"); - tle = get_sortgroupref_tle(sub_eclass->ec_sortref, subquery_tlist); - Assert(tle); - /* resjunk items aren't visible to outer query */ - if (!tle->resjunk) - { - /* We can represent this sub_pathkey */ - EquivalenceMember *sub_member; - Expr *outer_expr; - EquivalenceClass *outer_ec; - - Assert(list_length(sub_eclass->ec_members) == 1); - sub_member = (EquivalenceMember *) linitial(sub_eclass->ec_members); - outer_expr = (Expr *) makeVarFromTargetEntry(rel->relid, tle); - - /* - * Note: it might look funny to be setting sortref = 0 for a - * reference to a volatile sub_eclass. However, the - * expression is *not* volatile in the outer query: it's just - * a Var referencing whatever the subquery emitted. (IOW, the - * outer query isn't going to re-execute the volatile - * expression itself.) So this is okay. Likewise, it's - * correct to pass nullable_relids = NULL, because we're - * underneath any outer joins appearing in the outer query. - */ - outer_ec = - get_eclass_for_sort_expr(root, - outer_expr, - NULL, - sub_eclass->ec_opfamilies, - sub_member->em_datatype, - sub_eclass->ec_collation, - 0, - rel->relids, - false); - - /* - * If we don't find a matching EC, sub-pathkey isn't - * interesting to the outer query - */ - if (outer_ec) - best_pathkey = - make_canonical_pathkey(root, - outer_ec, - sub_pathkey->pk_opfamily, - sub_pathkey->pk_strategy, - sub_pathkey->pk_nulls_first); - } - } - else - { - /* - * Otherwise, the sub_pathkey's EquivalenceClass could contain - * multiple elements (representing knowledge that multiple items - * are effectively equal). Each element might match none, one, or - * more of the output columns that are visible to the outer query. - * This means we may have multiple possible representations of the - * sub_pathkey in the context of the outer query. Ideally we - * would generate them all and put them all into an EC of the - * outer query, thereby propagating equality knowledge up to the - * outer query. Right now we cannot do so, because the outer - * query's EquivalenceClasses are already frozen when this is - * called. Instead we prefer the one that has the highest "score" - * (number of EC peers, plus one if it matches the outer - * query_pathkeys). This is the most likely to be useful in the - * outer query. 
- */ - int best_score = -1; - ListCell *j; - - foreach(j, sub_eclass->ec_members) - { - EquivalenceMember *sub_member = (EquivalenceMember *) lfirst(j); - Expr *sub_expr = sub_member->em_expr; - Oid sub_expr_type = sub_member->em_datatype; - Oid sub_expr_coll = sub_eclass->ec_collation; - ListCell *k; - - if (sub_member->em_is_child) - continue; /* ignore children here */ - - foreach(k, subquery_tlist) - { - TargetEntry *tle = (TargetEntry *) lfirst(k); - Expr *tle_expr; - Expr *outer_expr; - EquivalenceClass *outer_ec; - PathKey *outer_pk; - int score; - - /* resjunk items aren't visible to outer query */ - if (tle->resjunk) - continue; - - /* - * The targetlist entry is considered to match if it - * matches after sort-key canonicalization. That is - * needed since the sub_expr has been through the same - * process. - */ - tle_expr = canonicalize_ec_expression(tle->expr, - sub_expr_type, - sub_expr_coll); - if (!equal(tle_expr, sub_expr)) - continue; - - /* - * Build a representation of this targetlist entry as an - * outer Var. - */ - outer_expr = (Expr *) makeVarFromTargetEntry(rel->relid, - tle); - - /* See if we have a matching EC for that */ - outer_ec = get_eclass_for_sort_expr(root, - outer_expr, - NULL, - sub_eclass->ec_opfamilies, - sub_expr_type, - sub_expr_coll, - 0, - rel->relids, - false); - - /* - * If we don't find a matching EC, this sub-pathkey isn't - * interesting to the outer query - */ - if (!outer_ec) - continue; - - outer_pk = make_canonical_pathkey(root, - outer_ec, - sub_pathkey->pk_opfamily, - sub_pathkey->pk_strategy, - sub_pathkey->pk_nulls_first); - /* score = # of equivalence peers */ - score = list_length(outer_ec->ec_members) - 1; - /* +1 if it matches the proper query_pathkeys item */ - if (retvallen < outer_query_keys && - list_nth(root->query_pathkeys, retvallen) == outer_pk) - score++; - if (score > best_score) - { - best_pathkey = outer_pk; - best_score = score; - } - } - } - } - - /* - * If we couldn't find a representation of this sub_pathkey, we're - * done (we can't use the ones to its right, either). - */ - if (!best_pathkey) - break; - - /* - * Eliminate redundant ordering info; could happen if outer query - * equivalences subquery keys... - */ - if (!pathkey_is_redundant(best_pathkey, retval)) - { - retval = lappend(retval, best_pathkey); - retvallen++; - } - } + List *subquery_pathkeys, + List *subquery_tlist) +{ + List *retval = NIL; + int retvallen = 0; + int outer_query_keys = list_length(root->query_pathkeys); + ListCell *i; + + foreach(i, subquery_pathkeys) + { + PathKey *sub_pathkey = (PathKey *) lfirst(i); + EquivalenceClass *sub_eclass = sub_pathkey->pk_eclass; + PathKey *best_pathkey = NULL; + + if (sub_eclass->ec_has_volatile) + { + /* + * If the sub_pathkey's EquivalenceClass is volatile, then it must + * have come from an ORDER BY clause, and we have to match it to + * that same targetlist entry. + */ + TargetEntry *tle; + Var *outer_var; + + if (sub_eclass->ec_sortref == 0) /* can't happen */ + elog(ERROR, "volatile EquivalenceClass has no sortref"); + tle = get_sortgroupref_tle(sub_eclass->ec_sortref, subquery_tlist); + Assert(tle); + /* Is TLE actually available to the outer query? 
*/ + outer_var = find_var_for_subquery_tle(rel, tle); + if (outer_var) + { + /* We can represent this sub_pathkey */ + EquivalenceMember *sub_member; + EquivalenceClass *outer_ec; + + Assert(list_length(sub_eclass->ec_members) == 1); + sub_member = (EquivalenceMember *) linitial(sub_eclass->ec_members); + + /* + * Note: it might look funny to be setting sortref = 0 for a + * reference to a volatile sub_eclass. However, the + * expression is *not* volatile in the outer query: it's just + * a Var referencing whatever the subquery emitted. (IOW, the + * outer query isn't going to re-execute the volatile + * expression itself.) So this is okay. Likewise, it's + * correct to pass nullable_relids = NULL, because we're + * underneath any outer joins appearing in the outer query. + */ + outer_ec = + get_eclass_for_sort_expr(root, + (Expr *) outer_var, + NULL, + sub_eclass->ec_opfamilies, + sub_member->em_datatype, + sub_eclass->ec_collation, + 0, + rel->relids, + false); + + /* + * If we don't find a matching EC, sub-pathkey isn't + * interesting to the outer query + */ + if (outer_ec) + best_pathkey = + make_canonical_pathkey(root, + outer_ec, + sub_pathkey->pk_opfamily, + sub_pathkey->pk_strategy, + sub_pathkey->pk_nulls_first); + } + } + else + { + /* + * Otherwise, the sub_pathkey's EquivalenceClass could contain + * multiple elements (representing knowledge that multiple items + * are effectively equal). Each element might match none, one, or + * more of the output columns that are visible to the outer query. + * This means we may have multiple possible representations of the + * sub_pathkey in the context of the outer query. Ideally we + * would generate them all and put them all into an EC of the + * outer query, thereby propagating equality knowledge up to the + * outer query. Right now we cannot do so, because the outer + * query's EquivalenceClasses are already frozen when this is + * called. Instead we prefer the one that has the highest "score" + * (number of EC peers, plus one if it matches the outer + * query_pathkeys). This is the most likely to be useful in the + * outer query. + */ + int best_score = -1; + ListCell *j; + + foreach(j, sub_eclass->ec_members) + { + EquivalenceMember *sub_member = (EquivalenceMember *) lfirst(j); + Expr *sub_expr = sub_member->em_expr; + Oid sub_expr_type = sub_member->em_datatype; + Oid sub_expr_coll = sub_eclass->ec_collation; + ListCell *k; + + if (sub_member->em_is_child) + continue; /* ignore children here */ + + foreach(k, subquery_tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(k); + Var *outer_var; + Expr *tle_expr; + EquivalenceClass *outer_ec; + PathKey *outer_pk; + int score; + + /* Is TLE actually available to the outer query? */ + outer_var = find_var_for_subquery_tle(rel, tle); + if (!outer_var) + continue; + + /* + * The targetlist entry is considered to match if it + * matches after sort-key canonicalization. That is + * needed since the sub_expr has been through the same + * process. 
+ */ + tle_expr = canonicalize_ec_expression(tle->expr, + sub_expr_type, + sub_expr_coll); + if (!equal(tle_expr, sub_expr)) + continue; + + /* See if we have a matching EC for the TLE */ + outer_ec = get_eclass_for_sort_expr(root, + (Expr *) outer_var, + NULL, + sub_eclass->ec_opfamilies, + sub_expr_type, + sub_expr_coll, + 0, + rel->relids, + false); + + /* + * If we don't find a matching EC, this sub-pathkey isn't + * interesting to the outer query + */ + if (!outer_ec) + continue; + + outer_pk = make_canonical_pathkey(root, + outer_ec, + sub_pathkey->pk_opfamily, + sub_pathkey->pk_strategy, + sub_pathkey->pk_nulls_first); + /* score = # of equivalence peers */ + score = list_length(outer_ec->ec_members) - 1; + /* +1 if it matches the proper query_pathkeys item */ + if (retvallen < outer_query_keys && + list_nth(root->query_pathkeys, retvallen) == outer_pk) + score++; + if (score > best_score) + { + best_pathkey = outer_pk; + best_score = score; + } + } + } + } + + /* + * If we couldn't find a representation of this sub_pathkey, we're + * done (we can't use the ones to its right, either). + */ + if (!best_pathkey) + break; + + /* + * Eliminate redundant ordering info; could happen if outer query + * equivalences subquery keys... + */ + if (!pathkey_is_redundant(best_pathkey, retval)) + { + retval = lappend(retval, best_pathkey); + retvallen++; + } + } + + return retval; +} - return retval; +/* + * find_var_for_subquery_tle + * + * If the given subquery tlist entry is due to be emitted by the subquery's + * scan node, return a Var for it, else return NULL. + * + * We need this to ensure that we don't return pathkeys describing values + * that are unavailable above the level of the subquery scan. + */ +static Var * +find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle) +{ + ListCell *lc; + + /* If the TLE is resjunk, it's certainly not visible to the outer query */ + if (tle->resjunk) + return NULL; + + /* Search the rel's targetlist to see what it will return */ + foreach(lc, rel->reltarget->exprs) + { + Var *var = (Var *) lfirst(lc); + + /* Ignore placeholders */ + if (!IsA(var, Var)) + continue; + Assert(var->varno == rel->relid); + + /* If we find a Var referencing this TLE, we're good */ + if (var->varattno == tle->resno) + return copyObject(var); /* Make a copy for safety */ + } + return NULL; } /* diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index dd575f76..0392d08f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -49,6 +49,7 @@ #include "utils/lsyscache.h" #include "utils/resowner.h" #include "lib/stringinfo.h" +#include "libpq/libpq-be.h" #include "libpq/pqformat.h" #include "common/username.h" #include "pgxc/locator.h" diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 096bb24f..52c196aa 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1986,3 +1986,34 @@ select * from x for update; Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 (4 rows) +-- test subquery pathkey +CREATE TABLE catalog_sales ( + cs_sold_date_sk integer, + cs_item_sk integer NOT NULL, + cs_order_number integer NOT NULL +); +CREATE TABLE catalog_returns ( + cr_returned_date_sk integer, + cr_item_sk integer NOT NULL, + cr_order_number integer NOT NULL +); +CREATE TABLE date_dim ( + d_date_sk integer NOT NULL, + d_year integer +); +with cs as +( + select d_year AS cs_sold_year, cs_item_sk + from catalog_sales + left join 
catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk + join date_dim on cs_sold_date_sk = d_date_sk + order by d_year, cs_item_sk +) +select 1 +from date_dim + join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); + ?column? +---------- +(0 rows) + +drop table catalog_sales, catalog_returns, date_dim; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 01926d80..818c6b4f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -793,3 +793,31 @@ select * from (with x as (select 2 as y) select * from x) ss; explain (verbose, costs off) with x as (select * from subselect_tbl) select * from x for update; + +-- test subquery pathkey +CREATE TABLE catalog_sales ( + cs_sold_date_sk integer, + cs_item_sk integer NOT NULL, + cs_order_number integer NOT NULL +); +CREATE TABLE catalog_returns ( + cr_returned_date_sk integer, + cr_item_sk integer NOT NULL, + cr_order_number integer NOT NULL +); +CREATE TABLE date_dim ( + d_date_sk integer NOT NULL, + d_year integer +); +with cs as +( + select d_year AS cs_sold_year, cs_item_sk + from catalog_sales + left join catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk + join date_dim on cs_sold_date_sk = d_date_sk + order by d_year, cs_item_sk +) +select 1 +from date_dim + join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); +drop table catalog_sales, catalog_returns, date_dim; \ No newline at end of file From 3dc65e912e19f98a386a4a3975647bcc2fef54d6 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 10 Sep 2020 10:53:01 +0800 Subject: [PATCH 051/578] Fix coredump during ExecEndRemoteSubplan when conn is NULL http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081943789 --- src/backend/pgxc/pool/execRemote.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 22c3fade..ea47904b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11505,6 +11505,16 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) conn = combiner->connections[i]; + /* connection can be null in sort, forget it */ + if (!conn) + { + combiner->conn_count--; + combiner->connections[i] = + combiner->connections[combiner->conn_count]; + i--; + continue; + } + CHECK_OWNERSHIP(conn, combiner); if (pgxc_node_send_close(conn, true, cursor) != 0) From 12c41f3e73b7e09fa803999fe686f136ce268586 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Sat, 12 Sep 2020 11:12:17 +0800 Subject: [PATCH 052/578] Fix warnings --- src/backend/executor/nodeModifyTable.c | 2 -- src/backend/optimizer/util/relnode.c | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 659f15b1..4a03adb3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3206,7 +3206,6 @@ ExecEndModifyTable(ModifyTableState *node) #ifdef __TBASE__ if (IS_PGXC_COORDINATOR) { - ResponseCombiner *combiner; ModifyTable *plan = (ModifyTable *)node->ps.plan; if (plan->remote_plans) @@ -3217,7 +3216,6 @@ ExecEndModifyTable(ModifyTableState *node) { RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - combiner = (ResponseCombiner *) node->mt_remoterels[i]; ExecEndNode(node->mt_remoterels[i]); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 153d3d36..b4359f52 100644 --- a/src/backend/optimizer/util/relnode.c 
+++ b/src/backend/optimizer/util/relnode.c @@ -507,9 +507,6 @@ build_join_rel(PlannerInfo *root, { RelOptInfo *joinrel; List *restrictlist; -#ifdef __TBASE__ - PlannerInfo *top_root = root; -#endif /* * See if we already have a joinrel for this set of base rels. From 2fbad221b6749adb0ab93063a739a064cb6c5208 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Sep 2020 11:30:20 +0800 Subject: [PATCH 053/578] Fix postgres log csv format. --- src/backend/utils/error/elog.c | 160 ++++++++++++++++----------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 9851cf57..58ff0658 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -2562,23 +2562,23 @@ log_line_prefix(StringInfo buf, ErrorData *edata) { char strfbuf[128]; - snprintf(strfbuf, sizeof(strfbuf) - 1, "%lx.%x,coord(%d,%u)", + snprintf(strfbuf, sizeof(strfbuf) - 1, "%lx.%x,coord(%d.%u)", (long) (MyStartTime), MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - appendStringInfo(buf, "%*s", padding, strfbuf); - } - else - appendStringInfo(buf, "%lx.%x,coord(%d,%u)", + appendStringInfo(buf, "%*s", padding, strfbuf); + } + else + appendStringInfo(buf, "%lx.%x,coord(%d.%u)", (long) (MyStartTime), MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - break; - case 'p': - if (padding != 0) - appendStringInfo(buf, "%*d", padding, MyProcPid); - else - appendStringInfo(buf, "%d,coord(%d,%u)", + break; + case 'p': + if (padding != 0) + appendStringInfo(buf, "%*d", padding, MyProcPid); + else + appendStringInfo(buf, "%d,coord(%d.%u)", MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); @@ -2829,78 +2829,78 @@ appendCSVLiteral(StringInfo buf, const char *data) */ static void write_csvlog(ErrorData *edata) -{// #lizard forgives - StringInfoData buf; - bool print_stmt = false; - - /* static counter for line numbers */ - static long log_line_number = 0; - - /* has counter been reset in current process? */ - static int log_my_pid = 0; - - /* - * This is one of the few places where we'd rather not inherit a static - * variable's value from the postmaster. But since we will, reset it when - * MyProcPid changes. - */ - if (log_my_pid != MyProcPid) - { - log_line_number = 0; - log_my_pid = MyProcPid; - formatted_start_time[0] = '\0'; - } - log_line_number++; - - initStringInfo(&buf); - - /* - * timestamp with milliseconds - * - * Check if the timestamp is already calculated for the syslog message, - * and use it if so. Otherwise, get the current timestamp. This is done - * to put same timestamp in both syslog and csvlog messages. - */ - if (formatted_log_time[0] == '\0') - setup_formatted_log_time(); - - appendStringInfoString(&buf, formatted_log_time); - appendStringInfoChar(&buf, ','); - - /* username */ - if (MyProcPort) - appendCSVLiteral(&buf, MyProcPort->user_name); - appendStringInfoChar(&buf, ','); - - /* database name */ - if (MyProcPort) - appendCSVLiteral(&buf, MyProcPort->database_name); - appendStringInfoChar(&buf, ','); - - /* Process id */ - if (MyProcPid != 0) - appendStringInfo(&buf, "%d,coord(%d,%u)", +{ + StringInfoData buf; + bool print_stmt = false; + + /* static counter for line numbers */ + static long log_line_number = 0; + + /* has counter been reset in current process? */ + static int log_my_pid = 0; + + /* + * This is one of the few places where we'd rather not inherit a static + * variable's value from the postmaster. 
But since we will, reset it when + * MyProcPid changes. + */ + if (log_my_pid != MyProcPid) + { + log_line_number = 0; + log_my_pid = MyProcPid; + formatted_start_time[0] = '\0'; + } + log_line_number++; + + initStringInfo(&buf); + + /* + * timestamp with milliseconds + * + * Check if the timestamp is already calculated for the syslog message, + * and use it if so. Otherwise, get the current timestamp. This is done + * to put same timestamp in both syslog and csvlog messages. + */ + if (formatted_log_time[0] == '\0') + setup_formatted_log_time(); + + appendStringInfoString(&buf, formatted_log_time); + appendStringInfoChar(&buf, ','); + + /* username */ + if (MyProcPort) + appendCSVLiteral(&buf, MyProcPort->user_name); + appendStringInfoChar(&buf, ','); + + /* database name */ + if (MyProcPort) + appendCSVLiteral(&buf, MyProcPort->database_name); + appendStringInfoChar(&buf, ','); + + /* Process id */ + if (MyProcPid != 0) + appendStringInfo(&buf, "%d,coord(%d.%u)", MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - appendStringInfoChar(&buf, ','); - - /* Remote host and port */ - if (MyProcPort && MyProcPort->remote_host) - { - appendStringInfoChar(&buf, '"'); - appendStringInfoString(&buf, MyProcPort->remote_host); - if (MyProcPort->remote_port && MyProcPort->remote_port[0] != '\0') - { - appendStringInfoChar(&buf, ':'); - appendStringInfoString(&buf, MyProcPort->remote_port); - } - appendStringInfoChar(&buf, '"'); - } - appendStringInfoChar(&buf, ','); - - /* session id */ - appendStringInfo(&buf, "%lx.%x,coord(%d,%u)", + appendStringInfoChar(&buf, ','); + + /* Remote host and port */ + if (MyProcPort && MyProcPort->remote_host) + { + appendStringInfoChar(&buf, '"'); + appendStringInfoString(&buf, MyProcPort->remote_host); + if (MyProcPort->remote_port && MyProcPort->remote_port[0] != '\0') + { + appendStringInfoChar(&buf, ':'); + appendStringInfoString(&buf, MyProcPort->remote_port); + } + appendStringInfoChar(&buf, '"'); + } + appendStringInfoChar(&buf, ','); + + /* session id */ + appendStringInfo(&buf, "%lx.%x,coord(%d.%u)", (long) MyStartTime, MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); From 168b8413f3be7172d74cf750266b1a7385b3d620 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Sep 2020 19:56:56 +0800 Subject: [PATCH 054/578] adjust commit order in pgxc_node_remote_finish for parallel ddl. --- src/backend/pgxc/pool/execRemote.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ea47904b..e3fa18d7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8585,9 +8585,34 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } } - for (i = 0; i < pgxc_handles->co_conn_count; i++) + /* Make sure datanode commit first */ + if (conn_count && is_txn_has_parallel_ddl) { - PGXCNodeHandle *conn = pgxc_handles->coord_handles[i]; + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); +#ifdef __TWO_PHASE_TRANS__ + g_twophase_state.response_operation = + (commit == true) ? 
REMOTE_FINISH_COMMIT : REMOTE_FINISH_ABORT; +#endif + /* Receive responses */ + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + + conn_count = 0; + } + + for (i = 0; i < pgxc_handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_handles->coord_handles[i]; #ifdef __TWO_PHASE_TRANS__ twophase_index = g_twophase_state.coord_index; g_twophase_state.coord_state[twophase_index].is_participant = true; From fd9e9f44c87464ebc02d590d5d9a091575ca5bc2 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 17 Sep 2020 19:52:11 +0800 Subject: [PATCH 055/578] Fix core during ExecEndCteScan --- src/backend/executor/nodeCtescan.c | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c index 1f309184..e9e4e0a3 100644 --- a/src/backend/executor/nodeCtescan.c +++ b/src/backend/executor/nodeCtescan.c @@ -279,25 +279,25 @@ ExecInitCteScan(CteScan *node, EState *estate, int eflags) void ExecEndCteScan(CteScanState *node) { - /* - * Free exprcontext - */ - ExecFreeExprContext(&node->ss.ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); - - /* - * If I am the leader, free the tuplestore. - */ - if (node->leader == node) - { - tuplestore_end(node->cte_table); - node->cte_table = NULL; - } + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * If I am the leader, free the tuplestore. 
+ */ + if (node->leader == node && node->cte_table) + { + tuplestore_end(node->cte_table); + node->cte_table = NULL; + } } /* ---------------------------------------------------------------- From ee7936e87b7b49f604cdfcdd96a4bc9248dade25 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Fri, 18 Sep 2020 17:10:51 +0800 Subject: [PATCH 056/578] Set keepalive, user_timeout, and connect_timeout for gtm connection, TAPD: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021889 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021563 --- src/backend/access/transam/gtm.c | 168 ++++++++++++++++--------------- src/gtm/client/fe-connect.c | 69 ++++++++++++- src/include/gtm/libpq-fe.h | 5 +- 3 files changed, 156 insertions(+), 86 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index d2557802..9ec7287d 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1212,90 +1212,98 @@ InitGTM(void) #endif try_connect_gtm: - /* If this thread is postmaster itself, it contacts gtm identifying itself */ - if (!IsUnderPostmaster) - { - GTM_PGXCNodeType remote_type = GTM_NODE_DEFAULT; - - if (IS_PGXC_COORDINATOR) - remote_type = GTM_NODE_COORDINATOR; - else if (IS_PGXC_DATANODE) - remote_type = GTM_NODE_DATANODE; - - /* Use 60s as connection timeout */ - snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", - GtmHost, GtmPort, PGXCNodeName, remote_type, - GtmConnectTimeout); - - /* Log activity of GTM connections */ - if(GTMDebugPrint) - elog(LOG, "Postmaster: connection established to GTM with string %s", conn_str); - } - else - { - /* Use 60s as connection timeout */ - snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", - GtmHost, GtmPort, PGXCNodeName, GtmConnectTimeout); - - /* Log activity of GTM connections */ - if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) - elog(LOG, "Autovacuum worker: connection established to GTM with string %s", conn_str); - else if (IsAutoVacuumLauncherProcess() && GTMDebugPrint) - elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); - else if (IsClusterMonitorProcess() && GTMDebugPrint) - elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); - else if(GTMDebugPrint) - elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); - } + /* If this thread is postmaster itself, it contacts gtm identifying itself */ + if (!IsUnderPostmaster) + { + GTM_PGXCNodeType remote_type = GTM_NODE_DEFAULT; + + if (IS_PGXC_COORDINATOR) + remote_type = GTM_NODE_COORDINATOR; + else if (IS_PGXC_DATANODE) + remote_type = GTM_NODE_DATANODE; + + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", + GtmHost, GtmPort, PGXCNodeName, remote_type, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + + /* Log activity of GTM connections */ + if(GTMDebugPrint) + elog(LOG, "Postmaster: connection established to GTM with string %s", conn_str); + } + else + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", + GtmHost, GtmPort, PGXCNodeName, + tcp_keepalives_idle > 0 ? 
+ tcp_keepalives_idle : GtmConnectTimeout); + + /* Log activity of GTM connections */ + if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) + elog(LOG, "Autovacuum worker: connection established to GTM with string %s", conn_str); + else if (IsAutoVacuumLauncherProcess() && GTMDebugPrint) + elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); + else if (IsClusterMonitorProcess() && GTMDebugPrint) + elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); + else if(GTMDebugPrint) + elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); + } - conn = PQconnectGTM(conn_str); - if (GTMPQstatus(conn) != CONNECTION_OK) - { - int save_errno = errno; - -#ifdef __TBASE__ - if (try_cnt < max_try_cnt) - { - /* If connect gtm failed, get gtm info from syscache, and try again */ - GetMasterGtmInfo(); - if (GtmHost != NULL && GtmPort) - { - elog(DEBUG1, "[InitGTM] Get GtmHost:%s GtmPort:%d try_cnt:%d max_try_cnt:%d", - GtmHost, GtmPort, try_cnt, max_try_cnt); - } - CloseGTM(); - try_cnt++; - goto try_connect_gtm; - } - else -#endif - { - ResetGtmInfo(); + conn = PQconnectGTM(conn_str); + if (GTMPQstatus(conn) != CONNECTION_OK) + { + int save_errno = errno; + +#ifdef __TBASE__ + if (try_cnt < max_try_cnt) + { + /* If connect gtm failed, get gtm info from syscache, and try again */ + GetMasterGtmInfo(); + if (GtmHost != NULL && GtmPort) + { + elog(DEBUG1, "[InitGTM] Get GtmHost:%s GtmPort:%d try_cnt:%d max_try_cnt:%d", + GtmHost, GtmPort, try_cnt, max_try_cnt); + } + CloseGTM(); + try_cnt++; + goto try_connect_gtm; + } + else +#endif + { + ResetGtmInfo(); - /* Use LOG instead of ERROR to avoid error stack overflow. */ - if(conn) - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("can not connect to GTM: %s %m", GTMPQerrorMessage(conn)))); - } - else - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("connection is null: %m"))); - } + /* Use LOG instead of ERROR to avoid error stack overflow. */ + if(conn) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("can not connect to GTM: %s %m", GTMPQerrorMessage(conn)))); + } + else + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("connection is null: %m"))); + } - errno = save_errno; + errno = save_errno; - CloseGTM(); - } - - } - else if (IS_PGXC_COORDINATOR) - { - register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); - } + CloseGTM(); + } + + } + else + { + GTMSetSockKeepAlive(conn, tcp_keepalives_idle, + tcp_keepalives_interval, tcp_keepalives_count); + if (IS_PGXC_COORDINATOR) + { + register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); + } + } } void diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index ed8ef6cc..1e3b712f 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -82,11 +82,10 @@ PQconnectGTM(const char *conninfo) { GTM_Conn *conn = PQconnectGTMStart(conninfo); - if (conn && conn->status != CONNECTION_BAD) - { - (void)connectGTMComplete(conn); - - } + if (conn && conn->status != CONNECTION_BAD) + { + (void)connectGTMComplete(conn); + } #if 0 else if (conn != NULL) { @@ -1423,3 +1422,63 @@ GTMPQuntrace(GTM_Conn *conn) conn->Pfdebug = NULL; } } + +/* + * Set socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. 
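+ *
+ * Usage sketch (the parameter values below are illustrative, not mandated
+ * by this patch): given an already established connection,
+ *
+ *     GTMSetSockKeepAlive(conn, 60, 10, 3);
+ *
+ * starts probing after 60s of idleness, probes every 10s, drops the
+ * connection after 3 unanswered probes, and caps unacknowledged sends at
+ * roughly 60s via TCP_USER_TIMEOUT.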
+ */ +void +GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, + int tcp_keepalives_interval, int tcp_keepalives_count) +{ + int sock = conn->sock; + int keepalive = 1; + /* user_timeout in ms */ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? + 0 : tcp_keepalives_idle * (uint32)1000; + struct tcp_info info; + int len = sizeof(info); + /* check sock */ + getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (info.tcpi_state != TCP_ESTABLISHED) + { + return; + } + + /* set keepalive */ + if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + } + if (tcp_keepalives_idle > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&tcp_keepalives_idle, + sizeof(tcp_keepalives_idle)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + } + if (tcp_keepalives_interval > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&tcp_keepalives_interval, + sizeof(tcp_keepalives_interval)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + } + if (tcp_keepalives_count > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&tcp_keepalives_count, + sizeof(tcp_keepalives_count)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + } + + /* set user_timeout */ + if (user_timeout > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, + sizeof(user_timeout)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + } +} \ No newline at end of file diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h index 23a24e81..54058e5f 100644 --- a/src/include/gtm/libpq-fe.h +++ b/src/include/gtm/libpq-fe.h @@ -130,7 +130,10 @@ extern void GTMPQuntrace(GTM_Conn *conn); /* Force the write buffer to be written (or at least try) */ extern int PQflush(GTM_Conn *conn); -#define libpq_gettext(x) x +extern void GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, + int tcp_keepalives_interval, int tcp_keepalives_count); + +#define libpq_gettext(x) x #ifdef __cplusplus } From 13030218e3194dbc0bf327262796373cfc3d4a42 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 22 Sep 2020 19:30:23 +0800 Subject: [PATCH 057/578] Support nestloop join suppresion when outerpath selectivity could be under estimated http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696082218207 --- src/backend/optimizer/path/clausesel.c | 105 +++++++++++++ src/backend/optimizer/path/costsize.c | 44 ++++-- src/backend/utils/misc/guc.c | 209 +++++++++++++------------ src/include/optimizer/cost.h | 14 +- src/test/regress/expected/join_3.out | 57 +++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/join.sql | 29 ++++ 7 files changed, 342 insertions(+), 119 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 8e6e1670..86fe951b 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -23,6 +23,11 @@ #include "utils/lsyscache.h" #include "utils/selfuncs.h" #include "statistics/statistics.h" +#ifdef __TBASE__ +#include "access/htup_details.h" +#include "catalog/pg_operator.h" +#include "utils/syscache.h" +#endif /* @@ -866,3 +871,103 @@ clause_selectivity(PlannerInfo *root, return s1; } + +#ifdef __TBASE__ +/* + * clause_selectivity_could_under_estimated + * Check whether BaseRelOpt of the 
path might got under estimated rows. + * + * In real user scenarios, multiple columns could have correlation. It needs + * more statistic hints for the optimizer to know the data model + * characteristics. Since the extended mutli-column statistic calculation only + * supports '=' operation, we introduced this function to check if the + * selectivity of input path is under estimated. + */ +bool +clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path) +{ + RelOptInfo *rel = NULL; + + /* We only support 1-depth nestloop outer path for now. */ + if (path->pathtype == T_SeqScan || + path->pathtype == T_IndexScan || + path->pathtype == T_IndexOnlyScan || + path->pathtype == T_BitmapIndexScan || + path->pathtype == T_BitmapHeapScan) + { + rel = path->parent; + } + else + { + return false; + } + + Assert(rel); + + /* + * The correlation problem only happens when there are multiple + * restrictions. + */ + if (list_length(rel->baserestrictinfo) > 1) + { + ListCell *lc; + Node *clause; + int count = 0; + + /* Walk through all restrictions */ + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + + /* + * Proceed with examination of contained clause. If the clause is + * an OR-clause. + */ + if (ri->orclause) + clause = (Node *) ri->orclause; + else + clause = (Node *) ri->clause; + + /* + * The multi-column statistic only supports '=' operator based on + * single column histograms. Thus we count all unsupported cases + * here. is_opclause() covers the NULL check for 'clause' + * + * TODO(Tbase): Be more precise on other type of clauses. + */ + if (is_opclause(clause)) + { + OpExpr *opclause = (OpExpr *) clause; + char *oprname; + Oid opno = opclause->opno; + HeapTuple opTuple; + Form_pg_operator operform; + + opTuple = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); + if (HeapTupleIsValid(opTuple)) + { + operform = (Form_pg_operator)GETSTRUCT(opTuple); + oprname = NameStr(operform->oprname); + } + ReleaseSysCache(opTuple); + + /* Supported case, skip the count. */ + if (oprname && strcmp(oprname, "=") == 0) + continue; + } + + /* Unsupported case */ + count++; + } + + /* + * The path got some restrictions which could lead to selectivity + * under estimation. 
+ */ + if (count > 0) + return true; + } + + return false; +} +#endif diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a6bba0cf..0de40222 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,19 +128,20 @@ Cost disable_cost = 1.0e10; int max_parallel_workers_per_gather = 2; -bool enable_seqscan = true; -bool enable_indexscan = true; -bool enable_indexonlyscan = true; -bool enable_bitmapscan = true; -bool enable_tidscan = true; -bool enable_sort = true; -bool enable_hashagg = true; -bool enable_nestloop = true; -bool enable_material = true; -bool enable_mergejoin = true; -bool enable_hashjoin = true; -bool enable_fast_query_shipping = true; -bool enable_gathermerge = true; +bool enable_seqscan = true; +bool enable_indexscan = true; +bool enable_indexonlyscan = true; +bool enable_bitmapscan = true; +bool enable_tidscan = true; +bool enable_sort = true; +bool enable_hashagg = true; +bool enable_nestloop = true; +bool enable_material = true; +bool enable_mergejoin = true; +bool enable_hashjoin = true; +bool enable_fast_query_shipping = true; +bool enable_gathermerge = true; +bool enable_nestloop_suppression = false; typedef struct { @@ -2345,6 +2346,22 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, { /* Normal-case source costs were included in preliminary estimate */ +#ifdef __TBASE__ + /* + * When outerpath only got one row, we need to check if the number of + * rows is under estimated. It might lead to huge cost estimation error + * if innerpath is SeqScan. + * If it is the case, we count additional disable_cost to suppress this + * nestloop path. Thus Hashjoin or the rotated Nestloop join paths + * could win. + */ + if (enable_nestloop_suppression && + outer_path_rows == 1 && inner_path->pathtype == T_SeqScan && + clause_selectivity_could_under_estimated(root, outer_path)) + { + startup_cost += disable_cost; + } +#endif /* Compute number of tuples processed (not number emitted!) 
*/ ntuples = outer_path_rows * inner_path_rows; } @@ -4022,7 +4039,6 @@ has_indexed_join_quals(NestPath *joinpath) return found_one; } - /* * approx_tuple_count * Quick-and-dirty estimation of the number of join rows passing diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 01405c31..af23b681 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1021,105 +1021,116 @@ static const unit_conversion time_unit_conversion_table[] = static struct config_bool ConfigureNamesBool[] = { - { - {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of sequential-scan plans."), - NULL - }, - &enable_seqscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-scan plans."), - NULL - }, - &enable_indexscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-only-scan plans."), - NULL - }, - &enable_indexonlyscan, - true, - NULL, NULL, NULL - }, - { - {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of bitmap-scan plans."), - NULL - }, - &enable_bitmapscan, - true, - NULL, NULL, NULL - }, - { - {"enable_tidscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of TID scan plans."), - NULL - }, - &enable_tidscan, - true, - NULL, NULL, NULL - }, - { - {"enable_sort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of explicit sort steps."), - NULL - }, - &enable_sort, - true, - NULL, NULL, NULL - }, - { - {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hashed aggregation plans."), - NULL - }, - &enable_hashagg, - true, - NULL, NULL, NULL - }, - { - {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of materialization."), - NULL - }, - &enable_material, - true, - NULL, NULL, NULL - }, - { - {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of nested-loop join plans."), - NULL - }, - &enable_nestloop, - true, - NULL, NULL, NULL - }, - { - {"enable_mergejoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of merge join plans."), - NULL - }, - &enable_mergejoin, - true, - NULL, NULL, NULL - }, - { - {"enable_hashjoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hash join plans."), - NULL - }, - &enable_hashjoin, - true, - NULL, NULL, NULL - }, + { + {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of sequential-scan plans."), + NULL + }, + &enable_seqscan, + true, + NULL, NULL, NULL + }, + { + {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-scan plans."), + NULL + }, + &enable_indexscan, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-only-scan plans."), + NULL + }, + &enable_indexonlyscan, + true, + NULL, NULL, NULL + }, + { + {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of bitmap-scan plans."), + NULL + }, + &enable_bitmapscan, + true, + NULL, NULL, NULL + }, + { + {"enable_tidscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of TID scan 
plans."), + NULL + }, + &enable_tidscan, + true, + NULL, NULL, NULL + }, + { + {"enable_sort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of explicit sort steps."), + NULL + }, + &enable_sort, + true, + NULL, NULL, NULL + }, + { + {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of hashed aggregation plans."), + NULL + }, + &enable_hashagg, + true, + NULL, NULL, NULL + }, + { + {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of materialization."), + NULL + }, + &enable_material, + true, + NULL, NULL, NULL + }, + { + {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of nested-loop join plans."), + NULL + }, + &enable_nestloop, + true, + NULL, NULL, NULL + }, +#ifdef __TBASE__ + { + {"enable_nestloop_suppression", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the selectivity hints when planning nested-loop joins."), + NULL + }, + &enable_nestloop_suppression, + false, + NULL, NULL, NULL + }, +#endif + { + {"enable_mergejoin", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of merge join plans."), + NULL + }, + &enable_mergejoin, + true, + NULL, NULL, NULL + }, + { + {"enable_hashjoin", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of hash join plans."), + NULL + }, + &enable_hashjoin, + true, + NULL, NULL, NULL + }, #ifdef PGXC { {"enable_fast_query_shipping", PGC_USERSET, QUERY_TUNING_METHOD, diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index e438ae2e..102795bb 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -77,7 +77,8 @@ extern bool enable_mergejoin; extern bool enable_hashjoin; extern bool enable_fast_query_shipping; extern bool enable_gathermerge; -extern int constraint_exclusion; +extern bool enable_nestloop_suppression; +extern int constraint_exclusion; extern double clamp_row_est(double nrows); extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, @@ -219,10 +220,13 @@ extern Selectivity clauselist_selectivity(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo); extern Selectivity clause_selectivity(PlannerInfo *root, - Node *clause, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo); + Node *clause, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo); +#ifdef __TBASE__ +extern bool clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path); +#endif extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, RelOptInfo *rel, ParamPathInfo *param_info, Cost input_startup_cost, Cost input_total_cost, diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index f151b912..a133332a 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -6154,3 +6154,60 @@ where exists (select 1 from j3 (19 rows) drop table j3; +-- +-- Test nestloop path suppression if the selectivity could be under estimated +-- +create table nestloop_suppression1(a int, b int, c int, d varchar(20)); +create table nestloop_suppression2(a int, b int, c int, d varchar(20)); +create table nestloop_suppression3(a int, b int); +insert into nestloop_suppression1 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression2 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression3 select i, i+1 from 
generate_series(1,100) i; +create index idx_nestloop_suppression1_b on nestloop_suppression1(b); +analyze nestloop_suppression1; +analyze nestloop_suppression2; +analyze nestloop_suppression3; +set enable_hashjoin = false; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ + Nested Loop (cost=200.16..596.19 rows=33 width=4) + Join Filter: (t3.b > t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..453.19 rows=1 width=4) + -> Nested Loop (cost=0.16..353.18 rows=1 width=4) + Join Filter: (t1.a = t2.a) + -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) + Index Cond: (b = 2) + Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Materialize (cost=100.00..141.75 rows=100 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) +(12 rows) + +set enable_nestloop_suppression = true; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ + Nested Loop (cost=200.16..621.19 rows=33 width=4) + Join Filter: (t3.b > t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..478.19 rows=1 width=4) + -> Nested Loop (cost=0.16..378.19 rows=1 width=4) + Join Filter: (t1.a = t2.a) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Materialize (cost=0.16..8.19 rows=1 width=4) + -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) + Index Cond: (b = 2) + Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) + -> Materialize (cost=100.00..141.75 rows=100 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) +(13 rows) + +drop table nestloop_suppression1; +drop table nestloop_suppression2; +drop table nestloop_suppression3; +reset enable_nestloop_suppression; +reset enable_hashjoin; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 06796d76..0422edd6 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -109,6 +109,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster | on enable_multi_cluster_print | off enable_nestloop | on + enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | on @@ -126,7 +127,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(55 rows) +(56 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index dceca27f..ee870752 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1987,3 +1987,32 @@ where exists (select 1 from j3 and t1.unique1 < 1; drop table j3; + +-- +-- Test nestloop path suppression if the selectivity could be under estimated +-- +create table nestloop_suppression1(a int, b int, c int, d varchar(20)); +create table nestloop_suppression2(a int, b int, c int, d varchar(20)); +create table nestloop_suppression3(a int, b int); + +insert into nestloop_suppression1 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression2 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression3 select i, i+1 from generate_series(1,100) i; +create index idx_nestloop_suppression1_b on nestloop_suppression1(b); +analyze nestloop_suppression1; +analyze nestloop_suppression2; +analyze nestloop_suppression3; + +set enable_hashjoin = false; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; +set enable_nestloop_suppression = true; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + +drop table nestloop_suppression1; +drop table nestloop_suppression2; +drop table nestloop_suppression3; + +reset enable_nestloop_suppression; +reset enable_hashjoin; \ No newline at end of file From db62e69e0f182cd1818d8c3368ca15da74459654 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 22 Sep 2020 11:37:39 +0800 Subject: [PATCH 058/578] Support inline CTE with multiple references http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696082218007 --- src/backend/optimizer/plan/subselect.c | 104 ++++++++++++++++++++++++ src/test/regress/expected/subselect.out | 43 +++++----- 2 files changed, 126 insertions(+), 21 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index f7832d4d..e8495fbc 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -98,6 +98,7 @@ #include "pgxc/pgxc.h" #endif #ifdef __TBASE__ +#include #include "nodes/pg_list.h" #include "parser/parse_oper.h" #include "parser/parse_func.h" @@ -1383,6 +1384,80 @@ hash_ok_operator(OpExpr *expr) } } +#ifdef __TBASE__ +/* + * Check if total cost of inlining to multiple subquery is cheaper. + * + * There are three alternatives to optimize CTE with multiple references. + * XXX Keep the CTE as an optimization fence, using materialized CTE scan could + * be cost saving. But in TBase distributed system, this will lead to more + * executor nodes perfored in CN, which could be much slower. + * XXX Inline the CTE to multiple subqueries. This could leverage more join + * reordering and predicate pushdown opetimization automatically. + * XXX Inline the CTE to some of the reference place(s). This need an overall + * cost based optimizer including CTE inline and sublink pullup phase, + * postgres optimizer does not support this yet. 
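+ *
+ * As a hypothetical example (not taken from this patch), a query such as
+ *
+ *     WITH x AS (SELECT a, b FROM t)
+ *     SELECT * FROM x x1 JOIN x x2 ON x1.a = x2.a;
+ *
+ * references the CTE twice: inlining charges the subquery cost once per
+ * reference (discounted for the extra pushdown it enables), while keeping
+ * the CTE runs the plan once but pays the materialization overhead. The
+ * comparison below picks whichever estimate is cheaper.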
+ */ +static bool +is_cte_worth_inline(CommonTableExpr *cte, Plan *plan, Path *path) +{ + Cost inline_total_cost = 0; + Cost cte_total_cost = 0; + Cost material_cost = 0; + double material_bytes = 0; + long work_mem_bytes = work_mem * 1024L; + + /* Force pullup multi-reference CTE when enable_pullup_subquery enabled */ + if (enable_pullup_subquery) + return true; + + /* Num bytes to be materialized by CTE */ + material_bytes = plan->plan_rows * plan->plan_width; + + /* + * Whether spilling or not, charge 2x cpu_operator_cost per tuple to + * reflect bookkeeping overhead. (This rate must be more than what + * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple; + * if it is exactly the same then there will be a cost tie between + * nestloop with A outer, materialized B inner and nestloop with B outer, + * materialized A inner. The extra cost ensures we'll prefer + * materializing the smaller rel.) Note that this is normally a good deal + * less than cpu_tuple_cost; which is OK because a Material plan node + * doesn't do qual-checking or projection, so it's got less overhead than + * most plan nodes. + */ + material_cost += 2 * cpu_operator_cost * plan->plan_rows; + + /* + * If we will spill to disk, charge at the rate of seq_page_cost per page. + * This cost is assumed to be evenly spread through the plan run phase, + * which isn't exactly accurate but our cost model doesn't allow for + * nonuniform costs within the run phase. + */ + if (material_bytes > work_mem_bytes) + { + double npages = ceil(material_bytes / BLCKSZ); + + material_bytes += seq_page_cost * npages; + } + + /* Calculate total costs for different options */ + cte_total_cost = plan->total_cost + material_cost; + inline_total_cost = plan->total_cost * cte->cterefcount; + + /* + * In a distributed system like TBase, the inline one could leverage more + * optimizations like subquery pullup, predicate pushdown, etc. We add a + * optimization factor 0.5 here to show case these cost saves. + */ + inline_total_cost = inline_total_cost * 0.5; + + if (inline_total_cost <= cte_total_cost) + return true; + else + return false; +} +#endif /* * SS_process_ctes: process a query's WITH list @@ -1508,6 +1583,35 @@ SS_process_ctes(PlannerInfo *root) plan = create_plan(subroot, best_path); +#ifdef __TBASE__ + /* + * Handle the CTE with multiple references in the main query. Since we + * need to compare the cost between CTE Scan and inline subquery Scan, + * perform the inline check after we got the best path of CTE subquery. + */ + if ((cte->ctematerialized == CTEMaterializeNever || + (cte->ctematerialized == CTEMaterializeDefault && + cte->cterefcount > 1)) && + !cte->cterecursive && + cmdType == CMD_SELECT && + !contain_dml(cte->ctequery) && + (cte->cterefcount <= 1 || + !contain_outer_selfref(cte->ctequery)) && + !contain_volatile_functions(cte->ctequery)) + { + /* + * Check if total cost of inlining to multiple subquery is cheaper. + */ + if (is_cte_worth_inline(cte, plan, best_path)) + { + inline_cte(root, cte); + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } + } +#endif + #ifdef XCP /* Add a remote subplan, if redistribution is needed. 
*/ if (subroot->distribution) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 52c196aa..85af9fb1 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1787,27 +1787,28 @@ select * from x where f1 = 1; explain (verbose, costs off) with x as (select * from (select f1, now() as n from subselect_tbl) ss) select * from x, x x2 where x.n = x2.n; - QUERY PLAN -------------------------------------------------------------- - Merge Join - Output: x.f1, x.n, x2.f1, x2.n - Merge Cond: (x.n = x2.n) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - -> Sort - Output: x.f1, x.n - Sort Key: x.n - -> CTE Scan on x - Output: x.f1, x.n - -> Sort - Output: x2.f1, x2.n - Sort Key: x2.n - -> CTE Scan on x x2 - Output: x2.f1, x2.n -(18 rows) + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) explain (verbose, costs off) with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) From 6d816ab54dbcf37787b462c66ea664a7ca9b8d53 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 23 Sep 2020 09:31:40 +0800 Subject: [PATCH 059/578] Fix bug of send concurrently by DataPumpRawSendData and pq_flush in DN. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080938551 --- src/backend/utils/error/elog.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 58ff0658..ca77995a 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -1591,12 +1591,31 @@ EmitErrorReport(void) /* Send to client, if enabled */ if (edata->output_to_client) { - if (true == g_enable_copy_silence) + if (true == g_enable_copy_silence || + (IS_PGXC_DATANODE && edata->elevel < ERROR)) { + /* + * Do not send nonfatal msg to client for Datanode. + * + * It is possible that DataPumpRawSendData is sending data now, + * and this msg can be mixed with data message + * if the socket is written concurrently. + * + * In addition, the msg is not that important. + */ ; } else { + /* + * For the same reason as above, shut down producer for Datanode + * before send ERROR/FATAL msg. + * It is ok to shut down it again in AbortTransaction. 
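+			 * (For example, if the squeue producer is still streaming rows
+			 * through DataPumpRawSendData when the ERROR is raised, sending
+			 * the error message directly could interleave it with a data
+			 * message on the same client socket.)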
+ */ + if (IS_PGXC_DATANODE) + { + SqueueProducerExit(); + } send_message_to_frontend(edata); } } From 524ea3d76c6a18a96c01a1b854f82cb491806c7b Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 23 Sep 2020 17:57:57 +0800 Subject: [PATCH 060/578] Improve regress stability --- src/test/regress/expected/join_3.out | 46 +++++++++---------- src/test/regress/expected/rowsecurity_1.out | 2 +- .../regress/expected/select_parallel_4.out | 18 ++++---- src/test/regress/sql/join.sql | 6 ++- src/test/regress/sql/rowsecurity.sql | 2 +- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index a133332a..ce5f9512 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2979,6 +2979,7 @@ select * from int4_tbl a full join int4_tbl b on false order by 1,2; -- -- test for ability to use a cartesian join when necessary -- +set enable_hashjoin = false; explain (num_nodes off, nodes off, costs off) select * from tenk1 join int4_tbl on f1 = twothousand, @@ -2987,8 +2988,8 @@ select * from where q1 = thousand or q2 = thousand; QUERY PLAN ------------------------------------------------------------------------------------ - Hash Join - Hash Cond: (tenk1.twothousand = int4_tbl.f1) + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop -> Nested Loop -> Function Scan on q1 @@ -3002,7 +3003,7 @@ where q1 = thousand or q2 = thousand; Index Cond: (q1.q1 = thousand) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (q2.q2 = thousand) - -> Hash + -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl (18 rows) @@ -3015,8 +3016,8 @@ select * from where thousand = (q1 + q2); QUERY PLAN -------------------------------------------------------------------------- - Hash Join - Hash Cond: (tenk1.twothousand = int4_tbl.f1) + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop -> Nested Loop -> Function Scan on q1 @@ -3027,41 +3028,38 @@ where thousand = (q1 + q2); Recheck Cond: (thousand = (q1.q1 + q2.q2)) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = (q1.q1 + q2.q2)) - -> Hash + -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl (15 rows) +set enable_hashjoin = true; -- -- test ability to generate a suitable plan for a star-schema query -- +set enable_mergejoin = false; explain (costs off) select * from tenk1, int8_tbl a, int8_tbl b where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; - QUERY PLAN ---------------------------------------------------------- + QUERY PLAN +-------------------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 - -> Merge Join - Merge Cond: (tenk1.thousand = a.q1) - -> Sort - Sort Key: tenk1.thousand - -> Merge Join - Merge Cond: (tenk1.tenthous = b.q1) - -> Sort - Sort Key: tenk1.tenthous - -> Seq Scan on tenk1 - -> Sort - Sort Key: b.q1 - -> Seq Scan on int8_tbl b - Filter: (q2 = 2) - -> Sort - Sort Key: a.q1 + -> Hash Join + Hash Cond: (tenk1.thousand = a.q1) + -> Hash Join + Hash Cond: (tenk1.tenthous = b.q1) + -> Seq Scan on tenk1 + -> Hash + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Hash -> Seq Scan on int8_tbl a Filter: (q2 = 1) -(19 rows) +(13 rows) +set enable_mergejoin = true; -- -- test a corner case in which we shouldn't apply the star-schema optimization -- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 60160a5a..e0336e73 100644 
--- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2260,7 +2260,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view order by 1; a | b ---+----- 1 | aba diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 3ae6bc47..4d264b26 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -139,19 +139,19 @@ alter table tenk2 set (parallel_workers = 0); explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); - QUERY PLAN ------------------------------------------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather Workers Planned: 4 - -> Hash Anti Join - Hash Cond: ((tenk1.two = tenk2.hundred) AND (tenk1.four = tenk2.thousand)) + -> Partial Aggregate -> Parallel Seq Scan on tenk1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk2 - Filter: (thousand > 100) + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk2 + Filter: (thousand > 100) (11 rows) select count(*) from tenk1 where (two, four) not in diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index ee870752..16e8dd0b 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -884,7 +884,7 @@ select * from int4_tbl a full join int4_tbl b on false order by 1,2; -- -- test for ability to use a cartesian join when necessary -- - +set enable_hashjoin = false; explain (num_nodes off, nodes off, costs off) select * from tenk1 join int4_tbl on f1 = twothousand, @@ -898,15 +898,17 @@ select * from int4(sin(1)) q1, int4(sin(0)) q2 where thousand = (q1 + q2); +set enable_hashjoin = true; -- -- test ability to generate a suitable plan for a star-schema query -- - +set enable_mergejoin = false; explain (costs off) select * from tenk1, int8_tbl a, int8_tbl b where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; +set enable_mergejoin = true; -- -- test a corner case in which we shouldn't apply the star-schema optimization diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index bd588af8..3fa55ccc 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -892,7 +892,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. 
SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view order by 1; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; DROP VIEW rls_view; From 3d357481bd11159ffbcd3d3a7fed2cc7ab03445a Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Thu, 24 Sep 2020 16:14:12 +0800 Subject: [PATCH 061/578] remove elog in GTMSetSockKeepAlive --- src/backend/access/transam/gtm.c | 9 +++++++-- src/gtm/client/fe-connect.c | 17 ++++++++++------- src/include/gtm/libpq-fe.h | 2 +- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 9ec7287d..81bb209f 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1297,8 +1297,13 @@ InitGTM(void) } else { - GTMSetSockKeepAlive(conn, tcp_keepalives_idle, - tcp_keepalives_interval, tcp_keepalives_count); + if (!GTMSetSockKeepAlive(conn, tcp_keepalives_idle, + tcp_keepalives_interval, tcp_keepalives_count)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("GTMSetSockKeepAlive failed: %m"))); + } if (IS_PGXC_COORDINATOR) { register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 1e3b712f..6e3b0306 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -1427,7 +1427,7 @@ GTMPQuntrace(GTM_Conn *conn) * Set socket keepalive and user_timeout. * We can use this to detect the broken connection quickly. */ -void +bool GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, int tcp_keepalives_interval, int tcp_keepalives_count) { @@ -1442,35 +1442,36 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); if (info.tcpi_state != TCP_ESTABLISHED) { - return; + /* No need to set */ + return true; } /* set keepalive */ if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&keepalive, sizeof(keepalive)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + return false; } if (tcp_keepalives_idle > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, (char *)&tcp_keepalives_idle, sizeof(tcp_keepalives_idle)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + return false; } if (tcp_keepalives_interval > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, (char *)&tcp_keepalives_interval, sizeof(tcp_keepalives_interval)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + return false; } if (tcp_keepalives_count > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&tcp_keepalives_count, sizeof(tcp_keepalives_count)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + return false; } /* set user_timeout */ @@ -1479,6 +1480,8 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, (char *)&user_timeout, sizeof(user_timeout)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + return false; } + + return true; } \ No newline at end of file diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h index 54058e5f..d5e3d2fb 100644 --- a/src/include/gtm/libpq-fe.h +++ b/src/include/gtm/libpq-fe.h @@ -130,7 +130,7 @@ extern void GTMPQuntrace(GTM_Conn *conn); /* Force the write buffer to be written (or at least try) */ extern int PQflush(GTM_Conn *conn); -extern void GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, +extern bool GTMSetSockKeepAlive(GTM_Conn *conn, int 
tcp_keepalives_idle, int tcp_keepalives_interval, int tcp_keepalives_count); #define libpq_gettext(x) x From 1efbc57281b85eefa01f388161bd48b42a061162 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Fri, 25 Sep 2020 09:38:02 +0800 Subject: [PATCH 062/578] Fix the problem that the deleted table still leaves security metadata in the security feature --- src/backend/commands/user.c | 17 ++------ src/backend/utils/misc/cls.c | 56 +++++++++++++++++++++++++ src/backend/utils/misc/mls.c | 46 ++++++++++---------- src/include/utils/cls.h | 2 + src/test/regress/expected/mls_check.out | 49 +++++++++++++++++++++- src/test/regress/sql/mls_check.sql | 17 +++++++- 6 files changed, 150 insertions(+), 37 deletions(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 8d35b8c6..038119db 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -1178,23 +1178,12 @@ DropRole(DropRoleStmt *stmt) errdetail_log("%s", detail_log))); #ifdef _MLS_ - if (true == mls_check_role_permission(roleid)) + if (true == mls_check_role_permission(roleid) || + true == cls_check_user_has_policy(roleid)) { - elog(ERROR, "could not drop role:%s, cause this role has mls policy bound", + elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", role); } - - if (!is_mls_user() && userid_is_mls_user(roleid)) - { - elog(ERROR, "non-mls user could not drop mls role:%s, permission denied", - role); - } - - if(is_mls_user() && !userid_is_mls_user(roleid)) - { - elog(ERROR, "mls user could not drop role:%s, permission denied", - role); - } #endif /* * Remove the role from the pg_authid table diff --git a/src/backend/utils/misc/cls.c b/src/backend/utils/misc/cls.c index 48366cac..916968ad 100644 --- a/src/backend/utils/misc/cls.c +++ b/src/backend/utils/misc/cls.c @@ -1109,5 +1109,61 @@ bool cls_check_table_col_has_policy(Oid relid, int attnum) return false; } +/* + * check table has policy + */ +bool cls_check_table_has_policy(Oid relid) +{ + int16 attnum = InvalidAttrNumber; + + attnum = cls_check_table_has_cls_policy(relid); + if (attnum != InvalidAttrNumber) + { + return true; + } + return false; +} + +/* + * check user whether has policy + */ +bool cls_check_user_has_policy(Oid roleid) +{ + SysScanDesc scan; + ScanKeyData skey[1]; + HeapTuple htup; + Relation rel; + bool found = false; + + ScanKeyInit(&skey[0], + Anum_pg_cls_user_userid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(roleid)); + + rel = heap_open(ClsUserRelationId, AccessShareLock); + scan = systable_beginscan(rel, + PgClsUserPolidUseridIndexId, + true, + NULL, + 1, + skey); + + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + Form_pg_cls_user form_cls_user = (Form_pg_cls_user) GETSTRUCT(htup); + + if (form_cls_user) + { + found = true; + break; + } + } + + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return found; +} #endif diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 2c4b3795..c29ed21c 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -363,11 +363,8 @@ Datum pg_trsprt_crypt_support_datatype(PG_FUNCTION_ARGS) */ bool mls_check_relation_permission(Oid relid, bool * schema_bound) { - bool found; Oid parent_oid; - found = false; - if (!IS_SYSTEM_REL(relid)) { if (schema_bound) @@ -377,20 +374,27 @@ bool mls_check_relation_permission(Oid relid, bool * schema_bound) parent_oid = mls_get_parent_oid_by_relid(relid); - found = datamask_check_table_has_datamask(parent_oid); - if (true == found) + if 
(datamask_check_table_has_datamask(parent_oid) || + datamask_check_table_has_datamask(relid)) + { + return true; + } + + if (transparent_crypt_check_table_has_crypto(parent_oid, true, schema_bound) || + transparent_crypt_check_table_has_crypto(relid, true, schema_bound)) { - return found; + return true; } - found = trsprt_crypt_check_table_has_crypt(parent_oid, true, schema_bound); - if (true == found) + if (cls_check_table_has_policy(parent_oid) || + cls_check_table_has_policy(relid)) { - return found; + return true; } + } - return found; + return false; } bool mls_check_schema_permission(Oid schemaoid) @@ -429,31 +433,31 @@ bool mls_check_schema_permission(Oid schemaoid) bool mls_check_column_permission(Oid relid, int attnum) { Oid parent_oid; - bool found = false; if (!IS_SYSTEM_REL(relid)) { parent_oid = mls_get_parent_oid_by_relid(relid); - found = dmask_check_table_col_has_dmask(parent_oid, attnum); - if (true == found) + + if (datamask_check_table_col_has_datamask(parent_oid, attnum) || + datamask_check_table_col_has_datamask(relid, attnum)) { - return found; + return true; } - found = trsprt_crypt_chk_tbl_col_has_crypt(parent_oid, attnum); - if (true == found) + if (transparent_crypt_check_table_col_has_crypto(parent_oid, attnum) || + transparent_crypt_check_table_col_has_crypto(relid, attnum)) { - return found; + return true; } - found = cls_check_table_col_has_policy(parent_oid, attnum); - if (true == found) + if (cls_check_table_col_has_policy(parent_oid, attnum) || + cls_check_table_col_has_policy(relid, attnum)) { - return found; + return true; } } - return found; + return false; } diff --git a/src/include/utils/cls.h b/src/include/utils/cls.h index 3a91435c..2d753eec 100644 --- a/src/include/utils/cls.h +++ b/src/include/utils/cls.h @@ -84,5 +84,7 @@ extern void mls_update_cls_with_current_user(TupleTableSlot *slot); extern bool mls_cls_column_add_check(char * colname, Oid typoid); extern bool mls_cls_column_drop_check(char * name); extern bool cls_check_table_col_has_policy(Oid relid, int attnum); +extern bool cls_check_table_has_policy(Oid relid); +extern bool cls_check_user_has_policy(Oid relid); #endif diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 0e5b955d..371bfe10 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -3776,6 +3776,34 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt_2'); t (1 row) +-- child table has bind +select algorithm_id, nspname, tblname from pg_transparent_crypt_policy_map where nspname ilike '%alt%' order by 1,2,3; + algorithm_id | nspname | tblname +--------------+-------------------------+------------------------ + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_0 + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_1 + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_2 +(3 rows) + +--clean child +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_0'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_1'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_2'); + mls_transparent_crypt_algorithm_unbind_table 
+---------------------------------------------- + t +(1 row) + \c - godlike drop table no_crypted_schema_alt.tbl_crypted_alt; drop table no_crypted_schema_alt.tbl_nocrypt_alt; @@ -5001,7 +5029,7 @@ select * from xixi where i = 3; (1 row) \c - badboy ---fails to update +--fails to update insert into xixi as x(i,j) values(6,6) on conflict(i) do update set j = 3096 where x.j = 2048 and x.i = 6; select * from xixi where i = 6; i | j | _cls @@ -5534,7 +5562,26 @@ truncate table lala3; drop table lala; drop table lala2; drop table lala3; +\c - mls_admin +select * from pg_cls_table; + polid | attnum | relid | enable | nspname | tblname | reloptions +-------+--------+-------+--------+---------+---------+------------ + 99 | 3 | 17061 | t | public | xixi | +(1 row) + +select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); + mls_cls_drop_table_label +-------------------------- + t +(1 row) + +select * from pg_cls_table; + polid | attnum | relid | enable | nspname | tblname | reloptions +-------+--------+-------+--------+---------+---------+------------ +(0 rows) + --everything is done +\c - godlike drop table xixi; drop table momo; -----------------CLS END-------------------- diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 208fd38b..4369a706 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -1416,6 +1416,15 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt', 'tb select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2'); select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt'); select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt_2'); + +-- child table has bind +select algorithm_id, nspname, tblname from pg_transparent_crypt_policy_map where nspname ilike '%alt%' order by 1,2,3; + +--clean child +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_0'); +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_1'); +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_2'); + \c - godlike drop table no_crypted_schema_alt.tbl_crypted_alt; drop table no_crypted_schema_alt.tbl_nocrypt_alt; @@ -1923,7 +1932,7 @@ select * from xixi where i = 3; insert into xixi as x(i,j) values(3,3) on conflict(i) do update set j = 2048 where x.j = 1024 and x.i = 3; select * from xixi where i = 3; \c - badboy ---fails to update +--fails to update insert into xixi as x(i,j) values(6,6) on conflict(i) do update set j = 3096 where x.j = 2048 and x.i = 6; select * from xixi where i = 6; \c - godlike @@ -2153,7 +2162,13 @@ drop table lala; drop table lala2; drop table lala3; +\c - mls_admin +select * from pg_cls_table; +select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); +select * from pg_cls_table; + --everything is done +\c - godlike drop table xixi; drop table momo; From 0d82571a14784f41ec51376dbd07a5dff4d61210 Mon Sep 17 00:00:00 2001 From: jennyerchen Date: Fri, 25 Sep 2020 06:24:31 +0000 Subject: [PATCH 063/578] Merge branch 'sequence_curval' into 'Tbase_v5.04' (merge request !95) fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 Signed-off-by: 
JennyJennyChen (cherry picked from commit cd43ab51) 0489b282 fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 Signed-off-by: JennyJennyChen --- src/backend/commands/sequence.c | 43 ++++++++++---------- src/test/regress/output/constraints.source | 2 +- src/test/regress/output/constraints_2.source | 2 +- src/test/regress/output/constraints_3.source | 2 +- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index bc7cb490..5b6fd741 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -996,28 +996,29 @@ nextval_internal(Oid relid, bool check_permissions) Datum currval_oid(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - int64 result; - SeqTable elm; - Relation seqrel; - char *seqname = NULL; + Oid relid = PG_GETARG_OID(0); + int64 result; + SeqTable elm; + Relation seqrel; + char *seqname = NULL; + + /* open and lock sequence */ + init_sequence(relid, &elm, &seqrel); + + if (pg_class_aclcheck(elm->relid, GetUserId(), + ACL_SELECT | ACL_USAGE) != ACLCHECK_OK) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for sequence %s", + RelationGetRelationName(seqrel)))); + + if (elm->last_valid) + { + result = elm->last; + relation_close(seqrel, NoLock); + PG_RETURN_INT64(result); + } - /* open and lock sequence */ - init_sequence(relid, &elm, &seqrel); - - if (pg_class_aclcheck(elm->relid, GetUserId(), - ACL_SELECT | ACL_USAGE) != ACLCHECK_OK) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied for sequence %s", - RelationGetRelationName(seqrel)))); -#if 0 - if (!elm->last_valid) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("currval of sequence \"%s\" is not yet defined in this session", - RelationGetRelationName(seqrel)))); -#endif #ifdef XCP { /* diff --git a/src/test/regress/output/constraints.source b/src/test/regress/output/constraints.source index 568efec7..e19ef775 100644 --- a/src/test/regress/output/constraints.source +++ b/src/test/regress/output/constraints.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 8 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/output/constraints_2.source b/src/test/regress/output/constraints_2.source index 46a83703..241adcc1 100644 --- a/src/test/regress/output/constraints_2.source +++ b/src/test/regress/output/constraints_2.source @@ -188,7 +188,7 @@ DETAIL: Failing row contains (9, Y, -9). SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 9 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/output/constraints_3.source b/src/test/regress/output/constraints_3.source index bbdb9c1e..e19ef775 100644 --- a/src/test/regress/output/constraints_3.source +++ b/src/test/regress/output/constraints_3.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). 
SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 9 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL From 5db1bcbbd928fa9eab9673ac2d2a9044ddb9f447 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 24 Sep 2020 15:27:38 +0800 Subject: [PATCH 064/578] for gtm monitor ID859649763 --- src/gtm/client/fe-connect.c | 143 ++++---- src/gtm/client/fe-protocol.c | 541 ++++++++++++++++++------------- src/gtm/client/gtm_client.c | 100 ++++++ src/gtm/common/Makefile | 2 +- src/gtm/common/bloom.c | 282 ++++++++++++++++ src/gtm/common/datapump.c | 337 +++++++++++++++++++ src/gtm/common/elog.c | 136 ++++---- src/gtm/gtm_ctl/gtm_ctl.c | 449 ++++++++++++++++++------- src/gtm/libpq/pqformat.c | 2 +- src/gtm/main/Makefile | 2 +- src/gtm/main/gtm_seq.c | 83 +++-- src/gtm/main/gtm_stat.c | 253 ++++++++++++++- src/gtm/main/gtm_stat_error.c | 385 ++++++++++++++++++++++ src/gtm/main/gtm_store.c | 4 +- src/gtm/main/gtm_thread.c | 110 ++++--- src/gtm/main/main.c | 269 ++++++++++----- src/include/gtm/bloom.h | 39 +++ src/include/gtm/datapump.h | 52 +++ src/include/gtm/elog.h | 6 + src/include/gtm/gtm.h | 4 + src/include/gtm/gtm_c.h | 2 +- src/include/gtm/gtm_client.h | 272 ++++++++-------- src/include/gtm/gtm_msg.h | 7 +- src/include/gtm/gtm_stat.h | 86 +++++ src/include/gtm/gtm_stat_error.h | 56 ++++ 25 files changed, 2837 insertions(+), 785 deletions(-) create mode 100644 src/gtm/common/bloom.c create mode 100644 src/gtm/common/datapump.c create mode 100644 src/gtm/main/gtm_stat_error.c create mode 100644 src/include/gtm/bloom.h create mode 100644 src/include/gtm/datapump.h create mode 100644 src/include/gtm/gtm_stat.h create mode 100644 src/include/gtm/gtm_stat_error.h diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 6e3b0306..97e6e0c5 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -929,76 +929,83 @@ freeGTM_Conn(GTM_Conn *conn) termGTMPQExpBuffer(&conn->errorMessage); termGTMPQExpBuffer(&conn->workBuffer); #ifdef XCP - if (conn->result) - { - /* Free last snapshot if defined */ - if (conn->result->gr_snapshot.sn_xip) - free(conn->result->gr_snapshot.sn_xip); - - /* Depending on result type there could be allocated data */ - switch (conn->result->gr_type) - { - case SEQUENCE_INIT_RESULT: - case SEQUENCE_RESET_RESULT: - case SEQUENCE_CLOSE_RESULT: - case SEQUENCE_RENAME_RESULT: - case SEQUENCE_ALTER_RESULT: - case SEQUENCE_SET_VAL_RESULT: - case MSG_DB_SEQUENCE_RENAME_RESULT: - if (conn->result->gr_resdata.grd_seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seqkey.gsk_key); - break; - - case SEQUENCE_GET_NEXT_RESULT: - case SEQUENCE_GET_LAST_RESULT: - if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); - break; - - default: - break; - } - - -#ifdef __TBASE__ - if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) - { - free(conn->result->grd_storage_data.data); - conn->result->grd_storage_data.data = NULL; - conn->result->grd_storage_data.len = 0; - } - - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if 
(conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + if (conn->result) + { + /* Free last snapshot if defined */ + if (conn->result->gr_snapshot.sn_xip) + free(conn->result->gr_snapshot.sn_xip); + + /* Depending on result type there could be allocated data */ + switch (conn->result->gr_type) + { + case SEQUENCE_INIT_RESULT: + case SEQUENCE_RESET_RESULT: + case SEQUENCE_CLOSE_RESULT: + case SEQUENCE_RENAME_RESULT: + case SEQUENCE_ALTER_RESULT: + case SEQUENCE_SET_VAL_RESULT: + case MSG_DB_SEQUENCE_RENAME_RESULT: + if (conn->result->gr_resdata.grd_seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seqkey.gsk_key); + break; + + case SEQUENCE_GET_NEXT_RESULT: + case SEQUENCE_GET_LAST_RESULT: + if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); + break; + + default: + break; + } + + +#ifdef __TBASE__ + if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) + { + free(conn->result->grd_storage_data.data); + conn->result->grd_storage_data.data = NULL; + conn->result->grd_storage_data.len = 0; + } + + if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) + { + free(conn->result->grd_store_seq.seqs); + conn->result->grd_store_seq.seqs = NULL; + conn->result->grd_store_seq.count = 0; + } + + if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) + { + free(conn->result->grd_store_txn.txns); + conn->result->grd_store_txn.txns = NULL; + conn->result->grd_store_txn.count = 0; + } + + if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) + { + free(conn->result->grd_store_check_seq.seqs); + conn->result->grd_store_check_seq.seqs = NULL; + conn->result->grd_store_check_seq.count = 0; + } + + if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + { + free(conn->result->grd_store_check_txn.txns); + conn->result->grd_store_check_txn.txns = NULL; + conn->result->grd_store_check_txn.count = 0; + } + + if (conn->result->grd_errlog.len && conn->result->grd_errlog.errlog) { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; + free(conn->result->grd_errlog.errlog); + conn->result->grd_errlog.errlog = NULL; + conn->result->grd_errlog.len = 0; } - -#endif - free(conn->result); - } + +#endif + free(conn->result); + } #endif free(conn); diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 9bdfc9be..fb43649c 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -737,287 +737,368 @@ result->gr_status = GTM_RESULT_ERROR; } #endif - /* communication protocol: total data len, pkg number, {pkg_len,pkg_data}, {pkg_len,pkg_data},*/ - if (gtmpqGetInt(&result->grd_storage_data.len, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - /* get loop count */ - if (gtmpqGetInt(&loop_count, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - result->grd_storage_data.data = (char *) malloc(result->grd_storage_data.len); - data_buf = result->grd_storage_data.data; - for (i = 0; i < loop_count; i++) - { - /* a length of the next send pkg */ - if (gtmpqGetInt(&data_len, 
sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - /* pkg body */ - if (gtmpqGetnchar(data_buf + offset, data_len, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - offset += data_len; - } - - if (result->gr_status != GTM_RESULT_OK) - { - if (offset != result->grd_storage_data.len) - { - abort(); - } - } - } - break; - - case TXN_FINISH_GID_RESULT: + /* communication protocol: total data len, pkg number, {pkg_len,pkg_data}, {pkg_len,pkg_data},*/ + if (gtmpqGetInt(&result->grd_storage_data.len, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + /* get loop count */ + if (gtmpqGetInt(&loop_count, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + result->grd_storage_data.data = (char *) malloc(result->grd_storage_data.len); + data_buf = result->grd_storage_data.data; + for (i = 0; i < loop_count; i++) + { + /* a length of the next send pkg */ + if (gtmpqGetInt(&data_len, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + /* pkg body */ + if (gtmpqGetnchar(data_buf + offset, data_len, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + offset += data_len; + } + + if (result->gr_status != GTM_RESULT_OK) + { + if (offset != result->grd_storage_data.len) + { + abort(); + } + } + } + break; + + case TXN_FINISH_GID_RESULT: + { + if (gtmpqGetInt(&result->gr_finish_status, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + break; + } + + case MSG_LIST_GTM_STORE_RESULT: + { + if (gtmpqGetInt64(&result->gtm_status.header.m_identifier, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_major_version, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_minor_version, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_gtm_status, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt64(&result->gtm_status.header.m_next_gts, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_global_xmin, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_next_gxid, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_seq_freelist, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_txn_freelist, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt64(&result->gtm_status.header.m_lsn, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + + if (gtmpqGetInt64(&result->gtm_status.header.m_last_update_time, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_crc, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.seq_total, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.seq_used, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if 
(gtmpqGetInt((int32 *) &result->gtm_status.txn_total, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.txn_used, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + break; + } + + case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ + { + if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) + { + free(conn->result->grd_store_seq.seqs); + conn->result->grd_store_seq.seqs = NULL; + conn->result->grd_store_seq.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_seq.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_seq.seqs = + (GTM_StoredSeqInfo *) malloc(sizeof(GTM_StoredSeqInfo) * + conn->result->grd_store_seq.count); + for (i = 0; i < conn->result->grd_store_seq.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_seq.seqs[i], sizeof(GTM_StoredSeqInfo), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ + { + if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) + { + free(conn->result->grd_store_txn.txns); + conn->result->grd_store_txn.txns = NULL; + conn->result->grd_store_txn.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_txn.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_txn.txns = + (GTM_StoredTransactionInfo *) malloc(sizeof(GTM_StoredTransactionInfo) * + conn->result->grd_store_txn.count); + for (i = 0; i < conn->result->grd_store_txn.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_txn.txns[i], sizeof(GTM_StoredTransactionInfo), + conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + + case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ + { + if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) + { + free(conn->result->grd_store_check_seq.seqs); + conn->result->grd_store_check_seq.seqs = NULL; + conn->result->grd_store_check_seq.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_check_seq.seqs = + (GTMStorageSequneceStatus *) malloc(sizeof(GTMStorageSequneceStatus) * + conn->result->grd_store_check_seq.count); + for (i = 0; i < conn->result->grd_store_check_seq.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_check_seq.seqs[i], sizeof(GTMStorageSequneceStatus), + conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ + { + if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + { + free(conn->result->grd_store_check_txn.txns); + conn->result->grd_store_check_txn.txns = NULL; + conn->result->grd_store_check_txn.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_check_txn.txns = + (GTMStorageTransactionStatus *) malloc(sizeof(GTMStorageTransactionStatus) * + conn->result->grd_store_check_txn.count); + for (i = 0; i < conn->result->grd_store_check_txn.count; i++) + { + if (gtmpqGetnchar((char *) 
&conn->result->grd_store_check_txn.txns[i], + sizeof(GTMStorageTransactionStatus), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_GET_GTM_STATISTICS_RESULT: { - if (gtmpqGetInt(&result->gr_finish_status, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - break; - } - - case MSG_LIST_GTM_STORE_RESULT: - { - if (gtmpqGetInt64(&result->gtm_status.header.m_identifier, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_major_version, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_minor_version, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_gtm_status, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt64(&result->gtm_status.header.m_next_gts, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_global_xmin, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_next_gxid, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_seq_freelist, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_txn_freelist, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt64(&result->gtm_status.header.m_lsn, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - - if (gtmpqGetInt64(&result->gtm_status.header.m_last_update_time, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_crc, sizeof(int32), conn)) + if (gtmpqGetInt64(&result->gr_resdata.statistic_result.start_time, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.seq_total, sizeof(int32), conn)) + if (gtmpqGetInt64(&result->gr_resdata.statistic_result.end_time, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.seq_used, sizeof(int32), conn)) + if (gtmpqGetInt(&result->gr_resdata.statistic_result.sequences_remained, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.txn_total, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.txn_used, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - break; - } - - case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ - { - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_seq.count, + if (gtmpqGetInt(&result->gr_resdata.statistic_result.txn_remained, sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - conn->result->grd_store_seq.seqs = - (GTM_StoredSeqInfo *) malloc(sizeof(GTM_StoredSeqInfo) * - conn->result->grd_store_seq.count); - for (i = 0; i < conn->result->grd_store_seq.count; i++) + for (i = 0; i < 
CMD_STATISTICS_TYPE_COUNT; i++) { - if (gtmpqGetnchar((char *) &conn->result->grd_store_seq.seqs[i], sizeof(GTM_StoredSeqInfo), conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].total_request_times, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - } - break; - } - case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ - { - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_txn.count, - sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - conn->result->grd_store_txn.txns = - (GTM_StoredTransactionInfo *) malloc(sizeof(GTM_StoredTransactionInfo) * - conn->result->grd_store_txn.count); - for (i = 0; i < conn->result->grd_store_txn.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_txn.txns[i], sizeof(GTM_StoredTransactionInfo), - conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].avg_costtime, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - } - break; - } - - case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ - { - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, - sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].max_costtime, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } - conn->result->grd_store_check_seq.seqs = - (GTMStorageSequneceStatus *) malloc(sizeof(GTMStorageSequneceStatus) * - conn->result->grd_store_check_seq.count); - for (i = 0; i < conn->result->grd_store_check_seq.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_check_seq.seqs[i], sizeof(GTMStorageSequneceStatus), - conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].min_costtime, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } } + break; } - - case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ + case MSG_GET_GTM_ERRORLOG_RESULT: { - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + result->grd_errlog.len = result->gr_msglen; + if (result->gr_msglen == 0) { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; + break; } - if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, - sizeof(int32), conn)) + result->grd_errlog.errlog = + (char *) malloc(result->gr_msglen); + if (gtmpqGetnchar((char *) result->grd_errlog.errlog, + result->gr_msglen, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - - conn->result->grd_store_check_txn.txns = - (GTMStorageTransactionStatus *) malloc(sizeof(GTMStorageTransactionStatus) * - conn->result->grd_store_check_txn.count); - for (i = 0; i < conn->result->grd_store_check_txn.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_check_txn.txns[i], - sizeof(GTMStorageTransactionStatus), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - } 
break; } + #endif case SEQUENCE_LIST_RESULT: if (gtmpqGetInt(&result->gr_resdata.grd_seq_list.seq_count, diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index d5b5b56d..c0ef6ab1 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -574,6 +574,106 @@ check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecPtr * return GTM_RESULT_ERROR; } +/* + * to get GTM statistics info + */ +int +get_gtm_statistics(GTM_Conn *conn, int clear_flag, int timeout_seconds, GTM_StatisticsResult** result) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_GET_STATISTICS, sizeof (GTM_MessageType), conn)) + goto send_failed; + + if (gtmpqPutInt(clear_flag,sizeof(int),conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + /* add two seconds to allow extra wait */ + finish_time = time(NULL) + timeout_seconds + 2; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (GTM_RESULT_OK == res->gr_status) + { + *result = &(res->gr_resdata.statistic_result); + return GTM_RESULT_OK; + } + else + { + return GTM_RESULT_ERROR; + } + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return GTM_RESULT_ERROR; +} + +/* + * to get gtm error log + */ +int +get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_GET_ERRORLOG, sizeof (GTM_MessageType), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + /* add two seconds to allow extra wait */ + finish_time = time(NULL) + timeout_seconds + 2; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (GTM_RESULT_OK == res->gr_status) + { + *errlog = res->grd_errlog.errlog; + *len = res->grd_errlog.len; + return GTM_RESULT_OK; + } + else + { + return GTM_RESULT_ERROR; + } + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return GTM_RESULT_ERROR; +} + #endif /* * Transaction Management API diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile index 8f91e968..43d80dad 100644 --- a/src/gtm/common/Makefile +++ b/src/gtm/common/Makefile @@ -23,7 +23,7 @@ LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq LIBS=-lpthread -lrt OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \ - gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o + gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o all:all-lib diff --git a/src/gtm/common/bloom.c b/src/gtm/common/bloom.c new file mode 100644 index 00000000..14348110 --- /dev/null +++ b/src/gtm/common/bloom.c @@ -0,0 +1,282 @@ +/*------------------------------------------------------------------------- + * + * bloom.c + * + * a bloom filter, using murmurhash + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/gtm/common/bloom.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/bloom.h" +#include "gtm/palloc.h" + +#define SETBIT(bitmap, bit) ((bitmap)[(bit)/CHAR_BIT] |= (1<<((bit)%CHAR_BIT))) +#define GETBIT(bitmap, bit) ((bitmap)[(bit)/CHAR_BIT] & (1<<((bit)%CHAR_BIT))) +#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } + +/* + * Create a bloom filter, variable parameter is hash seed + * hash function num depend on seeds + */ +BLOOM * +BloomCreate(int bitmap_size, int nfuncs, ...) 
+{ + BLOOM *bloom; + va_list l; + int i; + + bloom = palloc(sizeof(BLOOM)); + if (NULL == bloom) + { + return NULL; + } + + bloom->bitmap = palloc0( ((bitmap_size + CHAR_BIT - 1) / CHAR_BIT) * sizeof(char)); + if (NULL == bloom->bitmap) + { + pfree(bloom); + return NULL; + } + + bloom->seeds = (uint32*)palloc(nfuncs * sizeof(uint32)); + if (NULL == bloom->seeds) + { + pfree(bloom->bitmap); + pfree(bloom); + return NULL; + } + + va_start(l, nfuncs); + for(i = 0; i < nfuncs; ++i) + { + bloom->seeds[i] = va_arg(l, uint32); + } + va_end(l); + + bloom->bitmap_size = bitmap_size; + bloom->nfuncs = nfuncs; + + return bloom; +} + +/* + * Destroy a bloom filter + */ +int +BloomDestroy(BLOOM *bloom) +{ + pfree(bloom->bitmap); + pfree(bloom->seeds); + pfree(bloom); + + return 0; +} + +/* + * Reset bloom filter's bitmap + */ +void +BloomReset(BLOOM *bloom) +{ + MemSet(bloom->bitmap, 0, ((bloom->bitmap_size + CHAR_BIT - 1) / CHAR_BIT) * sizeof(char)); +} + +/* + * Add an item into bloom filter + */ +void +BloomAdd(BLOOM *bloom, const char *s, int len) +{ + int i; + for(i = 0; i < bloom->nfuncs; ++i) + { + SETBIT(bloom->bitmap, MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size); + } +} + +/* + * Check if the item exist + */ +bool +BloomCheck(BLOOM *bloom, const char *s, int len) +{ + int i; + + for(i = 0; i < bloom->nfuncs; ++i) + { + if(!(GETBIT(bloom->bitmap, MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size))) + { + return false; + } + } + + return true; +} + +/* + * Check if the item exist, if not exist, add the item into bloom + */ +bool +BloomCheckAndAdd(BLOOM *bloom, const char *s, int len) +{ + int i, j; + uint32 hash; + bool exist = true; + for(i = 0; i < bloom->nfuncs; ++i) + { + hash = MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size; + if(!(GETBIT(bloom->bitmap, hash))) + { + exist = false; + SETBIT(bloom->bitmap, hash); + for (j = i + 1; j < bloom->nfuncs; ++j) + { + hash = MurmurHash2(s, len, bloom->seeds[j]) % bloom->bitmap_size; + SETBIT(bloom->bitmap, hash); + } + break; + } + } + return exist; +} + +/* + * Murmurhash function + */ +uint32_t +MurmurHash2(const void * key, int len, uint32_t seed) +{ + const uint32_t m = 0x5bd1e995; + const int32_t r = 24; + const uint8_t * data = (const uint8_t *)key; + uint32_t h = seed ^ len; + uint8_t align = (uintptr_t)data & 3; + + if(align && (len >= 4)) + { + /* Pre-load the temp registers */ + uint32_t t = 0, d = 0; + int32_t sl; + int32_t sr; + + switch(align) + { + case 1: t |= data[2] << 16; + case 2: t |= data[1] << 8; + case 3: t |= data[0]; + } + + t <<= (8 * align); + + data += 4-align; + len -= 4-align; + + sl = 8 * (4-align); + sr = 8 * align; + + /* Mix */ + + while(len >= 4) + { + uint32_t k; + + d = *(uint32_t *)data; + t = (t >> sr) | (d << sl); + + k = t; + + MIX(h,k,m); + + t = d; + + data += 4; + len -= 4; + } + + /* Handle leftover data in temp registers */ + + d = 0; + + if(len >= align) + { + uint32_t k; + + switch(align) + { + case 3: d |= data[2] << 16; + case 2: d |= data[1] << 8; + case 1: d |= data[0]; + } + + k = (t >> sr) | (d << sl); + MIX(h,k,m); + + data += align; + len -= align; + + /* ---------- + * Handle tail bytes */ + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; h *= m; + }; + } + else + { + switch(len) + { + case 3: d |= data[2] << 16; + case 2: d |= data[1] << 8; + case 1: d |= data[0]; + case 0: h ^= (t >> sr) | (d << sl); h *= m; + } + } + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; + } + else + { + while(len >= 4) + 
{ + uint32_t k = *(uint32_t *)data; + + MIX(h,k,m); + + data += 4; + len -= 4; + } + + /* ---------- + * Handle tail bytes */ + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; + } +} diff --git a/src/gtm/common/datapump.c b/src/gtm/common/datapump.c new file mode 100644 index 00000000..912dc4bd --- /dev/null +++ b/src/gtm/common/datapump.c @@ -0,0 +1,337 @@ +/*------------------------------------------------------------------------- + * + * datapump.c + * + * + * lockless message queue + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/gtm/common/datapump.c + * + *------------------------------------------------------------------------- + */ + +#include "gtm/datapump.h" + + +/* + * The following funciton is used to handle lockless message queue. + */ + +/* + * Get data pointer, use with the following functions. + */ +char * +GetData(DataPumpBuf *buf, uint32 *uiLen) +{ + uint32 border = 0; + uint32 tail = 0; + char *data; + if (buf) + { + if (0 == DataSize(buf)) + { + return NULL; + } + + SpinLockAcquire(&(buf->pointer_lock)); + border = buf->border; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + if (INVALID_BORDER == border) + { + *uiLen = 0; + return NULL; + } + + /* read from tail to border*/ + if (border >= tail) + { + /* Only sender increases tail, no need to lock. */ + *uiLen = border - tail; + data = buf->buf + tail; + } + else + { + /* read from tail to end */ + *uiLen = buf->length - tail; + data = buf->buf + tail; + buf->wrap_around = true; + } + return data; + } + else + { + *uiLen = 0; + return NULL; + } +} + +/* + * Increate data offset, used after finishing read data from queue. + */ +void +IncDataOff(DataPumpBuf *buf, uint32 uiLen) +{ + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + buf->tail = (buf->tail + uiLen) % buf->length; + if (buf->tail == buf->border) + { + buf->border = INVALID_BORDER; + } + SpinLockRelease(&(buf->pointer_lock)); + } +} + +/* + * Return total data size in buffer + */ +uint32 +DataSize(DataPumpBuf *buf) +{ + uint32 border = 0; + uint32 head = 0; + uint32 tail = 0; + uint32 size = 0; + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + border = buf->border; + SpinLockRelease(&(buf->pointer_lock)); + + if (INVALID_BORDER == border) + { + return 0; + } + + if (tail <= head) + { + size = head - tail; + } + else + { + size = buf->length - tail + head; + } + + return size; + } + return 0; +} + +/* + * Get the pointer to write and return the length to write. + */ +char * +GetWriteOff(DataPumpBuf *buf, uint32 *uiLen) +{ + uint32 head = 0; + uint32 tail = 0; + char *ptr = NULL; + if (0 == FreeSpace(buf)) + { + return NULL; + } + + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + + if (head >= tail) + { + /* tail is the beginning of the queue. */ + if (tail != 0) + { + + *uiLen = buf->length - head; + } + else + { + /* Reserved one byte as flag. */ + *uiLen = buf->length - head - 1; + } + } + else + { + /* Reserved one byte as flag. */ + *uiLen = tail - head - 1; + } + ptr = buf->buf + head; + return ptr; + } + else + { + return NULL; + } +} + +/* + * Used to increase the write pointer after write some data. 
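+ *
+ * Together with GetWriteOff() this is the producer side of the ring: the
+ * writer advances head, the reader advances tail through GetData() and
+ * IncDataOff(), and one byte is always left unused so that a full buffer
+ * can be told apart from an empty one.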
+ */ +void +IncWriteOff(DataPumpBuf *buf, uint32 uiLen) +{ + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + buf->head += uiLen; + buf->head = buf->head % buf->length; + SpinLockRelease(&(buf->pointer_lock)); + } +} + +/* + * Reserve space in print buffer + */ +int +ReserveSpace(DataPumpBuf *buf, uint32 len, uint32 *offset) +{ + /* not enough space avaliable, wait */ + if (FreeSpace(buf) < len) + { + return -1; + } + + if (buf) + { + *offset = buf->head; + buf->head = (buf->head + len) % buf->length; + } + return 0; +} + +uint32 +BufferOffsetAdd(DataPumpBuf *buf, uint32 pointer, uint32 offset) +{ + + if (buf) + { + return (pointer + offset) % buf->length; + } + return 0; +} + +/* + * No need to lock, reader never read the data before we set border. + */ +int +ReturnSpace(DataPumpBuf *buf, uint32 offset) +{ + if (buf) + { + buf->head = offset; + } + return 0; +} + +/* + * Fill data into reserved by ReserveSpace + */ +void +FillReserveSpace(DataPumpBuf *buf, uint32 offset, char *p, uint32 len) +{ + uint32 bytes2end = 0; + uint32 bytesfrombegin = 0; + + if (buf) + { + bytes2end = buf->length - offset; + if (len <= bytes2end) + { + memcpy(buf->buf + offset, p, len); + } + else + { + bytesfrombegin = len - bytes2end; + memcpy(buf->buf + offset, p, bytes2end); + memcpy(buf->buf, (char*)p + bytes2end, bytesfrombegin); + } + } +} + +/* + * Return free space of the buffer. + */ +uint32 +FreeSpace(DataPumpBuf *buf) +{ + uint32 head = 0; + uint32 tail = 0; + uint32 len = 0; + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + + if (tail <= head) + { + len = tail + buf->length - head - 1; + } + else + { + len = tail - head - 1; + } + return len; + } + else + { + return 0; + } +} + +/* + * Set tuple end border of the buffer. 
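+ *
+ * Readers never see past the border: GetData() stops at it and DataSize()
+ * reports 0 while the border is INVALID_BORDER, so data written with
+ * PutData() only becomes readable once SetBorder() has been called.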
+ */ +void +SetBorder(DataPumpBuf *buf) +{ + SpinLockAcquire(&(buf->pointer_lock)); + buf->border = buf->head; + SpinLockRelease(&(buf->pointer_lock)); +} + +/* + * Send data into buffer + */ +void +PutData(DataPumpBuf *buf, char *data, uint32 len) +{ + char *ptr; + uint32 bufferLen; + uint32 needLen; + uint32 offset = 0; + needLen = len; + while (1) + { + ptr = GetWriteOff(buf, &bufferLen); + if (ptr) + { + if (bufferLen >= needLen) + { + memcpy(ptr, data + offset, needLen); + IncWriteOff(buf, needLen); + return; + } + else + { + memcpy(ptr, data + offset, bufferLen); + IncWriteOff(buf, bufferLen); + needLen -= bufferLen; + offset += bufferLen; + } + } + } +} + + diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 1d30011b..833b9e25 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -52,6 +52,7 @@ int Log_destination = LOG_DESTINATION_STDERR; } \ } while (0) +errlog_collection_hook_type errlog_collection_func = NULL; static void send_message_to_server_log(ErrorData *edata); static void send_message_to_frontend(Port *myport, ErrorData *edata); @@ -61,8 +62,8 @@ static const char *error_severity(int elevel); static void append_with_tabs(StringInfo buf, const char *str); static bool is_log_level_output(int elevel, int log_min_level); -int log_min_messages = WARNING; -char *Log_line_prefix = "%l:%p:%m -"; /* format for extra log line info */ +int log_min_messages = WARNING; +char *Log_line_prefix = "%p:%m -"; /* format for extra log line info */ #define FORMATTED_TS_LEN 128 static char formatted_start_time[FORMATTED_TS_LEN]; @@ -797,70 +798,73 @@ DebugFileOpen(void) */ static void send_message_to_server_log(ErrorData *edata) -{// #lizard forgives - StringInfoData buf; - - initStringInfo(&buf); - - formatted_log_time[0] = '\0'; - - log_line_prefix(&buf); - appendStringInfo(&buf, "%s: ", error_severity(edata->elevel)); - - if (edata->message) - append_with_tabs(&buf, edata->message); - else - append_with_tabs(&buf, _("missing error text")); - - appendStringInfoChar(&buf, '\n'); - - if (edata->detail_log) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("DETAIL: ")); - append_with_tabs(&buf, edata->detail_log); - appendStringInfoChar(&buf, '\n'); - } - else if (edata->detail) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("DETAIL: ")); - append_with_tabs(&buf, edata->detail); - appendStringInfoChar(&buf, '\n'); - } - if (edata->hint) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("HINT: ")); - append_with_tabs(&buf, edata->hint); - appendStringInfoChar(&buf, '\n'); - } - if (edata->context) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("CONTEXT: ")); - append_with_tabs(&buf, edata->context); - appendStringInfoChar(&buf, '\n'); - } - - /* assume no newlines in funcname or filename... 
*/ - if (edata->funcname && edata->filename) - { - appendStringInfo(&buf, _("LOCATION: %s, %s:%d\n"), - edata->funcname, edata->filename, - edata->lineno); - } - else if (edata->filename) - { - appendStringInfo(&buf, _("LOCATION: %s:%d\n"), - edata->filename, edata->lineno); - } - - /* Write to stderr, if enabled */ - if (Log_destination & LOG_DESTINATION_STDERR) - write(fileno(stderr), buf.data, buf.len); - - pfree(buf.data); +{ + StringInfoData buf; + + initStringInfo(&buf); + + formatted_log_time[0] = '\0'; + + log_line_prefix(&buf); + appendStringInfo(&buf, "%s: ", error_severity(edata->elevel)); + + if (edata->message) + append_with_tabs(&buf, edata->message); + else + append_with_tabs(&buf, _("missing error text")); + + appendStringInfoChar(&buf, '\n'); + + if (edata->detail_log) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail_log); + appendStringInfoChar(&buf, '\n'); + } + else if (edata->detail) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail); + appendStringInfoChar(&buf, '\n'); + } + if (edata->hint) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("HINT: ")); + append_with_tabs(&buf, edata->hint); + appendStringInfoChar(&buf, '\n'); + } + if (edata->context) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("CONTEXT: ")); + append_with_tabs(&buf, edata->context); + appendStringInfoChar(&buf, '\n'); + } + + /* assume no newlines in funcname or filename... */ + if (edata->funcname && edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s, %s:%d\n"), + edata->funcname, edata->filename, + edata->lineno); + } + else if (edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s:%d\n"), + edata->filename, edata->lineno); + } + + /* Write to stderr, if enabled */ + if (Log_destination & LOG_DESTINATION_STDERR) + write(fileno(stderr), buf.data, buf.len); + + if (errlog_collection_func && (buf.len > 0) && ('\0' != buf.data[0])) + (*errlog_collection_func) (edata, &buf); + + pfree(buf.data); } /* diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 59406ab2..3d1cd2f4 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -19,6 +19,9 @@ #include #include #include +#include +#include "gtm/gtm_stat.h" +#include "gtm/gtm_stat_error.h" #ifdef HAVE_SYS_RESOURCE_H #include @@ -32,6 +35,7 @@ /* PID can be negative for standalone backend */ typedef long pgpid_t; + typedef enum { SMART_MODE, @@ -49,10 +53,13 @@ typedef enum RESTART_COMMAND, STATUS_COMMAND, RECONNECT_COMMAND, - RELOAD_COMMAND + RELOAD_COMMAND, + STAT_COMMAND, + ERRLOG_COMMAND } CtlCommand; -#define DEFAULT_WAIT 60 +#define DEFAULT_WAIT 60 +#define DEFAULT_FLAG 0 static bool do_wait = false; static bool wait_set = false; @@ -78,6 +85,7 @@ GTM_ThreadID TopMostThreadID; int tcp_keepalives_idle = 0; int tcp_keepalives_interval = 0; int tcp_keepalives_count = 0; +static int clear_flag = DEFAULT_FLAG; #endif static void write_stderr(const char *fmt,...) 
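The elog.c hunk above turns server-log output into a hook point: once send_message_to_server_log() has formatted the line, it hands the ErrorData and the formatted buffer to errlog_collection_func if one is registered (the GTM main loop later installs GTM_ErrorLogCollector there). Below is a minimal sketch of such a collector, assuming only the gtm/elog.h declarations added in this series; the my_collector and register_my_collector names are illustrative and not part of the patch:

    #include "gtm/elog.h"

    static long error_count = 0;   /* hypothetical counter */

    /* Runs inside send_message_to_server_log(), so it must not ereport()
     * itself; it only inspects the already-formatted line. */
    static void
    my_collector(ErrorData *edata, StringInfo buff)
    {
        if (edata->elevel >= ERROR && buff->len > 0)
            error_count++;
    }

    void
    register_my_collector(void)
    {
        errlog_collection_func = my_collector;
    }

The gtm_ctl.c hunk below adds the client side: a "stat" command that prints the request statistics collected per worker thread and an "errlog" command that drains the deduplicated error-log buffer; a hypothetical invocation would look like gtm_ctl stat -Z gtm -H <host> -P <port> -c 0.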
@@ -1099,6 +1107,206 @@ do_status(void) } +static void +do_stat(void) +{ + int ret = 0; + int i = 0; + char gtm_connect_str[MAXPGPATH]; + GTM_Conn *gtm_conn = NULL; + GTM_StatisticsResult* result = NULL; + struct tm timeinfo; + char time_buff[128]; + int interval_time = 0; + float interval_minute = 0.0; + uint32 calcu_result[3]; + static const float EPSINON = 0.00001; + static char* statistics_name_tab[CMD_STATISTICS_TYPE_COUNT] = { + "GET_GTS", + "SEQUENCE_GET_NEXT", + "TXN_START_PREPARED" + }; + + /* Connect gtm and get the lates timestamp. */ + if (gtm_port == NULL || gtm_host == NULL) + { + return; + } + + snprintf(gtm_connect_str, MAXPGPATH, "host=%s port=%s node_name=gtm_ctl remote_type=%d postmaster=0 connect_timeout=%d", + gtm_host, gtm_port, GTM_NODE_GTM_CTL,wait_seconds); + gtm_conn = connect_gtm(gtm_connect_str); + if (gtm_conn == NULL) { + return; + } + + ret = get_gtm_statistics(gtm_conn, clear_flag, wait_seconds, &result); + if (!ret) + { + printf(_("GTM statistics:\n")); + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&result->start_time, &timeinfo)); + printf(_("statistics start time: %s\n"), time_buff); + + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&result->end_time, &timeinfo)); + printf(_("statistics end time: %s\n"), time_buff); + + printf(_("sequences remained: %d\n"), result->sequences_remained); + printf(_("txn remained: %d\n"), result->txn_remained); + + interval_time = result->end_time - result->start_time; + calcu_result[0] = (interval_time == 0) ? 0 : + result->stat_info[0].total_request_times / interval_time; + + interval_minute = (float)interval_time / (float)60.0; + if ((interval_minute >= - EPSINON) && (interval_minute <= EPSINON)) // 0 + { + calcu_result[1] = 0; + calcu_result[2] = 0; + } + else + { + calcu_result[1] = (int)((float)result->stat_info[1].total_request_times / interval_minute); + calcu_result[2] = (int)((float)result->stat_info[2].total_request_times / interval_minute); + } + + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + printf(_("%s info:\n"), statistics_name_tab[i]); + printf(_("total request times: %u\n"), result->stat_info[i].total_request_times); + printf(_("avg costtime: %u(ms)\n"), result->stat_info[i].avg_costtime); + printf(_("max costtime: %u(ms)\n"), result->stat_info[i].max_costtime); + printf(_("min costtime: %u(ms)\n"), result->stat_info[i].min_costtime); + if (i == 0) + { + printf(_("requests per second: %u\n"), calcu_result[i]); + } + else + { + printf(_("requests per minute: %u\n"), calcu_result[i]); + } + } + } + else + { + printf(_("%s: Can not get statistics, please check gtm status!\n"), + progname); + } + + disconnect_gtm(gtm_conn); + return; +} + +/* +* error_severity --- get localized string representing elevel +*/ +static const char * +error_severity(int elevel) +{ + const char *prefix; + + switch (elevel) + { + case 10: + case 11: + case 12: + case 13: + case 14: + prefix = _("DEBUG"); + break; + case 15: + case 16: + prefix = _("LOG"); + break; + case 17: + prefix = _("INFO"); + break; + case 18: + prefix = _("NOTICE"); + break; + case 19: + prefix = _("WARNING"); + break; + case 20: + prefix = _("ERROR"); + break; + case 22: + prefix = _("FATAL"); + break; + case 23: + prefix = _("PANIC"); + break; + default: + prefix = "???"; + break; + } + + return prefix; +} + +static void +do_errlog(void) +{ + int ret = 0; + char gtm_connect_str[MAXPGPATH]; + GTM_Conn *gtm_conn = NULL; + char *errlog = NULL; + int len = 0; + GTM_ErrLog* err_info = NULL; + 
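+    /*
+     * The reply decoded below is a byte stream of records: a GTM_ErrLog header
+     * in network byte order followed by errmsg_len bytes of message text; the
+     * loop advances by sizeof(GTM_ErrLog) + errmsg_len for each record.
+     */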
struct tm timeinfo; + char time_buff[128]; + + /* Connect gtm and get the lates timestamp. */ + if (gtm_port == NULL || gtm_host == NULL) + { + return; + } + + snprintf(gtm_connect_str, MAXPGPATH, "host=%s port=%s node_name=gtm_ctl remote_type=%d postmaster=0 connect_timeout=%d", + gtm_host, gtm_port, GTM_NODE_GTM_CTL,wait_seconds); + gtm_conn = connect_gtm(gtm_connect_str); + if (gtm_conn == NULL) { + return; + } + + ret = get_gtm_errlog(gtm_conn, wait_seconds, &errlog, &len); + if (!ret) + { + printf(_("%s: errlog len: %d \n"), progname, len); + while (len) + { + err_info = (GTM_ErrLog*)errlog; + err_info->proc_id = ntohl(err_info->proc_id); + err_info->error_no = ntohl(err_info->error_no); + err_info->log_time = be64toh(err_info->log_time); + err_info->err_level = ntohl(err_info->err_level); + err_info->errmsg_len = ntohl(err_info->errmsg_len); + + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&err_info->log_time, &timeinfo)); + + printf(_("%d|%d|%s|%s|%d|%s\n"), err_info->proc_id, + err_info->error_no, time_buff, error_severity(err_info->err_level), err_info->errmsg_len, + err_info->errmsg); + + errlog += (sizeof(GTM_ErrLog) + err_info->errmsg_len); + len -= (sizeof(GTM_ErrLog) + err_info->errmsg_len); + } + } + else + { + printf(_("%s: Can not get errlog, please check gtm status!\n"), + progname); + } + + disconnect_gtm(gtm_conn); + return; +} + /* * utility routines */ @@ -1259,84 +1467,93 @@ main(int argc, char **argv) */ optind = 1; - /* process command-line options */ - while (optind < argc) - { - while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:H:P:g:")) != -1) - { - switch (c) - { - case 'D': - { - char *gtmdata_D; - char *env_var = pg_malloc(strlen(optarg) + 9); - - gtmdata_D = xstrdup(optarg); - canonicalize_path(gtmdata_D); - snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s", - gtmdata_D); - putenv(env_var); - - /* - * We could pass GTMDATA just in an environment - * variable but we do -D too for clearer gtm - * 'ps' display - */ - gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8); - snprintf(gtmdata_opt, strlen(gtmdata_D) + 8, - "-D \"%s\" ", - gtmdata_D); - break; - } - case 'i': - nodename = strdup(optarg); - break; - case 'l': - log_file = xstrdup(optarg); - break; - case 'm': - set_mode(optarg); - break; - case 'o': - gtm_opts = xstrdup(optarg); - break; - case 'p': - gtm_path = xstrdup(optarg); - canonicalize_path(gtm_path); - break; - case 't': - wait_seconds = atoi(optarg); - break; - case 'w': - do_wait = true; - wait_set = true; - break; - case 'W': - do_wait = false; - wait_set = true; - break; - case 'Z': - gtm_app = xstrdup(optarg); - if (strcmp(gtm_app,"gtm_proxy") != 0 - && strcmp(gtm_app,"gtm_standby") != 0 - && strcmp(gtm_app,"gtm") != 0) - { - write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app); - do_advice(); - exit(1); - } - break; + /* process command-line options */ + while (optind < argc) + { + while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:H:P:g:c:")) != -1) + { + switch (c) + { + case 'D': + { + char *gtmdata_D; + char *env_var = pg_malloc(strlen(optarg) + 9); + + gtmdata_D = xstrdup(optarg); + canonicalize_path(gtmdata_D); + snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s", + gtmdata_D); + putenv(env_var); + + /* + * We could pass GTMDATA just in an environment + * variable but we do -D too for clearer gtm + * 'ps' display + */ + gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8); + snprintf(gtmdata_opt, strlen(gtmdata_D) + 8, + "-D \"%s\" ", + gtmdata_D); + break; + } + case 'i': 
+ nodename = strdup(optarg); + break; + case 'l': + log_file = xstrdup(optarg); + break; + case 'm': + set_mode(optarg); + break; + case 'o': + gtm_opts = xstrdup(optarg); + break; + case 'p': + gtm_path = xstrdup(optarg); + canonicalize_path(gtm_path); + break; + case 't': + wait_seconds = atoi(optarg); + break; + case 'w': + do_wait = true; + wait_set = true; + break; + case 'W': + do_wait = false; + wait_set = true; + break; + case 'Z': + gtm_app = xstrdup(optarg); + if (strcmp(gtm_app,"gtm_proxy") != 0 + && strcmp(gtm_app,"gtm_standby") != 0 + && strcmp(gtm_app,"gtm") != 0) + { + write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app); + do_advice(); + exit(1); + } + break; #ifdef __TBASE__ - case 'H': - gtm_host = xstrdup(optarg); - break; - - case 'P': - gtm_port = xstrdup(optarg); + case 'H': + gtm_host = xstrdup(optarg); + break; + + case 'P': + gtm_port = xstrdup(optarg); break; case 'g': startup_gts = xstrdup(optarg); break; + case 'c': + clear_flag = atoi(optarg); + if (clear_flag != 0 && clear_flag != 1) + { + write_stderr(_("%s: %d clear_flag set not correct\n"), progname, clear_flag); + do_advice(); + exit(1); + } + break; #endif default: /* getopt_long already issued a suitable error message */ @@ -1369,16 +1586,20 @@ main(int argc, char **argv) ctl_command = RECONNECT_COMMAND; else if (strcmp(argv[optind], "reload") == 0) ctl_command = RELOAD_COMMAND; - else - { - write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), - progname, argv[optind]); - do_advice(); - exit(1); - } - optind++; - } - } + else if (strcmp(argv[optind], "stat") == 0) + ctl_command = STAT_COMMAND; + else if (strcmp(argv[optind], "errlog") == 0) + ctl_command = ERRLOG_COMMAND; + else + { + write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), + progname, argv[optind]); + do_advice(); + exit(1); + } + optind++; + } + } if (ctl_command == NO_COMMAND) { @@ -1395,13 +1616,14 @@ main(int argc, char **argv) canonicalize_path(gtm_data); } - if (!gtm_data && ctl_command != STATUS_COMMAND) - { - write_stderr("%s: no GTM/GTM Proxy directory specified \n", - progname); - do_advice(); - exit(1); - } + if (!gtm_data && ctl_command != STATUS_COMMAND && + ctl_command != STAT_COMMAND && ctl_command != ERRLOG_COMMAND) + { + write_stderr("%s: no GTM/GTM Proxy directory specified \n", + progname); + do_advice(); + exit(1); + } /* * pid files of gtm and gtm proxy are named differently @@ -1442,12 +1664,13 @@ main(int argc, char **argv) } #ifdef __TBASE__ - if(ctl_command == STATUS_COMMAND) - { - if(gtm_port == NULL) - { - write_stderr(_("%s: option -P GTM_port is not specified\n"), - progname); + if(ctl_command == STATUS_COMMAND || ctl_command == STAT_COMMAND + || ctl_command == ERRLOG_COMMAND) + { + if(gtm_port == NULL) + { + write_stderr(_("%s: option -P GTM_port is not specified\n"), + progname); do_advice(); exit(1); } @@ -1463,15 +1686,17 @@ main(int argc, char **argv) case PROMOTE_COMMAND: case STATUS_COMMAND: case RELOAD_COMMAND: - do_wait = false; - break; - case STOP_COMMAND: - do_wait = true; - break; - default: - break; - } - } + case STAT_COMMAND: + case ERRLOG_COMMAND: + do_wait = false; + break; + case STOP_COMMAND: + do_wait = true; + break; + default: + break; + } + } /* Build strings for pid file and option file */ if(gtm_data) @@ -1523,9 +1748,15 @@ main(int argc, char **argv) case RELOAD_COMMAND: do_reload(); break; - default: + case STAT_COMMAND: + do_stat(); break; - } + case ERRLOG_COMMAND: + do_errlog(); + break; + default: + break; + } exit(0); } diff --git 
a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c index 31d11d45..0a37c574 100644 --- a/src/gtm/libpq/pqformat.c +++ b/src/gtm/libpq/pqformat.c @@ -634,5 +634,5 @@ pq_getmsgend(StringInfo msg) int pq_getmsgunreadlen(StringInfo msg) { - return msg->len - msg->cursor; + return msg->len - msg->cursor; } diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile index 7351019f..ca5910a6 100644 --- a/src/gtm/main/Makefile +++ b/src/gtm/main/Makefile @@ -15,7 +15,7 @@ ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif -OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_standby.o gtm_opt.o gtm_backup.o gtm_store.o gtm_xlog.o +OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_standby.o gtm_opt.o gtm_backup.o gtm_store.o gtm_xlog.o gtm_stat.o gtm_stat_error.o OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a ../../port/libpgport.a diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index abff352a..9941cb30 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -2115,48 +2115,47 @@ ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) */ void ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) -{// #lizard forgives - GTM_SequenceKeyData seqkey; - StringInfoData buf; - GTM_Sequence seqval; - GTM_Sequence range; - GTM_Sequence rangemax; - uint32 coord_namelen; - char *coord_name; - uint32 coord_procid; - - if (Recovery_IsStandby()) - { - if (myport->remote_type != GTM_NODE_GTM) - { - elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); - } - } - - - seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); - seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); - - coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); - if (coord_namelen > 0) - coord_name = (char *)pq_getmsgbytes(message, coord_namelen); - else - coord_name = NULL; - coord_procid = pq_getmsgint(message, sizeof(coord_procid)); - memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)), - sizeof (GTM_Sequence)); - - if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range, - &seqval, &rangemax)) - ereport(ERROR, - (ERANGE, - errmsg("Can not get current value of the sequence"))); - - - elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); - - if (!is_backup) - { +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + GTM_Sequence seqval; + GTM_Sequence range; + GTM_Sequence rangemax; + uint32 coord_namelen; + char *coord_name; + uint32 coord_procid; + + if (Recovery_IsStandby()) + { + if (myport->remote_type != GTM_NODE_GTM) + { + elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); + } + } + + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); + if (coord_namelen > 0) + coord_name = (char *)pq_getmsgbytes(message, coord_namelen); + else + coord_name = NULL; + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); + memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + + if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range, + &seqval, &rangemax)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); + + elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); + + if (!is_backup) + 
{ #ifndef __XLOG__ /* Backup first */ if (GetMyConnection(myport)->standby) diff --git a/src/gtm/main/gtm_stat.c b/src/gtm/main/gtm_stat.c index 7b8d7f1d..89a51d5f 100644 --- a/src/gtm/main/gtm_stat.c +++ b/src/gtm/main/gtm_stat.c @@ -14,24 +14,263 @@ */ #include "gtm/gtm_c.h" #include "gtm/gtm.h" +#include "gtm/gtm_stat.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include -uint32 GTM_Message_Stats[MSG_MAX_MESSAGE_TYPE]; -uint32 GTM_Result_Stats[GTM_MAX_RESULT_TYPE]; +extern int32 GTM_StoreGetUsedSeq(void); +extern int32 GTM_StoreGetUsedTxn(void); + +GTM_Statistics GTMStatistics; + +/* + * Init global gtm statistic handle + */ void -gtm_msgstat_increment(int type) +GTM_InitGtmStatistics(void) +{ + GTMStatistics.stat_start_time = time(NULL);; + SpinLockInit(>MStatistics.lock); +} + +/* + * Init the worker statistics's handle + */ +static void +GTM_InitStatisticsInfo(GTM_WorkerStatistics *stat_handle) +{ + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].total_request_times, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].total_costtime, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].max_costtime, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].min_costtime, PG_UINT32_MAX); + } +} + +/* + * Reset the worker statistics's handle + */ +static void +GTM_ResetStatisticsInfo(GTM_WorkerStatistics *stat_handle) +{ + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].total_request_times, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].total_costtime, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].max_costtime, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].min_costtime, PG_UINT32_MAX); + } +} + +/* + * Init the statistics item + */ +static void +GTM_InitStatisticsItemArray(GTM_StatisticsItem *cmd_item) { - GTM_Message_Stats[type]++; + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + cmd_item[i].total_request_times = 0; + cmd_item[i].total_costtime = 0; + cmd_item[i].max_costtime = 0; + cmd_item[i].min_costtime = PG_UINT32_MAX; + } } +/* + * Init worker thread's statistics handle + * only worker thread need to call + */ void -gtm_resultstat_increment(int type) +GTM_InitStatisticsHandle(void) { - GTM_Result_Stats[type]++; + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; + MemoryContext oldContext; + + AssertState(thrinfo->stat_handle == NULL); + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + thrinfo->stat_handle = palloc(sizeof(GTM_WorkerStatistics)); + if (thrinfo->stat_handle == NULL) + ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); + + GTM_InitStatisticsInfo(thrinfo->stat_handle); + + MemoryContextSwitchTo(oldContext); } +/* + * Update statistics, when completing a command + */ void -gtm_print_stats(void) +GTM_UpdateStatistics(GTM_WorkerStatistics* stat_handle, GTM_MessageType mtype, uint32 costtime) { + GTM_StatisticsCmd mCmd; + GTM_StatisticsInfo* stat_info = NULL; + + if (mtype == MSG_GETGTS) + { + mCmd = CMD_GETGTS; + } + else if (mtype == MSG_SEQUENCE_GET_NEXT) + { + mCmd = CMD_SEQUENCE_GET_NEXT; + } + else if (mtype == MSG_TXN_START_PREPARED) + { + mCmd = CMD_TXN_START_PREPARED; + } + else + { + return; + } + + stat_info = &stat_handle->cmd_statistics[mCmd]; + pg_atomic_fetch_add_u32(&stat_info->total_request_times, 1); + pg_atomic_fetch_add_u32(&stat_info->total_costtime, costtime); + + if (costtime > 
pg_atomic_read_u32(&stat_info->max_costtime)) + { + pg_atomic_write_u32(&stat_info->max_costtime, costtime); + } + + if (costtime < pg_atomic_read_u32(&stat_info->min_costtime)) + { + pg_atomic_write_u32(&stat_info->min_costtime, costtime); + } +} + +/* + * Combine the statistics of each thread and calculate the result + */ +static void +GTM_GetMergeResult(int clear_flag, pg_time_t *stat_start_time, pg_time_t *stat_end_time, GTM_StatisticsItem *result) +{ + GTM_ThreadInfo *thrinfo = NULL; + GTM_WorkerStatistics *stat_handle = NULL; + uint32 max_costtime = 0; + uint32 min_costtime = 0; + uint32 i = 0; + uint32 j = 0; + + GTM_InitStatisticsItemArray(result); + + SpinLockAcquire(>MStatistics.lock); + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); + + /* Combine data from each thread */ + for (i = 0; i < GTMThreads->gt_array_size; i++) + { + thrinfo = GTMThreads->gt_threads[i]; + if(NULL == thrinfo) + { + elog(DEBUG1, "thread %d exits.", i); + continue; + } + + if(false == thrinfo->thr_epoll_ok || NULL == thrinfo->stat_handle) + { + continue; + } + + stat_handle = thrinfo->stat_handle; + for (j = 0; j < CMD_STATISTICS_TYPE_COUNT; j++) + { + result[j].total_request_times += pg_atomic_read_u32(&stat_handle->cmd_statistics[j].total_request_times); + result[j].total_costtime += pg_atomic_read_u32(&stat_handle->cmd_statistics[j].total_costtime); + max_costtime = pg_atomic_read_u32(&stat_handle->cmd_statistics[j].max_costtime); + min_costtime = pg_atomic_read_u32(&stat_handle->cmd_statistics[j].min_costtime); + if (result[j].max_costtime < max_costtime) + { + result[j].max_costtime = max_costtime; + } + + if (result[j].min_costtime > min_costtime) + { + result[j].min_costtime = min_costtime; + } + } + + if (clear_flag) + { + GTM_ResetStatisticsInfo(stat_handle); + } + } + + *stat_start_time = GTMStatistics.stat_start_time; + *stat_end_time = time(NULL); + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + result[i].avg_costtime = (result[i].total_request_times == 0) ? 
0 : + result[i].total_costtime / result[i].total_request_times; + } + + if (clear_flag) + { + GTMStatistics.stat_start_time = *stat_end_time; + } + + GTM_RWLockRelease(>MThreads->gt_lock); + SpinLockRelease(>MStatistics.lock); +} + +/* + * Process MSG_GET_STATISTICS message + */ +void +ProcessGetStatisticsCommand(Port *myport, StringInfo message) +{ + int32 used_seq = 0; + int32 used_txn = 0; + int clear_flag = 0; + int i = 0; + StringInfoData buf; + pg_time_t stat_start_time = 0; + pg_time_t stat_end_time = 0; + GTM_StatisticsItem result_info[CMD_STATISTICS_TYPE_COUNT]; + + clear_flag = pq_getmsgint(message, sizeof (int)); + pq_getmsgend(message); + + GTM_GetMergeResult(clear_flag, &stat_start_time, &stat_end_time, result_info); + used_seq = GTM_StoreGetUsedSeq(); + used_txn = GTM_StoreGetUsedTxn(); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, MSG_GET_GTM_STATISTICS_RESULT, 4); + + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + + pq_sendint64(&buf, stat_start_time); + pq_sendint64(&buf, stat_end_time); + pq_sendint(&buf, GTM_MAX_SEQ_NUMBER - used_seq, sizeof(int32)); + pq_sendint(&buf, MAX_PREPARED_TXN - used_txn, sizeof(int32)); + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pq_sendint(&buf, result_info[i].total_request_times, sizeof(int32)); + pq_sendint(&buf, result_info[i].avg_costtime, sizeof(int32)); + pq_sendint(&buf, result_info[i].max_costtime, sizeof(int32)); + pq_sendint(&buf, result_info[i].min_costtime, sizeof(int32)); + } + + pq_endmessage(myport, &buf); + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + /* Don't flush to the backup because this does not change the internal status */ + pq_flush(myport); + } } diff --git a/src/gtm/main/gtm_stat_error.c b/src/gtm/main/gtm_stat_error.c new file mode 100644 index 00000000..3e5cdd46 --- /dev/null +++ b/src/gtm/main/gtm_stat_error.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat_error.c + + * collect error logs of gtm + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/main/gtm_stat_error.c + * + *------------------------------------------------------------------------- + */ + +#include +#include "gtm/gtm.h" + +#include "gtm/elog.h" +#include "gtm/palloc.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_stat_error.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" + +static int gtm_err_log_min = ERROR; +static int gtm_errmsg_size = GTM_MAX_ERRMSG_SIZE; +static int gtm_max_errlog_tuple_len = sizeof(GTM_ErrLog) + GTM_MAX_ERRMSG_SIZE; + +GTM_LogCollector GlobalLogCollector; +void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); + +/* + * Build data pump buffer. + */ +DataPumpBuf* +GTM_BuildDataPumpBuf(uint32 size) +{ + DataPumpBuf *buff = NULL; + buff = (DataPumpBuf*)palloc0(sizeof(DataPumpBuf)); + if (NULL == buff) + { + return NULL; + } + + buff->length = size * 1024; + buff->buf = (char*)palloc0(buff->length); + if (NULL == buff->buf) + { + pfree(buff); + return NULL; + } + + SpinLockInit(&(buff->pointer_lock)); + + buff->head = 0; + buff->tail = 0; + buff->wrap_around = 0; + buff->border = INVALID_BORDER; + + return buff; +} + +/* + * Destroy data pump buffer. 
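+ * Releases both the data area and the DataPumpBuf control structure
+ * allocated by GTM_BuildDataPumpBuf().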
+ */ +void +GTM_DestroyDataPumpBuf(DataPumpBuf *buff) +{ + pfree(buff->buf); + pfree(buff); + return; +} + +/* + * Thread-level log collector + * call by each thread's send_message_to_server_log, can't log any error log + */ +void +GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff) +{ + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; + uint32 errmsg_len = 0; + uint32 free_space = 0; + GTM_ErrLog err_info; + DataPumpBuf* datapump_buff = thrinfo->datapump_buff; + + if (edata->elevel < gtm_err_log_min || 0 == buff->len) + { + return; + } + + errmsg_len = Min(buff->len, gtm_errmsg_size - 1); + + err_info.proc_id = getpid(); + err_info.error_no = edata->saved_errno; + err_info.log_time = time(NULL); + err_info.err_level = edata->elevel; + err_info.errmsg_len = errmsg_len; + + free_space = FreeSpace(datapump_buff); + if (free_space < sizeof(GTM_ErrLog) + errmsg_len) + { + return; + } + + PutData(datapump_buff, (char*) &err_info, sizeof(GTM_ErrLog)); + PutData(datapump_buff, buff->data, errmsg_len); + SetBorder(datapump_buff); +} + +/* + * Init the global log collector + */ +int +GTM_InitLogCollector(void) +{ + MemoryContext oldContext; + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + GlobalLogCollector.tmp_buff = palloc(gtm_max_errlog_tuple_len); + if (NULL == GlobalLogCollector.tmp_buff) + { + elog(ERROR, "Failed to create tmpBuf, out of memory."); + MemoryContextSwitchTo(oldContext); + return -1; + } + + GlobalLogCollector.bloom_filter = BloomCreate(GTM_BLOOM_FILTER_SIZE, 2, 0, 97); + if (NULL == GlobalLogCollector.bloom_filter) + { + elog(ERROR, "Failed to create bloom filter, out of memory."); + pfree(GlobalLogCollector.tmp_buff); + MemoryContextSwitchTo(oldContext); + return -1; + } + + GlobalLogCollector.datapump_buff = GTM_BuildDataPumpBuf(GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE); + if (NULL == GlobalLogCollector.datapump_buff) + { + elog(ERROR, "Failed to datapump buf, out of memory."); + BloomDestroy(GlobalLogCollector.bloom_filter); + pfree(GlobalLogCollector.tmp_buff); + MemoryContextSwitchTo(oldContext); + return -1; + } + + SpinLockInit(&GlobalLogCollector.lock); + pg_atomic_init_u32(&GlobalLogCollector.full, 0); + + MemoryContextSwitchTo(oldContext); + return 0; +} + +/* + * Deinit the global log collector + */ +void +GTM_DeInitLogCollector(void) +{ + if (GlobalLogCollector.tmp_buff != NULL) + { + pfree(GlobalLogCollector.tmp_buff); + GlobalLogCollector.tmp_buff = NULL; + } + + if (GlobalLogCollector.bloom_filter != NULL) + { + BloomDestroy(GlobalLogCollector.bloom_filter); + GlobalLogCollector.bloom_filter = NULL; + } + + if (GlobalLogCollector.datapump_buff != NULL) + { + GTM_DestroyDataPumpBuf(GlobalLogCollector.datapump_buff); + GlobalLogCollector.datapump_buff = NULL; + } +} + +/* + * Get a log tuple from datapump buff + */ +static int +GTM_GetLogTupleFromDataPump(DataPumpBuf* dataPumpBuf, char* buf) +{ + char* data = NULL; + uint32 data_len = 0; + uint32 offset = 0; + GTM_ErrLog* err_info = NULL; + uint32 tuple_len = 0; + + data = GetData(dataPumpBuf, &data_len); + if (NULL == data) + { + /* no data */ + return -1; + } + + if (data_len < sizeof(GTM_ErrLog)) + { + /* copy the last part of datapumpbuff to temp buff */ + memcpy(buf, data, data_len); + offset = data_len; + + IncDataOff(dataPumpBuf, data_len); + data = GetData(dataPumpBuf, &data_len); + AssertState(data != NULL); + /* copy the rest */ + memcpy((char*)buf + offset, data, sizeof(GTM_ErrLog) - offset); + data += (sizeof(GTM_ErrLog) - offset); + + err_info = (GTM_ErrLog*)buf; + tuple_len = sizeof(GTM_ErrLog) + 
err_info->errmsg_len; + + memcpy((char*)buf + sizeof(GTM_ErrLog), data, err_info->errmsg_len); + IncDataOff(dataPumpBuf, tuple_len - offset); + } + else + { + err_info = (GTM_ErrLog*)data; + tuple_len = sizeof(GTM_ErrLog) + err_info->errmsg_len; + if (data_len < tuple_len) + { + memcpy(buf, data, data_len); + offset = data_len; + + IncDataOff(dataPumpBuf, data_len); + data = GetData(dataPumpBuf, &data_len); + AssertState(data != NULL); + + memcpy((char*)buf + offset, data, tuple_len - offset); + IncDataOff(dataPumpBuf, tuple_len - offset); + } + else + { + memcpy((char*)buf, data, tuple_len); + IncDataOff(dataPumpBuf, tuple_len); + } + } + + return 0; +} + +/* + * Collect errlog data from various threads and eliminate duplication + */ +void +GTM_ProcessLogCollection(void) +{ + GTM_ThreadInfo *thrinfo = NULL; + DataPumpBuf* datapump_buff = NULL; + DataPumpBuf* global_datapump_buff = GlobalLogCollector.datapump_buff; + BLOOM *bloom_filter = GlobalLogCollector.bloom_filter; + char *tmp_buff = GlobalLogCollector.tmp_buff; + GTM_ErrLog* err_info = NULL; + int errmsg_len = 0; + uint32 i = 0; + char *msg = NULL; + + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); + + for (i = 0; i < GTMThreads->gt_array_size; i++) + { + thrinfo = GTMThreads->gt_threads[i]; + if(NULL == thrinfo) + { + elog(DEBUG1, "thread %d exits.", i); + continue; + } + + datapump_buff = thrinfo->datapump_buff; + if (NULL == datapump_buff) + { + continue; + } + + if (pg_atomic_read_u32(&GlobalLogCollector.full)) + { + break; + } + + while (FreeSpace(global_datapump_buff) >= gtm_max_errlog_tuple_len) + { + if (GTM_GetLogTupleFromDataPump(datapump_buff, tmp_buff)) + { + break; + } + + err_info = (GTM_ErrLog*)tmp_buff; + if (!BloomCheckAndAdd(bloom_filter, err_info->errmsg, err_info->errmsg_len)) + { + /* replace \n with space */ + msg = err_info->errmsg; + for (i = 0; i < err_info->errmsg_len; i++) + { + if (msg[i] == '\n' || msg[i] == '\t' || msg[i] == '\r') + { + msg[i] = ' '; + } + } + + /* serialize */ + errmsg_len = err_info->errmsg_len; + err_info->proc_id = htonl(err_info->proc_id); + err_info->error_no = htonl(err_info->error_no); + err_info->log_time = htobe64(err_info->log_time); + err_info->err_level = htonl(err_info->err_level); + err_info->errmsg_len = htonl(err_info->errmsg_len); + + /* put err log into global datapumpbuff */ + PutData(global_datapump_buff, (char*) err_info, sizeof(GTM_ErrLog) + errmsg_len); + SetBorder(global_datapump_buff); + } + } + + if (FreeSpace(global_datapump_buff) < gtm_max_errlog_tuple_len) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 1); + elog(DEBUG1, "global datapump buff is full."); + } + } + + GTM_RWLockRelease(>MThreads->gt_lock); +} + +/* + * Process MSG_GET_ERRORLOG message + */ +void +ProcessGetErrorlogCommand(Port *myport, StringInfo message) +{ + char* data = NULL; + uint32 data_len = 0; + uint32 total_len = 0; + StringInfoData buf; + DataPumpBuf* global_datapump_buff = GlobalLogCollector.datapump_buff; + BLOOM *bloom_filter = GlobalLogCollector.bloom_filter; + + pq_getmsgend(message); + + SpinLockAcquire(&GlobalLogCollector.lock); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, MSG_GET_GTM_ERRORLOG_RESULT, 4); + + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + + data = GetData(global_datapump_buff, &data_len); + while (NULL != data) + { + total_len += data_len; + /* check max len,if the producer is faster than 
the consumer, it may block here */ + if (total_len >= GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 1); + } + + pq_sendbytes(&buf, data, data_len); + + IncDataOff(global_datapump_buff, data_len); + data = GetData(global_datapump_buff, &data_len); + } + + /* clear bitmap */ + BloomReset(bloom_filter); + if (pg_atomic_read_u32(&GlobalLogCollector.full)) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 0); + } + + SpinLockRelease(&GlobalLogCollector.lock); + + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + /* Don't flush to the backup because this does not change the internal status */ + pq_flush(myport); + } +} \ No newline at end of file diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 5b8acac5..0123a662 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -236,8 +236,8 @@ static int32 GTM_StoreSync(char *data, size_t size); static int32 GTM_StoreInitSync(char *data, size_t size); static bool GTM_StoreCheckHeaderCRC(void); static int32 GTM_StoreGetHeader(GTMControlHeader *header); -static int32 GTM_StoreGetUsedSeq(void); -static int32 GTM_StoreGetUsedTxn(void); +int32 GTM_StoreGetUsedSeq(void); +int32 GTM_StoreGetUsedTxn(void); static bool GTM_StoreCheckSeqCRC(GTM_StoredSeqInfo *seq); static bool GTM_StoreCheckTxnCRC(GTM_StoredTransactionInfo *txn); static bool GTM_StoreSeqInFreelist(GTM_StoredSeqInfo *seq); diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c index 8dd2344c..9d71553f 100644 --- a/src/gtm/main/gtm_thread.c +++ b/src/gtm/main/gtm_thread.c @@ -19,6 +19,8 @@ #include "gtm/gtm_xlog.h" #include "gtm/gtm_txn.h" #include "gtm/libpq.h" +#include "gtm/gtm_stat_error.h" + #ifdef __TBASE__ #include "gtm/gtm_store.h" #endif @@ -275,60 +277,64 @@ GTM_ThreadCreate(void *(* startroutine)(void *), int32 max_lock) thrinfo->insert_lock_id = -1; thrinfo->insert_try_lock_id = pthread_self() % NUM_XLOGINSERT_LOCKS; thrinfo->register_buff = NULL; - thrinfo->last_sync_gts = 0; + thrinfo->last_sync_gts = 0; + thrinfo->stat_handle = NULL; + thrinfo->datapump_buff = GTM_BuildDataPumpBuf(GTM_THREAD_ERRLOG_DATAPUMP_SIZE); #endif - /* - * Each thread gets its own ErrorContext and its a child of ErrorContext of - * the main process - * - * This is a thread-specific context and is not shared between other - * threads - */ - thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, - "ErrorContext", - 8 * 1024, - 8 * 1024, - 8 * 1024, - false); - - thrinfo->thr_startroutine = startroutine; - - /* - * Now start the thread. The thread will start executing the given - * "startroutine". The thrinfo structure is also passed to the thread. Any - * additional parameters should be passed via the thrinfo strcuture. 
- * - * Return the thrinfo structure to the caller - */ - if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper, - thrinfo))) - { - ereport(LOG, - (err, - errmsg("Failed to create a new thread: error %s", strerror(err)))); - - GTM_ThreadRemove(thrinfo); - - MemoryContextDelete(thrinfo->thr_error_context); - MemoryContextDelete(thrinfo->thr_thread_context); + /* + * Each thread gets its own ErrorContext and its a child of ErrorContext of + * the main process + * + * This is a thread-specific context and is not shared between other + * threads + */ + thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024, + false); + + thrinfo->thr_startroutine = startroutine; + + /* + * Now start the thread. The thread will start executing the given + * "startroutine". The thrinfo structure is also passed to the thread. Any + * additional parameters should be passed via the thrinfo strcuture. + * + * Return the thrinfo structure to the caller + */ + if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper, + thrinfo))) + { + ereport(LOG, + (err, + errmsg("Failed to create a new thread: error %s", strerror(err)))); + + GTM_ThreadRemove(thrinfo); + + MemoryContextDelete(thrinfo->thr_error_context); + MemoryContextDelete(thrinfo->thr_thread_context); + + GTM_RWLockDestroy(&thrinfo->thr_lock); +#ifdef __TBASE__ + GTM_DestroyDataPumpBuf(thrinfo->datapump_buff); +#endif + pfree(thrinfo); - GTM_RWLockDestroy(&thrinfo->thr_lock); + return NULL; + } - pfree(thrinfo); + /* + * Ensure that the resources are released when the thread exits. (We used + * to do this inside GTM_ThreadMainWrapper, but thrinfo->thr_id may not set + * by the time GTM_ThreadMainWrapper starts executing, this possibly + * calling the function on an invalid thr_id + */ + pthread_detach(thrinfo->thr_id); - return NULL; - } - - /* - * Ensure that the resources are released when the thread exits. (We used - * to do this inside GTM_ThreadMainWrapper, but thrinfo->thr_id may not set - * by the time GTM_ThreadMainWrapper starts executing, this possibly - * calling the function on an invalid thr_id - */ - pthread_detach(thrinfo->thr_id); - - return thrinfo; + return thrinfo; } /* @@ -398,8 +404,10 @@ GTM_ThreadCleanup(void *argp) RWLockCleanUp(); if(thrinfo->locks_hold != NULL) pfree(thrinfo->locks_hold); - if(thrinfo->write_locks_hold != NULL) - pfree(thrinfo->write_locks_hold); + if(thrinfo->write_locks_hold != NULL) + pfree(thrinfo->write_locks_hold); + if(thrinfo->datapump_buff != NULL) + GTM_DestroyDataPumpBuf(thrinfo->datapump_buff); #endif /* * Switch to the memory context of the main process so that we can free up diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 3cbfd061..03618ffe 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -57,7 +57,8 @@ #include "gtm/gtm_utils.h" #include "gtm/gtm_backup.h" #include "gtm/gtm_time.h" - +#include "gtm/gtm_stat.h" +#include "gtm/gtm_stat_error.h" #ifdef __TBASE__ #include "gtm/gtm_store.h" @@ -135,6 +136,9 @@ int g_max_thread_number = 512; /* max thread number of gtm. 
*/ GTM_ThreadInfo *g_timekeeper_thread = NULL; GTM_ThreadInfo *g_timebackup_thread = NULL; GTM_ThreadInfo *g_timer_thread = NULL; +GTM_ThreadInfo *g_logcollector_thread = NULL; +void *GTM_ThreadLogCollector(void *argp); +extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); #ifdef __XLOG__ GTM_ThreadInfo *g_basebackup_thread = NULL; @@ -1476,25 +1480,41 @@ main(int argc, char *argv[]) } for(i = 0; i < max_wal_sender; i++) + { + { + GTM_ThreadInfo *thr = GTM_ThreadCreate(GTM_ThreadWalSender, g_max_lock_number); + if (NULL == thr) + { + elog(ERROR, "Failed to create wal sender thread."); + exit(1); + } + } + } + + g_logcollector_thread = GTM_ThreadCreate(GTM_ThreadLogCollector, g_max_lock_number); + if (NULL == g_logcollector_thread) { - { - GTM_ThreadInfo *thr = GTM_ThreadCreate(GTM_ThreadWalSender, g_max_lock_number); - if (NULL == thr) - { - elog(ERROR, "Failed to create wal sender thread."); - exit(1); - } - } + elog(ERROR, "Failed to create gtm log collector thread."); + exit(1); } - fprintf(stdout, "TBase create %d worker thread.\n", process_thread_num); - - /* Processing threads + Timer + Timekeeper + Timebackup threads + Walwrite + CheckPointer*/ - GTMThreads->gt_start_thread_count = process_thread_num + max_wal_sender + util_thread_cnt; - fprintf(stdout, "Start sever loop start thread count %d running thread count %d.\n", - GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); - - elog(LOG, "Start sever loop start thread count %d running thread count %d.\n", - GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + util_thread_cnt++; + + fprintf(stdout, "TBase create %d worker thread.\n", process_thread_num); + + /* Processing threads + Timer + Timekeeper + Timebackup threads + Walwrite + CheckPointer*/ + GTMThreads->gt_start_thread_count = process_thread_num + max_wal_sender + util_thread_cnt; + fprintf(stdout, "Start sever loop start thread count %d running thread count %d.\n", + GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + + elog(LOG, "Start sever loop start thread count %d running thread count %d.\n", + GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + + /* init statistic time */ + GTM_InitGtmStatistics(); + + /* init log hook */ + errlog_collection_func = GTM_ErrorLogCollector; + #endif fprintf(stdout, "TBase GTM is ready to go!!\n"); /* @@ -2352,6 +2372,92 @@ GTM_ThreadWalSender(void *argp) return my_threadinfo; } +/* + * Log collection thread, responsible for summarizing + * the log data of each thread to global datapump + */ +void * +GTM_ThreadLogCollector(void *argp) +{ + GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; + sigjmp_buf local_sigjmp_buf; + struct sigaction action; + int ret = 0; + action.sa_flags = 0; + action.sa_handler = GTM_ThreadSigHandler; + + ret = sigaction(SIGQUIT, &action, NULL); + if (ret) + { + elog(LOG, "register thread quit handler failed"); + } + + elog(DEBUG8, "Starting the log collector thread"); + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. 
The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { +#ifdef __TBASE__ + RWLockCleanUp(); +#endif + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + if (GTM_InitLogCollector() != 0) + { + elog(ERROR, "Failed to Init LogCollector."); + exit(1); + } + + for(;;) + { + /* no need to lock here. */ + if(GTM_SHUTTING_DOWN == GTMTransactions.gt_gtm_state) + { + break; + } + + /* sleep GTM_LOG_COLLECT_CYCLE */ + usleep(GTM_LOG_COLLECT_CYCLE); + + GTM_ProcessLogCollection(); + } + + GTM_DeInitLogCollector(); + elog(LOG, "GTM is shutting down, log collector exits!"); + return my_threadinfo; +} + void SendXLogSyncStatus(GTM_Conn *conn) {// #lizard forgives @@ -2955,13 +3061,15 @@ GTM_ThreadMain(void *argp) action.sa_handler = GTM_ThreadSigHandler; ret = sigaction(SIGQUIT, &action, NULL); - if (ret) - { - elog(LOG, "register thread quit handler failed"); - } + if (ret) + { + elog(LOG, "register thread quit handler failed"); + } - elog(DEBUG8, "Starting the connection helper thread"); - bind_service_threads(); + elog(DEBUG8, "Starting the connection helper thread"); + bind_service_threads(); + + GTM_InitStatisticsHandle(); /* * Create the memory context we will use in the main loop. @@ -2971,52 +3079,52 @@ GTM_ThreadMain(void *argp) * * This context is thread-specific */ - MessageContext = AllocSetContextCreate(TopMemoryContext, - "MessageContext", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE, - false); - - efd = epoll_create1(0); - if(efd == -1) - { - elog(ERROR, "failed to create epoll"); - } - thrinfo->thr_efd = efd; - thrinfo->thr_epoll_ok = true; - - /* - * Acquire the thread lock to prevent connection from GTM-Standby to update - * GTM-Standby registration. - */ - - /* - * Get the input_message in the TopMemoryContext so that we don't need to - * free/palloc it for every incoming message. Unlike Postgres, we don't - * expect the incoming messages to be of arbitrary sizes - */ - - initStringInfo(&input_message); - - /* - * POSTGRES main processing loop begins here - * - * If an exception is encountered, processing resumes here so we abort the - * current transaction and start a new one. - * - * You might wonder why this isn't coded as an infinite loop around a - * PG_TRY construct. The reason is that this is the bottom of the - * exception stack, and so with PG_TRY there would be no exception handler - * in force at all during the CATCH part. By leaving the outermost setjmp - * always active, we have at least some chance of recovering from an error - * during error recovery. (If we get into an infinite loop thereby, it - * will soon be stopped by overflow of elog.c's internal state stack.) 
- */ - - if (sigsetjmp(local_sigjmp_buf, 1) != 0) - { - bool report = false; + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + efd = epoll_create1(0); + if(efd == -1) + { + elog(ERROR, "failed to create epoll"); + } + thrinfo->thr_efd = efd; + thrinfo->thr_epoll_ok = true; + + /* + * Acquire the thread lock to prevent connection from GTM-Standby to update + * GTM-Standby registration. + */ + + /* + * Get the input_message in the TopMemoryContext so that we don't need to + * free/palloc it for every incoming message. Unlike Postgres, we don't + * expect the incoming messages to be of arbitrary sizes + */ + + initStringInfo(&input_message); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + bool report = false; #ifdef __TBASE__ RWLockCleanUp(); #endif @@ -3422,7 +3530,7 @@ ProcessCommand(Port *myport, StringInfo input_message) #ifdef __TBASE__ GTM_ThreadInfo *my_threadinfo = NULL; long long start_time; - long long end_time; + long long cost_time; my_threadinfo = GetMyThreadInfo; #ifndef __XLOG__ GTM_ConnectionInfo *conn; @@ -3641,6 +3749,16 @@ ProcessCommand(Port *myport, StringInfo input_message) break; } #endif + case MSG_GET_STATISTICS: + { + ProcessGetStatisticsCommand(myport,input_message); + break; + } + case MSG_GET_ERRORLOG: + { + ProcessGetErrorlogCommand(myport,input_message); + break; + } #endif default: ereport(FATAL, @@ -3651,12 +3769,13 @@ ProcessCommand(Port *myport, StringInfo input_message) BeforeReplyToClientXLogTrigger(); - end_time = getSystemTime(); +#ifdef __TBASE__ + cost_time = getSystemTime() - start_time; + if(enable_gtm_debug || cost_time > warnning_time_cost) + elog(LOG, "cost mtype = %s (%d) %lld ms.", gtm_util_message_name(mtype), (int)mtype,cost_time); - if(enable_gtm_debug || end_time - start_time > warnning_time_cost) - elog(LOG, "cost mtype = %s (%d) %lld ms.", gtm_util_message_name(mtype), (int)mtype,end_time - start_time); + GTM_UpdateStatistics(my_threadinfo->stat_handle, mtype, cost_time); -#ifdef __TBASE__ if (my_threadinfo->handle_standby) { GTM_RWLockRelease(&my_threadinfo->thr_lock); diff --git a/src/include/gtm/bloom.h b/src/include/gtm/bloom.h new file mode 100644 index 00000000..5bdb4936 --- /dev/null +++ b/src/include/gtm/bloom.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * bloom.h + * + * + * a bloom filter, using murmurhash + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/gtm/bloom.h + * + *------------------------------------------------------------------------- + */ +#ifndef _BLOOM_H +#define _BLOOM_H + +#include "gtm/gtm_lock.h" + +typedef unsigned int (*hashfunc_t)(const void *, int); + +typedef struct +{ + int 
bitmap_size; /* bitmap size of bloom filter */ + unsigned char* bitmap; /* bloom filter bitmap */ + int nfuncs; /* hash functions num */ + uint32* seeds; /* hash functions seeds */ +} BLOOM; + +BLOOM *BloomCreate(int bitmap_size, int nfuncs, ...); +int BloomDestroy(BLOOM *bloom); +void BloomReset(BLOOM *bloom); +void BloomAdd(BLOOM *bloom, const char *s, int len); +bool BloomCheck(BLOOM *bloom, const char *s, int len); +bool BloomCheckAndAdd(BLOOM *bloom, const char *s, int len); +uint32_t MurmurHash2(const void * key, int len, uint32_t seed); + +#endif diff --git a/src/include/gtm/datapump.h b/src/include/gtm/datapump.h new file mode 100644 index 00000000..360b56a7 --- /dev/null +++ b/src/include/gtm/datapump.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * datapump.h + * + * + * lockless message queue + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/gtm/datapump.h + * + *------------------------------------------------------------------------- + */ +#ifndef _DATAPUMP_H +#define _DATAPUMP_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" + +#define INVALID_BORDER (~((uint32)0)) +typedef struct +{ + char *buf; /* Data buffer */ + unsigned length; /* Data buffer length */ + s_lock_t pointer_lock; /* lock to protect offset and status */ + volatile uint32 head; /* Head of the loop */ + volatile uint32 tail; /* Tail of the buffer */ + volatile uint32 border; /* end of last tuple, so that we can send a complete tuple */ + volatile uint32 wrap_around; /* wrap around of the queue , for read only */ +} DataPumpBuf; + +uint32 DataSize(DataPumpBuf *buf); +uint32 FreeSpace(DataPumpBuf *buf); +char *GetData(DataPumpBuf *buf, uint32 *uiLen); +void IncDataOff(DataPumpBuf *buf, uint32 uiLen); +char *GetWriteOff(DataPumpBuf *buf, uint32 *uiLen); +void IncWriteOff(DataPumpBuf *buf, uint32 uiLen); +char *GetWriteOff(DataPumpBuf *buf, uint32 *uiLen); +uint32 BufferOffsetAdd(DataPumpBuf *buf, uint32 pointer, uint32 offset); +int ReserveSpace(DataPumpBuf *buf, uint32 len, uint32 *offset); +int ReturnSpace(DataPumpBuf *buf, uint32 offset); +void FillReserveSpace(DataPumpBuf *buf, uint32 offset, char *p, uint32 len); +void SetBorder(DataPumpBuf *buf); +void *DataPumpSenderThread(void *arg); +void PutData(DataPumpBuf *buf, char *data, uint32 len); + + + + +#endif diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index c79eaba5..70387e54 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -76,6 +76,7 @@ #define ELOG_H #include "c.h" +#include "stringinfo.h" /* Error level codes */ #define DEBUG8 9 @@ -315,4 +316,9 @@ write_stderr(const char *fmt,...) the supplied arguments. 
*/ __attribute__((format(printf, 1, 2))); + +/* log collection function hook */ +typedef void (*errlog_collection_hook_type) (ErrorData *edata, StringInfo buff); +extern errlog_collection_hook_type errlog_collection_func; + #endif /* GTM_ELOG_H */ diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index f98af0ff..715d91cb 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -25,6 +25,8 @@ #include "gtm/elog.h" #include "gtm/gtm_list.h" #include "gtm/gtm_xlog_internal.h" +#include "gtm/gtm_stat.h" +#include "gtm/datapump.h" extern char *GTMLogFile; typedef enum GTM_ThreadStatus @@ -98,6 +100,8 @@ typedef struct GTM_ThreadInfo XLogWaiter xlog_waiter; bool handle_standby; #endif + GTM_WorkerStatistics *stat_handle; /* statistics hanndle */ + DataPumpBuf *datapump_buff; /* log collection buff */ } GTM_ThreadInfo; typedef struct GTM_Threads diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b2c6382b..7af3e735 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -392,7 +392,7 @@ typedef enum #define GTM_GTS_ONE_SECOND (1000 * 1000L) #define GTM_SYNC_CYCLE (5 * GTM_GTS_ONE_SECOND) #define GTM_SYNC_TIME_LIMIT (60 * GTM_GTS_ONE_SECOND) - +#define GTM_LOG_COLLECT_CYCLE (5 * GTM_GTS_ONE_SECOND) #pragma pack() diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index f7b17ac9..2381286a 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -22,6 +22,8 @@ #include "gtm/register.h" #include "gtm/libpq-fe.h" #include "access/xlogdefs.h" +#include "gtm/gtm_stat.h" + #define MAX_HOSTADDR_LEN 32 #define MAX_PORT_LEN 8 @@ -77,110 +79,111 @@ typedef union GTM_ResultData #endif - GlobalTransactionId grd_gxid; /* TXN_PREPARE - * TXN_START_PREPARED - * TXN_ROLLBACK - */ - struct { - GlobalTransactionId gxid; - /* TXN_COMMIT - * TXN_COMMIT_PREPARED - */ - int status; - } grd_eof_txn; - - GlobalTransactionId grd_next_gxid; - - struct - { - GTM_TransactionHandle txnhandle; - GlobalTransactionId gxid; - } grd_txn; /* TXN_GET_GXID */ - - GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT - * SEQUENCE_RESET - * SEQUENCE_CLOSE */ - struct - { - GTM_SequenceKeyData seqkey; - GTM_Sequence seqval; - GTM_Sequence rangemax; - } grd_seq; /* SEQUENCE_GET_CURRENT - * SEQUENCE_GET_NEXT */ - struct - { - int32 seq_count; - GTM_SeqInfo *seq; - } grd_seq_list; /* SEQUENCE_GET_LIST */ - - struct - { - int32 txn_count; /* TXN_BEGIN_GETGXID_MULTI */ - GlobalTransactionId txn_gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; - GTM_Timestamp timestamp; - } grd_txn_get_multi; - - struct - { - int ts_count; /* GETGTS_MULTI */ - GTM_Timestamp gts[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_gts_get_multi; - - struct - { - int txn_count; /* TXN_COMMIT_MULTI */ - int status[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_txn_rc_multi; - - struct - { - GTM_TransactionHandle txnhandle; /* SNAPSHOT_GXID_GET */ - GlobalTransactionId gxid; /* SNAPSHOT_GET */ - int txn_count; /* SNAPSHOT_GET_MULTI */ - int status[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_txn_snap_multi; - - struct - { - GlobalTransactionId gxid; - GlobalTransactionId prepared_gxid; - int nodelen; - char *nodestring; - } grd_txn_get_gid_data; /* TXN_GET_GID_DATA_RESULT */ - - struct - { - char *ptr; - int len; - } grd_txn_gid_list; /* TXN_GXID_LIST_RESULT */ - - struct - { - GTM_PGXCNodeType type; /* NODE_REGISTER */ - int len; - char *node_name; /* NODE_UNREGISTER */ - GlobalTransactionId xmin; - } grd_node; - - struct - { - int num_node; - GTM_PGXCNodeInfo *nodeinfo[MAX_NODES]; - } grd_node_list; - - struct - { - 
GlobalTransactionId latest_completed_xid; - GlobalTransactionId global_xmin; - int errcode; - } grd_report_xmin; /* REPORT_XMIN */ - - - /* - * TODO - * TXN_GET_STATUS - * TXN_GET_ALL_PREPARED - */ + GlobalTransactionId grd_gxid; /* TXN_PREPARE + * TXN_START_PREPARED + * TXN_ROLLBACK + */ + struct { + GlobalTransactionId gxid; + /* TXN_COMMIT + * TXN_COMMIT_PREPARED + */ + int status; + } grd_eof_txn; + + GlobalTransactionId grd_next_gxid; + + struct + { + GTM_TransactionHandle txnhandle; + GlobalTransactionId gxid; + } grd_txn; /* TXN_GET_GXID */ + + GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT + * SEQUENCE_RESET + * SEQUENCE_CLOSE */ + struct + { + GTM_SequenceKeyData seqkey; + GTM_Sequence seqval; + GTM_Sequence rangemax; + } grd_seq; /* SEQUENCE_GET_CURRENT + * SEQUENCE_GET_NEXT */ + struct + { + int32 seq_count; + GTM_SeqInfo *seq; + } grd_seq_list; /* SEQUENCE_GET_LIST */ + + struct + { + int32 txn_count; /* TXN_BEGIN_GETGXID_MULTI */ + GlobalTransactionId txn_gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + GTM_Timestamp timestamp; + } grd_txn_get_multi; + + struct + { + int ts_count; /* GETGTS_MULTI */ + GTM_Timestamp gts[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_gts_get_multi; + + struct + { + int txn_count; /* TXN_COMMIT_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_rc_multi; + + struct + { + GTM_TransactionHandle txnhandle; /* SNAPSHOT_GXID_GET */ + GlobalTransactionId gxid; /* SNAPSHOT_GET */ + int txn_count; /* SNAPSHOT_GET_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_snap_multi; + + struct + { + GlobalTransactionId gxid; + GlobalTransactionId prepared_gxid; + int nodelen; + char *nodestring; + } grd_txn_get_gid_data; /* TXN_GET_GID_DATA_RESULT */ + + struct + { + char *ptr; + int len; + } grd_txn_gid_list; /* TXN_GXID_LIST_RESULT */ + + struct + { + GTM_PGXCNodeType type; /* NODE_REGISTER */ + int len; + char *node_name; /* NODE_UNREGISTER */ + GlobalTransactionId xmin; + } grd_node; + + struct + { + int num_node; + GTM_PGXCNodeInfo *nodeinfo[MAX_NODES]; + } grd_node_list; + + struct + { + GlobalTransactionId latest_completed_xid; + GlobalTransactionId global_xmin; + int errcode; + } grd_report_xmin; /* REPORT_XMIN */ + + GTM_StatisticsResult statistic_result; + + /* + * TODO + * TXN_GET_STATUS + * TXN_GET_ALL_PREPARED + */ } GTM_ResultData; #define GTM_RESULT_COMM_ERROR (-2) /* Communication error */ @@ -210,35 +213,41 @@ typedef struct GTM_Result XLogRecPtr start_pos; TimeLineID time_line; #endif - } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ - int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ - GTMStorageStatus gtm_status; - - struct - { - int32 count; - GTM_StoredSeqInfo *seqs; - }grd_store_seq; + } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ + int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ + GTMStorageStatus gtm_status; + + struct + { + int32 count; + GTM_StoredSeqInfo *seqs; + }grd_store_seq; + + struct + { + int32 count; + GTM_StoredTransactionInfo *txns; + }grd_store_txn; + + + struct + { + int32 count; + GTMStorageSequneceStatus *seqs; + }grd_store_check_seq; + + struct + { + int32 count; + GTMStorageTransactionStatus *txns; + }grd_store_check_txn; - struct - { - int32 count; - GTM_StoredTransactionInfo *txns; - }grd_store_txn; - - - struct + struct { - int32 count; - GTMStorageSequneceStatus *seqs; - }grd_store_check_seq; + int len; + char* errlog; + } grd_errlog; - struct - { - int32 count; - GTMStorageTransactionStatus *txns; - }grd_store_check_txn; - #endif /* * We keep these two items outside the union to avoid 
repeated malloc/free @@ -296,6 +305,9 @@ int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecP int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master, GTM_Timestamp *standby, char *standbyhost, char *standbyport, int32 buflen); #endif int bkup_global_timestamp(GTM_Conn *conn, GlobalTimestamp timestamp); +int get_gtm_statistics(GTM_Conn *conn, int clear_flag, int timeout_seconds, GTM_StatisticsResult** result); +int get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len); + #endif int bkup_begin_transaction_gxid(GTM_Conn *conn, GlobalTransactionId gxid, diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index 69fcd9ed..bb66c194 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -121,6 +121,8 @@ typedef enum GTM_MessageType MSG_GET_REPLICATION_STATUS, MSG_GET_REPLICATION_TRANSFER, #endif + MSG_GET_STATISTICS, + MSG_GET_ERRORLOG, /* * Must be at the end @@ -204,7 +206,10 @@ typedef enum GTM_ResultType MSG_REPLICATION_CONTENT, #endif - RESULT_TYPE_COUNT + MSG_GET_GTM_STATISTICS_RESULT, + MSG_GET_GTM_ERRORLOG_RESULT, + + RESULT_TYPE_COUNT } GTM_ResultType; /* diff --git a/src/include/gtm/gtm_stat.h b/src/include/gtm/gtm_stat.h new file mode 100644 index 00000000..b58dd330 --- /dev/null +++ b/src/include/gtm/gtm_stat.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * Portions Copyright (c) 2012-2018 TBase Development Group + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_STAT_H +#define _GTM_STAT_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq-be.h" +#include "gtm/stringinfo.h" +#include "port/atomics.h" + +typedef int64 pg_time_t; +#define CACHE_LINE_SIZE 64 +#define CACHE_LINE_ALIGN __attribute__((aligned(CACHE_LINE_SIZE))) + +typedef enum GTM_Statistic_Cmd +{ + CMD_GETGTS, + CMD_SEQUENCE_GET_NEXT, + CMD_TXN_START_PREPARED, + CMD_STATISTICS_TYPE_COUNT +} GTM_StatisticsCmd; + +typedef struct +{ + pg_atomic_uint32 total_request_times; + pg_atomic_uint32 total_costtime; + pg_atomic_uint32 max_costtime; + pg_atomic_uint32 min_costtime; +} CACHE_LINE_ALIGN GTM_StatisticsInfo; + +typedef struct +{ + GTM_StatisticsInfo cmd_statistics[CMD_STATISTICS_TYPE_COUNT]; +} GTM_WorkerStatistics; + +typedef struct +{ + uint32 total_request_times; + union + { + uint32 total_costtime; + uint32 avg_costtime; + }; + uint32 max_costtime; + uint32 min_costtime; +} GTM_StatisticsItem; + +typedef struct +{ + pg_time_t start_time; /* statistics info start time */ + pg_time_t end_time; /* statistics info end time */ + int32 sequences_remained; /* sequence remained num */ + int32 txn_remained; /* txn remained num */ + GTM_StatisticsItem stat_info[CMD_STATISTICS_TYPE_COUNT]; /* specific cmd statistics info */ +} GTM_StatisticsResult; + +typedef struct +{ + pg_time_t stat_start_time; /* statistics info start time */ + s_lock_t lock; /* lock to avoid multi client */ +} GTM_Statistics; + +extern GTM_Statistics GTMStatistics; + +void GTM_InitGtmStatistics(void); + +void GTM_InitStatisticsHandle(void); + +void GTM_UpdateStatistics(GTM_WorkerStatistics* stat_handle, GTM_MessageType mtype, uint32 costtime); + +void 
ProcessGetStatisticsCommand(Port *myport, StringInfo message); +#endif diff --git a/src/include/gtm/gtm_stat_error.h b/src/include/gtm/gtm_stat_error.h new file mode 100644 index 00000000..608eb854 --- /dev/null +++ b/src/include/gtm/gtm_stat_error.h @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat_error.h + + * collect error logs of gtm + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/main/gtm_stat_error.h + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_STAT_ERROR_H +#define _GTM_STAT_ERROR_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/datapump.h" +#include "gtm/bloom.h" + +#define GTM_MAX_ERRMSG_SIZE (1024) /* max size of each error msg to track */ +#define GTM_BLOOM_FILTER_SIZE (1 * 1024 * 1024) +#define GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE (10 * 1024) /* k */ +#define GTM_THREAD_ERRLOG_DATAPUMP_SIZE (16) /* k */ + +typedef int64 pg_time_t; + +typedef struct +{ + int proc_id; /* process id */ + int error_no; /* errno */ + pg_time_t log_time; /* log time */ + int err_level; /* error level */ + int errmsg_len; /* length of valid bytes in error message */ + char errmsg[0]; /* variable length array - must be last */ +} GTM_ErrLog; + +typedef struct +{ + s_lock_t lock; /* lock to avoid multi client */ + pg_atomic_uint32 full; /* datapump is full */ + char *tmp_buff; /* a buff use to read tuple data */ + BLOOM *bloom_filter; /* bloom filter use to exclude duplicates */ + DataPumpBuf *datapump_buff; /* circular queue buffer */ +} GTM_LogCollector; + +extern GTM_LogCollector GlobalLogCollector; + +DataPumpBuf *GTM_BuildDataPumpBuf(uint32 size); +void GTM_DestroyDataPumpBuf(DataPumpBuf *buff); +int GTM_InitLogCollector(void); +void GTM_DeInitLogCollector(void); +void GTM_ProcessLogCollection(void); +void ProcessGetErrorlogCommand(Port *myport, StringInfo message); +#endif From d6aec823f4d5562041ff003154f79a1a6c241339 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Fri, 25 Sep 2020 19:56:11 +0800 Subject: [PATCH 065/578] Run Node Lock/UnLock locally. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021563 --- src/backend/tcop/pquery.c | 46 +++++++++++++++++++++----------------- src/backend/tcop/utility.c | 6 ++++- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 67cf48ae..e7068eab 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1864,27 +1864,31 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, GetGtmInfoFromUserCmd(utilityStmt); #endif - /* - * Set snapshot if utility stmt needs one. Most reliable way to do this - * seems to be to enumerate those that do not need one; this is a short - * list. Transaction control, LOCK, and SET must *not* set a snapshot - * since they need to be executable at the start of a transaction-snapshot - * mode transaction without freezing a snapshot. By extension we allow - * SHOW not to set a snapshot. The other stmts listed are just efficiency - * hacks. Beware of listing anything that can modify the database --- if, - * say, it has to update an index with expressions that invoke - * user-defined functions, then it had better have a snapshot. 
- */ - if (!(IsA(utilityStmt, TransactionStmt) || - IsA(utilityStmt, LockStmt) || - IsA(utilityStmt, VariableSetStmt) || - IsA(utilityStmt, VariableShowStmt) || - IsA(utilityStmt, ConstraintsSetStmt) || - /* efficiency hacks from here down */ - IsA(utilityStmt, FetchStmt) || - IsA(utilityStmt, ListenStmt) || - IsA(utilityStmt, NotifyStmt) || - IsA(utilityStmt, UnlistenStmt) || + /* + * Set snapshot if utility stmt needs one. Most reliable way to do this + * seems to be to enumerate those that do not need one; this is a short + * list. Transaction control, LOCK, and SET must *not* set a snapshot + * since they need to be executable at the start of a transaction-snapshot + * mode transaction without freezing a snapshot. By extension we allow + * SHOW not to set a snapshot. The other stmts listed are just efficiency + * hacks. Beware of listing anything that can modify the database --- if, + * say, it has to update an index with expressions that invoke + * user-defined functions, then it had better have a snapshot. + */ + if (!(IsA(utilityStmt, TransactionStmt) || + IsA(utilityStmt, LockStmt) || + IsA(utilityStmt, VariableSetStmt) || + IsA(utilityStmt, VariableShowStmt) || + IsA(utilityStmt, ConstraintsSetStmt) || + /* efficiency hacks from here down */ + IsA(utilityStmt, FetchStmt) || + IsA(utilityStmt, ListenStmt) || + IsA(utilityStmt, NotifyStmt) || + IsA(utilityStmt, UnlistenStmt) || +#ifdef __TBASE__ + /* Node Lock/Unlock do not modify any data */ + IsA(utilityStmt, LockNodeStmt) || +#endif #ifdef PGXC IsA(utilityStmt, PauseClusterStmt) || IsA(utilityStmt, BarrierStmt) || diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f331a920..f5d10269 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -6581,7 +6581,11 @@ IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString) #ifdef XCP case T_PauseClusterStmt: #endif - return ALLOW; +#ifdef __TBASE__ + /* Node Lock/Unlock do not modify any data */ + case T_LockNodeStmt: +#endif + return ALLOW; default: return DISALLOW; From f50e456f0e54850f3acf5094d989403565dfc25e Mon Sep 17 00:00:00 2001 From: mark Date: Fri, 25 Sep 2020 17:51:58 +0800 Subject: [PATCH 066/578] [TAPD 82017165] Correctly reset the overdue information stored in shard map. 
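
The intended reset rule can be sketched as a small standalone C program (illustration only, not the patched TBase code: the pthread mutex merely stands in for the spinlock used in shardmap.c, and the group oids are arbitrary):

    /*
     * Illustrative sketch only. Reload the per-datanode shard map manager
     * not only when it has never been used, but also when it still caches a
     * different (overdue) group oid, e.g. after a group-syncing backend
     * crashed before finishing the shmem sync.
     */
    #include <stdio.h>
    #include <stdbool.h>
    #include <pthread.h>

    typedef unsigned int Oid;

    typedef struct
    {
        bool            used;   /* has the manager ever been loaded */
        Oid             group;  /* group oid currently cached in shmem */
        pthread_mutex_t lock;   /* stands in for the spinlock in the patch */
    } ToyShardMgr;

    static void toy_sync_shard_map(ToyShardMgr *mgr, Oid curr_group)
    {
        if (!mgr->used || mgr->group != curr_group)
        {
            /* unused or overdue: reset the manager before reloading */
            pthread_mutex_lock(&mgr->lock);
            mgr->group = curr_group;
            mgr->used = true;
            pthread_mutex_unlock(&mgr->lock);
            printf("reloading shard map for group %u\n", curr_group);
        }
        else
            printf("shard map for group %u already loaded\n", curr_group);
    }

    int main(void)
    {
        ToyShardMgr mgr = { true, 16391, PTHREAD_MUTEX_INITIALIZER };

        toy_sync_shard_map(&mgr, 16391);   /* up to date, nothing to reset */
        toy_sync_shard_map(&mgr, 16402);   /* overdue group oid: reset and reload */
        return 0;
    }

The point of the extra group-oid comparison is that a manager left behind by a crashed group-syncing backend must be treated as stale and reloaded, rather than trusted just because it is marked as used.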
--- src/backend/pgxc/shard/shardmap.c | 161 ++++++++++++++++-------------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index a2b1d8e9..9e1cec0d 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -767,62 +767,72 @@ static bool SyncShardMapList_Node_DN(void) return false; } - self_node_oid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); - if (InvalidOid == self_node_oid) - { - elog(LOG, "SyncShardMapList_Node_DN failed to get nodeoid, node:%s", PGXCNodeName); - return false; - } - curr_groupoid = GetGroupOidByNode(self_node_oid); - if (InvalidOid == curr_groupoid) - { - elog(LOG, "SyncShardMapList_Node_DN failed to get groupoid, node:%s, nodeoid:%d", PGXCNodeName, self_node_oid); - return false; - } - - if (is_group_sharding_inited(curr_groupoid)) - { - bms_clear(g_DatanodeShardgroupBitmap); - - /* If the group sharding has not been inited */ - if (!g_GroupShardingMgr_DN->used) - { - g_GroupShardingMgr_DN->members->shardMapStatus = SHMEM_SHRADMAP_STATUS_LOADING; - g_GroupShardingMgr_DN->members->group = curr_groupoid; - g_GroupShardingMgr_DN->used = true; - } - - shardrel = heap_open(PgxcShardMapRelationId, AccessShareLock); - ScanKeyInit(&skey, - Anum_pgxc_shard_map_nodegroup, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(curr_groupoid)); - - sysscan = systable_beginscan(shardrel, - PgxcShardMapGroupIndexId, - true, - NULL, 1, &skey); - - while(HeapTupleIsValid(oldtup = systable_getnext(sysscan))) - { - pgxc_shard = (Form_pgxc_shard_map)GETSTRUCT(oldtup); - InsertShardMap_DN(pgxc_shard); - - /* - * If node is DN AND pgxc_shard_map tuple's primary copy is itself, - * Add this shardid to bitmap. - */ - BuildDatanodeVisibilityMap(pgxc_shard, self_node_oid); - } - systable_endscan(sysscan); - heap_close(shardrel, AccessShareLock); - ShardMapInitDone_DN(curr_groupoid, false); - } - else - { - elog(LOG, "SyncShardMapList_Node_DN group %d is not inited.", curr_groupoid); - return false; - } + self_node_oid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); + if (InvalidOid == self_node_oid) + { + elog(LOG, "SyncShardMapList_Node_DN failed to get nodeoid, node:%s", PGXCNodeName); + return false; + } + curr_groupoid = GetGroupOidByNode(self_node_oid); + if (InvalidOid == curr_groupoid) + { + elog(LOG, "SyncShardMapList_Node_DN failed to get groupoid, node:%s, nodeoid:%d", PGXCNodeName, self_node_oid); + return false; + } + + if (is_group_sharding_inited(curr_groupoid)) + { + bms_clear(g_DatanodeShardgroupBitmap); + + /* + * If sharding of the group has not been inited, or this sharding map is in use but + * store overdue information, possibly caused by group syncing backend crashing right + * before the shmem sync. + */ + if (!g_GroupShardingMgr_DN->used || curr_groupoid != g_GroupShardingMgr_DN->members->group) + { + /* + * Datanodes can only be in one node group, so we save the effort of + * removing entry and skip right into resetting the mgr. 
+         */
+            g_GroupShardingMgr_DN->members->shardMapStatus = SHMEM_SHRADMAP_STATUS_LOADING;
+            SpinLockAcquire(&g_GroupShardingMgr_DN->lock);
+            g_GroupShardingMgr_DN->members->group = curr_groupoid;
+            g_GroupShardingMgr_DN->used = true;
+            SpinLockRelease(&g_GroupShardingMgr_DN->lock);
+        }
+
+        shardrel = heap_open(PgxcShardMapRelationId, AccessShareLock);
+        ScanKeyInit(&skey,
+                    Anum_pgxc_shard_map_nodegroup,
+                    BTEqualStrategyNumber, F_OIDEQ,
+                    ObjectIdGetDatum(curr_groupoid));
+
+        sysscan = systable_beginscan(shardrel,
+                                     PgxcShardMapGroupIndexId,
+                                     true,
+                                     NULL, 1, &skey);
+
+        while(HeapTupleIsValid(oldtup = systable_getnext(sysscan)))
+        {
+            pgxc_shard = (Form_pgxc_shard_map)GETSTRUCT(oldtup);
+            InsertShardMap_DN(pgxc_shard);
+
+            /*
+             * If node is DN AND pgxc_shard_map tuple's primary copy is itself,
+             * Add this shardid to bitmap.
+             */
+            BuildDatanodeVisibilityMap(pgxc_shard, self_node_oid);
+        }
+        systable_endscan(sysscan);
+        heap_close(shardrel, AccessShareLock);
+        ShardMapInitDone_DN(curr_groupoid, false);
+    }
+    else
+    {
+        elog(LOG, "SyncShardMapList_Node_DN group %d is not inited.", curr_groupoid);
+        return false;
+    }
 
     return true;
 }
@@ -1028,26 +1038,27 @@ static void ShardMapInitDone_CN(int32 map, Oid group, bool need_lock)
 
 static void
 ShardMapInitDone_DN(Oid group, bool need_lock)
-{// #lizard forgives
-    bool dup = false;
-    int32 maxNodeIndex = 0;
-    int32 i;
-    int32 j;
-    int32 nodeindex = 0;
-    int32 nodeCnt = 0;
-    ShardMapItemDef item;
+{
+    bool dup = false;
+    int32 maxNodeIndex = 0;
+    int32 i;
+    int32 j;
+    int32 nodeindex = 0;
+    int32 nodeCnt = 0;
+    ShardMapItemDef item;
 
-    if(!IS_PGXC_DATANODE)
-    {
-        elog(ERROR, "ShardMapInitDone_DN should only be called in datanode");
-        return;
-    }
-
-    if(group != g_GroupShardingMgr_DN->members->group)
-    {
-        elog(PANIC, "groupoid %d in mgr is not group %d", g_GroupShardingMgr_DN->members->group, group);
-        return;
-    }
+    if(!IS_PGXC_DATANODE)
+    {
+        elog(ERROR, "ShardMapInitDone_DN should only be called in datanode");
+        return;
+    }
+
+    if(group != g_GroupShardingMgr_DN->members->group)
+    {
+        /* PANIC here is to reset shmem, although a more elegant way should be provided by ShardMapShmem AM */
+        elog(PANIC, "groupoid %d in mgr is not group %d", g_GroupShardingMgr_DN->members->group, group);
+        return;
+    }
 
     if (need_lock)
     {

From e7095fb698a6abd7713cb8c77649521a26fa54f6 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Sun, 27 Sep 2020 10:48:09 +0800
Subject: [PATCH 067/578] Support simplifying subqueries when pulling up
 TargetList sublinks and optimize the targetlist conversion logic

A subquery in the TargetList could have a 'limit 1' clause or a 'rownum=1'
(Oracle compatibility) qualification to make sure only one row is returned.
In this case, we can eliminate the limit clause by folding the first-match
logic into the join operation, so we introduce the new join type
JOIN_LEFT_SEMI. The new join type returns the first copy of each LHS row
that has a match, and also returns unmatched LHS tuples. (The existing
JOIN_LEFT or JOIN_SEMI does not satisfy this semantic.)

As an additional benefit, a subquery in the targetlist with aggregation can
also be sped up by this new JOIN_LEFT_SEMI join type, by skipping the search
for the next matched inner row (saving another hash probe, or more nestloop
cost).
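
The row-level behaviour can be shown with a small standalone C sketch (illustration only, not TBase executor code; the toy arrays stand in for the outer and inner relations):

    /*
     * Illustrative sketch of JOIN_LEFT_SEMI semantics: every LHS row comes
     * out exactly once, paired with the first matching RHS row if one
     * exists, otherwise null-extended.
     */
    #include <stdio.h>
    #include <stdbool.h>

    int main(void)
    {
        int lhs[] = {1, 2, 2, 3};       /* outer rows, duplicates allowed */
        int rhs[] = {2, 2, 3, 3, 4};    /* inner rows, duplicates allowed */
        int nlhs = sizeof(lhs) / sizeof(lhs[0]);
        int nrhs = sizeof(rhs) / sizeof(rhs[0]);

        for (int i = 0; i < nlhs; i++)
        {
            bool matched = false;

            for (int j = 0; j < nrhs; j++)
            {
                if (lhs[i] == rhs[j])
                {
                    /* single_match: stop after the first matching inner row */
                    printf("%d | %d\n", lhs[i], rhs[j]);
                    matched = true;
                    break;
                }
            }

            if (!matched)
                printf("%d | NULL\n", lhs[i]);  /* unmatched LHS rows are still returned */
        }
        return 0;
    }

For comparison, JOIN_LEFT_SCALAR keeps the null-extension but reports an error when a second matching inner row exists, and plain JOIN_SEMI drops the unmatched LHS rows entirely.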
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696860181163 --- src/backend/commands/explain.c | 9 +- src/backend/executor/nodeHashjoin.c | 10 +- src/backend/executor/nodeMergejoin.c | 26 ++--- src/backend/executor/nodeNestloop.c | 7 +- src/backend/optimizer/path/allpaths.c | 3 +- src/backend/optimizer/path/costsize.c | 15 ++- src/backend/optimizer/path/indxpath.c | 4 +- src/backend/optimizer/path/joinpath.c | 5 +- src/backend/optimizer/path/joinrels.c | 6 +- src/backend/optimizer/plan/initsplan.c | 8 +- src/backend/optimizer/plan/setrefs.c | 1 + src/backend/optimizer/plan/subselect.c | 132 ++++++++++++++++++++-- src/backend/optimizer/prep/prepjointree.c | 4 + src/backend/optimizer/util/pathnode.c | 7 +- src/backend/utils/adt/network_selfuncs.c | 1 + src/backend/utils/adt/selfuncs.c | 1 + src/include/nodes/nodes.h | 7 +- src/test/regress/expected/subselect.out | 39 ++++++- src/test/regress/sql/subselect.sql | 4 + 19 files changed, 237 insertions(+), 52 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d49eebc8..c58bd433 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1374,9 +1374,12 @@ ExplainNode(PlanState *planstate, List *ancestors, jointype = "Anti"; break; #ifdef __TBASE__ - case JOIN_LEFT_SCALAR: - jointype = "Left Scalar"; - break; + case JOIN_LEFT_SCALAR: + jointype = "Left Scalar"; + break; + case JOIN_LEFT_SEMI: + jointype = "Left Semi"; + break; #endif default: jointype = "???"; diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 9f1b7b90..6d57ae37 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -660,7 +660,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ hjstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -669,11 +670,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) case JOIN_SEMI: break; #ifdef __TBASE__ - case JOIN_LEFT_SCALAR: - hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); - break; + case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif case JOIN_LEFT: case JOIN_ANTI: diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index d10b74b0..6989c3ed 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -694,14 +694,14 @@ ExecMergeJoin(PlanState *pstate) break; } #ifdef __TBASE__ - /* - * if we have finished the join, and the inner never be executed, - * we need to disconnect from remote node. - */ - if (!node->mj_InnerInited && IS_PGXC_DATANODE) - { - ExecDisconnectNode(innerPlan); - } + /* + * If we have finished the join, and the inner never + * be executed, we need to disconnect from remote node. + */ + if (!node->mj_InnerInited && IS_PGXC_DATANODE) + { + ExecDisconnectNode(innerPlan); + } #endif /* Otherwise we're done. 
*/ return NULL; @@ -1542,7 +1542,8 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ mergestate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -1554,12 +1555,7 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) break; #ifdef __TBASE__ case JOIN_LEFT_SCALAR: - mergestate->mj_FillOuter = true; - mergestate->mj_FillInner = false; - mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - break; + case JOIN_LEFT_SEMI: #endif case JOIN_LEFT: case JOIN_ANTI: diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index 9a9ec8d4..d277cb19 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -179,7 +179,8 @@ ExecNestLoop(PlanState *pstate) if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || node->js.jointype == JOIN_ANTI || - node->js.jointype == JOIN_LEFT_SCALAR)) + node->js.jointype == JOIN_LEFT_SCALAR || + node->js.jointype == JOIN_LEFT_SEMI)) #else if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || @@ -341,7 +342,8 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ nlstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -351,6 +353,7 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) break; #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: nlstate->nl_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(nlstate))); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index a4fec879..42c19c2f 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -232,7 +232,8 @@ set_base_rel_consider_startup(PlannerInfo *root) #ifdef __TBASE__ if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI || - sjinfo->jointype == JOIN_LEFT_SCALAR) && + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) && bms_get_singleton_member(sjinfo->syn_righthand, &varno)) #else if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0de40222..4b984a23 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2137,6 +2137,7 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || @@ -2230,6 +2231,7 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || path->jointype == JOIN_LEFT_SCALAR || + path->jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || @@ -2731,6 +2733,7 @@ final_cost_mergejoin(PlannerInfo *root, 
MergePath *path, if ((path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI || extra->inner_unique) && (list_length(path->jpath.joinrestrictinfo) == list_length(path->path_mergeclauses))) @@ -3240,6 +3243,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (path->jpath.jointype == JOIN_SEMI || @@ -3288,7 +3292,9 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, /* Get # of tuples that will pass the basic join */ #ifdef __TBASE__ - if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_LEFT_SCALAR) + if (path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI) #else if (path->jpath.jointype == JOIN_SEMI) #endif @@ -4446,6 +4452,7 @@ calc_joinrel_size_estimate(PlannerInfo *root, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif nrows = outer_rows * fkselec * jselec; /* pselec not used */ @@ -4527,7 +4534,8 @@ get_foreign_key_join_selectivity(PlannerInfo *root, * Hence, if either case applies, punt and ignore the FK. */ #ifdef __TBASE__ - if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) && + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI) && (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) continue; #else @@ -4649,7 +4657,8 @@ get_foreign_key_join_selectivity(PlannerInfo *root, * table. So, at least for now, disregard inheritance here. 
*/ #ifdef __TBASE__ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) #endif diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index b377f0d6..1e58fbdc 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -2025,7 +2025,9 @@ adjust_rowcount_for_semijoins(PlannerInfo *root, SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); #ifdef __TBASE__ - if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR ) && + if ((sjinfo->jointype == JOIN_SEMI || + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) && bms_is_member(cur_relid, sjinfo->syn_lefthand) && bms_is_member(outer_relid, sjinfo->syn_righthand)) #else diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index c832b9d8..49852d77 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -145,6 +145,7 @@ add_paths_to_joinrel(PlannerInfo *root, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif extra.inner_unique = false; /* well, unproven */ break; @@ -191,7 +192,8 @@ add_paths_to_joinrel(PlannerInfo *root, */ #ifdef __TBASE__ if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || - jointype == JOIN_LEFT_SCALAR || extra.inner_unique) + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI || + extra.inner_unique) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) #endif @@ -1300,6 +1302,7 @@ match_unsorted_outer(PlannerInfo *root, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif nestjoinOK = true; useallclauses = false; diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 659a8494..eb920d05 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -390,7 +390,9 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * this join path. 
*/ #ifdef __TBASE__ - if (sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR) + if (sjinfo->jointype == JOIN_SEMI || + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) #else if (sjinfo->jointype == JOIN_SEMI) #endif @@ -832,8 +834,8 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif - /* * We might have a normal semijoin, or a case where we don't have * enough rels to do the semijoin but can unique-ify the RHS and diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 100d9db5..7c743fd2 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -910,6 +910,7 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif leftjoinlist = deconstruct_recurse(root, j->larg, below_outer_join, @@ -1354,6 +1355,7 @@ make_outerjoininfo(PlannerInfo *root, (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || !bms_overlap(strict_relids, otherinfo->min_righthand))) #else if (bms_overlap(clause_relids, otherinfo->syn_righthand) && @@ -1401,9 +1403,11 @@ make_outerjoininfo(PlannerInfo *root, jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || otherinfo->jointype == JOIN_SEMI || otherinfo->jointype == JOIN_ANTI || otherinfo->jointype == JOIN_LEFT_SCALAR || + otherinfo->jointype == JOIN_LEFT_SEMI || !otherinfo->lhs_strict || otherinfo->delay_upper_joins) #else if (bms_overlap(clause_relids, otherinfo->syn_righthand) || @@ -1491,7 +1495,9 @@ compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause) /* Nothing more to do if it's not a semijoin */ #ifdef __TBASE__ - if (sjinfo->jointype != JOIN_SEMI && sjinfo->jointype != JOIN_LEFT_SCALAR) + if (sjinfo->jointype != JOIN_SEMI && + sjinfo->jointype != JOIN_LEFT_SCALAR && + sjinfo->jointype != JOIN_LEFT_SEMI) #else if (sjinfo->jointype != JOIN_SEMI) #endif diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 9f1be6e4..e5470fa8 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1740,6 +1740,7 @@ set_join_references(PlannerInfo *root, Join *join, int rtoffset) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif inner_itlist->has_non_vars = false; break; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e8495fbc..7c342fc3 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2955,6 +2955,95 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis } #ifdef __TBASE__ +/* + * simplify_TargetList_query:remove any useless stuff in an TargetList's + * subquery + * + * For subquery in targetlist, normally we use JOIN_LEFT_SCALAR type to + * make sure there will be only one row found. If subquery contains + * aggregation clause, then we are OK with JOIN_LEFT_SEMI. Further more, if + * subquery got 'limit 1' or equivalent clauses such as Oracle 'rownum = 1'. + * Then we can remove the limit clause and use JOIN_SEMI to simplify the + * subquery. + * + * Returns TRUE if was able to discard the 'LIMIT 1' cluase or the subquery + * already simple enough, else FALSE. 
+ */ +static bool +simplify_TargetList_query(PlannerInfo *root, Query *query, bool *useLeftSemiJoin) +{ + /* + * We don't try to simplify at all if the query uses set operations, + * aggregates, grouping sets, SRFs, modifying CTEs, HAVING, OFFSET, or FOR + * UPDATE/SHARE; none of these seem likely in normal usage and their + * possible effects are complex. (Note: we could ignore an "OFFSET 0" + * clause, but that traditionally is used as an optimization fence, so we + * don't.) + */ + if (query->commandType != CMD_SELECT || + query->setOperations || + query->groupingSets || + query->hasWindowFuncs || + query->hasTargetSRFs || + query->hasModifyingCTE || + query->havingQual || + query->limitOffset || + query->rowMarks) + return false; + + /* By default, use JOIN_LEFT_SCALAR. */ + Assert(useLeftSemiJoin); + *useLeftSemiJoin = false; + + /* Handle 'limit 1' case as described above. */ + if (query->limitCount) + { + /* + * The LIMIT clause has not yet been through eval_const_expressions, + * so we have to apply that here. It might seem like this is a waste + * of cycles, since the only case plausibly worth worrying about is + * "LIMIT 1" ... but what we'll actually see is "LIMIT int8(1::int4)", + * so we have to fold constants or we're not going to recognize it. + */ + Node *node = eval_const_expressions(root, query->limitCount); + Const *limit; + int64 limitValue; + + /* Might as well update the query if we simplified the clause. */ + query->limitCount = node; + + if (!IsA(node, Const)) + return false; + + limit = (Const *) node; + + Assert(limit->consttype == INT8OID); + limitValue = DatumGetInt64(limit->constvalue); + + /* Invalid value, we have to get at least one row. */ + if (!limit->constisnull && limitValue <= 0) + return false; + + /* + * If the SubQuery got limit 1(actually must be limit 1), then the + * join Semantic equals JOIN_SEMI. We don't need to continue when got + * one LHS match. + */ + if (limitValue == 1) + { + /* + * Remove the limit clause for more possible subquery pullup + * optimizations. + */ + query->limitCount = NULL; + /* Inform caller to use JOIN_LEFT_SEMI */ + *useLeftSemiJoin = true; + } + } + + return true; +} + /* * Try to convert an SubLink in targetlist to a join * @@ -2976,9 +3065,12 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) SubLink *sublink = NULL; RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Node *target = NULL; - List *sublinks = NIL; - bool count_agg = false; + Node *target = NULL; + List *sublinks = NIL; + bool count_agg = false; + bool useLeftSemiJoin = false; + /* By default, JOIN_LEFT_SCALAR is the worst choice */ + JoinType finalJoinType = JOIN_LEFT_SCALAR; /* Find sublinks in the targetlist entry */ find_sublink_walker((Node *)entry->expr, &sublinks); @@ -3010,6 +3102,18 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (subselect->jointree->fromlist == NIL) return NULL; + /* + * See if the subquery can be simplified. For now, we just try to remove + * 'limit 1' clause. If it's been removed, we can use JOIN_LEFT_SEMI to + * save more costs. + */ + if (!simplify_TargetList_query(root, subselect, &useLeftSemiJoin)) + return NULL; + + /* 'limit 1' optimized */ + if (useLeftSemiJoin) + finalJoinType = JOIN_LEFT_SEMI; + /* * What we can not optimize. 
*/ @@ -3170,6 +3274,13 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) subselect->groupClause = lappend(subselect->groupClause, grpcl); } + + /* + * If we got Aggregation clause, since there is only one TargetList, + * then we can use JOIN_LEFT_SEMI over JOIN_LEFT/JOIN_LEFT_SCALAR to + * save more costs. + */ + finalJoinType = JOIN_LEFT_SEMI; } /* @@ -3190,7 +3301,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * Form join node. */ joinExpr = makeNode(JoinExpr); - joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; + joinExpr->jointype = finalJoinType; joinExpr->isNatural = false; joinExpr->larg = (Node *) root->parse->jointree; joinExpr->rarg = (Node *) rtr; @@ -3203,20 +3314,25 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); /* Build a Var pointing to the subquery */ - target = (Node *)makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + target = (Node *)makeVarFromTargetEntry(rtr->rtindex, + linitial(subselect->targetList)); /* Add Coalesce(count,0) */ if (count_agg) { CoalesceExpr *coalesce = makeNode(CoalesceExpr); - coalesce->args = list_make2(target, - makeConst(INT8OID, -1, InvalidOid, sizeof(int64), Int64GetDatum(0), false, true)); + Const *constExpr = makeConst(INT8OID, -1, InvalidOid, sizeof(int64), + Int64GetDatum(0), false, true); + + coalesce->args = list_make2(target, constExpr); coalesce->coalescetype = INT8OID; target = (Node *) coalesce; } /* Replace sublink node with Result. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, sublink, target); + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, + target); return entry; } #endif diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index a94388d4..93ceb77e 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -384,7 +384,9 @@ pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, break; case JOIN_LEFT: #ifdef __TBASE__ + case JOIN_SEMI: case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif j->quals = pull_up_sublinks_qual_recurse(root, j->quals, &j->rarg, @@ -1243,6 +1245,7 @@ pull_up_subqueries_recurse(PlannerInfo *root, Node *jtnode, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif case JOIN_ANTI: j->larg = pull_up_subqueries_recurse(root, j->larg, @@ -3084,6 +3087,7 @@ reduce_outer_joins_pass2(Node *jtnode, case JOIN_SEMI: case JOIN_ANTI: case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: /* * These could only have been introduced by pull_up_sublinks, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index eee0c16b..464eccfb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1738,6 +1738,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) pathnode->jointype == JOIN_SEMI || #ifdef __TBASE__ pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_LEFT_SEMI || #endif pathnode->jointype == JOIN_ANTI)) { @@ -2704,7 +2705,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - pathnode->jointype != JOIN_LEFT_SCALAR && 
!pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2752,7 +2754,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd &&resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && !pathnode->inner_unique) { pathnode->path.distribution = innerd; return alternate; diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index beb0e76a..d05ffd86 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -225,6 +225,7 @@ networkjoinsel(PG_FUNCTION_ARGS) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif /* Here, it's important that we pass the outer var on the left. */ if (!join_is_reversed) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 134664f8..06b1d9fa 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2286,6 +2286,7 @@ eqjoinsel(PG_FUNCTION_ARGS) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 9f974c28..43f90ba9 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -794,8 +794,10 @@ typedef enum JoinType JOIN_UNIQUE_INNER, /* RHS path must be made unique */ #ifdef __TBASE__ - JOIN_LEFT_SCALAR /* pairs + unmatched LHS tuples */ - /* only 1 copy of echo LHS row else report error. */ + JOIN_LEFT_SCALAR, /* pairs + unmatched LHS tuples, only 1 copy of + * each LHS row else report error. */ + JOIN_LEFT_SEMI /* 1 copy of each LHS row that has match(es) + + * unmatched LHS tuples */ #endif /* @@ -821,6 +823,7 @@ typedef enum JoinType #define IS_OUTER_JOIN(jointype) \ (((1 << (jointype)) & \ ((1 << JOIN_LEFT) | \ + (1 << JOIN_LEFT_SEMI) | \ (1 << JOIN_LEFT_SCALAR) | \ (1 << JOIN_FULL) | \ (1 << JOIN_RIGHT) | \ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 85af9fb1..17184f61 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1461,7 +1461,7 @@ explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b. 
------------------------------------------------------------------------- Sort Sort Key: "TARGETLIST_subquery".sum - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1493,7 +1493,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro ----------------------------------------------------------------------- Sort Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1556,7 +1556,7 @@ explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b --------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1616,7 +1616,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b ---------------------------------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1650,7 +1650,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b ---------------------------------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1679,6 +1679,35 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 (10 rows) +-- targetlist sublink with limit 1 +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Semi Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 818c6b4f..66c01e19 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -693,6 +693,10 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; +-- 
targetlist sublink with limit 1 +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); From b4ed52d92f2ae67795211e5560f88c54c51efbe3 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 30 Sep 2020 14:48:21 +0800 Subject: [PATCH 068/578] Fix pg_dump issue when group info been deleted http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131082507633 --- src/bin/pg_dump/pg_dump.c | 45 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 5b3c3317..10a8ce5f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -15835,27 +15835,30 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) fmtId(tbinfo->attnames[hashkey - 1])); } #ifdef __TBASE__ - else if(tbinfo->pgxclocatortype == 'S' && !tbinfo->ispartition) - { - int hashkey = tbinfo->pgxcattnum; - int sechashkey = tbinfo->pgxcsecattnum; - - if (sechashkey) - { - appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s,", - fmtId(tbinfo->attnames[hashkey - 1])); - appendPQExpBuffer(q, "%s)", - fmtId(tbinfo->attnames[sechashkey - 1])); - } - else - appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s)", - fmtId(tbinfo->attnames[hashkey - 1])); - - if (tbinfo->coldgroupname) - appendPQExpBuffer(q, " to GROUP %s %s", tbinfo->groupname, tbinfo->coldgroupname); - else - appendPQExpBuffer(q, " to GROUP %s", tbinfo->groupname); - } + else if(tbinfo->pgxclocatortype == 'S' && !tbinfo->ispartition) + { + int hashkey = tbinfo->pgxcattnum; + int sechashkey = tbinfo->pgxcsecattnum; + + if (sechashkey) + { + appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s,", + fmtId(tbinfo->attnames[hashkey - 1])); + appendPQExpBuffer(q, "%s)", + fmtId(tbinfo->attnames[sechashkey - 1])); + } + else + appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s)", + fmtId(tbinfo->attnames[hashkey - 1])); + + if (tbinfo->groupname) + { + if (tbinfo->coldgroupname) + appendPQExpBuffer(q, " to GROUP %s %s", tbinfo->groupname, tbinfo->coldgroupname); + else + appendPQExpBuffer(q, " to GROUP %s", tbinfo->groupname); + } + } #endif } if (include_nodes && From 2d67e124781e4016e22c1df214f5d6ee3627260f Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 29 Sep 2020 11:51:56 +0800 Subject: [PATCH 069/578] FQS support pushdown query with subquery to datanode Currently we only support Subquery push down to single DN. Multiple DN pushdown will have cross-phase issues between main query and subquery, which is way more complicated. So we just skip the case by now. 
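
The single-datanode rule can be summarized with a standalone C sketch (illustration only, not the planner code itself; node ids and helper names are made up):

    /*
     * Illustrative sketch: a query containing a subquery is fully shipped
     * (FQS) only when the main query and the subquery each resolve to
     * exactly one datanode and the two node lists reduce to the same node.
     */
    #include <stdio.h>
    #include <stdbool.h>

    static bool toy_can_fqs(const int *main_nodes, int n_main,
                            const int *sub_nodes, int n_sub)
    {
        /* multiple-DN pushdown is skipped: it would need cross-phase handling */
        if (n_main != 1 || n_sub != 1)
            return false;

        /* merging the two single-node lists must leave the same node */
        return main_nodes[0] == sub_nodes[0];
    }

    int main(void)
    {
        int main_nodes[] = {2};
        int same[]       = {2};
        int other[]      = {5};
        int both[]       = {2, 5};

        printf("same single node : %s\n", toy_can_fqs(main_nodes, 1, same, 1)  ? "ship" : "plan on coordinator");
        printf("different node   : %s\n", toy_can_fqs(main_nodes, 1, other, 1) ? "ship" : "plan on coordinator");
        printf("subquery on 2 DNs: %s\n", toy_can_fqs(main_nodes, 1, both, 2)  ? "ship" : "plan on coordinator");
        return 0;
    }

The actual check added to pgxc_is_query_shippable() is stricter than this sketch: it only applies to CMD_SELECT, requires matching column-distributed locator types on both sides, requires the subquery to pass the "simple subquery" test, and still has to merge the two ExecNodes successfully.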
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081947401 --- src/backend/optimizer/util/pgxcship.c | 239 +++++++++++++++++++------ src/backend/utils/adt/ruleutils.c | 13 +- src/test/regress/expected/xc_FQS_2.out | 68 +++++++ src/test/regress/sql/xc_FQS.sql | 8 + 4 files changed, 261 insertions(+), 67 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index ae1ca9d6..bfbc9e99 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -153,6 +153,7 @@ static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query); #ifdef __TBASE__ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_context *sc_context); static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); +static bool pgxc_is_simple_subquery(Query *subquery); static bool pgxc_FQS_check_subquery_const(Query *query); #endif /* @@ -339,7 +340,8 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) { /* For anything, other than a table, we can't find the datanodes */ #ifdef __TBASE__ - if (rte->relkind != RELKIND_RELATION && rte->relkind != RELKIND_PARTITIONED_TABLE) + if (rte->relkind != RELKIND_RELATION && + rte->relkind != RELKIND_PARTITIONED_TABLE) { return NULL; } @@ -364,7 +366,8 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) * all partitioned tables should have the same distribution, try to * get execution datanodes */ - if (rte->inh && has_subclass(rte->relid) && rte->relkind != RELKIND_PARTITIONED_TABLE) + if (rte->inh && has_subclass(rte->relid) && + rte->relkind != RELKIND_PARTITIONED_TABLE) { return NULL; } @@ -379,23 +382,54 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) case RTE_SUBQUERY: #ifdef __TBASE__ { - Query *subquery = rte->subquery; + Query *subquery = rte->subquery; + ExecNodes *exec_nodes = NULL; /* - * Current we only consider the case if subquery only contains - * constant values. If so, we can treat them as replicated RTE. + * Consider the case if subquery only contains constant values. + * If so, we can treat them as replicated RTE. */ if (enable_subquery_shipping && pgxc_FQS_check_subquery_const(subquery)) { - ExecNodes *exec_nodes = makeNode(ExecNodes); + exec_nodes = makeNode(ExecNodes); exec_nodes->baselocatortype = LOCATOR_TYPE_REPLICATED; - /* No locate info stored for such subquery RTEs, we use this - * flag to force using the other hand locate info */ + /* + * No locate info stored for such subquery RTEs, we use this + * flag to force using the other hand locate info. + */ exec_nodes->const_subquery = true; return exec_nodes; } + + /* Try to process exec_nodes for simple Subquery */ + if (enable_subquery_shipping && + pgxc_is_simple_subquery(subquery)) + { + Bitmapset *relids = NULL; + + /* Recurse into the subquery to find executable datanodes. */ + exec_nodes = pgxc_FQS_find_datanodes_recurse((Node *)subquery->jointree, + subquery, &relids); + + /* Clean up the relids used in recursion function */ + bms_free(relids); + relids = NULL; + + /* + * Currently we only support Subquery push down to single DN. + * Multiple DN pushdown will have cross-phase issues between + * main query and subquery, it needs more complicate + * calculation. So we just skip the case by now. 
+ */ + if (exec_nodes && exec_nodes->nodeList && + (list_length(exec_nodes->nodeList) == 1)) + return exec_nodes; + else + return NULL; + } + return NULL; } #endif @@ -1767,7 +1801,59 @@ pgxc_is_shard_in_same_group(Var *var1, Var *var2, List *rtable) return result; } + +/* + * Check is the subquery is simple enough to pushdown to DN + */ +static bool +pgxc_is_simple_subquery(Query *query) +{ + /* + * Let's just make sure it's a valid select ... + */ + if (!IsA(query, Query) || query->commandType != CMD_SELECT) + return false; + + /* + * Can't currently pushdown a query with setops (unless it's simple UNION + * ALL, which is handled by a different code path). + */ + if (query->setOperations) + return false; + + /* + * Can't pushdown a subquery involving grouping, aggregation, SRFs, + * sorting, limiting, or WITH. + */ + if (query->hasAggs || + query->hasWindowFuncs || + query->hasTargetSRFs || + query->groupClause || + query->groupingSets || + query->havingQual || + query->sortClause || + query->distinctClause || + query->limitOffset || + query->limitCount || + query->hasForUpdate || + query->cteList) + return false; + + /* + * Don't pushdown a subquery that has any volatile functions in its + * targetlist. Otherwise we might introduce multiple evaluations of these + * functions, if they get copied to multiple places in the upper query, + * leading to surprising results. (Note: the PlaceHolderVar mechanism + * doesn't quite guarantee single evaluation; else we could pull up anyway + * and just wrap such items in PlaceHolderVars ...) + */ + if (contain_volatile_functions((Node *) query->targetList)) + return false; + + return true; +} #endif + /* * Returns whether or not the rtable (and its subqueries) * only contain pg_catalog entries. @@ -1794,7 +1880,6 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } - /* * pgxc_is_query_shippable * This function calls the query walker to analyse the query to gather @@ -1807,60 +1892,98 @@ pgxc_query_contains_only_pg_catalog(List *rtable) */ ExecNodes * pgxc_is_query_shippable(Query *query, int query_level) -{// #lizard forgives - Shippability_context sc_context; - ExecNodes *exec_nodes; - bool canShip = true; - Bitmapset *shippability; +{ + Shippability_context sc_context; + ExecNodes *exec_nodes; + bool canShip = true; + Bitmapset *shippability; - memset(&sc_context, 0, sizeof(sc_context)); - /* let's assume that by default query is shippable */ - sc_context.sc_query = query; - sc_context.sc_query_level = query_level; - sc_context.sc_for_expr = false; + memset(&sc_context, 0, sizeof(sc_context)); + /* let's assume that by default query is shippable */ + sc_context.sc_query = query; + sc_context.sc_query_level = query_level; + sc_context.sc_for_expr = false; - /* - * We might have already decided not to ship the query to the Datanodes, but - * still walk it anyway to find out if there are any subqueries which can be - * shipped. - */ - pgxc_shippability_walker((Node *)query, &sc_context); + /* + * We might have already decided not to ship the query to the Datanodes, but + * still walk it anyway to find out if there are any subqueries which can be + * shipped. + */ + pgxc_shippability_walker((Node *)query, &sc_context); - exec_nodes = sc_context.sc_exec_nodes; - /* - * The shippability context contains two ExecNodes, one for the subLinks - * involved in the Query and other for the relation involved in FromClause. - * They are computed at different times while scanning the query. Merge both - * of them if they are both replicated. 
If query doesn't have SubLinks, we - * don't need to consider corresponding ExecNodes. - * PGXC_FQS_TODO: - * Merge the subquery ExecNodes if both of them are replicated. - * The logic to merge node lists with other distribution - * strategy is not clear yet. - */ - if (query->hasSubLinks) - { - if (exec_nodes && IsExecNodesReplicated(exec_nodes) && - sc_context.sc_subquery_en && - IsExecNodesReplicated(sc_context.sc_subquery_en)) - exec_nodes = pgxc_merge_exec_nodes(exec_nodes, - sc_context.sc_subquery_en); - else - exec_nodes = NULL; - } + exec_nodes = sc_context.sc_exec_nodes; + /* + * The shippability context contains two ExecNodes, one for the subLinks + * involved in the Query and other for the relation involved in FromClause. + * They are computed at different times while scanning the query. Merge both + * of them if they are both replicated. If query doesn't have SubLinks, we + * don't need to consider corresponding ExecNodes. + * PGXC_FQS_TODO: + * Merge the subquery ExecNodes if both of them are replicated. + * The logic to merge node lists with other distribution + * strategy is not clear yet. + */ + if (query->hasSubLinks) + { - /* - * Look at the information gathered by the walker in Shippability_context and that - * in the Query structure to decide whether we should ship this query - * directly to the Datanode or not - */ +#ifdef __TBASE__ + int num_fromclause_nodes = 0; + int num_sublink_nodes = 0; - /* - * If the planner was not able to find the Datanodes to the execute the - * query, the query is not completely shippable. So, return NULL - */ - if (!exec_nodes) - return NULL; + /* Get number of DN nodes for Main Query result */ + if (exec_nodes && exec_nodes->nodeList) + { + num_fromclause_nodes = list_length(exec_nodes->nodeList); + } + + /* Get number of DN nodes for Sublink result */ + if (sc_context.sc_subquery_en && sc_context.sc_subquery_en->nodeList) + { + num_sublink_nodes = list_length(sc_context.sc_subquery_en->nodeList); + } + + /* + * Try to merge sublink nodelist only if: + * XXX Only cover CMD_SELECT + * XXX Both main query and sublink results got single DN node + * XXX With same column distributed type + */ + if (enable_subquery_shipping && + exec_nodes && sc_context.sc_subquery_en && + query->commandType == CMD_SELECT && + IsExecNodesColumnDistributed(exec_nodes) && + IsExecNodesColumnDistributed(sc_context.sc_subquery_en) && + exec_nodes->baselocatortype == sc_context.sc_subquery_en->baselocatortype && + (num_fromclause_nodes == 1) && (num_sublink_nodes == 1)) + { + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, sc_context.sc_subquery_en); + } + /* Fall back to PGXC logic that only try with replicated type */ +#endif + else if (exec_nodes && IsExecNodesReplicated(exec_nodes) && + sc_context.sc_subquery_en && + IsExecNodesReplicated(sc_context.sc_subquery_en)) + { + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, sc_context.sc_subquery_en); + } + else + { + exec_nodes = NULL; + } + } + + /* + * Look at the information gathered by the walker in Shippability_context and that + * in the Query structure to decide whether we should ship this query + * directly to the Datanode or not + */ + + /* + * If the planner was not able to find the Datanodes to the execute the + * query, the query is not completely shippable. So, return NULL + */ + if (!exec_nodes) + return NULL; /* Copy the shippability reasons. We modify the copy for easier handling. 
* The original can be saved away */ diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index dac8f826..fe4df6e7 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -10613,17 +10613,12 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) * corresponding subquery RTE has its eref set to view name. * The remote query generated has this subquery of which the * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ + * be possible to refer to this subquery object + * We've finished the alias print here, no need to set printalias + * again. + */ appendStringInfo(buf, " %s", quote_identifier(rte->eref->aliasname)); - - /* - * For 'dual' rte, the aliasname is also 'dual', print alias will - * lead to syntax error. - */ - if (strcmp(rte->eref->aliasname, "dual") != 0) - printalias = true; } #endif else if (rte->rtekind == RTE_FUNCTION) diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index e3e73168..ea10b9ad 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1637,8 +1637,76 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele 1 | sz | 2 | 1 | sz | 2 (3 rows) +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.30 rows=2 width=40) + -> Nested Loop Semi Join (cost=100.00..142.30 rows=2 width=40) + Join Filter: (t1.c = t2.c) + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) + Filter: (id = 1) + -> Materialize (cost=100.00..121.07 rows=4 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=4 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(9 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=21.02..44.22 rows=1 width=40) + Filter: ((id = 1) AND (c = $0)) + InitPlan 1 (returns $0) + -> Limit (cost=21.02..21.02 rows=1 width=4) + -> Sort (cost=21.02..21.03 rows=4 width=4) + Sort Key: t2.c + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(10 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs 
t1 (cost=21.02..44.22 rows=1 width=40) + Filter: ((id = 1) AND (c = $0)) + InitPlan 1 (returns $0) + -> Aggregate (cost=21.01..21.02 rows=1 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(8 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index bdb9c02a..14721a76 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -283,9 +283,17 @@ insert into subquery_fqs values(1,'sz', 2); explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); From a4f370e4aa3afb72354502ccd1cbbacdddf66be7 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 4 Oct 2020 11:16:43 +0800 Subject: [PATCH 070/578] fix gtm coredump due to initilization failure. 
--- src/gtm/main/main.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 03618ffe..78fbcff5 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -1092,28 +1092,28 @@ main(int argc, char *argv[]) } } while(max_retry_times > 0); - if(ret) - { - elog(FATAL, "GTM_StoreMasterInit failed too many times exit, %s", strerror(errno)); - } - - if (!gtm_standby_restore_next_gxid()) - { - elog(FATAL, "Failed to restore next/last gxid from the active-GTM."); - } - elog(LOG, "Restoring next/last gxid from the active-GTM succeeded."); + if(ret) + { + elog(FATAL, "GTM_StoreMasterInit failed too many times \"%s\", exit", strerror(errno)); + } + + if (!gtm_standby_restore_next_gxid()) + { + elog(FATAL, "Failed to restore next/last gxid from the active-GTM."); + } + elog(LOG, "Restoring next/last gxid from the active-GTM succeeded."); - if (!gtm_standby_restore_gxid()) - { - elog(FATAL, "Failed to restore all of gxid(s) from the active-GTM."); - } - elog(LOG, "Restoring all of gxid(s) from the active-GTM succeeded."); + if (!gtm_standby_restore_gxid()) + { + elog(FATAL, "Failed to restore all of gxid(s) from the active-GTM."); + } + elog(LOG, "Restoring all of gxid(s) from the active-GTM succeeded."); - if (!gtm_standby_restore_sequence()) - { - elog(FATAL, "Failed to restore sequences from the active-GTM."); - } - elog(LOG, "Restoring sequences from the active-GTM succeeded."); + if (!gtm_standby_restore_sequence()) + { + elog(FATAL, "Failed to restore sequences from the active-GTM."); + } + elog(LOG, "Restoring sequences from the active-GTM succeeded."); #else From df7eab923eaf6d4299e23a4932ab8c5eb05260a6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Sat, 10 Oct 2020 16:53:10 +0800 Subject: [PATCH 071/578] for pooler statistics extension (merge request !112) --- contrib/tbase_pooler_stat/Makefile | 18 + .../tbase_pooler_stat--1.0.sql | 41 ++ .../tbase_pooler_stat--unpackaged--1.0.sql | 8 + contrib/tbase_pooler_stat/tbase_pooler_stat.c | 307 ++++++++++ .../tbase_pooler_stat.control | 5 + src/backend/libpq/pqformat.c | 17 +- src/backend/pgxc/pool/poolmgr.c | 545 +++++++++++++++--- src/include/libpq/pqformat.h | 1 + src/include/pgxc/poolmgr.h | 79 ++- 9 files changed, 913 insertions(+), 108 deletions(-) create mode 100644 contrib/tbase_pooler_stat/Makefile create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat.c create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat.control diff --git a/contrib/tbase_pooler_stat/Makefile b/contrib/tbase_pooler_stat/Makefile new file mode 100644 index 00000000..ed49584d --- /dev/null +++ b/contrib/tbase_pooler_stat/Makefile @@ -0,0 +1,18 @@ +# contrib/tbase_pooler_stat/Makefile + +MODULE_big = tbase_pooler_stat +OBJS = tbase_pooler_stat.o + +EXTENSION = tbase_pooler_stat +DATA = tbase_pooler_stat--1.0.sql tbase_pooler_stat--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/tbase_pooler_stat +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql new file mode 100644 index 00000000..5ee8e1e6 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql @@ -0,0 +1,41 @@ +/* contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_pooler_stat" to load this file. \quit + +-- Register functions. +CREATE OR REPLACE FUNCTION tbase_get_pooler_cmd_statistics( + OUT command_type text, + OUT request_times int8, + OUT avg_costtime int8, + OUT max_costtime int8, + OUT min_costtime int8 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + + +CREATE OR REPLACE FUNCTION tbase_reset_pooler_cmd_statistics() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics( + OUT database name, + OUT user_name name, + OUT node_name name, + OUT oid Oid, + OUT is_coord bool, + OUT conn_cnt int4, + OUT free_cnt int4, + OUT warming_cnt int4, + OUT query_cnt int4, + OUT exceed_keepalive_cnt int4, + OUT exceed_deadtime_cnt int4, + OUT exceed_maxlifetime_cnt int4 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql new file mode 100644 index 00000000..86f8cb72 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql @@ -0,0 +1,8 @@ +/* contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_pooler_stat FROM unpackaged" to load this file. \quit + +ALTER EXTENSION tbase_pooler_stat ADD function tbase_get_pooler_cmd_statistics(); +ALTER EXTENSION tbase_pooler_stat ADD function tbase_reset_pooler_cmd_statistics(); +ALTER EXTENSION tbase_pooler_stat ADD function tbase_get_pooler_conn_statistics(); \ No newline at end of file diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c new file mode 100644 index 00000000..d85e5405 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c @@ -0,0 +1,307 @@ +/* + * contrib/tbase_pooler_stat/tbase_pooler_stat.c + * + * tbase_pooler_stat.c + * + * Copyright (c) 2020 Tbase Kernel Group + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose, without fee, and without a + * written agreement is hereby granted, provided that the above + * copyright notice and this paragraph and the following two + * paragraphs appear in all copies. + * + * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, + * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING + * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS + * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS + * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, + * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+ */ + +#include "postgres.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include +#include "pgxc/poolmgr.h" +#include "libpq/pqformat.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(tbase_get_pooler_cmd_statistics); +PG_FUNCTION_INFO_V1(tbase_reset_pooler_cmd_statistics); +PG_FUNCTION_INFO_V1(tbase_get_pooler_conn_statistics); + +typedef struct +{ + uint32 currIdx; /* current handle item id */ + PoolerCmdStatistics *buf; /* a fixed length buf store the result */ +} Pooler_CmdState; + +typedef struct +{ + uint32 total_node_cursor; /* total connection nodes count */ + const char *database; /* node_cursor's database */ + const char *username; /* node_cursor's username */ + uint32 node_cursor; /* current handle node cursor */ + StringInfo buf; /* a stringInfo buf store the result */ +} Pooler_ConnState; + + +/* the g_pooler_cmd_name_tab and g_pooler_cmd must be in the same order */ +static char *g_pooler_cmd_name_tab[POOLER_CMD_COUNT] = +{ + "ABORT", /* ABORT */ + "FIRE_TRANSACTION_BLOCK", /* Fire transaction-block commands on given nodes */ + "CONNECT", /* CONNECT */ + "DISCONNECT", /* DISCONNECT */ + "CLEAN_CONN", /* CLEAN CONNECTION */ + "GET_CONN", /* GET CONNECTIONS */ + "CANCEL_SQL", /* Cancel SQL Command in progress on specified connections */ + "LOCK_UNLOCK_POOLER", /* Lock/unlock pooler */ + "RELOAD_CONN", /* Reload connection info */ + "PING_CONN", /* Ping connection info */ + "CHECK_CONN", /* Check connection info consistency */ + "RELEASE_CONN", /* RELEASE CONNECTIONS */ + "REFRESH_CONN", /* Refresh connection info */ + "SESSION_RELATED", /* Session-related COMMAND */ + "CLOSE_POOLER_CONN", /* Close pooler connections*/ + "GET_CMD_STATSTICS", /* Get command statistics */ + "RESET_CMD_STATISTICS", /* Reset command statistics */ + "GET_CONN_STATISTICS" /* Get connection statistics */ +}; + +/* + * get pooler command statistics + */ +Datum +tbase_get_pooler_cmd_statistics(PG_FUNCTION_ARGS) +{ +#define LIST_POOLER_CMD_STATISTICS_COLUMNS 5 + FuncCallContext *funcctx; + int32 ret = 0; + Pooler_CmdState *status = NULL; + Datum values[LIST_POOLER_CMD_STATISTICS_COLUMNS]; + bool nulls[LIST_POOLER_CMD_STATISTICS_COLUMNS]; + HeapTuple tuple; + Datum result; + PoolerCmdStatistics stat_info; + int size = sizeof(PoolerCmdStatistics) * POOLER_CMD_COUNT; + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(LIST_POOLER_CMD_STATISTICS_COLUMNS, false); + + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "command_type", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "request_times", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "avg_costtime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "max_costtime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "min_costtime", + INT8OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + status = (Pooler_CmdState*) palloc(sizeof(Pooler_CmdState)); + status->currIdx = 0; + status->buf = (PoolerCmdStatistics*) palloc(size); + + funcctx->user_fctx = (void*) status; + + ret = PoolManagerGetCmdStatistics((char*)status->buf, size); + if (ret) + { + elog(ERROR, "get pooler cmd statictics info from pooler failed"); + } + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = 
SRF_PERCALL_SETUP(); + status = (Pooler_CmdState *) funcctx->user_fctx; + + while (status->currIdx < POOLER_CMD_COUNT) + { + stat_info.total_request_times = be64toh(status->buf[status->currIdx].total_request_times); + stat_info.total_costtime = be64toh(status->buf[status->currIdx].total_costtime); + stat_info.max_costtime = be64toh(status->buf[status->currIdx].max_costtime); + stat_info.min_costtime = be64toh(status->buf[status->currIdx].min_costtime); + + /* avg_costtime */ + stat_info.avg_costtime = (stat_info.total_request_times == 0) ? 0 : (stat_info.total_costtime / stat_info.total_request_times); + + values[0] = CStringGetTextDatum(g_pooler_cmd_name_tab[status->currIdx]); + values[1] = Int64GetDatum(stat_info.total_request_times); + values[2] = Int64GetDatum(stat_info.avg_costtime); + values[3] = Int64GetDatum(stat_info.max_costtime); + values[4] = Int64GetDatum(stat_info.min_costtime); + + status->currIdx++; + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * reset pooler command statistics + */ +Datum +tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS) +{ + PoolManagerResetCmdStatistics(); + + PG_RETURN_VOID(); +} + +/* + * get pooler connections statistics + */ +Datum +tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) +{ +#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12 + FuncCallContext *funcctx = NULL; + int32 ret = 0; + Pooler_ConnState *status = NULL; + Datum values[LIST_POOLER_CONN_STATISTICS_COLUMNS]; + bool nulls[LIST_POOLER_CONN_STATISTICS_COLUMNS]; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + /* content will destroy in SRF_RETURN_DONE */ + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(LIST_POOLER_CONN_STATISTICS_COLUMNS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "database", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "user_name", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "node_name", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "oid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_coord", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "conn_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "free_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "warming_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "query_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", + INT4OID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + status = (Pooler_ConnState*) palloc(sizeof(Pooler_ConnState)); + status->database = NULL; + status->username = NULL; + status->node_cursor = 0; + status->buf = makeStringInfo(); + + funcctx->user_fctx = (void*) status; + + ret = PoolManagerGetConnStatistics(status->buf); + if (ret) + { + elog(ERROR, "get pooler conn statictics info from pooler failed"); + } + else + { + status->total_node_cursor = pq_getmsgint(status->buf, sizeof(uint32)); + } + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = (Pooler_ConnState 
*) funcctx->user_fctx; + + while (status->total_node_cursor) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (status->node_cursor == 0) + { + /* get next database and username */ + status->database = pq_getmsgstring(status->buf); + status->username = pq_getmsgstring(status->buf); + status->node_cursor = pq_getmsgint(status->buf, sizeof(uint32)); + } + + values[0] = CStringGetDatum(status->database); + values[1] = CStringGetDatum(status->username); + if (status->node_cursor == 0) + { + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + nulls[10] = true; + nulls[11] = true; + } + else + { + values[2] = CStringGetDatum(pq_getmsgstring(status->buf)); + values[3] = ObjectIdGetDatum(pq_getmsgint(status->buf, sizeof(Oid))); + values[4] = BoolGetDatum(pq_getmsgint(status->buf, sizeof(bool))); + values[5] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[6] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[7] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + status->node_cursor--; + } + + status->total_node_cursor--; + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} \ No newline at end of file diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.control b/contrib/tbase_pooler_stat/tbase_pooler_stat.control new file mode 100644 index 00000000..ae727b91 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.control @@ -0,0 +1,5 @@ +# tbase_pooler_stat extension +comment = 'pooler statistics' +default_version = '1.0' +module_pathname = '$libdir/tbase_pooler_stat' +relocatable = true diff --git a/src/backend/libpq/pqformat.c b/src/backend/libpq/pqformat.c index d2ba7d1b..2005bbd3 100644 --- a/src/backend/libpq/pqformat.c +++ b/src/backend/libpq/pqformat.c @@ -578,7 +578,22 @@ pq_copymsgbytes(StringInfo msg, char *buf, int datalen) } /* -------------------------------- - * pq_getmsgtext - get a counted text string (with conversion) + * pq_updatemsgbytes - update the content of the specified location with buf + * + * -------------------------------- + */ +void +pq_updatemsgbytes(StringInfo msg, int offset, char *buf, int datalen) +{ + if (datalen < 0 || offset < 0 || offset + datalen > msg->len) + ereport(ERROR, + (EPROTO, + errmsg("invalid update data in message"))); + memcpy(&msg->data[offset], buf, datalen); +} + +/* -------------------------------- + * pq_getmsgtext - get a counted text string (with conversion) * * Always returns a pointer to a freshly palloc'd result. * The result has a trailing null, *and* we return its strlen in *nbytes. 
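For reference, a minimal usage sketch of the new extension (illustrative only; the functions are those defined in tbase_pooler_stat--1.0.sql above, and the reported numbers depend on pooler activity on the node where it is run):

    CREATE EXTENSION tbase_pooler_stat;
    SELECT * FROM tbase_get_pooler_cmd_statistics();
    SELECT * FROM tbase_get_pooler_conn_statistics();
    SELECT tbase_reset_pooler_cmd_statistics();
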
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 0392d08f..d24a32f4 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -66,6 +66,7 @@ #include "utils/varlena.h" #include "port.h" #include +#include /* the mini use conut of a connection */ #define MINI_USE_COUNT 10 @@ -138,6 +139,33 @@ typedef struct PoolerStatistics PoolerStatistics g_pooler_stat; +/* global command statistics handle */ +PoolerCmdStatistics* g_pooler_cmd_stat = NULL; + +unsigned char g_pooler_cmd[POOLER_CMD_COUNT] = +{ + 'a', /* ABORT */ + 'b', /* Fire transaction-block commands on given nodes */ + 'c', /* CONNECT */ + 'd', /* DISCONNECT */ + 'f', /* CLEAN CONNECTION */ + 'g', /* GET CONNECTIONS */ + 'h', /* Cancel SQL Command in progress on specified connections */ + 'o', /* Lock/unlock pooler */ + 'p', /* Reload connection info */ + 'P', /* Ping connection info */ + 'q', /* Check connection info consistency */ + 'r', /* RELEASE CONNECTIONS */ + 'R', /* Refresh connection info */ + 's', /* Session-related COMMAND */ + 't', /* Close pooler connections*/ + 'x', /* Get command statistics */ + 'y', /* Reset command statistics */ + 'z' /* Get connection statistics */ +}; + +/* a map used to change msgtype to id */ +uint8 g_qtype2id[256]; /* Flag to tell if we are Postgres-XC pooler process */ static bool am_pgxc_pooler = false; @@ -346,25 +374,27 @@ char *poolErrorMsg[] = {"No Error", typedef struct { - int32 cmd; /* refer to handle_agent_input command tag */ - bool bCoord; /* coordinator or datanode*/ - PGXCASyncTaskCtl *taskControl; - PoolAgent *agent; - PGXCNodePool *nodepool; /* node pool for current node */ - PGXCNodePoolSlot *slot; /* connection slot , no need to free */ - int32 current_status; /* currrent connect status*/ - int32 final_status; /* final status we are going to get to*/ - int32 nodeindex; /* node index of the remote peer */ - bool needfree; /* whether need to free taskControl, last thread set the flag */ - - int32 req_seq; /* req sequence number */ - int32 pid; /* pid that acquires the connection */ - bool needConnect; /* check whether we need to build a new connection , we acquire new connections */ - bool error_flag; /* set when error */ - SendSetQueryStatus setquery_status; /* send set query status */ - struct timeval start_time; /* when acquire conn by sync thread, the time begin request */ - struct timeval end_time; /* when acquire conn by sync thread, the time finish request */ - char errmsg[POOLER_ERROR_MSG_LEN]; + int32 cmd; /* refer to handle_agent_input command tag */ + bool bCoord; /* coordinator or datanode*/ + PGXCASyncTaskCtl *taskControl; + PoolAgent *agent; + PGXCNodePool *nodepool; /* node pool for current node */ + PGXCNodePoolSlot *slot; /* connection slot , no need to free */ + int32 current_status; /* currrent connect status*/ + int32 final_status; /* final status we are going to get to*/ + int32 nodeindex; /* node index of the remote peer */ + bool needfree; /* whether need to free taskControl, last thread set the flag */ + + int32 req_seq; /* req sequence number */ + int32 pid; /* pid that acquires the connection */ + bool needConnect; /* check whether we need to build a new connection , we acquire new connections */ + bool error_flag; /* set when error */ + SendSetQueryStatus setquery_status; /* send set query status */ + struct timeval start_time; /* when acquire conn by sync thread, the time begin request */ + struct timeval end_time; /* when acquire conn by sync thread, the time finish request */ + char 
errmsg[POOLER_ERROR_MSG_LEN]; + pg_time_t cmd_start_time; /* command start time, including the processing time in the main process */ + pg_time_t cmd_end_time; /* command end time */ }PGXCPoolAsyncReq; static inline void RebuildAgentIndex(void); @@ -562,6 +592,11 @@ static int handle_close_pooled_connections(PoolAgent * agent, StringInfo s); static void ConnectPoolManager(void); #endif +static void init_pooler_cmd_statistics(void); +static void reset_pooler_cmd_statistics(void); +static void update_pooler_cmd_statistics(unsigned char qtype, uint64 costtime); +static void handle_get_cmd_statistics(PoolAgent *agent); +static void handle_get_conn_statistics(PoolAgent *agent); #define IncreaseSlotRefCount(slot,filename,linenumber)\ do\ @@ -693,8 +728,6 @@ do\ }\ }while(0) - - void PGXCPoolerProcessIam(void) { @@ -1460,6 +1493,95 @@ PoolManagerLock(bool is_lock) RESUME_POOLER_RELOAD(); } +/* + * get pooler command statistics + */ +int +PoolManagerGetCmdStatistics(char *s, int size) +{ + int qtype = 0; + char msgtype = 'x'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + qtype = pool_getbyte(&poolHandle->port); + if (qtype == EOF || (unsigned char)qtype != msgtype) + { + elog(ERROR, POOL_MGR_PREFIX"get command statistics error, qtype:%d", qtype); + RESUME_POOLER_RELOAD(); + return -1; + } + + /* get all command statistics messages */ + pool_getbytes(&poolHandle->port, s, size); + + RESUME_POOLER_RELOAD(); + return 0; +} + +/* + * reset command statistics + */ +void +PoolManagerResetCmdStatistics(void) +{ + char msgtype = 'y'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + RESUME_POOLER_RELOAD(); +} + +/* + * get pooler connections statistics + */ +int +PoolManagerGetConnStatistics(StringInfo s) +{ + int qtype = 0; + char msgtype = 'z'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + qtype = pool_getbyte(&poolHandle->port); + if (qtype == EOF || (unsigned char)qtype != msgtype) + { + elog(ERROR, POOL_MGR_PREFIX"get conn statistics error, qtype:%d", qtype); + RESUME_POOLER_RELOAD(); + return -1; + } + + /* get all the messages left */ + pool_getmessage(&poolHandle->port, s, 0); + + RESUME_POOLER_RELOAD(); + return 0; +} + /* * Init PoolAgent */ @@ -1953,6 +2075,16 @@ PoolManagerReloadConnectionInfo(void) pool_flush(&poolHandle->port); } +/* + * get systime time, ms + */ +static pg_time_t +get_system_time() +{ + struct timeb t; + ftime(&t); + return 1000 * t.time + t.millitm; +} /* * Handle messages to agent @@ -5097,7 +5229,8 @@ PoolerLoop(void) pool_fd[i].events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; } - reset_pooler_statistics(); + reset_pooler_statistics(); + init_pooler_cmd_statistics(); for (;;) { @@ -8043,9 +8176,12 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolConnectStaus_destory; - } + /* also use this request to response to session*/ + req->final_status = PoolConnectStaus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } 
if (PoolConnectDebugPrint) { @@ -8081,8 +8217,16 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, snprintf(agent->port.err_msg, POOL_ERR_MSG_LEN, "%s", poolErrorMsg[agent->port.error_code]); SpinLockRelease(&agent->port.lock); #endif + } + else + { + if (dispatched) + { + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; + } } - return ret; + return ret; } @@ -8116,31 +8260,43 @@ static inline bool dispatch_local_set_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolLocalSetStatus_destory; - req->current_status = PoolLocalSetStatus_destory; - } - + /* also use this request to response to session*/ + req->final_status = PoolLocalSetStatus_destory; + req->current_status = PoolLocalSetStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } + if (PoolConnectDebugPrint) { elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set nodeindex:%d connection, current status:%d final status:%d", agent->pid, nodeindex, req->current_status, req->final_status); } - if (dispatched) + if (dispatched) + { + if (PoolConnectDebugPrint) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch last local set request!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + } + } + ret = dispatch_async_network_operation(req); + if (!ret) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set request failed!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + pfree(req); + } + else { - if (PoolConnectDebugPrint) + if (dispatched) { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch last local set request!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; } } - ret = dispatch_async_network_operation(req); - if (!ret) - { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set request failed!! 
nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); - pfree(req); - } - return ret; + + return ret; } static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, @@ -8183,10 +8339,13 @@ static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolSetCommandStatus_destory; - req->current_status = PoolSetCommandStatus_destory; - } + /* also use this request to response to session*/ + req->final_status = PoolSetCommandStatus_destory; + req->current_status = PoolSetCommandStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } if (PoolConnectDebugPrint) { @@ -8207,14 +8366,22 @@ static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, } } - ret = dispatch_async_network_operation(req); - if (!ret) + ret = dispatch_async_network_operation(req); + if (!ret) + { + if (slot) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async set command request failed!! nodeindex:%d connection nodename:%s backend_pid:%d current status:%d final status:%d request_num:%d command:%s", agent->pid, nodeindex, slot->node_name, slot->backend_pid, req->current_status, req->final_status, taskControl->m_mumber_total, taskControl->m_command); + } + pfree(req); + } + else { - if (slot) + if (dispatched) { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async set command request failed!! nodeindex:%d connection nodename:%s backend_pid:%d current status:%d final status:%d request_num:%d command:%s", agent->pid, nodeindex, slot->node_name, slot->backend_pid, req->current_status, req->final_status, taskControl->m_mumber_total, taskControl->m_command); + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; } - pfree(req); } if (PoolConnectDebugPrint) @@ -8268,34 +8435,37 @@ static inline bool dispatch_cancle_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* use this request to response to session*/ - req->current_status = PoolCancelStatus_destory; - req->final_status = PoolCancelStatus_destory; - } - - if (bCoord) - { - slot = agent->coord_connections[nodeindex]; - - } - else - { - slot = agent->dn_connections[nodeindex]; - } - - if (PoolConnectDebugPrint) - { - if (slot) - { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async CANCLE_QUERY nodeindex:%d connection, nodename:%s backend_pid:%d current status:%d final status:%d", - agent->pid, - nodeindex, - slot->node_name, - slot->backend_pid, - req->current_status, - req->final_status); - } - } + /* use this request to response to session*/ + req->current_status = PoolCancelStatus_destory; + req->final_status = PoolCancelStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } + + if (bCoord) + { + slot = agent->coord_connections[nodeindex]; + + } + else + { + slot = agent->dn_connections[nodeindex]; + } + + if (PoolConnectDebugPrint) + { + if (slot) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async CANCLE_QUERY nodeindex:%d connection, nodename:%s backend_pid:%d current status:%d final status:%d", + agent->pid, + nodeindex, + slot->node_name, + slot->backend_pid, + req->current_status, + req->final_status); + } + } if (dispatched) { @@ -8336,8 +8506,17 @@ static 
inline bool dispatch_cancle_request(PGXCASyncTaskCtl *taskControl, snprintf(agent->port.err_msg, POOL_ERR_MSG_LEN, "%s", poolErrorMsg[agent->port.error_code]); SpinLockRelease(&agent->port.lock); #endif + } + else + { + if (dispatched) + { + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; + } } - return ret; + + return ret; } @@ -10761,3 +10940,211 @@ ConnectPoolManager(void) } #endif + +/* + * init pooler command statistics + */ +static void +init_pooler_cmd_statistics(void) +{ + int i = 0; + memset(g_qtype2id, -1, sizeof(g_qtype2id)); + + /* init type to id map */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_qtype2id[g_pooler_cmd[i]] = i; + } + + /* init global statistics array */ + g_pooler_cmd_stat = (PoolerCmdStatistics*) palloc(POOLER_CMD_COUNT * sizeof(PoolerCmdStatistics)); + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_pooler_cmd_stat[i].total_request_times = 0; + g_pooler_cmd_stat[i].total_costtime = 0; + g_pooler_cmd_stat[i].max_costtime = 0; + g_pooler_cmd_stat[i].min_costtime = MAX_UINT64; + } +} + +/* + * reset pooler command statistics + */ +static void +reset_pooler_cmd_statistics(void) +{ + int i = 0; + /* reset global statistics array */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_pooler_cmd_stat[i].total_request_times = 0; + g_pooler_cmd_stat[i].total_costtime = 0; + g_pooler_cmd_stat[i].max_costtime = 0; + g_pooler_cmd_stat[i].min_costtime = MAX_UINT64; + } +} + +/* + * update pooler command statistics info + */ +static void +update_pooler_cmd_statistics(unsigned char qtype, uint64 costtime) +{ + uint8 id = g_qtype2id[qtype]; + if (id == MAX_UINT8) + { + return; + } + + g_pooler_cmd_stat[id].total_request_times += 1; + g_pooler_cmd_stat[id].total_costtime += costtime; + + if (costtime > g_pooler_cmd_stat[id].max_costtime) + { + g_pooler_cmd_stat[id].max_costtime = costtime; + } + + if (costtime < g_pooler_cmd_stat[id].min_costtime) + { + g_pooler_cmd_stat[id].min_costtime = costtime; + } +} + +/* + * handle get command statistics + */ +static void +handle_get_cmd_statistics(PoolAgent *agent) +{ + int i = 0; + uint64 n64 = 0; + char msgtype = 'x'; + + /* response message type */ + pool_putbytes(&agent->port, &msgtype, 1); + + /* fixed length command statistics info */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + n64 = htobe64(g_pooler_cmd_stat[i].total_request_times); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].total_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].max_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].min_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + } + + pool_flush(&agent->port); +} + +/* + * handle get connections statistics + */ +static void +handle_get_conn_statistics(PoolAgent *agent) +{ + DatabasePool *database_pool = databasePools; + HASH_SEQ_STATUS hseq_status; + PGXCNodePool *node_pool = NULL; + + uint32 node_cnt = 0; /* the nodes count use the same database and username */ + uint32 total_node_cnt = 0; /* total nodes count */ + + /* var offset in buf */ + uint32 node_cnt_offset = 0; + uint32 total_node_cnt_offset = 0; + + uint32 exceed_keepalive_cnt = 0; + uint32 exceed_deadtime_cnt = 0; + uint32 exceed_maxlifetime_cnt = 0; + int i = 0; + PGXCNodePoolSlot *slot = NULL; + time_t now = time(NULL); + StringInfoData buf; + + initStringInfo(&buf); + /* reserve a place for total_node_cnt, record the offset 
of total_node_cnt */ + total_node_cnt_offset = buf.len; + pq_sendint(&buf, total_node_cnt, sizeof(uint32)); + + /* total node count | database | username | node count in the same database and username | node pool conn statistics | ... | database | username | ... */ + while (database_pool) + { + pq_sendstring(&buf, database_pool->database); + pq_sendstring(&buf, database_pool->user_name); + + /* reserve a place for node_cnt, record the offset of node_cnt */ + node_cnt = 0; + node_cnt_offset = buf.len; + pq_sendint(&buf, node_cnt, sizeof(uint32)); + + /* traverse all node_pool in hashtable */ + hash_seq_init(&hseq_status, database_pool->nodePools); + while ((node_pool = (PGXCNodePool *) hash_seq_search(&hseq_status))) + { + node_cnt++; + + pq_sendstring(&buf, node_pool->node_name); + pq_sendint(&buf, node_pool->nodeoid, sizeof(Oid)); + pq_sendint(&buf, node_pool->coord, sizeof(bool)); + pq_sendint(&buf, node_pool->size, sizeof(uint32)); + pq_sendint(&buf, node_pool->freeSize, sizeof(uint32)); + pq_sendint(&buf, node_pool->nwarming, sizeof(uint32)); + pq_sendint(&buf, node_pool->nquery, sizeof(uint32)); + + /* reset statistics count */ + exceed_keepalive_cnt = 0; + exceed_deadtime_cnt = 0; + exceed_maxlifetime_cnt = 0; + /* statistical connection life cycle */ + if (node_pool->slot) + { + for (i = 0; i < node_pool->freeSize; i++) + { + slot = node_pool->slot[i]; + if (difftime(now, slot->released) > PoolConnKeepAlive) + { + exceed_keepalive_cnt++; + } + + if (difftime(now, slot->created) > PoolConnDeadtime) + { + exceed_deadtime_cnt++; + } + + if (difftime(now, slot->created) >= PoolConnMaxLifetime) + { + exceed_maxlifetime_cnt++; + } + } + } + + pq_sendint(&buf, exceed_keepalive_cnt, sizeof(uint32)); + pq_sendint(&buf, exceed_deadtime_cnt, sizeof(uint32)); + pq_sendint(&buf, exceed_maxlifetime_cnt, sizeof(uint32)); + } + + + total_node_cnt += node_cnt; + + /* change the nodes count in message buff */ + node_cnt = htonl(node_cnt); + pq_updatemsgbytes(&buf, node_cnt_offset, (char*) &node_cnt, sizeof(uint32)); + database_pool = database_pool->next; + } + + /* change the total nodes count in message buff */ + total_node_cnt = htonl(total_node_cnt); + pq_updatemsgbytes(&buf, total_node_cnt_offset, (char*) &total_node_cnt, sizeof(uint32)); + + /* send messages */ + pool_putmessage(&agent->port, 'z', buf.data, buf.len); + pool_flush(&agent->port); + + pfree(buf.data); +} diff --git a/src/include/libpq/pqformat.h b/src/include/libpq/pqformat.h index bc1cb48f..0714f261 100644 --- a/src/include/libpq/pqformat.h +++ b/src/include/libpq/pqformat.h @@ -42,6 +42,7 @@ extern float4 pq_getmsgfloat4(StringInfo msg); extern float8 pq_getmsgfloat8(StringInfo msg); extern const char *pq_getmsgbytes(StringInfo msg, int datalen); extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen); +extern void pq_updatemsgbytes(StringInfo msg, int offset, char *buf, int datalen); extern char *pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes); extern const char *pq_getmsgstring(StringInfo msg); extern const char *pq_getmsgrawstring(StringInfo msg); diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index bf90897e..c0d996d8 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -186,34 +186,36 @@ typedef struct PGXCASyncTaskCtl */ typedef struct { - /* Process ID of postmaster child process associated to pool agent */ - int pid; - /* communication channel */ - PoolPort port; - DatabasePool *pool; - MemoryContext mcxt; - int num_dn_connections; - int 
num_coord_connections; - Oid *dn_conn_oids; /* one for each Datanode */ - Oid *coord_conn_oids; /* one for each Coordinator */ - PGXCNodePoolSlot **dn_connections; /* one for each Datanode */ - PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */ - - char *session_params; - char *local_params; - List *session_params_list; /* session param list */ - List *local_params_list; /* local param list */ - - bool is_temp; /* Temporary objects used for this pool session? */ - - int query_count; /* query count, if exceed, need to reconnect database */ - bool breconnecting; /* whether we are reconnecting */ - int agentindex; - - - bool destory_pending; /* whether we have been ordered to destory */ - int32 ref_count; /* reference count */ - PGXCASyncTaskCtl *task_control; /* in error situation, we need to free the task control */ + /* Process ID of postmaster child process associated to pool agent */ + int pid; + /* communication channel */ + PoolPort port; + DatabasePool *pool; + MemoryContext mcxt; + int num_dn_connections; + int num_coord_connections; + Oid *dn_conn_oids; /* one for each Datanode */ + Oid *coord_conn_oids; /* one for each Coordinator */ + PGXCNodePoolSlot **dn_connections; /* one for each Datanode */ + PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */ + + char *session_params; + char *local_params; + List *session_params_list; /* session param list */ + List *local_params_list; /* local param list */ + + bool is_temp; /* Temporary objects used for this pool session? */ + + int query_count; /* query count, if exceed, need to reconnect database */ + bool breconnecting; /* whether we are reconnecting */ + int agentindex; + + + bool destory_pending; /* whether we have been ordered to destory */ + int32 ref_count; /* reference count */ + PGXCASyncTaskCtl *task_control; /* in error situation, we need to free the task control */ + + pg_time_t cmd_start_time; /* command start time */ } PoolAgent; /* Handle to the pool manager (Session's side) */ @@ -223,6 +225,23 @@ typedef struct PoolPort port; } PoolHandle; +typedef struct PoolerCmdStatistics +{ + uint64 total_request_times; /* command total request times */ + union + { + uint64 total_costtime; /* total time spent processing commands */ + uint64 avg_costtime; /* avg time spent processing command */ + }; + uint64 max_costtime; /* max time spent processing command */ + uint64 min_costtime; /* min time spent processing command */ +} PoolerCmdStatistics; + + +#define POOLER_CMD_COUNT (18) + + + #define POOLER_ERROR_MSG_LEN 256 extern int MinPoolSize; @@ -349,4 +368,8 @@ extern bool check_persistent_connections(bool *newval, void **extra, extern int PoolManagerRefreshConnectionInfo(void); extern int PoolManagerClosePooledConnections(const char *dbname, const char *username); +extern int PoolManagerGetCmdStatistics(char *s, int size); +extern void PoolManagerResetCmdStatistics(void); +extern int PoolManagerGetConnStatistics(StringInfo s); + #endif From 1adcec8a4e09b7b37cb337f8f74921e86126b6c4 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 15 Oct 2020 17:02:18 +0800 Subject: [PATCH 072/578] add tbase_pooler_stat Makefile --- contrib/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/Makefile b/contrib/Makefile index c1ff5226..9c5df8bf 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -52,7 +52,8 @@ SUBDIRS = \ tsm_system_time \ unaccent \ vacuumlo \ - stormstats + stormstats \ + tbase_pooler_stat ifeq ($(with_openssl),yes) SUBDIRS += sslinfo From 
84a203be06e9b30b40b7866957d9375b26c6fc9d Mon Sep 17 00:00:00 2001 From: yeyukui Date: Mon, 2 Nov 2020 14:13:26 +0800 Subject: [PATCH 073/578] fix bug about "create extension if not exists" --- src/backend/tcop/utility.c | 125 ++++++++++++++-------------- src/include/catalog/objectaddress.h | 8 ++ 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f5d10269..4a8871b6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3853,67 +3853,70 @@ ProcessUtilitySlow(ParseState *pstate, } #endif - /* - * Add the CREATE INDEX node itself to stash right away; - * if there were any commands stashed in the ALTER TABLE - * code, we need them to appear after this one. - */ - EventTriggerCollectSimpleCommand(address, secondaryObject, - parsetree); - commandCollected = true; - EventTriggerAlterTableEnd(); - } - break; - - case T_CreateExtensionStmt: -#ifdef __TBASE__ - { - CreateExtensionStmt *stmt = (CreateExtensionStmt *) parsetree; - char *extension_query_string = NULL; - if (IS_PGXC_LOCAL_COORDINATOR && CREATEEXT_CREATE == stmt->action) - { - StringInfo qstring; - /* stage 1 */ - address = PrepareExtension(pstate, stmt); - - qstring = makeStringInfo(); - initStringInfo(qstring); - - appendStringInfo(qstring, - _("PREPARE %s"), - queryString); - /* Send prepare extension msg to all other cn and dn */ - extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); - - /* stage 2 */ - ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); - resetStringInfo(qstring); - appendStringInfo(qstring, - _("EXECUTE %s"), - queryString); - /* Send execute extension msg to all other cn and dn */ - extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); - - pfree(qstring->data); - pfree(qstring); - } - else if (CREATEEXT_PREPARE == stmt->action) - { - address = PrepareExtension(pstate, stmt); - } - else if (CREATEEXT_EXECUTE == stmt->action) - { - ExecuteExtension(pstate, stmt); - } - else - { - address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree); - } - - break; - } + /* + * Add the CREATE INDEX node itself to stash right away; + * if there were any commands stashed in the ALTER TABLE + * code, we need them to appear after this one. 
+ */ + EventTriggerCollectSimpleCommand(address, secondaryObject, + parsetree); + commandCollected = true; + EventTriggerAlterTableEnd(); + } + break; + + case T_CreateExtensionStmt: +#ifdef __TBASE__ + { + CreateExtensionStmt *stmt = (CreateExtensionStmt *) parsetree; + char *extension_query_string = NULL; + if (IS_PGXC_LOCAL_COORDINATOR && CREATEEXT_CREATE == stmt->action) + { + StringInfo qstring; + /* stage 1 */ + address = PrepareExtension(pstate, stmt); + + if (ObjectAddressIsEqual(InvalidObjectAddress, address)) + break; + + qstring = makeStringInfo(); + initStringInfo(qstring); + + appendStringInfo(qstring, + _("PREPARE %s"), + queryString); + /* Send prepare extension msg to all other cn and dn */ + extension_query_string = qstring->data; + ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + + /* stage 2 */ + ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); + resetStringInfo(qstring); + appendStringInfo(qstring, + _("EXECUTE %s"), + queryString); + /* Send execute extension msg to all other cn and dn */ + extension_query_string = qstring->data; + ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + + pfree(qstring->data); + pfree(qstring); + } + else if (CREATEEXT_PREPARE == stmt->action) + { + address = PrepareExtension(pstate, stmt); + } + else if (CREATEEXT_EXECUTE == stmt->action) + { + ExecuteExtension(pstate, stmt); + } + else + { + address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree); + } + + break; + } #endif case T_AlterExtensionStmt: address = ExecAlterExtensionStmt(pstate, (AlterExtensionStmt *) parsetree); diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index edfa0219..0d80f74c 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -90,6 +90,14 @@ typedef struct ObjectAddress extern const ObjectAddress InvalidObjectAddress; +/* + * Compare whether two ObjectAddress are the same + */ +#define ObjectAddressIsEqual(addr1, addr2) \ + ((addr1).classId == (addr2).classId && \ + (addr1).objectId == (addr2).objectId && \ + (addr1).objectSubId == (addr2).objectSubId) + #define ObjectAddressSubSet(addr, class_id, object_id, object_sub_id) \ do { \ (addr).classId = (class_id); \ From 2ab5df38308972addcc577c5c37ae8659d6a2298 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Wed, 4 Nov 2020 11:46:44 +0800 Subject: [PATCH 074/578] * add a parameter to support pg_dumpall dump security data * fix bug about 'create extension if not exists' --- src/bin/pg_dump/pg_dumpall.c | 637 +++++++++++++++++++++-------------- 1 file changed, 377 insertions(+), 260 deletions(-) diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 6354cdde..6d885bc7 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -27,6 +27,7 @@ /* version string we expect back from pg_dump */ #define PGDUMP_VERSIONSTR "pg_dump (PostgreSQL) " PG_VERSION "\n" +#define PGDUM_SERCURITY_VERSIONSTR "pg_dump_security (TBase) " PG_VERSION "\n" static void help(void); @@ -48,7 +49,9 @@ static void makeAlterConfigCommand(PGconn *conn, const char *arrayitem, static void dumpDatabases(PGconn *conn); static void dumpTimestamp(const char *msg); -static int runPgDump(const char *dbname); +static int runPgDump(const char *dbname); +static int runPgDumpSecurity(PGconn *conn, const char *pghost, const char *pgport, + const char *pguser, trivalue 
prompt_password); static void buildShSecLabels(PGconn *conn, const char *catalog_name, uint32 objectId, PQExpBuffer buffer, const char *target, const char *objname); @@ -64,8 +67,10 @@ static void dumpNodeGroups(PGconn *conn); #endif /* PGXC */ static char pg_dump_bin[MAXPGPATH]; +static char pg_dump_security_bin[MAXPGPATH]; static const char *progname; static PQExpBuffer pgdumpopts; +static PQExpBuffer pgdumpsecurityopts; static char *connstr = ""; static bool skip_acls = false; static bool verbose = false; @@ -99,6 +104,10 @@ static int include_nodes = 0; #endif /* PGXC */ #define exit_nicely(code) exit(code) +#ifdef __TBASE__ +static int dump_security_data = 0; +#endif + int main(int argc, char *argv[]) {// #lizard forgives @@ -152,270 +161,308 @@ main(int argc, char *argv[]) {"dump-nodes", no_argument, &dump_nodes, 1}, //{"include-nodes", no_argument, &include_nodes, 1}, #endif - {NULL, 0, NULL, 0} - }; - - char *pghost = NULL; - char *pgport = NULL; - char *pguser = NULL; - char *pgdb = NULL; - char *use_role = NULL; - trivalue prompt_password = TRI_DEFAULT; - bool data_only = false; - bool globals_only = false; - bool output_clean = false; - bool roles_only = false; - bool tablespaces_only = false; - PGconn *conn; - int encoding; - const char *std_strings; - int c, - ret; - int optindex; - - set_pglocale_pgservice( argv[0], PG_TEXTDOMAIN("pg_dump") ); - - progname = get_progname( argv[0] ); - - if (argc > 1) - { - if ( strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) - { - help(); - exit_nicely(0); - } - if ( strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) - { - puts("pg_dumpall(PostgreSQL) " PG_VERSION); - exit_nicely(0); - } - } - - if ((ret = find_other_exec(argv[0], "pg_dump", PGDUMP_VERSIONSTR, - pg_dump_bin)) < 0) - { - char full_path[MAXPGPATH]; - - if (find_my_exec(argv[0], full_path) < 0) - strlcpy(full_path, progname, sizeof(full_path)); - - if (ret == -1) - fprintf(stderr, - _("The program \"pg_dump\" is needed by %s " - "but was not found in the\n" - "same directory as \"%s\".\n" - "Check your installation.\n"), - progname, full_path); - else - fprintf(stderr, - _("The program \"pg_dump\" was found by \"%s\"\n" - "but was not the same version as %s.\n" - "Check your installation.\n"), - full_path, progname); - exit_nicely(1); - } - - pgdumpopts = createPQExpBuffer(); - - while ((c = getopt_long(argc, argv, "acd:f:gh:l:oOp:rsS:tuU:vwWx", long_options, &optindex)) != -1) - { - switch (c) - { - case 'a': - data_only = true; - appendPQExpBufferStr(pgdumpopts, " -a"); - break; - - case 'c': - output_clean = true; - break; - - case 'd': - connstr = pg_strdup(optarg); - break; - - case 'f': - filename = pg_strdup(optarg); - appendPQExpBufferStr(pgdumpopts, " -f "); - appendShellString(pgdumpopts, filename); - break; - case 'g': - globals_only = true; - break; - - case 'h': - pghost = pg_strdup(optarg); - break; - - case 'l': - pgdb = pg_strdup(optarg); - break; - - case 'o': - appendPQExpBufferStr(pgdumpopts, " -o"); - break; - - case 'O': - appendPQExpBufferStr(pgdumpopts, " -O"); - break; - - case 'p': - pgport = pg_strdup(optarg); - break; - - case 'r': - roles_only = true; - break; - - case 's': - appendPQExpBufferStr(pgdumpopts, " -s"); - break; - - case 'S': - appendPQExpBufferStr(pgdumpopts, " -S "); - appendShellString(pgdumpopts, optarg); - break; - - case 't': - tablespaces_only = true; - break; - +#ifdef __TBASE__ + {"dump-security-data", no_argument, &dump_security_data, 1}, +#endif + {NULL, 0, NULL, 0} + }; + + char *pghost = NULL; + 
char *pgport = NULL; + char *pguser = NULL; + char *pgdb = NULL; + char *use_role = NULL; + trivalue prompt_password = TRI_DEFAULT; + bool data_only = false; + bool globals_only = false; + bool output_clean = false; + bool roles_only = false; + bool tablespaces_only = false; + PGconn *conn; + int encoding; + const char *std_strings; + int c, + ret; + int optindex; + + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump")); + + progname = get_progname(argv[0]); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + help(); + exit_nicely(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_dumpall (PostgreSQL) " PG_VERSION); + exit_nicely(0); + } + } + + if ((ret = find_other_exec(argv[0], "pg_dump", PGDUMP_VERSIONSTR, + pg_dump_bin)) < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv[0], full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (ret == -1) + fprintf(stderr, + _("The program \"pg_dump\" is needed by %s " + "but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.\n"), + progname, full_path); + else + fprintf(stderr, + _("The program \"pg_dump\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.\n"), + full_path, progname); + exit_nicely(1); + } + + if ((ret = find_other_exec(argv[0], "pg_dump_security", PGDUM_SERCURITY_VERSIONSTR, + pg_dump_security_bin)) < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv[0], full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (ret == -1) + fprintf(stderr, + _("The program \"pg_dump_security\" is needed by %s " + "but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.\n"), + progname, full_path); + else + fprintf(stderr, + _("The program \"pg_dump_security\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.\n"), + full_path, progname); + exit_nicely(1); + } + + pgdumpopts = createPQExpBuffer(); + + pgdumpsecurityopts = createPQExpBuffer(); + + while ((c = getopt_long(argc, argv, "acd:f:gh:l:oOp:rsS:tuU:vwWx", long_options, &optindex)) != -1) + { + switch (c) + { + case 'a': + data_only = true; + appendPQExpBufferStr(pgdumpopts, " -a"); + break; + + case 'c': + output_clean = true; + break; + + case 'd': + connstr = pg_strdup(optarg); + break; + + case 'f': + filename = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpopts, " -f "); + appendShellString(pgdumpopts, filename); + + appendPQExpBufferStr(pgdumpsecurityopts, " -f "); + appendShellString(pgdumpsecurityopts, filename); + break; + + case 'g': + globals_only = true; + break; + + case 'h': + pghost = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpsecurityopts, " -h"); + appendShellString(pgdumpsecurityopts, pghost); + break; + + case 'l': + pgdb = pg_strdup(optarg); + break; + + case 'o': + appendPQExpBufferStr(pgdumpopts, " -o"); + break; + + case 'O': + appendPQExpBufferStr(pgdumpopts, " -O"); + break; + + case 'p': + pgport = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpsecurityopts, " -p"); + appendShellString(pgdumpsecurityopts, pgport); + break; + + case 'r': + roles_only = true; + break; + + case 's': + appendPQExpBufferStr(pgdumpopts, " -s"); + break; + + case 'S': + appendPQExpBufferStr(pgdumpopts, " -S "); + appendShellString(pgdumpopts, optarg); + break; + + case 't': + tablespaces_only = true; + break; + #ifdef __TBASE__ case 'u': appendPQExpBufferStr(pgdumpopts, " -u"); break; 
#endif - case 'U': - pguser = pg_strdup(optarg); - break; - - case 'v': - verbose = true; - appendPQExpBufferStr(pgdumpopts, " -v"); - break; - - case 'w': - prompt_password = TRI_NO; - appendPQExpBufferStr(pgdumpopts, " -w"); - break; - - case 'W': - prompt_password = TRI_YES; - appendPQExpBufferStr(pgdumpopts, " -W"); - break; - - case 'x': - skip_acls = true; - appendPQExpBufferStr(pgdumpopts, " -x"); - break; - - case 0: - break; - - case 2: - appendPQExpBufferStr(pgdumpopts, " --lock-wait-timeout "); - appendShellString(pgdumpopts, optarg); - break; - - case 3: - use_role = pg_strdup(optarg); - appendPQExpBufferStr(pgdumpopts, " --role "); - appendShellString(pgdumpopts, use_role); - break; - - case 4: - dosync = false; - appendPQExpBufferStr(pgdumpopts, " --no-sync"); - break; - - default: - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit_nicely(1); - } - } - - /* Complain if any arguments remain */ - if (optind < argc) - { - fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), - progname, argv[optind]); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - /* Make sure the user hasn't specified a mix of globals-only options */ - if (globals_only && roles_only) - { - fprintf(stderr, _("%s: options -g/--globals-only and -r/--roles-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - if (globals_only && tablespaces_only) - { - fprintf(stderr, _("%s: options -g/--globals-only and -t/--tablespaces-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - if (if_exists && !output_clean) - { - fprintf(stderr, _("%s: option --if-exists requires option -c/--clean\n"), - progname); - exit_nicely(1); - } - - if (roles_only && tablespaces_only) - { - fprintf(stderr, _("%s: options -r/--roles-only and -t/--tablespaces-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - /* - * If password values are not required in the dump, switch to using - * pg_roles which is equally useful, just more likely to have unrestricted - * access than pg_authid. 
- */ - if (no_role_passwords) - sprintf(role_catalog, "%s", PG_ROLES); - else - sprintf(role_catalog, "%s", PG_AUTHID); - - /* Add long options to the pg_dump argument list */ - if (binary_upgrade) - appendPQExpBufferStr(pgdumpopts, " --binary-upgrade"); - if (column_inserts) - appendPQExpBufferStr(pgdumpopts, " --column-inserts"); - if (disable_dollar_quoting) - appendPQExpBufferStr(pgdumpopts, " --disable-dollar-quoting"); - if (disable_triggers) - appendPQExpBufferStr(pgdumpopts, " --disable-triggers"); - if (inserts) - appendPQExpBufferStr(pgdumpopts, " --inserts"); - if (no_tablespaces) - appendPQExpBufferStr(pgdumpopts, " --no-tablespaces"); - if (quote_all_identifiers) - appendPQExpBufferStr(pgdumpopts, " --quote-all-identifiers"); - if (use_setsessauth) - appendPQExpBufferStr(pgdumpopts, " --use-set-session-authorization"); - if (no_publications) - appendPQExpBufferStr(pgdumpopts, " --no-publications"); - if (no_security_labels) - appendPQExpBufferStr(pgdumpopts, " --no-security-labels"); - if (no_subscriptions) - appendPQExpBufferStr(pgdumpopts, " --no-subscriptions"); - if (no_unlogged_table_data) - appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); + case 'U': + pguser = pg_strdup(optarg); + break; + + case 'v': + verbose = true; + appendPQExpBufferStr(pgdumpopts, " -v"); + appendPQExpBufferStr(pgdumpsecurityopts, " -v"); + break; + + case 'w': + prompt_password = TRI_NO; + appendPQExpBufferStr(pgdumpopts, " -w"); + break; + + case 'W': + prompt_password = TRI_YES; + appendPQExpBufferStr(pgdumpopts, " -W"); + break; + + case 'x': + skip_acls = true; + appendPQExpBufferStr(pgdumpopts, " -x"); + break; + + case 0: + break; + + case 2: + appendPQExpBufferStr(pgdumpopts, " --lock-wait-timeout "); + appendShellString(pgdumpopts, optarg); + break; + + case 3: + use_role = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpopts, " --role "); + appendShellString(pgdumpopts, use_role); + break; + + case 4: + dosync = false; + appendPQExpBufferStr(pgdumpopts, " --no-sync"); + break; + + default: + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit_nicely(1); + } + } + + /* Complain if any arguments remain */ + if (optind < argc) + { + fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), + progname, argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + /* Make sure the user hasn't specified a mix of globals-only options */ + if (globals_only && roles_only) + { + fprintf(stderr, _("%s: options -g/--globals-only and -r/--roles-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + if (globals_only && tablespaces_only) + { + fprintf(stderr, _("%s: options -g/--globals-only and -t/--tablespaces-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + if (if_exists && !output_clean) + { + fprintf(stderr, _("%s: option --if-exists requires option -c/--clean\n"), + progname); + exit_nicely(1); + } + + if (roles_only && tablespaces_only) + { + fprintf(stderr, _("%s: options -r/--roles-only and -t/--tablespaces-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + /* + * If password values are not required in the dump, switch to using + * pg_roles which is equally useful, just more 
likely to have unrestricted + * access than pg_authid. + */ + if (no_role_passwords) + sprintf(role_catalog, "%s", PG_ROLES); + else + sprintf(role_catalog, "%s", PG_AUTHID); + + /* Add long options to the pg_dump argument list */ + if (binary_upgrade) + appendPQExpBufferStr(pgdumpopts, " --binary-upgrade"); + if (column_inserts) + appendPQExpBufferStr(pgdumpopts, " --column-inserts"); + if (disable_dollar_quoting) + appendPQExpBufferStr(pgdumpopts, " --disable-dollar-quoting"); + if (disable_triggers) + appendPQExpBufferStr(pgdumpopts, " --disable-triggers"); + if (inserts) + appendPQExpBufferStr(pgdumpopts, " --inserts"); + if (no_tablespaces) + appendPQExpBufferStr(pgdumpopts, " --no-tablespaces"); + if (quote_all_identifiers) + appendPQExpBufferStr(pgdumpopts, " --quote-all-identifiers"); + if (use_setsessauth) + appendPQExpBufferStr(pgdumpopts, " --use-set-session-authorization"); + if (no_publications) + appendPQExpBufferStr(pgdumpopts, " --no-publications"); + if (no_security_labels) + appendPQExpBufferStr(pgdumpopts, " --no-security-labels"); + if (no_subscriptions) + appendPQExpBufferStr(pgdumpopts, " --no-subscriptions"); + if (no_unlogged_table_data) + appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); #ifdef PGXC if (include_nodes) @@ -586,7 +633,21 @@ main(int argc, char *argv[]) if (!globals_only && !roles_only && !tablespaces_only) dumpDatabases(conn); - PQfinish(conn); + /* + * support to dump security meta data + */ + if (dump_security_data) + { + ret = runPgDumpSecurity(conn, pghost, pgport, pguser, prompt_password); + + if (ret != 0) + { + fprintf(stderr, _("%s: pg_dump_security failed on database \"%s\", exiting\n"), progname, pgdb); + exit_nicely(1); + } + } + + PQfinish(conn); if (verbose) dumpTimestamp("Completed on"); @@ -629,7 +690,8 @@ help(void) printf(_(" -S, --superuser=NAME superuser user name to use in the dump\n")); printf(_(" -t, --tablespaces-only dump only tablespaces, no databases or roles\n")); #ifdef __TBASE__ - printf(_(" -u, --with-dropped-column dump the table schema with dropped columns\n")); + printf(_(" -u, --with-dropped-column dump the table schema with dropped columns\n")); + printf(_(" --dump-security-data dump security meta data\n")); #endif printf(_(" -x, --no-privileges do not dump privileges (grant/revoke)\n")); printf(_(" --binary-upgrade for use by upgrade utilities only\n")); @@ -1832,7 +1894,62 @@ dumpDatabases(PGconn *conn) PQclear(res); } +/* + * run pg_dump_security to dump security metadata + */ +static int +runPgDumpSecurity(PGconn *old_conn, const char *pghost, const char *pgport, + const char *pguser, trivalue prompt_password) +{ + PQExpBuffer cmd = createPQExpBuffer(); + PQExpBuffer buf = createPQExpBuffer(); + PGresult *extnames; + PGconn *new_conn; + int ret; + PGresult *res; + int i; + char *dbname; + + res = executeQuery(old_conn, "SELECT datname FROM pg_database WHERE datallowconn ORDER BY 1"); + + for (i = 0; i < PQntuples(res); i++) + { + dbname = PQgetvalue(res, i, 0); + + new_conn = connectDatabase(dbname, NULL, pghost, pgport, pguser, prompt_password, false); + + extnames = executeQuery(new_conn, "SELECT extname from pg_extension WHERE extname='tbase_mls' ORDERY BY 1"); + if (PQntuples(extnames) > 0) + { + break; + } + } + + fprintf(OPF, "\\c %s mls_admin\n\n", dbname); + + appendPQExpBuffer(cmd, "\"%s\" %s", pg_dump_security_bin, + pgdumpsecurityopts->data); + appendPQExpBufferStr(cmd, " -l"); + + appendShellString(cmd, dbname); + + if (verbose) + fprintf(stderr, _("%s: running \"%s\"\n"), progname, 
cmd->data); + + fflush(stdout); + fflush(stderr); + + ret = system(cmd->data); + + PQclear(res); + PQclear(extnames); + destroyPQExpBuffer(cmd); + destroyPQExpBuffer(buf); + PQfinish(new_conn); + + return ret; +} /* * Run pg_dump on dbname. From b8ad31a95d2ce1f9b6796396d427c582b57d6989 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 18 Nov 2020 14:29:10 +0800 Subject: [PATCH 075/578] fix pooler log and pgsl_store core --- src/backend/pgxc/pool/poolmgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index d24a32f4..afa55efc 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -776,9 +776,9 @@ PoolManagerInit() * processes do this.) */ #ifdef HAVE_SETSID - if (setsid() < 0) - elog(LOG, POOL_MGR_PREFIX"setsid() failed: %m"); - //elog(FATAL, POOL_MGR_PREFIX"setsid() failed: %m"); + if (setsid() < 0) + elog(DEBUG1, POOL_MGR_PREFIX"setsid() failed: %m"); + //elog(FATAL, POOL_MGR_PREFIX"setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us From 8f7357232567540514e670bde4f740dd6f7c2415 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 25 Nov 2020 14:46:50 +0800 Subject: [PATCH 076/578] delete GUC parameter use_data_pump and change pooler log level --- src/backend/pgxc/pool/poolcomm.c | 196 +++++++++++++++---------------- src/backend/pgxc/squeue/squeue.c | 2 +- src/backend/utils/misc/guc.c | 28 ++--- 3 files changed, 108 insertions(+), 118 deletions(-) diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index c462027a..b1b68e04 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -1302,104 +1302,104 @@ pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql) */ int pool_recvres(PoolPort *port) -{// #lizard forgives - int r; - uint n32 = 0; - uint err = 0; - char buf[SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN]; - char err_msg[POOL_ERR_MSG_LEN]; - int recved_size = 0; - int size = SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN; - char *ptr = buf; - - /* receive message header first */ - for(;;) - { - r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); - if (r < 0) - { - /* - * Report broken connection - */ - elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive data from client: %m"))); - goto failure; - } - else if (r == 0) - { - if(recved_size == size) - break; - else - goto failure; - } - - recved_size += r; - if(recved_size == size) - break; - - } - /* Verify response */ - if (buf[0] != 's') - { - ereport(LOG, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected message code:%c", buf[0]))); - goto failure; - } - - memcpy(&n32, buf + 1, 4); - n32 = ntohl(n32); - if (n32 != 0) - { - ereport(LOG, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("pool_recvres return code:%d", n32))); - } - - memcpy(&err, buf + 5, 4); - err = ntohl(err); - - /* if has err_msg, receive error message */ - if (PoolErrIsValid(err)) - { - ptr = err_msg; - size = POOL_ERR_MSG_LEN; - recved_size = 0; - for(;;) - { - r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); - if (r < 0) - { - /* - * Report broken connection - */ - elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive data from client: %m"))); - goto failure; - } - else if (r == 0) - { - if(recved_size == size) 
- break; - else - goto failure; - } - - recved_size += r; - if(recved_size == size) - break; - - } - - elog(WARNING, "%s", err_msg); - } - - return n32; - +{ + int r; + uint n32 = 0; + uint err = 0; + char buf[SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN]; + char err_msg[POOL_ERR_MSG_LEN]; + int recved_size = 0; + int size = SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN; + char *ptr = buf; + + /* receive message header first */ + for(;;) + { + r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); + if (r < 0) + { + /* + * Report broken connection + */ + elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + goto failure; + } + else if (r == 0) + { + if(recved_size == size) + break; + else + goto failure; + } + + recved_size += r; + if(recved_size == size) + break; + + } + /* Verify response */ + if (buf[0] != 's') + { + ereport(LOG, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message code:%c", buf[0]))); + goto failure; + } + + memcpy(&n32, buf + 1, 4); + n32 = ntohl(n32); + if (n32 != 0) + { + ereport(DEBUG1, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("pool_recvres return code:%d", n32))); + } + + memcpy(&err, buf + 5, 4); + err = ntohl(err); + + /* if has err_msg, receive error message */ + if (PoolErrIsValid(err)) + { + ptr = err_msg; + size = POOL_ERR_MSG_LEN; + recved_size = 0; + for(;;) + { + r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); + if (r < 0) + { + /* + * Report broken connection + */ + elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + goto failure; + } + else if (r == 0) + { + if(recved_size == size) + break; + else + goto failure; + } + + recved_size += r; + if(recved_size == size) + break; + + } + + elog(WARNING, "%s", err_msg); + } + + return n32; + failure: return EOF; } diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index ecc25323..515391a3 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -66,7 +66,7 @@ int SQueueSize = 64; #ifdef __TBASE__ extern ProtocolVersion FrontendProtocol; -bool g_UseDataPump = false;/* Use data pumb, true default. */ +bool g_UseDataPump = true;/* Use data pumb, true default. */ bool g_DataPumpDebug = false;/* enable debug info */ int32 g_SndThreadNum = 8; /* Two sender threads default. */ int32 g_SndThreadBufferSize = 16; /* in Kilo bytes. 
*/ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index af23b681..825e4725 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2054,25 +2054,15 @@ static struct config_bool ConfigureNamesBool[] = }, #ifdef __TBASE__ - { - {"enable_statistic", PGC_SIGHUP, STATS_COLLECTOR, - gettext_noop("collect statistic information for debug."), - NULL - }, - &enable_statistic, - false, - NULL, NULL, NULL - }, - - { - {"use_data_pump", PGC_SIGHUP, CUSTOM_OPTIONS, - gettext_noop("use datapump to make data transfer more efficient."), - NULL - }, - &g_UseDataPump, - true, - NULL, NULL, NULL - }, + { + {"enable_statistic", PGC_SIGHUP, STATS_COLLECTOR, + gettext_noop("collect statistic information for debug."), + NULL + }, + &enable_statistic, + false, + NULL, NULL, NULL + }, { {"debug_data_pump", PGC_SIGHUP, CUSTOM_OPTIONS, From c46bec597e4e7e74031fe0a608b9f93a522ef293 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 26 Nov 2020 10:49:18 +0800 Subject: [PATCH 077/578] add log switch in pool_recvres --- src/backend/pgxc/pool/poolcomm.c | 6 +++--- src/backend/pgxc/pool/poolmgr.c | 30 +++++++++++++++--------------- src/include/pgxc/poolcomm.h | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index b1b68e04..8a70dffb 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -1301,7 +1301,7 @@ pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql) * Return 0 at success or EOF at error. */ int -pool_recvres(PoolPort *port) +pool_recvres(PoolPort *port, bool need_log) { int r; uint n32 = 0; @@ -1351,9 +1351,9 @@ pool_recvres(PoolPort *port) memcpy(&n32, buf + 1, 4); n32 = ntohl(n32); - if (n32 != 0) + if (n32 != 0 && need_log) { - ereport(DEBUG1, + ereport(LOG, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("pool_recvres return code:%d", n32))); } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index afa55efc..73cf905c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1455,8 +1455,8 @@ PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_li pool_putmessage(&poolHandle->port, 'b', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32)); pool_flush(&poolHandle->port); - /* Get result */ - return pool_recvres(&poolHandle->port); + /* Get result */ + return pool_recvres(&poolHandle->port, true); } /* @@ -2023,13 +2023,13 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch RESUME_POOLER_RELOAD(); - /* Receive result message */ - if (pool_recvres(&poolHandle->port) != CLEAN_CONNECTION_COMPLETED) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg(POOL_MGR_PREFIX"Clean connections not completed. HINT: cannot drop the currently open database"))); - } + /* Receive result message */ + if (pool_recvres(&poolHandle->port, true) != CLEAN_CONNECTION_COMPLETED) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg(POOL_MGR_PREFIX"Clean connections not completed. 
HINT: cannot drop the currently open database"))); + } } @@ -2054,7 +2054,7 @@ PoolManagerCheckConnectionInfo(void) pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, true); if (res == POOL_CHECK_SUCCESS) return true; @@ -3874,7 +3874,7 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list, i pool_putmessage(&poolHandle->port, 'h', (char *) buf, (2 + dn_count + co_count + 1) * sizeof(uint32)); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, false); if (res != (dn_count + co_count)) { @@ -10644,7 +10644,7 @@ PoolManagerRefreshConnectionInfo(void) pool_putmessage(&poolHandle->port, 'R', NULL, 0); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, true); RESUME_POOLER_RELOAD(); @@ -10843,9 +10843,9 @@ PoolManagerClosePooledConnections(const char *dbname, const char *username) pool_flush(&poolHandle->port); - /* Then Get back Pids from Pooler */ - res = pool_recvres(&poolHandle->port); - elog(LOG, "PoolManagerClosePooledConnections res:%d", res); + /* Then Get back Pids from Pooler */ + res = pool_recvres(&poolHandle->port, true); + elog(LOG, "PoolManagerClosePooledConnections res:%d", res); RESUME_POOLER_RELOAD(); diff --git a/src/include/pgxc/poolcomm.h b/src/include/pgxc/poolcomm.h index ab34974c..04fd7e6e 100644 --- a/src/include/pgxc/poolcomm.h +++ b/src/include/pgxc/poolcomm.h @@ -55,11 +55,11 @@ extern int pool_putbytes(PoolPort *port, const char *s, size_t len); extern int pool_flush(PoolPort *port); /*extern int pool_sendfds(PoolPort *port, int *fds, int count);*/ extern int pool_sendfds(PoolPort *port, int *fds, int count, char *errbuf, int32 buf_len); -extern int pool_recvfds(PoolPort *port, int *fds, int count); -extern int pool_sendres(PoolPort *port, int res, char *errbuf, int32 buf_len, bool need_log); -extern int pool_recvres(PoolPort *port); -extern int pool_sendpids(PoolPort *port, int *pids, int count, char *errbuf, int32 buf_len); -extern int pool_recvpids(PoolPort *port, int **pids); -extern int pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *errbuf, int32 buf_len, char *errmsg, bool need_log); +extern int pool_recvfds(PoolPort *port, int *fds, int count); +extern int pool_sendres(PoolPort *port, int res, char *errbuf, int32 buf_len, bool need_log); +extern int pool_recvres(PoolPort *port, bool need_log); +extern int pool_sendpids(PoolPort *port, int *pids, int count, char *errbuf, int32 buf_len); +extern int pool_recvpids(PoolPort *port, int **pids); +extern int pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *errbuf, int32 buf_len, char *errmsg, bool need_log); extern int pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql); #endif /* POOLCOMM_H */ From df2f355e166a401277fe530b4da46d1cc7a7daa1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 24 Nov 2020 20:28:25 +0800 Subject: [PATCH 078/578] fix get_node_list bug (merge request !2) --- src/gtm/client/fe-protocol.c | 101 +++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index fb43649c..d545384c 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -1262,9 +1262,11 @@ result->gr_status = GTM_RESULT_ERROR; break; - case NODE_LIST_RESULT: - { - int 
i; + case NODE_LIST_RESULT: + { + int i; + char *buf = NULL; + int buf_size = 8192; if (gtmpqGetInt(&result->gr_resdata.grd_node_list.num_node, sizeof(int32), conn)) { @@ -1272,48 +1274,65 @@ result->gr_status = GTM_RESULT_ERROR; break; } - for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + buf = (char *) malloc(buf_size); + if (buf == NULL) { - int size; - char buf[8092]; - GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); + result->gr_status = GTM_RESULT_ERROR; + printfGTMPQExpBuffer(&conn->errorMessage, "malloc buffer for node list data failed"); + break; + } - if (gtmpqGetInt(&size, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - if (size > 8092) - { - result->gr_status = GTM_RESULT_ERROR; - printfGTMPQExpBuffer(&conn->errorMessage, "buffer size not large enough for node list data"); - free(data); - continue; - } + for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + { + int size; + GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); - if (gtmpqGetnchar((char *) &buf, size, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - else - { - result->gr_resdata.grd_node_list.nodeinfo[i] = data; - } - } + if (gtmpqGetInt(&size, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } - break; - } - case BARRIER_RESULT: - break; + if (size > buf_size) + { + buf = (char *) realloc(buf, size); + if (buf == NULL) + { + result->gr_status = GTM_RESULT_ERROR; + printfGTMPQExpBuffer(&conn->errorMessage, "realloc buffer for node list data failed"); + free(data); + break; + } + buf_size = size; + } + + if (gtmpqGetnchar(buf, size, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } + if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } + else + { + result->gr_resdata.grd_node_list.nodeinfo[i] = data; + } + } + + if (buf != NULL) + { + free(buf); + } + break; + } + case BARRIER_RESULT: + break; case REPORT_XMIN_RESULT: if (gtmpqGetnchar((char *) &result->gr_resdata.grd_report_xmin.latest_completed_xid, From 6abed652955a1a7f7fdabdc6a356809f045a9aae Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Dec 2020 14:19:30 +0800 Subject: [PATCH 079/578] fix ID83728819 gtm coredump --- src/gtm/main/gtm_store.c | 4 ++-- src/gtm/main/main.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 0123a662..8208b72a 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3117,9 +3117,9 @@ ProcessStorageTransferCommand(Port *myport, StringInfo message) g_GTM_Backup_Timer = GTM_AddTimer(LockStoreStandbyCrashHandler, GTM_TIMER_TYPE_ONCE, LOCK_STORE_CRASH_HANDL_TIMEOUT, GetMyThreadInfo); if(g_GTM_Backup_Timer == INVALID_TIMER_HANDLE) { + GTM_RWLockRelease(&g_GTM_Backup_Timer_Lock); elog(ERROR, "Failed to register lock store crash handler, will exit!"); - exit(1); - } + } GTM_RWLockRelease(&g_GTM_Backup_Timer_Lock); /* send xlog replication relative data */ diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 78fbcff5..ded1a044 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -3371,6 +3371,9 @@ GTM_ThreadBasebackup(void *argp) if (sigsetjmp(local_sigjmp_buf, 1) 
!= 0) { bool report = false; +#ifdef __TBASE__ + RWLockCleanUp(); +#endif /* * NOTE: if you are tempted to add more code in this if-block, * consider the high probability that it should be in From ca2ea1820d0d4e281aa44b3419f9d02311df7c1d Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 26 Oct 2020 14:33:29 +0800 Subject: [PATCH 080/578] bugfix: tpcc district not found fatal --- src/backend/pgxc/pool/execRemote.c | 89 +++++++++++++++++------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e3fa18d7..1be26a46 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7435,6 +7435,18 @@ PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode) } +/* + * Whether node need clean: last command is not finished + * 'Z' message: ready for query + * 'C' message: command complete + */ +static inline bool +node_need_clean(PGXCNodeHandle *handle) +{ + return handle->state != DN_CONNECTION_STATE_IDLE || + (('Z' != handle->last_command) && ('C' != handle->last_command)); +} + /* * Do abort processing for the transaction. We must abort the transaction on * all the involved nodes. If a node has already prepared a transaction, we run @@ -7495,21 +7507,21 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) { PGXCNodeHandle *handle = all_handles->coord_handles[i]; if (handle->sock != NO_SOCKET) - { - if ((handle->state != DN_CONNECTION_STATE_IDLE) || !node_ready_for_query(handle)) - { - /* - * Forget previous combiner if any since input will be handled by - * different one. - */ - handle->combiner = NULL; - clean_nodes[node_count++] = handle; - cancel_co_list[cancel_co_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); - -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); + { + if (node_need_clean(handle)) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. 
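+ * The handle is then queued in clean_nodes[] and its node id added to
+ * cancel_co_list[] so the in-flight request can be cancelled and the
+ * connection cleaned up below.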
+ */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + cancel_co_list[cancel_co_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); + +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) { @@ -7561,15 +7573,14 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) { PGXCNodeHandle *handle = all_handles->datanode_handles[i]; if (handle->sock != NO_SOCKET) - { - if (handle->state == DN_CONNECTION_STATE_COPY_IN || - handle->state == DN_CONNECTION_STATE_COPY_OUT || - !node_ready_for_query(handle)) - { -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || + handle->state == DN_CONNECTION_STATE_COPY_OUT) + { +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) { @@ -7605,21 +7616,21 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) clean_nodes[node_count++] = handle; cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); } -#endif - } - else if (handle->state != DN_CONNECTION_STATE_IDLE) - { - /* - * Forget previous combiner if any since input will be handled by - * different one. - */ - handle->combiner = NULL; - clean_nodes[node_count++] = handle; - cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); +#endif + } + else if (node_need_clean(handle)) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. + */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) From 3c2753ef45b562900d6d427e8fdea377d29a1e83 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 3 Dec 2020 10:02:50 +0800 Subject: [PATCH 081/578] fix bug of vacuum_freeze in interval partition table, tapd :http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083556663 (cherry picked from commit b8eb6d7b) 1204141f add comments. 21706b73 add comment 2a1ee817 Fix vacuum of toast table. 
09e34c29 fix bug of vacuum_freeze in interval partition table, tapd :http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083556663 --- src/backend/catalog/toasting.c | 99 ++++++------- src/backend/commands/vacuum.c | 225 +++++++++++++++--------------- src/backend/commands/vacuumlazy.c | 35 ++--- 3 files changed, 181 insertions(+), 178 deletions(-) diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index b9ebd095..d908bfc3 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -457,56 +457,57 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, * (1) there are any toastable attributes, and (2) the maximum length * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to * create a toast table for something like "f1 varchar(20)".) + * No need to create a TOAST table for partitioned tables. */ static bool needs_toast_table(Relation rel) -{// #lizard forgives - int32 data_length = 0; - bool maxlength_unknown = false; - bool has_toastable_attrs = false; - TupleDesc tupdesc; - Form_pg_attribute *att; - int32 tuple_length; - int i; - - /* No TOAST for partitioned tables */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - return false; - } - - tupdesc = rel->rd_att; - att = tupdesc->attrs; - - for (i = 0; i < tupdesc->natts; i++) - { - if (att[i]->attisdropped) - continue; - data_length = att_align_nominal(data_length, att[i]->attalign); - if (att[i]->attlen > 0) - { - /* Fixed-length types are never toastable */ - data_length += att[i]->attlen; - } - else - { - int32 maxlen = type_maximum_size(att[i]->atttypid, - att[i]->atttypmod); - - if (maxlen < 0) - maxlength_unknown = true; - else - data_length += maxlen; - if (att[i]->attstorage != 'p') - has_toastable_attrs = true; - } - } - if (!has_toastable_attrs) - return false; /* nothing to toast? */ - if (maxlength_unknown) - return true; /* any unlimited-length attrs? */ - tuple_length = MAXALIGN(SizeofHeapTupleHeader + - BITMAPLEN(tupdesc->natts)) + - MAXALIGN(data_length); - return (tuple_length > TOAST_TUPLE_THRESHOLD); +{ + int32 data_length = 0; + bool maxlength_unknown = false; + bool has_toastable_attrs = false; + TupleDesc tupdesc; + Form_pg_attribute *att; + int32 tuple_length; + int i; + + /* No TOAST for partitioned tables */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + return false; + } + + tupdesc = rel->rd_att; + att = tupdesc->attrs; + + for (i = 0; i < tupdesc->natts; i++) + { + if (att[i]->attisdropped) + continue; + data_length = att_align_nominal(data_length, att[i]->attalign); + if (att[i]->attlen > 0) + { + /* Fixed-length types are never toastable */ + data_length += att[i]->attlen; + } + else + { + int32 maxlen = type_maximum_size(att[i]->atttypid, + att[i]->atttypmod); + + if (maxlen < 0) + maxlength_unknown = true; + else + data_length += maxlen; + if (att[i]->attstorage != 'p') + has_toastable_attrs = true; + } + } + if (!has_toastable_attrs) + return false; /* nothing to toast? */ + if (maxlength_unknown) + return true; /* any unlimited-length attrs? 
*/ + tuple_length = MAXALIGN(SizeofHeapTupleHeader + + BITMAPLEN(tupdesc->natts)) + + MAXALIGN(data_length); + return (tuple_length > TOAST_TUPLE_THRESHOLD); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 15fc537e..5fc12531 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1429,124 +1429,125 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) #else else if (onerel->rd_rel->relnamespace == PG_CATALOG_NAMESPACE) #endif - ereport(WARNING, - (errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it", - RelationGetRelationName(onerel)))); - else - ereport(WARNING, - (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", - RelationGetRelationName(onerel)))); - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Check that it's a vacuumable relation; we used to do this in - * get_rel_oids() but seems safer to check after we've locked the - * relation. - */ - if (onerel->rd_rel->relkind != RELKIND_RELATION && - onerel->rd_rel->relkind != RELKIND_MATVIEW && - onerel->rd_rel->relkind != RELKIND_TOASTVALUE && - onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - ereport(WARNING, - (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables", - RelationGetRelationName(onerel)))); - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Silently ignore tables that are temp tables of other backends --- - * trying to vacuum these will lead to great unhappiness, since their - * contents are probably not up-to-date on disk. (We don't throw a - * warning here; it would just lead to chatter during a database-wide - * VACUUM.) - */ - if (RELATION_IS_OTHER_TEMP(onerel)) - { - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Get a session-level lock too. This will protect our access to the - * relation across multiple transactions, so that we can vacuum the - * relation's TOAST table (if any) secure in the knowledge that no one is - * deleting the parent relation. - * - * NOTE: this cannot block, even if someone else is waiting for access, - * because the lock manager knows that both lock requests are from the - * same process. - */ - onerelid = onerel->rd_lockInfo.lockRelId; - LockRelationIdForSession(&onerelid, lmode); - - /* - * Remember the relation's TOAST relation for later, if the caller asked - * us to process it. In VACUUM FULL, though, the toast table is - * automatically rebuilt by cluster_rel so we shouldn't recurse to it. - */ - if (!(options & VACOPT_SKIPTOAST) && !(options & VACOPT_FULL)) - toast_relid = onerel->rd_rel->reltoastrelid; - else - toast_relid = InvalidOid; - - /* - * Switch to the table owner's userid, so that any index functions are run - * as that user. Also lock down security-restricted operations and - * arrange to make GUC variable changes local to this command. (This is - * unnecessary, but harmless, for lazy VACUUM.) 
- */ - GetUserIdAndSecContext(&save_userid, &save_sec_context); - SetUserIdAndSecContext(onerel->rd_rel->relowner, - save_sec_context | SECURITY_RESTRICTED_OPERATION); - save_nestlevel = NewGUCNestLevel(); + ereport(WARNING, + (errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it", + RelationGetRelationName(onerel)))); + else + ereport(WARNING, + (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", + RelationGetRelationName(onerel)))); + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - /* - * Ignore partitioned tables as there is no work to be done. Since we - * release the lock here, it's possible that any partitions added from - * this point on will not get processed, but that seems harmless. - */ - if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - /* Roll back any GUC changes executed by index functions */ - AtEOXact_GUC(false, save_nestlevel); + /* + * Check that it's a vacuumable relation; we used to do this in + * get_rel_oids() but seems safer to check after we've locked the + * relation. + */ + if (onerel->rd_rel->relkind != RELKIND_RELATION && + onerel->rd_rel->relkind != RELKIND_MATVIEW && + onerel->rd_rel->relkind != RELKIND_TOASTVALUE && + onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + ereport(WARNING, + (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables", + RelationGetRelationName(onerel)))); + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - /* Restore userid and security context */ - SetUserIdAndSecContext(save_userid, save_sec_context); + /* + * Silently ignore tables that are temp tables of other backends --- + * trying to vacuum these will lead to great unhappiness, since their + * contents are probably not up-to-date on disk. (We don't throw a + * warning here; it would just lead to chatter during a database-wide + * VACUUM.) + */ + if (RELATION_IS_OTHER_TEMP(onerel)) + { + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - relation_close(onerel, NoLock); - PopActiveSnapshot(); - CommitTransactionCommand(); + /* + * Get a session-level lock too. This will protect our access to the + * relation across multiple transactions, so that we can vacuum the + * relation's TOAST table (if any) secure in the knowledge that no one is + * deleting the parent relation. + * + * NOTE: this cannot block, even if someone else is waiting for access, + * because the lock manager knows that both lock requests are from the + * same process. + */ + onerelid = onerel->rd_lockInfo.lockRelId; + LockRelationIdForSession(&onerelid, lmode); + + /* + * Remember the relation's TOAST relation for later, if the caller asked + * us to process it. In VACUUM FULL, though, the toast table is + * automatically rebuilt by cluster_rel so we shouldn't recurse to it. + */ + if (!(options & VACOPT_SKIPTOAST) && !(options & VACOPT_FULL)) + toast_relid = onerel->rd_rel->reltoastrelid; + else + toast_relid = InvalidOid; + + /* + * Switch to the table owner's userid, so that any index functions are run + * as that user. Also lock down security-restricted operations and + * arrange to make GUC variable changes local to this command. (This is + * unnecessary, but harmless, for lazy VACUUM.) 
+ */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(onerel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + + /* + * Ignore partitioned tables as there is no work to be done. Since we + * release the lock here, it's possible that any partitions added from + * this point on will not get processed, but that seems harmless. + */ + if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + relation_close(onerel, NoLock); + PopActiveSnapshot(); + CommitTransactionCommand(); + + /* + * If the relation has a secondary toast rel, vacuum that too while we + * still hold the session lock on the master table. Note however that + * "analyze" will not get done on the toast table. This is good, because + * the toaster always uses hardcoded index access and statistics are + * totally unimportant for toast relations. + */ + if (toast_relid != InvalidOid) + { + vacuum_rel(toast_relid, relation, options, params); + } - /* - * If the relation has a secondary toast rel, vacuum that too while we - * still hold the session lock on the master table. Note however that - * "analyze" will not get done on the toast table. This is good, because - * the toaster always uses hardcoded index access and statistics are - * totally unimportant for toast relations. - */ - if (toast_relid != InvalidOid) - { - vacuum_rel(toast_relid, relation, options, params); - } + /* + * Now release the session-level lock on the master table. + */ + UnlockRelationIdForSession(&onerelid, lmode); - /* - * Now release the session-level lock on the master table. 
- */ - UnlockRelationIdForSession(&onerelid, lmode); + /* It's OK for other commands to look at this table */ + return true; + } - /* It's OK for other commands to look at this table */ - return true; - } #ifdef XCP /* diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 0192cdc6..1e72ec49 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -187,20 +187,20 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) int nindexes; Relation *Irel; bool hasindex; - TransactionId oldestXmin = InvalidTransactionId; - TransactionId freezeLimit = InvalidTransactionId; - MultiXactId multiXactCutoff = InvalidMultiXactId; + TransactionId oldestXmin = InvalidTransactionId; + TransactionId freezeLimit = InvalidTransactionId; + MultiXactId multiXactCutoff = InvalidMultiXactId; - if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - vacuum_set_xid_limits(onerel, + if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + vacuum_set_xid_limits(onerel, params->freeze_min_age, params->freeze_table_age, params->multixact_freeze_min_age, params->multixact_freeze_table_age, &oldestXmin, &freezeLimit, NULL, &multiXactCutoff, NULL); - } + } childs = RelationGetAllPartitions(onerel); @@ -260,14 +260,15 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); - vac_update_relstats(onerel, - pages, - tuples, - visiblepages, - hasindex, - freezeLimit, - multiXactCutoff, - false); + /* save changes */ + vac_update_relstats(onerel, + pages, + tuples, + visiblepages, + hasindex, + freezeLimit, + multiXactCutoff, + false); pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, @@ -313,10 +314,10 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, Assert(params != NULL); #ifdef __TBASE__ + /* update statistic info for interval partition parent table */ if (RELATION_IS_INTERVAL(onerel)) { - /* update statistic info for interval partition parent table */ - lazy_vacuum_interval_rel(onerel, params); + lazy_vacuum_interval_rel(onerel, params); return; } #endif From c5a7f0ec3cfcab9617d02007425f57a30b0a14d6 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 2 Dec 2020 22:14:50 +0800 Subject: [PATCH 082/578] jacky/feature/MaintainGTS_Tbase_v2.15.16 (merge request !12) Squash merge branch 'jacky/feature/MaintainGTS_Tbase_v2.15.16' into 'Tbase_v2.15.16' * Revert 'fixed bug for persistent datanode connections.' * MaintainGTS supports unlogged table. * fixed bug for persistent datanode connections. * add {} * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * rollback modification * Revert 'bugfix: tpcc district not found fatal' * bugfix: tpcc district not found fatal * delete extension: reset_gts * clear the modification to buffer.h and buffer.c * fixed bug: endless loop * modified according to xiecanyang's suggestion. * delete damaged_gts * add damaged_gts test option * add damage_gts * fixed bug: count not open tlog file when the tuple has been frozen. * delete a comment. * modified the comment of PostmasterIsPrimaryAndNormal. * rename PostmasterIsAlive to PostmasterIsPrimaryAndNormal * modified code format * delete enable_satisfies_any * delete pg_memory_barrier() * delete space. * add pg_memory_barrier() * fixed a error of going back. * go back to before fixing the bug of persistent_datanode_connection. 
* fixed bugs: insert abort when persistent_datanode_connections = on.
* correct a typo
* adjusted code format.
* fixed bugs:
* adjusted code format.
* roll back the modification of ReadBuffer_common.
* mkdir maintain for trace log.
* fixed bug: release clog lock.
* printData/printStack call audit_log_trace.
* check and reset GTS before and after vacuum pages.
* add trace log according to audit fga log.
* optimized code format.
* 1. adjusted code format: such as line breaks, etc.
* rollback: not fully tested
* reduce if logical judgement.
* modified according to jason's suggestion
* deal with special GTS
* modified according to code review comments.
* comment memory barrier.
* add GTS values: 3, 4.
* print the line number and file name of error stack.
* reset_gts = 1:
* fixed bug: Could not open file 'pg_commit_ts/XXX': No such file or
* reduce unnecessary logs.
* fixed bug: set persistent_datanode_connections to on, insert transaction
* support heap_page_reset_gts(get_buffer('table_name', page_number));
* 1. fix bug about errmsg, 'database tbase does not exist', in pg_log.
* solve the problem of GTS output big integer out of bounds.
* remove dependency on Kernel
* pg_archivecleanup support removing the .gts file.
* pg_waldump ... -r transaction command support GTS.
* rename tbase_gts to tbase_gts_tools
* fix bug: heap_page_items can not output t_data when page id is not normal.
* initialize values
* shuiwu20201029_2
* refactoring functions to simplify code.
* add tbase_gts extension in the Makefile of extensions.
* delete enable_satisfies_any from GUC
* modified txid_gts.
* add heap_page_reset_gts()
* add xmin_gts and xmax_gts in extension function
* add extension function heap_page_items_with_gts.
* add tbase_gts extension
* add enable_satisfies_any
* fix bug:
* print correct CTID.
* make changes according to code viewing suggestions.
* 1. Print log when GTS is inserted into heaptuple
* add ctid information while checking GTS
* use __sync_synchronize() to prevent CPU reordering and compiler (see the ring-buffer sketch after this list)
* 1. increase log information when gts is incorrect.
* When gts is not set, its correctness is not checked.
* check the correctness of GTS before writing pages.
* fix bug ID82284643: GTS is not used for index and system tables.
* fix bug ID82284643: reduce locks of checking GTS when reading pages.
* fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through write and read data page.
* fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through vacuum operation.
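Several of the items above (the pg_memory_barrier() / __sync_synchronize() changes) and the AlogQueue test code added below (contrib/audit_test/audit_test_AlogQueue.c) revolve around ring buffers whose head and tail indexes are shared between a producer and a consumer: the payload has to be copied into the array before the index that publishes it is advanced, and a memory barrier enforces that ordering. The following is a minimal, self-contained sketch of that pattern only; the names ring_t, ring_push and ring_pop are illustrative and not part of the patch, and __sync_synchronize() merely stands in for the in-tree pg_memory_barrier().

    #include <stdio.h>

    #define RING_SIZE 16            /* one slot always stays empty */

    typedef struct
    {
        volatile int head;          /* consumer advances head */
        volatile int tail;          /* producer advances tail */
        char         area[RING_SIZE];
    } ring_t;

    /* bytes currently stored */
    static int
    ring_used(const ring_t *r)
    {
        return (r->tail - r->head + RING_SIZE) % RING_SIZE;
    }

    /* bytes that can still be pushed (capacity is RING_SIZE - 1) */
    static int
    ring_remain(const ring_t *r)
    {
        return RING_SIZE - 1 - ring_used(r);
    }

    /* producer: copy the payload first, then publish the new tail */
    static int
    ring_push(ring_t *r, const char *buf, int len)
    {
        int tail = r->tail;
        int i;

        if (ring_remain(r) < len)
            return 0;               /* not enough room; caller may retry later */

        for (i = 0; i < len; i++)
            r->area[(tail + i) % RING_SIZE] = buf[i];

        __sync_synchronize();       /* payload must be visible before the tail moves */
        r->tail = (tail + len) % RING_SIZE;
        return 1;
    }

    /* consumer: read the published tail, copy the payload out, then free the space */
    static int
    ring_pop(ring_t *r, char *out, int maxlen)
    {
        int n = ring_used(r);
        int i;

        if (n > maxlen)
            n = maxlen;

        __sync_synchronize();       /* read the payload only after reading the tail */
        for (i = 0; i < n; i++)
            out[i] = r->area[(r->head + i) % RING_SIZE];

        __sync_synchronize();       /* finish reading before the space is reused */
        r->head = (r->head + n) % RING_SIZE;
        return n;
    }

    int
    main(void)
    {
        ring_t  r = { 0, 0, "" };
        char    out[RING_SIZE];
        int     n;

        if (!ring_push(&r, "audit", 5))
            return 1;
        n = ring_pop(&r, out, (int) sizeof(out));
        printf("popped %d bytes: %.5s\n", n, out);
        return 0;
    }

Keeping one slot unused (capacity RING_SIZE - 1) is what lets head == tail mean "empty" and (tail + 1) % RING_SIZE == head mean "full"; the AlogQueue helpers in the new file below use the same convention.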
--- contrib/Makefile | 1 + contrib/audit_test/audit_test_AlogQueue.c | 839 ++++++ contrib/pageinspect/heapfuncs.c | 126 +- contrib/pgxc_ctl/make_signature | 0 contrib/tbase_gts_tools/Makefile | 23 + .../tbase_gts_tools/tbase_gts_tools--1.0.sql | 104 + contrib/tbase_gts_tools/tbase_gts_tools.c | 357 +++ .../tbase_gts_tools/tbase_gts_tools.control | 5 + src/backend/access/rmgrdesc/xactdesc.c | 70 +- src/backend/access/transam/commit_ts.c | 178 +- src/backend/access/transam/gtm.c | 44 +- src/backend/commands/vacuum.c | 4 +- src/backend/commands/vacuumlazy.c | 1060 +++++--- src/backend/main/main.c | 59 +- src/backend/pgxc/pool/poolmgr.c | 26 +- src/backend/postmaster/auditlogger.c | 2313 ++++++++++------- src/backend/postmaster/postmaster.c | 20 + src/backend/replication/logical/decode.c | 10 +- src/backend/utils/cache/relcache.c | 91 + src/backend/utils/misc/guc.c | 48 +- src/backend/utils/misc/postgresql.conf.sample | 9 + src/backend/utils/time/tqual.c | 68 +- src/bin/pg_archivecleanup/pg_archivecleanup.c | 188 +- src/include/access/htup_details.h | 5 + src/include/bootstrap/bootstrap.h | 3 +- src/include/commands/vacuum.h | 24 +- src/include/postmaster/auditlogger.h | 25 +- src/include/postmaster/postmaster.h | 1 + src/include/utils/relcache.h | 2 + 29 files changed, 3987 insertions(+), 1716 deletions(-) create mode 100644 contrib/audit_test/audit_test_AlogQueue.c mode change 100644 => 100755 contrib/pgxc_ctl/make_signature create mode 100644 contrib/tbase_gts_tools/Makefile create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools.c create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools.control diff --git a/contrib/Makefile b/contrib/Makefile index 9c5df8bf..1d0dcd37 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -46,6 +46,7 @@ SUBDIRS = \ seg \ spi \ tablefunc \ + tbase_gts_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/contrib/audit_test/audit_test_AlogQueue.c b/contrib/audit_test/audit_test_AlogQueue.c new file mode 100644 index 00000000..6b81ad85 --- /dev/null +++ b/contrib/audit_test/audit_test_AlogQueue.c @@ -0,0 +1,839 @@ +/* + * contrib/audit_test/audit_test.c + */ + +#include "postgres_fe.h" + +#include "libpq-fe.h" +#include "pg_getopt.h" +#include "port/atomics.h" +#include +#include +#include +#include + +#define TestAlogProducerCount 1000 +#define TestAlogQueueSize 1200 +#define TestAlogBuffSize 40960 +#define TestAlogFileSize 102400000 + +#ifdef Assert +#undef Assert +#endif + +#define Assert assert + +typedef struct TestAuditLogQueue +{ + pid_t q_pid; + int q_size; + char q_lock; + volatile int q_head; + volatile int q_tail; + char q_area[FLEXIBLE_ARRAY_MEMBER]; +} AlogQueue; + +static int shared_queue_idx[TestAlogProducerCount] = { 0 }; +static AlogQueue * shared_queue [TestAlogProducerCount] = { 0 }; +static AlogQueue * local_cache = NULL; +static char * alog_file_name = "test_alog.txt"; +static FILE * alog_file_fp = NULL; + +static char * alog_queue_offset_to(AlogQueue * queue, int offset); +static bool alog_queue_is_full(int q_size, int q_head, int q_tail); +static bool alog_queue_is_empty(int q_size, int q_head, int q_tail); +static bool alog_queue_is_enough(int q_size, int q_head, int q_tail, int N); +static int alog_queue_remain(int q_size, int q_head, int q_tail); +static int alog_queue_used(int q_size, int q_head, int q_tail); +static bool alog_queue_push(AlogQueue * queue, char * buff, int len); +static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, 
char * buff2, int len2); +static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n); +static int alog_queue_get_str_len(AlogQueue * queue, int offset); +static void alog_queue_clear_str_len(AlogQueue * queue, int offset); +static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to); +static bool alog_queue_pop_to_file(AlogQueue * from, FILE * logfile); +static int alog_write_log_file(const char *buffer, int count, FILE * logfile); +static int alog_random_string(char buff[TestAlogBuffSize]); + +int test_alog(); +int test_alog0(); + + +/* -------------------------------- + * AlogQueue routines + * -------------------------------- + */ + +/* + * Get a write pointer in queue + */ +static char * alog_queue_offset_to(AlogQueue * queue, int offset) +{ + char * start = (char *) queue; + + Assert(offset >= 0 && offset < queue->q_size); + + start += offsetof(AlogQueue, q_area); + start += offset; + + return start; +} + +static bool alog_queue_is_full(int q_size, int q_head, int q_tail) +{ + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + if ((q_tail + 1) % q_size == q_head) + { + return true; + } + else + { + return false; + } +} + +static bool alog_queue_is_empty(int q_size, int q_head, int q_tail) +{ + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + if (q_tail == q_head) + { + return true; + } + else + { + return false; + } +} + +/* + * how many bytes already in used + */ + +static int alog_queue_used(int q_size, int q_head, int q_tail) +{ + int used = (q_tail - q_head + q_size) % q_size; + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + return used; +} + + +/* + * how many bytes remain in Queue + */ +static int alog_queue_remain(int q_size, int q_head, int q_tail) +{ + int remain = (q_head - q_tail + q_size - 1) % q_size; + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + Assert(remain == (q_size - 1) - ((q_tail - q_head + q_size) % q_size)); + Assert(remain == (q_size - 1) - alog_queue_used(q_size, q_head, q_tail)); + + return remain; +} + +/* + * whether queue has enough space for N bytes ? 
+ */ +static bool alog_queue_is_enough(int q_size, int q_head, int q_tail, int N) +{ + int remain = alog_queue_remain(q_size, q_head, q_tail); + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0 && N > 0); + Assert(q_head < q_size && q_tail < q_size); + + if (remain > N) + { + return true; + } + + return false; +} + +/* + * write buff to queue + * + * len = size(int) + strlen(str) + * + */ +static bool alog_queue_push(AlogQueue * queue, char * buff, int len) +{ + char * buff_array [] = { buff }; + int len_array [] = { len }; + + return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); +} + +/* + * write buff1 and buff2 to queue + */ +static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * buff2, int len2) +{ + char * buff_array[] = {buff1, buff2}; + int len_array[] = {len1, len2}; + + return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); +} + +static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) +{ + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; + + int q_head_before = q_head; + int q_tail_before = q_tail; + int q_size_before = q_size; + + int q_used_before = 0; + int q_used_after = 0; + + int total_len = 0; + int i = 0; + + for (i = 0; i < n; i++) + { + total_len += len[i]; + } + + pg_memory_barrier(); + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + Assert(buff != NULL && len != 0 && n > 0 && total_len > 0); + + q_used_before = alog_queue_used(q_size_before, q_head_before, q_tail_before); + + if (alog_queue_is_full(q_size, q_head, q_tail)) + { + return false; + } + + if (!alog_queue_is_enough(q_size, q_head, q_tail, total_len)) + { + return false; + } + + for (i = 0; i < n; i++) + { + char * curr_buff = buff[i]; + int curr_len = len[i]; + + /* has enough space, write directly */ + if (q_size - q_tail >= curr_len) + { + char * p_start = alog_queue_offset_to(queue, q_tail); + memcpy(p_start, curr_buff, curr_len); + } + else + { + /* must write as two parts */ + int first_len = q_size - q_tail; + int second_len = curr_len - first_len; + + char * first_buf = curr_buff + 0; + char * second_buf = curr_buff + first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < q_size); + + /* 01. write the first parts into the tail of queue->q_area */ + p_start = alog_queue_offset_to(queue, q_tail); + memcpy(p_start, first_buf, first_len); + + Assert((q_tail + first_len) % q_size == 0); + + /* 02. 
write the remain parts into the head of queue->q_area */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(p_start, second_buf, second_len); + } + + q_tail = (q_tail + curr_len) % q_size; + } + + queue->q_tail = q_tail; + + q_used_after = alog_queue_used(q_size, q_head, q_tail); + Assert(q_used_before + total_len == q_used_after); + + return true; +} + +/* + * |<- strlen value ->|<- string message content ->| + * | | + * | | + * |<------------------ buff --------------------->| + * + * len = size(int) + strlen(str) + * + */ +static int alog_queue_get_str_len(AlogQueue * queue, int offset) +{ + volatile int q_size = queue->q_size; + char buff[sizeof(int)] = { '\0' }; + int len = 0; + + pg_memory_barrier(); + + Assert(offset >= 0 && offset < q_size); + + /* read len directly */ + if (q_size - offset >= sizeof(int)) + { + char * q_start = alog_queue_offset_to(queue, offset); + memcpy(buff, q_start, sizeof(int)); + } + else + { + /* must read as two parts */ + int first_len = q_size - offset; + int second_len = sizeof(int) - first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < sizeof(int)); + + /* 01. copy the first parts */ + p_start = alog_queue_offset_to(queue, offset); + memcpy(buff, p_start, first_len); + + /* 02. copy the remain parts */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(buff + first_len, p_start, second_len); + } + + memcpy((char *)(&len), buff, sizeof(int)); + + Assert(len > 0 && len < q_size); + + return len; +} + +static void alog_queue_clear_str_len(AlogQueue * queue, int offset) +{ + volatile int q_size = queue->q_size; + char buff[sizeof(int)] = { '\0' }; + + pg_memory_barrier(); + + Assert(offset >= 0 && offset < q_size); + + /* read len directly */ + if (q_size - offset >= sizeof(int)) + { + char * q_start = alog_queue_offset_to(queue, offset); + memcpy(q_start, buff, sizeof(int)); + } + else + { + /* must read as two parts */ + int first_len = q_size - offset; + int second_len = sizeof(int) - first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < sizeof(int)); + + /* 01. copy the first parts */ + p_start = alog_queue_offset_to(queue, offset); + memcpy(p_start, buff, first_len); + + /* 02. 
copy the remain parts */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(p_start, buff, second_len); + } +} + +/* + * copy message from queue to another as much as possible + * + * |<- strlen value ->|<- string message content ->| + * | | + * | | + * |<------------------ buff --------------------->| + * + * len = size(int) + strlen(str) + * + */ +static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to) +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + volatile int q_to_head = to->q_head; + volatile int q_to_tail = to->q_tail; + volatile int q_to_size = to->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int to_head = q_to_head; + int to_tail = q_to_tail; + int to_size = q_to_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + int to_used = 0; + int to_copyed = 0; + + pg_memory_barrier(); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + to_used = alog_queue_used(to_size, to_head, to_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + + Assert(to_size > 0 && to_head >= 0 && to_tail >= 0); + Assert(to_head < to_size && to_tail < to_size && to_used <= to_size); + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* to is full, can not write */ + if (alog_queue_is_full(to_size, to_head, to_tail)) + { + return false; + } + + /* copy message into queue until to is full or from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + Assert(string_len > 0 && string_len < from_size); + Assert(copy_len > 0 && copy_len < from_size); + + if (!alog_queue_is_enough(to_size, to_head, to_tail, copy_len)) + { + break; + } + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head); + if (!alog_queue_push(to, p_start, copy_len)) + { + break; + } + } + else + { + /* must copy as two parts */ + int first_len = from_size - from_head; + int second_len = copy_len - first_len; + char * p_first_start = NULL; + char * p_second_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + p_first_start = alog_queue_offset_to(from, from_head); + p_second_start = alog_queue_offset_to(from, 0); + + /* 01. 
copy the content parts into the tail of to->q_area */ + if (!alog_queue_push2(to, p_first_start, first_len, p_second_start, second_len)) + { + break; + } + } + + from_head = (from_head + copy_len) % from_size; + to_tail = (to_tail + copy_len) % to_size; + + from_copyed += copy_len; + to_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(to_used + copy_len <= to_size); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + Assert(to_used + copy_len == alog_queue_used(to_size, to_head, to_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + to_used = alog_queue_used(to_size, to_head, to_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; +} + +/* + * copy message from queue to file as much as possible + */ +static bool alog_queue_pop_to_file(AlogQueue * from, FILE * logfile) +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + pg_memory_barrier(); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* copy message into file until from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + + /* only copy message content, not write message len */ + alog_write_log_file(p_start, string_len, logfile); + } + else if (from_size - from_head > sizeof(int)) + { + /* must copy as two parts */ + int first_len = from_size - from_head - sizeof(int); + int second_len = string_len - first_len; + char * p_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + alog_write_log_file(p_start, first_len, logfile); + + p_start = alog_queue_offset_to(from, 0); + alog_write_log_file(p_start, second_len, logfile); + } + else + { + /* just copy content only */ + int cpy_offset = (from_head + sizeof(int)) % from_size; + char * p_start = alog_queue_offset_to(from, cpy_offset); + + Assert(from_size - from_head <= sizeof(int)); + alog_write_log_file(p_start, string_len, logfile); + } + + from_head = (from_head + copy_len) % from_size; + from_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; +} + +static int +alog_write_log_file(const char *buffer, int count, FILE * logfile) +{ + int rc = 0; + rc = fwrite(buffer, 1, count, logfile); + + /* can't use ereport here because of possible recursion */ + if (rc != count) + { + printf("could not write to 
audit log file: %s\n", strerror(errno)); + return -1; + } + + return 0; +} + +static AlogQueue * +alog_make_queue(int q_size_kb) +{ + AlogQueue * queue = NULL; + Size alogSize = 0; + + alogSize = offsetof(AlogQueue, q_area); + alogSize = alogSize + q_size_kb * 1024; + + queue = (AlogQueue *)malloc(alogSize); + if (queue == NULL) + { + return NULL; + } + + memset(queue, 0, alogSize); + + queue->q_pid = 0; + queue->q_size = q_size_kb * 1024; + queue->q_lock = 0; + queue->q_head = 0; + queue->q_tail = 0; + + return queue; +} + +static FILE * +alog_open_log_file(const char *filename, const char *mode) +{ + FILE *fh = NULL; + mode_t oumask = 0; + + oumask = umask((mode_t) ((~(S_IWUSR | S_IRUSR | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + } + + return fh; +} + +static int alog_random_string(char buff[TestAlogBuffSize]) +{ + int i = 0; + + char letter[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; + + int len = rand() % TestAlogBuffSize; + + buff[0] = '\0'; + + if (len == 0) + { + len += (TestAlogBuffSize/10); + } + else if (len < 0) + { + len *= -1; + } + + if (len >= TestAlogBuffSize - 10) + { + len = TestAlogBuffSize - 10; + } + + // len = 100; + + memcpy(buff, (char *)(&len), sizeof(int)); + for (i = 0; i < len - 1; i++) + { + int j = i % sizeof(letter); + buff[sizeof(int) + i] = letter[j]; + } + + buff[sizeof(int) + len - 1] = '\n'; + + return sizeof(int) + len; +} + +static void * alog_producer(void * para) +{ + int * idx = (int *) para; + char buff[TestAlogBuffSize] = { '0' }; + + srand(time(NULL)); + + while (1) + { + int len = alog_random_string(buff); + AlogQueue * queue = shared_queue [*idx]; + + while (!alog_queue_push(queue, buff, len)) + { + usleep(10000); + } + } + + return NULL; +} + +static void * alog_consumer(void * para) +{ + while (1) + { + int i = 0; + + for (i = 0; i < TestAlogProducerCount; i++) + { + alog_queue_pop_to_queue(shared_queue[i], local_cache); + + if (0) + { + if (ftell(alog_file_fp) >= TestAlogFileSize * 1024L) + { + FILE * fh = alog_open_log_file(alog_file_name, "w"); + fclose(alog_file_fp); + alog_file_fp = fh; + } + + alog_queue_pop_to_file(shared_queue[i], alog_file_fp); + } + } + } + return NULL; +} + +static void * alog_writer(void * para) +{ + FILE * file = alog_file_fp; + + while (1) + { + if (1) + { + if (ftell(file) >= TestAlogFileSize * 1024L) + { + FILE * fh = alog_open_log_file(alog_file_name, "w"); + fclose(file); + file = fh; + } + + alog_queue_pop_to_file(local_cache, file); + } + } + + return NULL; +} + +enum MT_thr_detach +{ + MT_THR_JOINABLE, + MT_THR_DETACHED +}; + +static int32 CreateThread(void *(*f) (void *), void *arg, int32 mode) +{ + + pthread_attr_t attr; + pthread_t threadid; + int ret = 0; + + pthread_attr_init(&attr); + switch (mode) + { + case MT_THR_JOINABLE: + { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + break; + } + case MT_THR_DETACHED: + { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + break; + } + default: + break; + } + ret = pthread_create(&threadid, &attr, f, arg); + return ret; +} + +int test_alog() +{ + int queue_size_kb = TestAlogQueueSize; + + int i = 0; + + for (i = 0; i < TestAlogProducerCount; i++) + { + shared_queue[i] = alog_make_queue(queue_size_kb); + shared_queue_idx[i] = i; + } + + local_cache = alog_make_queue(queue_size_kb); + + alog_file_fp = 
alog_open_log_file(alog_file_name, "a"); + + CreateThread(alog_writer, NULL ,MT_THR_DETACHED); + + for (i = 0; i < TestAlogProducerCount; i++) + { + CreateThread(alog_producer, (void *) (&(shared_queue_idx[i])), MT_THR_DETACHED); + } + + alog_consumer(NULL); + + return 0; +} + +int test_alog0() +{ + + int queue_size_kb = TestAlogQueueSize; + AlogQueue * q0 = NULL; + AlogQueue * q1 = NULL; + + char buff[TestAlogBuffSize] = { '0' }; + int len = 0; + + srand(time(NULL)); + + q0 = alog_make_queue(queue_size_kb); + q1 = alog_make_queue(queue_size_kb); + + do + { + len = alog_random_string(buff); + } while (alog_queue_push(q0, buff, len)); + + alog_queue_pop_to_queue(q0, q1); + + do + { + FILE * file = alog_open_log_file(alog_file_name, "a"); + alog_queue_pop_to_file(q1, file); + } while(0); + + return 0; +} + diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index e2bf4172..54c10d4f 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -215,69 +215,69 @@ heap_page_items(PG_FUNCTION_ARGS) #else values[10] = UInt8GetDatum(tuphdr->t_hoff); #endif - /* Copy raw tuple data into bytea attribute */ - tuple_data_len = lp_len - tuphdr->t_hoff; - tuple_data_bytea = (bytea *) palloc(tuple_data_len + VARHDRSZ); - SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); - memcpy(VARDATA(tuple_data_bytea), (char *) tuphdr + tuphdr->t_hoff, - tuple_data_len); - values[14] = PointerGetDatum(tuple_data_bytea); - - /* - * We already checked that the item is completely within the raw - * page passed to us, with the length given in the line pointer. - * Let's check that t_hoff doesn't point over lp_len, before using - * it to access t_bits and oid. - */ - if (tuphdr->t_hoff >= SizeofHeapTupleHeader && - tuphdr->t_hoff <= lp_len && - tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) - { - if (tuphdr->t_infomask & HEAP_HASNULL) - { - int bits_len; - - bits_len = - ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; - values[12] = CStringGetTextDatum( - bits_to_text(tuphdr->t_bits, bits_len)); - } - else - nulls[12] = true; - - if (tuphdr->t_infomask & HEAP_HASOID) - values[13] = HeapTupleHeaderGetOid(tuphdr); - else - nulls[13] = true; - } - else - { - nulls[12] = true; - nulls[13] = true; - } - } - else - { - /* - * The line pointer is not used, or it's invalid. Set the rest of - * the fields to NULL - */ - int i; - - for (i = 4; i <= 13; i++) - nulls[i] = true; - } - - /* Build and return the result tuple. */ - resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); - result = HeapTupleGetDatum(resultTuple); - - inter_call_data->offset++; - - SRF_RETURN_NEXT(fctx, result); - } - else - SRF_RETURN_DONE(fctx); + /* Copy raw tuple data into bytea attribute */ + tuple_data_len = lp_len - tuphdr->t_hoff; + tuple_data_bytea = (bytea *) palloc(tuple_data_len + VARHDRSZ); + SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); + memcpy(VARDATA(tuple_data_bytea), (char *) tuphdr + tuphdr->t_hoff, + tuple_data_len); + values[14] = PointerGetDatum(tuple_data_bytea); + + /* + * We already checked that the item is completely within the raw + * page passed to us, with the length given in the line pointer. + * Let's check that t_hoff doesn't point over lp_len, before using + * it to access t_bits and oid. 
+ */ + if (tuphdr->t_hoff >= SizeofHeapTupleHeader && + tuphdr->t_hoff <= lp_len && + tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) + { + if (tuphdr->t_infomask & HEAP_HASNULL) + { + int bits_len; + + bits_len = + ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; + values[12] = CStringGetTextDatum( + bits_to_text(tuphdr->t_bits, bits_len)); + } + else + nulls[12] = true; + + if (tuphdr->t_infomask & HEAP_HASOID) + values[13] = HeapTupleHeaderGetOid(tuphdr); + else + nulls[13] = true; + } + else + { + nulls[12] = true; + nulls[13] = true; + } + } + else + { + /* + * The line pointer is not used, or it's invalid. Set the rest of + * the fields to NULL + */ + int i; + + for (i = 4; i <= 14; i++) + nulls[i] = true; + } + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + inter_call_data->offset++; + + SRF_RETURN_NEXT(fctx, result); + } + else + SRF_RETURN_DONE(fctx); } /* diff --git a/contrib/pgxc_ctl/make_signature b/contrib/pgxc_ctl/make_signature old mode 100644 new mode 100755 diff --git a/contrib/tbase_gts_tools/Makefile b/contrib/tbase_gts_tools/Makefile new file mode 100644 index 00000000..4b82be9d --- /dev/null +++ b/contrib/tbase_gts_tools/Makefile @@ -0,0 +1,23 @@ +# contrib/tbase_gts_tools/Makefile +MODULES = tbase_gts_tools + +## extension name +EXTENSION = tbase_gts_tools + +## SQL script installed by the extension +DATA = tbase_gts_tools--1.0.sql + +## extension description +PGFILEDESC = "tbase_gts_tools - GTS wrapper for Tbase" + +### the following are the standard PG extension build rules +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) ## load environment parameters +else +subdir = contrib/tbase_gts_tools +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql b/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql new file mode 100644 index 00000000..92d0f1f4 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql @@ -0,0 +1,104 @@ +/* contrib/tbase_gts/tbase_gts_tools--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "create EXTENSION tbase_gts_tools" to load this file.
\quit + +CREATE FUNCTION txid_gts(int) +RETURNS bigint +AS 'MODULE_PATHNAME', 'txid_gts' +LANGUAGE C STRICT; + +-- +-- heap_page_items_with_gts() +-- according to heap_page_items_with_gts() from pageinspect--1.5.sql +-- +CREATE FUNCTION heap_page_items_with_gts(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_with_gts' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_ids(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_ids' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_items_without_data(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_without_data' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_items_with_gts_log(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_with_gts_log' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/tbase_gts_tools/tbase_gts_tools.c b/contrib/tbase_gts_tools/tbase_gts_tools.c new file mode 100644 index 00000000..5de20a39 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools.c @@ -0,0 +1,357 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "catalog/pg_type.h" +#include "catalog/namespace.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" +#include "utils/builtins.h" +#include "utils/elog.h" +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "storage/bufmgr.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +static Datum +items(PG_FUNCTION_ARGS, int log_level, bool with_data, bool only_id); + +/* + * bits_to_text + * + * Converts a bits8-array of 'len' bits to a human-readable + * c-string representation. + */ +static char * +bits_to_text(bits8 *bits, int len) +{ + int i; + char *str; + + str = palloc(len + 1); + + for (i = 0; i < len; i++) + str[i] = (bits[(i / 8)] & (1 << (i % 8))) ? 
'1' : '0'; + + str[i] = '\0'; + + return str; +} + +PG_FUNCTION_INFO_V1(txid_gts); + +Datum +txid_gts(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_UINT32(0); + TimestampTz gts; + bool found = false; + + if (TransactionIdIsNormal(xid)) + { + found = TransactionIdGetCommitTsData(xid, >s, NULL); + } + + if (!found) + { + PG_RETURN_NULL(); + } + + PG_RETURN_INT64(gts); +} + +/* + * heap_page_items_with_gts + * + * Allows inspection of line pointers and tuple headers of a heap page. + */ +PG_FUNCTION_INFO_V1(heap_page_items_with_gts); + +typedef struct heap_page_items_state +{ + TupleDesc tupd; + Page page; + uint16 offset; +} heap_page_items_state; + +Datum +heap_page_items_with_gts(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 0, true, false); +} + +PG_FUNCTION_INFO_V1(heap_page_items_with_gts_log); + +Datum +heap_page_items_with_gts_log(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, true, false); +} + +PG_FUNCTION_INFO_V1(heap_page_ids); + +Datum +heap_page_ids(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, false, true); +} + +PG_FUNCTION_INFO_V1(heap_page_items_without_data); + +Datum +heap_page_items_without_data(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, false, false); +} + +static Datum +items(PG_FUNCTION_ARGS, int log_level, bool with_data, bool only_id) +{ + bytea *raw_page; + int raw_page_size; + heap_page_items_state *inter_call_data = NULL; + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext mctx; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + raw_page = PG_GETARG_BYTEA_P(0); + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + if (raw_page_size < SizeOfPageHeaderData) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("input page too small (%d bytes)", raw_page_size))); + } + + inter_call_data = palloc(sizeof(heap_page_items_state)); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + + inter_call_data->tupd = tupdesc; + + inter_call_data->offset = FirstOffsetNumber; + inter_call_data->page = VARDATA(raw_page); + + fctx->max_calls = PageGetMaxOffsetNumber(inter_call_data->page); + fctx->user_fctx = inter_call_data; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + inter_call_data = fctx->user_fctx; + + if (fctx->call_cntr < fctx->max_calls) + { + Page page = inter_call_data->page; + HeapTuple resultTuple; + Datum result; + ItemId id; + Datum values[17]; + bool nulls[17]; + uint16 lp_offset; + uint16 lp_flags; + uint16 lp_len; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + /* Extract information from the line pointer */ + + id = PageGetItemId(page, inter_call_data->offset); + + lp_offset = ItemIdGetOffset(id); + lp_flags = ItemIdGetFlags(id); + lp_len = ItemIdGetLength(id); + + values[0] = UInt16GetDatum(inter_call_data->offset); + values[1] = UInt16GetDatum(lp_offset); + values[2] = UInt16GetDatum(lp_flags); + values[3] = UInt16GetDatum(lp_len); + + /* + * We do just enough validity checking to make sure we don't reference + * data outside the page passed to us. The page could be corrupt in + * many other ways, but at least we won't crash. 
+ */ + if (!only_id && + ItemIdHasStorage(id) && + lp_len >= MinHeapTupleSize && + lp_offset == MAXALIGN(lp_offset) && + lp_offset + lp_len <= BLCKSZ) + { + HeapTupleHeader tuphdr; + bytea *tuple_data_bytea; + int tuple_data_len; + + /* Extract information from the tuple header */ + + tuphdr = (HeapTupleHeader)PageGetItem(page, id); + + values[4] = UInt32GetDatum(HeapTupleHeaderGetRawXmin(tuphdr)); + values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr)); + values[6] = Int64GetDatum(HeapTupleHeaderGetXminTimestamp(tuphdr)); + values[7] = Int64GetDatum(HeapTupleHeaderGetXmaxTimestamp(tuphdr)); + + /* shared with xvac */ + values[8] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); + values[9] = PointerGetDatum(&tuphdr->t_ctid); + values[10] = UInt32GetDatum(tuphdr->t_infomask2); + values[11] = UInt32GetDatum(tuphdr->t_infomask); +#ifdef _MIGRATE_ + values[12] = Int32GetDatum(tuphdr->t_shardid); + values[13] = UInt8GetDatum(tuphdr->t_hoff); +#else + values[12] = UInt8GetDatum(tuphdr->t_hoff); +#endif + + if (with_data) + { + /* Copy raw tuple data into bytea attribute */ + tuple_data_len = lp_len - tuphdr->t_hoff; + tuple_data_bytea = (bytea *)palloc(tuple_data_len + VARHDRSZ); + SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); + memcpy(VARDATA(tuple_data_bytea), (char *)tuphdr + tuphdr->t_hoff, + tuple_data_len); + values[16] = PointerGetDatum(tuple_data_bytea); + } + else + { + nulls[16] = true; + } + + /* + * We already checked that the item is completely within the raw + * page passed to us, with the length given in the line pointer. + * Let's check that t_hoff doesn't point over lp_len, before using + * it to access t_bits and oid. + */ + if (tuphdr->t_hoff >= SizeofHeapTupleHeader && + tuphdr->t_hoff <= lp_len && + tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) + { + if (tuphdr->t_infomask & HEAP_HASNULL) + { + int bits_len; + + bits_len = + ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; + values[14] = CStringGetTextDatum( + bits_to_text(tuphdr->t_bits, bits_len)); + } + else + { + nulls[14] = true; + } + + if (tuphdr->t_infomask & HEAP_HASOID) + { + values[15] = HeapTupleHeaderGetOid(tuphdr); + } + else + { + nulls[15] = true; + } + } + else + { + nulls[14] = true; + nulls[15] = true; + } + } + else + { + /* + * The line pointer is not used, or it's invalid. 
Set the rest of + * the fields to NULL + */ + int i; + + for (i = 4; i <= 16; i++) + nulls[i] = true; + } + + if (log_level > 0) + { + elog(LOG, "heap_page_items_with_gts_log: null[0~16] = " + "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", + nulls[0], nulls[1], nulls[2], nulls[3], + nulls[4], nulls[5], nulls[6], nulls[7], + nulls[8], nulls[9], nulls[10], nulls[11], + nulls[12], nulls[13], nulls[14], nulls[15], + nulls[16]); + + if (only_id) + { + elog(LOG, "heap_page_items_with_gts_log: " + "lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d \n", + DatumGetUInt16(values[0]), + DatumGetUInt16(values[1]), + DatumGetUInt16(values[2]), + DatumGetUInt16(values[3])); + } + else + { + elog(LOG, "heap_page_items_with_gts_log: " + "lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d " + "t_xmin=%u t_xmax=%u t_xmin_gts=%ld t_xmax_gts=%ld " + "t_field3=%u t_infomask2=%u t_infomask=%u " + "t_share=%d t_hoff=%d t_oid=%u " + "\n", + DatumGetUInt16(values[0]), + DatumGetUInt16(values[1]), + DatumGetUInt16(values[2]), + DatumGetUInt16(values[3]), + DatumGetUInt32(values[4]), + DatumGetUInt32(values[5]), + DatumGetInt64(values[6]), + DatumGetInt64(values[7]), + DatumGetUInt32(values[8]), + /* ignore tid */ + DatumGetUInt32(values[10]), + DatumGetUInt32(values[11]), + DatumGetInt32(values[12]), + DatumGetUInt8(values[13]), + /* ignore text */ + (Oid)values[15] + /* + * ignore oid + * ignore byte + */ + ); + } + } + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + inter_call_data->offset++; + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} diff --git a/contrib/tbase_gts_tools/tbase_gts_tools.control b/contrib/tbase_gts_tools/tbase_gts_tools.control new file mode 100644 index 00000000..a7b6e7f3 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools.control @@ -0,0 +1,5 @@ +# tbase_gts_tools extension +comment = 'GTS wrapper for Tbase' +default_version = '1.0' +module_pathname = '$libdir/tbase_gts_tools' +relocatable = true diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 303e7a88..450e2594 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -338,41 +338,41 @@ xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) void xact_desc(StringInfo buf, XLogReaderState *record) { - char *rec = XLogRecGetData(record); - uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; - - if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) - { - xl_xact_commit *xlrec = (xl_xact_commit *) rec; - - xact_desc_commit(buf, XLogRecGetInfo(record), xlrec, - XLogRecGetOrigin(record)); - } - else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) - { - xl_xact_abort *xlrec = (xl_xact_abort *) rec; - - xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); - } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; - - /* - * Note that we ignore the WAL record's xid, since we're more - * interested in the top-level xid that issued the record and which - * xids are being reported here. 
- */ - appendStringInfo(buf, "xtop %u: ", xlrec->xtop); - xact_desc_assignment(buf, xlrec); - } - #ifdef __TBASE__ - else if (info == XLOG_XACT_ACQUIRE_GTS) - { - xl_xact_acquire_gts *xlrec = (xl_xact_acquire_gts *) rec; - appendStringInfo(buf, "acquire global timestamp "INT64_FORMAT" ", xlrec->global_timestamp); - } - #endif + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) rec; + + xact_desc_commit(buf, XLogRecGetInfo(record), xlrec, + XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) rec; + + xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* + * Note that we ignore the WAL record's xid, since we're more + * interested in the top-level xid that issued the record and which + * xids are being reported here. + */ + appendStringInfo(buf, "xtop %u: ", xlrec->xtop); + xact_desc_assignment(buf, xlrec); + } +#ifdef __TBASE__ + else if (info == XLOG_XACT_ACQUIRE_GTS) + { + xl_xact_acquire_gts *xlrec = (xl_xact_acquire_gts *) rec; + appendStringInfo(buf, "acquire global timestamp "INT64_FORMAT" ", xlrec->global_timestamp); + } +#endif } const char * diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index d7140d25..0a950156 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -208,63 +208,64 @@ TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, else newestXact = xid; #endif - /* - * We split the xids to set the timestamp to in groups belonging to the - * same SLRU page; the first element in each such set is its head. The - * first group has the main XID as the head; subsequent sets use the first - * subxid not on the previous page as head. This way, we only have to - * lock/modify each SLRU page once. - */ - for (i = 0, headxid = xid;;) - { - int pageno = TransactionIdToCTsPage(headxid); - int j; - - for (j = i; j < nsubxids; j++) - { - if(enable_committs_print) - { - elog(LOG, "TransactionTreeSetCommitTsData, subxid xid %d i %d j %d nsubxids %d", subxids[j], i, j, nsubxids); - } - - if (TransactionIdToCTsPage(subxids[j]) != pageno) - { - if(enable_committs_print) - { - elog(LOG, "break pageno %d subxid xid %d j %d", pageno, subxids[j], j); - } - break; - } - } - /* subxids[i..j] are on the same page as the head */ - if(j - i > 0) - { - SetXidCommitTsInPage(headxid, j - i, subxids + i, global_timestamp, timestamp, nodeid, - pageno, lsn); - } - else - { - SetXidCommitTsInPage(headxid, 0, NULL, global_timestamp, timestamp, nodeid, - pageno, lsn); - } - - if(enable_committs_print) - { - elog(LOG, "set committs data pageno %d xid %d head xid %d j-i %d i %d nsubxids %d committs "INT64_FORMAT, pageno, xid, headxid, j-i, - i, nsubxids, global_timestamp); - } - - /* if we wrote out all subxids, we're done. */ - if (j + 1 > nsubxids) - break; - - /* - * Set the new head and skip over it, as well as over the subxids we - * just wrote. - */ - headxid = subxids[j]; - i = j + 1; - } + /* + * We split the xids to set the timestamp to in groups belonging to the + * same SLRU page; the first element in each such set is its head. 
The + * first group has the main XID as the head; subsequent sets use the first + * subxid not on the previous page as head. This way, we only have to + * lock/modify each SLRU page once. + */ + for (i = 0, headxid = xid;;) + { + int pageno = TransactionIdToCTsPage(headxid); + int j; + + for (j = i; j < nsubxids; j++) + { + if(enable_committs_print) + { + elog(LOG, "TransactionTreeSetCommitTsData, subxid xid %d i %d j %d nsubxids %d", subxids[j], i, j, nsubxids); + } + + if (TransactionIdToCTsPage(subxids[j]) != pageno) + { + if(enable_committs_print) + { + elog(LOG, "break pageno %d subxid xid %d j %d", pageno, subxids[j], j); + } + break; + } + } + /* subxids[i..j] are on the same page as the head */ + if(j - i > 0) + { + SetXidCommitTsInPage(headxid, j - i, subxids + i, global_timestamp, timestamp, nodeid, + pageno, lsn); + } + else + { + SetXidCommitTsInPage(headxid, 0, NULL, global_timestamp, timestamp, nodeid, + pageno, lsn); + } + + if(enable_committs_print) + { + elog(LOG, + "TransactionTreeSetCommitTsData: set committs data pageno %d xid %d head xid %d j-i %d i %d nsubxids %d committs "INT64_FORMAT, + pageno, xid, headxid, j - i, i, nsubxids, global_timestamp); + } + + /* if we wrote out all subxids, we're done. */ + if (j + 1 > nsubxids) + break; + + /* + * Set the new head and skip over it, as well as over the subxids we + * just wrote. + */ + headxid = subxids[j]; + i = j + 1; + } #if 0 /* update the cached value in shared memory */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -317,21 +318,21 @@ static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz gts, TimestampTz ts, RepOriginId nodeid, int partitionno, int slotno, XLogRecPtr lsn) { - int entryno = TransactionIdToCTsEntry(xid); - CommitTimestampEntry entry; - -// Assert(TransactionIdIsNormal(xid)); - if(enable_committs_print) - { - elog(LOG, "TransactionIdSetCommitTs xid %d", xid); - } - entry.global_timestamp = gts; - entry.time = ts; - entry.nodeid = nodeid; - - memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + - SizeOfCommitTimestampEntry * entryno, - &entry, SizeOfCommitTimestampEntry); + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + +// Assert(TransactionIdIsNormal(xid)); + if (enable_committs_print) + { + elog(LOG, "TransactionIdSetCommitTs: xid %d gts "INT64_FORMAT, xid, gts); + } + entry.global_timestamp = gts; + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); #ifdef __TBASE__ /* @@ -1191,19 +1192,26 @@ WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids, TransactionId *subxids, TimestampTz global_timestamp, TimestampTz timestamp, RepOriginId nodeid) { - xl_commit_ts_set record; - - record.global_timestamp = global_timestamp; - record.timestamp = timestamp; - record.nodeid = nodeid; - record.mainxid = mainxid; - - XLogBeginInsert(); - XLogRegisterData((char *) &record, - offsetof(xl_commit_ts_set, mainxid) + - sizeof(TransactionId)); - XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); - XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); + xl_commit_ts_set record; + + record.global_timestamp = global_timestamp; + record.timestamp = timestamp; + record.nodeid = nodeid; + record.mainxid = mainxid; + + XLogBeginInsert(); + XLogRegisterData((char *) &record, + offsetof(xl_commit_ts_set, mainxid) + + sizeof(TransactionId)); + XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); + 
XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); + + if (enable_committs_print) + { + elog(LOG, + "WriteSetTimestampXlogRec: mainxid %d timestamp "INT64_FORMAT" global_timestamp "INT64_FORMAT, + mainxid, timestamp, global_timestamp); + } } /* diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 81bb209f..981332b6 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1382,36 +1382,28 @@ GetGlobalTimestampGTM(void) if (log_gtm_stats) ShowUsageCommon("BeginTranGTM", &start_r, &start_t); -retry: - - latest_gts = GetLatestCommitTS(); - if (gts_result.gts != InvalidGlobalTimestamp && latest_gts > (gts_result.gts + GTM_CHECK_DELTA)) - { - if(retries < 3) - { - retries++; - goto retry; - } - - elog(ERROR, "global gts:%lu is earlier than local gts:%lu, please check GTM status!", gts_result.gts + GTM_CHECK_DELTA, latest_gts); - } + latest_gts = GetLatestCommitTS(); + if (gts_result.gts != InvalidGlobalTimestamp && latest_gts > (gts_result.gts + GTM_CHECK_DELTA)) + { + elog(ERROR, "global gts:%lu is earlier than local gts:%lu, please check GTM status!", gts_result.gts + GTM_CHECK_DELTA, latest_gts); + } - /* if we are standby, use timestamp subtracting given interval */ - if (IsStandbyPostgres() && query_delay) - { - GTM_Timestamp interval = query_delay * USECS_PER_SEC; + /* if we are standby, use timestamp subtracting given interval */ + if (IsStandbyPostgres() && query_delay) + { + GTM_Timestamp interval = query_delay * USECS_PER_SEC; - gts_result.gts = gts_result.gts - interval; + gts_result.gts = gts_result.gts - interval; - if (gts_result.gts < FirstGlobalTimestamp) - { - gts_result.gts = FirstGlobalTimestamp; - } - } + if (gts_result.gts < FirstGlobalTimestamp) + { + gts_result.gts = FirstGlobalTimestamp; + } + } - GTM_ReadOnly = gts_result.gtm_readonly; - - return gts_result.gts; + GTM_ReadOnly = gts_result.gtm_readonly; + + return gts_result.gts; } #endif diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5fc12531..efb5aade 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -86,8 +86,6 @@ int vacuum_multixact_freeze_min_age; int vacuum_multixact_freeze_table_age; int vacuum_defer_freeze_min_age; - - /* A few variables that don't seem worth passing around as parameters */ static MemoryContext vac_context = NULL; static BufferAccessStrategy vac_strategy; @@ -1938,7 +1936,7 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) relname = RelationGetRelationName(onerel); nspname = get_namespace_name(RelationGetNamespace(onerel)); - elog(LOG, "Getting relation statistics for %s.%s", nspname, relname); + elog(DEBUG5, "Getting relation statistics for %s.%s", nspname, relname); #ifdef __TBASE__ if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 1e72ec49..f1508d69 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -34,8 +34,10 @@ */ #include "postgres.h" +#include #include +#include "access/commit_ts.h" #include "access/genam.h" #include "access/heapam.h" #include "access/heapam_xlog.h" @@ -45,6 +47,7 @@ #include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xlogutils.h" +#include "bootstrap/bootstrap.h" #include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" @@ -53,7 +56,9 @@ #include "miscadmin.h" #include "pgstat.h" #include "portability/instr_time.h" 
+#include "postmaster/auditlogger.h" #include "postmaster/autovacuum.h" +#include "postmaster/postmaster.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -135,6 +140,13 @@ typedef struct LVRelStats bool lock_waiter_detected; } LVRelStats; +int gts_maintain_option; + +static void PrintStack(void); +static void PrintData(RelFileNode *rnode, + BlockNumber blkno, Page page, OffsetNumber lineoff, + GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts); +static void MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer); /* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; @@ -991,192 +1003,198 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); #endif - empty_pages++; - UnlockReleaseBuffer(buf); - RecordNewPageWithFullFreeSpace(onerel, blkno); - } - else - { - UnlockReleaseBuffer(buf); - freespace = PageGetHeapFreeSpace(page); - RecordPageWithFreeSpace(onerel, blkno, freespace); - MarkBufferDirty(buf); - } -#endif - continue; - } - - if (PageIsEmpty(page)) - { - empty_pages++; - freespace = PageGetHeapFreeSpace(page); - - /* empty pages are always all-visible and all-frozen */ - if (!PageIsAllVisible(page)) - { - START_CRIT_SECTION(); - - /* mark buffer dirty before writing a WAL record */ - MarkBufferDirty(buf); - - /* - * It's possible that another backend has extended the heap, - * initialized the page, and then failed to WAL-log the page - * due to an ERROR. Since heap extension is not WAL-logged, - * recovery might try to replay our record setting the page - * all-visible and find that the page isn't initialized, which - * will cause a PANIC. To prevent that, check whether the - * page has been previously WAL-logged, and if not, do that - * now. - */ - if (RelationNeedsWAL(onerel) && - PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); - - PageSetAllVisible(page); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - END_CRIT_SECTION(); - } - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, blkno, freespace); - continue; - } - - - /* - * Prune all HOT-update chains in this page. - * - * We count tuples removed by the pruning step as removed by VACUUM. - */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, - &vacrelstats->latestRemovedXid); - - /* - * Now scan the page to collect vacuumable items and check for tuples - * requiring freezing. - */ - all_visible = true; - has_dead_tuples = false; - nfrozen = 0; - hastup = false; - prev_dead_count = vacrelstats->num_dead_tuples; - maxoff = PageGetMaxOffsetNumber(page); + empty_pages++; + UnlockReleaseBuffer(buf); + RecordNewPageWithFullFreeSpace(onerel, blkno); + } + else + { + UnlockReleaseBuffer(buf); + freespace = PageGetHeapFreeSpace(page); + RecordPageWithFreeSpace(onerel, blkno, freespace); + MarkBufferDirty(buf); + } +#endif + continue; + } - /* - * Note: If you change anything in the loop below, also look at - * heap_page_is_all_visible to see if that needs to be changed. 
- */ - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; + if (PageIsEmpty(page)) + { + empty_pages++; + freespace = PageGetHeapFreeSpace(page); - itemid = PageGetItemId(page, offnum); + /* empty pages are always all-visible and all-frozen */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page + * due to an ERROR. Since heap extension is not WAL-logged, + * recovery might try to replay our record setting the page + * all-visible and find that the page isn't initialized, which + * will cause a PANIC. To prevent that, check whether the + * page has been previously WAL-logged, and if not, do that + * now. + */ + if (RelationNeedsWAL(onerel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } - /* Unused items require no processing, but we count 'em */ - if (!ItemIdIsUsed(itemid)) - { - nunused += 1; - continue; - } + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); + continue; + } - /* Redirect items mustn't be touched */ - if (ItemIdIsRedirected(itemid)) - { - hastup = true; /* this page won't be truncatable */ - continue; - } +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if (gts_maintain_option != GTS_MAINTAIN_NOTHING) + { + MaintainGTS(&onerel->rd_node, blkno, buf); + } +#endif - ItemPointerSet(&(tuple.t_self), blkno, offnum); + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + &vacrelstats->latestRemovedXid); + + /* + * Now scan the page to collect vacuumable items and check for tuples + * requiring freezing. + */ + all_visible = true; + has_dead_tuples = false; + nfrozen = 0; + hastup = false; + prev_dead_count = vacrelstats->num_dead_tuples; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Note: If you change anything in the loop below, also look at + * heap_page_is_all_visible to see if that needs to be changed. + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; - /* - * DEAD item pointers are to be vacuumed normally; but we don't - * count them in tups_vacuumed, else we'd be double-counting (at - * least in the common case where heap_page_prune() just freed up - * a non-HOT tuple). 
- */ - if (ItemIdIsDead(itemid)) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - all_visible = false; - continue; - } + itemid = PageGetItemId(page, offnum); - Assert(ItemIdIsNormal(itemid)); + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + { + nunused += 1; + continue; + } - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(onerel); + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } - tupgone = false; + ItemPointerSet(&(tuple.t_self), blkno, offnum); - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) - { - case HEAPTUPLE_DEAD: + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; + continue; + } - /* - * Ordinarily, DEAD tuples would have been removed by - * heap_page_prune(), but it's possible that the tuple - * state changed since heap_page_prune() looked. In - * particular an INSERT_IN_PROGRESS tuple could have - * changed to DEAD if the inserter aborted. So this - * cannot be considered an error condition. - * - * If the tuple is HOT-updated then it must only be - * removed by a prune operation; so we keep it just as if - * it were RECENTLY_DEAD. Also, if it's a heap-only - * tuple, we choose to keep it, because it'll be a lot - * cheaper to get rid of it in the next pruning pass than - * to treat it like an indexed tuple. - */ - if (HeapTupleIsHotUpdated(&tuple) || - HeapTupleIsHeapOnly(&tuple)) - nkeep += 1; - else - tupgone = true; /* we can delete the tuple */ - all_visible = false; - break; - case HEAPTUPLE_LIVE: - /* Tuple is good --- but let's do some validity checks */ - if (onerel->rd_rel->relhasoids && - !OidIsValid(HeapTupleGetOid(&tuple))) - elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", - relname, blkno, offnum); + Assert(ItemIdIsNormal(itemid)); - /* - * Is the tuple definitely visible to all transactions? - * - * NB: Like with per-tuple hint bits, we can't set the - * PD_ALL_VISIBLE flag if the inserter committed - * asynchronously. See SetHintBits for more info. Check - * that the tuple is hinted xmin-committed because of - * that. - */ - if (all_visible) - { - TransactionId xmin; + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(onerel); - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) - { - all_visible = false; - break; - } + tupgone = false; - /* - * The inserter definitely committed. But is it old - * enough that everyone sees it as committed? - */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, OldestXmin)) - { - all_visible = false; - break; - } - + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. 
+ * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as if + * it were RECENTLY_DEAD. Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a lot + * cheaper to get rid of it in the next pruning pass than + * to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ + all_visible = false; + break; + case HEAPTUPLE_LIVE: + /* Tuple is good --- but let's do some validity checks */ + if (onerel->rd_rel->relhasoids && + !OidIsValid(HeapTupleGetOid(&tuple))) + elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", + relname, blkno, offnum); + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check + * that the tuple is hinted xmin-committed because of + * that. + */ + if (all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old + * enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ { GlobalTimestamp committs = HeapTupleHderGetXminTimestapAtomic(tuple.t_data); @@ -1203,220 +1221,227 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, } #endif - /* Track newest xmin on page. */ - if (TransactionIdFollows(xmin, visibility_cutoff_xid)) - visibility_cutoff_xid = xmin; - } - break; - case HEAPTUPLE_RECENTLY_DEAD: - - /* - * If tuple is recently deleted then we must not remove it - * from relation. - */ - nkeep += 1; - all_visible = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - if (tupgone) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, - &vacrelstats->latestRemovedXid); - tups_vacuumed += 1; - has_dead_tuples = true; - } - else - { - bool tuple_totally_frozen; - - num_tuples += 1; - hastup = true; - - /* - * Each non-removable tuple must be checked to see if it needs - * freezing. Note we already have exclusive buffer lock. - */ - if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, - MultiXactCutoff, &frozen[nfrozen], - &tuple_totally_frozen)) - frozen[nfrozen++].offset = offnum; - - if (!tuple_totally_frozen) - all_frozen = false; - } - } /* scan along page */ - - /* - * If we froze any tuples, mark the buffer dirty, and write a WAL - * record recording the changes. We must log the changes to be - * crash-safe against future truncation of CLOG. - */ - if (nfrozen > 0) - { - START_CRIT_SECTION(); - - MarkBufferDirty(buf); - - /* execute collected freezes */ - for (i = 0; i < nfrozen; i++) - { - ItemId itemid; - HeapTupleHeader htup; - - itemid = PageGetItemId(page, frozen[i].offset); - htup = (HeapTupleHeader) PageGetItem(page, itemid); + /* Track newest xmin on page. 
*/ + if (TransactionIdFollows(xmin, visibility_cutoff_xid)) + visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must not remove it + * from relation. + */ + nkeep += 1; + all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } - heap_execute_freeze_tuple(htup, &frozen[i]); - } + if (tupgone) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); + tups_vacuumed += 1; + has_dead_tuples = true; + } + else + { + bool tuple_totally_frozen; + + num_tuples += 1; + hastup = true; + + /* + * Each non-removable tuple must be checked to see if it needs + * freezing. Note we already have exclusive buffer lock. + */ + if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, + MultiXactCutoff, &frozen[nfrozen], + &tuple_totally_frozen)) + frozen[nfrozen++].offset = offnum; + + if (!tuple_totally_frozen) + all_frozen = false; + } + } /* scan along page */ + + /* + * If we froze any tuples, mark the buffer dirty, and write a WAL + * record recording the changes. We must log the changes to be + * crash-safe against future truncation of CLOG. + */ + if (nfrozen > 0) + { + START_CRIT_SECTION(); - /* Now WAL-log freezing if necessary */ - if (RelationNeedsWAL(onerel)) - { - XLogRecPtr recptr; + MarkBufferDirty(buf); - recptr = log_heap_freeze(onerel, buf, FreezeLimit, - frozen, nfrozen); - PageSetLSN(page, recptr); - } + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + ItemId itemid; + HeapTupleHeader htup; - END_CRIT_SECTION(); - } + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); - /* - * If there are no indexes then we can vacuum the page right now - * instead of doing a second scan. - */ - if (nindexes == 0 && - vacrelstats->num_dead_tuples > 0) - { - /* Remove tuples from heap */ - lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); - has_dead_tuples = false; + heap_execute_freeze_tuple(htup, &frozen[i]); + } - /* - * Forget the now-vacuumed tuples, and press on, but be careful - * not to reset latestRemovedXid since we want that value to be - * valid. - */ - vacrelstats->num_dead_tuples = 0; - vacuumed_pages++; - } + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; - freespace = PageGetHeapFreeSpace(page); + recptr = log_heap_freeze(onerel, buf, FreezeLimit, + frozen, nfrozen); + PageSetLSN(page, recptr); + } - /* mark page all-visible, if appropriate */ - if (all_visible && !all_visible_according_to_vm) - { - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + END_CRIT_SECTION(); + } - if (all_frozen) - flags |= VISIBILITYMAP_ALL_FROZEN; + /* + * If there are no indexes then we can vacuum the page right now + * instead of doing a second scan. + */ + if (nindexes == 0 && + vacrelstats->num_dead_tuples > 0) + { + /* Remove tuples from heap */ + lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); + has_dead_tuples = false; + + /* + * Forget the now-vacuumed tuples, and press on, but be careful + * not to reset latestRemovedXid since we want that value to be + * valid. 
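+			 * (When the table does have indexes, these TIDs instead stay in
+			 * vacrelstats' dead-tuple array and the heap pages are only cleaned
+			 * up by lazy_vacuum_heap() after the index entries pointing at them
+			 * have been removed.)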
+ */ + vacrelstats->num_dead_tuples = 0; + vacuumed_pages++; + } - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set the both bits - * so that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid, flags); - } + freespace = PageGetHeapFreeSpace(page); - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after we checked it and before we took the buffer - * content lock, so we must recheck before jumping to the conclusion - * that something bad has happened. - */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) - { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - relname, blkno); - visibilitymap_clear(onerel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } + /* mark page all-visible, if appropriate */ + if (all_visible && !all_visible_according_to_vm) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set the both bits + * so that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid, flags); + } - /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. - * - * There should never be dead tuples on a page with PD_ALL_VISIBLE - * set, however. 
- */ - else if (PageIsAllVisible(page) && has_dead_tuples) - { - elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", - relname, blkno); - PageClearAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_clear(onerel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after we checked it and before we took the buffer + * content lock, so we must recheck before jumping to the conclusion + * that something bad has happened. + */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + relname, blkno); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } - /* - * If the all-visible page is turned out to be all-frozen but not - * marked, we should so mark it. Note that all_frozen is only valid - * if all_visible is true, so we must check both. - */ - else if (all_visible_according_to_vm && all_visible && all_frozen && - !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) - { - /* - * We can pass InvalidTransactionId as the cutoff XID here, - * because setting the all-frozen bit doesn't cause recovery - * conflicts. - */ - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_FROZEN); - } + /* + * It's possible for the value returned by GetOldestXmin() to move + * backwards, so it's not wrong for us to see tuples that appear to + * not be visible to everyone yet, while PD_ALL_VISIBLE is already + * set. The real safe xmin value never moves backwards, but + * GetOldestXmin() is conservative and sometimes returns a value + * that's unnecessarily small, so if we see that contradiction it just + * means that the tuples that we think are not visible to everyone yet + * actually are, and the PD_ALL_VISIBLE flag is correct. + * + * There should never be dead tuples on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (PageIsAllVisible(page) && has_dead_tuples) + { + elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", + relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } - UnlockReleaseBuffer(buf); + /* + * If the all-visible page is turned out to be all-frozen but not + * marked, we should so mark it. Note that all_frozen is only valid + * if all_visible is true, so we must check both. + */ + else if (all_visible_according_to_vm && all_visible && all_frozen && + !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) + { + /* + * We can pass InvalidTransactionId as the cutoff XID here, + * because setting the all-frozen bit doesn't cause recovery + * conflicts. 
+ */ + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_FROZEN); + } - /* Remember the location of the last page with nonremovable tuples */ - if (hastup) - vacrelstats->nonempty_pages = blkno + 1; +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if (gts_maintain_option != GTS_MAINTAIN_NOTHING) + { + MaintainGTS(&onerel->rd_node, blkno, buf); + } +#endif - /* - * If we remembered any tuples for deletion, then the page will be - * visited again by lazy_vacuum_heap, which will compute and record - * its post-compaction free space. If not, then we're done with this - * page, so remember its free space as-is. (This path will always be - * taken if there are no indexes.) - */ - if (vacrelstats->num_dead_tuples == prev_dead_count) - RecordPageWithFreeSpace(onerel, blkno, freespace); - } + UnlockReleaseBuffer(buf); + + /* Remember the location of the last page with nonremovable tuples */ + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + + /* + * If we remembered any tuples for deletion, then the page will be + * visited again by lazy_vacuum_heap, which will compute and record + * its post-compaction free space. If not, then we're done with this + * page, so remember its free space as-is. (This path will always be + * taken if there are no indexes.) + */ + if (vacrelstats->num_dead_tuples == prev_dead_count) + RecordPageWithFreeSpace(onerel, blkno, freespace); + } /* report that everything is scanned and vacuumed */ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); @@ -1594,6 +1619,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) errdetail_internal("%s", pg_rusage_show(&ru0)))); } + /* * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. @@ -2655,3 +2681,297 @@ xlog_reinit_extent_pages(RelFileNode rnode, ExtentID eid) } #endif + +#define STACK_SIZE 64 + +/* + * print error stack to maintain_trace file. + */ +static void +PrintStack(void) +{ + void *trace[STACK_SIZE] = {0}; + size_t size = backtrace(trace, STACK_SIZE); + char **symbols = (char **) backtrace_symbols(trace, size); + size_t i = 0; + time_t t = 0; + struct tm *timeInfo = NULL; + + if (symbols == NULL) + { + return; + } + + time(&t); + timeInfo = localtime(&t); + trace_log("Dumping stack starts at %s", asctime(timeInfo)); + trace_log("backtrace() returned %zu addresses.", size); + for (i = 1; i < size; i++) + { + char syscom[MAXPGPATH] = {0}; + FILE *fcmd = NULL; + char temp[MAXPGPATH] = {0}; + + trace_log("#%-2zu %s", i, symbols[i]); + + snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename); + fcmd = popen(syscom, "r"); + if (fcmd == NULL) + { + continue; + } + while (fgets(temp, sizeof(temp), fcmd) != NULL) + { + /* ignore the ending "\n" */ + trace_log(" %.*s", (int) strlen(temp) - 1, temp); + } + pclose(fcmd); + } + trace_log("Dumping stack ends.\n"); + + free(symbols); +} + +/* + * print error data to maintain file. 
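+ *
+ * Together with PrintStack() above, this is invoked from MaintainGTS() when a
+ * tuple's on-page commit timestamp disagrees with the tlog, roughly:
+ *
+ *   if (tuple_xmin_gts != tlog_xmin_gts)
+ *   {
+ *       PrintStack();
+ *       PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0);
+ *   }
+ *
+ * (simplified from the xmin branch of MaintainGTS() below; the real check also
+ * requires the timestamp to be valid, non-local and non-frozen).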
+ */ +static void +PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff, + GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts) +{ + XLogRecPtr lsn = PageGetLSN(page); + PageHeader pagehdr = (PageHeader) page; + ItemId id = PageGetItemId(page, lineoff); + uint16 lp_offset = 0; + uint16 lp_flags = 0; + uint16 lp_len = 0; + HeapTupleHeader tuphdr = NULL; + TransactionId xmin = 0; + TransactionId xmax = 0; + GlobalTimestamp tuple_xmin_gts = 0; + GlobalTimestamp tuple_xmax_gts = 0; + uint32 cid = 0; + uint32 infomask2 = 0; + uint32 infomask = 0; + ShardID shardid = 0; + uint8 hoff = 0; + time_t t = 0; + struct tm *timeInfo = NULL; + + lp_offset = ItemIdGetOffset(id); + lp_flags = ItemIdGetFlags(id); + lp_len = ItemIdGetLength(id); + + tuphdr = (HeapTupleHeader)PageGetItem(page, id); + xmin = HeapTupleHeaderGetRawXmin(tuphdr); + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + tuple_xmin_gts = HeapTupleHeaderGetXminTimestamp(tuphdr); + tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestamp(tuphdr); + cid = HeapTupleHeaderGetRawCommandId(tuphdr); + infomask2 = tuphdr->t_infomask2; + infomask = tuphdr->t_infomask; + shardid = tuphdr->t_shardid; + hoff = tuphdr->t_hoff; + + time(&t); + timeInfo = localtime(&t); + trace_log("Printing error data starts at %s", asctime(timeInfo)); + trace_log("relfilenode %u pageno %u \n" + "page: lsn=" UINT64_FORMAT " " + "checksum=%d flags=%d shard=%d " + "lower=%d upper=%d special=%d " + "pagesize=%zu version=%d " + "prune_xid=%u \n" + "item: lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d \n" + "heaptuple header: t_xmin=%u t_xmax=%u t_xmin_gts=%ld t_xmax_gts=%ld " + "t_cid=%u t_infomask2=%u t_infomask=%u " + "t_shareid=%d t_hoff=%d \n" + "tlog: tlog_xmin_gts=%ld tlog_xmax_gts=%ld", + rnode->relNode, blkno, + lsn, + pagehdr->pd_checksum, pagehdr->pd_flags, pagehdr->pd_shard, + pagehdr->pd_lower, pagehdr->pd_upper, pagehdr->pd_special, + PageGetPageSize(page), (uint16) PageGetPageLayoutVersion(page), + pagehdr->pd_prune_xid, + lineoff, lp_offset, lp_flags, lp_len, + xmin, xmax, tuple_xmin_gts, tuple_xmax_gts, + cid, infomask2, infomask, + shardid, hoff, + tlog_xmin_gts, tlog_xmax_gts); + if (tlog_xmax_gts == 0) + { + trace_log("xmin_gts in tuple is not equal to xmin_gts in tlog!"); + } + else + { + trace_log("xmax_gts in tuple is not equal to xmax_gts in tlog!"); + } + trace_log("Printing error data ends.\n"); +} + +/* + * MaintainGTS() -- check and reset gts in tuples according to gts in tlog. + * + * Buffer must be pinned and exclusive-locked. (If caller does not hold + * exclusive lock, then somebody could be in process of writing the buffer, + * leading to risk of bad data written to disk. + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * gts_maintain_option = 0: GTS_MAINTAIN_NOTHING, do nothing. + * gts_maintain_option = 1: GTS_MAINTAIN_VACUUM_CHECK, check correctness of GTS + * while doing vacuum. + * gts_maintain_option = 2: GTS_MAINTAIN_VACUUM_RESET, check correctness of GTS + * and reset it according to tlog if it is wrong while + * doing vacuum. + */ +void +MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) +{ + Page page; + int lines; + OffsetNumber lineoff; + ItemId itemid; + bool changed = false; + bool reset = false; + + /* GTS is only used for normal user tables, not systems table and any index. 
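+ *
+ * Callers gate on the same setting: the vacuum-side call in lazy_scan_heap()
+ * is, roughly,
+ *
+ *   if (gts_maintain_option != GTS_MAINTAIN_NOTHING)
+ *       MaintainGTS(&onerel->rd_node, blkno, buf);
+ *
+ * so with GTS_MAINTAIN_NOTHING the work is skipped both at the call site and
+ * by the early return below.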
*/ + if (GTS_MAINTAIN_NOTHING == gts_maintain_option) + { + return; + } + + if (!PostmasterIsPrimaryAndNormal()) + { + return; + } + + if (!RelationHasGTS(rnode->spcNode, rnode->relNode)) + { + return; + } + + if (GTS_MAINTAIN_VACUUM_RESET == gts_maintain_option) + { + reset = true; + } + + page = BufferGetPage(buffer); + lines = PageGetMaxOffsetNumber(page); + for (lineoff = FirstOffsetNumber, itemid = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, itemid++) + { + HeapTupleHeader tuphdr; + TransactionId xmin = InvalidTransactionId; + TransactionId xmax = InvalidTransactionId; + GlobalTimestamp tlog_xmin_gts = InvalidGlobalTimestamp; + GlobalTimestamp tlog_xmax_gts = InvalidGlobalTimestamp; + + if (!ItemIdIsNormal(itemid)) + { + continue; + } + + tuphdr = (HeapTupleHeader) PageGetItem(page, itemid); + + xmin = HeapTupleHeaderGetRawXmin(tuphdr); + if (TransactionIdIsNormal(xmin) && + HeapTupleHeaderXminCommitted(tuphdr) && + !HeapTupleHeaderXminFrozen(tuphdr)) + { + GlobalTimestamp tuple_xmin_gts = HeapTupleHeaderGetXminTimestampAtomic(tuphdr); + + if (GlobalTimestampIsValid(tuple_xmin_gts) + && !CommitTimestampIsLocal(tuple_xmin_gts) + && !GlobalTimestampIsFrozen(tuple_xmin_gts) + && TransactionIdGetCommitTsData(xmin, &tlog_xmin_gts, NULL) + && tuple_xmin_gts != tlog_xmin_gts) + { + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d, " + "xmin %u xmin_gts "INT64_FORMAT" " + "in tuple is not equal to xmin_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + tuphdr->t_ctid.ip_blkid.bi_hi, + tuphdr->t_ctid.ip_blkid.bi_lo, + tuphdr->t_ctid.ip_posid, + tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, + xmin, tuple_xmin_gts, tlog_xmin_gts); + + PrintStack(); + PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); + + if (reset) + { + changed = true; + HeapTupleHeaderSetXminTimestampAtomic(tuphdr, tlog_xmin_gts); + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u xmin %u xmin_gts "INT64_FORMAT" " + "in tuple has been reset to xmin_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + xmin, tuple_xmin_gts, + HeapTupleHeaderGetXminTimestamp(tuphdr)); + } + } + } + + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + if (TransactionIdIsNormal(xmax) && + HeapTupleHeaderXmaxCommitted(tuphdr)) + { + GlobalTimestamp tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestampAtomic(tuphdr); + + if (GlobalTimestampIsValid(tuple_xmax_gts) + && !CommitTimestampIsLocal(tuple_xmax_gts) + && !GlobalTimestampIsFrozen(tuple_xmax_gts) + && TransactionIdGetCommitTsData(xmax, &tlog_xmax_gts, NULL) + && tuple_xmax_gts != tlog_xmax_gts) + { + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d " + "xid %u xmax %u xmax_gts "INT64_FORMAT" " + "in tuple is not equal to xmax_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + tuphdr->t_ctid.ip_blkid.bi_hi, + tuphdr->t_ctid.ip_blkid.bi_lo, + tuphdr->t_ctid.ip_posid, + tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, + HeapTupleHeaderGetUpdateXid(tuphdr), xmax, tuple_xmax_gts, + tlog_xmax_gts); + + PrintStack(); + PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); + + if (reset) + { + changed = true; + HeapTupleHeaderSetXmaxTimestampAtomic(tuphdr, tlog_xmax_gts); + elog(WARNING, + "relfilenode " + "%u pageno %u lineoff %u " + "xmax %u xmax_gts "INT64_FORMAT" " + "in tuple has been reset to xmax_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + xmax, 
tuple_xmax_gts, + HeapTupleHeaderGetXminTimestamp(tuphdr)); + } + } + } + } + + if (changed) + { + MarkBufferDirtyHint(buffer, true); + } +} diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 32041d32..679afb31 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -45,6 +45,7 @@ const char *progname; +const char *exename; static void startup_hacks(const char *progname); @@ -58,33 +59,37 @@ static void check_root(const char *progname); */ int main(int argc, char *argv[]) -{// #lizard forgives - bool do_check_root = true; - - progname = get_progname(argv[0]); - - /* - * Platform-specific startup hacks - */ - startup_hacks(progname); - - /* - * Remember the physical location of the initially given argv[] array for - * possible use by ps display. On some platforms, the argv[] storage must - * be overwritten in order to set the process title for ps. In such cases - * save_ps_display_args makes and returns a new copy of the argv[] array. - * - * save_ps_display_args may also move the environment strings to make - * extra room. Therefore this should be done as early as possible during - * startup, to avoid entanglements with code that might save a getenv() - * result pointer. - */ - argv = save_ps_display_args(argc, argv); - - /* - * If supported on the current platform, set up a handler to be called if - * the backend/postmaster crashes with a fatal signal or exception. - */ +{ + bool do_check_root = true; + + progname = get_progname(argv[0]); + /* + * Make a copy. Leaks memory, but called only once. + */ + exename = strdup(argv[0]); + + /* + * Platform-specific startup hacks + */ + startup_hacks(progname); + + /* + * Remember the physical location of the initially given argv[] array for + * possible use by ps display. On some platforms, the argv[] storage must + * be overwritten in order to set the process title for ps. In such cases + * save_ps_display_args makes and returns a new copy of the argv[] array. + * + * save_ps_display_args may also move the environment strings to make + * extra room. Therefore this should be done as early as possible during + * startup, to avoid entanglements with code that might save a getenv() + * result pointer. + */ + argv = save_ps_display_args(argc, argv); + + /* + * If supported on the current platform, set up a handler to be called if + * the backend/postmaster crashes with a fatal signal or exception. 
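+ *
+ * (The exename copy taken near the top of main() serves a related diagnostic
+ * purpose: the maintain-trace stack dumper in vacuumlazy.c resolves frame
+ * addresses against it, roughly
+ *
+ *   snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename);
+ *
+ * which is why the copy must be made before argv[] can be clobbered by
+ * save_ps_display_args().)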
+ */ #if defined(WIN32) && defined(HAVE_MINIDUMP_TYPE) pgwin32_install_crashdump_handler(); #endif diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 73cf905c..ca079833 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -9805,19 +9805,19 @@ TryPingUnhealthyNode(Oid nodeoid) void PoolPingNodeRecheck(Oid nodeoid) { - int status; - NodeDefinition *nodeDef; - char connstr[MAXPGPATH * 2 + 256]; - bool healthy; - const char *username = NULL; - char *errstr = NULL; - - username = get_user_name(&errstr); - if (errstr != NULL) - { - elog(WARNING, "Could not get current username errmsg: %s", errstr); - return; - } + int status; + NodeDefinition *nodeDef; + char connstr[MAXPGPATH * 2 + 256]; + bool healthy; + const char *username = NULL; + char *errstr = NULL; + + username = get_user_name(&errstr); + if (errstr != NULL) + { + elog(WARNING, "Could not get current username errmsg: %s", errstr); + return; + } nodeDef = PgxcNodeGetDefinition(nodeoid); if (nodeDef == NULL) diff --git a/src/backend/postmaster/auditlogger.c b/src/backend/postmaster/auditlogger.c index 5cf9325c..5aea9c14 100644 --- a/src/backend/postmaster/auditlogger.c +++ b/src/backend/postmaster/auditlogger.c @@ -146,12 +146,12 @@ typedef struct AuditLogQueue { - pid_t q_pid; - int q_size; - slock_t q_lock; - volatile int q_head; - volatile int q_tail; - char q_area[FLEXIBLE_ARRAY_MEMBER]; + pid_t q_pid; + int q_size; + slock_t q_lock; + volatile int q_head; + volatile int q_tail; + char q_area[FLEXIBLE_ARRAY_MEMBER]; } AlogQueue; typedef struct AuditLogQueueArray @@ -163,16 +163,25 @@ typedef struct AuditLogQueueArray typedef struct AuditLogQueueCache { - ThreadSema q_sema; /* local ThreadSema for CommonLogWriter and FGALogWriter */ - int q_count; - AlogQueue * q_cache[FLEXIBLE_ARRAY_MEMBER]; + /* local ThreadSema for CommonLogWriter, FGALogWriter and TraceLogWriter. 
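+ * The cache sits between the per-backend shared-memory queues and the log
+ * files: a consumer thread drains AlogQueue entries from shared memory into
+ * one of these local caches and then signals the matching writer thread
+ * through q_sema.  A hypothetical hand-off (helper names assumed, not part of
+ * this hunk) might look like
+ *
+ *   alog_queue_push(cache->q_cache[idx], msg, len);   -- assumed helper
+ *   ThreadSemaUp(&cache->q_sema);                      -- assumed helper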
*/ + ThreadSema q_sema; + int q_count; + AlogQueue * q_cache[FLEXIBLE_ARRAY_MEMBER]; } AlogQueueCache; -/* shared memory queue array */ -static AlogQueueArray * AuditCommonLogQueueArray = NULL; /* store common audit logs, each elem for a backend */ -static AlogQueueArray * AuditFGALogQueueArray = NULL; /* store fga audit logs, each elem for a backend */ +/* + * shared memory queue array + * + * store common audit logs, each elem for a backend + */ +static AlogQueueArray * AuditCommonLogQueueArray = NULL; +/* store fga audit logs, each elem for a backend */ +static AlogQueueArray * AuditFGALogQueueArray = NULL; +/* store trace audit logs, each elem for a backend */ +static AlogQueueArray * AuditTraceLogQueueArray = NULL; -/* shared memory bitmap to notify consumers to read audit log from AlogQueueArray above +/* + * shared memory bitmap to notify consumers to read audit log from AlogQueueArray above * each element for one consumer */ static int * AuditConsumerNotifyBitmap = NULL; @@ -180,35 +189,59 @@ static int * AuditConsumerNotifyBitmap = NULL; /* * Postgres backend state, used in postgres backend only * - * Postgres backend write common audit log into AuditCommonLogQueueArray->a_queue[idx] + * Postgres backend write common audit log into AuditCommonLogQueueArray->a_queue[idx] * and write fga audit log info AuditFGALogQueueArray->a_queue[idx] + * and write trace audit log info AuditTraceLogQueueArray->a_queue[idx] * - * Postgres backend acqurie free index by AuditLoggerQueueAcquire + * Postgres backend acqurie free index by AuditLoggerQueueAcquire * */ static int AuditPostgresAlogQueueIndex = 0; -/* Consumer local queue cache for AuditLog_max_worker_number consumers, used in audit logger process only */ -static AlogQueueCache * AuditCommonLogLocalCache = NULL; /* store common audit logs, each elem for a thread Consumer */ -static AlogQueueCache * AuditFGALogLocalCache = NULL; /* store fga audit logs, each elem for a trhead Consumer */ +/* + * Consumer local queue cache for AuditLog_max_worker_number consumers, used in + * audit logger process only. + * + * store common audit logs, each elem for a thread Consumer + */ +static AlogQueueCache * AuditCommonLogLocalCache = NULL; +/* store fga audit logs, each elem for a thread Consumer */ +static AlogQueueCache * AuditFGALogLocalCache = NULL; +/* store trace audit logs, each elem for a thread Consumer */ +static AlogQueueCache * AuditTraceLogLocalCache = NULL; -/* local ThreadSema array for AuditLog_max_worker_number consumers, used in audit logger process only */ -static ThreadSema * AuditConsumerNotifySemas = NULL; /* each elem for a trhead Consumer */ +/* + * local ThreadSema array for AuditLog_max_worker_number consumers, used in audit + * logger process only. + * + * each elem for a thread Consumer. + */ +static ThreadSema * AuditConsumerNotifySemas = NULL; /* * GUC parameters. can change at SIGHUP. 
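+ *
+ * The *_queue_size_kb settings below are per backend: every backend gets its
+ * own common, fga and trace queue in shared memory.  As a rough figure
+ * (ignoring the per-queue header), the new trace queues alone cost about
+ *
+ *   MaxBackends * Maintain_trace_log_queue_size_kb
+ *   e.g. 100 backends * 64 kB  ~=  6.4 MB
+ *
+ * which is what audit_shared_trace_queue_array_size() accounts for below.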
*/ -int AuditLog_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; -int AuditLog_RotationSize = 10 * 1024; -char * AuditLog_filename = NULL; -bool AuditLog_truncate_on_rotation = false; -int AuditLog_file_mode = S_IRUSR | S_IWUSR; - -int AuditLog_max_worker_number = 16; /* max number of worker thead to read audit log */ -int AuditLog_common_log_queue_size_kb = 64; /* size of AlogQueue->q_area for each backend to store common audit log, KB */ -int AuditLog_fga_log_queue_size_kb = 64; /* size of AlogQueue->q_area for each backend to store audit log, KB */ -int AuditLog_common_log_cache_size_kb = 64; /* size of common audit log local buffer for each worker */ -int AuditLog_fga_log_cacae_size_kb = 64; /* size of fga audit log local buffer for eache worker */ +int AuditLog_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int AuditLog_RotationSize = 10 * 1024; +char * AuditLog_filename = NULL; +static char * TraceLog_filename = "maintain-%A-%H.log"; +bool AuditLog_truncate_on_rotation = false; +int AuditLog_file_mode = S_IRUSR | S_IWUSR; + +/* max number of worker thead to read audit log */ +int AuditLog_max_worker_number = 16; +/* size of AlogQueue->q_area for each backend to store common audit log, KB */ +int AuditLog_common_log_queue_size_kb = 64; +/* size of AlogQueue->q_area for each backend to store fga audit log, KB */ +int AuditLog_fga_log_queue_size_kb = 64; +/* size of AlogQueue->q_area for each backend to store trace audit log, KB */ +int Maintain_trace_log_queue_size_kb = 64; +/* size of common audit log local buffer for each worker */ +int AuditLog_common_log_cache_size_kb = 64; +/* size of fga audit log local buffer for each worker */ +int AuditLog_fga_log_cacae_size_kb = 64; +/* size of trace audit log local buffer for each worker */ +int Maintain_trace_log_cache_size_kb = 64; /* * Globally visible state @@ -219,19 +252,24 @@ bool enable_auditlogger_warning = false; /* * Logger Private state */ -static pg_time_t audit_next_rotation_time = 0; -static bool audit_rotation_disabled = false; -static FILE * audit_comm_log_file = NULL; -static FILE * audit_fga_log_file = NULL; -static slock_t audit_comm_log_file_lock; -static slock_t audit_fga_log_file_lock; -NON_EXEC_STATIC pg_time_t audit_first_log_file_time = 0; -static char * audit_last_comm_log_file_name = NULL; -static char * audit_last_fga_log_file_name = NULL; -static char * audit_log_directory = NULL; -static char * audit_curr_log_dir = NULL; -static char * audit_curr_log_file_name = NULL; -static int audit_curr_log_rotation_age = 0; +static pg_time_t audit_next_rotation_time = 0; +static bool audit_rotation_disabled = false; +static FILE * audit_comm_log_file = NULL; +static FILE * audit_fga_log_file = NULL; +static FILE * audit_trace_log_file = NULL; +static slock_t audit_comm_log_file_lock; +static slock_t audit_fga_log_file_lock; +static slock_t audit_trace_log_file_lock; +NON_EXEC_STATIC pg_time_t audit_first_log_file_time = 0; +static char * audit_last_comm_log_file_name = NULL; +static char * audit_last_fga_log_file_name = NULL; +static char * audit_last_trace_log_file_name = NULL; +static char * audit_log_directory = NULL; +static char * trace_log_directory = NULL; +static char * audit_curr_log_dir = NULL; +static char * trace_curr_log_dir = NULL; +static char * audit_curr_log_file_name = NULL; +static int audit_curr_log_rotation_age = 0; /* * Flags set by interrupt handlers for later service in the main loop. 
@@ -273,26 +311,30 @@ static void audit_assign_log_dir(void); #endif #ifdef AuditLog_002_For_ShareMemoryQueue -static Size audit_queue_elem_size(int queue_size_kb); - -static Size audit_shared_queue_array_bitmap_offset(void); -static Size audit_shared_queue_array_header_size(void); -static Size audit_shared_common_queue_elem_size(void); -static Size audit_shared_common_queue_array_size(void); -static Size audit_shared_fga_queue_elem_size(void); -static Size audit_shared_fga_queue_array_size(void); -static Size audit_shared_consumer_bitmap_size(void); -static int audit_shared_consumer_bitmap_get_value(int consumer_id); -static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value); +static Size audit_queue_elem_size(int queue_size_kb); + +static Size audit_shared_queue_array_bitmap_offset(void); +static Size audit_shared_queue_array_header_size(void); +static Size audit_shared_common_queue_elem_size(void); +static Size audit_shared_common_queue_array_size(void); +static Size audit_shared_fga_queue_elem_size(void); +static Size audit_shared_fga_queue_array_size(void); +static Size audit_shared_trace_queue_elem_size(void); +static Size audit_shared_trace_queue_array_size(void); +static Size audit_shared_consumer_bitmap_size(void); +static int audit_shared_consumer_bitmap_get_value(int consumer_id); +static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value); #endif #ifdef AuditLog_003_For_LogFile -static int audit_write_log_file(const char *buffer, int count, int destination); -static FILE * audit_open_log_file(const char *filename, const char *mode, bool allow_errors); -static void aduit_open_fga_log_file(void); -static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for); -static char * audit_log_file_getname(pg_time_t timestamp, const char *suffix); -static void audit_set_next_rotation_time(void); +static int audit_write_log_file(const char *buffer, int count, int destination); +static FILE * audit_open_log_file(const char *filename, const char *mode, bool allow_errors); +static void audit_open_fga_log_file(void); +static void audit_open_trace_log_file(void); +static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for); +static char * audit_log_file_getname(pg_time_t timestamp, const char *suffix); +static char * trace_log_file_getname(pg_time_t timestamp, const char *suffix); +static void audit_set_next_rotation_time(void); #endif #ifdef AuditLog_004_For_QueueReadWrite @@ -314,10 +356,12 @@ static bool alog_queue_pop_to_file(AlogQueue * from, int destination); #endif #ifdef AuditLog_005_For_ThreadWorker -static AlogQueue * alog_get_shared_common_queue(int idx); -static AlogQueue * alog_get_shared_fga_queue(int idx); -static AlogQueue * alog_get_local_common_cache(int consumer_id); -static AlogQueue * alog_get_local_fga_cache(int consumer_id); +static AlogQueue * alog_get_shared_common_queue(int idx); +static AlogQueue * alog_get_shared_fga_queue(int idx); +static AlogQueue * alog_get_shared_trace_queue(int idx); +static AlogQueue * alog_get_local_common_cache(int consumer_id); +static AlogQueue * alog_get_local_fga_cache(int consumer_id); +static AlogQueue * alog_get_local_trace_cache(int consumer_id); static AlogQueueCache * alog_make_local_cache(int cache_number, int queue_size_kb); static ThreadSema * alog_make_consumer_semas(int consumer_count); static void alog_consumer_wakeup(int consumer_id); @@ -484,71 +528,75 @@ AuditLoggerMain(int argc, char *argv[]) static void audit_logger_MainLoop(void) { - /* 
- * Create log directory if not present; ignore errors - */ - audit_assign_log_dir(); - mkdir(audit_log_directory, S_IRWXU); - - /* - * Remember active logfile's name. We recompute this from the reference - * time because passing down just the pg_time_t is a lot cheaper than - * passing a whole file path in the EXEC_BACKEND case. - */ - audit_first_log_file_time = time(NULL); - audit_last_comm_log_file_name = audit_log_file_getname(audit_first_log_file_time, NULL); - audit_comm_log_file = audit_open_log_file(audit_last_comm_log_file_name, "a", false); - aduit_open_fga_log_file(); - - /* remember active logfile parameters */ - audit_curr_log_dir = pstrdup(audit_log_directory); - audit_curr_log_file_name = pstrdup(AuditLog_filename); - audit_curr_log_rotation_age = AuditLog_RotationAge; - - SpinLockInit(&(audit_comm_log_file_lock)); - SpinLockInit(&(audit_fga_log_file_lock)); - - /* set next planned rotation time */ - audit_set_next_rotation_time(); - - /* start consumer and writer thread*/ - alog_start_all_worker(); - - /* main worker loop */ - while (PostmasterIsAlive()) - { - int rc = 0; - - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - audit_process_sighup(); - audit_process_sigusr1(); - audit_process_sigusr2(); - audit_process_sigterm(); - audit_process_sigint(); - audit_process_sigquit(); - audit_process_rotate(); - audit_process_wakeup(false); - - rc = WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - AUDIT_LATCH_MICROSEC, - WAIT_EVENT_AUDIT_LOGGER_MAIN); - - if (rc & WL_POSTMASTER_DEATH) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("audit logger exit after postmaster die"))); - exit(2); - } - else if (rc & WL_TIMEOUT) - { - audit_consume_requested = true; - audit_process_wakeup(true); - } - } + /* + * Create log directory if not present; ignore errors + */ + audit_assign_log_dir(); + mkdir(audit_log_directory, S_IRWXU); + mkdir(trace_log_directory, S_IRWXU); + + /* + * Remember active logfile's name. We recompute this from the reference + * time because passing down just the pg_time_t is a lot cheaper than + * passing a whole file path in the EXEC_BACKEND case. 
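+	 *
+	 * The trace file name comes from TraceLog_filename ("maintain-%A-%H.log");
+	 * assuming trace_log_file_getname() expands it with pg_strftime() the way
+	 * the audit file names are built, %A/%H become weekday and hour-of-day,
+	 * e.g. "maintain-Monday-15.log", so at most 7 * 24 distinct names are
+	 * produced before they repeat.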
+ */ + audit_first_log_file_time = time(NULL); + audit_last_comm_log_file_name = audit_log_file_getname(audit_first_log_file_time, NULL); + audit_comm_log_file = audit_open_log_file(audit_last_comm_log_file_name, "a", false); + audit_open_fga_log_file(); + audit_open_trace_log_file(); + + /* remember active logfile parameters */ + audit_curr_log_dir = pstrdup(audit_log_directory); + trace_curr_log_dir= pstrdup(trace_log_directory); + audit_curr_log_file_name = pstrdup(AuditLog_filename); + audit_curr_log_rotation_age = AuditLog_RotationAge; + + SpinLockInit(&(audit_comm_log_file_lock)); + SpinLockInit(&(audit_fga_log_file_lock)); + SpinLockInit(&(audit_trace_log_file_lock)); + + /* set next planned rotation time */ + audit_set_next_rotation_time(); + + /* start consumer and writer thread*/ + alog_start_all_worker(); + + /* main worker loop */ + while (PostmasterIsAlive()) + { + int rc = 0; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + audit_process_sighup(); + audit_process_sigusr1(); + audit_process_sigusr2(); + audit_process_sigterm(); + audit_process_sigint(); + audit_process_sigquit(); + audit_process_rotate(); + audit_process_wakeup(false); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + AUDIT_LATCH_MICROSEC, + WAIT_EVENT_AUDIT_LOGGER_MAIN); + + if (rc & WL_POSTMASTER_DEATH) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("audit logger exit after postmaster die"))); + exit(2); + } + else if (rc & WL_TIMEOUT) + { + audit_consume_requested = true; + audit_process_wakeup(true); + } + } } /* -------------------------------- @@ -765,101 +813,112 @@ static void audit_process_sigquit(void) } static void audit_process_rotate(void) -{// #lizard forgives - bool time_based_rotation = false; - int size_rotation_for = 0; - pg_time_t now = MyStartTime; - - if (AuditLog_RotationAge > 0 && !audit_rotation_disabled) - { - /* Do a logfile rotation if it's time */ - now = (pg_time_t) time(NULL); - if (now >= audit_next_rotation_time) - audit_rotation_requested = time_based_rotation = true; - } - - if (!audit_rotation_requested && AuditLog_RotationSize > 0 && !audit_rotation_disabled) - { - /* Do a rotation if file is too big */ - if (ftell(audit_comm_log_file) >= AuditLog_RotationSize * 1024L) - { - audit_rotation_requested = true; - size_rotation_for |= AUDIT_COMMON_LOG; - } - - if (audit_fga_log_file != NULL && - ftell(audit_fga_log_file) >= AuditLog_RotationSize * 1024L) - { - audit_rotation_requested = true; - size_rotation_for |= AUDIT_FGA_LOG; - } - } - - if (audit_rotation_requested) - { - /* - * Force rotation when both values are zero. It means the request - * was sent by pg_rotate_log_file. 
- */ - if (!time_based_rotation && size_rotation_for == 0) - size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG; - audit_rotate_log_file(time_based_rotation, size_rotation_for); - } +{ + bool time_based_rotation = false; + int size_rotation_for = 0; + pg_time_t now = MyStartTime; + + if (AuditLog_RotationAge > 0 && !audit_rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= audit_next_rotation_time) + audit_rotation_requested = time_based_rotation = true; + } + + if (!audit_rotation_requested && AuditLog_RotationSize > 0 && !audit_rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(audit_comm_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= AUDIT_COMMON_LOG; + } + + if (audit_fga_log_file != NULL && + ftell(audit_fga_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= AUDIT_FGA_LOG; + } + + if (audit_trace_log_file != NULL && + ftell(audit_trace_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= MAINTAIN_TRACE_LOG; + } + } + + if (audit_rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_log_file. + */ + if (!time_based_rotation && size_rotation_for == 0) + { + size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG | MAINTAIN_TRACE_LOG; + } + audit_rotate_log_file(time_based_rotation, size_rotation_for); + } } /* * if any audit log was coming, * wakeup a consumer to read */ -static void audit_process_wakeup(bool timeout) -{// #lizard forgives - if (audit_consume_requested) - { - int i = 0; - - if (timeout) - { - for (i = 0; i < MaxBackends; i++) - { - int sharedIdx = i; - int consumer_id = sharedIdx % AuditLog_max_worker_number; - - AlogQueue * shared_common_queue = alog_get_shared_common_queue(sharedIdx); - AlogQueue * shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); - - bool b_common_is_empty = alog_queue_is_empty2(shared_common_queue); - bool b_fga_is_empty = alog_queue_is_empty2(shared_fga_queue); - - pg_memory_barrier(); - - if (!b_common_is_empty || !b_fga_is_empty) - { - if (!audit_shared_consumer_bitmap_get_value(consumer_id)) - { - audit_shared_consumer_bitmap_set_value(consumer_id, 1); - alog_consumer_wakeup(consumer_id); - } - } - } - } - else - { - for (i = 0; i < AuditLog_max_worker_number; i++) - { - int consumer_id = i; - int bitmap_value = audit_shared_consumer_bitmap_get_value(consumer_id); - - pg_memory_barrier(); - - if (bitmap_value) - { - alog_consumer_wakeup(consumer_id); - } - } - } - - audit_consume_requested = false; - } +static void audit_process_wakeup(bool timeout) +{ + if (audit_consume_requested) + { + int i = 0; + + if (timeout) + { + for (i = 0; i < MaxBackends; i++) + { + int sharedIdx = i; + int consumer_id = sharedIdx % AuditLog_max_worker_number; + + AlogQueue * shared_common_queue = alog_get_shared_common_queue(sharedIdx); + AlogQueue * shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); + AlogQueue * shared_trace_queue = alog_get_shared_trace_queue(sharedIdx); + + bool b_common_is_empty = alog_queue_is_empty2(shared_common_queue); + bool b_fga_is_empty = alog_queue_is_empty2(shared_fga_queue); + bool b_trace_is_empty = alog_queue_is_empty2(shared_trace_queue); + + pg_memory_barrier(); + + if (!b_common_is_empty || !b_fga_is_empty || !b_trace_is_empty) + { + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + 
audit_shared_consumer_bitmap_set_value(consumer_id, 1); + alog_consumer_wakeup(consumer_id); + } + } + } + } + else + { + for (i = 0; i < AuditLog_max_worker_number; i++) + { + int consumer_id = i; + int bitmap_value = audit_shared_consumer_bitmap_get_value(consumer_id); + + pg_memory_barrier(); + + if (bitmap_value) + { + alog_consumer_wakeup(consumer_id); + } + } + } + + audit_consume_requested = false; + } } #endif @@ -887,16 +946,19 @@ static void audit_process_wakeup(bool timeout) * | | | | | | * | | | | |<-------- audit_shared_common_queue_elem_size() * MaxBackends ---------->| * |<- 4B -> |<------ 4B ------>|<- 4 * MaxBackends B ->|<- AUDIT_BITMAP_SIZE ->|<--------- audit_shared_fga_queue_elem_size() * MaxBackends ------------>| + * | | | | |<-------- audit_shared_trace_queue_elem_size() * MaxBackends ----------->| * | | | * | | | - * |<------------------------- Shared Log Queue Array Header ------------------>|<------------------- AlogQueue Array [MaxBackends] --------------------->| + * |<------------------------- Shared Log Queue Array Header ------------------>|<------------------- AlogQueue Array [MaxBackends] --------------------->| * * * 02. AlogQueue as follows * - * | q_area -> char[AuditLog_common_log_queue_size_kb * BYTES_PER_KB] | + * | q_area -> char[AuditLog_common_log_queue_size_kb * BYTES_PER_KB] | * | q_pid | q_size | q_lock | q_head | q_tail | OR | * | q_area -> char[AuditLog_fga_log_queue_size_kb * BYTES_PER_KB] | + * | OR | + * | q_area -> char[Maintain_trace_log_queue_size_kb * BYTES_PER_KB] | * * -------------------------------- */ @@ -905,13 +967,13 @@ static Size audit_shared_queue_array_bitmap_offset(void) { Size alogQueueArrayBitmapOffset = 0; - /* store AlogQueueArray->a_count, a_bitmap */ - alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, - offsetof(AlogQueueArray, a_queue)); + /* store AlogQueueArray->a_count, a_bitmap */ + alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, + offsetof(AlogQueueArray, a_queue)); - /* store AlogQueueArray->a_queue */ - alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, - mul_size(sizeof(AlogQueue *), MaxBackends)); + /* store AlogQueueArray->a_queue */ + alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, + mul_size(sizeof(AlogQueue *), MaxBackends)); return alogQueueArrayBitmapOffset; } @@ -920,13 +982,13 @@ static Size audit_shared_queue_array_header_size(void) { Size alogQueueArrayHeaderSize = 0; - /* store AlogQueueArray->a_count, a_bitmap, a_queue */ - alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, - audit_shared_queue_array_bitmap_offset()); + /* store AlogQueueArray->a_count, a_bitmap, a_queue */ + alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, + audit_shared_queue_array_bitmap_offset()); - /* store content of AlogQueueArray->a_bitmap */ - alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, - AUDIT_BITMAP_SIZE); + /* store content of AlogQueueArray->a_bitmap */ + alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, + AUDIT_BITMAP_SIZE); return alogQueueArrayHeaderSize; } @@ -989,6 +1051,30 @@ static Size audit_shared_fga_queue_array_size(void) return alogFgaQueueSize; } +static Size audit_shared_trace_queue_elem_size(void) +{ + Size alogTraceQueueItemSize = 0; + + /* store content of trace audit log */ + alogTraceQueueItemSize = audit_queue_elem_size(Maintain_trace_log_queue_size_kb); + + return alogTraceQueueItemSize; +} + +static Size audit_shared_trace_queue_array_size(void) +{ + Size alogTraceQueueSize = 
0; + + /* store content of trace audit log */ + alogTraceQueueSize = audit_shared_trace_queue_elem_size(); + alogTraceQueueSize = mul_size(alogTraceQueueSize, MaxBackends); + + alogTraceQueueSize = add_size(alogTraceQueueSize, + audit_shared_queue_array_header_size()); + + return alogTraceQueueSize; +} + static Size audit_shared_consumer_bitmap_size(void) { Size alogConsumerBitmapSize = 0; @@ -1012,10 +1098,11 @@ static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value) Size AuditLoggerShmemSize(void) { - Size size = 0; - Size alogCommonQueueSize = 0; - Size alogFgaQueueSize = 0; - Size alogConsumerBmpSize = 0; + Size size = 0; + Size alogCommonQueueSize = 0; + Size alogFgaQueueSize = 0; + Size alogTraceQueueSize = 0; + Size alogConsumerBmpSize = 0; /* for common audit log */ alogCommonQueueSize = audit_shared_common_queue_array_size(); @@ -1023,173 +1110,245 @@ Size AuditLoggerShmemSize(void) /* for fga audit log*/ alogFgaQueueSize = audit_shared_fga_queue_array_size(); - /* for consumer notify bitmap */ - alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); + /* for trace audit log */ + alogTraceQueueSize = audit_shared_trace_queue_array_size(); + + /* for consumer notify bitmap */ + alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); - /* for total size */ - size = add_size(alogCommonQueueSize, alogFgaQueueSize); - size = add_size(size, alogConsumerBmpSize); + /* for total size */ + size = add_size(alogCommonQueueSize, alogFgaQueueSize); + size = add_size(size, alogTraceQueueSize); + size = add_size(size, alogConsumerBmpSize); return size; } void AuditLoggerShmemInit(void) -{// #lizard forgives - Size alogBmpOffset = 0; - Size alogHeaderSize = 0; - Size alogItemSize = 0; - Size alogArraySize = 0; - Size alogConsumerBmpSize = 0; - - bool found = false; - int i = 0; - - alogBmpOffset = audit_shared_queue_array_bitmap_offset(); - alogHeaderSize = audit_shared_queue_array_header_size(); - alogItemSize = audit_shared_common_queue_elem_size(); - alogArraySize = audit_shared_common_queue_array_size(); - - AuditCommonLogQueueArray = ShmemInitStruct("Audit Common Log Queue", - alogArraySize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - AlogQueueArray * alogQueueArray = AuditCommonLogQueueArray; - int falogQueueArray = 0; - Size sharedMemSize = 0; - - if (enable_auditlogger_warning) - { - sharedMemSize += alogHeaderSize; - MemSet(alogQueueArray, 'a', alogHeaderSize); - - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - sharedMemSize += audit_shared_common_queue_elem_size(); - MemSet(alogQueueItem, 'b', audit_shared_common_queue_elem_size()); - } - - falogQueueArray = BasicOpenFile("AuditCommonLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); - } - - Assert(sharedMemSize == alogArraySize); - } - - MemSet(alogQueueArray, 0, alogArraySize); - - alogQueueArray->a_count = MaxBackends; - alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, - MaxBackends); - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - - alog_queue_init(alogQueueItem, AuditLog_common_log_queue_size_kb); - - alogQueueArray->a_queue[i] = 
alogQueueItem; - } - - if (enable_auditlogger_warning) - { - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - close(falogQueueArray); - } - } - } - - found = false; - i = 0; - - alogBmpOffset = audit_shared_queue_array_bitmap_offset(); - alogHeaderSize = audit_shared_queue_array_header_size(); - alogItemSize = audit_shared_fga_queue_elem_size(); - alogArraySize = audit_shared_fga_queue_array_size(); - - AuditFGALogQueueArray = ShmemInitStruct("Audit FGA Log Queue", - alogArraySize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - AlogQueueArray * alogQueueArray = AuditFGALogQueueArray; - int falogQueueArray = 0; - Size sharedMemSize = 0; - - if (enable_auditlogger_warning) - { - sharedMemSize += alogHeaderSize; - MemSet(alogQueueArray, 'c', alogHeaderSize); - - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - sharedMemSize += audit_shared_fga_queue_elem_size(); - MemSet(alogQueueItem, 'd', audit_shared_fga_queue_elem_size()); - } - - falogQueueArray = BasicOpenFile("AuditFGALogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); - } - - Assert(sharedMemSize == alogArraySize); - } - - MemSet(alogQueueArray, 0, alogArraySize); - - alogQueueArray->a_count = MaxBackends; - alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, - MaxBackends); - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - - alog_queue_init(alogQueueItem, AuditLog_fga_log_queue_size_kb); - - alogQueueArray->a_queue[i] = alogQueueItem; - } - - if (enable_auditlogger_warning) - { - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - close(falogQueueArray); - } - } - } - - found = false; - i = 0; - - alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); - - AuditConsumerNotifyBitmap = ShmemInitStruct("Audit Consumer Bitmap", - alogConsumerBmpSize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - MemSet(AuditConsumerNotifyBitmap, 0, alogConsumerBmpSize); - } +{ + Size alogBmpOffset = 0; + Size alogHeaderSize = 0; + Size alogItemSize = 0; + Size alogArraySize = 0; + Size alogConsumerBmpSize = 0; + + bool found = false; + int i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_common_queue_elem_size(); + alogArraySize = audit_shared_common_queue_array_size(); + + AuditCommonLogQueueArray = ShmemInitStruct("Audit Common Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditCommonLogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'a', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_common_queue_elem_size(); + MemSet(alogQueueItem, 'b', audit_shared_common_queue_elem_size()); + } + + falogQueueArray = 
BasicOpenFile("AuditCommonLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, AuditLog_common_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_fga_queue_elem_size(); + alogArraySize = audit_shared_fga_queue_array_size(); + + AuditFGALogQueueArray = ShmemInitStruct("Audit FGA Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditFGALogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'c', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_fga_queue_elem_size(); + MemSet(alogQueueItem, 'd', audit_shared_fga_queue_elem_size()); + } + + falogQueueArray = BasicOpenFile("AuditFGALogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, AuditLog_fga_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_trace_queue_elem_size(); + alogArraySize = audit_shared_trace_queue_array_size(); + + AuditTraceLogQueueArray = ShmemInitStruct("Audit Trace Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditTraceLogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'e', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { 
+ AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_trace_queue_elem_size(); + MemSet(alogQueueItem, 'f', audit_shared_trace_queue_elem_size()); + } + + falogQueueArray = BasicOpenFile("AuditTraceLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, Maintain_trace_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); + + AuditConsumerNotifyBitmap = ShmemInitStruct("Audit Consumer Bitmap", + alogConsumerBmpSize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + MemSet(AuditConsumerNotifyBitmap, 0, alogConsumerBmpSize); + } } #endif @@ -1204,7 +1363,7 @@ void AuditLoggerShmemInit(void) /* * Make audit_log_directory from Log_directory */ -static void +static void audit_assign_log_dir(void) { if (audit_log_directory != NULL) @@ -1221,12 +1380,19 @@ audit_assign_log_dir(void) { StringInfoData alog_dir; - memset(&alog_dir, 0, sizeof(alog_dir)); - initStringInfo(&alog_dir); - appendStringInfo(&alog_dir, "%s/audit", Log_directory); + memset(&alog_dir, 0, sizeof(alog_dir)); + initStringInfo(&alog_dir); + appendStringInfo(&alog_dir, "%s/audit", Log_directory); - audit_log_directory = alog_dir.data; - } + audit_log_directory = alog_dir.data; + } + + if (trace_log_directory != NULL) + { + pfree(trace_log_directory); + trace_log_directory = NULL; + } + trace_log_directory = pstrdup("pg_log/maintain"); } /* @@ -1241,15 +1407,19 @@ audit_write_log_file(const char *buffer, int count, int destination) { int rc = 0; - if (destination == AUDIT_FGA_LOG) - { - rc = fwrite(buffer, 1, count, audit_fga_log_file); - } - else - { - Assert(destination == AUDIT_COMMON_LOG); - rc = fwrite(buffer, 1, count, audit_comm_log_file); - } + if (destination == AUDIT_FGA_LOG) + { + rc = fwrite(buffer, 1, count, audit_fga_log_file); + } + else if (destination == MAINTAIN_TRACE_LOG) + { + rc = fwrite(buffer, 1, count, audit_trace_log_file); + } + else + { + Assert(destination == AUDIT_COMMON_LOG); + rc = fwrite(buffer, 1, count, audit_comm_log_file); + } /* can't use ereport here because of possible recursion */ if (rc != count) @@ -1262,7 +1432,7 @@ audit_write_log_file(const char *buffer, int count, int destination) } static void -aduit_open_fga_log_file(void) +audit_open_fga_log_file(void) { char *filename = NULL; @@ -1276,6 +1446,25 @@ aduit_open_fga_log_file(void) audit_last_fga_log_file_name = filename; } + +static void +audit_open_trace_log_file(void) +{ + char *filename = NULL; + + filename = trace_log_file_getname(time(NULL), ".trace"); + + audit_trace_log_file = audit_open_log_file(filename, "a", false); + + if 
(audit_last_trace_log_file_name != NULL) + { + /* probably shouldn't happen */ + pfree(audit_last_trace_log_file_name); + } + + audit_last_trace_log_file_name = filename; +} + /* * Open a new logfile with proper permissions and buffering options. * @@ -1325,128 +1514,192 @@ audit_open_log_file(const char *filename, const char *mode, bool allow_errors) */ static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for) -{// #lizard forgives - char *filename = NULL; - char *fgafilename = NULL; - pg_time_t fntime = 0; - FILE *fh = NULL; - - audit_rotation_requested = false; - - /* - * When doing a time-based rotation, invent the new logfile name based on - * the planned rotation time, not current time, to avoid "slippage" in the - * file name when we don't do the rotation immediately. - */ - if (time_based_rotation) - fntime = audit_next_rotation_time; - else - fntime = time(NULL); - filename = audit_log_file_getname(fntime, NULL); - if (audit_fga_log_file != NULL) - fgafilename = audit_log_file_getname(fntime, ".fga"); - - /* - * Decide whether to overwrite or append. We can overwrite if (a) - * AuditLog_truncate_on_rotation is set, (b) the rotation was triggered by - * elapsed time and not something else, and (c) the computed file name is - * different from what we were previously logging into. - * - * Note: audit_last_comm_log_file_name should never be NULL here, but if it is, append. - */ - if (time_based_rotation || (size_rotation_for & AUDIT_COMMON_LOG)) - { - if (AuditLog_truncate_on_rotation && time_based_rotation && - audit_last_comm_log_file_name != NULL && - strcmp(filename, audit_last_comm_log_file_name) != 0) - fh = audit_open_log_file(filename, "w", true); - else - fh = audit_open_log_file(filename, "a", true); - - if (!fh) - { - /* - * ENFILE/EMFILE are not too surprising on a busy system; just - * keep using the old file till we manage to get a new one. - * Otherwise, assume something's wrong with audit_log_directory and stop - * trying to create files. - */ - if (errno != ENFILE && errno != EMFILE) - { - ereport(LOG, - (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); - audit_rotation_disabled = true; - } - - pfree(filename); - if (fgafilename) - pfree(fgafilename); - return; - } - - SpinLockAcquire(&(audit_comm_log_file_lock)); - fclose(audit_comm_log_file); - audit_comm_log_file = fh; - SpinLockRelease(&(audit_comm_log_file_lock)); - - /* instead of pfree'ing filename, remember it for next time */ - if (audit_last_comm_log_file_name != NULL) - pfree(audit_last_comm_log_file_name); - audit_last_comm_log_file_name = filename; - filename = NULL; - } - - /* Same as above, but for fga audit log file. */ - - if (audit_fga_log_file != NULL && - (time_based_rotation || (size_rotation_for & AUDIT_FGA_LOG))) - { - if (AuditLog_truncate_on_rotation && time_based_rotation && - audit_last_fga_log_file_name != NULL && - strcmp(fgafilename, audit_last_fga_log_file_name) != 0) - fh = audit_open_log_file(fgafilename, "w", true); - else - fh = audit_open_log_file(fgafilename, "a", true); - - if (!fh) - { - /* - * ENFILE/EMFILE are not too surprising on a busy system; just - * keep using the old file till we manage to get a new one. - * Otherwise, assume something's wrong with audit_log_directory and stop - * trying to create files. 
- */ - if (errno != ENFILE && errno != EMFILE) - { - ereport(LOG, - (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); - audit_rotation_disabled = true; - } - - if (filename) - pfree(filename); - pfree(fgafilename); - return; - } - - SpinLockAcquire(&(audit_fga_log_file_lock)); - fclose(audit_fga_log_file); - audit_fga_log_file = fh; - SpinLockRelease(&(audit_fga_log_file_lock)); - - /* instead of pfree'ing filename, remember it for next time */ - if (audit_last_fga_log_file_name != NULL) - pfree(audit_last_fga_log_file_name); - audit_last_fga_log_file_name = fgafilename; - fgafilename = NULL; - } - - if (filename) - pfree(filename); - if (fgafilename) - pfree(fgafilename); - - audit_set_next_rotation_time(); +{ + char *filename = NULL; + char *fgafilename = NULL; + char *tracefilename = NULL; + pg_time_t fntime = 0; + FILE *fh = NULL; + + audit_rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. + */ + if (time_based_rotation) + fntime = audit_next_rotation_time; + else + fntime = time(NULL); + filename = audit_log_file_getname(fntime, NULL); + if (audit_fga_log_file != NULL) + fgafilename = audit_log_file_getname(fntime, ".fga"); + if (audit_trace_log_file != NULL) + tracefilename = trace_log_file_getname(fntime, ".trace"); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * AuditLog_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + * + * Note: audit_last_comm_log_file_name should never be NULL here, but if it is, append. + */ + if (time_based_rotation || (size_rotation_for & AUDIT_COMMON_LOG)) + { + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_comm_log_file_name != NULL && + strcmp(filename, audit_last_comm_log_file_name) != 0) + fh = audit_open_log_file(filename, "w", true); + else + fh = audit_open_log_file(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_comm_log_file_lock)); + fclose(audit_comm_log_file); + audit_comm_log_file = fh; + SpinLockRelease(&(audit_comm_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_comm_log_file_name != NULL) + pfree(audit_last_comm_log_file_name); + audit_last_comm_log_file_name = filename; + filename = NULL; + } + + /* Same as above, but for fga audit log file. 
*/ + if (audit_fga_log_file != NULL && + (time_based_rotation || (size_rotation_for & AUDIT_FGA_LOG))) + { + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_fga_log_file_name != NULL && + strcmp(fgafilename, audit_last_fga_log_file_name) != 0) + fh = audit_open_log_file(fgafilename, "w", true); + else + fh = audit_open_log_file(fgafilename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_fga_log_file_lock)); + fclose(audit_fga_log_file); + audit_fga_log_file = fh; + SpinLockRelease(&(audit_fga_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_fga_log_file_name != NULL) + pfree(audit_last_fga_log_file_name); + audit_last_fga_log_file_name = fgafilename; + fgafilename = NULL; + } + + /* Same as above, but for trace audit log file. */ + if (audit_trace_log_file != NULL && + (time_based_rotation || (size_rotation_for & MAINTAIN_TRACE_LOG))) + { + /* + * Only append write,do not consider overwrite for maintain trace log. + * That is different from audit common log and fga log. + * + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_trace_log_file_name != NULL && + strcmp(tracefilename, audit_last_trace_log_file_name) != 0) + fh = audit_open_log_file(tracefilename, "w", true); + else + */ + { + fh = audit_open_log_file(tracefilename, "a", true); + } + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_trace_log_file_lock)); + fclose(audit_trace_log_file); + audit_trace_log_file = fh; + SpinLockRelease(&(audit_trace_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_trace_log_file_name != NULL) + pfree(audit_last_trace_log_file_name); + audit_last_trace_log_file_name = tracefilename; + tracefilename = NULL; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + + audit_set_next_rotation_time(); } /* @@ -1484,6 +1737,42 @@ audit_log_file_getname(pg_time_t timestamp, const char *suffix) return filename; } +/* + * construct logfile name using timestamp information. + * acoording to audit_log_file_getname(). + * + * If suffix isn't NULL, append it to the name, replacing any ".log" + * that may be in the pattern. + * + * Result is palloc'd. 
+ */ +static char * +trace_log_file_getname(pg_time_t timestamp, const char *suffix) +{ + char *filename = NULL; + int len = 0; + + filename = palloc(MAXPGPATH); + + snprintf(filename, MAXPGPATH, "%s/", trace_log_directory); + + len = strlen(filename); + + /* treat AuditLog_filename as a strftime pattern */ + pg_strftime(filename + len, MAXPGPATH - len, TraceLog_filename, + pg_localtime(&timestamp, log_timezone)); + + if (suffix != NULL) + { + len = strlen(filename); + if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0)) + len -= 4; + strlcpy(filename + len, suffix, MAXPGPATH - len); + } + + return filename; +} + /* * Determine the next planned rotation time, and store in audit_next_rotation_time. */ @@ -1589,9 +1878,9 @@ static bool alog_queue_is_empty(int q_size, int q_head, int q_tail) static bool alog_queue_is_empty2(AlogQueue * queue) { - volatile int q_head = queue->q_head; - volatile int q_tail = queue->q_tail; - volatile int q_size = queue->q_size; + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; pg_memory_barrier(); @@ -1603,12 +1892,12 @@ static bool alog_queue_is_empty2(AlogQueue * queue) */ static int alog_queue_used(int q_size, int q_head, int q_tail) { - int used = (q_tail - q_head + q_size) % q_size; + int used = (q_tail - q_head + q_size) % q_size; Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); Assert(q_head < q_size && q_tail < q_size); - return used; + return used; } /* @@ -1662,9 +1951,9 @@ static bool alog_queue_push(AlogQueue * queue, char * buff, int len) * write buff1 and buff2 to queue */ static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * buff2, int len2) -{ - char * buff_array[] = {buff1, buff2}; - int len_array[] = {len1, len2}; +{ + char * buff_array[] = {buff1, buff2}; + int len_array[] = {len1, len2}; return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); } @@ -1673,25 +1962,25 @@ static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * b * write n buffs to queue */ static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) -{// #lizard forgives - volatile int q_head = queue->q_head; - volatile int q_tail = queue->q_tail; - volatile int q_size = queue->q_size; +{ + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; - int q_head_before = q_head; - int q_tail_before = q_tail; - int q_size_before = q_size; + int q_head_before = q_head; + int q_tail_before = q_tail; + int q_size_before = q_size; - int q_used_before = 0; - int q_used_after = 0; - - int total_len = 0; - int i = 0; - - for (i = 0; i < n; i++) - { - total_len += len[i]; - } + int q_used_before = 0; + int q_used_after = 0; + + int total_len = 0; + int i = 0; + + for (i = 0; i < n; i++) + { + total_len += len[i]; + } pg_memory_barrier(); alog_just_caller(&q_used_before); @@ -1764,9 +2053,9 @@ static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) /* * |<- strlen value ->|<- string message content ->| - * | | - * | | - * |<------------------ buff --------------------->| + * | | + * | | + * |<------------------ buff --------------------->| * * len = size(int) + strlen(str) * @@ -1820,9 +2109,9 @@ static int alog_queue_get_str_len(AlogQueue * queue, int offset) * copy message from queue to another as much as possible * * |<- strlen value ->|<- string message content ->| - * | | - * | | - * |<------------------ buff 
--------------------->| + * | | + * | | + * |<------------------ buff --------------------->| * * len = size(int) + strlen(str) * @@ -1949,107 +2238,114 @@ static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to) * copy message from queue to file as much as possible */ static bool alog_queue_pop_to_file(AlogQueue * from, int destination) -{// #lizard forgives - volatile int q_from_head = from->q_head; - volatile int q_from_tail = from->q_tail; - volatile int q_from_size = from->q_size; - - int from_head = q_from_head; - int from_tail = q_from_tail; - int from_size = q_from_size; - - int from_total = 0; - - int from_used = 0; - int from_copyed = 0; - - volatile slock_t * file_lock = NULL; - - pg_memory_barrier(); - alog_just_caller(&from_total); - - from_total = from_used = alog_queue_used(from_size, from_head, from_tail); - - Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); - Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); - Assert(destination == AUDIT_COMMON_LOG || destination == AUDIT_FGA_LOG); - - if (destination == AUDIT_COMMON_LOG) - { - file_lock = &audit_comm_log_file_lock; - } - else - { - file_lock = &audit_fga_log_file_lock; - } - - /* from is empty, ignore */ - if (alog_queue_is_empty(from_size, from_head, from_tail)) - { - return false; - } - - /* copy message into file until from is empty */ - do - { - int string_len = alog_queue_get_str_len(from, from_head); - int copy_len = sizeof(int) + string_len; - - pg_memory_barrier(); - - /* just copy dierctly */ - if (from_size - from_head >= copy_len) - { - char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); - - /* only copy message content, not write message len */ - SpinLockAcquire(file_lock); - audit_write_log_file(p_start, string_len, destination); - SpinLockRelease(file_lock); - } - else if (from_size - from_head > sizeof(int)) - { - /* must copy as two parts */ - int first_len = from_size - from_head - sizeof(int); - int second_len = string_len - first_len; - char * p_start = NULL; - - Assert(first_len > 0 && first_len < from_size); - Assert(second_len > 0 && second_len < from_size); - - SpinLockAcquire(file_lock); - p_start = alog_queue_offset_to(from, from_head + sizeof(int)); - audit_write_log_file(p_start, first_len, destination); - - p_start = alog_queue_offset_to(from, 0); - audit_write_log_file(p_start, second_len, destination); - SpinLockRelease(file_lock); - } - else - { - /* just copy content only */ - int cpy_offset = (from_head + sizeof(int)) % from_size; - char * p_start = alog_queue_offset_to(from, cpy_offset); - - Assert(from_size - from_head <= sizeof(int)); - SpinLockAcquire(file_lock); - audit_write_log_file(p_start, string_len, destination); - SpinLockRelease(file_lock); - } - - from_head = (from_head + copy_len) % from_size; - from_copyed += copy_len; - - Assert(from_copyed <= from_total); - Assert(from_used - copy_len >= 0); - Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); - - from_used = alog_queue_used(from_size, from_head, from_tail); - } while (!alog_queue_is_empty(from_size, from_head, from_tail)); - - from->q_head = from_head; - - return true; +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + volatile slock_t * file_lock = NULL; + + 
pg_memory_barrier(); + alog_just_caller(&from_total); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + Assert(destination == AUDIT_COMMON_LOG || + destination == AUDIT_FGA_LOG || + destination == MAINTAIN_TRACE_LOG); + + if (destination == AUDIT_COMMON_LOG) + { + file_lock = &audit_comm_log_file_lock; + } + else if (destination == AUDIT_FGA_LOG) + { + file_lock = &audit_fga_log_file_lock; + } + else + { + Assert(destination == MAINTAIN_TRACE_LOG); + file_lock = &audit_trace_log_file_lock; + } + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* copy message into file until from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + + /* only copy message content, not write message len */ + SpinLockAcquire(file_lock); + audit_write_log_file(p_start, string_len, destination); + SpinLockRelease(file_lock); + } + else if (from_size - from_head > sizeof(int)) + { + /* must copy as two parts */ + int first_len = from_size - from_head - sizeof(int); + int second_len = string_len - first_len; + char * p_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + SpinLockAcquire(file_lock); + p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + audit_write_log_file(p_start, first_len, destination); + + p_start = alog_queue_offset_to(from, 0); + audit_write_log_file(p_start, second_len, destination); + SpinLockRelease(file_lock); + } + else + { + /* just copy content only */ + int cpy_offset = (from_head + sizeof(int)) % from_size; + char * p_start = alog_queue_offset_to(from, cpy_offset); + + Assert(from_size - from_head <= sizeof(int)); + SpinLockAcquire(file_lock); + audit_write_log_file(p_start, string_len, destination); + SpinLockRelease(file_lock); + } + + from_head = (from_head + copy_len) % from_size; + from_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; } #endif @@ -2057,16 +2353,18 @@ static bool alog_queue_pop_to_file(AlogQueue * from, int destination) #ifdef AuditLog_005_For_ThreadWorker /* - * find an unused shard entry id in AuditCommonLogQueueArray and AuditFGALogQueueArray - * + * find an unused shard entry id in AuditCommonLogQueueArray, AuditFGALogQueueArray + * and AuditTraceLogQueueArray. 
+ * * called by postgres backend to init AuditPostgresAlogQueueIndex */ int AuditLoggerQueueAcquire(void) { int alogIdx = -1; - AlogQueue * common_queue = NULL; - AlogQueue * fga_queue = NULL; + AlogQueue * common_queue = NULL; + AlogQueue * fga_queue = NULL; + AlogQueue * trace_queue = NULL; if (!IsBackendPostgres) { @@ -2079,14 +2377,17 @@ int AuditLoggerQueueAcquire(void) alogIdx = MyProc->pgprocno; Assert(alogIdx >= 0 && alogIdx < MaxBackends); - common_queue = alog_get_shared_common_queue(alogIdx); - fga_queue = alog_get_shared_fga_queue(alogIdx); + common_queue = alog_get_shared_common_queue(alogIdx); + fga_queue = alog_get_shared_fga_queue(alogIdx); + trace_queue = alog_get_shared_trace_queue(alogIdx); - Assert(common_queue->q_pid == fga_queue->q_pid); + Assert(common_queue->q_pid == fga_queue->q_pid); + Assert(common_queue->q_pid == trace_queue->q_pid); - AuditPostgresAlogQueueIndex = alogIdx; - common_queue->q_pid = MyProcPid; - fga_queue->q_pid = MyProcPid; + AuditPostgresAlogQueueIndex = alogIdx; + common_queue->q_pid = MyProcPid; + fga_queue->q_pid = MyProcPid; + trace_queue->q_pid = MyProcPid; if (enable_auditlogger_warning) { @@ -2117,6 +2418,16 @@ static AlogQueue * alog_get_shared_fga_queue(int idx) return queue; } +static AlogQueue * alog_get_shared_trace_queue(int idx) +{ + AlogQueue * queue = NULL; + + Assert(idx >= 0 && idx < MaxBackends); + queue = AuditTraceLogQueueArray->a_queue[idx]; + + return queue; +} + static AlogQueue * alog_get_local_common_cache(int consumer_id) { AlogQueue * queue = NULL; @@ -2137,12 +2448,27 @@ static AlogQueue * alog_get_local_fga_cache(int consumer_id) return queue; } +static AlogQueue * alog_get_local_trace_cache(int consumer_id) +{ + AlogQueue * queue = NULL; + + Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); + queue = AuditTraceLogLocalCache->q_cache[consumer_id]; + + return queue; +} + /* * local cache for log Consumer * - * AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, AuditLog_common_log_cache_size_kb); + * AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * AuditLog_common_log_cache_size_kb); * - * AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, AuditLog_fga_log_cacae_size_kb); + * AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * AuditLog_fga_log_cacae_size_kb); + * + * AuditTraceLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * Maintain_trace_log_cache_size_kb); */ static AlogQueueCache * alog_make_local_cache(int cache_number, int queue_size_kb) { @@ -2218,7 +2544,7 @@ static void alog_consumer_wakeup(int consumer_id) } /* - * Sleep if there is no log to read in + * Sleep if there is no log to read in * shared audit log queue */ static void alog_consumer_sleep(int consumer_id) @@ -2234,98 +2560,122 @@ static void alog_consumer_sleep(int consumer_id) /* * AuditLog_max_worker_number consumers * - * read log from part of AuditCommonLogQueueArray and AuditFGALogQueueArray - * and write to one cache in AuditCommonLogLocalCache and AuditFgaLogQueueCache + * read log from part of AuditCommonLogQueueArray, AuditFGALogQueueArray and + * AuditTraceLogQueueArray; write to one cache in AuditCommonLogLocalCache, + * AuditFgaLogQueueCache and AuditTraceLogQueueCache. 
* - */ + */ static void * alog_consumer_main(void * arg) -{// #lizard forgives - int consumer_id = *((int *) arg); - int i = 0; - - AlogQueue * local_common_cache = NULL; - AlogQueue * local_fga_cache = NULL; - - Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); - - /* get local common queue cache entry from AuditCommonLogLocalCache */ - local_common_cache = alog_get_local_common_cache(consumer_id); - - /* get local fga queue cache entry from AuditFgaLogQueueCache */ - local_fga_cache = alog_get_local_fga_cache(consumer_id); - - while (true) - { - bool shared_is_empty = true; - - for (i = 0; i < ((MaxBackends / AuditLog_max_worker_number) + 1); i++) - { - int sharedIdx = consumer_id + i * AuditLog_max_worker_number; - AlogQueue * shared_common_queue = NULL; - AlogQueue * shared_fga_queue = NULL; - - Assert(consumer_id == (sharedIdx % AuditLog_max_worker_number)); - - if (sharedIdx < MaxBackends) - { - bool local_is_empty = false; - - /* get shared common queue entry from AuditCommonLogQueueArray */ - shared_common_queue = alog_get_shared_common_queue(sharedIdx); - - /* get shared fga queue entry from AuditFGALogQueueArray */ - shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); - - local_is_empty = false; - if (alog_queue_is_empty2(local_common_cache)) - { - local_is_empty = true; - } - - /* read from shared queue, and write to local cache queue */ - if (alog_queue_pop_to_queue(shared_common_queue, local_common_cache)) - { - if (local_is_empty) - { - alog_writer_wakeup(AUDIT_COMMON_LOG); - } - } - - local_is_empty = false; - if (alog_queue_is_empty2(local_fga_cache)) - { - local_is_empty = true; - } - - if (alog_queue_pop_to_queue(shared_fga_queue, local_fga_cache)) - { - if (local_is_empty) - { - alog_writer_wakeup(AUDIT_FGA_LOG); - } - } - - if (!alog_queue_is_empty2(shared_common_queue) || - !alog_queue_is_empty2(shared_fga_queue)) - { - shared_is_empty = false; - } - } - } - - if (shared_is_empty) - { - /* - * maybe shared input is empty, - * local output is full, - * so wait a moment and retry - */ - audit_shared_consumer_bitmap_set_value(consumer_id, 0); - alog_consumer_sleep(consumer_id); - } - } - - return NULL; +{ + int consumer_id = *((int *) arg); + int i = 0; + + AlogQueue * local_common_cache = NULL; + AlogQueue * local_fga_cache = NULL; + AlogQueue * local_trace_cache = NULL; + + Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); + + /* get local common queue cache entry from AuditCommonLogLocalCache */ + local_common_cache = alog_get_local_common_cache(consumer_id); + + /* get local fga queue cache entry from AuditFgaLogQueueCache */ + local_fga_cache = alog_get_local_fga_cache(consumer_id); + + /* get local trace queue cache entry from AuditTraceLogQueueCache */ + local_trace_cache = alog_get_local_trace_cache(consumer_id); + + while (true) + { + bool shared_is_empty = true; + + for (i = 0; i < ((MaxBackends / AuditLog_max_worker_number) + 1); i++) + { + int sharedIdx = consumer_id + i * AuditLog_max_worker_number; + AlogQueue * shared_common_queue = NULL; + AlogQueue * shared_fga_queue = NULL; + AlogQueue * shared_trace_queue = NULL; + + Assert(consumer_id == (sharedIdx % AuditLog_max_worker_number)); + + if (sharedIdx < MaxBackends) + { + bool local_is_empty = false; + + /* get shared common queue entry from AuditCommonLogQueueArray */ + shared_common_queue = alog_get_shared_common_queue(sharedIdx); + + /* get shared fga queue entry from AuditFGALogQueueArray */ + shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); + + /* 
get shared trace queue entry from AuditTraceLogQueueArray */ + shared_trace_queue = alog_get_shared_trace_queue(sharedIdx); + + local_is_empty = false; + if (alog_queue_is_empty2(local_common_cache)) + { + local_is_empty = true; + } + + /* read from shared queue, and write to local cache queue */ + if (alog_queue_pop_to_queue(shared_common_queue, local_common_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(AUDIT_COMMON_LOG); + } + } + + local_is_empty = false; + if (alog_queue_is_empty2(local_fga_cache)) + { + local_is_empty = true; + } + + if (alog_queue_pop_to_queue(shared_fga_queue, local_fga_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(AUDIT_FGA_LOG); + } + } + + local_is_empty = false; + if (alog_queue_is_empty2(local_trace_cache)) + { + local_is_empty = true; + } + + if (alog_queue_pop_to_queue(shared_trace_queue, local_trace_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(MAINTAIN_TRACE_LOG); + } + } + + if (!alog_queue_is_empty2(shared_common_queue) || + !alog_queue_is_empty2(shared_fga_queue) || + !alog_queue_is_empty2(shared_trace_queue)) + { + shared_is_empty = false; + } + } + } + + if (shared_is_empty) + { + /* + * maybe shared input is empty, + * local output is full, + * so wait a moment and retry + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 0); + alog_consumer_sleep(consumer_id); + } + } + + return NULL; } /* @@ -2337,24 +2687,30 @@ static void alog_writer_wakeup(int writer_destination) ThreadSema * sema = NULL; AlogQueueCache * local_cache = NULL; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - local_cache = AuditCommonLogLocalCache; - } - else - { - local_cache = AuditFGALogLocalCache; - } + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + local_cache = AuditFGALogLocalCache; + } + else + { + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } sema = (&(local_cache->q_sema)); ThreadSemaUp(sema); } /* - * Sleep if there is no log to read in + * Sleep if there is no log to read in * local audit log cache */ static void alog_writer_sleep(int writer_destination) @@ -2362,79 +2718,93 @@ static void alog_writer_sleep(int writer_destination) ThreadSema * sema = NULL; AlogQueueCache * local_cache = NULL; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - local_cache = AuditCommonLogLocalCache; - } - else - { - local_cache = AuditFGALogLocalCache; - } + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + local_cache = AuditFGALogLocalCache; + } + else + { + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } sema = (&(local_cache->q_sema)); ThreadSemaDown(sema); } /* - * two writer, write log to logfile - * + * three writer, write log to logfile + * * one for AuditCommonLogLocalCache * one for AuditFgaLogQueueCache + * one for AuditTraceLogQueueCache */ static void * alog_writer_main(void * arg) 
-{// #lizard forgives - int writer_destination = *((int *) arg); - AlogQueueCache * local_cache = NULL; - - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - /* read from AuditCommonLogLocalCache, and write to fga log file */ - local_cache = AuditCommonLogLocalCache; - } - else - { - /* read from AuditFgaLogQueueCache, and write to fga log file */ - local_cache = AuditFGALogLocalCache; - } - - while (true) - { - int i = 0; - bool copy_nothing = true; - - for (i = 0; i < AuditLog_max_worker_number; i++) - { - int consumer_id = i; - AlogQueue * local_queue = local_cache->q_cache[i]; - - if (alog_queue_pop_to_file(local_queue, writer_destination)) - { - copy_nothing = false; - } - - if (alog_queue_is_empty2(local_queue)) - { - alog_consumer_wakeup(consumer_id); - } - } - - if (copy_nothing) - { - /* - * maybe local input is empty, - * so wait a moment and retry - */ - alog_writer_sleep(writer_destination); - } - } - - return NULL; +{ + int writer_destination = *((int *) arg); + AlogQueueCache * local_cache = NULL; + + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + /* read from AuditCommonLogLocalCache, and write to common log file */ + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + /* read from AuditFgaLogQueueCache, and write to fga log file */ + local_cache = AuditFGALogLocalCache; + } + else + { + /* read from AuditTraceLogQueueCache, and write to trace log file */ + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } + + while (true) + { + int i = 0; + bool copy_nothing = true; + + for (i = 0; i < AuditLog_max_worker_number; i++) + { + int consumer_id = i; + AlogQueue * local_queue = local_cache->q_cache[i]; + + if (alog_queue_pop_to_file(local_queue, writer_destination)) + { + copy_nothing = false; + } + + if (alog_queue_is_empty2(local_queue)) + { + alog_consumer_wakeup(consumer_id); + } + } + + if (copy_nothing) + { + /* + * maybe local input is empty, + * so wait a moment and retry + */ + alog_writer_sleep(writer_destination); + } + } + + return NULL; } static void alog_start_writer(int writer_destination) @@ -2442,21 +2812,22 @@ static void alog_start_writer(int writer_destination) int * des = NULL; int ret = 0; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); des = palloc0(sizeof(int)); *des = writer_destination; - ret = CreateThread(alog_writer_main, (void *)des, MT_THR_DETACHED); - if (ret != 0) - { - /* failed to create thread, exit */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("could not start audit log write worker"))); - exit(6); - } + ret = CreateThread(alog_writer_main, (void *)des, MT_THR_DETACHED); + if (ret != 0) + { + /* failed to create thread, exit */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not start audit log write worker"))); + exit(6); + } } static void alog_start_consumer(int consumer_id) @@ -2484,15 +2855,18 @@ static void alog_start_all_worker(void) { int i = 0; - AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, - AuditLog_common_log_cache_size_kb); - AuditFGALogLocalCache = 
alog_make_local_cache(AuditLog_max_worker_number, - AuditLog_fga_log_cacae_size_kb); - AuditConsumerNotifySemas = alog_make_consumer_semas(AuditLog_max_worker_number); + AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + AuditLog_common_log_cache_size_kb); + AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + AuditLog_fga_log_cacae_size_kb); + AuditTraceLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + Maintain_trace_log_cache_size_kb); + AuditConsumerNotifySemas = alog_make_consumer_semas(AuditLog_max_worker_number); - /* 00, start writer worker, one for common log, another for fga log */ - alog_start_writer(AUDIT_COMMON_LOG); - alog_start_writer(AUDIT_FGA_LOG); + /* 00, start writer worker, one for common log, one for fga log, one for trace log. */ + alog_start_writer(AUDIT_COMMON_LOG); + alog_start_writer(AUDIT_FGA_LOG); + alog_start_writer(MAINTAIN_TRACE_LOG); /* 001, start AuditLog_max_worker_number consumer worker */ for (i = 0; i < AuditLog_max_worker_number; i++) @@ -2509,92 +2883,110 @@ static void alog_start_all_worker(void) #ifdef AuditLog_006_For_Elog void alog(int destination, const char *fmt,...) -{// #lizard forgives - StringInfoData buf; - AlogQueue * queue = NULL; - - int len = 0; - int idx = 0; - int consumer_id = 0; - - Assert(AuditPostgresAlogQueueIndex >= 0 && - AuditPostgresAlogQueueIndex < MaxBackends); - - idx = AuditPostgresAlogQueueIndex; - consumer_id = (idx % AuditLog_max_worker_number); - - if(destination != AUDIT_COMMON_LOG && - destination != AUDIT_FGA_LOG) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("only common audit log and fag audit log can be processed"))); - return; - } - - if (!IsBackendPostgres || - !IsUnderPostmaster) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("only postgres backend can write audit log"))); - return; - } - - if (destination == AUDIT_COMMON_LOG) - { - queue = alog_get_shared_common_queue(idx); - } - else - { - queue = alog_get_shared_fga_queue(idx); - } - - Assert(queue->q_pid == getpid()); - - initStringInfo(&buf); - appendBinaryStringInfo(&buf, (const char *)(&len), sizeof(len)); - - for (;;) - { - va_list args; - int needed; - va_start(args, fmt); - needed = appendStringInfoVA(&buf, fmt, args); - va_end(args); - if (needed == 0) - { - break; - } - enlargeStringInfo(&buf, needed); - } - - appendStringInfoChar(&buf, '\n'); - - /* push string len to header */ - len = buf.len - sizeof(len); - memcpy(buf.data, (char *)(&len), sizeof(len)); - - /* push total buff into queue */ - len = buf.len; - while (false == alog_queue_push(queue, buf.data, len)) - { - pg_usleep(AUDIT_SLEEP_MICROSEC); - } - - pfree(buf.data); - - if (!audit_shared_consumer_bitmap_get_value(consumer_id)) - { - /* - * set shared consumer bitmap value to 1 to - * notify consumer to read audit log - */ - audit_shared_consumer_bitmap_set_value(consumer_id, 1); - - /* Notify audit logger process that it's got something to do */ - SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); - } +{ + StringInfoData buf; + AlogQueue * queue = NULL; + + int len = 0; + int idx = 0; + int consumer_id = 0; + + Assert(AuditPostgresAlogQueueIndex >= 0 && + AuditPostgresAlogQueueIndex < MaxBackends); + + idx = AuditPostgresAlogQueueIndex; + consumer_id = (idx % AuditLog_max_worker_number); + + if(destination != AUDIT_COMMON_LOG && + destination != AUDIT_FGA_LOG && + destination != MAINTAIN_TRACE_LOG) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("only 
common/fga/trace audit log can be processed"))); + return; + } + + if (!IsBackendPostgres || + !IsUnderPostmaster) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("only postgres backend can write audit log"))); + return; + } + + if (destination == AUDIT_COMMON_LOG) + { + queue = alog_get_shared_common_queue(idx); + } + else if (destination == AUDIT_FGA_LOG) + { + queue = alog_get_shared_fga_queue(idx); + } + else + { + Assert(destination == MAINTAIN_TRACE_LOG); + queue = alog_get_shared_trace_queue(idx); + } + + Assert(queue->q_pid == getpid()); + + initStringInfo(&buf); + appendBinaryStringInfo(&buf, (const char *)(&len), sizeof(len)); + + for (;;) + { + va_list args; + int needed; + va_start(args, fmt); + needed = appendStringInfoVA(&buf, fmt, args); + va_end(args); + if (needed == 0) + { + break; + } + enlargeStringInfo(&buf, needed); + } + + appendStringInfoChar(&buf, '\n'); + + /* push string len to header */ + len = buf.len - sizeof(len); + memcpy(buf.data, (char *)(&len), sizeof(len)); + + /* push total buff into queue */ + len = buf.len; + while (false == alog_queue_push(queue, buf.data, len)) + { + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + /* + * set shared consumer bitmap value to 1 to + * notify consumer to read log + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 1); + + /* Notify logger process that it's got something to do */ + SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); + } + + pg_usleep(AUDIT_SLEEP_MICROSEC); + } + + pfree(buf.data); + + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + /* + * set shared consumer bitmap value to 1 to + * notify consumer to read audit log + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 1); + + /* Notify audit logger process that it's got something to do */ + SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); + } } #endif @@ -2603,15 +2995,15 @@ void alog(int destination, const char *fmt,...) static void * alog_shard_stat_main(void * arg) { atexit(FlushShardStatistic); - - while (true) - { - long shard_stat_interval = g_ShardInfoFlushInterval * 1000000L; - - FlushShardStatistic(); - - pg_usleep(shard_stat_interval); - } + + while (true) + { + long shard_stat_interval = g_ShardInfoFlushInterval * 1000000L; + + FlushShardStatistic(); + + pg_usleep(shard_stat_interval); + } return NULL; } @@ -2632,4 +3024,3 @@ static void alog_start_shard_stat_worker(void) } #endif - diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 1f8c33a5..e22a0cca 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -7247,6 +7247,26 @@ void PostmasterDisableTimeout(void) } } +/* + * Whether the database is primary instance and it is normal. + */ +bool PostmasterIsPrimaryAndNormal(void) +{ + /* + * Do not consider: pmState == PM_HOT_STANDBY. Because the original data may + * be retained in the slave instance, which is inconsistent with the reset + * data in the primary instance. 
+ */ + if (pmState == PM_RUN) + { + return true; + } + else + { + return false; + } +} + void InitPostmasterLatch(void) { /* Initialize process-local latch support */ diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3f35d73d..f3577766 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -444,11 +444,11 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; #ifdef __TBASE__ - case XLOG_XACT_ACQUIRE_GTS: - { - /* nothing to do. */ - } - break; + case XLOG_XACT_ACQUIRE_GTS: + { + /* nothing to do. */ + } + break; #endif default: elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index d4ecc464..0614298b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2364,6 +2364,97 @@ RelationIdGetRelation(Oid relationId) return rd; } +/* + * Whether a relation has xmin_gts and max_gts. + */ +bool +RelationHasGTS(Oid reltablespace, Oid relfilenode) +{ + bool has = false; + SysScanDesc scandesc = NULL; + Relation relation = NULL; + HeapTuple ntp = NULL; + ScanKeyData skey[2]; + bool found = false; + Oid relid = InvalidOid; + Form_pg_class classform = NULL; + + /* zero means this is a "mapped" relation */ + if (0 == relfilenode || relfilenode < FirstNormalObjectId) + { + return false; + } + + if (GLOBALTABLESPACE_OID == reltablespace) + { + return false; + } + + /* pg_class will show 0 when the value is actually MyDatabaseTableSpace */ + if (reltablespace == MyDatabaseTableSpace) + { + reltablespace = 0; + } + + /* + * Not a shared table, could either be a plain relation or a + * non-shared, nailed one, like e.g. pg_class. 
+ * + * check for plain relations by looking in pg_class + */ + relation = heap_open(RelationRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + Anum_pg_class_reltablespace, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(reltablespace)); + ScanKeyInit(&skey[1], + Anum_pg_class_relfilenode, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relfilenode)); + + scandesc = systable_beginscan(relation, + ClassTblspcRelfilenodeIndexId, + true, + NULL, + 2, + skey); + + while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) + { + if (found) + { + elog(ERROR, + "unexpected duplicate for tablespace %u, relfilenode %u", + reltablespace, relfilenode); + } + + found = true; + relid = HeapTupleGetOid(ntp); + classform = (Form_pg_class) GETSTRUCT(ntp); + } + + if (!found) + { + elog(WARNING, + "unexpected none for tablespace %u, relfilenode %u", + reltablespace, relfilenode); + } + else if ((classform->relkind == RELKIND_RELATION || + classform->relkind == RELPERSISTENCE_UNLOGGED) && + !IsSystemClass(relid, classform)) + { + has = true; + } + + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + + return has; +} + /* ---------------------------------------------------------------- * cache invalidation support routines * ---------------------------------------------------------------- diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 825e4725..3f02526e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3109,15 +3109,25 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, - gettext_noop("Sets the maximum number of simultaneously open files for each server process."), - NULL - }, - &max_files_per_process, - 1000, 25, INT_MAX, - NULL, NULL, NULL - }, + { + {"gts_maintain_option", PGC_SIGHUP, DEVELOPER_OPTIONS, + gettext_noop("Enables check correctness of GTS and reseting it if it is wrong"), + NULL + }, + &gts_maintain_option, + 0, 0, 2, + NULL, NULL, NULL + }, + + { + {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, + gettext_noop("Sets the maximum number of simultaneously open files for each server process."), + NULL + }, + &max_files_per_process, + 1000, 25, INT_MAX, + NULL, NULL, NULL + }, /* * See also CheckRequiredParameterValues() if this parameter changes @@ -3807,6 +3817,16 @@ static struct config_int ConfigureNamesInt[] = 64, 8, INT_MAX / 1024, NULL, NULL, NULL }, + { + {"alog_trace_queue_size", PGC_POSTMASTER, LOGGING_WHERE, + gettext_noop("Size of share memory queue for each backend to store trace audit log, kilobytes."), + NULL, + GUC_UNIT_KB + }, + &Maintain_trace_log_queue_size_kb, + 64, 8, INT_MAX / 1024, + NULL, NULL, NULL + }, { {"alog_common_cache_size", PGC_POSTMASTER, LOGGING_WHERE, gettext_noop("Size of common audit log local buffer for each audit worker, kilobytes."), @@ -3827,6 +3847,16 @@ static struct config_int ConfigureNamesInt[] = 64, 8, INT_MAX / 1024, NULL, NULL, NULL }, + { + {"alog_trace_cache_size", PGC_POSTMASTER, LOGGING_WHERE, + gettext_noop("Size of trace audit log local buffer for each audit worker, kilobytes."), + NULL, + GUC_UNIT_KB + }, + &Maintain_trace_log_cache_size_kb, + 64, 8, INT_MAX / 1024, + NULL, NULL, NULL + }, #endif { {"max_function_args", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c5a8d1ba..e5cb868a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ 
b/src/backend/utils/misc/postgresql.conf.sample @@ -485,6 +485,15 @@ #update_process_title = on +# - Maintain GTS - + +#gts_maintain_option = 0 # range: 0-2. the default is 0. + # 0: do nothing. + # 1: check the correctness of the GTS of tuples by referring to + # tlog while doing vacuum. + # 2: check the correctness of the GTS of tuples by referring to + # tlog, and reset it if it is wrong while doing vacuum. + #------------------------------------------------------------------------------ # RUNTIME STATISTICS #------------------------------------------------------------------------------ diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 6b01aa3e..9fa4ad40 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -167,34 +167,38 @@ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); GlobalTimestamp HeapTupleHderGetXminTimestapAtomic(HeapTupleHeader tuple) { - if(HEAP_XMIN_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) - return HeapTupleHeaderGetXminTimestamp(tuple); - else - return InvalidGlobalTimestamp; - + if (HEAP_XMIN_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) + { + return HeapTupleHeaderGetXminTimestamp(tuple); + } + else + { + return InvalidGlobalTimestamp; + } } GlobalTimestamp HeapTupleHderGetXmaxTimestapAtomic(HeapTupleHeader tuple) { - if(HEAP_XMAX_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) - return HeapTupleHeaderGetXmaxTimestamp(tuple); - else - return InvalidGlobalTimestamp; - + if (HEAP_XMAX_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) + { + return HeapTupleHeaderGetXmaxTimestamp(tuple); + } + else + { + return InvalidGlobalTimestamp; + } } void HeapTupleHderSetXminTimestapAtomic(HeapTupleHeader tuple, GlobalTimestamp committs) { - HeapTupleHeaderSetXminTimestamp(tuple, committs); - tuple->t_infomask2 |= HEAP_XMIN_TIMESTAMP_UPDATED; - + HeapTupleHeaderSetXminTimestamp(tuple, committs); + tuple->t_infomask2 |= HEAP_XMIN_TIMESTAMP_UPDATED; } void HeapTupleHderSetXmaxTimestapAtomic(HeapTupleHeader tuple, GlobalTimestamp committs) { - HeapTupleHeaderSetXmaxTimestamp(tuple, committs); - tuple->t_infomask2 |= HEAP_XMAX_TIMESTAMP_UPDATED; - + HeapTupleHeaderSetXmaxTimestamp(tuple, committs); + tuple->t_infomask2 |= HEAP_XMAX_TIMESTAMP_UPDATED; } @@ -263,6 +267,21 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, if(TransactionIdIsNormal(xmin) && TransactionIdGetCommitTsData(xmin, &global_timestamp, NULL)) { HeapTupleHderSetXminTimestapAtomic(tuple, global_timestamp); + if (enable_committs_print) + { + BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1); + RelFileNode *rnode = &bufHdr->tag.rnode; + + elog(LOG, + "SetHintBits: relfilenode %u pageno %u " + "CTID %hu/%hu/%hu " + "infomask %d xmin %u xmin_gts "INT64_FORMAT, + rnode->relNode, bufHdr->tag.blockNum, + tuple->t_ctid.ip_blkid.bi_hi, + tuple->t_ctid.ip_blkid.bi_lo, + tuple->t_ctid.ip_posid, + tuple->t_infomask, xmin, global_timestamp); + } } } @@ -275,6 +294,23 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, if(TransactionIdIsNormal(xmax) && TransactionIdGetCommitTsData(xmax, &global_timestamp, NULL)) { HeapTupleHderSetXmaxTimestapAtomic(tuple, global_timestamp); + if (enable_committs_print) + { + BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1); + RelFileNode *rnode = &bufHdr->tag.rnode; + + elog(LOG, + "SetHintBits: relfilenode %u pageno %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d " + "xid %u xmax %u xmax_gts "INT64_FORMAT, + rnode->relNode, bufHdr->tag.blockNum, + tuple->t_ctid.ip_blkid.bi_hi, + tuple->t_ctid.ip_blkid.bi_lo, + 
tuple->t_ctid.ip_posid, + tuple->t_infomask, tuple->t_infomask & HEAP_XMAX_IS_MULTI, + HeapTupleHeaderGetUpdateXid(tuple), xmax, global_timestamp); + } } } } diff --git a/src/bin/pg_archivecleanup/pg_archivecleanup.c b/src/bin/pg_archivecleanup/pg_archivecleanup.c index 4dc3d6ba..2a1e645b 100644 --- a/src/bin/pg_archivecleanup/pg_archivecleanup.c +++ b/src/bin/pg_archivecleanup/pg_archivecleanup.c @@ -30,11 +30,12 @@ bool debug = false; /* are we debugging? */ bool dryrun = false; /* are we performing a dry-run operation? */ char *additional_ext = NULL; /* Extension to remove from filenames */ -char *archiveLocation; /* where to find the archive? */ -char *restartWALFileName; /* the file from which we can restart restore */ -char WALFilePath[MAXPGPATH * 2]; /* the file path including archive */ -char exclusiveCleanupFileName[MAXFNAMELEN]; /* the oldest file we want - * to remain in archive */ +char *archiveLocation; /* where to find the archive? */ +char *restartWALFileName; /* the file from which we can restart restore */ +char WALFilePath[MAXPGPATH * 2]; /* the file path including archive */ +char WALGTSFilePath[MAXPGPATH * 2]; +char exclusiveCleanupFileName[MAXFNAMELEN]; /* the oldest file we want + * to remain in archive */ /* ===================================================================== @@ -93,87 +94,102 @@ TrimExtension(char *filename, char *extension) static void CleanupPriorWALFiles(void) -{// #lizard forgives - int rc; - DIR *xldir; - struct dirent *xlde; - char walfile[MAXPGPATH]; - - if ((xldir = opendir(archiveLocation)) != NULL) - { - while (errno = 0, (xlde = readdir(xldir)) != NULL) - { - /* - * Truncation is essentially harmless, because we skip names of - * length other than XLOG_FNAME_LEN. (In principle, one could use - * a 1000-character additional_ext and get trouble.) - */ - strlcpy(walfile, xlde->d_name, MAXPGPATH); - TrimExtension(walfile, additional_ext); - - /* - * We ignore the timeline part of the XLOG segment identifiers in - * deciding whether a segment is still needed. This ensures that - * we won't prematurely remove a segment from a parent timeline. - * We could probably be a little more proactive about removing - * segments of non-parent timelines, but that would be a whole lot - * more complicated. - * - * We use the alphanumeric sorting property of the filenames to - * decide which ones are earlier than the exclusiveCleanupFileName - * file. Note that this means files are not removed in the order - * they were originally written, in case this worries you. - */ - if ((IsXLogFileName(walfile) || IsPartialXLogFileName(walfile)) && - strcmp(walfile + 8, exclusiveCleanupFileName + 8) < 0) - { - /* - * Use the original file name again now, including any - * extension that might have been chopped off before testing - * the sequence. - */ - snprintf(WALFilePath, sizeof(WALFilePath), "%s/%s", - archiveLocation, xlde->d_name); - - if (dryrun) - { - /* - * Prints the name of the file to be removed and skips the - * actual removal. The regular printout is so that the - * user can pipe the output into some other program. 
- */ - printf("%s\n", WALFilePath); - if (debug) - fprintf(stderr, - _("%s: file \"%s\" would be removed\n"), - progname, WALFilePath); - continue; - } - - if (debug) - fprintf(stderr, _("%s: removing file \"%s\"\n"), - progname, WALFilePath); - - rc = unlink(WALFilePath); - if (rc != 0) - { - fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), - progname, WALFilePath, strerror(errno)); - break; - } - } - } - - if (errno) - fprintf(stderr, _("%s: could not read archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); - if (closedir(xldir)) - fprintf(stderr, _("%s: could not close archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); - } - else - fprintf(stderr, _("%s: could not open archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); +{ + int rc; + DIR *xldir; + struct dirent *xlde; + char walfile[MAXPGPATH]; + + if ((xldir = opendir(archiveLocation)) != NULL) + { + while (errno = 0, (xlde = readdir(xldir)) != NULL) + { + /* + * Truncation is essentially harmless, because we skip names of + * length other than XLOG_FNAME_LEN. (In principle, one could use + * a 1000-character additional_ext and get trouble.) + */ + strlcpy(walfile, xlde->d_name, MAXPGPATH); + TrimExtension(walfile, additional_ext); + + /* + * We ignore the timeline part of the XLOG segment identifiers in + * deciding whether a segment is still needed. This ensures that + * we won't prematurely remove a segment from a parent timeline. + * We could probably be a little more proactive about removing + * segments of non-parent timelines, but that would be a whole lot + * more complicated. + * + * We use the alphanumeric sorting property of the filenames to + * decide which ones are earlier than the exclusiveCleanupFileName + * file. Note that this means files are not removed in the order + * they were originally written, in case this worries you. + */ + if ((IsXLogFileName(walfile) || IsPartialXLogFileName(walfile)) && + strcmp(walfile + 8, exclusiveCleanupFileName + 8) < 0) + { + /* + * Use the original file name again now, including any + * extension that might have been chopped off before testing + * the sequence. + */ + snprintf(WALFilePath, sizeof(WALFilePath), "%s/%s", + archiveLocation, xlde->d_name); + snprintf(WALGTSFilePath, sizeof(WALGTSFilePath), "%s/%s.gts", + archiveLocation, xlde->d_name); + + if (dryrun) + { + /* + * Prints the name of the file to be removed and skips the + * actual removal. The regular printout is so that the + * user can pipe the output into some other program. 
+ */ + printf("%s\n", WALFilePath); + if (debug) + fprintf(stderr, + _("%s: file \"%s\" would be removed\n"), + progname, WALFilePath); + continue; + } + + if (debug) + fprintf(stderr, _("%s: removing file \"%s\"\n"), + progname, WALFilePath); + + rc = unlink(WALFilePath); + if (rc != 0) + { + fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), + progname, WALFilePath, strerror(errno)); + break; + } + + if (debug) + fprintf(stderr, _("%s: removing file \"%s\"\n"), + progname, WALGTSFilePath); + + /* remove the .gts file */ + rc = unlink(WALGTSFilePath); + if (rc != 0) + { + fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), + progname, WALGTSFilePath, strerror(errno)); + break; + } + } + } + + if (errno) + fprintf(stderr, _("%s: could not read archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); + if (closedir(xldir)) + fprintf(stderr, _("%s: could not close archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); + } + else + fprintf(stderr, _("%s: could not open archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); } /* diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 12fce61c..f312bd76 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -409,6 +409,11 @@ do { \ ((tup)->t_infomask |= HEAP_XMIN_FROZEN) \ ) +#define HeapTupleHeaderXmaxCommitted(tup) \ +( \ + ((tup)->t_infomask & HEAP_XMAX_COMMITTED) != 0 \ +) + /* * HeapTupleHeaderGetRawXmax gets you the raw Xmax field. To find out the Xid * that updated a tuple, you might need to resolve the MultiXactId if certain diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index 98bf4db1..706759f4 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -30,7 +30,8 @@ extern Relation boot_reldesc; extern Form_pg_attribute attrtypes[MAXATTR]; -extern int numattr; +extern int numattr; +extern const char *exename; extern void AuxiliaryProcessMain(int argc, char *argv[]) pg_attribute_noreturn(); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 356efa52..cd79ba61 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -211,12 +211,13 @@ typedef struct VacuumParams } VacuumParams; /* GUC parameters */ -extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ -extern int vacuum_freeze_min_age; -extern int vacuum_defer_freeze_min_age; -extern int vacuum_freeze_table_age; -extern int vacuum_multixact_freeze_min_age; -extern int vacuum_multixact_freeze_table_age; +extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ +extern int vacuum_freeze_min_age; +extern int vacuum_defer_freeze_min_age; +extern int vacuum_freeze_table_age; +extern int vacuum_multixact_freeze_min_age; +extern int vacuum_multixact_freeze_table_age; + #ifdef __TBASE__ extern bool enable_sampling_analyze; extern bool distributed_query_analyze; @@ -365,6 +366,17 @@ extern void ClearQueryAnalyzeInfo(void); extern char *GetAnalyzeInfo(int nodeid, char *key); extern void ExecSample(SampleStmt *stmt, DestReceiver *dest); + +extern int gts_maintain_option; +typedef enum +{ + GTS_MAINTAIN_NOTHING = 0, /* do nothing */ + GTS_MAINTAIN_VACUUM_CHECK = 1, /* check correctness of GTS while + * doing vacuum. */ + GTS_MAINTAIN_VACUUM_RESET = 2, /* check correctness of GTS and reset + * it according to tlog if it is wrong + * while doing vacuum. 
*/ +} GTSMaintainOption; #endif #endif /* VACUUM_H */ diff --git a/src/include/postmaster/auditlogger.h b/src/include/postmaster/auditlogger.h index 4b14ccea..2ec37bc1 100644 --- a/src/include/postmaster/auditlogger.h +++ b/src/include/postmaster/auditlogger.h @@ -74,8 +74,10 @@ #include -#define AUDIT_COMMON_LOG 1 -#define AUDIT_FGA_LOG 2 +#define AUDIT_COMMON_LOG (1 << 0) +#define AUDIT_FGA_LOG (1 << 1) +/* size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG | MAINTAIN_TRACE_LOG */ +#define MAINTAIN_TRACE_LOG (1 << 2) extern int AuditLog_RotationAge; extern int AuditLog_RotationSize; @@ -83,11 +85,13 @@ extern PGDLLIMPORT char * AuditLog_filename; extern bool AuditLog_truncate_on_rotation; extern int AuditLog_file_mode; -extern int AuditLog_max_worker_number; -extern int AuditLog_common_log_queue_size_kb; -extern int AuditLog_fga_log_queue_size_kb; -extern int AuditLog_common_log_cache_size_kb; -extern int AuditLog_fga_log_cacae_size_kb; +extern int AuditLog_max_worker_number; +extern int AuditLog_common_log_queue_size_kb; +extern int AuditLog_fga_log_queue_size_kb; +extern int Maintain_trace_log_queue_size_kb; +extern int AuditLog_common_log_cache_size_kb; +extern int AuditLog_fga_log_cacae_size_kb; +extern int Maintain_trace_log_cache_size_kb; extern bool am_auditlogger; extern bool enable_auditlogger_warning; @@ -102,8 +106,9 @@ extern Size AuditLoggerShmemSize(void); extern void AuditLoggerShmemInit(void); extern int AuditLoggerQueueAcquire(void); -extern void alog(int destination, const char *fmt,...) pg_attribute_printf(2, 3); -#define audit_log(args...) alog(AUDIT_COMMON_LOG, ##args) -#define audit_log_fga(args...) alog(AUDIT_FGA_LOG, ##args) +extern void alog(int destination, const char *fmt,...) pg_attribute_printf(2, 3); +#define audit_log(args...) alog(AUDIT_COMMON_LOG, ##args) +#define audit_log_fga(args...) alog(AUDIT_FGA_LOG, ##args) +#define trace_log(args...) 
alog(MAINTAIN_TRACE_LOG, ##args) #endif /* __AUDIT_LOGGER_H__ */ diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 359cc258..f32e7db9 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -109,5 +109,6 @@ extern void ShmemBackendArrayAllocation(void); #ifdef __TBASE__ extern void PostmasterEnableLogTimeout(void); extern void PostmasterDisableTimeout(void); +extern bool PostmasterIsPrimaryAndNormal(void); #endif #endif /* _POSTMASTER_H */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 1cdbc044..bd96e72d 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -129,6 +129,8 @@ extern void RelationCacheInitFilePreInvalidate(void); extern void RelationCacheInitFilePostInvalidate(void); extern void RelationCacheInitFileRemove(void); +extern bool RelationHasGTS(Oid reltablespace, Oid relfilenode); + /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; From 9c176085ed9ba9d5f6e02b330218481e96395104 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 16 Nov 2020 14:41:57 +0800 Subject: [PATCH 083/578] fix bug ID82369889 (merge request !139) --- src/backend/commands/prepare.c | 8 +++++++- src/backend/pgxc/squeue/squeue.c | 4 ++-- src/backend/tcop/pquery.c | 7 ++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5a46fa7f..1aa469d6 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -739,6 +739,13 @@ DropPreparedStatement(const char *stmt_name, bool showError) if (entry) { +#ifdef XCP + /* if a process SharedQueueRelease in DropCachedPlan, this SharedQueue + * Can be created by another process, and SharedQueueDisconnectConsumer + * will change the SharedQueue of another process's status, + * so let SharedQueueDisconnectConsumer be in front of DropCachedPlan */ + SharedQueueDisconnectConsumer(entry->stmt_name); +#endif /* Release the plancache entry */ DropCachedPlan(entry->plansource); @@ -750,7 +757,6 @@ DropPreparedStatement(const char *stmt_name, bool showError) if (entry->use_resowner) ResourceOwnerForgetPreparedStmt(CurTransactionResourceOwner, entry->stmt_name); - SharedQueueDisconnectConsumer(entry->stmt_name); #endif #ifdef __TBASE__ if (distributed_query_analyze) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 515391a3..b1440b60 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -977,11 +977,11 @@ SharedQueueAcquire(const char *sqname, int ncons) if (old_squeue) { LWLockRelease(SQueuesLock); - pg_usleep(1000000L); + (trycount < 10) ? pg_usleep(10000L) : pg_usleep(1000000L); elog(DEBUG1, "SQueue race condition, give the old producer to " "finish the work and retry again"); trycount++; - if (trycount >= 10) + if (trycount >= 20) elog(ERROR, "Couldn't resolve SQueue race condition after" " %d tries", trycount); goto tryagain; diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index e7068eab..13f56acc 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -696,10 +696,15 @@ PortalStart(Portal portal, ParamListInfo params, * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect * here since queryDesc->plannedstmt->nParamExec may be used * just to allocate space for them and no actual values passed. 
+ * + * If distributionType is LOCATOR_TYPE_SHARD, even with parameters + * PARAM_EXEC, still follow the redistribution logic, otherwise, + * it may cause SharedQueue conflict in the lower layer redistribution */ #ifdef __TBASE__ if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC && + queryDesc->plannedstmt->distributionType != LOCATOR_TYPE_SHARD) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) From 87ec1ba9e0655c1961d350de8840e467cce3e17c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 17:42:20 +0800 Subject: [PATCH 084/578] Fix parallel ddl pushing guc variable order. --- src/backend/pgxc/pool/pgxcnode.c | 6 + src/backend/tcop/utility.c | 4 +- src/include/pgxc/pgxcnode.h | 240 ++++++++++++++++--------------- 3 files changed, 129 insertions(+), 121 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 775b703d..5a93bbb1 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -133,6 +133,9 @@ static List *local_param_list = NIL; static StringInfo session_params; static StringInfo local_params; +/* Is forward request to leader coordinator */ +bool forward_mode = false; + typedef struct { NameData name; @@ -4634,6 +4637,9 @@ PGXCNodeGetSessionParamStr(void) if (IS_PGXC_COORDINATOR) appendStringInfo(session_params, "SET global_session TO %s_%d;", PGXCNodeName, MyProcPid); + if (forward_mode) + appendStringInfo(session_params, "SET is_forward_request to true;"); + get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", MyProcPid); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4a8871b6..55b62ce3 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1771,12 +1771,12 @@ forward_ddl_to_leader_cn(Node *node, const char *queryString) leader_cn = get_pgxc_nodeoid(leader_name); /* Set flag to indicate forwarded request */ - PGXCNodeSetParam(false, "is_forward_request", "true", 0); + forward_mode = true; pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); /* Cancel forwarded flag for subsequent requests */ - PGXCNodeSetParam(false, "is_forward_request", "false", 0); + forward_mode = false; return true; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 872536fc..ccb84026 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -2,7 +2,7 @@ * * pgxcnode.h * - * Utility functions to communicate to Datanodes and Coordinators + * Utility functions to communicate to Datanodes and Coordinators * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
@@ -32,103 +32,103 @@ typedef struct PGcancel NODE_CANCEL; /* Helper structure to access Datanode from Session */ typedef enum { - DN_CONNECTION_STATE_IDLE, /* idle, ready for query */ - DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */ - DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */ - DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */ - DN_CONNECTION_STATE_COPY_IN, - DN_CONNECTION_STATE_COPY_OUT -} DNConnectionState; + DN_CONNECTION_STATE_IDLE, /* idle, ready for query */ + DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */ + DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */ + DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */ + DN_CONNECTION_STATE_COPY_IN, + DN_CONNECTION_STATE_COPY_OUT +} DNConnectionState; typedef enum { - HANDLE_IDLE, - HANDLE_ERROR, - HANDLE_DEFAULT -} PGXCNode_HandleRequested; + HANDLE_IDLE, + HANDLE_ERROR, + HANDLE_DEFAULT +} PGXCNode_HandleRequested; #ifdef __TBASE__ typedef enum { - DNStatus_OK = 0, - DNStatus_ERR = 1, - DNStatus_EXPIRED = 2, - DNStatus_BUTTY + DNStatus_OK = 0, + DNStatus_ERR = 1, + DNStatus_EXPIRED = 2, + DNStatus_BUTTY }DNStateEnum; typedef enum { - SendSetQuery_OK = 0, - SendSetQuery_EXPIRED = 1, - SendSetQuery_SendQuery_ERROR = 2, - SendSetQuery_Set_ERROR = 3, - SendSetQuery_BUTTY + SendSetQuery_OK = 0, + SendSetQuery_EXPIRED = 1, + SendSetQuery_SendQuery_ERROR = 2, + SendSetQuery_Set_ERROR = 3, + SendSetQuery_BUTTY }SendSetQueryStatus; -#define MAX_ERROR_MSG_LENGTH 1024 +#define MAX_ERROR_MSG_LENGTH 1024 #endif #define DN_CONNECTION_STATE_ERROR(dnconn) \ - ((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \ - || (dnconn)->transaction_status == 'E') + ((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \ + || (dnconn)->transaction_status == 'E') #define HAS_MESSAGE_BUFFERED(conn) \ - ((conn)->inCursor + 4 < (conn)->inEnd \ - && (conn)->inCursor + ntohl(*((uint32_t *) ((conn)->inBuffer + (conn)->inCursor + 1))) < (conn)->inEnd) + ((conn)->inCursor + 4 < (conn)->inEnd \ + && (conn)->inCursor + ntohl(*((uint32_t *) ((conn)->inBuffer + (conn)->inCursor + 1))) < (conn)->inEnd) struct pgxc_node_handle { - Oid nodeoid; - int nodeid; - char nodename[NAMEDATALEN]; - char nodehost[NAMEDATALEN]; - int nodeport; - - /* fd of the connection */ - int sock; - /* pid of the remote backend process */ - int backend_pid; - - /* Connection state */ - char transaction_status; - DNConnectionState state; - bool read_only; - struct ResponseCombiner *combiner; + Oid nodeoid; + int nodeid; + char nodename[NAMEDATALEN]; + char nodehost[NAMEDATALEN]; + int nodeport; + + /* fd of the connection */ + int sock; + /* pid of the remote backend process */ + int backend_pid; + + /* Connection state */ + char transaction_status; + DNConnectionState state; + bool read_only; + struct ResponseCombiner *combiner; #ifdef DN_CONNECTION_DEBUG - bool have_row_desc; + bool have_row_desc; #endif #ifndef __USE_GLOBAL_SNAPSHOT__ - uint64 sendGxidVersion; + uint64 sendGxidVersion; #endif - char error[MAX_ERROR_MSG_LENGTH]; - /* Output buffer */ - char *outBuffer; - size_t outSize; - size_t outEnd; - /* Input buffer */ - char *inBuffer; - size_t inSize; - size_t inStart; - size_t inEnd; - size_t inCursor; - /* - * Have a variable to enable/disable response checking and - * if enable then read the result of response checking - * - * For details see comments of RESP_ROLLBACK - */ - bool ck_resp_rollback; - - bool in_extended_query; - bool needSync; /* set when error and extend query. 
*/ + char error[MAX_ERROR_MSG_LENGTH]; + /* Output buffer */ + char *outBuffer; + size_t outSize; + size_t outEnd; + /* Input buffer */ + char *inBuffer; + size_t inSize; + size_t inStart; + size_t inEnd; + size_t inCursor; + /* + * Have a variable to enable/disable response checking and + * if enable then read the result of response checking + * + * For details see comments of RESP_ROLLBACK + */ + bool ck_resp_rollback; + + bool in_extended_query; + bool needSync; /* set when error and extend query. */ #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ - char last_command; /*last command we processed. */ - long recv_datarows; - bool plpgsql_need_begin_sub_txn; - bool plpgsql_need_begin_txn; + char last_command; /*last command we processed. */ + long recv_datarows; + bool plpgsql_need_begin_sub_txn; + bool plpgsql_need_begin_txn; #endif }; typedef struct pgxc_node_handle PGXCNodeHandle; @@ -136,21 +136,23 @@ typedef struct pgxc_node_handle PGXCNodeHandle; /* Structure used to get all the handles involved in a transaction */ typedef struct { - PGXCNodeHandle *primary_handle; /* Primary connection to PGXC node */ - int dn_conn_count; /* number of Datanode Handles including primary handle */ - PGXCNodeHandle **datanode_handles; /* an array of Datanode handles */ - int co_conn_count; /* number of Coordinator handles */ - PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ + PGXCNodeHandle *primary_handle; /* Primary connection to PGXC node */ + int dn_conn_count; /* number of Datanode Handles including primary handle */ + PGXCNodeHandle **datanode_handles; /* an array of Datanode handles */ + int co_conn_count; /* number of Coordinator handles */ + PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern bool forward_mode; + extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); /* Open/close connection routines (invoked from Pool Manager) */ extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user, - char *pgoptions, - char *remote_type, char *parent_node); + char *pgoptions, + char *remote_type, char *parent_node); extern NODE_CONNECTION *PGXCNodeConnect(char *connstr); extern void PGXCNodeClose(NODE_CONNECTION * conn); extern int PGXCNodeConnected(NODE_CONNECTION * conn); @@ -186,42 +188,42 @@ extern void release_handles(bool force); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, - char client_conn_type, - PGXCNode_HandleRequested type_requested); + char client_conn_type, + PGXCNode_HandleRequested type_requested); extern char* collect_pgxcnode_names(char *nodestring, int conn_count, PGXCNodeHandle ** connections, char client_conn_type); extern char* collect_localnode_name(char *nodestring); -extern int get_active_nodes(PGXCNodeHandle ** connections); - -extern int ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); -extern int ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); - -extern int pgxc_node_send_query(PGXCNodeHandle * handle, const char *query); -extern int pgxc_node_send_rollback(PGXCNodeHandle * handle, const char *query); -extern int pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, - const char *name); -extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch); -extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, - const 
char *name); -extern int pgxc_node_send_sync(PGXCNodeHandle * handle); +extern int get_active_nodes(PGXCNodeHandle ** connections); + +extern int ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); +extern int ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); + +extern int pgxc_node_send_query(PGXCNodeHandle * handle, const char *query); +extern int pgxc_node_send_rollback(PGXCNodeHandle * handle, const char *query); +extern int pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, + const char *name); +extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch); +extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, + const char *name); +extern int pgxc_node_send_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif -extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params); -extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, - const char *query, short num_params, Oid *param_types); -extern int pgxc_node_send_flush(PGXCNodeHandle * handle); -extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, - const char *statement, const char *portal, - int num_params, Oid *param_types, - int paramlen, char *params, - bool send_describe, int fetch_size); +extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, + const char *statement, int paramlen, char *params); +extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, + const char *query, short num_params, Oid *param_types); +extern int pgxc_node_send_flush(PGXCNodeHandle * handle); +extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, + const char *statement, const char *portal, + int num_params, Oid *param_types, + int paramlen, char *params, + bool send_describe, int fetch_size); extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, - const char *query, const char *planstr, - short num_params, Oid *param_types); + const char *query, const char *planstr, + short num_params, Oid *param_types); extern int pgxc_node_send_gid(PGXCNodeHandle *handle, char* gid); #ifdef __TWO_PHASE_TRANS__ extern int pgxc_node_send_starter(PGXCNodeHandle *handle, char* startnode); @@ -231,10 +233,10 @@ extern int pgxc_node_send_clean(PGXCNodeHandle *handle); extern int pgxc_node_send_readonly(PGXCNodeHandle *handle); extern int pgxc_node_send_after_prepare(PGXCNodeHandle *handle); #endif -extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid); -extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid); -extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); -extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); +extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid); +extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid); +extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); +extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); extern int pgxc_node_send_prepare_timestamp(PGXCNodeHandle *handle, GlobalTimestamp 
timestamp); extern int @@ -244,20 +246,20 @@ pgxc_node_send_global_timestamp(PGXCNodeHandle *handle, GlobalTimestamp timestam #ifdef __TBASE__ extern int pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId coord_vxid); -extern int pgxc_node_receive(const int conn_count, - PGXCNodeHandle ** connections, struct timeval * timeout); +extern int pgxc_node_receive(const int conn_count, + PGXCNodeHandle ** connections, struct timeval * timeout); extern bool node_ready_for_query(PGXCNodeHandle *conn); extern bool validate_handles(void); #else -extern bool pgxc_node_receive(const int conn_count, - PGXCNodeHandle ** connections, struct timeval * timeout); +extern bool pgxc_node_receive(const int conn_count, + PGXCNodeHandle ** connections, struct timeval * timeout); #endif -extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); -extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); +extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); +extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); -extern int send_some(PGXCNodeHandle * handle, int len); -extern int pgxc_node_flush(PGXCNodeHandle *handle); -extern void pgxc_node_flush_read(PGXCNodeHandle *handle); +extern int send_some(PGXCNodeHandle * handle, int len); +extern int pgxc_node_flush(PGXCNodeHandle *handle); +extern void pgxc_node_flush_read(PGXCNodeHandle *handle); extern char get_message(PGXCNodeHandle *conn, int *len, char **msg); @@ -266,7 +268,7 @@ extern void add_error_message(PGXCNodeHandle * handle, const char *message); extern Datum pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query); extern void PGXCNodeSetParam(bool local, const char *name, const char *value, - int flags); + int flags); extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); @@ -275,9 +277,9 @@ extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); extern void PGXCNodeSetConnectionState(PGXCNodeHandle *handle, - DNConnectionState new_state); + DNConnectionState new_state); extern bool PgxcNodeDiffBackendHandles(List **nodes_alter, - List **nodes_delete, List **nodes_add); + List **nodes_delete, List **nodes_add); extern void PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter); extern void HandlePoolerMessages(void); extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); From 34debce0d5ff23431ef8cd983b50aa9d77afa2f6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 17:43:25 +0800 Subject: [PATCH 085/578] set enable_parallel_ddl to false as default. --- src/backend/utils/misc/guc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3f02526e..562c6ca6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2705,7 +2705,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_parallel_ddl, - true, + false, NULL, NULL, NULL }, { From b80179e1535e3c6334f2943b42eb79492089c24c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 20:51:38 +0800 Subject: [PATCH 086/578] Set enable_parallel_ddl to default false. 
--- src/backend/pgxc/pool/pgxcnode.c | 14 ++++++++++++++ src/backend/tcop/postgres.c | 3 ++- src/test/regress/expected/sysviews.out | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5a93bbb1..a7aa41a6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -4635,10 +4635,24 @@ PGXCNodeGetSessionParamStr(void) if (session_params->len == 0) { if (IS_PGXC_COORDINATOR) + { appendStringInfo(session_params, "SET global_session TO %s_%d;", PGXCNodeName, MyProcPid); + } + + /* + * If forward_mode is true, target node must regard it as normal client + * instead of internal connections ,so is_forward_request must be ahead of + * any guc variables else they will be considered internal variables. + */ if (forward_mode) + { appendStringInfo(session_params, "SET is_forward_request to true;"); + } + else + { + appendStringInfo(session_params, "SET is_forward_request to false;"); + } get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 80b2fc4f..697dcf5b 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -992,7 +992,8 @@ pg_rewrite_query(Query *query) if (query->commandType == CMD_UTILITY && IsA(query->utilityStmt, CreateTableAsStmt) && - (enable_parallel_ddl && is_ddl_leader_cn(leader_cn))) + ((enable_parallel_ddl && is_ddl_leader_cn(leader_cn) || + !enable_parallel_ddl))) { /* * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0422edd6..b0e92c9f 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -112,7 +112,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off - enable_parallel_ddl | on + enable_parallel_ddl | off enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on From 76fb6d1e14fc4665fedb2d85db58e9ddd0d2da9b Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 26 Nov 2020 17:44:54 +0800 Subject: [PATCH 087/578] Remove redundant transaction acquisition in ExecRemoteUtility. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131083143135 (cherry picked from commit 30fcea21) 2f0a54b2 Fix format. a833f783 Remove redundant transaction acquisition in ExecRemoteUtility. 
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131083143135&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/nodes/copyfuncs.c | 1 + src/backend/pgxc/pool/execRemote.c | 9 +++++++-- src/backend/utils/misc/guc.c | 1 + src/include/pgxc/planner.h | 3 ++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 8a447982..49ad080a 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1339,6 +1339,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_SCALAR_FIELD(jf_xc_node_id); COPY_SCALAR_FIELD(jf_xc_wholerow); COPY_BITMAPSET_FIELD(conflict_cols); + COPY_SCALAR_FIELD(is_set); #endif return newnode; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1be26a46..931c20cc 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6617,6 +6617,7 @@ ExecRemoteUtility(RemoteQuery *node) ExecDirectType exec_direct_type = node->exec_direct_type; int i; CommandId cid = GetCurrentCommandId(true); + bool utility_need_transcation = true; if (!force_autocommit) RegisterTransactionLocalNode(true); @@ -6664,7 +6665,11 @@ ExecRemoteUtility(RemoteQuery *node) } #ifdef __TBASE__ - if (!ExecDDLWithoutAcquireXid(node->parsetree)) + /* Some DDL such as ROLLBACK, SET does not need transaction */ + utility_need_transcation = + (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); + + if (utility_need_transcation) #endif { elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); @@ -6675,7 +6680,7 @@ ExecRemoteUtility(RemoteQuery *node) snapshot = GetActiveSnapshot(); #ifdef __TBASE__ - if (!ExecDDLWithoutAcquireXid(node->parsetree)) + if (utility_need_transcation) #endif { if (!GlobalTransactionIdIsValid(gxid)) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 562c6ca6..49550fb5 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -9106,6 +9106,7 @@ set_config_option(const char *name, const char *value, /* force_autocommit is actually does not start transaction on nodes */ step->force_autocommit = true; step->exec_type = EXEC_ON_CURRENT; + step->is_set = true; ExecRemoteUtility(step); pfree(step); pfree(poolcmd.data); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 558ba13a..f08c4fce 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -224,7 +224,8 @@ typedef struct AttrNumber jf_xc_wholerow; Bitmapset *conflict_cols; - Node *parsetree; /* to recognise subtxn cmds(savepoint,rollback to,release savepoint) */ + Node *parsetree; /* to recognize subtxn cmds (savepoint, rollback to, release savepoint) */ + bool is_set; /* is SET statement ? */ #endif } RemoteQuery; From 6babaf3db63789c32af23b6eafbfde98a0c4440b Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 9 Dec 2020 19:12:16 +0800 Subject: [PATCH 088/578] Fix coredump due to null pointer. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081943789 (cherry picked from commit 32ce6c54) 575405ca Fix coredump due to null pointer. 
--- src/backend/pgxc/pool/execRemote.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 931c20cc..df03eb1b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11634,7 +11634,6 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) ValidateAndCloseCombiner(combiner); combiner->conn_count = 0; - //pfree(node); if (log_remotesubplan_stats) ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t); From 4ef1971dedbdfcdcff8d439e4b80f3e9227ed663 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 9 Dec 2020 11:43:35 +0800 Subject: [PATCH 089/578] jacky/feature/MaintainGTS_Tbase_v2.15.16 (merge request !34) Squash merge branch 'jacky/feature/MaintainGTS_Tbase_v2.15.16' into 'Tbase_v2.15.16' * delete PrintStack() * 1. optimize RelationHasGTS * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * Revert 'fixed bug for persistent datanode connections.' * MaintainGTS supports unlogged table. * fixed bug for persistent datanode connections. * add {} * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * rollback modification * Revert 'bugfix: tpcc district not found fatal' * bugfix: tpcc district not found fatal * delete extension: reset_gts * clear the modification to buffer.h and buffer.c * fixed bug: endless loop * modified according to xiecanyang's suggestion. * delete damaged_gts * add damaged_gts test option * add damage_gts * fixed bug: count not open tlog file when the tuple has been frozen. * delete a comment. * modified the comment of PostmasterIsPrimaryAndNormal. * rename PostmasterIsAlive to PostmasterIsPrimaryAndNormal * modified code format * delete enable_satisfies_any * delete pg_memory_barrier() * delete space. * add pg_memory_barrier() * fixed a error of going back. * go back to before fixing the bug of persistent_datanode_connection. * fixed bugs: insert abort when persistent_datanode_connecionts = on. * correct a typo * ajustted code format. * fixed bugs: * ajustted code format. * roll back the modification of ReadBuffer_common. * mkdir maintain for trace log. * fixed bug: release clog lock. * printData/printStack call audit_log_trace. * check and reset GTS before and after vacuum pages. * add trace log accoording to audit fga log. * optimized code format. * 1. ajustted code format: suck as line break, etc. * rollback: not fully tested * reduce if logical judgement. * modified acoording to jason's suggestion * deal with special GTS * modified according to code review comments. * comment memory barrier. * add GTS values: 3, 4. * print the line number and file name of error stack. * reset_gts = 1: * fixed bug: Could not open file 'pg_commit_ts/XXX': No such file or * reduce unnecessary logs. * fixed bug: set persistent_datanode_connections to on, insert transaction * support heap_page_reset_gts(get_buffer('table_name', page_number)); * 1.fix bug about errmsg, 'database tbase does not exist', in pg_log. * solve the problem of GTS output big interger out of bounds. * remove dependency on Kernal * pg_archivecleanup support removing the .gts file. * pg_waldump ... -r transaction command support GTS. * rename tbase_gts to tbase_gts_tools * fix bug: heap_page_items can not output t_data when page id is not normal. * initialize values * shuiwu20201029_2 * refactoring functions to simplify code. * add tbase_gts extension in the Makefile of extensions. * delete enable_satisfies_any from GUC * modified txid_gts. 
* add heap_page_reset_gts() * add xmin_gts and xmax_gts in extension function * add extension function heap_page_items_with_gts. * add tbase_gts extendion * add enable_satisfies_any * fix bug: * print correct CTID. * make changes according to code viewing suggestions. * 1. Print log when GTS is inserted into heaptuple * add ctid information while checking GTS * use __sync_synchronize() to prevent CPU reordering and compiler * 1. increase log information when gts is incorrect. * When gts is not set, its correctness is not checked. * check the correctness of GTS before writing pages. * fix bug ID82284643: GTS is not used for index ans system tables. * fix bug ID82284643: reduce locks of checking GTS when reading pages. * fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through write and read data page. * fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through vacuum operation. --- src/backend/commands/vacuumlazy.c | 74 ++++++---------------------- src/backend/utils/cache/relcache.c | 78 ++---------------------------- src/include/utils/relcache.h | 66 ++++++++++++------------- 3 files changed, 52 insertions(+), 166 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index f1508d69..92fe6c94 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -142,11 +142,10 @@ typedef struct LVRelStats int gts_maintain_option; -static void PrintStack(void); static void PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff, GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts); -static void MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer); +static void MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer); /* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; @@ -1060,7 +1059,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if (gts_maintain_option != GTS_MAINTAIN_NOTHING) { - MaintainGTS(&onerel->rd_node, blkno, buf); + MaintainGTS(onerel, blkno, buf); } #endif @@ -1422,7 +1421,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if (gts_maintain_option != GTS_MAINTAIN_NOTHING) { - MaintainGTS(&onerel->rd_node, blkno, buf); + MaintainGTS(onerel, blkno, buf); } #endif @@ -2684,54 +2683,6 @@ xlog_reinit_extent_pages(RelFileNode rnode, ExtentID eid) #define STACK_SIZE 64 -/* - * print error stack to maintain_trace file. 
- */ -static void -PrintStack(void) -{ - void *trace[STACK_SIZE] = {0}; - size_t size = backtrace(trace, STACK_SIZE); - char **symbols = (char **) backtrace_symbols(trace, size); - size_t i = 0; - time_t t = 0; - struct tm *timeInfo = NULL; - - if (symbols == NULL) - { - return; - } - - time(&t); - timeInfo = localtime(&t); - trace_log("Dumping stack starts at %s", asctime(timeInfo)); - trace_log("backtrace() returned %zu addresses.", size); - for (i = 1; i < size; i++) - { - char syscom[MAXPGPATH] = {0}; - FILE *fcmd = NULL; - char temp[MAXPGPATH] = {0}; - - trace_log("#%-2zu %s", i, symbols[i]); - - snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename); - fcmd = popen(syscom, "r"); - if (fcmd == NULL) - { - continue; - } - while (fgets(temp, sizeof(temp), fcmd) != NULL) - { - /* ignore the ending "\n" */ - trace_log(" %.*s", (int) strlen(temp) - 1, temp); - } - pclose(fcmd); - } - trace_log("Dumping stack ends.\n"); - - free(symbols); -} - /* * print error data to maintain file. */ @@ -2825,8 +2776,9 @@ PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff * doing vacuum. */ void -MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) +MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) { + RelFileNode *rnode = &rel->rd_node; Page page; int lines; OffsetNumber lineoff; @@ -2845,7 +2797,7 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) return; } - if (!RelationHasGTS(rnode->spcNode, rnode->relNode)) + if (!RelationHasGTS(rel)) { return; } @@ -2902,9 +2854,6 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, xmin, tuple_xmin_gts, tlog_xmin_gts); - PrintStack(); - PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); - if (reset) { changed = true; @@ -2918,6 +2867,10 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) xmin, tuple_xmin_gts, HeapTupleHeaderGetXminTimestamp(tuphdr)); } + else + { + PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); + } } } @@ -2949,9 +2902,6 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) HeapTupleHeaderGetUpdateXid(tuphdr), xmax, tuple_xmax_gts, tlog_xmax_gts); - PrintStack(); - PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); - if (reset) { changed = true; @@ -2966,6 +2916,10 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) xmax, tuple_xmax_gts, HeapTupleHeaderGetXminTimestamp(tuphdr)); } + else + { + PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); + } } } } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 0614298b..98acab3e 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2368,90 +2368,22 @@ RelationIdGetRelation(Oid relationId) * Whether a relation has xmin_gts and max_gts. 
*/ bool -RelationHasGTS(Oid reltablespace, Oid relfilenode) +RelationHasGTS(Relation rel) { bool has = false; - SysScanDesc scandesc = NULL; - Relation relation = NULL; - HeapTuple ntp = NULL; - ScanKeyData skey[2]; - bool found = false; - Oid relid = InvalidOid; - Form_pg_class classform = NULL; - - /* zero means this is a "mapped" relation */ - if (0 == relfilenode || relfilenode < FirstNormalObjectId) - { - return false; - } - if (GLOBALTABLESPACE_OID == reltablespace) + if (!RelationIsValid(rel)) { return false; } - /* pg_class will show 0 when the value is actually MyDatabaseTableSpace */ - if (reltablespace == MyDatabaseTableSpace) - { - reltablespace = 0; - } - - /* - * Not a shared table, could either be a plain relation or a - * non-shared, nailed one, like e.g. pg_class. - * - * check for plain relations by looking in pg_class - */ - relation = heap_open(RelationRelationId, AccessShareLock); - - ScanKeyInit(&skey[0], - Anum_pg_class_reltablespace, - BTEqualStrategyNumber, - F_OIDEQ, - ObjectIdGetDatum(reltablespace)); - ScanKeyInit(&skey[1], - Anum_pg_class_relfilenode, - BTEqualStrategyNumber, - F_OIDEQ, - ObjectIdGetDatum(relfilenode)); - - scandesc = systable_beginscan(relation, - ClassTblspcRelfilenodeIndexId, - true, - NULL, - 2, - skey); - - while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) - { - if (found) - { - elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenode %u", - reltablespace, relfilenode); - } - - found = true; - relid = HeapTupleGetOid(ntp); - classform = (Form_pg_class) GETSTRUCT(ntp); - } - - if (!found) - { - elog(WARNING, - "unexpected none for tablespace %u, relfilenode %u", - reltablespace, relfilenode); - } - else if ((classform->relkind == RELKIND_RELATION || - classform->relkind == RELPERSISTENCE_UNLOGGED) && - !IsSystemClass(relid, classform)) + if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELPERSISTENCE_UNLOGGED) && + !IsSystemRelation(rel)) { has = true; } - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - return has; } diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index bd96e72d..98adccc1 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * relcache.h - * Relation descriptor cache definitions. + * Relation descriptor cache definitions. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,9 +21,9 @@ typedef struct RelationData *Relation; /* ---------------- - * RelationPtr is used in the executor to support index scans - * where we have to keep track of several index relations in an - * array. -cim 9/10/89 + * RelationPtr is used in the executor to support index scans + * where we have to keep track of several index relations in an + * array. 
-cim 9/10/89 * ---------------- */ typedef Relation *RelationPtr; @@ -40,30 +40,30 @@ extern void RelationClose(Relation relation); extern List *RelationGetFKeyList(Relation relation); extern List *RelationGetIndexList(Relation relation); extern List *RelationGetStatExtList(Relation relation); -extern Oid RelationGetOidIndex(Relation relation); -extern Oid RelationGetPrimaryKeyIndex(Relation relation); -extern Oid RelationGetReplicaIndex(Relation relation); +extern Oid RelationGetOidIndex(Relation relation); +extern Oid RelationGetPrimaryKeyIndex(Relation relation); +extern Oid RelationGetReplicaIndex(Relation relation); extern List *RelationGetIndexExpressions(Relation relation); extern List *RelationGetIndexPredicate(Relation relation); typedef enum IndexAttrBitmapKind { - INDEX_ATTR_BITMAP_ALL, - INDEX_ATTR_BITMAP_KEY, - INDEX_ATTR_BITMAP_PRIMARY_KEY, - INDEX_ATTR_BITMAP_IDENTITY_KEY + INDEX_ATTR_BITMAP_ALL, + INDEX_ATTR_BITMAP_KEY, + INDEX_ATTR_BITMAP_PRIMARY_KEY, + INDEX_ATTR_BITMAP_IDENTITY_KEY } IndexAttrBitmapKind; extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, - IndexAttrBitmapKind keyAttrs); + IndexAttrBitmapKind keyAttrs); extern void RelationGetExclusionInfo(Relation indexRelation, - Oid **operators, - Oid **procs, - uint16 **strategies); + Oid **operators, + Oid **procs, + uint16 **strategies); extern void RelationSetIndexList(Relation relation, - List *indexIds, Oid oidIndex); + List *indexIds, Oid oidIndex); extern void RelationInitIndexAccessInfo(Relation relation); @@ -74,10 +74,10 @@ extern struct PublicationActions *GetRelationPublicationActions(Relation relatio /* * Routines to support ereport() reports of relation-related errors */ -extern int errtable(Relation rel); -extern int errtablecol(Relation rel, int attnum); -extern int errtablecolname(Relation rel, const char *colname); -extern int errtableconstraint(Relation rel, const char *conname); +extern int errtable(Relation rel); +extern int errtablecol(Relation rel, int attnum); +extern int errtablecolname(Relation rel, const char *colname); +extern int errtableconstraint(Relation rel, const char *conname); /* * Routines for backend startup @@ -90,21 +90,21 @@ extern void RelationCacheInitializePhase3(void); * Routine to create a relcache entry for an about-to-be-created relation */ extern Relation RelationBuildLocalRelation(const char *relname, - Oid relnamespace, - TupleDesc tupDesc, - Oid relid, - Oid relfilenode, - Oid reltablespace, - bool shared_relation, - bool mapped_relation, - char relpersistence, - char relkind); + Oid relnamespace, + TupleDesc tupDesc, + Oid relid, + Oid relfilenode, + Oid reltablespace, + bool shared_relation, + bool mapped_relation, + char relpersistence, + char relkind); /* * Routine to manage assignment of new relfilenode to a relation */ extern void RelationSetNewRelfilenode(Relation relation, char persistence, - TransactionId freezeXid, MultiXactId minmulti); + TransactionId freezeXid, MultiXactId minmulti); /* * Routines for flushing/rebuilding relcache entries in various scenarios @@ -119,7 +119,7 @@ extern void RelationCloseSmgrByOid(Oid relationId); extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, - SubTransactionId parentSubid); + SubTransactionId parentSubid); /* * Routines to help manage rebuilding of relcache init files @@ -129,7 +129,7 @@ extern void RelationCacheInitFilePreInvalidate(void); extern void RelationCacheInitFilePostInvalidate(void); extern void 
RelationCacheInitFileRemove(void); -extern bool RelationHasGTS(Oid reltablespace, Oid relfilenode); +extern bool RelationHasGTS(Relation rel); /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; @@ -137,4 +137,4 @@ extern bool criticalRelcachesBuilt; /* should be used only by relcache.c and postinit.c */ extern bool criticalSharedRelcachesBuilt; -#endif /* RELCACHE_H */ +#endif /* RELCACHE_H */ From e2e7410c2163916e356847b8be5d65cfe4965531 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 9 Dec 2020 20:40:39 +0800 Subject: [PATCH 090/578] Add debug print for cold hot router. (cherry picked from commit c53c3b77) 5e7ed0e2 edit according to review 6675111f Add debug print for cold hot router. --- src/backend/pgxc/shard/shardmap.c | 61 +++++++++++++++++++++++++++++-- src/backend/utils/misc/guc.c | 11 ++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 9e1cec0d..d0a2e242 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -112,7 +112,11 @@ #include "utils/ruleutils.h" #endif +/* 12 month for a year */ +#define COLD_HOT_INTERVAL_YEAR 12 + bool g_IsExtension; +bool enable_cold_hot_router_print; extern bool trace_extent; typedef struct @@ -4569,12 +4573,24 @@ static bool IsTempColdData(Datum secValue, RelationAccessType access, int32 inte bool IsHotData(Datum secValue, RelationAccessType access, int32 interval, int step, Datum startValue) { - //int32 gap; Timestamp hotDataTime; + if (enable_cold_hot_router_print) + { + elog(LOG, "IsHotData Check value "INT64_FORMAT" access %d interval %d step %d "INT64_FORMAT, + DatumGetInt64(secValue), + access, interval, step, + DatumGetInt64(startValue)); + } + + /* trade temp cold data as cold data. checking is needed if data would satisfy temp_cold_date guc option */ if (true == IsTempColdData(secValue, access, interval, step, startValue)) { + if (enable_cold_hot_router_print) + { + elog(LOG, "Return from TempColdData Value: %s", g_TempColdDate ? 
g_TempColdDate : "(null)"); + } return false; } #if 0 @@ -4589,6 +4605,27 @@ bool IsHotData(Datum secValue, RelationAccessType access, int32 interval, errmsg("timestamp out of range"))); } + if (enable_cold_hot_router_print) + { + elog(LOG,"IsHotData Check hotDateTime "INT64_FORMAT + " Manual hot data time " + "{ tm_sec:%d tm_min:%d tm_hour:%d tm_mday:%d tm_mon:%d tm_year:%d tm_wday:%d tm_yday:%d" + " tm_isdst:%d tm_gmtoff:%ld tm_zone:%s } ret: %d", + (int64)hotDataTime, + g_ManualHotDataTime.tm_sec, + g_ManualHotDataTime.tm_min, + g_ManualHotDataTime.tm_hour, + g_ManualHotDataTime.tm_mday, + g_ManualHotDataTime.tm_mon, + g_ManualHotDataTime.tm_year, + g_ManualHotDataTime.tm_wday, + g_ManualHotDataTime.tm_yday, + g_ManualHotDataTime.tm_isdst, + g_ManualHotDataTime.tm_gmtoff, + g_ManualHotDataTime.tm_zone, + ((Timestamp)secValue >= hotDataTime)); + } + return ((Timestamp)secValue >= hotDataTime); } @@ -4618,6 +4655,7 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva TimestampTz start_timestamp = 0; Relation rel = NULL; Form_pg_partition_interval routerinfo = NULL; + bool router_log_print = false; rel = relation_open(relation, NoLock); @@ -4628,6 +4666,9 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva relation_close(rel, NoLock); + router_log_print = (enable_cold_hot_router_print && accessType == RELATION_ACCESS_INSERT && + (RELATION_IS_INTERVAL(rel) || RELATION_IS_CHILD(rel))); + if (g_EnableKeyValue) { /* check whether the value is key value */ @@ -4655,7 +4696,11 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva bdualwrite = false; } - + if (router_log_print) + { + elog(LOG, "Group %d coldgroup %d relation %d secAttr %d isSecNull %d dualwrite %d", + group, coldgroup, relation, secAttr, isSecNull, bdualwrite); + } /* get partition stragegy first */ if (!isSecNull && secAttr != InvalidAttrNumber) @@ -4669,7 +4714,7 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva partitionStrategy = routerinfo->partinterval_type; if (partitionStrategy == IntervalType_Month && - routerinfo->partinterval_int == 12) + routerinfo->partinterval_int == COLD_HOT_INTERVAL_YEAR) { partitionStrategy = IntervalType_Year; } @@ -4677,7 +4722,17 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva interval_step = routerinfo->partinterval_int; start_timestamp = routerinfo->partstartvalue_ts; + + if (router_log_print) + { + elog(LOG, "has routerinfo %d", partitionStrategy); + } + } + else if (router_log_print) + { + elog(LOG, "no routerinfo %d", partitionStrategy); } + relation_close(rel, NoLock); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 49550fb5..f75f6331 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -178,6 +178,7 @@ extern char *default_tablespace; extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; +extern bool enable_cold_hot_router_print; #ifdef _PUB_SUB_RELIABLE_ static char * g_wal_stream_type_str; #endif @@ -2718,6 +2719,16 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Whether print cold hot router."), + NULL + }, + &enable_cold_hot_router_print, + false, + NULL, NULL, NULL + }, + #endif /* End-of-list marker */ From 562798245c85dcc01d7d31d8ccf3b9838d5b171c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 11 
Dec 2020 17:48:38 +0800 Subject: [PATCH 091/578] Fix coldhot table router due to SQLValueFunction. --- src/backend/optimizer/prep/preptlist.c | 9 +++++++-- src/backend/optimizer/util/clauses.c | 28 ++++++++++++++++++++++++++ src/include/optimizer/clauses.h | 22 +++++++++++--------- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 43d9b501..69a80885 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -229,13 +229,18 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) secDataType = exprType((Node *)keyTle->expr); + /* evaluate sql value function on coordinator */ + keyTle->expr = (Expr *) replace_eval_sql_value_function( + (Node *)keyTle->expr); + secConstExpr = (Const *) eval_const_expressions(root, (Node *)keyTle->expr); + + /* cold hot insert router must be on coordinator */ if (!IsA(secConstExpr, Const) || secConstExpr->consttype != secDataType) { - list_free(nodeList); - goto END_restrict; + elog(ERROR, "expression on cold-hot separation column must be const."); } secisnull = secConstExpr->constisnull; diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 69f02ae7..d840206d 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5256,4 +5256,32 @@ bool find_sublink_walker(Node *node, List **list) return expression_tree_walker(node, find_sublink_walker, list); } + +/* + * replace_eval_sql_value_function: + * eval SQLValueFunction and replace as Const value. + */ +Node* +replace_eval_sql_value_function(Node *node) +{ + if (node == NULL) + return NULL; + + if (node->type == T_SQLValueFunction) + { + /* + * All variants of SQLValueFunction are stable, so if we are + * evaluating the expression's value, we should evaluate the + * current function value. Otherwise just copy. + */ + SQLValueFunction *svf = (SQLValueFunction *) node; + + return (Node *) evaluate_expr((Expr *) svf, + svf->type, + svf->typmod, + InvalidOid); + } + + return expression_tree_mutator(node, replace_eval_sql_value_function, NULL); +} #endif \ No newline at end of file diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index 3e7b9e4c..fddeb132 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * clauses.h - * prototypes for clauses.c. + * prototypes for clauses.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,14 +22,14 @@ typedef struct { - int numWindowFuncs; /* total number of WindowFuncs found */ - Index maxWinRef; /* windowFuncs[] is indexed 0 .. maxWinRef */ - List **windowFuncs; /* lists of WindowFuncs for each winref */ + int numWindowFuncs; /* total number of WindowFuncs found */ + Index maxWinRef; /* windowFuncs[] is indexed 0 .. 
maxWinRef */ + List **windowFuncs; /* lists of WindowFuncs for each winref */ } WindowFuncLists; extern Expr *make_opclause(Oid opno, Oid opresulttype, bool opretset, - Expr *leftop, Expr *rightop, - Oid opcollid, Oid inputcollid); + Expr *leftop, Expr *rightop, + Oid opcollid, Oid inputcollid); extern Node *get_leftop(const Expr *clause); extern Node *get_rightop(const Expr *clause); @@ -50,7 +50,7 @@ extern List *make_ands_implicit(Expr *clause); extern bool contain_agg_clause(Node *clause); extern void get_agg_clause_costs(PlannerInfo *root, Node *clause, - AggSplit aggsplit, AggClauseCosts *costs); + AggSplit aggsplit, AggClauseCosts *costs); extern bool contain_window_function(Node *clause); extern WindowFuncLists *find_window_functions(Node *clause, Index maxWinRef); @@ -75,7 +75,7 @@ extern Var *find_forced_null_var(Node *clause); extern bool is_pseudo_constant_clause(Node *clause); extern bool is_pseudo_constant_clause_relids(Node *clause, Relids relids); -extern int NumRelids(Node *clause); +extern int NumRelids(Node *clause); extern void CommuteOpExpr(OpExpr *clause); extern void CommuteRowCompareExpr(RowCompareExpr *clause); @@ -88,7 +88,9 @@ extern Query *inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte); extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, - Node *node); + Node *node); extern bool find_sublink_walker(Node *node, List **list); -#endif /* CLAUSES_H */ +extern Node *replace_eval_sql_value_function(Node *node); + +#endif /* CLAUSES_H */ From 53fd7a270de955b26d256367e5a92908aa48da12 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 13 Dec 2020 15:08:17 +0800 Subject: [PATCH 092/578] Fix cold hot router error for now(),sysdate,currenttimestamp. --- src/backend/optimizer/prep/preptlist.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 69a80885..9091f15f 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -233,14 +233,15 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) keyTle->expr = (Expr *) replace_eval_sql_value_function( (Node *)keyTle->expr); - secConstExpr = (Const *) eval_const_expressions(root, + secConstExpr = (Const *) estimate_expression_value(root, (Node *)keyTle->expr); /* cold hot insert router must be on coordinator */ if (!IsA(secConstExpr, Const) || secConstExpr->consttype != secDataType) { - elog(ERROR, "expression on cold-hot separation column must be const."); + list_free(nodeList); + goto END_restrict; } secisnull = secConstExpr->constisnull; From da8f0f40e7d156659a1bd18f56fd0a66aefb1832 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Dec 2020 14:35:37 +0800 Subject: [PATCH 093/578] Disable forward in parallel ddl. 
--- src/backend/pgxc/pool/pgxcnode.c | 17 ------ src/backend/postmaster/postmaster.c | 4 -- src/backend/tcop/postgres.c | 8 +-- src/backend/tcop/utility.c | 73 +++++++++++++++----------- src/backend/utils/misc/guc.c | 15 ++---- src/include/pgxc/pgxc.h | 12 ++--- src/include/pgxc/pgxcnode.h | 2 - src/test/regress/expected/sysviews.out | 3 +- 8 files changed, 51 insertions(+), 83 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index a7aa41a6..60496f76 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -133,9 +133,6 @@ static List *local_param_list = NIL; static StringInfo session_params; static StringInfo local_params; -/* Is forward request to leader coordinator */ -bool forward_mode = false; - typedef struct { NameData name; @@ -4640,20 +4637,6 @@ PGXCNodeGetSessionParamStr(void) PGXCNodeName, MyProcPid); } - /* - * If forward_mode is true, target node must regard it as normal client - * instead of internal connections ,so is_forward_request must be ahead of - * any guc variables else they will be considered internal variables. - */ - if (forward_mode) - { - appendStringInfo(session_params, "SET is_forward_request to true;"); - } - else - { - appendStringInfo(session_params, "SET is_forward_request to false;"); - } - get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", MyProcPid); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index e22a0cca..f7ed9637 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -671,10 +671,6 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; -#ifdef __TBASE__ -bool is_forward_request = false; -#endif - /* key pair to be used as object id while using advisory lock for backup */ Datum xc_lockForBackupKey1; Datum xc_lockForBackupKey2; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 697dcf5b..4ed9b7d9 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -977,7 +977,6 @@ static List * pg_rewrite_query(Query *query) { List *querytree_list; - char *leader_cn = NULL; if (Debug_print_parse) elog_node_display(LOG, "parse tree", query, @@ -987,13 +986,8 @@ pg_rewrite_query(Query *query) ResetUsage(); #ifdef PGXC - /* directly forward the request */ - leader_cn = find_ddl_leader_cn(); - if (query->commandType == CMD_UTILITY && - IsA(query->utilityStmt, CreateTableAsStmt) && - ((enable_parallel_ddl && is_ddl_leader_cn(leader_cn) || - !enable_parallel_ddl))) + IsA(query->utilityStmt, CreateTableAsStmt)) { /* * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 55b62ce3..5ed5bfcc 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,7 +146,6 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); -static bool forward_ddl_to_leader_cn(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; @@ -1412,8 +1411,22 @@ ProcessUtilityPost(PlannedStmt *pstmt, break; } } + + /* + * Also truncate on coordinators which makes parallel ddl possible. + * temp table only exists on current coordinator + * which parallel ddl has no effect. 
+ */ + if (!is_temp) + { + exec_type = EXEC_ON_ALL_NODES; + } + else + { exec_type = EXEC_ON_DATANODES; } + + } break; case T_AlterDatabaseStmt: @@ -1737,48 +1750,40 @@ ProcessUtilityPost(PlannedStmt *pstmt, #ifdef __TBASE__ /* - * Forward specific DDLs request to leader cn. - * - * On success return true else false. + * Enable parallel ddl for specific query. */ -static bool -forward_ddl_to_leader_cn(Node *node, const char *queryString) +static void +parallel_ddl_process(Node *node) { - Oid leader_cn = InvalidOid; - char *leader_name = NULL; - - /* avoid forward recurse */ - if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR || is_forward_request) + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) { - return false; + return ; } + switch (nodeTag(node)) + { + case T_CreateStmt: + case T_CreateForeignTableStmt: + case T_CreateTableAsStmt: + case T_CreateSchemaStmt: + case T_AlterTableStmt: + case T_DefineStmt: + case T_DropStmt: + case T_RenameStmt: + case T_TruncateStmt: + case T_IndexStmt: /* CONCURRENT INDEX is not supported */ if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) { - return false; + return ; } - - /* Set parallel ddl flag */ - is_txn_has_parallel_ddl = true; - - leader_name = find_ddl_leader_cn(); - if(is_ddl_leader_cn(leader_name)) - { - return false; + break; + default: + return ; } - leader_cn = get_pgxc_nodeoid(leader_name); - - /* Set flag to indicate forwarded request */ - forward_mode = true; - - pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); - - /* Cancel forwarded flag for subsequent requests */ - forward_mode = false; - - return true; + /* Parallel ddl is enabled, set parallel ddl flag */ + is_txn_has_parallel_ddl = true; } #endif @@ -1807,6 +1812,10 @@ standard_ProcessUtility(PlannedStmt *pstmt, bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL); ParseState *pstate; +#ifdef __TBASE__ + /* parallel enable check */ + parallel_ddl_process(parsetree); +#endif /* * For more detail see comments in function pgxc_lock_for_backup. 
* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f75f6331..c1770cd1 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2710,16 +2710,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"is_forward_request", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Whether DDL is forwarded from another coordinator."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_AUTO_FILE | GUC_DISALLOW_IN_FILE | GUC_NO_SHOW_ALL - }, - &is_forward_request, - false, - NULL, NULL, NULL - }, - { {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether print cold hot router."), NULL @@ -8252,9 +8242,10 @@ set_config_option(const char *name, const char *value, */ if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) - && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && - strcmp(name,"is_forward_request") != 0)) + && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0)) + { send_to_nodes = true; + } #endif #ifdef PGXC diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 9f3ed6f5..b69d747b 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -96,10 +96,6 @@ typedef enum /* Determine remote connection type for a PGXC backend */ extern int remoteConnType; -#ifdef __TBASE__ -/* Is request forwarded another coordinator */ -extern bool is_forward_request; -#endif /* Local node name and numer */ extern char *PGXCNodeName; @@ -127,10 +123,10 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward_request == true) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward_request == false) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward_request == false) -#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE) +#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) /* key pair to be used as object id while using advisory lock for backup */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index ccb84026..4a2ee55b 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -143,8 +143,6 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; -extern bool forward_mode; - extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b0e92c9f..2ab99d9d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_bitmapscan | on enable_check_password | off enable_cls | on + enable_cold_hot_router_print | off enable_cold_hot_visible | off enable_cold_seperation | off enable_committs_print | off @@ -127,7 +128,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | 
off -(56 rows) +(57 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 3d9f209b428237edcbb4cb0526612fae68246046 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Fri, 18 Dec 2020 10:24:00 +0800 Subject: [PATCH 094/578] fix security table coredump when running SAMPLE and VACUUM ANALYZE --- src/backend/commands/analyze.c | 10 ++++++-- src/backend/utils/adt/rowtypes.c | 32 ++++++++++++++++++++++++- src/backend/utils/misc/relcrypt.c | 2 +- src/test/regress/expected/mls_check.out | 10 ++++++++ src/test/regress/sql/mls_check.sql | 5 ++++ 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 1b1db5a2..54a26e55 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -697,7 +697,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, stats->tupDesc = onerel->rd_att; #ifdef _MLS_ /* has column crypt */ - if (stats->tupDesc->attrs_ext) + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) { TRANSP_CRYPT_ATTRS_EXT_ENABLE(stats->tupDesc); } @@ -707,7 +707,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, numrows, totalrows); #ifdef _MLS_ - if (stats->tupDesc->attrs_ext) + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) { TRANSP_CRYPT_ATTRS_EXT_DISABLE(stats->tupDesc); } @@ -2610,9 +2610,15 @@ compute_scalar_stats(VacAttrStatsP stats, { if (0 != stats->tupDesc->transp_crypt[curr_attnum - 1].algo_id) { + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) + { TRANSP_CRYPT_ATTRS_EXT_ENABLE(stats->tupDesc); + } heap_deform_tuple(stats->rows[i], stats->tupDesc, tuple_values, tuple_isnull); + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) + { TRANSP_CRYPT_ATTRS_EXT_DISABLE(stats->tupDesc); + } if (tuple_isnull[curr_attnum - 1]) { diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index 7964c77f..3ba7b15c 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -25,6 +25,10 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/typcache.h" +#include "utils/relcrypt.h" +#include "commands/relcryptcommand.h" +#include "utils/mls.h" +#include "utils/datamask.h" /* @@ -311,6 +315,8 @@ record_out(PG_FUNCTION_ARGS) Datum *values; bool *nulls; StringInfoData buf; + Oid parentOid = InvalidOid; + Form_pg_attribute *att = NULL; check_stack_depth(); /* recurses for record-type columns */ @@ -319,6 +325,7 @@ tupTypmod = HeapTupleHeaderGetTypMod(rec); tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); ncolumns = tupdesc->natts; + att = tupdesc->attrs; /* Build a temporary HeapTuple control structure */ tuple.t_len = HeapTupleHeaderGetDatumLength(rec); @@ -360,8 +367,31 @@ values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); - /* Break down the tuple into fields */ + /* Break down the tuple into fields; if the table uses column encryption, the + * data must be decrypted after deform_tuple on the datanode. + */ + if (IS_PGXC_DATANODE && tupdesc->attrs_ext) + { + transparent_crypt_decrypt_all_cols_value_copy(&tuple, tupdesc, values, nulls); + } + else + { heap_deform_tuple(&tuple, tupdesc, values, nulls); + } + + /* + * Check whether the table or a parent table has a datamask policy; if so, + * replace the values with masked data to avoid leaking data.
+ */ + if (tupdesc->natts > 0) + { + parentOid = mls_get_parent_oid_by_relid(att[0]->attrelid); + } + + if (OidIsValid(parentOid) && datamask_check_table_has_datamask(parentOid)) + { + datamask_exchange_all_cols_value_copy(tupdesc, values, nulls, parentOid); + } /* And build the result string */ initStringInfo(&buf); diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index 65f3b65e..b3d25625 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1578,7 +1578,7 @@ bool trsprt_crypt_chk_tbl_has_col_crypt(Oid relid) while (HeapTupleIsValid(htup = systable_getnext(scan))) { - Form_pg_transparent_crypt_policy_map form = (Form_pg_transparent_crypt_policy_map)htup; + Form_pg_transparent_crypt_policy_map form = (Form_pg_transparent_crypt_policy_map) GETSTRUCT(htup); if (form->attnum > InvalidAttrNumber) { diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 371bfe10..496d0b8c 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -1284,6 +1284,14 @@ select * from alter_order_range_201702 order by f1 asc; 2 | 9999 | 9999 | 9999 | XXXXe | XXXXe | XXXXoworld | XXXXe | 9999 | Tue May 05 05:05:05 2015 | 9999 | 9999 (1 row) +sample alter_order_range(3000); + samplenum | totalnum | deadnum | totalpages | visiblepages | rows +-----------+----------+---------+------------+--------------+---------------------------------------------------------------------------------------------------------- + 2 | 2 | 0 | 40 | 0 | + | | | | | (1,9999,9999,9999,XXXXe,"XXXXe ","XXXXoworld ",XXXXe,9999,"Tue May 05 05:05:05 2015",9999,9999) + | | | | | (2,9999,9999,9999,XXXXe,"XXXXe ","XXXXoworld ",XXXXe,9999,"Tue May 05 05:05:05 2015",9999,9999) +(3 rows) + alter table alter_order_range detach partition alter_order_range_201701; ERROR: could not detach partition for table:alter_order_range, cause mls poilcy is bound \c - mls_admin @@ -2509,6 +2517,8 @@ select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018 1024 | Tue May 01 00:00:00 2018 | 1024 (1 row) +-- test vacuum analyze +vacuum analyze tbl_mls_test; --case: orignal partition, interval partition with index \c - godlike create table tbl_mls_part_list( a int ,b int ) PARTITION BY LIST (b) ; diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 4369a706..83e4027c 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -478,6 +478,8 @@ select * from alter_order_range order by f1 asc; select * from alter_order_range_201701 order by f1 asc; select * from alter_order_range_201702 order by f1 asc; +sample alter_order_range(3000); + alter table alter_order_range detach partition alter_order_range_201701; \c - mls_admin @@ -848,6 +850,9 @@ checkpoint; --explain select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018-06-01' order by f1 limit 10 ; select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018-06-01' order by f1 limit 10 ; +-- test vacuum analyze +vacuum analyze tbl_mls_test; + --case: orignal partition, interval partition with index \c - godlike create table tbl_mls_part_list( a int ,b int ) PARTITION BY LIST (b) ; From ce318d0355206873dfb62d73ba0f57cb2db6ceef Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 18 Dec 2020 10:49:57 +0800 Subject: [PATCH 095/578] fix nestloop bug --- src/backend/tcop/pquery.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 13f56acc..5cc3042c 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -703,8 +703,7 @@ PortalStart(Portal portal, ParamListInfo params, */ #ifdef __TBASE__ if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC && - queryDesc->plannedstmt->distributionType != LOCATOR_TYPE_SHARD) + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) From 38aaa5469d66712cf00fc842a9cd71625792d30e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 29 Jan 2021 10:43:35 +0800 Subject: [PATCH 096/578] Bugfix: prepared statement does not exist when dn restart, ID84661745 (merge request !123) (cherry picked from commit e43101af) 7d7edd09 Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 3 098b765e Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 2 37164a3a Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 709950c1 Bugfix: prepared statement does not exist when dn restart, ID84661745 --- src/backend/commands/prepare.c | 48 ++++++++++++++++++++++++++++++-- src/backend/pgxc/pool/pgxcnode.c | 24 ++++++++++++++++ src/include/commands/prepare.h | 3 +- 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 1aa469d6..d4729433 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -1089,7 +1089,7 @@ HaveActiveDatanodeStatements(void) * prepared on the node */ bool -ActivateDatanodeStatementOnNode(const char *stmt_name, int noid) +ActivateDatanodeStatementOnNode(const char *stmt_name, int nodeidx) { DatanodeStatement *entry; int i; @@ -1099,13 +1099,55 @@ ActivateDatanodeStatementOnNode(const char *stmt_name, int noid) /* see if statement already active on the node */ for (i = 0; i < entry->number_of_nodes; i++) - if (entry->dns_node_indices[i] == noid) + if (entry->dns_node_indices[i] == nodeidx) return true; /* statement is not active on the specified node append item to the list */ - entry->dns_node_indices[entry->number_of_nodes++] = noid; + entry->dns_node_indices[entry->number_of_nodes++] = nodeidx; return false; } + + +/* + * Mark datanode statement as inactive on specified node + */ +void +InactivateDatanodeStatementOnNode(int nodeidx) +{ + HASH_SEQ_STATUS seq; + DatanodeStatement *entry; + int i; + + /* nothing cached */ + if (!datanode_queries) + return; + + /* walk over cache */ + hash_seq_init(&seq, datanode_queries); + while ((entry = hash_seq_search(&seq)) != NULL) + { + /* see if statement already active on the node */ + for (i = 0; i < entry->number_of_nodes; i++) + { + if (entry->dns_node_indices[i] == nodeidx) + { + elog(DEBUG5, "InactivateDatanodeStatementOnNode: node index %d, " + "number_of_nodes %d, statement name %s", nodeidx, + entry->number_of_nodes, entry->stmt_name); + + /* remove nodeidx from list */ + entry->number_of_nodes--; + if (i < entry->number_of_nodes) + { + entry->dns_node_indices[i] = + entry->dns_node_indices[entry->number_of_nodes]; + } + break; + } + } + } +} + #endif #ifdef __TBASE__ /* prepare remoteDML statement on coordinator */ diff --git 
a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 60496f76..9b5f7a84 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3855,6 +3855,30 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session ? 'T' : 'F'); #endif + if (IS_PGXC_COORDINATOR) + { + char nodetype = PGXC_NODE_DATANODE; + int nodeidx = PGXCNodeGetNodeId(node_handle->nodeoid, &nodetype); + if (PGXC_NODE_DATANODE != nodetype) + { + elog(ERROR, "Unexpected node type %c, name %s, index %d, " + "oid %d, max nodes %d", nodetype, + node_handle->nodename, nodeidx, + node_handle->nodeoid, NumDataNodes); + } + if (nodeidx < 0 || nodeidx >= NumDataNodes) + { + elog(ERROR, "Invalid datanode index %d, name %s, oid %d, " + "type %c, max nodes %d", nodeidx, + node_handle->nodename, node_handle->nodeoid, + nodetype, NumDataNodes); + } + + InactivateDatanodeStatementOnNode(nodeidx); + elog(DEBUG5, "Inactivate statement on datanode %s, nodeidx %d, " + "oid %d, type %c, max nodes %d", node_handle->nodename, + nodeidx, node_handle->nodeoid, nodetype, NumDataNodes); + } } } /* Initialisation for Coordinators */ diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index e05003a0..57a72d94 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -132,9 +132,10 @@ extern void DropAllPreparedStatements(void); #ifdef PGXC extern DatanodeStatement *FetchDatanodeStatement(const char *stmt_name, bool throwError); -extern bool ActivateDatanodeStatementOnNode(const char *stmt_name, int noid); +extern bool ActivateDatanodeStatementOnNode(const char *stmt_name, int nodeidx); extern bool HaveActiveDatanodeStatements(void); extern void DropDatanodeStatement(const char *stmt_name); +extern void InactivateDatanodeStatementOnNode(int nodeidx); extern int SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params, Oid *param_types, int n); #endif From 98dd71078ce41256f43be60aca3f576cabc0bf38 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 29 Jan 2021 14:54:39 +0800 Subject: [PATCH 097/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084977249 (merge request !126) --- src/backend/pgxc/pool/pgxcnode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 9b5f7a84..3aac2510 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -822,7 +822,6 @@ pgxc_node_receive(const int conn_count, } retry: - CHECK_FOR_INTERRUPTS(); poll_val = poll(pool_fd, conn_count, timeout_ms); if (poll_val < 0) { From bec7ae72747858acdd3c90fa0f8b8216f9e2d6ca Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 27 Jan 2021 15:21:18 +0800 Subject: [PATCH 098/578] fix latch already owned caused by memory problem (merge request !118) --- src/backend/pgxc/squeue/squeue.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index b1440b60..e006d2c8 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -839,7 +839,9 @@ SharedQueueAcquire(const char *sqname, int ncons) int qsize; /* Size of one queue */ int i; char *heapPtr; - +#ifdef __TBASE__ + SQueueSync *sqsync = NULL; +#endif elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons); /* Initialize the shared queue */ @@ -899,6 +901,13 @@ SharedQueueAcquire(const char *sqname, int ncons) heapPtr = (char *) sq; /* 
Skip header */ heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers); + +#ifdef __TBASE__ + /* Init latch */ + sqsync = sq->sq_sync; + InitSharedLatch(&sqsync->sqs_producer_latch); +#endif + /* Set up consumer queues */ for (i = 0; i < sq->sq_nconsumers; i++) { @@ -915,6 +924,7 @@ SharedQueueAcquire(const char *sqname, int ncons) #ifdef __TBASE__ cstate->send_fd = false; cstate->cs_done = false; + InitSharedLatch(&sqsync->sqs_consumer_sync[i].cs_latch); #endif heapPtr += qsize; } From e111c5410f6e88286a1d4cf2dd18d371d9b410ed Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 20 Jan 2021 19:20:08 +0800 Subject: [PATCH 099/578] fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() (merge request !105) Squash merge branch 'jacky/bugfix/coredump_Tbase_v5.05.3' into 'Tbase_v5.05.3' * fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() (cherry picked from commit 0b4d42f8) 9715c00f fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084789579&url_cache_key=e5779d19ee5ceffc54b891e3b94140f4 --- src/backend/pgxc/pool/pgxcnode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 3aac2510..12873dac 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -4813,11 +4813,15 @@ DoInvalidateRemoteHandles(void) { bool result = false; + HOLD_INTERRUPTS(); + InitMultinodeExecutor(true); HandlesInvalidatePending = false; HandlesRefreshPending = false; + RESUME_INTERRUPTS(); + return result; } @@ -4832,6 +4836,8 @@ DoRefreshRemoteHandles(void) int numCoords, numDNodes, numSlaveDNodes, total_nodes; bool res = true; + HOLD_INTERRUPTS(); + HandlesRefreshPending = false; PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids,&numCoords, &numDNodes, &numSlaveDNodes, false); @@ -4982,6 +4988,8 @@ DoRefreshRemoteHandles(void) list_free(added); list_free(deleted); + RESUME_INTERRUPTS(); + return res; } From 27e1e023dca671bd0e74c928ba17880c60852f4f Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 25 Jan 2021 17:21:53 +0800 Subject: [PATCH 100/578] =?UTF-8?q?:=E4=BF=AE=E8=A1=A5PROCSIG=5FPGXCPOOL?= =?UTF-8?q?=5FRELOAD=E4=BF=A1=E5=8F=B7=E5=B1=8F=E8=94=BD=E6=BC=8F=E6=B4=9E?= =?UTF-8?q?=20(merge=20request=20!116)=20(merge=20request=20!149)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'jacky/bugfix/PoolerReload_TBase_V2.15.16.9' into 'TBase_V2.15.16.9' * :修补PROCSIG_PGXCPOOL_RELOAD信号屏蔽漏洞 (merge request !116) --- src/backend/pgxc/pool/poolutils.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index c61379ac..b03a4e2b 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -415,6 +415,9 @@ HandlePoolerReload(void) if (proc_exit_inprogress) return; + if (InterruptHoldoffCount != 0) + return; + #ifdef __TBASE__ if (PoolerReloadHoldoffCount) { @@ -430,6 +433,16 @@ HandlePoolerReload(void) PoolerReloadPending = false; #endif + HOLD_INTERRUPTS(); + + /* + * Reinitialize session, it updates the shared memory table. + * Initialize XL executor. This must be done inside a transaction block. 
+ */ + StartTransactionCommand(); + InitMultinodeExecutor(true); + CommitTransactionCommand(); + /* Request query cancel, when convenient */ InterruptPending = true; QueryCancelPending = true; @@ -439,6 +452,8 @@ HandlePoolerReload(void) /* Prevent using of cached connections to remote nodes */ RequestInvalidateRemoteHandles(); + + RESUME_INTERRUPTS(); } /* From e31babde1419022c884591477911dcee669f2269 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 27 Jan 2021 16:32:30 +0800 Subject: [PATCH 101/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084803305 (merge request !98) --- src/backend/executor/execProcnode.c | 67 +++++++++++++++++++++++++++++ src/backend/executor/nodeMaterial.c | 13 ++++++ src/backend/executor/nodeNestloop.c | 4 ++ src/backend/pgxc/pool/execRemote.c | 4 ++ src/include/executor/executor.h | 3 ++ 5 files changed, 91 insertions(+) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index bdf04a2a..0119064b 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -927,6 +927,73 @@ ExecDisconnectNode(PlanState *node) ExecDisconnectNode(ps->righttree); } + +bool +HasDisconnectNode(PlanState *node) +{ + PlanState *ps = node; + RemoteSubplanState *remotesubplan = NULL; + + if (!node) + return false; + + if (IsA(node, SubqueryScanState)) + { + SubqueryScanState *substate = (SubqueryScanState *)node; + ps = substate->subplan; + } + + switch (nodeTag(ps)) + { + case T_RemoteSubplanState: + { + remotesubplan = (RemoteSubplanState *) ps; + if (remotesubplan->eflags & EXEC_FLAG_DISCONN) + { + return true; + } + return false; + } + + case T_AppendState: + { + AppendState *append = (AppendState *) ps; + int i; + + for (i = 0; i < append->as_nplans; i++) + { + if (HasDisconnectNode(append->appendplans[i])) + { + return true; + } + } + + return false; + } + + case T_MergeAppendState: + { + MergeAppendState *mstate = (MergeAppendState *) ps; + int i; + + for (i = 0; i < mstate->ms_nplans; i++) + { + if (HasDisconnectNode(mstate->mergeplans[i])) + { + return true; + } + } + + return false; + } + + default: + break; + } + + return HasDisconnectNode(ps->lefttree) || HasDisconnectNode(ps->righttree); +} + void ExecFinishNode(PlanState *node) {// #lizard forgives diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c index d96a6d4a..fc02b41a 100644 --- a/src/backend/executor/nodeMaterial.c +++ b/src/backend/executor/nodeMaterial.c @@ -327,6 +327,19 @@ ExecReScanMaterial(MaterialState *node) if (node->eflags != 0) { +#ifdef __TBASE__ + /* + * If we haven't materialized yet, but some nodes have done disconnect, + * maybe this node needs to be executed when the material is executed, + * so re-scan here + */ + if ((NULL == node->tuplestorestate) && HasDisconnectNode(outerPlan)) + { + ExecReScan(outerPlan); + node->eof_underlying = false; + return; + } +#endif /* * If we haven't materialized yet, just return. 
If outerplan's * chgParam is not NULL then it will be re-scanned by ExecProcNode, diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index d277cb19..e9e95c2d 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -116,6 +116,10 @@ ExecNestLoop(PlanState *pstate) #ifdef __TBASE__ if (!node->nl_InnerInited && IS_PGXC_DATANODE) { + /* + * Perform disconnection to make the redistribution on other nodes end normally, + * otherwise need to wait for a timeout + */ ExecDisconnectNode(innerPlan); } #endif diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index df03eb1b..48778128 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11030,6 +11030,9 @@ ExecReScanRemoteSubplan(RemoteSubplanState *node) * Force query is re-bound with new parameters */ node->bound = false; +#ifdef __TBASE__ + node->eflags &= ~(EXEC_FLAG_DISCONN); +#endif } #ifdef __TBASE__ @@ -11168,6 +11171,7 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) } node->bound = true; + node->eflags |= EXEC_FLAG_DISCONN; connections = (PGXCNodeHandle **)palloc(combiner->conn_count * sizeof(PGXCNodeHandle *)); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 3ec9ef06..5262c42e 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -67,6 +67,7 @@ #ifdef XCP /* distributed executor may never execute the plan on this node */ #define EXEC_FLAG_SUBPLAN 0x0100 +#define EXEC_FLAG_DISCONN 0x1000 #endif #ifdef __TBASE__ @@ -526,6 +527,8 @@ extern Relation ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags #ifdef __TBASE__ extern Relation ExecOpenScanRelationPartition(EState *estate, Index scanrelid, int eflags, int partidx); + +extern bool HasDisconnectNode(PlanState *node); #endif extern void ExecCloseScanRelation(Relation scanrel); From b37d77c69f80aefb8cf223c18c6eaa75135c38a8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Feb 2021 19:50:20 +0800 Subject: [PATCH 102/578] cherry-pick from f21bd27 fix RemoteSubplanMakeUnique for latch already owned error --- src/backend/pgxc/pool/execRemote.c | 7 ++++++- src/backend/storage/ipc/latch.c | 2 +- src/backend/tcop/pquery.c | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 48778128..333838a3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -64,6 +64,7 @@ #include "postmaster/postmaster.h" #include "executor/nodeModifyTable.h" #include "utils/syscache.h" +#include "nodes/print.h" #endif /* * We do not want it too long, when query is terminating abnormally we just @@ -9503,7 +9504,8 @@ RemoteSubplanMakeUnique(Node *plan, int unique) */ if (IsA(plan, RemoteSubplan)) { - ((RemoteSubplan *)plan)->unique = unique; + int old = ((RemoteSubplan *)plan)->unique; + ((RemoteSubplan *)plan)->unique = old * MAX_NODES_NUMBER + unique; } /* Otherwise it is a Plan descendant */ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique); @@ -10055,6 +10057,8 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) * unique. 
*/ RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + elog(DEBUG3, "RemoteSubplanMakeUnique for LOCATOR_TYPE_NONE unique: %d, cursor: %s", + PGXCNodeId, node->cursor); } rstmt.planTree = outerPlan(node); /* @@ -10255,6 +10259,7 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) #ifdef __AUDIT__ rstmt.queryString = NULL; rstmt.parseTree = NULL; + elog_node_display(DEBUG5, "SendPlanMessage", &rstmt, Debug_pretty_print); #endif } PG_CATCH(); diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 55891078..f8fb52da 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -296,7 +296,7 @@ OwnLatch(volatile Latch *latch) #endif if (latch->owner_pid != 0) - elog(ERROR, "latch already owned"); + elog(ERROR, "latch already owned by %d", latch->owner_pid); latch->owner_pid = MyProcPid; } diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 5cc3042c..3f2e7d62 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -770,6 +770,9 @@ PortalStart(Portal portal, ParamListInfo params, RemoteSubplanMakeUnique( (Node *) queryDesc->plannedstmt->planTree, PGXC_PARENT_NODE_ID); + + elog(DEBUG3, "RemoteSubplanMakeUnique for PARAM_EXEC unique: %d, portal: %s", + PGXC_PARENT_NODE_ID, portal->name); /* * Call ExecutorStart to prepare the plan for execution */ From f304b0ec9b3d5aa870e539d7d9f1b21bdd34f22e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 13 Jan 2021 10:27:41 +0800 Subject: [PATCH 103/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084406117 (merge request !91) --- src/backend/pgxc/pool/execRemote.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 333838a3..cff54d43 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11163,6 +11163,8 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) if (conn) { + CHECK_OWNERSHIP(conn, combiner); + if (pgxc_node_send_disconnect(conn, cursor, list_length(plan->distributionRestrict)) != 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), From dd4e99e103f2c97fe74fdc3db870c2d7a90b11b6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 4 Feb 2021 20:32:14 +0800 Subject: [PATCH 104/578] fix sequence bug (merge request !147) --- src/backend/access/transam/xact.c | 2 + src/backend/commands/sequence.c | 2 +- src/backend/commands/tablecmds.c | 174 +++++++++++++++- src/backend/tcop/utility.c | 33 +++ src/include/commands/tablecmds.h | 45 ++-- src/include/pg_config_manual.h | 3 + src/test/regress/expected/create_index.out | 230 +++++++++++---------- 7 files changed, 358 insertions(+), 131 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 5369958d..9632a415 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6247,6 +6247,8 @@ AbortSubTransaction(void) { CheckGTMConnection(); } + + FinishSeqOp(false); } #endif diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 5b6fd741..2557db35 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -375,7 +375,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) { ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("GTM error, could not create sequence"))); + errmsg("GTM error, could not create sequence %s", seqname))); } #ifdef __TBASE__ diff --git a/src/backend/commands/tablecmds.c 
b/src/backend/commands/tablecmds.c index 9e3b3f14..6915458f 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -132,6 +132,11 @@ #include "pgxc/shardmap.h" #include "pgxc/groupmgr.h" #endif + +#ifdef __TBASE__ +#include "parser/scansup.h" +#endif + /* * ON COMMIT action list */ @@ -1220,19 +1225,170 @@ DropErrorMsgWrongType(const char *relname, char wrongkind, char rightkind) (wentry->kind != '\0') ? errhint("%s", _(wentry->drophint_msg)) : 0)); } +#ifdef __TBASE__ + +/* + * replace all invisible characters with ' ', + * leave no spaces next to ',' or '.' + */ +static void +OmitqueryStringSpace(char *queryString) +{ + char *front = queryString; + char *last = queryString; + bool skip = false; + + if (queryString == NULL) + { + return; + } + + /* omit space */ + while (scanner_isspace(*front)) + { + ++front; + } + + while ((*front) != '\0') + { + if(scanner_isspace(*front) && skip == false) + { + while(scanner_isspace(*front)) + { + ++front; + } + + if ((*front) == ',' || (*front) == '.') + { + /* no need space */ + } + else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) + { + /* no need space */ + } + else + { + /* replace all invisible characters with ' ' */ + *last = ' '; + ++last; + continue; + } + } + + if ((*front) == '\"') + { + skip = (skip == true) ? false : true; + *last = *front; + ++front; + } + else + { + *last = *front; + ++front; + } + ++last; + } + *last = '\0'; +} + +/* + * remove relname in query string (replace with ' ') + */ +static void +RemoveRelnameInQueryString(char *queryString, RangeVar *rel) +{ + char *ptr = NULL; + char *tmp = NULL; + char *tmpStr = NULL; + char *start_ptr = queryString; + char *end_ptr = queryString + strlen(queryString) - 1; + int len = 0; + char full_name[MAXFULLNAMEDATALEN]; + + /* get remove obj full name */ + snprintf(full_name, MAXFULLNAMEDATALEN, "%s%s%s%s%s", (rel->catalogname) ? (rel->catalogname) : "", + (rel->catalogname) ? "." : "", + (rel->schemaname) ? (rel->schemaname) : "", + (rel->schemaname) ? "." 
: "", + rel->relname); + tmpStr = queryString; + len = strlen(full_name); + while ((ptr = strstr(tmpStr, full_name)) != NULL) + { + /* is not independent string, skip */ + if (((ptr - 1) >= start_ptr && *(ptr - 1) != ' ' && (*(ptr - 1) != ',')) || + ((ptr + len) <= end_ptr && *(ptr + len) != ' ' && *(ptr + len) != ',' && *(ptr + len) != ';')) + { + if (((ptr - 1) >= start_ptr && *(ptr - 1) == '\"' && (ptr + len) <= end_ptr && *(ptr + len) == '\"') && + ((ptr - 2) < start_ptr || *(ptr - 2) != '.')) + { + *(ptr - 1) = ' '; + *(ptr + len) = ' '; + } + else + { + tmpStr = ptr + len; + continue; + } + } + + /* replace obj name with ' ' */ + MemSet(ptr, ' ', len); + + /* find the previous ',' */ + tmp = ptr - 1; + while (tmp >= start_ptr && *tmp == ' ') + { + tmp--; + } + + if (tmp >= start_ptr && *tmp == ',') + { + *tmp = ' '; + } + else + { + /* find the following ',' */ + tmp = ptr + len; + while (tmp <= end_ptr && *tmp == ' ') + { + tmp++; + } + + if (tmp <= end_ptr && *tmp == ',') + { + *tmp = ' '; + } + } + + tmpStr = ptr + len; + } +} + +#endif + /* * RemoveRelations * Implements DROP TABLE, DROP INDEX, DROP SEQUENCE, DROP VIEW, * DROP MATERIALIZED VIEW, DROP FOREIGN TABLE */ +#ifdef __TBASE__ +int +RemoveRelations(DropStmt *drop, char* queryString) +#else void RemoveRelations(DropStmt *drop) -{// #lizard forgives +#endif +{ ObjectAddresses *objects; char relkind; ListCell *cell; int flags = 0; LOCKMODE lockmode = AccessExclusiveLock; +#ifdef __TBASE__ + bool querystring_omit = false; + int drop_cnt = 0; +#endif /* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */ if (drop->concurrent) @@ -1328,6 +1484,15 @@ RemoveRelations(DropStmt *drop) if (!OidIsValid(relOid)) { DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); +#ifdef __TBASE__ + if (!querystring_omit) + { + OmitqueryStringSpace(queryString); + querystring_omit = true; + } + + RemoveRelnameInQueryString(queryString, rel); +#endif continue; } @@ -1374,11 +1539,18 @@ RemoveRelations(DropStmt *drop) obj.objectSubId = 0; add_exact_object_address(&obj, objects); +#ifdef __TBASE__ + drop_cnt++; +#endif } performMultipleDeletions(objects, drop->behavior, flags); free_object_addresses(objects); + +#ifdef __TBASE__ + return drop_cnt; +#endif } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 5ed5bfcc..c99d090e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3127,6 +3127,14 @@ ProcessUtilitySlow(ParseState *pstate, stmts = transformCreateStmt((CreateStmt *) parsetree, queryString, !is_local && !sentToRemote); +#ifdef __TBASE__ + if (NULL == stmts) + { + commandCollected = true; + break; + } +#endif + if (IS_PGXC_LOCAL_COORDINATOR) { /* @@ -4401,18 +4409,43 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) #ifdef PGXC { bool is_temp = false; +#ifdef __TBASE__ + int drop_cnt = 0; + char *new_query_string = pstrdup(queryString); +#endif RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif + +#ifdef __TBASE__ + drop_cnt = RemoveRelations(stmt, new_query_string); +#else RemoveRelations(stmt); +#endif + #ifdef PGXC +#ifdef __TBASE__ + /* if drop nothing, skip */ + if (drop_cnt == 0) + { + pfree(new_query_string); + break; + } + + /* DROP is done depending on the object type and its temporary type */ + if (IS_PGXC_LOCAL_COORDINATOR) + ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, sentToRemote, false, + exec_type, is_temp, 
false); + pfree(new_query_string); +#else /* DROP is done depending on the object type and its temporary type */ if (IS_PGXC_LOCAL_COORDINATOR) ExecUtilityStmtOnNodes(NULL, queryString, NULL, sentToRemote, false, exec_type, is_temp, false); +#endif } #endif break; diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 702b1b80..e1a3252c 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablecmds.h - * prototypes for tablecmds.c. + * prototypes for tablecmds.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -23,11 +23,14 @@ extern ObjectAddress DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, - ObjectAddress *typaddress, const char *queryString); - + ObjectAddress *typaddress, const char *queryString); +#ifdef __TBASE__ +extern int RemoveRelations(DropStmt *drop, char* queryString); +#else extern void RemoveRelations(DropStmt *drop); +#endif -extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); +extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); extern void AlterTable(Oid relid, LOCKMODE lockmode, AlterTableStmt *stmt); @@ -37,22 +40,22 @@ extern void ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, L extern void AlterTableInternal(Oid relid, List *cmds, bool recurse); -extern Oid AlterTableMoveAll(AlterTableMoveAllStmt *stmt); +extern Oid AlterTableMoveAll(AlterTableMoveAllStmt *stmt); extern ObjectAddress AlterTableNamespace(AlterObjectSchemaStmt *stmt, - Oid *oldschema); + Oid *oldschema); extern void AlterTableNamespaceInternal(Relation rel, Oid oldNspOid, - Oid nspOid, ObjectAddresses *objsMoved + Oid nspOid, ObjectAddresses *objsMoved #ifdef _MLS_ , const char * newschemaname #endif - ); + ); extern void AlterRelationNamespaceInternal(Relation classRel, Oid relOid, - Oid oldNspOid, Oid newNspOid, - bool hasDependEntry, - ObjectAddresses *objsMoved); + Oid oldNspOid, Oid newNspOid, + bool hasDependEntry, + ObjectAddresses *objsMoved); extern void CheckTableNotInUse(Relation rel, const char *stmt); @@ -69,11 +72,11 @@ extern ObjectAddress RenameConstraint(RenameStmt *stmt); extern ObjectAddress RenameRelation(RenameStmt *stmt); extern void RenameRelationInternal(Oid myrelid, - const char *newrelname, bool is_internal); + const char *newrelname, bool is_internal); extern void find_composite_type_dependencies(Oid typeOid, - Relation origRelation, - const char *origTypeName); + Relation origRelation, + const char *origTypeName); extern void check_of_type(HeapTuple typetuple); @@ -83,23 +86,23 @@ extern void remove_on_commit_action(Oid relid); extern void PreCommit_on_commit_actions(void); extern void AtEOXact_on_commit_actions(bool isCommit); extern void AtEOSubXact_on_commit_actions(bool isCommit, - SubTransactionId mySubid, - SubTransactionId parentSubid); + SubTransactionId mySubid, + SubTransactionId parentSubid); #ifdef PGXC extern bool IsTempTable(Oid relid); extern bool IsLocalTempTable(Oid relid); extern bool IsIndexUsingTempTable(Oid relid); extern bool IsOnCommitActions(void); extern void DropTableThrowErrorExternal(RangeVar *relation, - ObjectType removeType, - bool missing_ok); + ObjectType removeType, + bool missing_ok); #endif extern void RangeVarCallbackOwnsTable(const RangeVar *relation, - Oid relId, Oid oldRelId, void *arg); + Oid relId, Oid oldRelId, void *arg); extern void 
RangeVarCallbackOwnsRelation(const RangeVar *relation, - Oid relId, Oid oldRelId, void *noCatalogs); + Oid relId, Oid oldRelId, void *noCatalogs); #ifdef _MIGRATE_ extern bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid); @@ -114,4 +117,4 @@ extern void StoreIntervalPartitionInfo(Oid relationId, char partkind, Oid parent extern void ExecCheckOverLapStmt(CheckOverLapStmt *stmt); #endif -#endif /* TABLECMDS_H */ +#endif /* TABLECMDS_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index b53f9d0d..8d110b88 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -82,6 +82,9 @@ */ #define NAMEDATALEN 64 + +#define MAXFULLNAMEDATALEN (NAMEDATALEN * 3 + 2) + /* * Maximum number of arguments to a function. * diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index ef804438..924c7c95 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -505,13 +505,14 @@ SELECT * FROM circle_tbl WHERE f1 && circle(point(1,-2), 1) EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; - QUERY PLAN ------------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------------ + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Scan using ggpolygonind on gpolygon_tbl - Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) -(4 rows) + -> Partial Aggregate + -> Index Scan using ggpolygonind on gpolygon_tbl + Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) +(5 rows) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; count @@ -521,13 +522,14 @@ SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; - QUERY PLAN -------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Scan using ggcircleind on gcircle_tbl - Index Cond: (f1 && '<(500,500),500>'::circle) -(4 rows) + -> Partial Aggregate + -> Index Scan using ggcircleind on gcircle_tbl + Index Cond: (f1 && '<(500,500),500>'::circle) +(5 rows) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; count @@ -539,7 +541,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ box '(0,0,100,100)'; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(100,100),(0,0)'::box) @@ -555,7 +557,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE box '(0,0,100,100)' @> f1; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(100,100),(0,0)'::box) @@ -571,7 +573,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ polygon '(0,0),(0,100),(100,100),(50,50),(100,0),(0,0)'; QUERY PLAN ---------------------------------------------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan 
using gpointind on point_tbl Index Cond: (f1 <@ '((0,0),(0,100),(100,100),(50,50),(100,0),(0,0))'::polygon) @@ -587,7 +589,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ circle '<(50,50),50>'; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '<(50,50),50>'::circle) @@ -603,7 +605,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 << '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 << '(0,0)'::point) @@ -619,7 +621,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >> '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 >> '(0,0)'::point) @@ -635,7 +637,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 <^ '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 <^ '(0,0)'::point) @@ -651,7 +653,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >^ '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 >^ '(0,0)'::point) @@ -667,7 +669,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 ~= '(-5,-12)'::point) @@ -683,7 +685,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; QUERY PLAN ---------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Order By: (f1 <-> '(0,1)'::point) (3 rows) @@ -719,7 +721,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; QUERY PLAN ---------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 IS NOT NULL) Order By: (f1 <-> '(0,1)'::point) @@ -740,7 +742,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(10,10),(-10,-10)'::box) Order By: (f1 <-> '(0,1)'::point) @@ -807,13 +809,14 @@ SELECT count(*) FROM quad_point_tbl; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ------------------------------------------------------------------ - Aggregate + QUERY PLAN 
+----------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_quad_ind on quad_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -823,13 +826,14 @@ SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ------------------------------------------------------------------ - Aggregate + QUERY PLAN +----------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_quad_ind on quad_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -924,13 +928,14 @@ SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_kd_ind on kd_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -940,13 +945,14 @@ SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_kd_ind on kd_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -1315,7 +1321,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Sort Sort Key: ((f1 <-> '(0,1)'::point)) -> Bitmap Heap Scan on point_tbl @@ -1390,15 +1396,16 @@ SELECT count(*) FROM quad_point_tbl; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on quad_point_tbl - Recheck Cond: (p <@ 
'(1000,1000),(200,200)'::box) - -> Bitmap Index Scan on sp_quad_ind - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(7 rows) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -1408,15 +1415,16 @@ SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on quad_point_tbl - Recheck Cond: ('(1000,1000),(200,200)'::box @> p) - -> Bitmap Index Scan on sp_quad_ind - Index Cond: ('(1000,1000),(200,200)'::box @> p) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(7 rows) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -1521,15 +1529,16 @@ SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on kd_point_tbl - Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) - -> Bitmap Index Scan on sp_kd_ind - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(7 rows) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -1539,15 +1548,16 @@ SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on kd_point_tbl - Recheck Cond: ('(1000,1000),(200,200)'::box @> p) - -> Bitmap Index Scan on sp_kd_ind - Index Cond: ('(1000,1000),(200,200)'::box @> p) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(7 rows) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -2612,7 +2622,6 @@ DROP INDEX CONCURRENTLY "concur_index2"; -- works ERROR: index "concur_index2" does not exist DROP INDEX CONCURRENTLY IF EXISTS "concur_index2"; -- notice NOTICE: index "concur_index2" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block -- failures DROP INDEX CONCURRENTLY "concur_index2", "concur_index3"; ERROR: index 
"concur_index2" does not exist @@ -2623,7 +2632,6 @@ ROLLBACK; -- successes DROP INDEX CONCURRENTLY IF EXISTS "concur_index3"; NOTICE: index "concur_index3" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block DROP INDEX CONCURRENTLY "concur_index4"; ERROR: index "concur_index4" does not exist DROP INDEX CONCURRENTLY "concur_index5"; @@ -2911,21 +2919,22 @@ SELECT * FROM tenk1 EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); - QUERY PLAN ---------------------------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on tenk1 - Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) - -> BitmapAnd - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (hundred = 42) - -> BitmapOr - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) -(12 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 99) +(13 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -2976,12 +2985,16 @@ explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using tenk1_unique1 on tenk1 - Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(3 rows) + -> Sort + Sort Key: unique1 + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) +(7 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) @@ -2997,13 +3010,14 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) -(4 rows) + -> Sort + Sort Key: thousand + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(5 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) From 12482d3b1681cf92cc095806c3300813095de565 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 5 Feb 2021 17:35:21 +0800 Subject: [PATCH 105/578] fix regress --- src/test/regress/expected/create_index.out | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out 
index 924c7c95..dd727bc7 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2489,16 +2489,14 @@ SET maintenance_work_mem = '1MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; - QUERY PLAN -------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (stringu1 = 'TVAAAA'::name) - -> Bitmap Index Scan on hash_tuplesort_idx - Index Cond: (stringu1 = 'TVAAAA'::name) -(7 rows) + -> Index Scan using hash_tuplesort_idx on tenk1 + Index Cond: (stringu1 = 'TVAAAA'::name) +(5 rows) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; count @@ -2985,16 +2983,12 @@ explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: unique1 - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(7 rows) + -> Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) +(3 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) From 61c26682a51062d71331fdaaecfbbf9e1725470b Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Feb 2021 20:31:30 +0800 Subject: [PATCH 106/578] report error if squeue null --- src/backend/tcop/pquery.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 3f2e7d62..7a529dac 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1409,6 +1409,11 @@ PortalRun(Portal portal, long count, bool isTopLevel, bool run_once, int myindex = queryDesc->myindex; TupleTableSlot *slot; + if (squeue == NULL) + { + elog(ERROR, "squeue: %s is null, myindex: %d, atStart: %d, atEnd: %d", portal->name, myindex, portal->atStart, portal->atEnd); + } + /* * We are the consumer. 
* We have skipped plan initialization, hence we do not have From 73eaa3eccdfa575e9535035972872d1ea9e867da Mon Sep 17 00:00:00 2001 From: anthonyyan Date: Sun, 7 Feb 2021 10:43:48 +0800 Subject: [PATCH 107/578] fix cn002 coredump when cn001 switchover, query pgxc_node must wrap transaction, http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084796329 (merge request !157) (cherry picked from commit c5021b12) 1096f0a6 fix cn002 coredump when cn001 switchover, query pgxc_node must wrap transaction, http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084796329 --- src/backend/pgxc/nodemgr/nodemgr.c | 25 ++++++++++++++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 2 +- src/backend/pgxc/pool/poolmgr.c | 6 +++--- src/include/pgxc/nodemgr.h | 2 +- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 8767e723..2a114a9e 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -577,7 +577,7 @@ count_coords_datanodes(Relation rel, int *num_coord, int *num_dns) * * Update node definitions in the shared memory tables from the catalog */ -void +static void PgxcNodeListAndCount(void) {// #lizard forgives Relation rel; @@ -800,6 +800,29 @@ PgxcNodeListAndCount(void) LWLockRelease(NodeTableLock); } +/* + * PgxcNodeListAndCountWrapTransaction + * + * Update node definitions in the shared memory tables from the catalog wrap the transaction + */ +void +PgxcNodeListAndCountWrapTransaction(void) +{ + bool need_abort = false; + + if (!IsTransactionOrTransactionBlock()) + { + StartTransactionCommand(); + need_abort = true; + } + + PgxcNodeListAndCount(); + + if (need_abort) + { + AbortCurrentTransaction(); + } +} /* * PgxcNodeGetIds diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 12873dac..8f76dc2a 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -240,7 +240,7 @@ InitMultinodeExecutor(bool is_force) return; /* Update node table in the shared memory */ - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); /* Get classified list of node Oids */ PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids, &NumCoords, &NumDataNodes, &NumSlaveDataNodes, true); diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index ca079833..f64b45c3 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -2050,7 +2050,7 @@ PoolManagerCheckConnectionInfo(void) ConnectPoolManager(); } - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); @@ -2070,7 +2070,7 @@ void PoolManagerReloadConnectionInfo(void) { Assert(poolHandle); - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'p', NULL, 0); pool_flush(&poolHandle->port); } @@ -10640,7 +10640,7 @@ PoolManagerRefreshConnectionInfo(void) HOLD_POOLER_RELOAD(); Assert(poolHandle); - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'R', NULL, 0); pool_flush(&poolHandle->port); diff --git a/src/include/pgxc/nodemgr.h b/src/include/pgxc/nodemgr.h index ee6c7417..0f31d8cc 100644 --- a/src/include/pgxc/nodemgr.h +++ b/src/include/pgxc/nodemgr.h @@ -47,7 +47,7 @@ extern Size NodeHashTableShmemSize(void); #endif extern Size NodeTablesShmemSize(void); -extern void PgxcNodeListAndCount(void); +extern void PgxcNodeListAndCountWrapTransaction(void); 
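/*
 * [Editor's sketch -- illustrative only, not part of the patch above.]
 * Why the wrapper exists: PgxcNodeListAndCount() scans the pgxc_node
 * catalog, and catalog access needs an active transaction (snapshot and
 * resource owner).  Callers such as InitMultinodeExecutor() or the pooler
 * check/reload/refresh paths may run when no transaction is open, so the
 * wrapper starts a throw-away transaction only in that case (via
 * IsTransactionOrTransactionBlock()/StartTransactionCommand()) and aborts
 * it afterwards.  The caller below is hypothetical and only shows the
 * intended usage pattern.
 */
static void
example_refresh_shared_node_table(void)
{
    /* Safe to call whether or not a transaction is already open. */
    PgxcNodeListAndCountWrapTransaction();
}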
extern void PgxcNodeGetOidsExtend(Oid **coOids, Oid **dnOids, Oid **sdnOids, int *num_coords, int *num_dns, int *num_sdns, bool update_preferred); From f581cb881b97cfcf03d1775007a64eccabd76a8c Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Feb 2021 12:43:41 +0000 Subject: [PATCH 108/578] Allow modifying pg_node_tree by guc: allow_force_ddl (merge request !106) (cherry picked from commit 2a5a6850) --- src/backend/utils/adt/pseudotypes.c | 345 ++++++++++++++-------------- 1 file changed, 176 insertions(+), 169 deletions(-) diff --git a/src/backend/utils/adt/pseudotypes.c b/src/backend/utils/adt/pseudotypes.c index f55ffff9..60a08586 100644 --- a/src/backend/utils/adt/pseudotypes.c +++ b/src/backend/utils/adt/pseudotypes.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pseudotypes.c - * Functions for the system pseudo-types. + * Functions for the system pseudo-types. * * A pseudo-type isn't really a type and never has any operations, but * we do need to supply input and output functions to satisfy the links @@ -17,7 +17,7 @@ * * * IDENTIFICATION - * src/backend/utils/adt/pseudotypes.c + * src/backend/utils/adt/pseudotypes.c * *------------------------------------------------------------------------- */ @@ -34,22 +34,25 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" #endif +#ifdef __TBASE__ +#include "utils/guc.h" +#endif /* - * cstring_in - input routine for pseudo-type CSTRING. + * cstring_in - input routine for pseudo-type CSTRING. * * We might as well allow this to support constructs like "foo_in('blah')". */ Datum cstring_in(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); + char *str = PG_GETARG_CSTRING(0); - PG_RETURN_CSTRING(pstrdup(str)); + PG_RETURN_CSTRING(pstrdup(str)); } /* - * cstring_out - output routine for pseudo-type CSTRING. + * cstring_out - output routine for pseudo-type CSTRING. * * We allow this mainly so that "SELECT some_output_function(...)" does * what the user will expect. @@ -57,61 +60,61 @@ cstring_in(PG_FUNCTION_ARGS) Datum cstring_out(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); + char *str = PG_GETARG_CSTRING(0); - PG_RETURN_CSTRING(pstrdup(str)); + PG_RETURN_CSTRING(pstrdup(str)); } /* - * cstring_recv - binary input routine for pseudo-type CSTRING. + * cstring_recv - binary input routine for pseudo-type CSTRING. */ Datum cstring_recv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - char *str; - int nbytes; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + char *str; + int nbytes; - str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); - PG_RETURN_CSTRING(str); + str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); + PG_RETURN_CSTRING(str); } /* - * cstring_send - binary output routine for pseudo-type CSTRING. + * cstring_send - binary output routine for pseudo-type CSTRING. */ Datum cstring_send(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); - StringInfoData buf; + char *str = PG_GETARG_CSTRING(0); + StringInfoData buf; - pq_begintypsend(&buf); - pq_sendtext(&buf, str, strlen(str)); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + pq_begintypsend(&buf); + pq_sendtext(&buf, str, strlen(str)); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* - * anyarray_in - input routine for pseudo-type ANYARRAY. + * anyarray_in - input routine for pseudo-type ANYARRAY. 
*/ Datum anyarray_in(PG_FUNCTION_ARGS) { #ifdef XCP - /* - * XCP version of array_in() understands prefix describing element type - */ - return array_in(fcinfo); + /* + * XCP version of array_in() understands prefix describing element type + */ + return array_in(fcinfo); #else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyarray"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyarray"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ #endif } /* - * anyarray_out - output routine for pseudo-type ANYARRAY. + * anyarray_out - output routine for pseudo-type ANYARRAY. * * We may as well allow this, since array_out will in fact work. * XCP needs to send from data nodes to coordinator values of that type. @@ -122,58 +125,58 @@ Datum anyarray_out(PG_FUNCTION_ARGS) { #ifdef XCP - /* - * Output prefix: (type_namespace_name.typename) to look up actual element - * type at the destination node then output in usual format for array - */ - ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); - Oid element_type = ARR_ELEMTYPE(v); - Form_pg_type typeForm; - HeapTuple typeTuple; - char *typname, - *typnspname; - /* two identifiers, parenthesis, dot and trailing \0 */ - char prefix[2*NAMEDATALEN+4], - *retval, - *newval; - int prefixlen, retvallen; - Datum array_out_result; - MemoryContext save_context; - - save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); - /* Figure out type name and type namespace */ - typeTuple = SearchSysCache(TYPEOID, - ObjectIdGetDatum(element_type), - 0, 0, 0); - if (!HeapTupleIsValid(typeTuple)) - elog(ERROR, "cache lookup failed for type %u", element_type); - typeForm = (Form_pg_type) GETSTRUCT(typeTuple); - typname = NameStr(typeForm->typname); - typnspname = get_namespace_name(typeForm->typnamespace); - - sprintf(prefix, "(%s.%s)", typnspname, typname); - ReleaseSysCache(typeTuple); - MemoryContextSwitchTo(save_context); - - /* Get standard output and make up prefixed result */ - array_out_result = array_out(fcinfo); - retval = DatumGetCString(array_out_result); - prefixlen = strlen(prefix); - retvallen = strlen(retval); - newval = (char *) palloc(prefixlen + retvallen + 1); - strcpy(newval, prefix); - strcpy(newval + prefixlen, retval); - - pfree(retval); - - PG_RETURN_CSTRING(newval); + /* + * Output prefix: (type_namespace_name.typename) to look up actual element + * type at the destination node then output in usual format for array + */ + ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); + Oid element_type = ARR_ELEMTYPE(v); + Form_pg_type typeForm; + HeapTuple typeTuple; + char *typname, + *typnspname; + /* two identifiers, parenthesis, dot and trailing \0 */ + char prefix[2*NAMEDATALEN+4], + *retval, + *newval; + int prefixlen, retvallen; + Datum array_out_result; + MemoryContext save_context; + + save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); + /* Figure out type name and type namespace */ + typeTuple = SearchSysCache(TYPEOID, + ObjectIdGetDatum(element_type), + 0, 0, 0); + if (!HeapTupleIsValid(typeTuple)) + elog(ERROR, "cache lookup failed for type %u", element_type); + typeForm = (Form_pg_type) GETSTRUCT(typeTuple); + typname = NameStr(typeForm->typname); + typnspname = get_namespace_name(typeForm->typnamespace); + + sprintf(prefix, "(%s.%s)", typnspname, typname); + ReleaseSysCache(typeTuple); + MemoryContextSwitchTo(save_context); + + /* Get standard output and make up prefixed result */ + 
array_out_result = array_out(fcinfo); + retval = DatumGetCString(array_out_result); + prefixlen = strlen(prefix); + retvallen = strlen(retval); + newval = (char *) palloc(prefixlen + retvallen + 1); + strcpy(newval, prefix); + strcpy(newval + prefixlen, retval); + + pfree(retval); + + PG_RETURN_CSTRING(newval); #else - return array_out(fcinfo); + return array_out(fcinfo); #endif } /* - * anyarray_recv - binary input routine for pseudo-type ANYARRAY. + * anyarray_recv - binary input routine for pseudo-type ANYARRAY. * * XXX this could actually be made to work, since the incoming array * data will contain the element type OID. Need to think through @@ -182,75 +185,75 @@ anyarray_out(PG_FUNCTION_ARGS) Datum anyarray_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyarray"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyarray"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyarray_send - binary output routine for pseudo-type ANYARRAY. + * anyarray_send - binary output routine for pseudo-type ANYARRAY. * * We may as well allow this, since array_send will in fact work. */ Datum anyarray_send(PG_FUNCTION_ARGS) { - return array_send(fcinfo); + return array_send(fcinfo); } /* - * anyenum_in - input routine for pseudo-type ANYENUM. + * anyenum_in - input routine for pseudo-type ANYENUM. */ Datum anyenum_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyenum"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyenum"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyenum_out - output routine for pseudo-type ANYENUM. + * anyenum_out - output routine for pseudo-type ANYENUM. * * We may as well allow this, since enum_out will in fact work. */ Datum anyenum_out(PG_FUNCTION_ARGS) { - return enum_out(fcinfo); + return enum_out(fcinfo); } /* - * anyrange_in - input routine for pseudo-type ANYRANGE. + * anyrange_in - input routine for pseudo-type ANYRANGE. */ Datum anyrange_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyrange"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyrange"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyrange_out - output routine for pseudo-type ANYRANGE. + * anyrange_out - output routine for pseudo-type ANYRANGE. * * We may as well allow this, since range_out will in fact work. */ Datum anyrange_out(PG_FUNCTION_ARGS) { - return range_out(fcinfo); + return range_out(fcinfo); } /* - * void_in - input routine for pseudo-type VOID. + * void_in - input routine for pseudo-type VOID. * * We allow this so that PL functions can return VOID without any special * hack in the PL handler. Whatever value the PL thinks it's returning @@ -259,22 +262,22 @@ anyrange_out(PG_FUNCTION_ARGS) Datum void_in(PG_FUNCTION_ARGS) { - PG_RETURN_VOID(); /* you were expecting something different? */ + PG_RETURN_VOID(); /* you were expecting something different? */ } /* - * void_out - output routine for pseudo-type VOID. + * void_out - output routine for pseudo-type VOID. 
* * We allow this so that "SELECT function_returning_void(...)" works. */ Datum void_out(PG_FUNCTION_ARGS) { - PG_RETURN_CSTRING(pstrdup("")); + PG_RETURN_CSTRING(pstrdup("")); } /* - * void_recv - binary input routine for pseudo-type VOID. + * void_recv - binary input routine for pseudo-type VOID. * * Note that since we consume no bytes, an attempt to send anything but * an empty string will result in an "invalid message format" error. @@ -282,11 +285,11 @@ void_out(PG_FUNCTION_ARGS) Datum void_recv(PG_FUNCTION_ARGS) { - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * void_send - binary output routine for pseudo-type VOID. + * void_send - binary output routine for pseudo-type VOID. * * We allow this so that "SELECT function_returning_void(...)" works * even when binary output is requested. @@ -294,42 +297,42 @@ void_recv(PG_FUNCTION_ARGS) Datum void_send(PG_FUNCTION_ARGS) { - StringInfoData buf; + StringInfoData buf; - /* send an empty string */ - pq_begintypsend(&buf); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + /* send an empty string */ + pq_begintypsend(&buf); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* - * shell_in - input routine for "shell" types (those not yet filled in). + * shell_in - input routine for "shell" types (those not yet filled in). */ Datum shell_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of a shell type"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of a shell type"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * shell_out - output routine for "shell" types. + * shell_out - output routine for "shell" types. */ Datum shell_out(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot display a value of a shell type"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot display a value of a shell type"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_in - input routine for type PG_NODE_TREE. + * pg_node_tree_in - input routine for type PG_NODE_TREE. * * pg_node_tree isn't really a pseudotype --- it's real enough to be a table * column --- but it presently has no operations of its own, and disallows @@ -338,53 +341,57 @@ shell_out(PG_FUNCTION_ARGS) Datum pg_node_tree_in(PG_FUNCTION_ARGS) { - /* - * We disallow input of pg_node_tree values because the SQL functions that - * operate on the type are not secure against malformed input. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_node_tree"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ +#ifdef __TBASE__ + if (g_allow_force_ddl) + return textin(fcinfo); +#endif + /* + * We disallow input of pg_node_tree values because the SQL functions that + * operate on the type are not secure against malformed input. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_node_tree"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_out - output routine for type PG_NODE_TREE. + * pg_node_tree_out - output routine for type PG_NODE_TREE. * * The internal representation is the same as TEXT, so just pass it off. */ Datum pg_node_tree_out(PG_FUNCTION_ARGS) { - return textout(fcinfo); + return textout(fcinfo); } /* - * pg_node_tree_recv - binary input routine for type PG_NODE_TREE. 
+ * pg_node_tree_recv - binary input routine for type PG_NODE_TREE. */ Datum pg_node_tree_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_node_tree"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_node_tree"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_send - binary output routine for type PG_NODE_TREE. + * pg_node_tree_send - binary output routine for type PG_NODE_TREE. */ Datum pg_node_tree_send(PG_FUNCTION_ARGS) { - return textsend(fcinfo); + return textsend(fcinfo); } /* - * pg_ddl_command_in - input routine for type PG_DDL_COMMAND. + * pg_ddl_command_in - input routine for type PG_DDL_COMMAND. * * Like pg_node_tree, pg_ddl_command isn't really a pseudotype; it's here for * the same reasons as that one. @@ -392,55 +399,55 @@ pg_node_tree_send(PG_FUNCTION_ARGS) Datum pg_ddl_command_in(PG_FUNCTION_ARGS) { - /* - * Disallow input of pg_ddl_command value. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ddl_command"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ + /* + * Disallow input of pg_ddl_command value. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ddl_command"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_ddl_command_out - output routine for type PG_DDL_COMMAND. + * pg_ddl_command_out - output routine for type PG_DDL_COMMAND. * * We don't have any good way to output this type directly, so punt. */ Datum pg_ddl_command_out(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot output a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot output a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * pg_ddl_command_recv - binary input routine for type PG_DDL_COMMAND. + * pg_ddl_command_recv - binary input routine for type PG_DDL_COMMAND. */ Datum pg_ddl_command_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * pg_ddl_command_send - binary output routine for type PG_DDL_COMMAND. + * pg_ddl_command_send - binary output routine for type PG_DDL_COMMAND. 
*/ Datum pg_ddl_command_send(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot output a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot output a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } @@ -453,21 +460,21 @@ pg_ddl_command_send(PG_FUNCTION_ARGS) Datum \ typname##_in(PG_FUNCTION_ARGS) \ { \ - ereport(ERROR, \ - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("cannot accept a value of type %s", #typname))); \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("cannot accept a value of type %s", #typname))); \ \ - PG_RETURN_VOID(); /* keep compiler quiet */ \ + PG_RETURN_VOID(); /* keep compiler quiet */ \ } \ \ Datum \ typname##_out(PG_FUNCTION_ARGS) \ { \ - ereport(ERROR, \ - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("cannot display a value of type %s", #typname))); \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("cannot display a value of type %s", #typname))); \ \ - PG_RETURN_VOID(); /* keep compiler quiet */ \ + PG_RETURN_VOID(); /* keep compiler quiet */ \ } \ \ extern int no_such_variable From cf69d15ed2c89fd16a4146defe2e512f3041f9b9 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 11 May 2021 10:53:17 +0800 Subject: [PATCH 109/578] Support concurrent update with remote subplans (merge request !322) Squash merge branch 'andrelin/try_update' into 'Tbase_v2.15.16.11' * cover regression expectation * Push epqContext including tid, range table idx, ntuples of epqTuple to remote * Upgrade version to TBase_V2.15.16.11 * fix warnings * Reset cursor name of remote subplans when EvalPlanQual tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131864520567 --- src/backend/access/transam/gtm.c | 1 - src/backend/executor/execMain.c | 126 +++++++++++++++++- src/backend/nodes/nodeFuncs.c | 110 +++++++++++++++ src/backend/optimizer/prep/preptlist.c | 14 ++ src/backend/pgxc/nodemgr/nodemgr.c | 1 + src/backend/pgxc/pool/execRemote.c | 65 ++++++++- src/backend/pgxc/pool/pgxcnode.c | 23 +++- src/backend/tcop/postgres.c | 23 ++++ src/backend/tcop/pquery.c | 24 +++- src/include/executor/execdesc.h | 1 + src/include/nodes/execnodes.h | 14 ++ src/include/nodes/plannodes.h | 5 + src/include/pgxc/pgxcnode.h | 3 +- src/include/utils/portal.h | 3 + src/test/regress/expected/subselect.out | 8 +- src/test/regress/expected/xc_FQS_join_1.out | 48 +++---- src/test/regress/expected/xc_for_update_1.out | 80 +++++------ 17 files changed, 465 insertions(+), 84 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 981332b6..5f4e8218 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1341,7 +1341,6 @@ GetGlobalTimestampGTM(void) GTM_Timestamp latest_gts = InvalidGlobalTimestamp; struct rusage start_r; struct timeval start_t; - int retries = 0; if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4e0e708b..472bec42 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -84,6 +84,7 @@ #include "pgxc/poolmgr.h" #endif #ifdef __TBASE__ +#include "optimizer/planmain.h" #include "pgxc/squeue.h" #include "utils/relfilenodemap.h" #endif @@ -141,6 +142,9 @@ static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, static int ExecCheckRTERelkindextPerms(RangeTblEntry *rte); #endif +static bool 
ResetRemoteSubplanCursor(Plan *plan, List *subplans, void *context); +static void AttachRemoteEPQContext(EState *estate, RemoteEPQContext *epq); + /* * Note that GetUpdatedColumns() also exists in commands/trigger.c. There does * not appear to be any good header to put it into, given the structures that @@ -1161,6 +1165,8 @@ InitPlan(QueryDesc *queryDesc, int eflags) estate->es_epqTuple = NULL; estate->es_epqTupleSet = NULL; estate->es_epqScanDone = NULL; + if (queryDesc->epqContext != NULL) + AttachRemoteEPQContext(estate, queryDesc->epqContext); /* * Initialize private state information for each SubPlan. We must do this @@ -2677,6 +2683,15 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + +#ifdef __TBASE__ + /* we need xc_node_id combined with ctid to determine physical tuple */ + snprintf(resname, sizeof(resname), "xc_node_id%u", erm->rowmarkId); + aerm->nodeidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->nodeidAttNo)) + elog(ERROR, "could not find junk %s column", resname); +#endif } else { @@ -3054,11 +3069,14 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, Plan *subplan, List *auxrowmarks, int epqParam) { /* Mark the EPQ state inactive */ + epqstate->parentestate = estate; epqstate->estate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; /* ... and remember data that EvalPlanQualBegin will need */ - epqstate->plan = subplan; + epqstate->plan = copyObject(subplan); + /* Reset cursor name of remote subplans if any */ + ResetRemoteSubplanCursor(epqstate->plan, estate->es_plannedstmt->subplans, "epq"); epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3074,7 +3092,11 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) /* If we have a live EPQ query, shut it down */ EvalPlanQualEnd(epqstate); /* And set/change the plan pointer */ - epqstate->plan = subplan; + epqstate->plan = copyObject(subplan); + /* Reset cursor name of remote subplans if any */ + ResetRemoteSubplanCursor(epqstate->plan, + epqstate->parentestate->es_plannedstmt->subplans, + "epq"); /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; } @@ -3205,8 +3227,15 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) { /* ordinary table, fetch the tuple */ Buffer buffer; + uint32 xc_node_id; tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); + + xc_node_id = DatumGetUInt32(ExecGetJunkAttribute(epqstate->origslot, + aerm->nodeidAttNo, + &isNull)); + if (xc_node_id == PGXCNodeIdentifier) + { if (!heap_fetch(erm->relation, SnapshotAny, &tuple, &buffer, false, NULL)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); @@ -3227,6 +3256,14 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) #endif ReleaseBuffer(buffer); } + else + { + copyTuple = (HeapTuple) palloc(HEAPTUPLESIZE); + copyTuple->t_self = tuple.t_self; + } + + copyTuple->t_xc_node_id = xc_node_id; + } /* store tuple */ EvalPlanQualSetTuple(epqstate, erm->rti, copyTuple); @@ -3510,6 +3547,7 @@ EvalPlanQualEnd(EPQState *epqstate) /* Mark EPQState idle */ epqstate->estate = NULL; + epqstate->parentestate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; } @@ -3957,4 +3995,88 @@ int ExecCheckPgclassAuthority(ScanState *node, TupleTableSlot *slot) } #endif +/* + * ResetRemoteSubplanCursor + * walker to find out RemoteSubplan and re-generate a cursor for it + * currently it is used in 
EvalPlanQual, otherwise EvalPlanQual will + * use old cursor name to create a duplicate portal, which is illegal. + */ +static bool +ResetRemoteSubplanCursor(Plan *plan, List *subplans, void *context) +{ + if (plan == NULL) + return false; + + if (IsA(plan, RemoteSubplan)) + { + RemoteSubplan *rsp = castNode(RemoteSubplan, plan); + char *origin_cursor = rsp->cursor; + rsp->cursor = (char *) palloc(NAMEDATALEN); + snprintf(rsp->cursor, NAMEDATALEN, "%s_%s", origin_cursor, (const char *) context); + } + + return plantree_walker(plan, subplans, ResetRemoteSubplanCursor, context); +} +static void +AttachRemoteEPQContext(EState *estate, RemoteEPQContext *epq) +{ + int i; + int rtsize = list_length(estate->es_range_table); + Relation relation; + + estate->es_epqTuple = (HeapTuple *) + palloc0(rtsize * sizeof(HeapTuple)); + estate->es_epqTupleSet = (bool *) + palloc0(rtsize * sizeof(bool)); + estate->es_epqScanDone = (bool *) + palloc0(rtsize * sizeof(bool)); + + for(i = 0; i < epq->ntuples; i++) + { + HeapTuple copyTuple; + HeapTupleData tuple; + Buffer buffer; + int idx = epq->rtidx[i]; + + if (epq->nodeid[i] != PGXCNodeIdentifier) + { + estate->es_epqTupleSet[idx - 1] = true; + estate->es_epqScanDone[idx - 1] = true; + continue; + } + + relation = relation_open(getrelid(idx, estate->es_range_table), NoLock); + if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + elog(ERROR, "foreign table does not support remote epq process"); + + tuple.t_self = epq->tid[i]; + if (!heap_fetch(relation, SnapshotAny, &tuple, &buffer, + false, NULL)) + { + elog(DEBUG1, "failed to fetch tuple for remote EvalPlanQual recheck"); + relation_close(relation, NoLock); + continue; + } + +#ifdef _MLS_ + if (HeapTupleHeaderGetNatts(tuple.t_data) < + RelationGetDescr(relation)->natts) + { + copyTuple = heap_expand_tuple(&tuple, + RelationGetDescr(relation)); + } + else +#endif + { + /* successful, copy tuple */ + copyTuple = heap_copytuple(&tuple); + } + + estate->es_epqTuple[idx - 1] = copyTuple; + estate->es_epqTupleSet[idx - 1] = true; + + ReleaseBuffer(buffer); + relation_close(relation, NoLock); + } +} diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 47ebb30d..27a1b7a3 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3872,3 +3872,113 @@ planstate_walk_members(List *plans, PlanState **planstates, return false; } + +/* + * Walk a list of SubPlans (or initPlans, which also use SubPlan nodes). + */ +static bool +plantree_walk_initplans(List *plans, + List *subplans, + bool (*walker) (), + void *context) +{ + ListCell *lc; + + foreach(lc, plans) + { + Plan *splan = list_nth_node(Plan, subplans, + (lfirst_node(SubPlan, lc))->plan_id); + + if (walker(splan, context)) + return true; + } + + return false; +} + +/* + * plantree_walker --- walk plan trees + * + * The walker has already visited the current node, and so we need only + * recurse into any sub-nodes it has. 
+ */ +bool +plantree_walker(Plan *plan, + List *top_subplans, + bool (*walker) (), + void *context) +{ + ListCell *lc; + + if (plan == NULL) + return false; + + /* initPlan-s */ + if (plantree_walk_initplans(plan->initPlan, top_subplans, walker, context)) + return true; + + /* lefttree */ + if (walker(plan->lefttree, top_subplans, context)) + return true; + + /* righttree */ + if (walker(plan->righttree, top_subplans, context)) + return true; + + /* special child plans */ + switch (nodeTag(plan)) + { + case T_ModifyTable: + foreach(lc, ((ModifyTable *) plan)->plans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_Append: + foreach(lc, ((Append *) plan)->appendplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_MergeAppend: + foreach(lc, ((MergeAppend *) plan)->mergeplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_BitmapAnd: + foreach(lc, ((BitmapAnd *) plan)->bitmapplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_BitmapOr: + foreach(lc, ((BitmapOr *) plan)->bitmapplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_SubqueryScan: + { + if (walker(castNode(SubqueryScan, plan)->subplan, top_subplans, context)) + return true; + } + break; + case T_CustomScan: + foreach(lc, ((CustomScan *) plan)->custom_plans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + default: + break; + } + + return false; +} diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 9091f15f..2a69be86 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -368,6 +368,20 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) pstrdup(resname), true); tlist = lappend(tlist, tle); + + /* Need to fetch another xc_node_id */ + var = makeVar(rc->rti, + XC_NodeIdAttributeNumber, + INT4OID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "xc_node_id%u", rc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); } if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) { diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 2a114a9e..570aadee 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -39,6 +39,7 @@ #endif #ifdef __TBASE__ +#include "access/xact.h" #include "libpq/libpq.h" #endif bool enable_multi_cluster = true; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index cff54d43..80829005 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10555,7 +10555,8 @@ append_param_data(StringInfo buf, Oid ptype, int pused, Datum value, bool isnull } -static int encode_parameters(int nparams, RemoteParam *remoteparams, +static int +encode_parameters(int nparams, RemoteParam *remoteparams, PlanState *planstate, char** result) { EState *estate = planstate->state; @@ -10616,6 +10617,57 @@ static int encode_parameters(int nparams, RemoteParam *remoteparams, return buf.len; } +/* + * Encode executor context for EvalPlanQual process including: + * the number of epqTuples, the ctid and xc_node_id of each tuple. 
+ */ +static int +encode_epqcontext(PlanState *planstate, char **result) +{ + EState *estate = planstate->state; + StringInfoData buf; + uint16 n16; + uint32 n32; + int ntuples = list_length(estate->es_range_table); + int i; + ExprContext *econtext; + MemoryContext oldcontext; + + if (planstate->ps_ExprContext == NULL) + ExecAssignExprContext(estate, planstate); + + econtext = planstate->ps_ExprContext; + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + initStringInfo(&buf); + + /* Number of epq tuples */ + n16 = htons(ntuples); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + for (i = 0; i < ntuples; i++) + { + ItemPointerData tid = estate->es_epqTuple[i]->t_self; + int rtidx = i + 1; + + n16 = htons(rtidx); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_blkid.bi_hi); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_blkid.bi_lo); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_posid); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n32 = htonl(estate->es_epqTuple[i]->t_xc_node_id); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + MemoryContextSwitchTo(oldcontext); + return buf.len; +} TupleTableSlot * ExecRemoteSubplan(PlanState *pstate) @@ -10665,7 +10717,10 @@ ExecRemoteSubplan(PlanState *pstate) { int fetch = 0; int paramlen = 0; + int epqctxlen = 0; char *paramdata = NULL; + char *epqctxdata = NULL; + /* * Conditions when we want to execute query on the primary node first: * Coordinator running replicated ModifyTable on multiple nodes @@ -10732,6 +10787,9 @@ ExecRemoteSubplan(PlanState *pstate) &combiner->ss.ps, ¶mdata); + if (estate->es_epqTuple != NULL) + epqctxlen = encode_epqcontext(&combiner->ss.ps, &epqctxdata); + /* * The subplan being rescanned, need to restore connections and * re-bind the portal @@ -10771,7 +10829,7 @@ ExecRemoteSubplan(PlanState *pstate) /* rebind */ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, - paramlen, paramdata); + paramlen, paramdata, epqctxlen, epqctxdata); if (enable_statistic) { elog(LOG, "Bind Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", @@ -10859,7 +10917,8 @@ ExecRemoteSubplan(PlanState *pstate) } /* bind */ - pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata); + pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata, + epqctxlen, epqctxdata); if (enable_statistic) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8f76dc2a..36558205 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2145,13 +2145,15 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, */ int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params) -{// #lizard forgives + const char *statement, int paramlen, const char *params, + int epqctxlen, const char *epqctx) +{ int pnameLen; int stmtLen; int paramCodeLen; int paramValueLen; int paramOutLen; + int epqCtxLen; int msgLen; /* Invalid connection state, return error */ @@ -2168,8 +2170,10 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, paramValueLen = paramlen ? paramlen : 2; /* size of output parameter codes array (always empty for now) */ paramOutLen = 2; + /* size of epq context, 2 if not epq */ + epqCtxLen = epqctxlen ? 
epqctxlen : 2; /* size + pnameLen + stmtLen + parameters */ - msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen; + msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen + epqCtxLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2216,6 +2220,17 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, /* output parameter codes (none) */ handle->outBuffer[handle->outEnd++] = 0; handle->outBuffer[handle->outEnd++] = 0; + /* output epq context */ + if (epqctxlen) + { + memcpy(handle->outBuffer + handle->outEnd, epqctx, epqctxlen); + handle->outEnd += epqctxlen; + } + else + { + handle->outBuffer[handle->outEnd++] = 0; + handle->outBuffer[handle->outEnd++] = 0; + } handle->in_extended_query = true; return 0; @@ -2463,7 +2478,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (query) if (pgxc_node_send_parse(handle, statement, query, num_params, param_types)) return EOF; - if (pgxc_node_send_bind(handle, portal, statement, paramlen, params)) + if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL)) return EOF; if (send_describe) if (pgxc_node_send_describe(handle, false, portal)) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 4ed9b7d9..def87c2d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2151,6 +2151,7 @@ exec_bind_message(StringInfo input_message) int16 *pformats = NULL; int numParams; int numRFormats; + int num_epq_tuple; int16 *rformats = NULL; CachedPlanSource *psrc; CachedPlan *cplan; @@ -2687,6 +2688,28 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } + /* Get epq context */ + num_epq_tuple = pq_getmsgint(input_message, 2); + if (num_epq_tuple > 0) + { + int i; + + portal->epqContext = palloc(sizeof(RemoteEPQContext)); + portal->epqContext->ntuples = num_epq_tuple; + portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); + portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); + portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); + + for (i = 0; i < num_epq_tuple; i++) + { + portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); + portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); + } + } + pq_getmsgend(input_message); /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 7a529dac..16179e73 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -129,6 +129,7 @@ CreateQueryDesc(PlannedStmt *plannedstmt, #ifdef __TBASE__ qd->sender = NULL; qd->es_param_exec_vals = NULL; + qd->epqContext = NULL; #endif /* not yet executed */ @@ -681,6 +682,13 @@ PortalStart(Portal portal, ParamListInfo params, params, NULL, 0); + + /* + * set information about EvalPlanQual if any, they will be fill in + * estate later after it been created. + */ + queryDesc->epqContext = portal->epqContext; + /* * If parent node have sent down parameters, and at least one * of them is PARAM_EXEC we should avoid "single execution" @@ -697,13 +705,13 @@ PortalStart(Portal portal, ParamListInfo params, * here since queryDesc->plannedstmt->nParamExec may be used * just to allocate space for them and no actual values passed. 
* - * If distributionType is LOCATOR_TYPE_SHARD, even with parameters - * PARAM_EXEC, still follow the redistribution logic, otherwise, - * it may cause SharedQueue conflict in the lower layer redistribution + * Also, if we are doing EvalPlanQual, we will be rescan soon, which + * is not supported in SharedQueue mode. Force to do it traditionally. */ #ifdef __TBASE__ - if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) + if ((!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) || + queryDesc->epqContext != NULL) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) @@ -1012,6 +1020,12 @@ PortalStart(Portal portal, ParamListInfo params, 0); /* + * set information about EvalPlanQual if any, they will be fill in + * estate later after it been created. + */ + queryDesc->epqContext = portal->epqContext; + + /* * If it's a scrollable cursor, executor needs to support * REWIND and backwards scan, as well as whatever the caller * might've asked for. diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index 94f6449d..00e26823 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -117,6 +117,7 @@ typedef struct QueryDesc #ifdef __TBASE__ DataPumpSender sender; /* used for locally data transfering */ ParamExecData *es_param_exec_vals; /* values of internal params */ + RemoteEPQContext *epqContext; /* information about EvalPlanQual from remote */ #endif int myindex; /* -1 if locally executed subplan is producing diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0b6c770c..1fdf29fe 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -596,6 +596,9 @@ typedef struct ExecAuxRowMark AttrNumber ctidAttNo; /* resno of ctid junk attribute, if any */ AttrNumber toidAttNo; /* resno of tableoid junk attribute, if any */ AttrNumber wholeAttNo; /* resno of whole-row junk attribute, if any */ +#ifdef __TBASE__ + AttrNumber nodeidAttNo; /* resno of xc_node_id junk attribute, if any */ +#endif } ExecAuxRowMark; @@ -995,6 +998,9 @@ typedef struct EPQState Plan *plan; /* plan tree to be executed */ List *arowMarks; /* ExecAuxRowMarks (non-locking only) */ int epqParam; /* ID of Param to force scan node re-eval */ +#ifdef __TBASE__ + EState *parentestate; /* parant EState, more information to modify plantree if needed */ +#endif } EPQState; @@ -2297,4 +2303,12 @@ typedef struct LimitState TupleTableSlot *subSlot; /* tuple last obtained from subplan */ } LimitState; +typedef struct RemoteEPQContext +{ + int ntuples; + int *rtidx; + ItemPointerData *tid; + uint32 *nodeid; +} RemoteEPQContext; + #endif /* EXECNODES_H */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 8a456994..ce1f6719 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1100,4 +1100,9 @@ typedef struct PlanInvalItem uint32 hashValue; /* hash value of object's cache lookup key */ } PlanInvalItem; +extern bool plantree_walker(Plan *plan, + List *top_subplans, + bool (*walker) (), + void *context); + #endif /* PLANNODES_H */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 4a2ee55b..3773cac2 100644 --- 
a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -210,7 +210,8 @@ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bo extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params); + const char *statement, int paramlen, const char *params, + int eqpctxlen, const char *epqctx); extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, const char *query, short num_params, Oid *param_types); extern int pgxc_node_send_flush(PGXCNodeHandle * handle); diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 9c4515b8..2a4a6c42 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -263,6 +263,9 @@ typedef struct PortalData * portal marked failed in subtransaction * in AtSubAbort_Portals */ + + /* information about EvalPlanQual, pass it to queryDesc */ + RemoteEPQContext *epqContext; #endif } PortalData; diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 17184f61..a1f9d561 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1796,8 +1796,8 @@ select * from x where f1 = 1; explain (verbose, costs off) with x as (select * from (select f1 from subselect_tbl for update) ss) select * from x where f1 = 1; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------- CTE Scan on x Output: x.f1 Filter: (x.f1 = 1) @@ -1807,9 +1807,9 @@ select * from x where f1 = 1; -> Subquery Scan on ss Output: ss.f1 -> LockRows - Output: subselect_tbl.f1, subselect_tbl.ctid + Output: subselect_tbl.f1, subselect_tbl.ctid, subselect_tbl.xc_node_id -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.ctid + Output: subselect_tbl.f1, subselect_tbl.ctid, subselect_tbl.xc_node_id (12 rows) -- Multiply-referenced CTEs are inlined only when requested diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index c80fb0f2..18836c1e 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -691,12 +691,12 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod, tab3_mod -- DMLs involving JOINs are not FQSed explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod. 
val2 = tab2_mod.val2; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 @@ -704,25 +704,25 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 @@ -730,25 +730,25 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and 
tab1_rep.val2 = tab2_rep.val2; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on any -> Update on public.tab1_rep -> Merge Join - Output: tab1_rep.val, 1000, tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid + Output: tab1_rep.val, 1000, tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid, tab2_rep.xc_node_id Merge Cond: ((tab1_rep.val = tab2_rep.val) AND (tab1_rep.val2 = tab2_rep.val2)) -> Sort Output: tab1_rep.val, tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val2 @@ -756,20 +756,20 @@ explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from -> Seq Scan on public.tab1_rep Output: tab1_rep.val, tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val2 -> Sort - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 Sort Key: tab2_rep.val, tab2_rep.val2 -> Seq Scan on public.tab2_rep - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 (15 rows) explain (verbose on, nodes off, costs off) delete from tab1_rep using tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on any -> Delete on public.tab1_rep -> Merge Join - Output: tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid + Output: tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid, tab2_rep.xc_node_id Merge Cond: ((tab1_rep.val = tab2_rep.val) AND (tab1_rep.val2 = tab2_rep.val2)) -> Sort Output: tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val, tab1_rep.val2 @@ -777,10 +777,10 @@ explain (verbose on, nodes off, costs off) delete from tab1_rep using tab2_rep -> Seq Scan on public.tab1_rep Output: tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val, tab1_rep.val2 -> Sort - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 Sort Key: tab2_rep.val, tab2_rep.val2 -> Seq Scan on public.tab2_rep - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 (15 rows) drop table tab1_rep; diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index b9de2234..66a13a33 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -97,12 +97,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ------------------------------------------------------------------ Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 NOWAIT -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) -- two table case @@ -279,23 +279,23 @@ select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2) f ERROR: FOR UPDATE is not allowed with joins -- check a few subquery cases explain (costs off, num_nodes off, nodes off, verbose on) select * from 
(select * from t1 for update of t1 nowait) as foo; - QUERY PLAN ------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Remote Subquery Scan on all Output: foo.val, foo.val2 -> Subquery Scan on foo Output: foo.val, foo.val2 -> LockRows - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> Seq Scan on public.t1 - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id (8 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 where val in (select val from t2 for update of t2 nowait) for update; ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 where val in (select val from t2 for update of t2 nowait); - QUERY PLAN ---------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Remote Subquery Scan on all Output: t1.val, t1.val2 -> Hash Join @@ -312,9 +312,9 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 wher -> Subquery Scan on "ANY_subquery" Output: "ANY_subquery".val -> LockRows - Output: t2.val, t2.ctid + Output: t2.val, t2.ctid, t2.xc_node_id -> Seq Scan on public.t2 - Output: t2.val, t2.ctid + Output: t2.val, t2.ctid, t2.xc_node_id (19 rows) -- test multiple row marks @@ -325,48 +325,48 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for update of t1 for share of t1; QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for share of t1 for share of t1 for update of t1; QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for share of t1 for share of t1 for share of t1; QUERY PLAN ---------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR SHARE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, 
xc_node_id (7 rows) -- make sure NOWAIT is used in remote query even if it is not mentioned with FOR UPDATE clause @@ -374,12 +374,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ------------------------------------------------------------------ Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 NOWAIT -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) -- same table , different aliases and different row marks for different aliases @@ -409,17 +409,17 @@ explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * (4 rows) explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1 FOR UPDATE; - QUERY PLAN --------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- CTE Scan on q1 Output: q1.val, q1.val2 CTE q1 -> Remote Subquery Scan on all - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> LockRows - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> Seq Scan on public.t1 - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id (9 rows) -- test case of inheried tables @@ -433,17 +433,17 @@ select * from p1 order by 1 for update; (4 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from p1 for update; - QUERY PLAN --------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all - Output: a, b, ctid, tableoid + Output: a, b, ctid, xc_node_id, tableoid -> LockRows - Output: p1.a, p1.b, p1.ctid, p1.tableoid + Output: p1.a, p1.b, p1.ctid, p1.xc_node_id, p1.tableoid -> Append -> Seq Scan on public.p1 - Output: p1.a, p1.b, p1.ctid, p1.tableoid + Output: p1.a, p1.b, p1.ctid, p1.xc_node_id, p1.tableoid -> Seq Scan on public.c1 - Output: c1.a, c1.b, c1.ctid, c1.tableoid + Output: c1.a, c1.b, c1.ctid, c1.xc_node_id, c1.tableoid (9 rows) select * from c1 order by 1 for update; @@ -457,12 +457,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from c1 for QUERY PLAN ------------------------------------------------------------ Remote Fast Query Execution - Output: c1.a, c1.b, c1.d, c1.e, c1.ctid + Output: c1.a, c1.b, c1.d, c1.e, c1.ctid, c1.xc_node_id Remote query: SELECT a, b, d, e FROM c1 FOR UPDATE OF c1 -> LockRows - Output: a, b, d, e, ctid + Output: a, b, d, e, ctid, xc_node_id -> Seq Scan on public.c1 - Output: a, b, d, e, ctid + Output: a, b, d, e, ctid, xc_node_id (7 rows) -- confirm that in various join scenarios for update gets to the remote query From 3373fc7eece4e92796654fd55583b62e7acc9498 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Mon, 10 May 2021 17:52:35 +0800 Subject: [PATCH 110/578] fix bug of not refresh relcache after clean-sharding. 
http://tapd.oa.com/20418349/bugtrace/bugs/view?bug_id=1020418349087059509 --- src/backend/pgxc/shard/shardmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index d0a2e242..38b5044a 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -71,6 +71,7 @@ #include "utils/lsyscache.h" #include "utils/fmgroids.h" #include "utils/rel.h" +#include "utils/inval.h" #include "pgxc/shardmap.h" #include "pgxc/pgxc.h" #include "pgxc/pgxcnode.h" @@ -1936,6 +1937,12 @@ void ForceRefreshShardMap(Oid groupoid) } } LWLockRelease(ShardMapLock); + + /* + * Invalidate the relcache after refresh shard map in shmem, + * because Relation->rd_locator_info changed. + */ + CacheInvalidateRelcacheAll(); } /* From b223e1595d92ce2d2afb359180e1f9cac2e1121e Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 13 May 2021 13:09:14 +0800 Subject: [PATCH 111/578] Only datanodes need to parse epqTuples from remote --- src/backend/tcop/postgres.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index def87c2d..db2b5639 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2688,7 +2688,9 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } - /* Get epq context */ + /* Get epq context, only datanodes need them */ + if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) + { num_epq_tuple = pq_getmsgint(input_message, 2); if (num_epq_tuple > 0) { @@ -2709,6 +2711,7 @@ exec_bind_message(StringInfo input_message) portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); } } + } pq_getmsgend(input_message); From da2bbd16e2f7246e3b4f87ffbd9a9f2b86484867 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 3 Jun 2021 15:10:41 +0800 Subject: [PATCH 112/578] fix compile errors --- src/backend/commands/vacuumlazy.c | 8 ++--- src/backend/pgxc/pool/poolmgr.c | 32 +++++++++++++++++++- src/backend/utils/adt/rowtypes.c | 4 +-- src/backend/utils/misc/guc.c | 13 +------- src/backend/utils/misc/mls.c | 12 ++++---- src/test/regress/expected/create_index_1.out | 2 -- src/test/regress/expected/sysviews.out | 1 + 7 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 92fe6c94..4796152a 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -2831,7 +2831,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) HeapTupleHeaderXminCommitted(tuphdr) && !HeapTupleHeaderXminFrozen(tuphdr)) { - GlobalTimestamp tuple_xmin_gts = HeapTupleHeaderGetXminTimestampAtomic(tuphdr); + GlobalTimestamp tuple_xmin_gts = HeapTupleHderGetXminTimestapAtomic(tuphdr); if (GlobalTimestampIsValid(tuple_xmin_gts) && !CommitTimestampIsLocal(tuple_xmin_gts) @@ -2857,7 +2857,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (reset) { changed = true; - HeapTupleHeaderSetXminTimestampAtomic(tuphdr, tlog_xmin_gts); + HeapTupleHderSetXminTimestapAtomic(tuphdr, tlog_xmin_gts); elog(WARNING, "relfilenode %u " "pageno %u lineoff %u xmin %u xmin_gts "INT64_FORMAT" " @@ -2878,7 +2878,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (TransactionIdIsNormal(xmax) && HeapTupleHeaderXmaxCommitted(tuphdr)) { - GlobalTimestamp tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestampAtomic(tuphdr); + GlobalTimestamp tuple_xmax_gts = 
HeapTupleHderGetXmaxTimestapAtomic(tuphdr); if (GlobalTimestampIsValid(tuple_xmax_gts) && !CommitTimestampIsLocal(tuple_xmax_gts) @@ -2905,7 +2905,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (reset) { changed = true; - HeapTupleHeaderSetXmaxTimestampAtomic(tuphdr, tlog_xmax_gts); + HeapTupleHderSetXmaxTimestapAtomic(tuphdr, tlog_xmax_gts); elog(WARNING, "relfilenode " "%u pageno %u lineoff %u " diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index f64b45c3..4e1da81f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -2102,6 +2102,8 @@ agent_handle_input(PoolAgent * agent, StringInfo s) { int res; + agent->cmd_start_time = get_system_time(); + /* * During a pool cleaning, Abort, Connect and Get Connections messages * are not allowed on pooler side. @@ -2232,7 +2234,19 @@ agent_handle_input(PoolAgent * agent, StringInfo s) /* Send result */ pool_sendres(&agent->port, res, NULL, 0, true); break; - + + case 'x': /* get command statistics */ + handle_get_cmd_statistics(agent); + break; + + case 'y': /* reset command statistics */ + reset_pooler_cmd_statistics(); + break; + + case 'z': /* get connections statistics */ + handle_get_conn_statistics(agent); + break; + case EOF: /* EOF */ agent_destroy(agent); return; @@ -2242,6 +2256,12 @@ agent_handle_input(PoolAgent * agent, StringInfo s) return; } + /* if cmd_start_time is not 0, means cmd handle in main loop sync, statistic here */ + if (agent->cmd_start_time != 0) + { + update_pooler_cmd_statistics(qtype, get_system_time() - agent->cmd_start_time); + } + /* avoid reading from connection */ if ((qtype = pool_pollbyte(&agent->port)) == EOF) break; @@ -6263,6 +6283,11 @@ static void pooler_handle_sync_response_queue(void) abort(); } } + + if (connRsp->cmd_start_time != 0) + { + update_pooler_cmd_statistics(connRsp->cmd, connRsp->cmd_end_time - connRsp->cmd_start_time); + } /* handle pending agent, if any */ agent_handle_pending_agent(agent); @@ -7981,6 +8006,11 @@ void *pooler_sync_remote_operator_thread(void *arg) { gettimeofday(&request->end_time, NULL); } + + if (request->cmd_start_time != 0) + { + request->cmd_end_time = get_system_time(); + } /* clear task status */ pooler_async_task_done(&g_PoolSyncNetworkControl, threadIndex); diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index 3ba7b15c..38e30728 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -372,7 +372,7 @@ record_out(PG_FUNCTION_ARGS) */ if (IS_PGXC_DATANODE && tupdesc->attrs_ext) { - transparent_crypt_decrypt_all_cols_value_copy(&tuple, tupdesc, values, nulls); + trsprt_crypt_dcrpt_all_col_vale_cp(&tuple, tupdesc, values, nulls); } else { @@ -390,7 +390,7 @@ record_out(PG_FUNCTION_ARGS) if (OidIsValid(parentOid) && datamask_check_table_has_datamask(parentOid)) { - datamask_exchange_all_cols_value_copy(tupdesc, values, nulls, parentOid); + dmask_exchg_all_cols_value_copy(tupdesc, values, nulls, parentOid); } /* And build the result string */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c1770cd1..5a6afb51 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2109,18 +2109,7 @@ static struct config_bool ConfigureNamesBool[] = &g_enable_bouncer, false, NULL, NULL, NULL - }, - - { - { - "enable_pgbouncer", PGC_SIGHUP, STATS_COLLECTOR, - gettext_noop("use pgbouncer as coordinator connection pool."), - NULL - }, - &g_enable_bouncer, - false, - NULL, NULL, NULL - }, + }, 
{ { diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index c29ed21c..8902bcaf 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -380,8 +380,8 @@ bool mls_check_relation_permission(Oid relid, bool * schema_bound) return true; } - if (transparent_crypt_check_table_has_crypto(parent_oid, true, schema_bound) || - transparent_crypt_check_table_has_crypto(relid, true, schema_bound)) + if (trsprt_crypt_check_table_has_crypt(parent_oid, true, schema_bound) || + trsprt_crypt_check_table_has_crypt(relid, true, schema_bound)) { return true; } @@ -438,14 +438,14 @@ bool mls_check_column_permission(Oid relid, int attnum) { parent_oid = mls_get_parent_oid_by_relid(relid); - if (datamask_check_table_col_has_datamask(parent_oid, attnum) || - datamask_check_table_col_has_datamask(relid, attnum)) + if (dmask_check_table_col_has_dmask(parent_oid, attnum) || + dmask_check_table_col_has_dmask(relid, attnum)) { return true; } - if (transparent_crypt_check_table_col_has_crypto(parent_oid, attnum) || - transparent_crypt_check_table_col_has_crypto(relid, attnum)) + if (trsprt_crypt_chk_tbl_col_has_crypt(parent_oid, attnum) || + trsprt_crypt_chk_tbl_col_has_crypt(relid, attnum)) { return true; } diff --git a/src/test/regress/expected/create_index_1.out b/src/test/regress/expected/create_index_1.out index 30f42019..924c7c95 100644 --- a/src/test/regress/expected/create_index_1.out +++ b/src/test/regress/expected/create_index_1.out @@ -2622,7 +2622,6 @@ DROP INDEX CONCURRENTLY "concur_index2"; -- works ERROR: index "concur_index2" does not exist DROP INDEX CONCURRENTLY IF EXISTS "concur_index2"; -- notice NOTICE: index "concur_index2" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block -- failures DROP INDEX CONCURRENTLY "concur_index2", "concur_index3"; ERROR: index "concur_index2" does not exist @@ -2633,7 +2632,6 @@ ROLLBACK; -- successes DROP INDEX CONCURRENTLY IF EXISTS "concur_index3"; NOTICE: index "concur_index3" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block DROP INDEX CONCURRENTLY "concur_index4"; ERROR: index "concur_index4" does not exist DROP INDEX CONCURRENTLY "concur_index5"; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 2ab99d9d..48b73026 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -120,6 +120,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_pooler_stuck_exit | off enable_pullup_subquery | on enable_replication_slot_debug | off + enable_sampling_analyze | on enable_seqscan | on enable_shard_statistic | on enable_sort | on From 7ff1831dede2d81ea827148a221a7d08418262d4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Dec 2020 11:41:45 +0800 Subject: [PATCH 113/578] [Bugfix] hash value calculation during redistributing data (merge request !65) Should break after hash a type of datum, result may be wrong or even cause data distribute to a single DN TAPD[ID84546415]: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084546415 --- src/backend/executor/nodeAgg.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d0610f2a..d771a28e 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -6122,16 +6122,19 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int64 val = DatumGetInt64(value); 
result = (val % num) % numWorkers; } + break; case INT2OID: { int16 val = DatumGetInt16(value); result = (val % num) % numWorkers; } + break; case OIDOID: { uint32 val = (uint32)DatumGetObjectId(value); result = (val % num) % numWorkers; } + break; case INT4OID: case ABSTIMEOID: case RELTIMEOID: @@ -6140,12 +6143,14 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int32 val = DatumGetInt32(value); result = (val % num) % numWorkers; } + break; case BOOLOID: case CHAROID: { int32 val = (int32)DatumGetChar(value); result = (val % num) % numWorkers; } + break; case TIMEOID: case TIMESTAMPOID: case TIMESTAMPTZOID: @@ -6153,6 +6158,7 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int64 val = DatumGetInt64(value); result = (val % num) % numWorkers; } + break; default: { unsigned int hashvalue = 0; From e9af66964cca23897ca69691a9c2c92861493eee Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 13 Jan 2021 13:09:52 +0800 Subject: [PATCH 114/578] Support pull up subquery which has more than 2 RTE (merge request !92) it's an unnecessary limitation and add limitation about LIMIT expression, adjust code style TAPD: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084728795 --- src/backend/optimizer/plan/subselect.c | 6 +-- src/test/regress/expected/subselect.out | 50 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 24 ++++++++++++ 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 7c342fc3..3e3339f8 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1713,6 +1713,7 @@ simplify_EXPR_query(PlannerInfo *root, Query *query) query->hasModifyingCTE || query->havingQual || query->limitOffset || + query->limitCount || query->rowMarks || query->hasSubLinks || query->cteList || @@ -2604,11 +2605,6 @@ convert_EXPR_sublink_to_join(PlannerInfo *root, OpExpr *expr, return NULL; } - if (list_length(((Query *)sublink->subselect)->rtable) > 2) - { - return NULL; - } - subselect = (Query *)copyObject(sublink->subselect); /* we can just handle simple case now! 
*/ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index a1f9d561..4691a4a9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1730,6 +1730,56 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; +-- more RTEs in subquery +CREATE TABLE sub_t1 (a int4, b int4); +CREATE TABLE sub_t2 (a int4, b int4); +CREATE TABLE sub_interfere1 (a int4, b int4); +CREATE TABLE sub_interfere2 (a int4, b int4); +explain (costs off) +select 1 from + sub_t1 t1, + sub_t2 t2 +where t2.a = ( + select + min(t2.a) + from + sub_t2 t2, + sub_interfere1, + sub_interfere2 + where + t1.a = t2.a +); + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ("EXPR_subquery".min = t2.a) + -> Hash Left Join + Hash Cond: (t1.a = "EXPR_subquery".a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t1 t1 + -> Hash + -> Subquery Scan on "EXPR_subquery" + -> HashAggregate + Group Key: t2_1.a + -> Nested Loop + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2 +(23 rows) + +DROP TABLE sub_t1; +DROP TABLE sub_t2; +DROP TABLE sub_interfere1; +DROP TABLE sub_interfere2; set enable_pullup_subquery to false; -- -- Tests for CTE inlining behavior diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 66c01e19..256ddefa 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -703,6 +703,30 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; + +-- more RTEs in subquery +CREATE TABLE sub_t1 (a int4, b int4); +CREATE TABLE sub_t2 (a int4, b int4); +CREATE TABLE sub_interfere1 (a int4, b int4); +CREATE TABLE sub_interfere2 (a int4, b int4); +explain (costs off) +select 1 from + sub_t1 t1, + sub_t2 t2 +where t2.a = ( + select + min(t2.a) + from + sub_t2 t2, + sub_interfere1, + sub_interfere2 + where + t1.a = t2.a +); +DROP TABLE sub_t1; +DROP TABLE sub_t2; +DROP TABLE sub_interfere1; +DROP TABLE sub_interfere2; set enable_pullup_subquery to false; -- From df822f2db1693ac030e16aa0473540780ea76991 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 17 Jan 2021 22:04:16 +0800 Subject: [PATCH 115/578] Support tables join between different group by pulling up to CN (merge request !66) The original check relied too much on global variables and guc values, and mistakenly prevented the unique path from pulling to CN for calculation. We deal with it by removing guc and a global variable, and guard it at a more proper position. Same guard was added in the "INSERT INTO SELECT FROM" case. 
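As an illustrative sketch only (not part of the patch itself): the kind of query this
change affects is a join between two shard tables placed in different node groups.
The table and group names below are made up, and the DISTRIBUTE BY SHARD ... TO GROUP
syntax is assumed from TBase DDL; before this change such a join was rejected at plan
time, and after it the join is planned by pulling both sides up to the coordinator (CN).

    -- hypothetical tables in two different node groups
    CREATE TABLE orders_g1 (id int, amount int) DISTRIBUTE BY SHARD (id) TO GROUP group1;
    CREATE TABLE orders_g2 (id int, amount int) DISTRIBUTE BY SHARD (id) TO GROUP group2;

    -- previously rejected with a "shard tables from different groups" error;
    -- now planned with both sides pulled up to the CN instead of redistributed
    EXPLAIN (costs off)
    SELECT * FROM orders_g1 o1 JOIN orders_g2 o2 ON o1.id = o2.id;
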
--- src/backend/optimizer/plan/createplan.c | 32 ---------------- src/backend/optimizer/plan/planner.c | 20 ---------- src/backend/optimizer/util/pathnode.c | 51 +++++++++++++++++++++---- src/backend/parser/analyze.c | 24 +++++++++--- src/backend/pgxc/plan/planner.c | 18 --------- src/backend/utils/misc/guc.c | 10 ----- src/include/optimizer/planmain.h | 2 - src/test/regress/expected/sysviews.out | 3 +- 8 files changed, 62 insertions(+), 98 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 23d04153..72a495e1 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -108,7 +108,6 @@ bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; #endif #ifdef __COLD_HOT__ -bool has_distribute_remote_plan = false; bool has_cold_hot_table = false; #endif static Plan *create_plan_recurse(PlannerInfo *root, Path *best_path, @@ -701,26 +700,11 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) if (AttributeNumberIsValid(loc->secAttrNum) || OidIsValid(loc->coldGroupId)) { - if (has_distribute_remote_plan && list_length(groupOids) != 1) - { - error = true; - } - else - { has_cold_hot_table = true; } } - } heap_close(relation, NoLock); - - if (error) - { - has_distribute_remote_plan = false; - has_cold_hot_table = false; - - elog(ERROR, "Tables which located in more than one group could not involved in query with join or redistribution"); - } } #endif @@ -6408,22 +6392,6 @@ make_remotesubplan(PlannerInfo *root, Assert(!equal(resultDistribution, execDistribution)); Assert(!IsA(lefttree, RemoteSubplan)); -#ifdef __COLD_HOT__ - if (distributionType != LOCATOR_TYPE_NONE) - { - if (has_cold_hot_table && list_length(groupOids) != 1 && root->parse->commandType != CMD_INSERT) - { - has_cold_hot_table = false; - has_distribute_remote_plan = false; - elog(ERROR, "Tables which located in more than one group could not involved in query with join or redistribution"); - } - else - { - has_distribute_remote_plan = true; - } - } -#endif - #ifdef __TBASE__ if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 6ed3f131..4e9eda21 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -288,7 +288,6 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) groupOids = NULL; #endif #ifdef __COLD_HOT__ - has_distribute_remote_plan = false; has_cold_hot_table = false; #endif /* @@ -526,25 +525,6 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result->partpruning = bms_copy(root->partpruning); #endif - -#ifdef __TBASE__ - /* - * sanity check - * tables from different groups can not be joined, and shard table join with other table type - * also permitted. 
- */ - { - if (list_length(groupOids) > 1 && !enable_group_across_query && !has_cold_hot_table) - { - groupOids = NULL; - elog(ERROR, "Shard tables from different groups should not be invloved in one Query,\n" - "Shard tables should not be invloved in one Query with other tables, such as hash table."); - } - - groupOids = NULL; - } -#endif - return result; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 464eccfb..bd6d510c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1682,6 +1682,33 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) goto pull_up; } + /* + * If outer or inner subpaths are distributed by shard and they do not exist + * in same node set, which means we may need to redistribute tuples to data + * nodes which use different router map to producer. + * We don't support that, so pull it up to CN to accomplish the join. + * + * TODO: + * 1. if the join is "REPLICATION join SHARD", and node set of SHARD table + * is subset of REPLICATION table, no need to pull up. + * 2. find out which side of this join needs to dispatch, and only decide + * whether to pull up by the distributionType of another side subpath. + * 3. pass target router map to another group maybe ? thus nothing need to + * pull up to CN. + */ + if (innerd && outerd && + (outerd->distributionType == LOCATOR_TYPE_SHARD || + (innerd->distributionType == LOCATOR_TYPE_SHARD)) && + !bms_equal(outerd->nodes, innerd->nodes)) + { + goto pull_up; + } + + /* + * the join of cold-hot tables must be pulled up to CN until we find a way + * to determine whether this join occurs in a specific group. + */ +#ifdef __COLD_HOT__ if (has_cold_hot_table) { if (list_length(groupOids) > 1) @@ -1691,9 +1718,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else if (list_length(groupOids) < 1) { has_cold_hot_table = false; - elog(ERROR, "hot cold table joins without groups"); + elog(ERROR, "cold-hot table joins without groups"); } } +#endif #endif /* * If both subpaths are distributed by replication, the resulting @@ -2435,8 +2463,21 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ + /* + * We end up here that we don't have replication table and whether + * 1. we have no shard table at both sides OR + * 2. we have shard table but spread in same node set + * so check distribution type and decide what's next. 
+ */ + if (innerd->distributionType == LOCATOR_TYPE_SHARD || + outerd->distributionType == LOCATOR_TYPE_SHARD) + { + /* must be same node set, just copy */ + Assert(bms_equal(innerd->nodes, innerd->nodes)); + nodes = bms_copy(outerd->nodes); + } /* check if we can distribute by shard */ - if (OidIsValid(group)) + else if (OidIsValid(group)) { int node_index; int32 dn_num; @@ -3101,12 +3142,6 @@ create_redistribute_grouping_path(PlannerInfo *root, Query *parse, Path *path) te = (TargetEntry *)list_nth(parse->targetList, groupColIdx[colIdx]-1); - if (list_length(groupOids) > 1 && !enable_group_across_query) - { - groupOids = NULL; - elog(ERROR, "Tables from different groups should not be invloved in one Query."); - } - if (groupOids) { group = linitial_oid(groupOids); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a157b502..27c44c45 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -703,18 +703,30 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ParseState *sub_pstate = make_parsestate(pstate); Query *selectQuery; -#ifdef __COLD_HOT__ +#ifdef __TBASE__ /* prevent insert into cold_hot table select ... */ if (pstate->p_target_relation) { - RelationLocInfo *rel_loc_info = pstate->p_target_relation->rd_locator_info; + RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; + RelationLocInfo *from_rel_loc_info; - if (rel_loc_info) + if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) { - if (AttributeNumberIsValid(rel_loc_info->secAttrNum) - || OidIsValid(rel_loc_info->coldGroupId)) + foreach(lc, selectStmt->fromClause) { - elog(ERROR, "table in cold-hot group or key-value group could not join with other tables."); + Relation rel = heap_openrv((RangeVar *) lfirst(lc), AccessShareLock); + + from_rel_loc_info = rel->rd_locator_info; + if (from_rel_loc_info == NULL || /* from system table */ +#ifdef __COLD_HOT__ + from_rel_loc_info->coldGroupId != target_rel_loc_info->coldGroupId || +#endif + from_rel_loc_info->groupId != target_rel_loc_info->groupId) + { + elog(ERROR, "shard table could not be inserted from any other tables in different group"); + } + + heap_close(rel, AccessShareLock); } } } diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index c761f4a9..9bc141ad 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -349,24 +349,6 @@ pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->invalItems = glob->invalItems; result->rowMarks = glob->finalrowmarks; -#ifdef __TBASE__ - /* - * sanity check - * tables from different groups can not be joined, and shard table join with other table type - * also permitted. 
- */ - { - if (list_length(groupOids) > 1 && !enable_group_across_query) - { - groupOids = NULL; - elog(ERROR, "Shard tables from different groups should not be invloved in one Query,\n" - "Shard tables should not be invloved in one Query with other tables, such as hash table."); - } - - groupOids = NULL; - } -#endif - return result; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5a6afb51..0f728e50 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2197,16 +2197,6 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"enable_group_across_query", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("enable group-across queries."), - NULL - }, - &enable_group_across_query, - false, - NULL, NULL, NULL - }, - - { {"enable_distributed_unique_plan", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("enable distributed unique plan."), NULL diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 58ffbce4..8139e134 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -97,9 +97,7 @@ extern int force_parallel_mode; #ifdef __TBASE__ extern int remote_subplan_depth; extern List *groupOids; -extern bool enable_group_across_query; extern bool enable_distributed_unique_plan; -extern bool has_distribute_remote_plan; extern bool has_cold_hot_table; #define INSERT_TRIGGER "tt_dn_in_" diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 48b73026..7a478711 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -96,7 +96,6 @@ select name, setting from pg_settings where name like 'enable%'; enable_fast_query_shipping | on enable_fga | on enable_gathermerge | on - enable_group_across_query | off enable_gtm_debug_print | off enable_gtm_proxy | off enable_hashagg | on @@ -129,7 +128,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(57 rows) +(56 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 1ee49bce6548b7415ef64c992f2e8f59fc26ff1c Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 29 Jan 2021 20:34:42 +0800 Subject: [PATCH 116/578] Bug fix, consider RangeVar only when check from clause of INSERT --- src/backend/parser/analyze.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 27c44c45..fb5e27f1 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -714,7 +714,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) { foreach(lc, selectStmt->fromClause) { - Relation rel = heap_openrv((RangeVar *) lfirst(lc), AccessShareLock); + Node *node = lfirst(lc); + if (IsA(node, RangeVar)) + { + Relation rel = heap_openrv((RangeVar *) node, AccessShareLock); from_rel_loc_info = rel->rd_locator_info; if (from_rel_loc_info == NULL || /* from system table */ @@ -730,6 +733,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } } } + } #endif /* From 8958a0499a31a8c654ce11e278b6aecf53985de9 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Jan 2021 16:01:30 +0800 Subject: [PATCH 117/578] remote subquery width fix --- src/backend/optimizer/plan/planner.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4e9eda21..b909ad6a 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4281,6 +4281,8 @@ create_grouping_paths(PlannerInfo *root, bool try_redistribute_grouping = false; PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + grouped_rel->reltarget = local_grouping_target; + /* Estimate number of partial groups. */ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, @@ -5326,6 +5328,9 @@ create_grouping_paths(PlannerInfo *root, { partial_grouping_target = make_partial_grouping_target(root, target); +#ifdef __TBASE__ + grouped_rel->reltarget = partial_grouping_target; +#endif /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_path->rows, From cfb79d48fc6bcdeef0fa438b6022396801fbf2ee Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 20 Jan 2021 19:59:50 +0800 Subject: [PATCH 118/578] Adjust costsize.c, consider number of nodes involved adjust create_bitmap_subplan --- src/backend/optimizer/path/costsize.c | 309 ++++++++++++++++++++++++ src/backend/optimizer/plan/createplan.c | 36 ++- 2 files changed, 342 insertions(+), 3 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4b984a23..37683e78 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -179,6 +179,111 @@ static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); +#ifdef __TBASE__ +/* + * In PostgreSQL, the row count estimate of a base rel scan, like a Seq Scan + * or an Index Scan, can be directly copied from RelOptInfo->rows/tuples. In + * TBase, it's not that straightforward as a Scan runs in parallel in the + * DNs, and the number of rows scanned by each Scan is RelOptInfo->rows / + * number of DN. + * + * That's pretty straightforward, too, but it means that we'd have to modify + * all the cost_seqscan, cost_index, etc. functions to take that into + * account. 
That's prone to bugs, because it is easy to miss references to + * rel->rows/tuples/pages. Even if we fix them all now, more can be + * introduced in merges with PostgreSQL, and it's not easy to notice because + * the only consequence is a bad cost estimate. + * + * To make that more robust with PostgreSQL merges, we do a little switcheroo + * with the RelOptInfo. The RelOptInfoDataNode struct is a "proxy" of + * RelOptInfo, containing the same fields, except that the rows/pages/tuple + * have already been divided by the number of data nodes. The costing functions + * have been modified so that on entry, they construct a RelOptInfoDataNode and + * use it in place of the RelOptInfo. That way, the formulas in the costing + * functions can still refer to "rel->pages", "rel->tuples" and so forth in + * the source code, keeping them unchanged from upstream, but will actually + * use the adjusted values. + * + * The RelOptInfoDataNode struct doesn't contain all the fields from RelOptInfo, + * only the ones commonly used in the cost_*() functions. If a reference to a + * new field is added in uptream, and it's not handled either by adding it to + * the RelOptInfoDataNode, or by modifying the reference to explictly point to + * the original RelOptInfo, you'll get a compiler error. That's good: it forces + * you to think whether the value needs to be divided by nDNs or not. + */ +typedef struct +{ + /* Values copied from RelOptInfo as is, for convenience */ + Index relid; + RTEKind rtekind; /* RELATION, SUBQUERY, or FUNCTION */ + Oid reltablespace; /* containing tablespace */ + double allvisfrac; + + /* Values adjusted from RelOptInfo, by dividing by number of DNs */ + double rows; + BlockNumber pages; + double tuples; + + /* the original RelOptInfo */ + RelOptInfo *orig; +} RelOptInfoDataNode; + +/* ParamPathInfoDataNode is a similar proxy for ParamPathInfo. */ +typedef struct +{ + double ppi_rows; /* estimated number of result tuples */ + List *ppi_clauses; /* join clauses available from outer rels */ + + ParamPathInfo *orig; +} ParamPathInfoDataNode; + +static ParamPathInfoDataNode * +adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, RelOptInfo *baserel_orig, + ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) +{ + double nodes; + + if (path->distribution && IsA(path->distribution, Distribution) && + path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->distribution->distributionType != LOCATOR_TYPE_NONE) + nodes = bms_num_members(path->distribution->nodes); + else + nodes = 1; + + basescan->relid = baserel_orig->relid; + basescan->rtekind = baserel_orig->rtekind; + basescan->reltablespace = baserel_orig->reltablespace; + basescan->allvisfrac = baserel_orig->allvisfrac; + + basescan->rows = clamp_row_est(baserel_orig->rows / nodes); + basescan->tuples = clamp_row_est(baserel_orig->tuples / nodes); + basescan->pages = ceil((double) baserel_orig->pages / nodes); + + basescan->orig = baserel_orig; + + if (param_info_orig) + { + param_info->ppi_rows = clamp_row_est(param_info_orig->ppi_rows / nodes); + param_info->ppi_clauses = param_info_orig->ppi_clauses; + param_info->orig = param_info_orig; + return param_info; + } + else + return NULL; +} + +/* + * ADJUST_BASESCAN initializes the proxy structs for RelOptInfo and ParamPathInfo, + * adjusting them by # of data nodes as needed. 
+ */ +#define ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info) \ + RelOptInfoDataNode baserel_adjusted; \ + ParamPathInfoDataNode param_info_adjusted; \ + RelOptInfoDataNode *baserel = &baserel_adjusted; \ + ParamPathInfoDataNode *param_info = adjust_reloptinfo(path, &baserel_adjusted, baserel_orig, \ + ¶m_info_adjusted, param_info_orig) +#endif + /* * clamp_row_est @@ -210,8 +315,14 @@ clamp_row_est(double nrows) */ void cost_seqscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost cpu_run_cost; Cost disk_run_cost; @@ -243,7 +354,11 @@ cost_seqscan(Path *path, PlannerInfo *root, disk_run_cost = spc_seq_page_cost * baserel->pages; /* CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -287,8 +402,14 @@ cost_seqscan(Path *path, PlannerInfo *root, */ void cost_samplescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; RangeTblEntry *rte; @@ -337,7 +458,11 @@ cost_samplescan(Path *path, PlannerInfo *root, * simple constants anyway. We also don't charge anything for the * calculations the sampling method might do internally. 
*/ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -362,9 +487,14 @@ cost_samplescan(Path *path, PlannerInfo *root, */ void cost_gather(GatherPath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, +#else RelOptInfo *rel, ParamPathInfo *param_info, +#endif double *rows) { + ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); Cost startup_cost = 0; Cost run_cost = 0; @@ -478,7 +608,12 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, bool partial_path) {// #lizard forgives IndexOptInfo *index = path->indexinfo; +#ifdef __TBASE__ + RelOptInfo *baserel_orig = index->rel; + ADJUST_BASESCAN(&path->path, baserel_orig, baserel, path->path.param_info, param_info); +#else RelOptInfo *baserel = index->rel; +#endif bool indexonly = (path->path.pathtype == T_IndexOnlyScan); amcostestimate_function amcostestimate; List *qpquals; @@ -500,10 +635,23 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double pages_fetched; double rand_heap_pages; double index_pages; + double nodes = 1; +#ifdef __TBASE__ + if (path->path.distribution && IsA(path->path.distribution, Distribution) && + path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(path->path.distribution->nodes); + } + /* Should only be applied to base relations */ + Assert(IsA(baserel_orig, RelOptInfo) && + IsA(index, IndexOptInfo)); +#else /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && IsA(index, IndexOptInfo)); +#endif Assert(baserel->relid > 0); Assert(baserel->rtekind == RTE_RELATION); @@ -514,6 +662,18 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * baserestrictinfo as the list of relevant restriction clauses for the * rel. */ +#ifdef __TBASE__ + if (param_info) + { + path->path.rows = param_info->ppi_rows; + /* qpquals come from the rel's restriction clauses and ppi_clauses */ + qpquals = list_concat( + extract_nonindex_conditions(path->indexinfo->indrestrictinfo, + path->indexquals), + extract_nonindex_conditions(param_info->ppi_clauses, + path->indexquals)); + } +#else if (path->path.param_info) { path->path.rows = path->path.param_info->ppi_rows; @@ -524,6 +684,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, extract_nonindex_conditions(path->path.param_info->ppi_clauses, path->indexquals)); } +#endif else { path->path.rows = baserel->rows; @@ -549,6 +710,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, &indexSelectivity, &indexCorrelation, &index_pages); + /* The index pages should be divided among all the data nodes like baserel dose. */ + index_pages = ceil(index_pages / nodes); + /* * Save amcostestimate's results for possible use in bitmap scan planning. 
* We don't bother to save indexStartupCost or indexCorrelation, because a @@ -608,7 +772,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, */ pages_fetched = index_pages_fetched(tuples_fetched * loop_count, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -632,7 +800,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, pages_fetched = index_pages_fetched(pages_fetched * loop_count, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -648,7 +820,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, */ pages_fetched = index_pages_fetched(tuples_fetched, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -1014,6 +1190,21 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; cpu_run_cost = cpu_per_tuple * tuples_fetched; +#ifdef __TBASE__ + /* Adjust costing for parallelism between data nodes, if used. */ + if (path->distribution && IsA(path->distribution, Distribution) && + path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->distribution->nodes); + + /* The CPU cost is divided among all the data nodes. */ + cpu_run_cost /= nodes; + + path->rows = clamp_row_est(path->rows / nodes); + } +#endif + /* Adjust costing for parallelism, if used. */ if (path->parallel_workers > 0) { @@ -1177,8 +1368,14 @@ cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) */ void cost_tidscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, List *tidquals, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; bool isCurrentOf = false; @@ -1234,7 +1431,11 @@ cost_tidscan(Path *path, PlannerInfo *root, */ if (isCurrentOf) { +#ifdef __TBASE__ + Assert(baserel->orig->baserestrictcost.startup >= disable_cost); +#else Assert(baserel->baserestrictcost.startup >= disable_cost); +#endif startup_cost -= disable_cost; } else if (!enable_tidscan) @@ -1255,7 +1456,11 @@ cost_tidscan(Path *path, PlannerInfo *root, run_cost += spc_random_page_cost * ntuples; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif /* XXX currently we assume TID quals are a subset of qpquals */ startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; @@ -1280,8 +1485,14 @@ cost_tidscan(Path *path, PlannerInfo *root, */ void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(&path->path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost; Cost run_cost; QualCost qpqual_cost; @@ -1306,7 +1517,11 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, path->path.startup_cost = path->subpath->startup_cost; path->path.total_cost = path->subpath->total_cost; +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); 
+#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost = qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1329,8 +1544,14 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, */ void cost_functionscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1367,7 +1588,11 @@ cost_functionscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1390,8 +1615,14 @@ cost_functionscan(Path *path, PlannerInfo *root, */ void cost_tablefuncscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1423,7 +1654,11 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1446,8 +1681,14 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, */ void cost_valuesscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1470,7 +1711,11 @@ cost_valuesscan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_operator_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1496,8 +1741,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, */ void cost_ctescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1517,7 +1768,11 @@ cost_ctescan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1537,8 +1792,14 @@ 
cost_ctescan(Path *path, PlannerInfo *root, */ void cost_namedtuplestorescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1558,7 +1819,11 @@ cost_namedtuplestorescan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -2208,6 +2473,17 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, else path->path.rows = path->path.parent->rows; +#ifdef __TBASE__ + if (path->path.distribution && IsA(path->path.distribution, Distribution) && + path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->path.distribution->nodes); + + path->path.rows = clamp_row_est(path->path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. */ if (path->path.parallel_workers > 0) { @@ -2697,6 +2973,17 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, else path->jpath.path.rows = path->jpath.path.parent->rows; +#ifdef __TBASE__ + if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->jpath.path.distribution->nodes); + + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. */ if (path->jpath.path.parallel_workers > 0) { @@ -3140,6 +3427,17 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, else path->jpath.path.rows = path->jpath.path.parent->rows; +#ifdef __TBASE__ + if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->jpath.path.distribution->nodes); + + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. 
*/ if (path->jpath.path.parallel_workers > 0) { @@ -4728,6 +5026,17 @@ set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel) */ sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); rel->tuples = sub_final_rel->cheapest_total_path->rows; +#ifdef __TBASE__ + if (sub_final_rel->cheapest_total_path->distribution && IsA(sub_final_rel->cheapest_total_path->distribution, Distribution) && + sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(sub_final_rel->cheapest_total_path->distribution->nodes); + + /* count tuples in all data nodes */ + rel->tuples *= nodes; + } +#endif /* * Compute per-output-column width estimates by examining the subquery's diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 72a495e1..a30ea56e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3811,6 +3811,16 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexquals = NIL; List *subindexECs = NIL; ListCell *l; + double nodes = 1; + +#ifdef __TBASE__ + if (apath->path.distribution && IsA(apath->path.distribution, Distribution) && + apath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + apath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(apath->path.distribution->nodes); + } +#endif /* * There may well be redundant quals among the subplans, since a @@ -3839,7 +3849,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan->startup_cost = apath->path.startup_cost; plan->total_cost = apath->path.total_cost; plan->plan_rows = - clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples); + clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = apath->path.parallel_safe; @@ -3899,11 +3909,21 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, } else { + double nodes = 1; +#ifdef __TBASE__ + if (opath->path.distribution && IsA(opath->path.distribution, Distribution) && + opath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + opath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(opath->path.distribution->nodes); + } +#endif + plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; plan->plan_rows = - clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples); + clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = opath->path.parallel_safe; @@ -3934,6 +3954,16 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, IndexScan *iscan; List *subindexECs; ListCell *l; + double nodes = 1; + +#ifdef __TBASE__ + if (ipath->path.distribution && IsA(ipath->path.distribution, Distribution) && + ipath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + ipath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(ipath->path.distribution->nodes); + } +#endif /* Use the regular indexscan plan build machinery... 
*/ iscan = castNode(IndexScan, @@ -3948,7 +3978,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan->startup_cost = 0.0; plan->total_cost = ipath->indextotalcost; plan->plan_rows = - clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples); + clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = ipath->path.parallel_safe; From 52e562a09bef3a58b221d66c376f7a4a9dbdcd72 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 26 Jan 2021 20:47:12 +0800 Subject: [PATCH 119/578] Adjust gather cost if its upper path is a remote subquery --- src/backend/optimizer/path/costsize.c | 12 ++ src/backend/optimizer/util/pathnode.c | 78 +++++++++-- src/include/optimizer/cost.h | 189 +++++++++++++------------- 3 files changed, 175 insertions(+), 104 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 37683e78..60949690 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -518,6 +518,18 @@ cost_gather(GatherPath *path, PlannerInfo *root, path->path.total_cost = (startup_cost + run_cost); } +#ifdef __TBASE__ +/* + * gather node has been optimized, it only needs to do some initiating work + * so set total_cost to startup_cost which means run_cost = 0. + */ +void +reset_cost_gather(GatherPath *path) +{ + path->path.total_cost = path->subpath->total_cost + path->path.startup_cost; +} +#endif + /* * cost_gather_merge * Determines and returns the cost of gather merge path. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index bd6d510c..13cfb45d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,7 +1404,41 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } } +#ifdef __TBASE__ +static Path * +create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, + Distribution *distribution, RelOptInfo *rel, + ParamPathInfo *param_info, List *pathkeys, + PathTarget *pathtarget, int replication, + Cost additional_startup_cost, + Cost additional_total_cost) +{ + RemoteSubPath *pathnode; + + if (IsA(subpath, GatherPath)) + reset_cost_gather((GatherPath *) subpath); + + pathnode = makeNode(RemoteSubPath); + pathnode->path.pathtype = T_RemoteSubplan; + pathnode->path.parent = rel; + pathnode->path.param_info = param_info; + pathnode->path.pathkeys = pathkeys; + pathnode->subpath = subpath; + pathnode->path.distribution = (Distribution *) copyObject(distribution); + + /* We don't want to run subplains in parallel workers */ + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = false; + + pathnode->path.pathtarget = pathtarget; + cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, + subpath->total_cost + additional_total_cost, subpath->rows, + rel->reltarget->width, replication); + + return (Path *) pathnode; +} +#endif /* @@ -1422,6 +1456,13 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, RemoteSubPath *pathnode; Distribution *subdistribution = subpath->distribution; +#ifdef __TBASE__ + return create_remotesubplan_path_internal(root, subpath, distribution, + rel, subpath->param_info, + subpath->pathkeys, subpath->pathtarget, + (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? 
+ bms_num_members(subdistribution->nodes) : 1, 0, 0); +#else pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1442,6 +1483,7 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, bms_num_members(subdistribution->nodes) : 1); return (Path *) pathnode; +#endif } /* @@ -1484,6 +1526,20 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, if (IsA(subpath, MaterialPath)) { MaterialPath *mpath = (MaterialPath *) subpath; +#ifdef __TBASE__ + if (IsA(mpath->subpath, RemoteSubPath)) + { + pathnode = (RemoteSubPath *) mpath->subpath; + pathnode->path.distribution = (Distribution *) copyObject(distribution); + } + else + { + pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, mpath->subpath, + distribution, rel, subpath->param_info, + subpath->pathkeys, rel->reltarget, + num_replication, 0, 0); + } +#else /* If subpath is already a RemoteSubPath, just replace distribution */ if (IsA(mpath->subpath, RemoteSubPath)) { @@ -1508,16 +1564,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, subpath = pathnode->subpath; pathnode->path.distribution = distribution; - mpath->path.distribution = (Distribution *) copyObject(distribution); /* (re)calculate costs */ cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, -#ifdef __TBASE__ - num_replication); -#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); #endif + mpath->path.distribution = (Distribution *) copyObject(distribution); mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, pathnode->path.startup_cost, @@ -1530,7 +1583,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, { Cost input_startup_cost = 0; Cost input_total_cost = 0; - +#ifndef __TBASE__ pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1538,7 +1591,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.param_info = subpath->param_info; pathnode->path.pathkeys = pathkeys ? pathkeys : subpath->pathkeys; pathnode->path.distribution = distribution; - +#endif /* * If we need to insert a Sort node, add it here, so that it gets * pushed down to the remote node. @@ -1571,7 +1624,14 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, input_startup_cost += sort_path.startup_cost; input_total_cost += sort_path.total_cost; } - +#ifdef __TBASE__ + pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, subpath, + distribution, rel, subpath->param_info, + pathkeys ? pathkeys : subpath->pathkeys, + rel->reltarget, num_replication, + input_startup_cost - subpath->startup_cost, + input_total_cost - subpath->total_cost); +#else pathnode->subpath = subpath; /* We don't want to run subplains in parallel workers */ @@ -1581,11 +1641,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, cost_remote_subplan((Path *) pathnode, input_startup_cost, input_total_cost, subpath->rows, rel->reltarget->width, -#ifdef __TBASE__ num_replication); -#else - IsLocatorReplicated(distributionType) ? 
- bms_num_members(nodes) : 1); #endif return (Path *) pathnode; } diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 102795bb..2198c9db 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * cost.h - * prototypes for costsize.c and clausesel.c. + * prototypes for costsize.c and clausesel.c. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -24,7 +24,7 @@ /* If you change these, update backend/utils/misc/postgresql.sample.conf */ #define DEFAULT_SEQ_PAGE_COST 1.0 #define DEFAULT_RANDOM_PAGE_COST 4.0 -#define DEFAULT_CPU_TUPLE_COST 0.01 +#define DEFAULT_CPU_TUPLE_COST 0.01 #define DEFAULT_CPU_INDEX_TUPLE_COST 0.005 #define DEFAULT_CPU_OPERATOR_COST 0.0025 #ifdef XCP @@ -34,19 +34,19 @@ #define DEFAULT_PARALLEL_TUPLE_COST 0.1 #define DEFAULT_PARALLEL_SETUP_COST 1000.0 -#define DEFAULT_EFFECTIVE_CACHE_SIZE 524288 /* measured in pages */ +#define DEFAULT_EFFECTIVE_CACHE_SIZE 524288 /* measured in pages */ typedef enum { - CONSTRAINT_EXCLUSION_OFF, /* do not use c_e */ - CONSTRAINT_EXCLUSION_ON, /* apply c_e to all rels */ - CONSTRAINT_EXCLUSION_PARTITION /* apply c_e to otherrels only */ -} ConstraintExclusionType; + CONSTRAINT_EXCLUSION_OFF, /* do not use c_e */ + CONSTRAINT_EXCLUSION_ON, /* apply c_e to all rels */ + CONSTRAINT_EXCLUSION_PARTITION /* apply c_e to otherrels only */ +} ConstraintExclusionType; /* * prototypes for costsize.c - * routines to compute costs and sizes + * routines to compute costs and sizes */ /* parameter variables and flags */ @@ -63,7 +63,7 @@ extern PGDLLIMPORT double parallel_tuple_cost; extern PGDLLIMPORT double parallel_setup_cost; extern PGDLLIMPORT int effective_cache_size; extern Cost disable_cost; -extern int max_parallel_workers_per_gather; +extern int max_parallel_workers_per_gather; extern bool enable_seqscan; extern bool enable_indexscan; extern bool enable_indexonlyscan; @@ -82,143 +82,146 @@ extern int constraint_exclusion; extern double clamp_row_est(double nrows); extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, - double index_pages, PlannerInfo *root); + double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info); + ParamPathInfo *param_info); extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info); + ParamPathInfo *param_info); extern void cost_index(IndexPath *path, PlannerInfo *root, - double loop_count, bool partial_path); + double loop_count, bool partial_path); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info, - Path *bitmapqual, double loop_count); + ParamPathInfo *param_info, + Path *bitmapqual, double loop_count); extern void cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root); extern void cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root); extern void cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec); extern void cost_tidscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info); + RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info); extern void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_functionscan(Path *path, PlannerInfo *root, - 
RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_tableexprscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_valuesscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); #ifdef PGXC extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel); #endif extern void cost_tablefuncscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_ctescan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm); extern void cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width, - Cost comparison_cost, int sort_mem, - double limit_tuples); + List *pathkeys, Cost input_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples); extern void cost_merge_append(Path *path, PlannerInfo *root, - List *pathkeys, int n_streams, - Cost input_startup_cost, Cost input_total_cost, - double tuples); + List *pathkeys, int n_streams, + Cost input_startup_cost, Cost input_total_cost, + double tuples); extern void cost_material(Path *path, - Cost input_startup_cost, Cost input_total_cost, - double tuples, int width); + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width); extern void cost_agg(Path *path, PlannerInfo *root, - AggStrategy aggstrategy, const AggClauseCosts *aggcosts, - int numGroupCols, double numGroups, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + AggStrategy aggstrategy, const AggClauseCosts *aggcosts, + int numGroupCols, double numGroups, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void cost_windowagg(Path *path, PlannerInfo *root, - List *windowFuncs, int numPartCols, int numOrderCols, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + List *windowFuncs, int numPartCols, int numOrderCols, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void cost_group(Path *path, PlannerInfo *root, - int numGroupCols, double numGroups, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + int numGroupCols, double numGroups, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void initial_cost_nestloop(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - Path *outer_path, Path *inner_path, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinType jointype, + Path *outer_path, Path *inner_path, + JoinPathExtraData *extra); extern void final_cost_nestloop(PlannerInfo *root, NestPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void initial_cost_mergejoin(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - List *mergeclauses, - Path *outer_path, Path *inner_path, - List *outersortkeys, List *innersortkeys, - JoinPathExtraData 
*extra); + JoinCostWorkspace *workspace, + JoinType jointype, + List *mergeclauses, + Path *outer_path, Path *inner_path, + List *outersortkeys, List *innersortkeys, + JoinPathExtraData *extra); extern void final_cost_mergejoin(PlannerInfo *root, MergePath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void initial_cost_hashjoin(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - List *hashclauses, - Path *outer_path, Path *inner_path, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinType jointype, + List *hashclauses, + Path *outer_path, Path *inner_path, + JoinPathExtraData *extra); extern void final_cost_hashjoin(PlannerInfo *root, HashPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void cost_gather(GatherPath *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info, double *rows); + RelOptInfo *baserel, ParamPathInfo *param_info, double *rows); +#ifdef __TBASE__ +extern void reset_cost_gather(GatherPath *path); +#endif extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan); extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root); extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, - Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication); + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width, int replication); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - SpecialJoinInfo *sjinfo, - List *restrictlist, - SemiAntiJoinFactors *semifactors); + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + SpecialJoinInfo *sjinfo, + List *restrictlist, + SemiAntiJoinFactors *semifactors); extern void set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern double get_parameterized_baserel_size(PlannerInfo *root, - RelOptInfo *rel, - List *param_clauses); + RelOptInfo *rel, + List *param_clauses); extern double get_parameterized_joinrel_size(PlannerInfo *root, - RelOptInfo *rel, - Path *outer_path, - Path *inner_path, - SpecialJoinInfo *sjinfo, - List *restrict_clauses); + RelOptInfo *rel, + Path *outer_path, + Path *inner_path, + SpecialJoinInfo *sjinfo, + List *restrict_clauses); extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - SpecialJoinInfo *sjinfo, - List *restrictlist); + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + SpecialJoinInfo *sjinfo, + List *restrictlist); extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, - double cte_rows); + double cte_rows); extern void set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target); extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo 
*baserel, - Path *bitmapqual, int loop_count, Cost *cost, double *tuple); + Path *bitmapqual, int loop_count, Cost *cost, double *tuple); /* * prototypes for clausesel.c - * routines to compute clause selectivities + * routines to compute clause selectivities */ extern Selectivity clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo); + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo); extern Selectivity clause_selectivity(PlannerInfo *root, Node *clause, int varRelid, @@ -228,8 +231,8 @@ extern Selectivity clause_selectivity(PlannerInfo *root, extern bool clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path); #endif extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, - RelOptInfo *rel, ParamPathInfo *param_info, - Cost input_startup_cost, Cost input_total_cost, - double *rows); + RelOptInfo *rel, ParamPathInfo *param_info, + Cost input_startup_cost, Cost input_total_cost, + double *rows); -#endif /* COST_H */ +#endif /* COST_H */ From faaf938a2b66199d1436425a469dca9633819b2d Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Feb 2021 20:27:13 +0800 Subject: [PATCH 120/578] Adjust gather and add a guc control parallel agg worker num --- src/backend/optimizer/path/allpaths.c | 1 + src/backend/optimizer/path/costsize.c | 14 ++ src/backend/optimizer/path/indxpath.c | 2 + src/backend/optimizer/plan/createplan.c | 26 ++- src/backend/optimizer/util/pathnode.c | 4 +- src/backend/utils/misc/guc.c | 12 +- src/include/optimizer/paths.h | 215 ++++++++++++------------ 7 files changed, 157 insertions(+), 117 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 42c19c2f..310cec07 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -65,6 +65,7 @@ bool enable_geqo = false; /* just in case GUC doesn't set it */ int geqo_threshold; int min_parallel_table_scan_size; int min_parallel_index_scan_size; +int min_parallel_rows_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 60949690..5d40d93e 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -494,7 +494,9 @@ cost_gather(GatherPath *path, PlannerInfo *root, #endif double *rows) { +#ifdef __TBASE__ ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); +#endif Cost startup_cost = 0; Cost run_cost = 0; @@ -542,10 +544,17 @@ reset_cost_gather(GatherPath *path) */ void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, +#else RelOptInfo *rel, ParamPathInfo *param_info, +#endif Cost input_startup_cost, Cost input_total_cost, double *rows) { +#ifdef __TBASE__ + ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); +#endif Cost startup_cost = 0; Cost run_cost = 0; Cost comparison_cost; @@ -879,8 +888,13 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * sequential as for parallel scans the pages are accessed in random * order. 
*/ +#ifdef __TBASE__ + path->path.parallel_workers = compute_parallel_worker(baserel_orig, + rand_heap_pages, index_pages); +#else path->path.parallel_workers = compute_parallel_worker(baserel, rand_heap_pages, index_pages); +#endif /* * Fall out if workers can't be assigned for parallel scan, because in diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 1e58fbdc..31f75070 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1667,6 +1667,8 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths) required_outer); bpath.path.pathkeys = NIL; bpath.bitmapqual = (Path *) &apath; + /* TODO: get real distribution information */ + bpath.path.distribution = NULL; /* * Check the cost of temporary path without considering parallelism. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a30ea56e..a98797ff 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6416,6 +6416,7 @@ make_remotesubplan(PlannerInfo *root, Plan *gather_left = lefttree; Plan *gather_parent = NULL; bool need_sort = true; + double nodes = 1; #endif /* Sanity checks */ @@ -6423,6 +6424,16 @@ make_remotesubplan(PlannerInfo *root, Assert(!IsA(lefttree, RemoteSubplan)); #ifdef __TBASE__ + if (execDistribution && + (execDistribution->distributionType == LOCATOR_TYPE_HASH || + execDistribution->distributionType == LOCATOR_TYPE_SHARD)) + { + nodes = bms_num_members(execDistribution->nodes); + if (nodes <= 0) + /* should not happen, but for safety */ + nodes = 1; + } + if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || IsA(lefttree, Sort) || IsA(lefttree, Limit) || IsA(lefttree, Gather)) && @@ -6432,18 +6443,17 @@ make_remotesubplan(PlannerInfo *root, distributionType == LOCATOR_TYPE_NONE || distributionType == LOCATOR_TYPE_SHARD)) { - int parallel_threshold_rows = 50000; - if (IsA(lefttree, Gather)) { Gather *gather = (Gather *)lefttree; int nWorkers = gather->num_workers; Plan *leftplan = lefttree->lefttree; - double rows = GetPlanRows(leftplan); + /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. */ + double rows = GetPlanRows(leftplan) * nodes; int heap_parallel_threshold = 0; int heap_parallel_workers = 1; - heap_parallel_threshold = Max(parallel_threshold_rows, 1); + heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) { heap_parallel_workers++; @@ -6481,7 +6491,7 @@ make_remotesubplan(PlannerInfo *root, switch(nodeTag(lefttree)) { case T_SeqScan: - if (rows >= parallel_threshold_rows * 3) + if (rows >= min_parallel_rows_size * 3) { lefttree->parallel_aware = true; } @@ -6667,7 +6677,9 @@ make_remotesubplan(PlannerInfo *root, } } - if (rows < parallel_threshold_rows * 3) + /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. 
*/ + rows *= nodes; + if (rows < min_parallel_rows_size * 3) need_parallel = false; if (need_parallel) @@ -6677,7 +6689,7 @@ make_remotesubplan(PlannerInfo *root, Gather *gather_plan = NULL; Plan *subplan = NULL; - heap_parallel_threshold = Max(parallel_threshold_rows, 1); + heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) { heap_parallel_workers++; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 13cfb45d..038871a6 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1415,8 +1415,8 @@ create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, { RemoteSubPath *pathnode; - if (IsA(subpath, GatherPath)) - reset_cost_gather((GatherPath *) subpath); + //if (IsA(subpath, GatherPath)) + //reset_cost_gather((GatherPath *) subpath); pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 0f728e50..35c4981d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4133,7 +4133,17 @@ static struct config_int ConfigureNamesInt[] = (512 * 1024) / BLCKSZ, 0, INT_MAX / 3, NULL, NULL, NULL }, - +#ifdef __TBASE__ + { + {"min_parallel_rows_size", PGC_USERSET, QUERY_TUNING_COST, + gettext_noop("Sets the minimum amount of rows for a parallel aggregate or scan."), + gettext_noop("If the planner estimates that it will read rows too small to reach this limit, a parallel plan will not be considered.") + }, + &min_parallel_rows_size, + 50000, 0, INT_MAX / 3, + NULL, NULL, NULL + }, +#endif { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index e32f7688..cf766c0f 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * paths.h - * prototypes for various files in optimizer/path + * prototypes for various files in optimizer/path * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,43 +21,44 @@ * allpaths.c */ extern bool enable_geqo; -extern int geqo_threshold; -extern int min_parallel_table_scan_size; -extern int min_parallel_index_scan_size; +extern int geqo_threshold; +extern int min_parallel_table_scan_size; +extern int min_parallel_index_scan_size; +extern int min_parallel_rows_size; /* Hook for plugins to get control in set_rel_pathlist() */ typedef void (*set_rel_pathlist_hook_type) (PlannerInfo *root, - RelOptInfo *rel, - Index rti, - RangeTblEntry *rte); + RelOptInfo *rel, + Index rti, + RangeTblEntry *rte); extern PGDLLIMPORT set_rel_pathlist_hook_type set_rel_pathlist_hook; /* Hook for plugins to get control in add_paths_to_joinrel() */ typedef void (*set_join_pathlist_hook_type) (PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); extern PGDLLIMPORT set_join_pathlist_hook_type set_join_pathlist_hook; /* Hook for plugins to replace standard_join_search() */ typedef RelOptInfo *(*join_search_hook_type) (PlannerInfo *root, - int levels_needed, - List *initial_rels); + int levels_needed, + List *initial_rels); extern PGDLLIMPORT join_search_hook_type 
join_search_hook; extern RelOptInfo *make_one_rel(PlannerInfo *root, List *joinlist); extern void set_dummy_rel_pathlist(RelOptInfo *rel); extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, - List *initial_rels); + List *initial_rels); extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, - double index_pages); + double index_pages); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, - Path *bitmapqual); + Path *bitmapqual); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); @@ -65,167 +66,167 @@ extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); /* * indxpath.c - * routines to generate index paths + * routines to generate index paths */ extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel); extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, - List *restrictlist, - List *exprlist, List *oprlist); + List *restrictlist, + List *exprlist, List *oprlist); extern bool indexcol_is_bool_constant_for_query(IndexOptInfo *index, - int indexcol); + int indexcol); extern bool match_index_to_operand(Node *operand, int indexcol, - IndexOptInfo *index); + IndexOptInfo *index); extern void expand_indexqual_conditions(IndexOptInfo *index, - List *indexclauses, List *indexclausecols, - List **indexquals_p, List **indexqualcols_p); + List *indexclauses, List *indexclausecols, + List **indexquals_p, List **indexqualcols_p); extern void check_index_predicates(PlannerInfo *root, RelOptInfo *rel); extern Expr *adjust_rowcompare_for_index(RowCompareExpr *clause, - IndexOptInfo *index, - int indexcol, - List **indexcolnos, - bool *var_on_left_p); + IndexOptInfo *index, + int indexcol, + List **indexcolnos, + bool *var_on_left_p); /* * tidpath.h - * routines to generate tid paths + * routines to generate tid paths */ extern void create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel); /* * joinpath.c - * routines to create join paths + * routines to create join paths */ extern void add_paths_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, - RelOptInfo *outerrel, RelOptInfo *innerrel, - JoinType jointype, SpecialJoinInfo *sjinfo, - List *restrictlist); + RelOptInfo *outerrel, RelOptInfo *innerrel, + JoinType jointype, SpecialJoinInfo *sjinfo, + List *restrictlist); /* * joinrels.c - * routines to determine which relations to join + * routines to determine which relations to join */ extern void join_search_one_level(PlannerInfo *root, int level); extern RelOptInfo *make_join_rel(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_join_order_restriction(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_dangerous_phv(PlannerInfo *root, - Relids outer_relids, Relids inner_params); + Relids outer_relids, Relids inner_params); /* * equivclass.c - * routines for managing EquivalenceClasses + * routines for managing EquivalenceClasses */ typedef bool (*ec_matches_callback_type) (PlannerInfo *root, - RelOptInfo *rel, - EquivalenceClass *ec, - EquivalenceMember *em, - void *arg); + RelOptInfo *rel, + EquivalenceClass *ec, + EquivalenceMember *em, + void *arg); extern bool process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, - bool below_outer_join); + bool below_outer_join); extern Expr *canonicalize_ec_expression(Expr *expr, - Oid req_type, Oid 
req_collation); + Oid req_type, Oid req_collation); extern void reconsider_outer_join_clauses(PlannerInfo *root); extern EquivalenceClass *get_eclass_for_sort_expr(PlannerInfo *root, - Expr *expr, - Relids nullable_relids, - List *opfamilies, - Oid opcintype, - Oid collation, - Index sortref, - Relids rel, - bool create_it); + Expr *expr, + Relids nullable_relids, + List *opfamilies, + Oid opcintype, + Oid collation, + Index sortref, + Relids rel, + bool create_it); extern void generate_base_implied_equalities(PlannerInfo *root); extern List *generate_join_implied_equalities(PlannerInfo *root, - Relids join_relids, - Relids outer_relids, - RelOptInfo *inner_rel); + Relids join_relids, + Relids outer_relids, + RelOptInfo *inner_rel); extern List *generate_join_implied_equalities_for_ecs(PlannerInfo *root, - List *eclasses, - Relids join_relids, - Relids outer_relids, - RelOptInfo *inner_rel); + List *eclasses, + Relids join_relids, + Relids outer_relids, + RelOptInfo *inner_rel); extern bool exprs_known_equal(PlannerInfo *root, Node *item1, Node *item2); extern EquivalenceClass *match_eclasses_to_foreign_key_col(PlannerInfo *root, - ForeignKeyOptInfo *fkinfo, - int colno); + ForeignKeyOptInfo *fkinfo, + int colno); extern void add_child_rel_equivalences(PlannerInfo *root, - AppendRelInfo *appinfo, - RelOptInfo *parent_rel, - RelOptInfo *child_rel); + AppendRelInfo *appinfo, + RelOptInfo *parent_rel, + RelOptInfo *child_rel); extern List *generate_implied_equalities_for_column(PlannerInfo *root, - RelOptInfo *rel, - ec_matches_callback_type callback, - void *callback_arg, - Relids prohibited_rels); + RelOptInfo *rel, + ec_matches_callback_type callback, + void *callback_arg, + Relids prohibited_rels); extern bool have_relevant_eclass_joinclause(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool has_relevant_eclass_joinclause(PlannerInfo *root, - RelOptInfo *rel1); + RelOptInfo *rel1); extern bool eclass_useful_for_merging(PlannerInfo *root, - EquivalenceClass *eclass, - RelOptInfo *rel); + EquivalenceClass *eclass, + RelOptInfo *rel); extern bool is_redundant_derived_clause(RestrictInfo *rinfo, List *clauselist); /* * pathkeys.c - * utilities for matching and building path keys + * utilities for matching and building path keys */ typedef enum { - PATHKEYS_EQUAL, /* pathkeys are identical */ - PATHKEYS_BETTER1, /* pathkey 1 is a superset of pathkey 2 */ - PATHKEYS_BETTER2, /* vice versa */ - PATHKEYS_DIFFERENT /* neither pathkey includes the other */ + PATHKEYS_EQUAL, /* pathkeys are identical */ + PATHKEYS_BETTER1, /* pathkey 1 is a superset of pathkey 2 */ + PATHKEYS_BETTER2, /* vice versa */ + PATHKEYS_DIFFERENT /* neither pathkey includes the other */ } PathKeysComparison; extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2); extern bool pathkeys_contained_in(List *keys1, List *keys2); extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys, - Relids required_outer, - CostSelector cost_criterion, - bool require_parallel_safe); + Relids required_outer, + CostSelector cost_criterion, + bool require_parallel_safe); extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, - List *pathkeys, - Relids required_outer, - double fraction); + List *pathkeys, + Relids required_outer, + double fraction); extern Path *get_cheapest_parallel_safe_total_inner(List *paths); extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, - ScanDirection scandir); + ScanDirection scandir); extern 
List *build_expression_pathkey(PlannerInfo *root, Expr *expr, - Relids nullable_relids, Oid opno, - Relids rel, bool create_it); + Relids nullable_relids, Oid opno, + Relids rel, bool create_it); extern List *convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo *rel, - List *subquery_pathkeys, - List *subquery_tlist); + List *subquery_pathkeys, + List *subquery_tlist); extern List *build_join_pathkeys(PlannerInfo *root, - RelOptInfo *joinrel, - JoinType jointype, - List *outer_pathkeys); + RelOptInfo *joinrel, + JoinType jointype, + List *outer_pathkeys); extern List *make_pathkeys_for_sortclauses(PlannerInfo *root, - List *sortclauses, - List *tlist); + List *sortclauses, + List *tlist); extern void initialize_mergeclause_eclasses(PlannerInfo *root, - RestrictInfo *restrictinfo); + RestrictInfo *restrictinfo); extern void update_mergeclause_eclasses(PlannerInfo *root, - RestrictInfo *restrictinfo); + RestrictInfo *restrictinfo); extern List *find_mergeclauses_for_pathkeys(PlannerInfo *root, - List *pathkeys, - bool outer_keys, - List *restrictinfos); + List *pathkeys, + bool outer_keys, + List *restrictinfos); extern List *select_outer_pathkeys_for_merge(PlannerInfo *root, - List *mergeclauses, - RelOptInfo *joinrel); + List *mergeclauses, + RelOptInfo *joinrel); extern List *make_inner_pathkeys_for_merge(PlannerInfo *root, - List *mergeclauses, - List *outer_pathkeys); + List *mergeclauses, + List *outer_pathkeys); extern List *truncate_useless_pathkeys(PlannerInfo *root, - RelOptInfo *rel, - List *pathkeys); + RelOptInfo *rel, + List *pathkeys); extern bool has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel); extern PathKey *make_canonical_pathkey(PlannerInfo *root, - EquivalenceClass *eclass, Oid opfamily, - int strategy, bool nulls_first); + EquivalenceClass *eclass, Oid opfamily, + int strategy, bool nulls_first); -#endif /* PATHS_H */ +#endif /* PATHS_H */ From c2d9b972409d601ab35898b6ee28676ced36eb28 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 2 Feb 2021 19:52:42 +0800 Subject: [PATCH 121/578] make group estimate compatible with multi datanodes --- src/backend/utils/adt/selfuncs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 06b1d9fa..e24d7193 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3541,6 +3541,20 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, */ double clamp = rel->tuples; +#ifdef __TBASE__ + double nodes = 1; + if (list_length(rel->pathlist) > 0) + { + Path *path = linitial(rel->pathlist); + if (path->distribution && + (path->distribution->distributionType == LOCATOR_TYPE_HASH || + path->distribution->distributionType == LOCATOR_TYPE_SHARD)) + nodes = bms_num_members(path->distribution->nodes); + /* for sanity */ + if (nodes < 1) + nodes = 1; + } +#endif if (relvarcount > 1) { clamp *= 0.1; @@ -3600,7 +3614,11 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, (1 - pow((rel->tuples - rel->rows) / rel->tuples, rel->tuples / reldistinct)); } +#ifdef __TBASE__ + reldistinct = clamp_row_est(reldistinct / nodes); +#else reldistinct = clamp_row_est(reldistinct); +#endif /* * Update estimate of total distinct groups. 
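The patches above (118 through 121) all lean on the same convention: some planner estimates (rel->tuples, plan_rows, reldistinct) are kept per datanode while others are cluster-wide, so whenever the relevant path carries a hash- or shard-distributed Distribution the code converts between the two by multiplying or dividing by the number of datanodes in that distribution. The helper below is a minimal consolidated sketch of that recurring pattern, written here for illustration only: a function named path_distribution_nodes does not exist in the tree, and the sketch assumes the usual planner definitions (Path, Distribution, LOCATOR_TYPE_HASH/LOCATOR_TYPE_SHARD, bms_num_members) that the patches themselves use.

static double
path_distribution_nodes(Path *path)
{
	double		nodes = 1;

	/*
	 * Only hash- and shard-distributed paths spread their rows across
	 * several datanodes; replicated or purely local paths keep the
	 * scaling factor at 1.
	 */
	if (path->distribution &&
		(path->distribution->distributionType == LOCATOR_TYPE_HASH ||
		 path->distribution->distributionType == LOCATOR_TYPE_SHARD))
		nodes = bms_num_members(path->distribution->nodes);

	/* An empty or unknown node set must not zero out the estimate. */
	if (nodes < 1)
		nodes = 1;

	return nodes;
}

In estimate_num_groups() this corresponds to reldistinct = clamp_row_est(reldistinct / nodes): with four datanodes and a cluster-wide estimate of 40000 distinct groups, each node is costed for roughly 10000 groups, which matches the rows a single per-node plan fragment will actually process.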
From cfb34f5b9ebdca9b0155f3802cc8ab830ab7b50b Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 3 Feb 2021 19:57:47 +0800 Subject: [PATCH 122/578] Adjust remote cost --- src/backend/optimizer/path/costsize.c | 7 ++++--- src/backend/optimizer/util/pathnode.c | 2 +- src/include/optimizer/cost.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 5d40d93e..4c816ac7 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5591,7 +5591,8 @@ page_size(double tuples, int width) void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication) + double tuples, int width, int replication, + int nworkers) { Cost startup_cost = input_startup_cost + remote_query_cost; Cost run_cost = input_total_cost - input_startup_cost; @@ -5601,12 +5602,12 @@ cost_remote_subplan(Path *path, /* * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead. */ - run_cost += 2 * cpu_operator_cost * tuples; + run_cost += 2 * cpu_operator_cost * tuples * nworkers; /* * Estimate cost of sending data over network */ - run_cost += network_byte_cost * tuples * width * replication; + run_cost += network_byte_cost * tuples * width * replication * nworkers; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 038871a6..e5551e0c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1434,7 +1434,7 @@ create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, subpath->total_cost + additional_total_cost, subpath->rows, - rel->reltarget->width, replication); + rel->reltarget->width, replication, subpath->parallel_workers); return (Path *) pathnode; } diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2198c9db..358e83b9 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -177,7 +177,8 @@ extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication); + double tuples, int width, int replication, + int nworkers); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, RelOptInfo *outerrel, From 54ed9c61430726cd9a6b1bad22ee6f5d4db351fe Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 10:29:37 +0800 Subject: [PATCH 123/578] Support parallel nestloop under remote sub query --- src/backend/optimizer/plan/createplan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a98797ff..31bbb2d6 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6434,7 +6434,7 @@ make_remotesubplan(PlannerInfo *root, nodes = 1; } - if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) + if((IsA(lefttree, HashJoin) || IsA(lefttree, NestLoop) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || IsA(lefttree, Sort) || IsA(lefttree, Limit) || IsA(lefttree, Gather)) && max_parallel_workers_per_gather && root->glob->parallelModeOK && From 225c6a482de94c82dd0ed1d8588944e5ce936f49 
Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 11:27:22 +0800 Subject: [PATCH 124/578] Cover regress expect --- src/test/regress/expected/create_index_1.out | 18 +- src/test/regress/expected/fast_default.out | 12 +- src/test/regress/expected/groupingsets.out | 5 +- src/test/regress/expected/inherit_3.out | 83 ++- .../regress/expected/insert_conflict_1.out | 8 +- src/test/regress/expected/join_3.out | 512 +++++++++--------- src/test/regress/expected/privileges.out | 27 +- src/test/regress/expected/rowsecurity_1.out | 31 +- src/test/regress/expected/select_views.out | 19 +- src/test/regress/expected/stats_ext_2.out | 96 ++-- src/test/regress/expected/subselect.out | 38 +- src/test/regress/expected/xc_FQS_2.out | 12 +- src/test/regress/expected/xc_FQS_join_1.out | 104 ++-- src/test/regress/expected/xc_groupby_1.out | 216 +++----- src/test/regress/expected/xc_having_1.out | 19 +- src/test/regress/expected/xl_join.out | 33 +- 16 files changed, 571 insertions(+), 662 deletions(-) diff --git a/src/test/regress/expected/create_index_1.out b/src/test/regress/expected/create_index_1.out index 924c7c95..32acf6f1 100644 --- a/src/test/regress/expected/create_index_1.out +++ b/src/test/regress/expected/create_index_1.out @@ -2490,15 +2490,13 @@ CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fi EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; QUERY PLAN -------------------------------------------------------------------- +---------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (stringu1 = 'TVAAAA'::name) - -> Bitmap Index Scan on hash_tuplesort_idx + -> Index Scan using hash_tuplesort_idx on tenk1 Index Cond: (stringu1 = 'TVAAAA'::name) -(7 rows) +(5 rows) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; count @@ -2986,15 +2984,11 @@ SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; QUERY PLAN -------------------------------------------------------------------------- +------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: unique1 - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) - -> Bitmap Index Scan on tenk1_unique1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(7 rows) +(3 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index f2d63e30..16c60821 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -452,18 +452,16 @@ DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; EXPLAIN (VERBOSE TRUE, COSTS FALSE) DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: pk, c_bigint, c_text -> Delete on fast_default.t Output: pk, c_bigint, c_text - -> Bitmap Heap Scan on fast_default.t + -> Seq Scan on fast_default.t Output: xc_node_id, ctid, shardid, pk - Recheck Cond: ((t.pk >= 10) AND (t.pk <= 20)) - -> Bitmap Index Scan on t_pkey - Index Cond: ((t.pk >= 10) AND (t.pk <= 20)) -(9 rows) + Filter: ((t.pk >= 10) AND (t.pk <= 20)) 
+(7 rows) -- UPDATE UPDATE T SET c_text = '"' || c_text || '"' WHERE pk < 10; diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index 032ef9c2..e1524f49 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -1456,13 +1456,12 @@ explain (costs off) Hash Key: ten Hash Key: hundred Hash Key: thousand + Hash Key: twothousand Group Key: unique1 - Sort Key: twothousand - Group Key: twothousand -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: unique1 -> Seq Scan on tenk1 -(13 rows) +(12 rows) -- end diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 6723ae3a..d0ff897f 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -1522,8 +1522,8 @@ vacuum analyze patest2; analyze int4_tbl; explain (costs off, num_nodes off, nodes off) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------- Nested Loop -> Limit -> Remote Subquery Scan on all @@ -1533,15 +1533,13 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on -> Materialize -> Remote Subquery Scan on all -> Append - -> Bitmap Heap Scan on patest0 - Recheck Cond: (id = int4_tbl.f1) - -> Bitmap Index Scan on patest0i - Index Cond: (id = int4_tbl.f1) + -> Index Scan using patest0i on patest0 + Index Cond: (id = int4_tbl.f1) -> Index Scan using patest1i on patest1 Index Cond: (id = int4_tbl.f1) -> Index Scan using patest2i on patest2 Index Cond: (id = int4_tbl.f1) -(17 rows) +(15 rows) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; id | x | f1 @@ -1554,8 +1552,8 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on drop index patest2i; explain (costs off, num_nodes off, nodes off) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------- Nested Loop -> Limit -> Remote Subquery Scan on all @@ -1565,15 +1563,13 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on -> Materialize -> Remote Subquery Scan on all -> Append - -> Bitmap Heap Scan on patest0 - Recheck Cond: (id = int4_tbl.f1) - -> Bitmap Index Scan on patest0i - Index Cond: (id = int4_tbl.f1) + -> Index Scan using patest0i on patest0 + Index Cond: (id = int4_tbl.f1) -> Index Scan using patest1i on patest1 Index Cond: (id = int4_tbl.f1) -> Seq Scan on patest2 Filter: (int4_tbl.f1 = id) -(17 rows) +(15 rows) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; id | x | f1 @@ -1788,15 +1784,13 @@ SELECT thousand, thousand FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------------- - Merge Append - Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: tenk1_1.thousand, tenk1_1.thousand + Remote Subquery Scan on all + -> Sort + Sort Key: tenk1.thousand, tenk1.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 -> Index Only 
Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(8 rows) +(6 rows) explain (costs off, num_nodes off, nodes off) SELECT thousand, tenthous, thousand+tenthous AS x FROM tenk1 @@ -1805,15 +1799,13 @@ SELECT 42, 42, hundred FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------ - Merge Append - Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: 42, 42 + Remote Subquery Scan on all + -> Sort + Sort Key: tenk1.thousand, tenk1.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 -> Index Only Scan using tenk1_hundred on tenk1 tenk1_1 -(8 rows) +(6 rows) explain (costs off, num_nodes off, nodes off) SELECT thousand, tenthous FROM tenk1 @@ -1822,15 +1814,14 @@ SELECT thousand, random()::integer FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------------- - Merge Append + Sort Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: tenk1_1.thousand, ((random())::integer) + -> Append + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + -> Remote Subquery Scan on all -> Index Only Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(8 rows) +(7 rows) -- Check min/max aggregate optimization explain (costs off, num_nodes off, nodes off) @@ -1880,17 +1871,15 @@ SELECT x, y FROM UNION ALL SELECT unique2 AS x, unique2 AS y FROM tenk1 b) s ORDER BY x, y; - QUERY PLAN -------------------------------------------------------------------- - Merge Append - Sort Key: a.thousand, a.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 a - -> Remote Subquery Scan on all - -> Sort - Sort Key: b.unique2, b.unique2 + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: a.thousand, a.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 a -> Index Only Scan using tenk1_unique2 on tenk1 b -(8 rows) +(6 rows) -- exercise rescan code path via a repeatedly-evaluated subquery explain (costs off) diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 042c1c00..1dce5ece 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -51,8 +51,8 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit) do update set fruit = excluded.fruit where exists (select 1 from insertconflicttest ii where ii.key = excluded.key); - QUERY PLAN ----------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Insert on insertconflicttest Conflict Resolution: UPDATE @@ -61,8 +61,8 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con -> Result SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using both_index_expr_key on insertconflicttest ii - Index Cond: (key = 
excluded.key) + -> Seq Scan on insertconflicttest ii + Filter: (key = excluded.key) (10 rows) -- Neither collation nor operator class specifications are required -- diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index ce5f9512..a1c6c31b 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1867,6 +1867,30 @@ SELECT '' AS "xxx", * | 1 | 4 | one | -1 (1 row) +-- +-- semijoin selectivity for <> +-- +explain (costs off) +select * from int4_tbl i4, tenk1 a +where exists(select * from tenk1 b + where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) + and i4.f1 = a.tenthous; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (a.tenthous = i4.f1) + -> Hash Semi Join + Hash Cond: (a.twothousand = b.twothousand) + Join Filter: (a.fivethous <> b.fivethous) + -> Seq Scan on tenk1 a + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on int4_tbl i4 +(12 rows) + -- -- More complicated constructs -- @@ -3014,8 +3038,8 @@ select * from int4(sin(1)) q1, int4(sin(0)) q2 where thousand = (q1 + q2); - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------ Nested Loop Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop @@ -3024,14 +3048,12 @@ where thousand = (q1 + q2); -> Function Scan on q2 -> Materialize -> Remote Subquery Scan on all - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (thousand = (q1.q1 + q2.q2)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = (q1.q1 + q2.q2)) + -> Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: (thousand = (q1.q1 + q2.q2)) -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl -(15 rows) +(13 rows) set enable_hashjoin = true; -- @@ -3149,22 +3171,22 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (hundred = 4) + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) - -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) (19 rows) explain (costs off) @@ -3175,17 +3197,17 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.ten = 4))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 b - Filter: ((unique1 = 2) OR (ten = 4)) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> 
Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) - -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) - -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + -> Seq Scan on tenk1 b + Filter: ((unique1 = 2) OR (ten = 4)) (14 rows) explain (costs off) @@ -3245,11 +3267,9 @@ where t1.unique1 = 1; -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) -(22 rows) + -> Index Scan using tenk1_hundred on tenk1 t2 + Index Cond: (t1.hundred = hundred) +(20 rows) explain (num_nodes off, nodes off, costs off) select * from tenk1 t1 left join @@ -3276,19 +3296,17 @@ where t1.unique1 = 1; -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) -(22 rows) + -> Index Scan using tenk1_hundred on tenk1 t2 + Index Cond: (t1.hundred = hundred) +(20 rows) explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand join int4_tbl on b.thousand = f1; - QUERY PLAN -------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate @@ -3306,13 +3324,11 @@ select count(*) from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = int4_tbl.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: (thousand = int4_tbl.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) -(23 rows) +(21 rows) select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3330,8 +3346,8 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 @@ -3353,17 +3369,15 @@ select b.unique1 from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl i1 - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = i1.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = i1.f1) + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: (thousand = i1.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) -> Hash -> Remote Subquery Scan on all Distribute results by H: f1 -> Seq Scan on int4_tbl i2 -(31 rows) +(29 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3391,16 +3405,16 @@ order by fault; QUERY PLAN 
-------------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (tenk1.unique2 = int8_tbl.q2) Filter: ((COALESCE(tenk1.unique1, '-1'::integer) + int8_tbl.q1) = 122) -> Remote Subquery Scan on all - Distribute results by H: q2 - -> Seq Scan on int8_tbl - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 - Index Cond: (int8_tbl.q2 = unique2) + Distribute results by H: q2 + -> Seq Scan on int8_tbl (11 rows) select * from @@ -3425,16 +3439,16 @@ select q1, unique2, thousand, hundred QUERY PLAN -------------------------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.unique2 = a.q1) Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) -> Remote Subquery Scan on all - Distribute results by H: q1 - -> Seq Scan on int8_tbl a - -> Materialize + Distribute results by H: COALESCE(thousand, 123) + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all - Distribute results by H: COALESCE(thousand, 123) - -> Index Scan using tenk1_unique2 on tenk1 b - Index Cond: (a.q1 = unique2) + Distribute results by H: q1 + -> Seq Scan on int8_tbl a (11 rows) select q1, unique2, thousand, hundred @@ -3451,16 +3465,16 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all - Distribute results by H: f1 - -> Seq Scan on int4_tbl a - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - Index Cond: (unique2 = a.f1) + Distribute results by H: f1 + -> Seq Scan on int4_tbl a (11 rows) select f1, unique2, case when unique2 is null then f1 else 0 end @@ -3520,34 +3534,33 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i1.f1, 666 - -> Nested Loop Left Join + -> Hash Right Join Output: i1.f1, 666 - -> Remote Subquery Scan on all (datanode_1) - Output: i1.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 - -> Materialize + Hash Cond: (i2.unique2 = i1.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: unique2 + -> Seq Scan on public.tenk1 i2 Output: i2.unique2 - Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 - Output: i2.unique2 - Index Cond: (i2.unique2 = i1.f1) + -> Hash + Output: i1.f1 + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 
-> Hash Output: "*VALUES*".column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(25 rows) +(24 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -3584,8 +3597,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3596,29 +3609,27 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, NULL::integer + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(32 rows) +(30 rows) select t1.* from text_tbl t1 @@ -3647,8 +3658,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3659,33 +3670,31 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Nested Loop - Output: i8b2.q1, NULL::integer - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize - -> Seq Scan on public.int4_tbl i4b2 + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(36 rows) +(34 rows) select t1.* from 
text_tbl t1 @@ -3715,8 +3724,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3727,36 +3736,34 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Nested Loop - Output: i8b2.q1, NULL::integer - Join Filter: (i8b2.q1 = i4b2.f1) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 Output: i4b2.f1 - -> Seq Scan on public.int4_tbl i4b2 - Output: i4b2.f1 -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(39 rows) +(37 rows) select t1.* from text_tbl t1 @@ -3849,11 +3856,9 @@ where t1.f1 = ss.f1; Output: t1.f1, i8.q1, i8.q2 -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - Filter: (i8.q2 = 123) + Filter: (i8.q2 = 123) -> Materialize Output: (i8.q1), t2.f1 -> Limit @@ -3864,7 +3869,7 @@ where t1.f1 = ss.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 -(24 rows) +(22 rows) select * from text_tbl t1 @@ -3885,26 +3890,24 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) - -> Nested Loop - Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join Output: t1.f1, i8.q1, i8.q2 - -> Nested Loop Left Join - Output: t1.f1, i8.q1, i8.q2 - -> Seq Scan on public.text_tbl t1 - Output: t1.f1 - -> Materialize - Output: i8.q1, i8.q2 - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, 
i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + -> Nested Loop + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Limit Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3913,17 +3916,17 @@ where t1.f1 = ss2.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Materialize - Output: ((i8.q1)), (t2.f1) - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) -> Limit Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: (i8.q1), t2.f1 -(36 rows) + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(34 rows) select * from text_tbl t1 @@ -4286,16 +4289,14 @@ select d.* from d left join (select distinct * from b) s QUERY PLAN ----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Right Join - Merge Cond: (b.id = d.a) - -> Unique - -> Sort - Sort Key: b.id, b.c_id - -> Seq Scan on b - -> Sort - Sort Key: d.a + -> Hash Right Join + Hash Cond: (b.id = d.a) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash -> Seq Scan on d -(10 rows) +(8 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION @@ -4353,14 +4354,17 @@ explain (verbose false, costs false, nodes false) select p.*, linked from parent p left join (select c.*, true as linked from child c) as ss on (p.k = ss.k) order by p.k; - QUERY PLAN ----------------------------------------------------------- + QUERY PLAN +--------------------------------------------- Remote Subquery Scan on all - -> Merge Left Join - Merge Cond: (p.k = c.k) - -> Index Scan using parent_pkey on parent p - -> Index Only Scan using child_k_key on child c -(5 rows) + -> Sort + Sort Key: p.k + -> Hash Left Join + Hash Cond: (p.k = c.k) + -> Seq Scan on parent p + -> Hash + -> Seq Scan on child c +(8 rows) -- check for a 9.0rc1 bug: join removal breaks pseudoconstant qual handling select p.* from @@ -4468,20 +4472,20 @@ from left join uniquetbl u1 ON u1.f1 = t1.string4) ss on t0.f1 = ss.case1 where ss.stringu2 !~* ss.case1; - QUERY PLAN --------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Join Filter: ((CASE t1.ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) = t0.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END - -> Nested Loop - -> Seq Scan on int4_tbl i4 - -> Index Scan using tenk1_unique2 on tenk1 t1 - Index Cond: (unique2 = i4.f1) - Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) + -> Seq Scan on text_tbl t0 -> Materialize - -> Seq Scan on text_tbl t0 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END + -> Nested Loop + -> Seq Scan on int4_tbl i4 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: (unique2 = i4.f1) + Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) (12 rows) select t0.* @@ -4713,18 +4717,18 @@ select * from 
generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (b.unique2 = a.unique1) - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b + Hash Cond: (a.unique1 = b.unique2) + -> Seq Scan on tenk1 a -> Hash - -> Seq Scan on tenk1 a + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -5445,8 +5449,8 @@ select * from lateral (select f1 from int4_tbl where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Values Scan on "*VALUES*" @@ -5455,18 +5459,21 @@ select * from Output: int4_tbl.f1 -> Remote Subquery Scan on all Output: int4_tbl.f1 - -> Nested Loop + -> Hash Join Output: int4_tbl.f1 - Join Filter: (int4_tbl.f1 = tenk1.unique1) - -> HashAggregate - Output: tenk1.unique1 - Group Key: tenk1.unique1 - -> Index Scan using tenk1_unique2 on public.tenk1 - Output: tenk1.unique1 - Index Cond: (tenk1.unique2 = "*VALUES*".column2) + Inner Unique: true + Hash Cond: (int4_tbl.f1 = tenk1.unique1) -> Seq Scan on public.int4_tbl Output: int4_tbl.f1 -(19 rows) + -> Hash + Output: tenk1.unique1 + -> HashAggregate + Output: tenk1.unique1 + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 + Output: tenk1.unique1 + Index Cond: (tenk1.unique2 = "*VALUES*".column2) +(22 rows) select * from (values (0,9998), (1,1000)) v(id,x), @@ -6102,11 +6109,10 @@ where exists (select 1 from tenk1 t3 Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1 - -> Bitmap Heap Scan on public.onek t1 + Sort Key: t1.unique1 + -> Index Only Scan using onek_unique1 on public.onek t1 Output: t1.unique1 - Recheck Cond: (t1.unique1 < 1) - -> Bitmap Index Scan on onek_unique1 - Index Cond: (t1.unique1 < 1) + Index Cond: (t1.unique1 < 1) -> Materialize Output: t2.hundred -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -6116,7 +6122,7 @@ where exists (select 1 from tenk1 t3 -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = t3.tenthous) -(39 rows) +(38 rows) -- ... 
unless it actually is unique create table j3 as select unique1, tenthous from onek; @@ -6128,8 +6134,8 @@ from onek t1, tenk1 t2 where exists (select 1 from j3 where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6138,18 +6144,16 @@ where exists (select 1 from j3 Output: t1.unique1, j3.tenthous -> Nested Loop Output: t1.unique1, j3.tenthous - -> Bitmap Heap Scan on public.onek t1 - Output: t1.unique1, t1.unique2, t1.two, t1.four, t1.ten, t1.twenty, t1.hundred, t1.thousand, t1.twothousand, t1.fivethous, t1.tenthous, t1.odd, t1.even, t1.stringu1, t1.stringu2, t1.string4 - Recheck Cond: (t1.unique1 < 1) - -> Bitmap Index Scan on onek_unique1 - Index Cond: (t1.unique1 < 1) + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 Output: j3.unique1, j3.tenthous Index Cond: (j3.unique1 = t1.unique1) -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = j3.tenthous) -(19 rows) +(17 rows) drop table j3; -- @@ -6170,18 +6174,18 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..596.19 rows=33 width=4) + Nested Loop (cost=200.16..401.93 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..453.19 rows=1 width=4) - -> Nested Loop (cost=0.16..353.18 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.68 rows=1 width=4) + -> Nested Loop (cost=0.16..180.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) - -> Materialize (cost=100.00..141.75 rows=100 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) + -> Materialize (cost=100.00..120.62 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (12 rows) set enable_nestloop_suppression = true; @@ -6189,19 +6193,19 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..621.19 rows=33 width=4) + Nested Loop 
(cost=200.16..414.44 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..478.19 rows=1 width=4) - -> Nested Loop (cost=0.16..378.19 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) + -> Nested Loop (cost=0.16..193.19 rows=1 width=4) Join Filter: (t1.a = t2.a) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) -> Materialize (cost=0.16..8.19 rows=1 width=4) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..141.75 rows=100 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) + -> Materialize (cost=100.00..120.62 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (13 rows) drop table nestloop_suppression1; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 85aea9c7..ccf6aba3 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -211,10 +211,12 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Index Scan using atest12_a_idx on atest12 - Index Cond: (a = atest12_1.b) + -> Bitmap Heap Scan on atest12 + Recheck Cond: (a = atest12_1.b) Filter: (b <<< 5) -(9 rows) + -> Bitmap Index Scan on atest12_a_idx + Index Cond: (a = atest12_1.b) +(11 rows) -- And this one. EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y @@ -247,16 +249,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; QUERY PLAN ----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Join - Hash Cond: (atest12_1.b = atest12.a) + -> Nested Loop -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Hash - -> Seq Scan on atest12 - Filter: (b <<< 5) -(10 rows) + -> Index Scan using atest12_a_idx on atest12 + Index Cond: (a = atest12_1.b) + Filter: (b <<< 5) +(9 rows) RESET random_page_cost; -- Now regress_user1 grants sufficient access to regress_user2. @@ -273,10 +274,12 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Index Scan using atest12_a_idx on atest12 - Index Cond: (a = atest12_1.b) + -> Bitmap Heap Scan on atest12 + Recheck Cond: (a = atest12_1.b) Filter: (b <<< 5) -(9 rows) + -> Bitmap Index Scan on atest12_a_idx + Index Cond: (a = atest12_1.b) +(11 rows) -- But not for this, due to lack of table-wide permissions needed -- to make use of the expression index's statistics. 
diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index e0336e73..01debacc 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1577,13 +1577,12 @@ WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); Remote Subquery Scan on all (datanode_2) -> Update on t2 -> Nested Loop - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) - -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Seq Scan on t3 Filter: ((a = 2) AND f_leak(b)) -(9 rows) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(8 rows) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); @@ -2062,16 +2061,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET ROLE regress_rls_group1; @@ -2114,16 +2113,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET SESSION AUTHORIZATION regress_rls_carol; @@ -2166,16 +2165,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET ROLE regress_rls_group2; @@ -2218,16 +2217,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) -- diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index ca729d3d..2406dabc 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1449,18 +1449,17 @@ EXPLAIN (COSTS OFF) SELECT * FROM my_credit_card_usage_normal Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Join Filter: (l.cid = r.cid) + -> Subquery 
Scan on l + Filter: f_leak(l.cnum) + -> Hash Join + Hash Cond: (r_1.cid = l_1.cid) + -> Seq Scan on credit_card r_1 + -> Hash + -> Seq Scan on customer l_1 + Filter: (name = (CURRENT_USER)::text) -> Seq Scan on credit_usage r Filter: ((ymd >= '10-01-2011'::date) AND (ymd < '11-01-2011'::date)) - -> Materialize - -> Subquery Scan on l - Filter: f_leak(l.cnum) - -> Hash Join - Hash Cond: (r_1.cid = l_1.cid) - -> Seq Scan on credit_card r_1 - -> Hash - -> Seq Scan on customer l_1 - Filter: (name = (CURRENT_USER)::text) -(14 rows) +(13 rows) SELECT * FROM my_credit_card_usage_secure WHERE f_leak(cnum) AND ymd >= '2011-10-01' AND ymd < '2011-11-01'; diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index ca7aba0a..315bcbc7 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -181,12 +181,10 @@ EXPLAIN (COSTS off) Group Key: a, b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b - -> Sort - Sort Key: a, b - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c; @@ -197,12 +195,10 @@ EXPLAIN (COSTS off) Group Key: b, c -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c - -> Sort - Sort Key: b, c - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; @@ -213,12 +209,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c - -> Sort - Sort Key: a, b, c - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -229,12 +223,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c, d - -> Sort - Sort Key: a, b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; @@ -245,12 +237,10 @@ EXPLAIN (COSTS off) Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; @@ -316,12 +306,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c, d - -> Sort - Sort Key: a, b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; @@ -332,12 +320,10 @@ EXPLAIN (COSTS off) Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) 
TRUNCATE TABLE ndistinct; -- under-estimates when using only per-column statistics @@ -363,12 +349,10 @@ EXPLAIN (COSTS off) Group Key: a, b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b - -> Sort - Sort Key: a, b - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; @@ -675,10 +659,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -696,10 +680,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -714,10 +698,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -738,10 +722,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset 
(cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 4691a4a9..c9dc3101 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1174,15 +1174,15 @@ set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) - -> Sort (cost=15636.19..15637.88 rows=675 width=8) +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=3923.54..3924.39 rows=338 width=8) + -> Sort (cost=3923.54..3924.39 rows=338 width=8) Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + -> Nested Loop Left Scalar Join (cost=0.00..3909.35 rows=338 width=8) Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) - -> Materialize (cost=0.00..30.25 rows=1350 width=4) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=4) + -> Materialize (cost=0.00..15.12 rows=675 width=4) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=4) (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1711,16 +1711,16 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=140.38..193.10 rows=225 width=8) - -> Hash Semi Join (cost=140.38..193.10 rows=225 width=8) +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..136.36 rows=112 width=8) + -> Hash Semi Join (cost=120.19..136.36 rows=112 width=8) Hash Cond: (a.b = b.a) Join Filter: (b.b > a.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.05 rows=1350 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..111.75 rows=675 width=8) Distribute results by H: b - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=8) - -> Hash (cost=23.50..23.50 rows=1350 width=8) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=8) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Hash (cost=11.75..11.75 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) (9 rows) select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); @@ -1750,9 +1750,12 @@ where t2.a = ( t1.a = t2.a ); QUERY PLAN ------------------------------------------------------------------------------------------------------ +----------------------------------------------------------------------------------------------------------- Hash Join - Hash Cond: ("EXPR_subquery".min = t2.a) + Hash Cond: (t2.a = "EXPR_subquery".min) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2 + -> Hash -> Hash Left Join 
Hash Cond: (t1.a = "EXPR_subquery".a) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -1771,9 +1774,6 @@ where t2.a = ( -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on sub_interfere2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t2 t2 (23 rows) DROP TABLE sub_t1; diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index ea10b9ad..c4e07fc5 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,14 +1641,14 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.30 rows=2 width=40) - -> Nested Loop Semi Join (cost=100.00..142.30 rows=2 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.06 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..121.06 rows=1 width=40) Join Filter: (t1.c = t2.c) - -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) + -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..121.07 rows=4 width=4) - -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=4 width=4) - -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + -> Materialize (cost=100.00..110.51 rows=2 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.50 rows=2 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) Filter: (id = 1) (9 rows) diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 18836c1e..6cfb1dda 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -390,19 +390,19 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t ---------------------------------------------------------------------------------- Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) - -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Hash - Output: tab4_rep.val, tab4_rep.val2 + Hash Cond: ((tab4_rep.val = tab1_mod.val) AND (tab4_rep.val2 = tab1_mod.val2)) -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 Filter: (tab4_rep.val < 4) + -> Hash + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val > 2) (15 rows) -- Join involving two distributed tables, never shipped @@ -425,18 +425,18 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Output: tab1_mod.val, tab1_mod.val2 -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Hash - Output: tab2_mod.val, tab2_mod.val2 - -> Remote 
Subquery Scan on all - Output: tab2_mod.val, tab2_mod.val2 - Distribute results by M: val + Hash Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 Filter: (tab2_mod.val < 4) + -> Hash + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + Distribute results by M: val + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val > 2) (16 rows) -- Join involving a distributed table and two replicated tables, such that the @@ -590,17 +590,17 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val = 1) - -> Materialize - Output: tab4_rep.val, tab4_rep.val2 - -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 Filter: (tab4_rep.val = 1) + -> Materialize + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val = 1) (18 rows) -- following join between distributed tables should get FQSed because both of @@ -625,16 +625,16 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val Join Filter: (tab1_mod.val2 = tab2_mod.val2) + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val = 2) + -> Materialize + Output: tab1_mod.val2, tab1_mod.val -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) - -> Materialize - Output: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val = 2) (15 rows) -- JOIN involving the distributed table with equi-JOIN on the distributed column @@ -696,24 +696,21 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + -> Sort + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + Sort Key: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 Sort Key: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 - -> Materialize - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, 
tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Distribute results by M: val - -> Sort - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -(20 rows) +(17 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; @@ -722,24 +719,21 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + -> Sort + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + Sort Key: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 Sort Key: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 - -> Materialize - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Distribute results by M: val - -> Sort - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -(20 rows) +(17 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index 8db42b7f..b33bfcf0 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -332,20 +332,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> HashAggregate Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(22 rows) +(18 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where 
xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -355,20 +351,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> HashAggregate Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(18 rows) +(14 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; ?column? @@ -397,20 +389,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(28 rows) +(24 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -426,20 +414,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(24 rows) +(20 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; @@ -870,12 +854,12 @@ explain (verbose true, costs false, nodes false) select * from (select b,count(b Remote Subquery Scan on all Output: b, count Sort Key: b - -> Sort - Output: 
xc_groupby_def.b, (count(xc_groupby_def.b)) - Sort Key: xc_groupby_def.b - -> Finalize HashAggregate - Output: xc_groupby_def.b, count(xc_groupby_def.b) - Group Key: xc_groupby_def.b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b -> Remote Subquery Scan on all Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) Distribute results by H: b @@ -2302,20 +2286,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(21 rows) +(17 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; ?column? @@ -2346,20 +2326,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(30 rows) +(26 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; @@ -3968,20 +3944,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(22 rows) +(18 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, 
xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -4034,20 +4006,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(28 rows) +(24 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -4063,20 +4031,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(24 rows) +(20 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; @@ -4507,12 +4471,12 @@ explain (verbose true, costs false, nodes false) select * from (select b,count(b Remote Subquery Scan on all Output: b, count Sort Key: b - -> Sort - Output: xc_groupby_def.b, (count(xc_groupby_def.b)) - Sort Key: xc_groupby_def.b - -> Finalize HashAggregate - Output: xc_groupby_def.b, count(xc_groupby_def.b) - Group Key: xc_groupby_def.b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b -> Remote Subquery Scan on all Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) Distribute results by H: b @@ -6058,20 +6022,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash 
Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(25 rows) +(21 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -6130,20 +6090,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(31 rows) +(27 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -6165,20 +6121,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(30 rows) +(26 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index 93469960..9d914a2a 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -605,31 +605,26 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ -> GroupAggregate Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 Group Key: xc_having_tab1.val2, xc_having_tab2.val2 - -> Merge Join + -> Sort Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val - Merge Cond: 
(xc_having_tab1.val2 = xc_having_tab2.val2) + Sort Key: xc_having_tab1.val2 + -> Hash Join + Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val + Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) -> Remote Subquery Scan on all Output: xc_having_tab1.val, xc_having_tab1.val2 Distribute results by H: val2 - Sort Key: xc_having_tab1.val2 - -> Sort - Output: xc_having_tab1.val, xc_having_tab1.val2 - Sort Key: xc_having_tab1.val2 -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 - -> Materialize + -> Hash Output: xc_having_tab2.val, xc_having_tab2.val2 -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 Distribute results by H: val2 - Sort Key: xc_having_tab2.val2 - -> Sort - Output: xc_having_tab2.val, xc_having_tab2.val2 - Sort Key: xc_having_tab2.val2 -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 -(29 rows) +(24 rows) -- group by and having, without aggregate in the target list select val2 from xc_having_tab1 group by val2 having sum(val) > 8; diff --git a/src/test/regress/expected/xl_join.out b/src/test/regress/expected/xl_join.out index 6369183d..6753f018 100644 --- a/src/test/regress/expected/xl_join.out +++ b/src/test/regress/expected/xl_join.out @@ -8,26 +8,25 @@ EXPLAIN (COSTS OFF) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 INNER JOIN xl_join_t3 ON xl_join_t1.val1 = xl_join_t3.val1; - QUERY PLAN ---------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join - Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: val2 - -> Sort - Sort Key: xl_join_t2.val2 - -> Seq Scan on xl_join_t2 - -> Materialize - -> Merge Join - Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) - -> Sort - Sort Key: xl_join_t1.val1 - -> Seq Scan on xl_join_t1 + Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) + -> Merge Join + Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: val2 -> Sort - Sort Key: xl_join_t3.val1 - -> Seq Scan on xl_join_t3 -(17 rows) + Sort Key: xl_join_t2.val2 + -> Seq Scan on xl_join_t2 + -> Sort + Sort Key: xl_join_t1.val1 + -> Seq Scan on xl_join_t1 + -> Sort + Sort Key: xl_join_t3.val1 + -> Seq Scan on xl_join_t3 +(16 rows) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 From cf30ce5bfb494fc97e2ab759eef4de08d4134985 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 13:44:49 +0800 Subject: [PATCH 125/578] Abstract function path_count_datanodes for cost estimate --- src/backend/optimizer/path/costsize.c | 59 ++++--------------------- src/backend/optimizer/plan/createplan.c | 25 ++--------- src/backend/optimizer/util/pathnode.c | 27 +++++++++++ src/backend/utils/adt/selfuncs.c | 9 +--- src/include/optimizer/paths.h | 4 +- 5 files changed, 43 insertions(+), 81 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4c816ac7..06e9a7c3 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -241,14 +241,7 @@ static ParamPathInfoDataNode * adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, 
RelOptInfo *baserel_orig, ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) { - double nodes; - - if (path->distribution && IsA(path->distribution, Distribution) && - path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->distribution->distributionType != LOCATOR_TYPE_NONE) - nodes = bms_num_members(path->distribution->nodes); - else - nodes = 1; + double nodes = path_count_datanodes(path); basescan->relid = baserel_orig->relid; basescan->rtekind = baserel_orig->rtekind; @@ -659,12 +652,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double nodes = 1; #ifdef __TBASE__ - if (path->path.distribution && IsA(path->path.distribution, Distribution) && - path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(path->path.distribution->nodes); - } + nodes = path_count_datanodes(&path->path); /* Should only be applied to base relations */ Assert(IsA(baserel_orig, RelOptInfo) && IsA(index, IndexOptInfo)); @@ -1217,12 +1205,9 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, cpu_run_cost = cpu_per_tuple * tuples_fetched; #ifdef __TBASE__ - /* Adjust costing for parallelism between data nodes, if used. */ - if (path->distribution && IsA(path->distribution, Distribution) && - path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->distribution->distributionType != LOCATOR_TYPE_NONE) { - double nodes = bms_num_members(path->distribution->nodes); + /* Adjust costing for parallelism between data nodes, if used. */ + double nodes = path_count_datanodes(path); /* The CPU cost is divided among all the data nodes. */ cpu_run_cost /= nodes; @@ -2500,14 +2485,7 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, path->path.rows = path->path.parent->rows; #ifdef __TBASE__ - if (path->path.distribution && IsA(path->path.distribution, Distribution) && - path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->path.distribution->nodes); - - path->path.rows = clamp_row_est(path->path.rows / nodes); - } + path->path.rows = clamp_row_est(path->path.rows / path_count_datanodes(&path->path)); #endif /* For partial paths, scale row estimate. */ @@ -3000,14 +2978,7 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, path->jpath.path.rows = path->jpath.path.parent->rows; #ifdef __TBASE__ - if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->jpath.path.distribution->nodes); - - path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); - } + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / path_count_datanodes(&path->jpath.path)); #endif /* For partial paths, scale row estimate. 
*/ @@ -3454,14 +3425,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, path->jpath.path.rows = path->jpath.path.parent->rows; #ifdef __TBASE__ - if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->jpath.path.distribution->nodes); - - path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); - } + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / path_count_datanodes(&path->jpath.path)); #endif /* For partial paths, scale row estimate. */ @@ -5053,15 +5017,8 @@ set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel) sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); rel->tuples = sub_final_rel->cheapest_total_path->rows; #ifdef __TBASE__ - if (sub_final_rel->cheapest_total_path->distribution && IsA(sub_final_rel->cheapest_total_path->distribution, Distribution) && - sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(sub_final_rel->cheapest_total_path->distribution->nodes); - /* count tuples in all data nodes */ - rel->tuples *= nodes; - } + rel->tuples *= path_count_datanodes(sub_final_rel->cheapest_total_path); #endif /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 31bbb2d6..e4ccad0c 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3814,12 +3814,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, double nodes = 1; #ifdef __TBASE__ - if (apath->path.distribution && IsA(apath->path.distribution, Distribution) && - apath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - apath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(apath->path.distribution->nodes); - } + nodes = path_count_datanodes(&apath->path); #endif /* @@ -3911,14 +3906,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, { double nodes = 1; #ifdef __TBASE__ - if (opath->path.distribution && IsA(opath->path.distribution, Distribution) && - opath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - opath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(opath->path.distribution->nodes); - } + nodes = path_count_datanodes(&opath->path); #endif - plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; @@ -3955,16 +3944,9 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexECs; ListCell *l; double nodes = 1; - #ifdef __TBASE__ - if (ipath->path.distribution && IsA(ipath->path.distribution, Distribution) && - ipath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - ipath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(ipath->path.distribution->nodes); - } + nodes = path_count_datanodes(&ipath->path); #endif - /* Use the regular indexscan plan build machinery... 
*/ iscan = castNode(IndexScan, create_indexscan_plan(root, ipath, @@ -6424,6 +6406,7 @@ make_remotesubplan(PlannerInfo *root, Assert(!IsA(lefttree, RemoteSubplan)); #ifdef __TBASE__ + /* do things like path_count_datanodes, but we have only distribution here */ if (execDistribution && (execDistribution->distributionType == LOCATOR_TYPE_HASH || execDistribution->distributionType == LOCATOR_TYPE_SHARD)) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e5551e0c..690adbfd 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1405,6 +1405,10 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } #ifdef __TBASE__ +/* + * implementation for create_remotesubplan_path, besides regular creation of remote subplan, + * we need it when redistributing join rel. + */ static Path * create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, Distribution *distribution, RelOptInfo *rel, @@ -6994,3 +6998,26 @@ reparameterize_path(PlannerInfo *root, Path *path, } return NULL; } + +#ifdef __TBASE__ +/* + * count datanode number for given path, consider replication table as 1 + * because we use this function to figure out how many parts that data + * had been separated into, when we estimating costs of a plan. Therefore + * to get more accurate estimating result as in a distributed system. + */ +double +path_count_datanodes(Path *path) +{ + if (path->distribution && IsA(path->distribution, Distribution) && + (path->distribution->distributionType == LOCATOR_TYPE_SHARD || + path->distribution->distributionType == LOCATOR_TYPE_HASH)) + { + double nodes = bms_num_members(path->distribution->nodes); + if (nodes > 0) + return nodes; + } + + return 1; +} +#endif diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index e24d7193..80a4ce72 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3545,14 +3545,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, double nodes = 1; if (list_length(rel->pathlist) > 0) { - Path *path = linitial(rel->pathlist); - if (path->distribution && - (path->distribution->distributionType == LOCATOR_TYPE_HASH || - path->distribution->distributionType == LOCATOR_TYPE_SHARD)) - nodes = bms_num_members(path->distribution->nodes); - /* for sanity */ - if (nodes < 1) - nodes = 1; + nodes = path_count_datanodes(linitial(rel->pathlist)); } #endif if (relvarcount > 1) diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index cf766c0f..416d15d8 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -228,5 +228,7 @@ extern bool has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel); extern PathKey *make_canonical_pathkey(PlannerInfo *root, EquivalenceClass *eclass, Oid opfamily, int strategy, bool nulls_first); - +#ifdef __TBASE__ +extern double path_count_datanodes(Path *path); +#endif #endif /* PATHS_H */ From 90ad670bfcfe9ef3ed0eb789023b6d8dcdbe67c7 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 31 Dec 2020 11:23:15 +0800 Subject: [PATCH 126/578] v2 support gbk and gb18030, tapd:http://tapd.oa.com/pgxz/prong/stories/view/1010092131861052005 --- src/backend/access/common/heaptuple.c | 33 +- src/backend/access/common/printtup.c | 17 + src/backend/executor/execTuples.c | 2 + src/backend/utils/adt/varlena.c | 26 + src/backend/utils/mb/encnames.c | 7 +- src/backend/utils/mb/mbutils.c | 6 + src/backend/utils/mb/wchar.c | 65 ++- 
src/include/c.h | 3 + src/include/mb/pg_wchar.h | 585 ++++++++++---------- src/test/regress/expected/rowsecurity_1.out | 10 +- src/test/regress/expected/zhcn_gb18030.out | 132 +++++ src/test/regress/expected/zhcn_utf8.out | 264 +++++++++ src/test/regress/parallel_schedule | 4 +- src/test/regress/pg_regress.c | 3 +- src/test/regress/serial_schedule | 2 + src/test/regress/sql/rowsecurity.sql | 10 +- src/test/regress/sql/zhcn_gb18030.sql | 65 +++ src/test/regress/sql/zhcn_utf8.sql | 140 +++++ 18 files changed, 1065 insertions(+), 309 deletions(-) create mode 100644 src/test/regress/expected/zhcn_gb18030.out create mode 100644 src/test/regress/expected/zhcn_utf8.out create mode 100644 src/test/regress/sql/zhcn_gb18030.sql create mode 100644 src/test/regress/sql/zhcn_utf8.sql diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index d134e17d..d858d75b 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -78,6 +78,8 @@ #ifdef __TBASE__ #include "utils/typcache.h" #include "pgxc/execRemote.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" #endif /* Does att's datatype allow packing into the 1-byte-header varlena format? */ @@ -1328,6 +1330,30 @@ slot_deform_tuple(TupleTableSlot *slot, int natts) slot->tts_slow = slow; } +/** + * get maximum bytes number from column define size, if column is bounded string, return -1 + * then InputFunctionCall -> varchar2_input|varchar_input|varchar2_input|nvarchar2_input + * avoid to verification the length of string which encoded by client encode + */ +static int +get_typioparam_mod(Oid typioparam, int32 typmod) +{ + switch (typioparam) + { + case CHAROID: + case BPCHAROID: + case VARCHAROID: +#ifdef _PG_ORCL_ + case VARCHAR2OID: + case NVARCHAR2OID: +#endif + return -1; + + default: + return typmod; + } +} + /* * slot_deform_datarow * Extract data from the DataRow message into Datum/isnull arrays. 
@@ -1480,13 +1506,18 @@ slot_deform_datarow(TupleTableSlot *slot) #endif else { + int typmod = slot->tts_attinmeta->atttypmods[i]; appendBinaryStringInfo(buffer, cur, len); cur += len; + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + typmod = get_typioparam_mod(slot->tts_attinmeta->attioparams[i], typmod); + slot->tts_values[i] = InputFunctionCall(slot->tts_attinmeta->attinfuncs + i, buffer->data, slot->tts_attinmeta->attioparams[i], - slot->tts_attinmeta->atttypmods[i]); + typmod); slot->tts_isnull[i] = false; resetStringInfo(buffer); diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 3124b935..c7f180a5 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -335,6 +335,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) int natts = typeinfo->natts; int i; bool binary = false; + bool needEncodingConvert = false; #ifdef __TBASE__ if (end_query_requested) @@ -399,6 +400,12 @@ printtup(TupleTableSlot *slot, DestReceiver *self) pq_sendint(&buf, natts, 2); + /* encoding convert only on datanode when connect from coordinator node or connect from app */ + if (isPGXCDataNode && (IsConnFromCoord() || IsConnFromApp())) + { + needEncodingConvert = true; + } + /* * send the attributes of this tuple */ @@ -430,10 +437,20 @@ printtup(TupleTableSlot *slot, DestReceiver *self) char *outputstr; outputstr = OutputFunctionCall(&thisState->finfo, attr); + + if (needEncodingConvert) + { pq_sendcountedtext(&buf, outputstr, strlen(outputstr), false); } else { + int len = strlen(outputstr); + pq_sendint(&buf, len, 4); + appendBinaryStringInfo(&buf, outputstr, len); + } + } + else + { /* Binary output */ bytea *outputbytes; diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index d36cc392..4cd13fa4 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -501,7 +501,9 @@ ExecClearTuple(TupleTableSlot *slot) /* slot in which to store tuple */ heap_free_minimal_tuple(slot->tts_mintuple); #ifdef PGXC if (slot->tts_shouldFreeRow) + { pfree(slot->tts_datarow); + } slot->tts_shouldFreeRow = false; slot->tts_datarow = NULL; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 8f170abe..51d53bab 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2220,6 +2220,19 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) * memcmp() compares data from cachelines that are needed in L1 cache even * when the last comparison's result cannot be reused. */ +#ifdef __TBASE__ + /** + * on cn node, when client encoding is not equals server encoding, a1p is client encoding + * so must convert a1p to server encoding + */ + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + { + a1p = pg_client_to_server(a1p, strnlen(a1p, len1)); + len1 = strlen(a1p); + } +#endif + arg1_match = true; if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0) { @@ -2235,6 +2248,19 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) * it seems (at least with moderate to low cardinality sets), because * quicksort compares the same pivot against many values. 
*/ +#ifdef __TBASE__ + /** + * on cn node, when client encoding is not equals server encoding, a2p is client encoding + * so must convert a2p to server encoding + */ + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + { + a2p = pg_client_to_server(a2p, strnlen(a2p, len2)); + len2 = strlen(a2p); + } +#endif + if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0) { memcpy(sss->buf2, a2p, len2); diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c index e79eb2fd..16422b65 100644 --- a/src/backend/utils/mb/encnames.c +++ b/src/backend/utils/mb/encnames.c @@ -449,6 +449,11 @@ static const char *const pg_enc2icu_tbl[] = "CP1255", /* PG_WIN1255 */ "CP1257", /* PG_WIN1257 */ "KOI8-U", /* PG_KOI8U */ + NULL, /* Shift JIS (Windows-932) */ + NULL, /* Big5 (Windows-950) */ + "GBK", /* GBK (Windows-936) */ + NULL, /* UHC (Windows-949) */ + "GB18030", /* GB18030 */ }; bool @@ -462,7 +467,7 @@ get_encoding_name_for_icu(int encoding) { const char *icu_encoding_name; - StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1, + StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_SERVER_ENCODING_BE_LAST + 1, "pg_enc2icu_tbl incomplete"); icu_encoding_name = pg_enc2icu_tbl[encoding]; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index f466a0da..8d34efa4 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -574,6 +574,12 @@ pg_any_to_server(const char *s, int len, int encoding) if (len <= 0) return (char *) s; /* empty string is always valid */ + /* + * no need to convert on datanode node + */ + if (IsConnFromCoord() || IsConnFromDatanode()) + return (char *) s; + if (encoding == DatabaseEncoding->encoding || encoding == PG_SQL_ASCII) { diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 7344e44b..33e745d6 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1020,6 +1020,31 @@ pg_big5_dsplen(const unsigned char *s) /* * GBK */ +static int +pg_gbk2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */ + { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } + else /* should be ASCII */ + { + *to = *from++; + len--; + } + to++; + cnt++; + } + *to = 0; + return cnt; +} + static int pg_gbk_mblen(const unsigned char *s) { @@ -1075,6 +1100,42 @@ pg_uhc_dsplen(const unsigned char *s) * GB18030 * Added by Bill Huang , */ +static int +pg_gb18030_2_wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + if (IS_HIGHBIT_SET(*from) && len >= 2) /* 2 bytes */ + { + if (IS_GB18030_SET(*(from + 1)) && len >= 4) /* 4 bytes for CJK */ + { + *to = *from++ << 24; + *to |= *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + len -= 4; + } + else + { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } + } + else /* should be ASCII */ + { + *to = *from++; + len--; + } + to++; + cnt++; + } + *to = 0; + return cnt; +} + static int pg_gb18030_mblen(const unsigned char *s) { @@ -1766,9 +1827,9 @@ const pg_wchar_tbl pg_wchar_table[] = { {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ - 
{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ + {pg_gbk2wchar_with_len, pg_wchar2euc_with_len, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ - {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ + {pg_gb18030_2_wchar_with_len, pg_wchar2euc_with_len, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; diff --git a/src/include/c.h b/src/include/c.h index 4c2b1d98..d4a4033d 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1047,6 +1047,9 @@ typedef NameData *Name; /* msb for char */ #define HIGHBIT (0x80) #define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) +#define GB18030_2ND_MIX (0x30) +#define GB18030_2ND_MAX (0x39) +#define IS_GB18030_SET(ch) ((ch) <= GB18030_2ND_MAX && (ch) >= GB18030_2ND_MIX) #define STATUS_OK (0) #define STATUS_ERROR (-1) diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index bed8069b..44bb9b4a 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,18 +1,18 @@ /*------------------------------------------------------------------------- * * pg_wchar.h - * multibyte-character support + * multibyte-character support * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/mb/pg_wchar.h * - * NOTES - * This is used both by the backend and by libpq, but should not be - * included by libpq client programs. In particular, a libpq client - * should not assume that the encoding IDs used by the version of libpq - * it's linked to match up with the IDs declared here. + * NOTES + * This is used both by the backend and by libpq, but should not be + * included by libpq client programs. In particular, a libpq client + * should not assume that the encoding IDs used by the version of libpq + * it's linked to match up with the IDs declared here. * *------------------------------------------------------------------------- */ @@ -27,13 +27,13 @@ typedef unsigned int pg_wchar; /* * Maximum byte length of multibyte characters in any backend encoding */ -#define MAX_MULTIBYTE_CHAR_LEN 4 +#define MAX_MULTIBYTE_CHAR_LEN 4 /* * various definitions for EUC */ -#define SS2 0x8e /* single shift 2 (JIS0201) */ -#define SS3 0x8f /* single shift 3 (JIS0212) */ +#define SS2 0x8e /* single shift 2 (JIS0201) */ +#define SS3 0x8f /* single shift 3 (JIS0212) */ /* * SJIS validation macros @@ -54,28 +54,28 @@ typedef unsigned int pg_wchar; * 1) 1-byte ASCII characters. Each byte is below 0x80. * * 2) "Official" single byte charsets such as ISO-8859-1 (Latin1). - * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is - * an identifier for the charset (in the range 0x81 to 0x8d) and C1 - * is the character code (in the range 0xa0 to 0xff). + * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is + * an identifier for the charset (in the range 0x81 to 0x8d) and C1 + * is the character code (in the range 0xa0 to 0xff). * * 3) "Private" single byte charsets such as SISHENG. Each MULE - * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1 - * is a private-charset flag, LC12 is an identifier for the charset, - * and C1 is the character code (in the range 0xa0 to 0xff). 
- * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf) - * or 0x9b (if LC12 is in the range 0xe0 to 0xef). + * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1 + * is a private-charset flag, LC12 is an identifier for the charset, + * and C1 is the character code (in the range 0xa0 to 0xff). + * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf) + * or 0x9b (if LC12 is in the range 0xe0 to 0xef). * * 4) "Official" multibyte charsets such as JIS X0208. Each MULE - * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is - * an identifier for the charset (in the range 0x90 to 0x99) and C1 - * and C2 form the character code (each in the range 0xa0 to 0xff). + * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is + * an identifier for the charset (in the range 0x90 to 0x99) and C1 + * and C2 form the character code (each in the range 0xa0 to 0xff). * * 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3. - * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2, - * where LCPRV2 is a private-charset flag, LC22 is an identifier for - * the charset, and C1 and C2 form the character code (each in the range - * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0 - * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe). + * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2, + * where LCPRV2 is a private-charset flag, LC22 is an identifier for + * the charset, and C1 and C2 form the character code (each in the range + * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0 + * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe). * * "Official" encodings are those that have been assigned code numbers by * the XEmacs project; "private" encodings have Postgres-specific charset @@ -99,119 +99,119 @@ typedef unsigned int pg_wchar; /* * Charset IDs for official single byte encodings (0x81-0x8e) */ -#define LC_ISO8859_1 0x81 /* ISO8859 Latin 1 */ -#define LC_ISO8859_2 0x82 /* ISO8859 Latin 2 */ -#define LC_ISO8859_3 0x83 /* ISO8859 Latin 3 */ -#define LC_ISO8859_4 0x84 /* ISO8859 Latin 4 */ -#define LC_TIS620 0x85 /* Thai (not supported yet) */ -#define LC_ISO8859_7 0x86 /* Greek (not supported yet) */ -#define LC_ISO8859_6 0x87 /* Arabic (not supported yet) */ -#define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ -#define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ -#define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ +#define LC_ISO8859_1 0x81 /* ISO8859 Latin 1 */ +#define LC_ISO8859_2 0x82 /* ISO8859 Latin 2 */ +#define LC_ISO8859_3 0x83 /* ISO8859 Latin 3 */ +#define LC_ISO8859_4 0x84 /* ISO8859 Latin 4 */ +#define LC_TIS620 0x85 /* Thai (not supported yet) */ +#define LC_ISO8859_7 0x86 /* Greek (not supported yet) */ +#define LC_ISO8859_6 0x87 /* Arabic (not supported yet) */ +#define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ +#define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ +#define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ /* Note that 0x8b seems to be unused as of Emacs 20.7. * However, there might be a chance that 0x8b could be used * in later versions of Emacs. 
*/ -#define LC_KOI8_R 0x8b /* Cyrillic KOI8-R */ -#define LC_ISO8859_5 0x8c /* ISO8859 Cyrillic */ -#define LC_ISO8859_9 0x8d /* ISO8859 Latin 5 (not supported yet) */ -#define LC_ISO8859_15 0x8e /* ISO8859 Latin 15 (not supported yet) */ -/* #define CONTROL_1 0x8f control characters (unused) */ +#define LC_KOI8_R 0x8b /* Cyrillic KOI8-R */ +#define LC_ISO8859_5 0x8c /* ISO8859 Cyrillic */ +#define LC_ISO8859_9 0x8d /* ISO8859 Latin 5 (not supported yet) */ +#define LC_ISO8859_15 0x8e /* ISO8859 Latin 15 (not supported yet) */ +/* #define CONTROL_1 0x8f control characters (unused) */ /* Is a leading byte for "official" single byte encodings? */ -#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d) +#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d) /* * Charset IDs for official multibyte encodings (0x90-0x99) * 0x9a-0x9d are free. 0x9e and 0x9f are reserved. */ -#define LC_JISX0208_1978 0x90 /* Japanese Kanji, old JIS (not supported) */ -#define LC_GB2312_80 0x91 /* Chinese */ -#define LC_JISX0208 0x92 /* Japanese Kanji (JIS X 0208) */ -#define LC_KS5601 0x93 /* Korean */ -#define LC_JISX0212 0x94 /* Japanese Kanji (JIS X 0212) */ -#define LC_CNS11643_1 0x95 /* CNS 11643-1992 Plane 1 */ -#define LC_CNS11643_2 0x96 /* CNS 11643-1992 Plane 2 */ -#define LC_JISX0213_1 0x97 /* Japanese Kanji (JIS X 0213 Plane 1) - * (not supported) */ -#define LC_BIG5_1 0x98 /* Plane 1 Chinese traditional (not - * supported) */ -#define LC_BIG5_2 0x99 /* Plane 1 Chinese traditional (not - * supported) */ +#define LC_JISX0208_1978 0x90 /* Japanese Kanji, old JIS (not supported) */ +#define LC_GB2312_80 0x91 /* Chinese */ +#define LC_JISX0208 0x92 /* Japanese Kanji (JIS X 0208) */ +#define LC_KS5601 0x93 /* Korean */ +#define LC_JISX0212 0x94 /* Japanese Kanji (JIS X 0212) */ +#define LC_CNS11643_1 0x95 /* CNS 11643-1992 Plane 1 */ +#define LC_CNS11643_2 0x96 /* CNS 11643-1992 Plane 2 */ +#define LC_JISX0213_1 0x97 /* Japanese Kanji (JIS X 0213 Plane 1) + * (not supported) */ +#define LC_BIG5_1 0x98 /* Plane 1 Chinese traditional (not + * supported) */ +#define LC_BIG5_2 0x99 /* Plane 1 Chinese traditional (not + * supported) */ /* Is a leading byte for "official" multibyte encodings? 
*/ -#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) +#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) /* * Postgres-specific prefix bytes for "private" single byte encodings * (According to the MULE docs, we should be using 0x9e for this) */ -#define LCPRV1_A 0x9a -#define LCPRV1_B 0x9b -#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B) -#define IS_LCPRV1_A_RANGE(c) \ - ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf) -#define IS_LCPRV1_B_RANGE(c) \ - ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef) +#define LCPRV1_A 0x9a +#define LCPRV1_B 0x9b +#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B) +#define IS_LCPRV1_A_RANGE(c) \ + ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf) +#define IS_LCPRV1_B_RANGE(c) \ + ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef) /* * Postgres-specific prefix bytes for "private" multibyte encodings * (According to the MULE docs, we should be using 0x9f for this) */ -#define LCPRV2_A 0x9c -#define LCPRV2_B 0x9d -#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B) -#define IS_LCPRV2_A_RANGE(c) \ - ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4) -#define IS_LCPRV2_B_RANGE(c) \ - ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe) +#define LCPRV2_A 0x9c +#define LCPRV2_B 0x9d +#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B) +#define IS_LCPRV2_A_RANGE(c) \ + ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4) +#define IS_LCPRV2_B_RANGE(c) \ + ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe) /* * Charset IDs for private single byte encodings (0xa0-0xef) */ -#define LC_SISHENG 0xa0 /* Chinese SiSheng characters for - * PinYin/ZhuYin (not supported) */ -#define LC_IPA 0xa1 /* IPA (International Phonetic - * Association) (not supported) */ -#define LC_VISCII_LOWER 0xa2 /* Vietnamese VISCII1.1 lower-case (not - * supported) */ -#define LC_VISCII_UPPER 0xa3 /* Vietnamese VISCII1.1 upper-case (not - * supported) */ -#define LC_ARABIC_DIGIT 0xa4 /* Arabic digit (not supported) */ -#define LC_ARABIC_1_COLUMN 0xa5 /* Arabic 1-column (not supported) */ -#define LC_ASCII_RIGHT_TO_LEFT 0xa6 /* ASCII (left half of ISO8859-1) with - * right-to-left direction (not - * supported) */ -#define LC_LAO 0xa7 /* Lao characters (ISO10646 0E80..0EDF) - * (not supported) */ -#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 1-column (not supported) */ +#define LC_SISHENG 0xa0 /* Chinese SiSheng characters for + * PinYin/ZhuYin (not supported) */ +#define LC_IPA 0xa1 /* IPA (International Phonetic + * Association) (not supported) */ +#define LC_VISCII_LOWER 0xa2 /* Vietnamese VISCII1.1 lower-case (not + * supported) */ +#define LC_VISCII_UPPER 0xa3 /* Vietnamese VISCII1.1 upper-case (not + * supported) */ +#define LC_ARABIC_DIGIT 0xa4 /* Arabic digit (not supported) */ +#define LC_ARABIC_1_COLUMN 0xa5 /* Arabic 1-column (not supported) */ +#define LC_ASCII_RIGHT_TO_LEFT 0xa6 /* ASCII (left half of ISO8859-1) with + * right-to-left direction (not + * supported) */ +#define LC_LAO 0xa7 /* Lao characters (ISO10646 0E80..0EDF) + * (not supported) */ +#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 1-column (not supported) */ /* * Charset IDs for private multibyte encodings (0xf0-0xff) */ -#define LC_INDIAN_1_COLUMN 0xf0 /* Indian charset for 1-column width - * glyphs (not supported) */ -#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 
1-column width glyphs (not - * supported) */ -#define LC_UNICODE_SUBSET_2 0xf2 /* Unicode characters of the range - * U+2500..U+33FF. (not supported) */ -#define LC_UNICODE_SUBSET_3 0xf3 /* Unicode characters of the range - * U+E000..U+FFFF. (not supported) */ -#define LC_UNICODE_SUBSET 0xf4 /* Unicode characters of the range - * U+0100..U+24FF. (not supported) */ -#define LC_ETHIOPIC 0xf5 /* Ethiopic characters (not supported) */ -#define LC_CNS11643_3 0xf6 /* CNS 11643-1992 Plane 3 */ -#define LC_CNS11643_4 0xf7 /* CNS 11643-1992 Plane 4 */ -#define LC_CNS11643_5 0xf8 /* CNS 11643-1992 Plane 5 */ -#define LC_CNS11643_6 0xf9 /* CNS 11643-1992 Plane 6 */ -#define LC_CNS11643_7 0xfa /* CNS 11643-1992 Plane 7 */ -#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width - * glyphs (not supported) */ -#define LC_TIBETAN 0xfc /* Tibetan (not supported) */ -/* #define FREE 0xfd free (unused) */ -/* #define FREE 0xfe free (unused) */ -/* #define FREE 0xff free (unused) */ +#define LC_INDIAN_1_COLUMN 0xf0 /* Indian charset for 1-column width + * glyphs (not supported) */ +#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 1-column width glyphs (not + * supported) */ +#define LC_UNICODE_SUBSET_2 0xf2 /* Unicode characters of the range + * U+2500..U+33FF. (not supported) */ +#define LC_UNICODE_SUBSET_3 0xf3 /* Unicode characters of the range + * U+E000..U+FFFF. (not supported) */ +#define LC_UNICODE_SUBSET 0xf4 /* Unicode characters of the range + * U+0100..U+24FF. (not supported) */ +#define LC_ETHIOPIC 0xf5 /* Ethiopic characters (not supported) */ +#define LC_CNS11643_3 0xf6 /* CNS 11643-1992 Plane 3 */ +#define LC_CNS11643_4 0xf7 /* CNS 11643-1992 Plane 4 */ +#define LC_CNS11643_5 0xf8 /* CNS 11643-1992 Plane 5 */ +#define LC_CNS11643_6 0xf9 /* CNS 11643-1992 Plane 6 */ +#define LC_CNS11643_7 0xfa /* CNS 11643-1992 Plane 7 */ +#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width + * glyphs (not supported) */ +#define LC_TIBETAN 0xfc /* Tibetan (not supported) */ +/* #define FREE 0xfd free (unused) */ +/* #define FREE 0xfe free (unused) */ +/* #define FREE 0xff free (unused) */ /*---------------------------------------------------- * end of MULE stuff @@ -222,87 +222,88 @@ typedef unsigned int pg_wchar; * PostgreSQL encoding identifiers * * WARNING: the order of this enum must be same as order of entries - * in the pg_enc2name_tbl[] array (in mb/encnames.c), and - * in the pg_wchar_table[] array (in mb/wchar.c)! + * in the pg_enc2name_tbl[] array (in mb/encnames.c), and + * in the pg_wchar_table[] array (in mb/wchar.c)! * - * If you add some encoding don't forget to check - * PG_ENCODING_BE_LAST macro. + * If you add some encoding don't forget to check + * PG_ENCODING_BE_LAST macro. * * PG_SQL_ASCII is default encoding and must be = 0. * - * XXX We must avoid renumbering any backend encoding until libpq's major + * XXX We must avoid renumbering any backend encoding until libpq's major * version number is increased beyond 5; it turns out that the backend * encoding IDs are effectively part of libpq's ABI as far as 8.2 initdb and * psql are concerned. 
*/ typedef enum pg_enc { - PG_SQL_ASCII = 0, /* SQL/ASCII */ - PG_EUC_JP, /* EUC for Japanese */ - PG_EUC_CN, /* EUC for Chinese */ - PG_EUC_KR, /* EUC for Korean */ - PG_EUC_TW, /* EUC for Taiwan */ - PG_EUC_JIS_2004, /* EUC-JIS-2004 */ - PG_UTF8, /* Unicode UTF8 */ - PG_MULE_INTERNAL, /* Mule internal code */ - PG_LATIN1, /* ISO-8859-1 Latin 1 */ - PG_LATIN2, /* ISO-8859-2 Latin 2 */ - PG_LATIN3, /* ISO-8859-3 Latin 3 */ - PG_LATIN4, /* ISO-8859-4 Latin 4 */ - PG_LATIN5, /* ISO-8859-9 Latin 5 */ - PG_LATIN6, /* ISO-8859-10 Latin6 */ - PG_LATIN7, /* ISO-8859-13 Latin7 */ - PG_LATIN8, /* ISO-8859-14 Latin8 */ - PG_LATIN9, /* ISO-8859-15 Latin9 */ - PG_LATIN10, /* ISO-8859-16 Latin10 */ - PG_WIN1256, /* windows-1256 */ - PG_WIN1258, /* Windows-1258 */ - PG_WIN866, /* (MS-DOS CP866) */ - PG_WIN874, /* windows-874 */ - PG_KOI8R, /* KOI8-R */ - PG_WIN1251, /* windows-1251 */ - PG_WIN1252, /* windows-1252 */ - PG_ISO_8859_5, /* ISO-8859-5 */ - PG_ISO_8859_6, /* ISO-8859-6 */ - PG_ISO_8859_7, /* ISO-8859-7 */ - PG_ISO_8859_8, /* ISO-8859-8 */ - PG_WIN1250, /* windows-1250 */ - PG_WIN1253, /* windows-1253 */ - PG_WIN1254, /* windows-1254 */ - PG_WIN1255, /* windows-1255 */ - PG_WIN1257, /* windows-1257 */ - PG_KOI8U, /* KOI8-U */ - /* PG_ENCODING_BE_LAST points to the above entry */ - - /* followings are for client encoding only */ - PG_SJIS, /* Shift JIS (Windows-932) */ - PG_BIG5, /* Big5 (Windows-950) */ - PG_GBK, /* GBK (Windows-936) */ - PG_UHC, /* UHC (Windows-949) */ - PG_GB18030, /* GB18030 */ - PG_JOHAB, /* EUC for Korean JOHAB */ - PG_SHIFT_JIS_2004, /* Shift-JIS-2004 */ - _PG_LAST_ENCODING_ /* mark only */ + PG_SQL_ASCII = 0, /* SQL/ASCII */ + PG_EUC_JP, /* EUC for Japanese */ + PG_EUC_CN, /* EUC for Chinese */ + PG_EUC_KR, /* EUC for Korean */ + PG_EUC_TW, /* EUC for Taiwan */ + PG_EUC_JIS_2004, /* EUC-JIS-2004 */ + PG_UTF8, /* Unicode UTF8 */ + PG_MULE_INTERNAL, /* Mule internal code */ + PG_LATIN1, /* ISO-8859-1 Latin 1 */ + PG_LATIN2, /* ISO-8859-2 Latin 2 */ + PG_LATIN3, /* ISO-8859-3 Latin 3 */ + PG_LATIN4, /* ISO-8859-4 Latin 4 */ + PG_LATIN5, /* ISO-8859-9 Latin 5 */ + PG_LATIN6, /* ISO-8859-10 Latin6 */ + PG_LATIN7, /* ISO-8859-13 Latin7 */ + PG_LATIN8, /* ISO-8859-14 Latin8 */ + PG_LATIN9, /* ISO-8859-15 Latin9 */ + PG_LATIN10, /* ISO-8859-16 Latin10 */ + PG_WIN1256, /* windows-1256 */ + PG_WIN1258, /* Windows-1258 */ + PG_WIN866, /* (MS-DOS CP866) */ + PG_WIN874, /* windows-874 */ + PG_KOI8R, /* KOI8-R */ + PG_WIN1251, /* windows-1251 */ + PG_WIN1252, /* windows-1252 */ + PG_ISO_8859_5, /* ISO-8859-5 */ + PG_ISO_8859_6, /* ISO-8859-6 */ + PG_ISO_8859_7, /* ISO-8859-7 */ + PG_ISO_8859_8, /* ISO-8859-8 */ + PG_WIN1250, /* windows-1250 */ + PG_WIN1253, /* windows-1253 */ + PG_WIN1254, /* windows-1254 */ + PG_WIN1255, /* windows-1255 */ + PG_WIN1257, /* windows-1257 */ + PG_KOI8U, /* KOI8-U */ + /* PG_ENCODING_BE_LAST points to the above entry */ + + /* followings are for client encoding only */ + PG_SJIS, /* Shift JIS (Windows-932) */ + PG_BIG5, /* Big5 (Windows-950) */ + PG_GBK, /* GBK (Windows-936) */ + PG_UHC, /* UHC (Windows-949) */ + PG_GB18030, /* GB18030 */ + PG_JOHAB, /* EUC for Korean JOHAB */ + PG_SHIFT_JIS_2004, /* Shift-JIS-2004 */ + _PG_LAST_ENCODING_ /* mark only */ } pg_enc; #define PG_ENCODING_BE_LAST PG_KOI8U +#define PG_SERVER_ENCODING_BE_LAST PG_GB18030 /* * Please use these tests before access to pg_encconv_tbl[] * or to other places... 
*/ #define PG_VALID_BE_ENCODING(_enc) \ - ((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST) + (((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST) || (_enc) == PG_GBK || (_enc) == PG_GB18030) #define PG_ENCODING_IS_CLIENT_ONLY(_enc) \ - ((_enc) > PG_ENCODING_BE_LAST && (_enc) < _PG_LAST_ENCODING_) + ((_enc) > PG_ENCODING_BE_LAST && (_enc) < _PG_LAST_ENCODING_ && (_enc) != PG_GBK && (_enc) != PG_GB18030) #define PG_VALID_ENCODING(_enc) \ - ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) + ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) /* On FE are possible all encodings */ -#define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) +#define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) /* * Table for mapping an encoding number to official encoding name and @@ -310,14 +311,14 @@ typedef enum pg_enc * before accessing a table entry! * * if (PG_VALID_ENCODING(encoding)) - * pg_enc2name_tbl[ encoding ]; + * pg_enc2name_tbl[ encoding ]; */ typedef struct pg_enc2name { - const char *name; - pg_enc encoding; + const char *name; + pg_enc encoding; #ifdef WIN32 - unsigned codepage; /* codepage for WIN32 */ + unsigned codepage; /* codepage for WIN32 */ #endif } pg_enc2name; @@ -328,8 +329,8 @@ extern const pg_enc2name pg_enc2name_tbl[]; */ typedef struct pg_enc2gettext { - pg_enc encoding; - const char *name; + pg_enc encoding; + const char *name; } pg_enc2gettext; extern const pg_enc2gettext pg_enc2gettext_tbl[]; @@ -344,12 +345,12 @@ extern const char *get_encoding_name_for_icu(int encoding); * pg_wchar stuff */ typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, - pg_wchar *to, - int len); + pg_wchar *to, + int len); typedef int (*wchar2mb_with_len_converter) (const pg_wchar *from, - unsigned char *to, - int len); + unsigned char *to, + int len); typedef int (*mblen_converter) (const unsigned char *mbstr); @@ -361,14 +362,14 @@ typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct { - mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte - * string to a wchar */ - wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string - * to a multibyte */ - mblen_converter mblen; /* get byte length of a char */ - mbdisplaylen_converter dsplen; /* get display width of a char */ - mbverifier mbverify; /* verify multibyte sequence */ - int maxmblen; /* max bytes for a char in this encoding */ + mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte + * string to a wchar */ + wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string + * to a multibyte */ + mblen_converter mblen; /* get byte length of a char */ + mbdisplaylen_converter dsplen; /* get display width of a char */ + mbverifier mbverify; /* verify multibyte sequence */ + int maxmblen; /* max bytes for a char in this encoding */ } pg_wchar_tbl; extern const pg_wchar_tbl pg_wchar_table[]; @@ -384,8 +385,8 @@ extern const pg_wchar_tbl pg_wchar_table[]; * * 1. Using a radix tree, from source to destination code. * 2. Using a sorted array of source -> destination code pairs. This - * method is used for "combining" characters. There are so few of - * them that building a radix tree would be wasteful. + * method is used for "combining" characters. There are so few of + * them that building a radix tree would be wasteful. * 3. Using a conversion function. */ @@ -415,44 +416,44 @@ extern const pg_wchar_tbl pg_wchar_table[]; */ typedef struct { - /* - * Array containing all the values. 
Only one of chars16 or chars32 is - * used, depending on how wide the values we need to represent are. - */ - const uint16 *chars16; - const uint32 *chars32; - - /* Radix tree for 1-byte inputs */ - uint32 b1root; /* offset of table in the chars[16|32] array */ - uint8 b1_lower; /* min allowed value for a single byte input */ - uint8 b1_upper; /* max allowed value for a single byte input */ - - /* Radix tree for 2-byte inputs */ - uint32 b2root; /* offset of 1st byte's table */ - uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b2_1_upper; - uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b2_2_upper; - - /* Radix tree for 3-byte inputs */ - uint32 b3root; /* offset of 1st byte's table */ - uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b3_1_upper; - uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b3_2_upper; - uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ - uint8 b3_3_upper; - - /* Radix tree for 4-byte inputs */ - uint32 b4root; /* offset of 1st byte's table */ - uint8 b4_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b4_1_upper; - uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b4_2_upper; - uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ - uint8 b4_3_upper; - uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ - uint8 b4_4_upper; + /* + * Array containing all the values. Only one of chars16 or chars32 is + * used, depending on how wide the values we need to represent are. + */ + const uint16 *chars16; + const uint32 *chars32; + + /* Radix tree for 1-byte inputs */ + uint32 b1root; /* offset of table in the chars[16|32] array */ + uint8 b1_lower; /* min allowed value for a single byte input */ + uint8 b1_upper; /* max allowed value for a single byte input */ + + /* Radix tree for 2-byte inputs */ + uint32 b2root; /* offset of 1st byte's table */ + uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b2_1_upper; + uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b2_2_upper; + + /* Radix tree for 3-byte inputs */ + uint32 b3root; /* offset of 1st byte's table */ + uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b3_1_upper; + uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b3_2_upper; + uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b3_3_upper; + + /* Radix tree for 4-byte inputs */ + uint32 b4root; /* offset of 1st byte's table */ + uint8 b4_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b4_1_upper; + uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b4_2_upper; + uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b4_3_upper; + uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ + uint8 b4_4_upper; } pg_mb_radix_tree; @@ -461,9 +462,9 @@ typedef struct */ typedef struct { - uint32 utf1; /* UTF-8 code 1 */ - uint32 utf2; /* UTF-8 code 2 */ - uint32 code; /* local code */ + uint32 utf1; /* UTF-8 code 1 */ + uint32 utf2; /* UTF-8 code 2 */ + uint32 code; /* local code */ } pg_utf_to_local_combined; /* @@ -471,9 +472,9 @@ typedef struct */ typedef struct { - uint32 code; /* local code */ - uint32 utf1; /* UTF-8 code 1 */ - uint32 utf2; /* UTF-8 code 2 */ + uint32 code; /* local code */ + uint32 utf1; /* UTF-8 code 1 */ + uint32 utf2; /* UTF-8 code 2 */ } pg_local_to_utf_combined; /* @@ -490,79 +491,79 @@ typedef 
uint32 (*utf_local_conversion_func) (uint32 code); * used by frontends.) */ #define CHECK_ENCODING_CONVERSION_ARGS(srcencoding,destencoding) \ - check_encoding_conversion_args(PG_GETARG_INT32(0), \ - PG_GETARG_INT32(1), \ - PG_GETARG_INT32(4), \ - (srcencoding), \ - (destencoding)) + check_encoding_conversion_args(PG_GETARG_INT32(0), \ + PG_GETARG_INT32(1), \ + PG_GETARG_INT32(4), \ + (srcencoding), \ + (destencoding)) /* * These functions are considered part of libpq's exported API and * are also declared in libpq-fe.h. */ -extern int pg_char_to_encoding(const char *name); +extern int pg_char_to_encoding(const char *name); extern const char *pg_encoding_to_char(int encoding); -extern int pg_valid_server_encoding_id(int encoding); +extern int pg_valid_server_encoding_id(int encoding); /* * Remaining functions are not considered part of libpq's API, though many * of them do exist inside libpq. */ -extern int pg_mb2wchar(const char *from, pg_wchar *to); -extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); +extern int pg_mb2wchar(const char *from, pg_wchar *to); +extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); extern int pg_encoding_mb2wchar_with_len(int encoding, - const char *from, pg_wchar *to, int len); -extern int pg_wchar2mb(const pg_wchar *from, char *to); -extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); + const char *from, pg_wchar *to, int len); +extern int pg_wchar2mb(const pg_wchar *from, char *to); +extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); extern int pg_encoding_wchar2mb_with_len(int encoding, - const pg_wchar *from, char *to, int len); -extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); -extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); -extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); + const pg_wchar *from, char *to, int len); +extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); +extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); +extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); extern size_t pg_wchar_strlen(const pg_wchar *wstr); -extern int pg_mblen(const char *mbstr); -extern int pg_dsplen(const char *mbstr); -extern int pg_encoding_mblen(int encoding, const char *mbstr); -extern int pg_encoding_dsplen(int encoding, const char *mbstr); -extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); -extern int pg_mule_mblen(const unsigned char *mbstr); -extern int pg_mic_mblen(const unsigned char *mbstr); -extern int pg_mbstrlen(const char *mbstr); -extern int pg_mbstrlen_with_len(const char *mbstr, int len); -extern int pg_mbcliplen(const char *mbstr, int len, int limit); +extern int pg_mblen(const char *mbstr); +extern int pg_dsplen(const char *mbstr); +extern int pg_encoding_mblen(int encoding, const char *mbstr); +extern int pg_encoding_dsplen(int encoding, const char *mbstr); +extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); +extern int pg_mule_mblen(const unsigned char *mbstr); +extern int pg_mic_mblen(const unsigned char *mbstr); +extern int pg_mbstrlen(const char *mbstr); +extern int pg_mbstrlen_with_len(const char *mbstr, int len); +extern int pg_mbcliplen(const char *mbstr, int len, int limit); extern int pg_encoding_mbcliplen(int encoding, const char *mbstr, - int len, int limit); -extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); -extern 
int pg_encoding_max_length(int encoding); -extern int pg_database_encoding_max_length(void); + int len, int limit); +extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); +extern int pg_encoding_max_length(int encoding); +extern int pg_database_encoding_max_length(void); extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void); -extern int PrepareClientEncoding(int encoding); -extern int SetClientEncoding(int encoding); +extern int PrepareClientEncoding(int encoding); +extern int SetClientEncoding(int encoding); extern void InitializeClientEncoding(void); -extern int pg_get_client_encoding(void); +extern int pg_get_client_encoding(void); extern const char *pg_get_client_encoding_name(void); extern void SetDatabaseEncoding(int encoding); -extern int GetDatabaseEncoding(void); +extern int GetDatabaseEncoding(void); extern const char *GetDatabaseEncodingName(void); extern void SetMessageEncoding(int encoding); -extern int GetMessageEncoding(void); +extern int GetMessageEncoding(void); #ifdef ENABLE_NLS -extern int pg_bind_textdomain_codeset(const char *domainname); +extern int pg_bind_textdomain_codeset(const char *domainname); #endif -extern int pg_valid_client_encoding(const char *name); -extern int pg_valid_server_encoding(const char *name); +extern int pg_valid_client_encoding(const char *name); +extern int pg_valid_server_encoding(const char *name); extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string); extern pg_wchar utf8_to_unicode(const unsigned char *c); -extern int pg_utf_mblen(const unsigned char *); +extern int pg_utf_mblen(const unsigned char *); extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, - int src_encoding, - int dest_encoding); + int src_encoding, + int dest_encoding); extern char *pg_client_to_server(const char *s, int len); extern char *pg_server_to_client(const char *s, int len); @@ -573,48 +574,48 @@ extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); extern void UtfToLocal(const unsigned char *utf, int len, - unsigned char *iso, - const pg_mb_radix_tree *map, - const pg_utf_to_local_combined *cmap, int cmapsize, - utf_local_conversion_func conv_func, - int encoding); + unsigned char *iso, + const pg_mb_radix_tree *map, + const pg_utf_to_local_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding); extern void LocalToUtf(const unsigned char *iso, int len, - unsigned char *utf, - const pg_mb_radix_tree *map, - const pg_local_to_utf_combined *cmap, int cmapsize, - utf_local_conversion_func conv_func, - int encoding); + unsigned char *utf, + const pg_mb_radix_tree *map, + const pg_local_to_utf_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding); extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, - bool noError); + bool noError); extern int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, - bool noError); + bool noError); extern void check_encoding_conversion_args(int src_encoding, - int dest_encoding, - int len, - int expected_src_encoding, - int expected_dest_encoding); + int dest_encoding, + int len, + int expected_src_encoding, + int expected_dest_encoding); extern void report_invalid_encoding(int encoding, const char *mbstr, int len) pg_attribute_noreturn(); extern void report_untranslatable_char(int src_encoding, int 
dest_encoding, - const char *mbstr, int len) pg_attribute_noreturn(); + const char *mbstr, int len) pg_attribute_noreturn(); extern void local2local(const unsigned char *l, unsigned char *p, int len, - int src_encoding, int dest_encoding, const unsigned char *tab); + int src_encoding, int dest_encoding, const unsigned char *tab); extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len); extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len); extern void latin2mic(const unsigned char *l, unsigned char *p, int len, - int lc, int encoding); + int lc, int encoding); extern void mic2latin(const unsigned char *mic, unsigned char *p, int len, - int lc, int encoding); + int lc, int encoding); extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, - int len, int lc, int encoding, - const unsigned char *tab); + int len, int lc, int encoding, + const unsigned char *tab); extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, - int len, int lc, int encoding, - const unsigned char *tab); + int len, int lc, int encoding, + const unsigned char *tab); extern bool pg_utf8_islegal(const unsigned char *source, int length); @@ -622,4 +623,4 @@ extern bool pg_utf8_islegal(const unsigned char *source, int length); extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); #endif -#endif /* PG_WCHAR_H */ +#endif /* PG_WCHAR_H */ diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 01debacc..237482a1 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2238,7 +2238,7 @@ CREATE VIEW rls_view AS SELECT * FROM z1 WHERE f_leak(b) order by 1; GRANT SELECT ON rls_view TO regress_rls_bob; -- Query as role that is not owner of view or table. Should return all records. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 1 | aba @@ -2259,7 +2259,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view order by 1; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 1 | aba @@ -2286,7 +2286,7 @@ GRANT SELECT ON rls_view TO regress_rls_alice; -- Query as role that is not owner of view but is owner of table. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb @@ -2306,7 +2306,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not owner of table but is owner of view. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb @@ -2332,7 +2332,7 @@ ERROR: permission denied for relation rls_view -- Query as role that is not the owner of the table or view with permissions. 
SET SESSION AUTHORIZATION regress_rls_bob; GRANT SELECT ON rls_view TO regress_rls_carol; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb diff --git a/src/test/regress/expected/zhcn_gb18030.out b/src/test/regress/expected/zhcn_gb18030.out new file mode 100644 index 00000000..e330ad5c --- /dev/null +++ b/src/test/regress/expected/zhcn_gb18030.out @@ -0,0 +1,132 @@ +-- +-- gbk +-- +\c db_gbk; +SET client_encoding = gbk; +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +DROP TABLE tbl_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('�˶���'); +INSERT INTO tbl_gbk (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('���F��'); +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gbk (f1) VALUES ('��һλ'); +INSERT INTO tbl_gbk (f1) VALUES ('����'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�2'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + f1 +-------- + �˶��� + ����� + ���Ұ� + ��һλ + ���� + ���F�� +(6 rows) + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + octet_length +-------------- + 6 + 6 + 6 + 6 + 4 + 6 +(6 rows) + +-- +-- gb18030 +-- +\c db_gb18030; +SET client_encoding = gb18030; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 +(1 row) + +DROP TABLE tbl_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('�˶���'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('���F��'); +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gb18030 (f1) VALUES ('��һλ'); +INSERT INTO tbl_gb18030 (f1) VALUES ('����'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�3'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�2'); +ERROR: value too long for type character varying(3) +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�32'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 + �˶��� + ����� + ���Ұ� + ��һλ + ���� + ���F�� +(7 rows) + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 +(1 row) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + octet_length +-------------- + 12 + 6 + 6 + 6 + 6 + 4 + 6 +(7 rows) + diff --git a/src/test/regress/expected/zhcn_utf8.out b/src/test/regress/expected/zhcn_utf8.out new file mode 100644 index 00000000..a3ecc8e2 --- /dev/null +++ b/src/test/regress/expected/zhcn_utf8.out @@ -0,0 +1,264 @@ +-- +-- gbk +-- +CREATE DATABASE db_gbk template template0 encoding = gbk LC_COLLATE = 'zh_CN.gbk' LC_CTYPE = 'zh_CN.gbk'; +\c db_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gbk (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on 
gbk +INSERT INTO tbl_gbk (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gbk (f1) VALUES ('王家坝'); +INSERT INTO tbl_gbk (f1) VALUES ('王一位'); +INSERT INTO tbl_gbk (f1) VALUES ('怡宝'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('王家坝2'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + f1 +-------- + 邓东宝 + 李尔王 + 王家坝 + 王一位 + 怡宝 + 朱镕非 +(6 rows) + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^王' ORDER BY f1; + f1 +-------- + 王家坝 + 王一位 +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + octet_length +-------------- + 6 + 6 + 6 + 6 + 4 + 6 +(6 rows) + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '张雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by i; + i | n | id | name +---+--------+----+------ + 1 | 韩梅梅 | 1 | 叶子 + 2 | 张雷 | 2 | 蓝天 +(2 rows) + +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by name; + i | n | id | name +---+--------+----+------ + 2 | 张雷 | 2 | 蓝天 + 1 | 韩梅梅 | 1 | 叶子 +(2 rows) + +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by n; + i | n | id | name +---+--------+----+------ + 1 | 韩梅梅 | 1 | 叶子 + 2 | 张雷 | 2 | 蓝天 +(2 rows) + +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; +-- +-- gb18030 +-- +CREATE DATABASE db_gb18030 template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c db_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王一位'); +INSERT INTO tbl_gb18030 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝2'); +ERROR: value too long for type character varying(3) +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧2'); +ERROR: value too long for type character varying(3) +-- text +CREATE TABLE tbl_text(i int, f1 text); +INSERT INTO tbl_text (f1) VALUES ('邓东宝'); +INSERT INTO tbl_text (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('朱镕非'); +INSERT INTO tbl_text (f1) VALUES ('王家坝'); +INSERT INTO tbl_text (f1) VALUES ('王一位'); +INSERT INTO tbl_text (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_text ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- nvarchar2 +CREATE TABLE tbl_nvarchar2(i int, f1 nvarchar2(3) ); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王家坝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王一位'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('€𣘗𧄧'); 
+SELECT * FROM tbl_nvarchar2 ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- bpchar +CREATE TABLE tbl_bpchar(i int, f1 bpchar(3) ); +INSERT INTO tbl_bpchar (f1) VALUES ('邓东宝'); +INSERT INTO tbl_bpchar (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('朱镕非'); +INSERT INTO tbl_bpchar (f1) VALUES ('王家坝'); +INSERT INTO tbl_bpchar (f1) VALUES ('王一位'); +INSERT INTO tbl_bpchar (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_bpchar ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- char +CREATE TABLE tbl_char(i int, f1 char(3) ); +INSERT INTO tbl_char (f1) VALUES ('邓东宝'); +INSERT INTO tbl_char (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('朱镕非'); +INSERT INTO tbl_char (f1) VALUES ('王家坝'); +INSERT INTO tbl_char (f1) VALUES ('王家1'); +INSERT INTO tbl_char (f1) VALUES ('王家2'); +INSERT INTO tbl_char (f1) VALUES ('王一位'); +INSERT INTO tbl_char (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_char ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家1 + | 王家2 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(9 rows) + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + f1 +------------ + \u0080𣘗𧄧 + 邓东宝 + 李尔王 + 王家坝 + 王一位 + 怡宝 + 朱镕非 +(7 rows) + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^王' ORDER BY f1; + f1 +-------- + 王家坝 + 王一位 +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + octet_length +-------------- + 12 + 6 + 6 + 6 + 6 + 4 + 6 +(7 rows) + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '李雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by i; + id | name | i | n +----+------+---+-------- + 1 | 叶子 | 1 | 韩梅梅 + 2 | 蓝天 | 2 | 李雷 +(2 rows) + +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by name; + id | name | i | n +----+------+---+-------- + 2 | 蓝天 | 2 | 李雷 + 1 | 叶子 | 1 | 韩梅梅 +(2 rows) + +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by n; + id | name | i | n +----+------+---+-------- + 1 | 叶子 | 1 | 韩梅梅 + 2 | 蓝天 | 2 | 李雷 +(2 rows) + +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 4f52d0f8..ebd01715 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -23,10 +23,10 @@ test: tablespace # ---------- # The first group of parallel tests # ---------- -test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money rangetypes pg_lsn regproc +test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money rangetypes pg_lsn regproc zhcn_utf8 # Depends on things setup during char, varchar and text -test: strings +test: strings zhcn_gb18030 # Depends on int2, int4, int8, float4, float8 test: numerology diff --git 
a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index ec698123..f903bc43 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -871,7 +871,8 @@ set_node_config_file(PGXCNodeTypeNum node) fputs("log_min_messages = log\n", pg_conf); fputs("log_min_error_statement = log\n", pg_conf); - fputs("max_connections = 300\n", pg_conf); + fputs("max_connections = 500\n", pg_conf); + fputs("max_pool_size = 500\n", pg_conf); fputs("max_worker_processes = 256\n", pg_conf); fputs("max_parallel_workers = 256\n", pg_conf); fputs("enable_statistic = on\n", pg_conf); diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 0371de42..04781232 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -34,6 +34,8 @@ test: rangetypes test: pg_lsn test: regproc test: strings +test: zhcn_utf8 +test: zhcn_gb18030 test: numerology test: point test: lseg diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 3fa55ccc..4ed98e68 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -887,12 +887,12 @@ GRANT SELECT ON rls_view TO regress_rls_bob; -- Query as role that is not owner of view or table. Should return all records. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view order by 1; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; DROP VIEW rls_view; @@ -904,13 +904,13 @@ GRANT SELECT ON rls_view TO regress_rls_alice; -- Query as role that is not owner of view but is owner of table. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not owner of table but is owner of view. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not the owner of the table or view without permissions. @@ -921,7 +921,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; --fail - permission denied. -- Query as role that is not the owner of the table or view with permissions. 
SET SESSION AUTHORIZATION regress_rls_bob; GRANT SELECT ON rls_view TO regress_rls_carol; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; SET SESSION AUTHORIZATION regress_rls_bob; diff --git a/src/test/regress/sql/zhcn_gb18030.sql b/src/test/regress/sql/zhcn_gb18030.sql new file mode 100644 index 00000000..3846d9a6 --- /dev/null +++ b/src/test/regress/sql/zhcn_gb18030.sql @@ -0,0 +1,65 @@ +-- +-- gbk +-- +\c db_gbk; +SET client_encoding = gbk; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + +DROP TABLE tbl_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('�˶���'); +INSERT INTO tbl_gbk (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('���F��'); +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gbk (f1) VALUES ('��һλ'); +INSERT INTO tbl_gbk (f1) VALUES ('����'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�2'); + +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + + +-- +-- gb18030 +-- +\c db_gb18030; + +SET client_encoding = gb18030; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + +DROP TABLE tbl_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('�˶���'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('���F��'); +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gb18030 (f1) VALUES ('��һλ'); +INSERT INTO tbl_gb18030 (f1) VALUES ('����'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�3'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�2'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�32'); + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + diff --git a/src/test/regress/sql/zhcn_utf8.sql b/src/test/regress/sql/zhcn_utf8.sql new file mode 100644 index 00000000..764647f1 --- /dev/null +++ b/src/test/regress/sql/zhcn_utf8.sql @@ -0,0 +1,140 @@ +-- +-- gbk +-- +CREATE DATABASE db_gbk template template0 encoding = gbk LC_COLLATE = 'zh_CN.gbk' LC_CTYPE = 'zh_CN.gbk'; +\c db_gbk; + +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gbk (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gbk (f1) VALUES ('王家坝'); +INSERT INTO tbl_gbk (f1) VALUES ('王一位'); +INSERT INTO tbl_gbk (f1) VALUES ('怡宝'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('王家坝2'); + +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^王' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '张雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO 
T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by i; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by name; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by n; +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; + +-- +-- gb18030 +-- +CREATE DATABASE db_gb18030 template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c db_gb18030; + +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王一位'); +INSERT INTO tbl_gb18030 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝2'); +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧2'); + +-- text +CREATE TABLE tbl_text(i int, f1 text); +INSERT INTO tbl_text (f1) VALUES ('邓东宝'); +INSERT INTO tbl_text (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('朱镕非'); +INSERT INTO tbl_text (f1) VALUES ('王家坝'); +INSERT INTO tbl_text (f1) VALUES ('王一位'); +INSERT INTO tbl_text (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_text ORDER BY f1; + +-- nvarchar2 +CREATE TABLE tbl_nvarchar2(i int, f1 nvarchar2(3) ); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王家坝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王一位'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_nvarchar2 ORDER BY f1; + +-- bpchar +CREATE TABLE tbl_bpchar(i int, f1 bpchar(3) ); +INSERT INTO tbl_bpchar (f1) VALUES ('邓东宝'); +INSERT INTO tbl_bpchar (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('朱镕非'); +INSERT INTO tbl_bpchar (f1) VALUES ('王家坝'); +INSERT INTO tbl_bpchar (f1) VALUES ('王一位'); +INSERT INTO tbl_bpchar (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_bpchar ORDER BY f1; + +-- char +CREATE TABLE tbl_char(i int, f1 char(3) ); +INSERT INTO tbl_char (f1) VALUES ('邓东宝'); +INSERT INTO tbl_char (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('朱镕非'); +INSERT INTO tbl_char (f1) VALUES ('王家坝'); +INSERT INTO tbl_char (f1) VALUES ('王家1'); +INSERT INTO tbl_char (f1) VALUES ('王家2'); +INSERT INTO tbl_char (f1) VALUES ('王一位'); +INSERT INTO tbl_char (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_char ORDER BY f1; + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^王' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) 
FROM tbl_gb18030 ORDER BY f1; + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '李雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by i; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by name; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by n; +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; From e6f7cc721ebf602112ba3bab9fc05df09dfd4904 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sat, 6 Feb 2021 12:54:03 +0800 Subject: [PATCH 127/578] Replicated distribution support composite type --- src/backend/access/common/printtup.c | 38 ++++++++++++++++++++++++++++ src/test/regress/expected/rules.out | 13 ++++------ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index c7f180a5..a9b0b09b 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -29,9 +29,12 @@ #include "miscadmin.h" #ifdef __TBASE__ +#include "access/htup_details.h" +#include "catalog/pg_type.h" #include "postmaster/postmaster.h" #include "pgxc/squeue.h" #include "executor/executor.h" +#include "utils/typcache.h" extern bool IsAbortedTransactionBlockState(void); #endif static void printtup_startup(DestReceiver *self, int operation, @@ -444,6 +447,41 @@ printtup(TupleTableSlot *slot, DestReceiver *self) } else { +#ifdef __TBASE__ + if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) + { + Oid tupType; + int32 tupTypmod; + TupleDesc tupdesc; + uint32 n32; + StringInfoData tupdesc_data; + HeapTupleHeader rec; + /* RECORD must be varlena */ + Datum attr_detoast = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i])); + + rec = DatumGetHeapTupleHeader(attr_detoast); + + initStringInfo(&tupdesc_data); + + /* Extract type info from the tuple itself */ + tupType = HeapTupleHeaderGetTypeId(rec); + tupTypmod = HeapTupleHeaderGetTypMod(rec); + tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + /* -2 to indicate this is composite type */ + n32 = htonl(-2); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + + FormRowDescriptionMessage(tupdesc, NULL, NULL, &tupdesc_data); + ReleaseTupleDesc(tupdesc); + n32 = htonl(tupdesc_data.len); + /* write rowDesctiption */ + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, tupdesc_data.data, tupdesc_data.len); + + pfree(tupdesc_data.data); + } +#endif int len = strlen(outputstr); pq_sendint(&buf, len, 4); appendBinaryStringInfo(&buf, outputstr, len); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 0d96dff4..89552269 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2667,19 +2667,16 @@ select * from id_ordered order by id; (6 rows) update id_ordered set name = 'update 2' where id = 2; -ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 4' where id = 4; -ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 5' where id = 5; -ERROR: input of anonymous composite types is not implemented select * from id_ordered order by id; - id | 
name -----+-------- + id | name +----+---------- 1 | Test 1 - 2 | Test 2 + 2 | update 2 3 | Test 3 - 4 | Test 4 - 5 | Test 5 + 4 | update 4 + 5 | update 5 6 | Test 6 (6 rows) From 328456a3581c9c38a5f087ac79c0290bd91bdc53 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sat, 6 Feb 2021 13:33:35 +0800 Subject: [PATCH 128/578] cover regress expectation about cost changes --- src/test/regress/expected/create_view.out | 49 +++++++++++++++++ src/test/regress/expected/xc_FQS_join_1.out | 58 ++++++++++----------- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 56e73b4e..57376793 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -38,6 +38,55 @@ SELECT * FROM viewtest ORDER BY a; CREATE OR REPLACE VIEW viewtest AS SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC; +EXPLAIN SELECT * FROM viewtest; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(5 rows) + +SELECT * FROM viewtest; + a | b +----+---- + 20 | 25 + 15 | 20 + 10 | 15 +(3 rows) + +EXPLAIN SELECT a FROM viewtest; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Subquery Scan on viewtest (cost=22.23..25.04 rows=225 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(6 rows) + +SELECT a FROM viewtest; + a +---- + 20 + 15 + 10 +(3 rows) + +EXPLAIN SELECT * FROM viewtest ORDER BY a; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=33.83..34.39 rows=225 width=8) + -> Sort (cost=33.83..34.39 rows=225 width=8) + Sort Key: viewtest_tbl.a + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(7 rows) + SELECT * FROM viewtest ORDER BY a; a | b ----+---- diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 6cfb1dda..57ff7524 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -390,19 +390,19 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t ---------------------------------------------------------------------------------- Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab4_rep.val = tab1_mod.val) AND (tab4_rep.val2 = tab1_mod.val2)) - -> Remote Subquery Scan on all - Output: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 4) - -> Hash - Output: tab1_mod.val, tab1_mod.val2 + Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val > 2) + -> Hash + Output: tab4_rep.val, 
tab4_rep.val2 + -> Remote Subquery Scan on all + Output: tab4_rep.val, tab4_rep.val2 + -> Seq Scan on public.tab4_rep + Output: tab4_rep.val, tab4_rep.val2 + Filter: (tab4_rep.val < 4) (15 rows) -- Join involving two distributed tables, never shipped @@ -425,18 +425,18 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Output: tab1_mod.val, tab1_mod.val2 -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val < 4) - -> Hash - Output: tab1_mod.val, tab1_mod.val2 - -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - Distribute results by M: val + Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val > 2) + -> Hash + Output: tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.val, tab2_mod.val2 + Distribute results by M: val + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val < 4) (16 rows) -- Join involving a distributed table and two replicated tables, such that the @@ -590,17 +590,17 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all - Output: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val = 1) - -> Materialize - Output: tab1_mod.val, tab1_mod.val2 - -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val = 1) + -> Materialize + Output: tab4_rep.val, tab4_rep.val2 + -> Remote Subquery Scan on all + Output: tab4_rep.val, tab4_rep.val2 + -> Seq Scan on public.tab4_rep + Output: tab4_rep.val, tab4_rep.val2 + Filter: (tab4_rep.val = 1) (18 rows) -- following join between distributed tables should get FQSed because both of @@ -625,16 +625,16 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val Join Filter: (tab1_mod.val2 = tab2_mod.val2) - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val = 2) - -> Materialize - Output: tab1_mod.val2, tab1_mod.val -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) + -> Materialize + Output: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val = 2) (15 rows) -- JOIN involving the distributed table with equi-JOIN on the distributed column From 7408928cc774deb4bc743de3cbcddf2287a22087 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 15 Apr 2021 15:03:09 +0800 Subject: [PATCH 129/578] Prevent reenter ExecutorEnd during abort This is following PG rule: skip executor shut down during error abort, PGXC code violated it for treating shared queue, this commit fix this tapd: http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084977249&url_cache_key=25afc0aab46ec661eb190971ad54594d --- src/backend/commands/portalcmds.c | 32 ++++++++++++++++++------------ src/backend/utils/mmgr/portalmem.c | 9 --------- 2 
files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index e5a87499..4bea0943 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -359,8 +359,13 @@ PortalCleanup(Portal portal) #ifdef XCP if (portal->strategy == PORTAL_DISTRIBUTED) { - /* If portal is producing it has an executor which should be - * shut down */ + /* If cleanup fails below prevent double cleanup */ + portal->queryDesc = NULL; + + /* + * If portal is producing it has an executor which should be + * shut down + */ if (queryDesc->myindex == -1) { if (portal->status == PORTAL_FAILED) @@ -370,8 +375,6 @@ PortalCleanup(Portal portal) * producers list. */ removeProducingPortal(portal); - /* If cleanup fails below prevent double cleanup */ - portal->queryDesc = NULL; /* * Inform consumers about failed producer if they are * still waiting @@ -384,28 +387,33 @@ PortalCleanup(Portal portal) { ResourceOwner saveResourceOwner; - /* We must make the portal's resource owner current to - * release resources properly */ + /* + * We must make the portal's resource owner current to + * release resources properly + */ saveResourceOwner = CurrentResourceOwner; PG_TRY(); { + if (portal->resowner) CurrentResourceOwner = portal->resowner; + /* do nothing about executor if portal is failed */ + if (portal->status != PORTAL_FAILED) + { /* Finish executor if it is not yet finished */ if (!queryDesc->estate->es_finished) ExecutorFinish(queryDesc); - /* Destroy executor if not yet destroyed */ - if (queryDesc->estate) ExecutorEnd(queryDesc); - if (portal->status == PORTAL_FAILED) + FreeQueryDesc(queryDesc); + } + else { /* - * If portal if failed we can allow to be blocked + * If portal is failed we can allow to be blocked * here while UnBind is waiting for finishing * consumers. */ if (queryDesc->squeue) SharedQueueUnBind(queryDesc->squeue, true); - FreeQueryDesc(queryDesc); } } PG_CATCH(); @@ -428,8 +436,6 @@ PortalCleanup(Portal portal) PG_TRY(); { CurrentResourceOwner = portal->resowner; - /* Prevent double cleanup in case of error below */ - portal->queryDesc = NULL; /* Reset the squeue if exists */ if (queryDesc->squeue) SharedQueueReset(queryDesc->squeue, queryDesc->myindex); diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index deb2b8d6..567737b6 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -606,15 +606,6 @@ PortalDrop(Portal portal, bool isTopCommit) */ if (portalIsProducing(portal)) return; - - if (portal->queryDesc) - { - ResourceOwner saveResourceOwner = CurrentResourceOwner; - CurrentResourceOwner = portal->resowner; - FreeQueryDesc(portal->queryDesc); - CurrentResourceOwner = saveResourceOwner; - portal->queryDesc = NULL; - } #endif /* From 3ed136704d98edea9e29188d143105d68e9a2bd6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 15 Apr 2021 15:05:01 +0800 Subject: [PATCH 130/578] Revert "fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084977249 (merge request !126) " This reverts commit 5c0c40bc3b9658ac4282883088749416594f793d. 
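The revert restores the interrupt check at the top of the receive retry loop in
pgxc_node_receive(), so a pending cancel or die request is honored before the
backend blocks in poll() waiting for datanode data. A minimal sketch of the
resulting loop shape (the real code also recomputes the timeout and handles
further errno values):

    retry:
        /* honor a pending cancel/die request before blocking */
        CHECK_FOR_INTERRUPTS();

        poll_val = poll(pool_fd, conn_count, timeout_ms);
        if (poll_val < 0 && (errno == EINTR || errno == EAGAIN))
            goto retry;        /* interrupted by a signal, poll again */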
--- src/backend/pgxc/pool/pgxcnode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 36558205..aa1070be 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -822,6 +822,7 @@ pgxc_node_receive(const int conn_count, } retry: + CHECK_FOR_INTERRUPTS(); poll_val = poll(pool_fd, conn_count, timeout_ms); if (poll_val < 0) { From 88e73c5e43585eda9d2da2197f932a5e482a38cc Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 19 Feb 2021 19:49:57 +0800 Subject: [PATCH 131/578] Explain analyze enhancement http://tapd.oa.com/pgxz/prong/stories/view/1010092131862892295 --- src/backend/commands/Makefile | 2 +- src/backend/commands/explain_dist.c | 630 ++++++++++++++++++++++++++++ src/backend/executor/execParallel.c | 13 + src/backend/executor/execProcnode.c | 3 + src/backend/pgxc/pool/execRemote.c | 57 ++- src/backend/pgxc/pool/pgxcnode.c | 10 +- src/backend/tcop/postgres.c | 67 ++- src/backend/tcop/pquery.c | 12 +- src/include/commands/explain_dist.h | 36 ++ src/include/pgxc/execRemote.h | 6 + src/include/pgxc/pgxcnode.h | 2 +- src/include/utils/plancache.h | 1 + src/include/utils/portal.h | 1 + 13 files changed, 808 insertions(+), 32 deletions(-) create mode 100644 src/backend/commands/explain_dist.c create mode 100644 src/include/commands/explain_dist.h diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index 4a6c99e0..663eb71e 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ collationcmds.o constraint.o conversioncmds.o copy.o createas.o \ dbcommands.o define.o discard.o dropcmds.o \ - event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \ + event_trigger.o explain.o explain_dist.o extension.o foreigncmds.o functioncmds.o \ indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \ policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c new file mode 100644 index 00000000..41b7c5a1 --- /dev/null +++ b/src/backend/commands/explain_dist.c @@ -0,0 +1,630 @@ +/*------------------------------------------------------------------------- + * + * explain_dist.c + * This code provides support for distributed explain analyze. 
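+ *
+ *	  Overview: when a query runs with EXPLAIN ANALYZE, the datanodes
+ *	  serialize the Instrumentation of their local planstate trees after
+ *	  execution (SendLocalInstr) and ship it to the coordinator in an 'i'
+ *	  protocol message.  The coordinator stores the received records per
+ *	  plan_node_id in the ResponseCombiner's hash table (HandleRemoteInstr),
+ *	  combining duplicates by taking the maximum across datanodes, and
+ *	  finally copies the combined counters back onto its own planstate tree
+ *	  (AttachRemoteInstr) so EXPLAIN can display them.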
+ * + * Portions Copyright (c) 2020, Tencent TBase-C Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/explain_dist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/explain_dist.h" +#include "executor/hashjoin.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "nodes/nodeFuncs.h" + +/* Read instrument field */ +#define INSTR_READ_FIELD(fldname) \ +do { \ + instr->fldname = strtod(tmp_head, &tmp_pos); \ + tmp_head = tmp_pos + 1; \ +} while(0) + +/* Set max instrument */ +#define INSTR_MAX_FIELD(fldname) \ +do { \ + target->fldname = Max(src->fldname, target->fldname); \ +} while(0) + +/* Serialize state */ +typedef struct +{ + /* ids of plan nodes we've handled */ + Bitmapset *printed_nodes; + /* send str buf */ + StringInfoData buf; +} SerializeState; + +/* + * InstrOut + * + * Serialize Instrumentation structure with the format + * "nodetype-plan_node_id{val,val,...,val}". + * + * NOTE: The function should be modified if the structure of Instrumentation + * or its relevant members has been changed. + */ +static void +InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) +{ + /* nodeTag for varify */ + appendStringInfo(buf, "%hd-%d{", nodeTag(plan), plan->plan_node_id); + + /* bool */ + /* running should be false after InstrEndLoop */ + appendStringInfo(buf, "%hd,", instr->need_timer); + appendStringInfo(buf, "%hd,", instr->need_bufusage); + appendStringInfo(buf, "%hd,", instr->running); + /* instr_time */ + /* starttime and counter should be 0 after InstrEndLoop */ + appendStringInfo(buf, "%ld,", instr->starttime.tv_sec); + appendStringInfo(buf, "%ld,", instr->starttime.tv_nsec); + appendStringInfo(buf, "%ld,", instr->counter.tv_sec); + appendStringInfo(buf, "%ld,", instr->counter.tv_nsec); + /* double */ + /* firsttuple and tuplecount should be 0 after InstrEndLoop */ + appendStringInfo(buf, "%.0f,", instr->firsttuple); + appendStringInfo(buf, "%.0f,", instr->tuplecount); + /* BufferUsage */ + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.temp_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.temp_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_read_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_read_time.tv_nsec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_write_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_write_time.tv_nsec); + /* double */ + appendStringInfo(buf, "%.10f,", instr->startup); + appendStringInfo(buf, "%.10f,", instr->total); + appendStringInfo(buf, "%.0f,", instr->ntuples); + appendStringInfo(buf, "%.0f,", instr->nloops); + appendStringInfo(buf, "%.0f,", instr->nfiltered1); + appendStringInfo(buf, "%.0f,", 
instr->nfiltered2); + /* BufferUsage */ + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.temp_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.temp_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_read_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_read_time.tv_nsec); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_write_time.tv_sec); + appendStringInfo(buf, "%ld}", instr->bufusage.blk_write_time.tv_nsec); + + elog(DEBUG1, "InstrOut: plan_node_id %d, nloops %.0f", plan->plan_node_id, instr->nloops); +} + +/* + * WorkerInstrOut + * + * Serialize worker instrumentation with the format + * "n|val,val,..,val|...|val,val,..,val|". n indicates the worker num, + * and | separates each worker instrumentation. + */ +static void +WorkerInstrOut(StringInfo buf, WorkerInstrumentation *worker_instr) +{ + int n; + + if (worker_instr == NULL) + { + appendStringInfo(buf, "0|"); + return; + } + + appendStringInfo(buf, "%d|", worker_instr->num_workers); + for (n = 0; n < worker_instr->num_workers; n++) + { + Instrumentation *instr = &worker_instr->instrument[n]; + + if (instr->nloops <= 0) + appendStringInfo(buf, "0,0,0,0|"); + else + /* send startup, total, ntuples, loops for now */ + appendStringInfo(buf, "%.10f,%.10f,%.0f,%.0f|", + instr->startup, instr->total, instr->ntuples, instr->nloops); + } +} + +/* + * SpecInstrOut + * + * Serialize specific information in planstate with the format + * "1/0", and 1/0 indicates if values are valid or not. + * + * NOTE: The function should be modified if the corresponding data structure + * has been changed. + * The function is VERY related to show_sort_info, show_hash_info. 
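+ *
+ * Together with InstrOut, each plan node thus contributes a record of the
+ * form "<nodetag>-<plan_node_id>{fld,fld,...,fld}", optionally followed by
+ * the node-specific suffix appended here; for example, a Gather node that
+ * launched 2 workers appends "2>".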
+ */ +static void +SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) +{ + switch(plantag) + { + case T_Gather: + { + appendStringInfo(buf, "%d>", + ((GatherState *) planstate)->nworkers_launched); + } + break; + + case T_GatherMerge: + { + appendStringInfo(buf, "%d>", + ((GatherMergeState *) planstate)->nworkers_launched); + } + break; +#if 0 + case T_Sort: + { + /* according to RemoteSortState and show_sort_info */ + SortState *sortstate = castNode(SortState, planstate); + + if (sortstate->sort_Done && sortstate->tuplesortstate) + { + Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + char *sortMethod; + char *spaceType; + long spaceUsed; + + tuplesort_get_stats(state, (const char **) &sortMethod, (const char **) &spaceType, &spaceUsed); + appendStringInfo(buf, "1<%s,%s,%ld>", + sortMethod, spaceType, spaceUsed); + } + else + appendStringInfo(buf, "0>"); + } + break; + + case T_Hash: + { + /* according to RemoteHashState and show_hash_info */ + HashState *hashstate = castNode(HashState, planstate); + HashJoinTable hashtable = hashstate->hashtable; + + if (hashtable) + { + hashtable->nbuckets = 0; + appendStringInfo(buf, "1<%d,%d,%d,%d,%ld>", + hashtable->nbuckets, hashtable->nbuckets_original, + hashtable->nbatch, hashtable->nbatch_original, + (hashtable->spacePeak + 1023) / 1024); + } + else + appendStringInfo(buf, "0>"); + } + break; +#endif + default: + break; + } +} + +/* + * InstrIn + * + * DeSerialize of one Instrumentation. + */ +static void +InstrIn(StringInfo str, RemoteInstr *rinstr) +{ + char *tmp_pos; + char *tmp_head = &str->data[str->cursor]; + Instrumentation *instr = &rinstr->instr; + + if (str->len <= 0) + return; + + /* verify nodetype and plan_node_id */ + rinstr->nodeTag = strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + rinstr->id = (int) strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + + /* read values */ + INSTR_READ_FIELD(need_timer); + INSTR_READ_FIELD(need_bufusage); + INSTR_READ_FIELD(running); + + INSTR_READ_FIELD(starttime.tv_sec); + INSTR_READ_FIELD(starttime.tv_nsec); + INSTR_READ_FIELD(counter.tv_sec); + INSTR_READ_FIELD(counter.tv_nsec); + + INSTR_READ_FIELD(firsttuple); + INSTR_READ_FIELD(tuplecount); + + INSTR_READ_FIELD(bufusage_start.shared_blks_hit); + INSTR_READ_FIELD(bufusage_start.shared_blks_read); + INSTR_READ_FIELD(bufusage_start.shared_blks_dirtied); + INSTR_READ_FIELD(bufusage_start.shared_blks_written); + INSTR_READ_FIELD(bufusage_start.local_blks_hit); + INSTR_READ_FIELD(bufusage_start.local_blks_read); + INSTR_READ_FIELD(bufusage_start.local_blks_dirtied); + INSTR_READ_FIELD(bufusage_start.local_blks_written); + INSTR_READ_FIELD(bufusage_start.temp_blks_read); + INSTR_READ_FIELD(bufusage_start.temp_blks_written); + INSTR_READ_FIELD(bufusage_start.blk_read_time.tv_sec); + INSTR_READ_FIELD(bufusage_start.blk_read_time.tv_nsec); + INSTR_READ_FIELD(bufusage_start.blk_write_time.tv_sec); + INSTR_READ_FIELD(bufusage_start.blk_write_time.tv_nsec); + + INSTR_READ_FIELD(startup); + INSTR_READ_FIELD(total); + INSTR_READ_FIELD(ntuples); + INSTR_READ_FIELD(nloops); + INSTR_READ_FIELD(nfiltered1); + INSTR_READ_FIELD(nfiltered2); + + INSTR_READ_FIELD(bufusage.shared_blks_hit); + INSTR_READ_FIELD(bufusage.shared_blks_read); + INSTR_READ_FIELD(bufusage.shared_blks_dirtied); + INSTR_READ_FIELD(bufusage.shared_blks_written); + INSTR_READ_FIELD(bufusage.local_blks_hit); + INSTR_READ_FIELD(bufusage.local_blks_read); + INSTR_READ_FIELD(bufusage.local_blks_dirtied); + 
INSTR_READ_FIELD(bufusage.local_blks_written); + INSTR_READ_FIELD(bufusage.temp_blks_read); + INSTR_READ_FIELD(bufusage.temp_blks_written); + INSTR_READ_FIELD(bufusage.blk_read_time.tv_sec); + INSTR_READ_FIELD(bufusage.blk_read_time.tv_nsec); + INSTR_READ_FIELD(bufusage.blk_write_time.tv_sec); + INSTR_READ_FIELD(bufusage.blk_write_time.tv_nsec); + + elog(DEBUG1, "InstrIn: plan_node_id %d, nloops %.0f", rinstr->id, instr->nloops); + + /* tmp_head points to next instrument's nodetype or '\0' already */ + str->cursor = tmp_head - &str->data[0]; +} + +/* + * SpecInstrIn + * + * DeSerialize of specific instrument info of current node. + */ +static void +SpecInstrIn(StringInfo str, RemoteInstr *rinstr) +{ + char *tmp_pos; + char *tmp_head = &str->data[str->cursor]; + + switch(rinstr->nodeTag) + { + case T_Gather: + case T_GatherMerge: + { + rinstr->nworkers_launched = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + } + break; +#if 0 + case T_Sort: + { + RemoteSortState *instr = (RemoteSortState *)palloc0( + sizeof(RemoteSortState)); + /* either stat or w_stat is valid */ + INSTR_READ_FIELD(rs.isvalid); + if (instr->rs.isvalid) + { + INSTR_READ_FIELD(stat.sortMethod); + INSTR_READ_FIELD(stat.spaceType); + INSTR_READ_FIELD(stat.spaceUsed); + } + + INSTR_READ_FIELD(rs.num_workers); + if (instr->rs.num_workers > 0) + { + int n; + Size size; + + size = mul_size(sizeof(TuplesortInstrumentation), + instr->rs.num_workers); + instr->w_stats = (TuplesortInstrumentation *)palloc0(size); + + for (n = 0; n < instr->rs.num_workers; n++) + { + INSTR_READ_FIELD(w_stats[n].sortMethod); + if (instr->w_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) + { + INSTR_READ_FIELD(w_stats[n].spaceType); + INSTR_READ_FIELD(w_stats[n].spaceUsed); + } + } + } + remote_instr->state = (RemoteState *) instr; + } + break; + + case T_Hash: + { + RemoteHashState *instr = (RemoteHashState *)palloc0( + sizeof(RemoteHashState)); + INSTR_READ_FIELD(rs.isvalid); + if (instr->rs.isvalid) + { + INSTR_READ_FIELD(nbuckets); + INSTR_READ_FIELD(nbuckets_original); + INSTR_READ_FIELD(nbatch); + INSTR_READ_FIELD(nbatch_original); + INSTR_READ_FIELD(spacePeakKb); + } + remote_instr->state = (RemoteState *) instr; + } + break; +#endif + default: + break; + } + + str->cursor = tmp_head - &str->data[0]; +} + +/* + * SerializeLocalInstr + * + * Serialize local instruments in the planstate tree for sending. + */ +static bool +SerializeLocalInstr(PlanState *planstate, SerializeState *ss) +{ + /* + * We should handle InitPlan/SubPlan the same as in ExplainSubPlans. + * But we do not want another planstate_tree_walker, + * it is ok to use plan_node_id in place of plan_id. 
+ */ + int plan_node_id = planstate->plan->plan_node_id; + if (bms_is_member(plan_node_id, ss->printed_nodes)) + return false; + else + ss->printed_nodes = bms_add_member(ss->printed_nodes, plan_node_id); + + /* For CteScan producer, deal with its child directly */ + if (IsA(planstate, CteScanState)) + planstate = ((CteScanState *)planstate)->cteplanstate; + + if (planstate->instrument) + { + /* clean up the instrumentation state as in ExplainNode */ + InstrEndLoop(planstate->instrument); + InstrOut(&ss->buf, planstate->plan, planstate->instrument); + //WorkerInstrOut(&ss->buf, planstate->worker_instrument); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } + else + { + /* should not be NULL */ + elog(ERROR, "SerializeLocalInstr: instrument is NULL, %d", + nodeTag(planstate)); + } + + return planstate_tree_walker(planstate, SerializeLocalInstr, ss); +} + +/* + * SendLocalInstr + * + * Serialize local instrument of the given planstate and send it to upper node. + */ +void +SendLocalInstr(PlanState *planstate) +{ + SerializeState ss; + + /* Construct str with the same logic in ExplainNode */ + ss.printed_nodes = NULL; + pq_beginmessage(&ss.buf, 'i'); + SerializeLocalInstr(planstate, &ss); + pq_endmessage(&ss.buf); + bms_free(ss.printed_nodes); + pq_flush(); +} + +/* + * combineRemoteInstr + * + * tool function to combine received instrumentation of all nodes, + * currently it choose max value. + */ +static void +combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) +{ + Instrumentation *target = &rtarget->instr; + Instrumentation *src = &rsrc->instr; + + Assert(rtarget->id == rsrc->id); + Assert(rtarget->nodeTag == rsrc->nodeTag); + + INSTR_MAX_FIELD(need_timer); + INSTR_MAX_FIELD(need_bufusage); + INSTR_MAX_FIELD(running); + + INSTR_MAX_FIELD(starttime.tv_sec); + INSTR_MAX_FIELD(starttime.tv_nsec); + INSTR_MAX_FIELD(counter.tv_sec); + INSTR_MAX_FIELD(counter.tv_nsec); + + INSTR_MAX_FIELD(firsttuple); + INSTR_MAX_FIELD(tuplecount); + + INSTR_MAX_FIELD(bufusage_start.shared_blks_hit); + INSTR_MAX_FIELD(bufusage_start.shared_blks_read); + INSTR_MAX_FIELD(bufusage_start.shared_blks_dirtied); + INSTR_MAX_FIELD(bufusage_start.shared_blks_written); + INSTR_MAX_FIELD(bufusage_start.local_blks_hit); + INSTR_MAX_FIELD(bufusage_start.local_blks_read); + INSTR_MAX_FIELD(bufusage_start.local_blks_dirtied); + INSTR_MAX_FIELD(bufusage_start.local_blks_written); + INSTR_MAX_FIELD(bufusage_start.temp_blks_read); + INSTR_MAX_FIELD(bufusage_start.temp_blks_written); + INSTR_MAX_FIELD(bufusage_start.blk_read_time.tv_sec); + INSTR_MAX_FIELD(bufusage_start.blk_read_time.tv_nsec); + INSTR_MAX_FIELD(bufusage_start.blk_write_time.tv_sec); + INSTR_MAX_FIELD(bufusage_start.blk_write_time.tv_nsec); + + INSTR_MAX_FIELD(startup); + INSTR_MAX_FIELD(total); + INSTR_MAX_FIELD(ntuples); + INSTR_MAX_FIELD(nloops); + INSTR_MAX_FIELD(nfiltered1); + INSTR_MAX_FIELD(nfiltered2); + + INSTR_MAX_FIELD(bufusage.shared_blks_hit); + INSTR_MAX_FIELD(bufusage.shared_blks_read); + INSTR_MAX_FIELD(bufusage.shared_blks_dirtied); + INSTR_MAX_FIELD(bufusage.shared_blks_written); + INSTR_MAX_FIELD(bufusage.local_blks_hit); + INSTR_MAX_FIELD(bufusage.local_blks_read); + INSTR_MAX_FIELD(bufusage.local_blks_dirtied); + INSTR_MAX_FIELD(bufusage.local_blks_written); + INSTR_MAX_FIELD(bufusage.temp_blks_read); + INSTR_MAX_FIELD(bufusage.temp_blks_written); + INSTR_MAX_FIELD(bufusage.blk_read_time.tv_sec); + INSTR_MAX_FIELD(bufusage.blk_read_time.tv_nsec); + INSTR_MAX_FIELD(bufusage.blk_write_time.tv_sec); + 
INSTR_MAX_FIELD(bufusage.blk_write_time.tv_nsec); + + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); +} + +/* + * HandleRemoteInstr + * + * Handle remote instrument message and save it by plan_node_id. + */ +void +HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner) +{ + RemoteInstr recv_instr; + StringInfo recv_str; + bool found; + RemoteInstr *cur_instr; + + if (combiner->recv_instr_htbl == NULL) + { + elog(ERROR, "combiner is not prepared for instrumentation"); + } + elog(DEBUG1, "Handle remote instrument: nodeoid %d", nodeoid); + + recv_str = makeStringInfo(); + appendBinaryStringInfo(recv_str, msg_body, len); + + while(recv_str->cursor < recv_str->len) + { + InstrIn(recv_str, &recv_instr); + SpecInstrIn(recv_str, &recv_instr); + cur_instr = (RemoteInstr *) hash_search(combiner->recv_instr_htbl, + (void *) &recv_instr.id, + HASH_ENTER, &found); + if (found) + { + combineRemoteInstr(cur_instr, &recv_instr); + } + else + { + memcpy(cur_instr, &recv_instr, sizeof(RemoteInstr)); + } + } +} + +/* + * attachRemoteSpecialInstr + * + * Attach specific information in planstate. + */ +static void +attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) +{ + int nodeTag = nodeTag(planstate->plan); + + switch(nodeTag) + { + case T_Gather: + { + GatherState *gs = (GatherState *) planstate; + gs->nworkers_launched = rinstr->nworkers_launched; + } + break; + case T_GatherMerge: + { + GatherMergeState *gms = (GatherMergeState *) planstate; + gms->nworkers_launched = rinstr->nworkers_launched; + } + break; + default: + break; + } +} + +/* + * AttachRemoteInstr + * + * Attach instrument information in planstate from saved info in combiner. + */ +bool +AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) +{ + int plan_node_id = planstate->plan->plan_node_id; + if (bms_is_member(plan_node_id, combiner->printed_nodes)) + return false; + else + combiner->printed_nodes = bms_add_member(combiner->printed_nodes, plan_node_id); + + if (IsA(planstate, RemoteSubplanState) && NULL == planstate->lefttree) + { + Plan *plan = planstate->plan; + PlanState *remote_ps; + EState *estate = planstate->state; + + remote_ps = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); + planstate->lefttree = remote_ps; + } + + if (planstate->instrument) + { + bool found; + RemoteInstr *rinstr= (RemoteInstr *) hash_search(combiner->recv_instr_htbl, + (void *) &plan_node_id, + HASH_FIND, &found); + if (!found) + { + elog(DEBUG1, "AttachRemoteInstr: remote instrumentation not found, tag %d id %d", + nodeTag(planstate->plan), plan_node_id); + } + else + { + Assert(rinstr->nodeTag == nodeTag(planstate->plan)); + Assert(rinstr->id == plan_node_id); + + memcpy(planstate->instrument, &rinstr->instr, sizeof(Instrumentation)); + attachRemoteSpecialInstr(planstate, rinstr); + } + } + else + { + /* should not be NULL */ + elog(ERROR, "AttachRemoteInstr: instrument is NULL, tag %d id %d", + nodeTag(planstate), plan_node_id); + } + + return planstate_tree_walker(planstate, AttachRemoteInstr, combiner); +} diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index d051eb2b..8f0d9718 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -244,6 +244,19 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* Count this node. 
*/ e->nnodes++; + /* + * if we are running with instrument option, must init + * full plantree here, to ensure e->nnodes correct. + */ + if (planstate->instrument && + IsA(planstate, RemoteSubplanState) && + NULL == planstate->lefttree) + { + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); + } + /* Call estimators for parallel-aware nodes. */ if (planstate->plan->parallel_aware) { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 0119064b..f8f15db2 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -866,6 +866,9 @@ ExecShutdownNode(PlanState *node) case T_GatherMergeState: ExecShutdownGatherMerge((GatherMergeState *) node); break; + case T_RemoteSubplanState: + ExecShutdownRemoteSubplan((RemoteSubplanState *) node); + break; default: break; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 80829005..39796a82 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -59,6 +59,7 @@ #include "pgxc/xc_maintenance_mode.h" #include "catalog/pgxc_class.h" #ifdef __TBASE__ +#include "commands/explain_dist.h" #include "pgxc/squeue.h" #include "executor/execParallel.h" #include "postmaster/postmaster.h" @@ -297,6 +298,15 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->recv_datarows = 0; combiner->prerowBuffers = NULL; combiner->is_abort = false; + combiner->printed_nodes = NULL; + { + HASHCTL ctl; + + ctl.keysize = sizeof(int); + ctl.entrysize = sizeof(RemoteInstr); + + combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + } #endif } @@ -1098,6 +1108,18 @@ CloseCombiner(ResponseCombiner *combiner) pfree(combiner->tapemarks); combiner->tapemarks = NULL; } +#ifdef __TBASE__ + if (combiner->recv_instr_htbl) + { + hash_destroy(combiner->recv_instr_htbl); + combiner->recv_instr_htbl = NULL; + } + if (combiner->printed_nodes) + { + bms_free(combiner->printed_nodes); + combiner->printed_nodes = NULL; + } +#endif } /* @@ -2671,6 +2693,10 @@ FetchTuple(ResponseCombiner *combiner) { /* Do nothing. It must have been handled in handle_response() */ } + else if (res == RESPONSE_INSTR) + { + /* Do nothing. It must have been handled in handle_response() */ + } else { // Can not get here? @@ -3306,6 +3332,12 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) #endif return RESPONSE_ASSIGN_GXID; +#ifdef __TBASE__ + case 'i': /* Remote Instrument */ + if (msg_len > 0) + HandleRemoteInstr(msg, msg_len, conn->nodeoid, combiner); + return RESPONSE_INSTR; +#endif default: /* sync lost? */ elog(WARNING, "Received unsupported message type: %c", msg_type); @@ -10487,7 +10519,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) errmsg("Failed to send command ID to data nodes"))); } pgxc_node_send_plan(connection, cursor, "Remote Subplan", - node->subplanstr, node->nParamRemote, paramtypes); + node->subplanstr, node->nParamRemote, paramtypes, estate->es_instrument); if (enable_statistic) { @@ -11100,6 +11132,29 @@ ExecReScanRemoteSubplan(RemoteSubplanState *node) } #ifdef __TBASE__ +/* + * ExecShutdownRemoteSubplan + * + * for instrumentation only, init full planstate tree, + * then attach recieved remote instrumenation. 
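+ *
+ * On the coordinator the subtree below a RemoteSubplan normally has no
+ * local planstate (it runs on the datanodes), so the subtree is created
+ * here with EXEC_FLAG_EXPLAIN_ONLY purely to give AttachRemoteInstr nodes
+ * to hang the received instrumentation on; ExecShutdownNode dispatches to
+ * this function, and ExecParallelEstimate performs the same initialization
+ * for parallel queries.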
+ */ +void +ExecShutdownRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = &node->combiner; + PlanState *ps = &combiner->ss.ps; + Plan *plan = ps->plan; + EState *estate = ps->state; + + if (estate->es_instrument) + { + if (!ps->lefttree) + ps->lefttree = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); + + AttachRemoteInstr(ps->lefttree, combiner); + } +} + void ExecFinishRemoteSubplan(RemoteSubplanState *node) {// #lizard forgives diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index aa1070be..3baad358 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2064,7 +2064,7 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, const char *query, const char *planstr, - short num_params, Oid *param_types) + short num_params, Oid *param_types, int instrument_options) { int stmtLen; int queryLen; @@ -2093,8 +2093,8 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, paramTypes[i] = format_type_be(param_types[i]); paramTypeLen += strlen(paramTypes[i]) + 1; } - /* size + pnameLen + queryLen + parameters */ - msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen; + /* size + pnameLen + queryLen + parameters + instrument_options */ + msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen + 4; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2134,6 +2134,10 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, pfree(paramTypes[i]); } pfree(paramTypes); + /* instrument_options */ + instrument_options = htonl(instrument_options); + memcpy(handle->outBuffer + handle->outEnd, &instrument_options, 4); + handle->outEnd += 4; handle->last_command = 'a'; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index db2b5639..c8bdc980 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -106,6 +106,7 @@ #include "executor/execParallel.h" #include "pgxc/poolutils.h" #include "commands/vacuum.h" +#include "commands/explain_dist.h" #endif #endif @@ -1995,8 +1996,9 @@ exec_plan_message(const char *query_string, /* source of the query */ const char *stmt_name, /* name for prepared stmt */ const char *plan_string, /* encoded plan to execute */ char **paramTypeNames, /* parameter type names */ - int numParams) /* number of parameters */ -{// #lizard forgives + int numParams, /* number of parameters */ + int instrument_options) /* explain analyze option */ +{ MemoryContext oldcontext; bool save_log_statement_stats = log_statement_stats; char msec_str[32]; @@ -2094,6 +2096,8 @@ exec_plan_message(const char *query_string, /* source of the query */ StorePreparedStatement(stmt_name, psrc, false, true); SetRemoteSubplan(psrc, plan_string); + /* set instrument_options, default 0 */ + psrc->instrument_options = instrument_options; MemoryContextSwitchTo(oldcontext); @@ -2691,26 +2695,26 @@ exec_bind_message(StringInfo input_message) /* Get epq context, only datanodes need them */ if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) { - num_epq_tuple = pq_getmsgint(input_message, 2); - if (num_epq_tuple > 0) - { - int i; - - portal->epqContext = palloc(sizeof(RemoteEPQContext)); - portal->epqContext->ntuples = num_epq_tuple; - portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); - portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); - 
portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); - - for (i = 0; i < num_epq_tuple; i++) - { - portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); - portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); - } - } + num_epq_tuple = pq_getmsgint(input_message, 2); + if (num_epq_tuple > 0) + { + int i; + + portal->epqContext = palloc(sizeof(RemoteEPQContext)); + portal->epqContext->ntuples = num_epq_tuple; + portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); + portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); + portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); + + for (i = 0; i < num_epq_tuple; i++) + { + portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); + portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); + } + } } pq_getmsgend(input_message); @@ -2760,6 +2764,9 @@ exec_bind_message(StringInfo input_message) cplan->stmt_list, cplan); + /* set instrument before PortalStart, default 0 */ + portal->up_instrument = psrc->instrument_options; + /* Done with the snapshot used for parameter I/O and parsing/planning */ if (snapshot_set) PopActiveSnapshot(); @@ -3025,6 +3032,15 @@ exec_execute_message(const char *portal_name, long max_rows) CommandCounterIncrement(); } + +#ifdef __TBASE__ + if (portal->up_instrument && + portal->queryDesc && + portal->queryDesc->myindex == -1) + { + SendLocalInstr(portal->queryDesc->planstate); + } +#endif /* Send appropriate CommandComplete to client */ EndCommand(completionTag, dest); @@ -5486,6 +5502,7 @@ PostgresMain(int argc, char *argv[], const char *plan_string; int numParams; char **paramTypes = NULL; + int instrument_options = 0; /* Set statement_timestamp() */ SetCurrentStatementStartTimestamp(); @@ -5502,10 +5519,14 @@ PostgresMain(int argc, char *argv[], paramTypes[i] = (char *) pq_getmsgstring(&input_message); } + + instrument_options = pq_getmsgint(&input_message, 4); + pq_getmsgend(&input_message); exec_plan_message(query_string, stmt_name, plan_string, - paramTypes, numParams); + paramTypes, numParams, + instrument_options); } break; #endif diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 16179e73..c1eadf2c 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -681,8 +681,11 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, NULL, +#ifdef __TBASE__ + portal->up_instrument); +#else 0); - +#endif /* * set information about EvalPlanQual if any, they will be fill in * estate later after it been created. @@ -1006,7 +1009,7 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, portal->queryEnv, - 0); + portal->up_instrument); } else #endif @@ -1017,8 +1020,11 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, portal->queryEnv, +#ifdef __TBASE__ + portal->up_instrument); +#else 0); - +#endif /* * set information about EvalPlanQual if any, they will be fill in * estate later after it been created. 
diff --git a/src/include/commands/explain_dist.h b/src/include/commands/explain_dist.h new file mode 100644 index 00000000..fe682bda --- /dev/null +++ b/src/include/commands/explain_dist.h @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * explain_dist.h + * + * Portions Copyright (c) 2018, Tencent TBase-C Group. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/commands/explain_dist.h + * + *------------------------------------------------------------------------- + */ +#ifndef EXPLAINDIST_H +#define EXPLAINDIST_H + +#include "commands/explain.h" +#include "pgxc/execRemote.h" + +/* Hash table entry */ +typedef struct +{ + int id; /* unique id of current plan node */ + int nodeTag; /* type of current plan node */ + Instrumentation instr; /* instrument of current plan node */ + + /* for Gather */ + int nworkers_launched; /* worker num of gather */ + + /* for Hash: */ +} RemoteInstr; + +extern void SendLocalInstr(PlanState *planstate); +extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner); +extern bool AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner); + +#endif /* EXPLAINDIST_H */ \ No newline at end of file diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c6e2e0e8..20e73f2e 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -52,6 +52,7 @@ #endif #ifdef __TBASE__ +#define RESPONSE_INSTR 13 #define UINT32_BITS_NUM 32 #define WORD_NUMBER_FOR_NODES (MAX_NODES_NUMBER / UINT32_BITS_NUM) @@ -174,6 +175,10 @@ typedef struct ResponseCombiner PGXCNodeHandle **conns; int ccount; uint64 recv_datarows; + + /* for remote instrument */ + Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ + HTAB *recv_instr_htbl; /* received str hash table for each plan_node_id */ #endif } ResponseCombiner; @@ -422,6 +427,7 @@ extern void SetCurrentHandlesReadonly(void); extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); +extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); #endif #ifdef __SUBSCRIPTION__ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 3773cac2..15c4ef46 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -222,7 +222,7 @@ extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *que bool send_describe, int fetch_size); extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, const char *query, const char *planstr, - short num_params, Oid *param_types); + short num_params, Oid *param_types, int instrument_options); extern int pgxc_node_send_gid(PGXCNodeHandle *handle, char* gid); #ifdef __TWO_PHASE_TRANS__ extern int pgxc_node_send_starter(PGXCNodeHandle *handle, char* startnode); diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index 0cdeadc3..55e70db2 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -181,6 +181,7 @@ typedef struct CachedPlanSource #endif #ifdef __TBASE__ bool insert_into; + int instrument_options #endif } CachedPlanSource; diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 2a4a6c42..5d039875 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -266,6 +266,7 @@ typedef struct PortalData /* information 
about EvalPlanQual, pass it to queryDesc */ RemoteEPQContext *epqContext; + int up_instrument; /* explain analyze option from cn */ #endif } PortalData; From 4e1a137415a3e685f4879ce2284ae9bac3c7b40d Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 26 Feb 2021 10:15:40 +0800 Subject: [PATCH 132/578] initialize hashtable only when instrument flaged --- src/backend/pgxc/pool/execRemote.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 39796a82..ab99d828 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -299,14 +299,7 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->prerowBuffers = NULL; combiner->is_abort = false; combiner->printed_nodes = NULL; - { - HASHCTL ctl; - - ctl.keysize = sizeof(int); - ctl.entrysize = sizeof(RemoteInstr); - - combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); - } + combiner->recv_instr_htbl = NULL; #endif } @@ -9915,6 +9908,16 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) combiner->ss.ps.state = estate; combiner->ss.ps.ExecProcNode = ExecRemoteSubplan; + if (estate->es_instrument) + { + HASHCTL ctl; + + ctl.keysize = sizeof(int); + ctl.entrysize = sizeof(RemoteInstr); + + combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + } + combiner->ss.ps.qual = NULL; combiner->request_type = REQUEST_TYPE_QUERY; From 7e4b0d7d1be4f40c882e6afbca750e1970615982 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Mar 2021 15:01:04 +0800 Subject: [PATCH 133/578] fix compile error --- src/include/utils/plancache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index 55e70db2..e04b03d8 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -181,7 +181,7 @@ typedef struct CachedPlanSource #endif #ifdef __TBASE__ bool insert_into; - int instrument_options + int instrument_options; #endif } CachedPlanSource; From f6cd1fe9950c6ce911745dd9904b429e24791a0e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 29 Aug 2017 13:22:49 -0400 Subject: [PATCH 134/578] Propagate sort instrumentation from workers back to leader. Up until now, when parallel query was used, no details about the sort method or space used by the workers were available; details were shown only for any sorting done by the leader. Fix that. Commit 1177ab1dabf72bafee8f19d904cee3a299f25892 forced the test case added by commit 1f6d515a67ec98194c23a5db25660856c9aab944 to run without parallelism; now that we have this infrastructure, allow that again, with a little tweaking to make it pass with and without force_parallel_mode. 
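The hand-off follows the usual four-phase DSM pattern for per-node parallel
state; a condensed sketch is below, with placeholder Foo* names standing in
for the Sort-specific code this patch adds to nodeSort.c (the real versions
appear further down). The leader sizes and zeroes one TuplesortInstrumentation
slot per worker, each worker fills only its own slot once its sort completes,
and the leader copies the chunk into backend-local memory at shutdown so
EXPLAIN can still print a "Worker N: Sort Method: ..." line after the segment
is detached.

    /* Illustrative only: shared chunk keyed by plan_node_id; zeroed slots
     * read back as SORT_TYPE_STILL_IN_PROGRESS and are skipped by EXPLAIN. */
    typedef struct SharedFooInfo
    {
        int         num_workers;
        TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
    } SharedFooInfo;

    /* Leader, before launching workers: reserve space in the DSM. */
    static void
    FooEstimate(FooState *node, ParallelContext *pcxt)
    {
        Size size = add_size(offsetof(SharedFooInfo, sinstrument),
                             mul_size(pcxt->nworkers,
                                      sizeof(TuplesortInstrumentation)));

        shm_toc_estimate_chunk(&pcxt->estimator, size);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }

    /* Leader: allocate, zero, and publish the chunk under the node's id. */
    static void
    FooInitializeDSM(FooState *node, ParallelContext *pcxt)
    {
        Size size = offsetof(SharedFooInfo, sinstrument) +
            pcxt->nworkers * sizeof(TuplesortInstrumentation);

        node->shared_info = shm_toc_allocate(pcxt->toc, size);
        memset(node->shared_info, 0, size);
        node->shared_info->num_workers = pcxt->nworkers;
        shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id,
                       node->shared_info);
    }

    /* Worker: attach; it later writes sinstrument[ParallelWorkerNumber]. */
    static void
    FooInitializeWorker(FooState *node, shm_toc *toc)
    {
        node->shared_info = shm_toc_lookup(toc, node->ps.plan->plan_node_id,
                                           true);
        node->am_worker = true;
    }

    /* Leader, at shutdown: keep a private copy for EXPLAIN to read later. */
    static void
    FooRetrieveInstrumentation(FooState *node)
    {
        Size           size;
        SharedFooInfo *copy;

        if (node->shared_info == NULL)
            return;
        size = offsetof(SharedFooInfo, sinstrument) +
            node->shared_info->num_workers * sizeof(TuplesortInstrumentation);
        copy = palloc(size);
        memcpy(copy, node->shared_info, size);
        node->shared_info = copy;
    }
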
Robert Haas and Tom Lane Discussion: http://postgr.es/m/CA+Tgmoa2VBZW6S8AAXfhpHczb=Rf6RqQ2br+zJvEgwJ0uoD_tQ@mail.gmail.com --- src/backend/commands/explain.c | 57 ++++++++++++- src/backend/executor/execParallel.c | 104 +++++++++++++++--------- src/backend/executor/nodeSort.c | 97 ++++++++++++++++++++++ src/backend/utils/sort/tuplesort.c | 56 ++++++++++--- src/include/executor/nodeSort.h | 9 +- src/include/nodes/execnodes.h | 12 +++ src/include/utils/tuplesort.h | 96 ++++++++++++++-------- src/test/regress/expected/subselect.out | 48 +++++++++++ src/test/regress/sql/subselect.sql | 43 +++++++++- 9 files changed, 433 insertions(+), 89 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c58bd433..4ba4dc81 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2534,15 +2534,21 @@ show_tablesample(TableSampleClause *tsc, PlanState *planstate, static void show_sort_info(SortState *sortstate, ExplainState *es) { - if (es->analyze && sortstate->sort_Done && - sortstate->tuplesortstate != NULL) + if (!es->analyze) + return; + + if (sortstate->sort_Done && sortstate->tuplesortstate != NULL) { Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + TuplesortInstrumentation stats; const char *sortMethod; const char *spaceType; long spaceUsed; - tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed); + tuplesort_get_stats(state, &stats); + sortMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -2557,6 +2563,51 @@ show_sort_info(SortState *sortstate, ExplainState *es) ExplainPropertyText("Sort Space Type", spaceType, es); } } + + if (sortstate->shared_info != NULL) + { + int n; + bool opened_group = false; + + for (n = 0; n < sortstate->shared_info->num_workers; n++) + { + TuplesortInstrumentation *sinstrument; + const char *sortMethod; + const char *spaceType; + long spaceUsed; + + sinstrument = &sortstate->shared_info->sinstrument[n]; + if (sinstrument->sortMethod == SORT_TYPE_STILL_IN_PROGRESS) + continue; /* ignore any unfilled slots */ + sortMethod = tuplesort_method_name(sinstrument->sortMethod); + spaceType = tuplesort_space_type_name(sinstrument->spaceType); + spaceUsed = sinstrument->spaceUsed; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, + "Worker %d: Sort Method: %s %s: %ldkB\n", + n, sortMethod, spaceType, spaceUsed); + } + else + { + if (!opened_group) + { + ExplainOpenGroup("Workers", "Workers", false, es); + opened_group = true; + } + ExplainOpenGroup("Worker", NULL, true, es); + ExplainPropertyInteger("Worker Number", n, es); + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyLong("Sort Space Used", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + ExplainCloseGroup("Worker", NULL, true, es); + } + } + if (opened_group) + ExplainCloseGroup("Workers", "Workers", false, es); + } } /* diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 8f0d9718..45dc56fb 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -28,9 +28,10 @@ #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" #include "executor/nodeForeignscan.h" -#include "executor/nodeSeqscan.h" #include "executor/nodeIndexscan.h" #include "executor/nodeIndexonlyscan.h" +#include 
"executor/nodeSeqscan.h" +#include "executor/nodeSort.h" #include "executor/tqueue.h" #include "nodes/nodeFuncs.h" #include "optimizer/planmain.h" @@ -227,10 +228,10 @@ ExecSerializePlan(Plan *plan, EState *estate) } /* - * Ordinary plan nodes won't do anything here, but parallel-aware plan nodes - * may need some state which is shared across all parallel workers. Before - * we size the DSM, give them a chance to call shm_toc_estimate_chunk or - * shm_toc_estimate_keys on &pcxt->estimator. + * Parallel-aware plan nodes (and occasionally others) may need some state + * which is shared across all parallel workers. Before we size the DSM, give + * them a chance to call shm_toc_estimate_chunk or shm_toc_estimate_keys on + * &pcxt->estimator. * * While we're at it, count the number of PlanState nodes in the tree, so * we know how many SharedPlanStateInstrumentation structures we need. @@ -257,50 +258,56 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) EXEC_FLAG_EXPLAIN_ONLY); } - /* Call estimators for parallel-aware nodes. */ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanEstimate((SeqScanState *) planstate, e->pcxt); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanEstimate((IndexScanState *) planstate, e->pcxt); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate, e->pcxt); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanEstimate((ForeignScanState *) planstate, e->pcxt); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanEstimate((CustomScanState *) planstate, e->pcxt); break; case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, e->pcxt); break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortEstimate((SortState *) planstate, e->pcxt); #ifdef __TBASE__ - + if (planstate->plan->parallel_aware) + ReDistributeEstimate(planstate, e->pcxt); + break; /* For remote query and remote subplan, there is no need for shared storage. */ case T_RemoteQueryState: case T_RemoteSubplanState: break; - case T_HashJoinState: + if (planstate->plan->parallel_aware) ExecParallelHashJoinEstimate((HashJoinState*) planstate, e->pcxt); break; - case T_SortState: - ReDistributeEstimate(planstate, e->pcxt); - break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -312,7 +319,6 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) default: break; } - } return planstate_tree_walker(planstate, ExecParallelEstimate, e); } @@ -337,60 +343,70 @@ ExecParallelInitializeDSM(PlanState *planstate, d->nnodes++; /* - * Call initializers for parallel-aware plan nodes. + * Call initializers for DSM-using plan nodes. * - * Ordinary plan nodes won't do anything here, but parallel-aware plan - * nodes may need to initialize shared state in the DSM before parallel - * workers are available. They can allocate the space they previously + * Most plan nodes won't do anything here, but plan nodes that allocated + * DSM may need to initialize shared state in the DSM before parallel + * workers are launched. 
They can allocate the space they previously * estimated using shm_toc_allocate, and add the keys they previously * estimated using shm_toc_insert, in each case targeting pcxt->toc. */ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanInitializeDSM((SeqScanState *) planstate, d->pcxt); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanInitializeDSM((IndexScanState *) planstate, d->pcxt); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanInitializeDSM((IndexOnlyScanState *) planstate, d->pcxt); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanInitializeDSM((ForeignScanState *) planstate, d->pcxt); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanInitializeDSM((CustomScanState *) planstate, d->pcxt); break; case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, d->pcxt); break; - + case T_SortState: + /* even when not parallel-aware */ + ExecSortInitializeDSM((SortState *) planstate, d->pcxt); #ifdef __TBASE__ + if (planstate->plan->parallel_aware) + ReDistributeInitializeDSM(planstate, d->pcxt); + break; case T_RemoteQueryState: + if (planstate->plan->parallel_aware) ExecRemoteQueryInitializeDSM((RemoteQueryState *)planstate, d->pcxt); break; case T_RemoteSubplanState: + if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitializeDSM((RemoteSubplanState *)planstate, d->pcxt); break; case T_HashJoinState: + if (planstate->plan->parallel_aware) ExecParallelHashJoinInitializeDSM((HashJoinState *) planstate, d->pcxt); break; - case T_SortState: - ReDistributeInitializeDSM(planstate, d->pcxt); - break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -403,7 +419,6 @@ ExecParallelInitializeDSM(PlanState *planstate, default: break; } - } return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); } @@ -914,6 +929,13 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); + /* + * Perform any node-type-specific work that needs to be done. Currently, + * only Sort nodes need to do anything here. + */ + if (IsA(planstate, SortState)) + ExecSortRetrieveInstrumentation((SortState *) planstate); + return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, instrumentation); } @@ -1076,47 +1098,56 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) if (planstate == NULL) return false; - /* Call initializers for parallel-aware plan nodes. 
*/ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanInitializeWorker((SeqScanState *) planstate, toc); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanInitializeWorker((IndexScanState *) planstate, toc); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, toc); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanInitializeWorker((ForeignScanState *) planstate, toc); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanInitializeWorker((CustomScanState *) planstate, toc); break; case T_BitmapHeapScanState: - ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, toc); + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeWorker( + (BitmapHeapScanState *) planstate, toc); break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortInitializeWorker((SortState *) planstate, toc); #ifdef __TBASE__ + if (planstate->plan->parallel_aware) + ReDistributeInitializeWorker(planstate, toc); + break; case T_RemoteQueryState: + if (planstate->plan->parallel_aware) ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, toc); break; - - case T_RemoteSubplanState: + case T_RemoteSubplanState: + if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, toc); break; - case T_HashJoinState: - ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); - break; - case T_SortState: - ReDistributeInitializeWorker(planstate, toc); + if (planstate->plan->parallel_aware) + ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -1128,7 +1159,6 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) default: break; } - } return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc); } diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 5a42eef5..3c35d902 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "access/parallel.h" #include "executor/execdebug.h" #include "executor/nodeSort.h" #include "miscadmin.h" @@ -232,6 +233,15 @@ ExecSort(PlanState *pstate) node->sort_Done = true; node->bounded_Done = node->bounded; node->bound_Done = node->bound; + if (node->shared_info && node->am_worker) + { + TuplesortInstrumentation *si; + + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + tuplesort_get_stats(tuplesortstate, si); + } SO1_printf("ExecSort: %s\n", "sorting done"); } @@ -444,3 +454,90 @@ ExecReScanSort(SortState *node) else tuplesort_rescan((Tuplesortstate *) node->tuplesortstate); } + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSortEstimate + * + * Estimate space required to propagate sort statistics. 
+ * ---------------------------------------------------------------- + */ +void +ExecSortEstimate(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(TuplesortInstrumentation)); + size = add_size(size, offsetof(SharedSortInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + pcxt->nworkers * sizeof(TuplesortInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeWorker(SortState *node, shm_toc *toc) +{ + node->shared_info = + shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecSortRetrieveInstrumentation(SortState *node) +{ + Size size; + SharedSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + node->shared_info->num_workers * sizeof(TuplesortInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 676cb9bd..ad5d9988 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3379,13 +3379,10 @@ tuplesort_restorepos(Tuplesortstate *state) * * This can be called after tuplesort_performsort() finishes to obtain * printable summary information about how the sort was performed. - * spaceUsed is measured in kilobytes. 
*/ void tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed) + TuplesortInstrumentation *stats) { /* * Note: it might seem we should provide both memory and disk usage for a @@ -3398,35 +3395,68 @@ tuplesort_get_stats(Tuplesortstate *state, */ if (state->tapeset) { - *spaceType = "Disk"; - *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); } else { - *spaceType = "Memory"; - *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; } switch (state->status) { case TSS_SORTEDINMEM: if (state->boundUsed) - *sortMethod = "top-N heapsort"; + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; else - *sortMethod = "quicksort"; + stats->sortMethod = SORT_TYPE_QUICKSORT; break; case TSS_SORTEDONTAPE: - *sortMethod = "external sort"; + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; break; case TSS_FINALMERGE: - *sortMethod = "external merge"; + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; break; default: - *sortMethod = "still in progress"; + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; break; } } +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + /* * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. 
diff --git a/src/include/executor/nodeSort.h b/src/include/executor/nodeSort.h index fcf6b765..77ac0659 100644 --- a/src/include/executor/nodeSort.h +++ b/src/include/executor/nodeSort.h @@ -14,6 +14,7 @@ #ifndef NODESORT_H #define NODESORT_H +#include "access/parallel.h" #include "nodes/execnodes.h" extern SortState *ExecInitSort(Sort *node, EState *estate, int eflags); @@ -22,4 +23,10 @@ extern void ExecSortMarkPos(SortState *node); extern void ExecSortRestrPos(SortState *node); extern void ExecReScanSort(SortState *node); -#endif /* NODESORT_H */ +/* parallel instrumentation support */ +extern void ExecSortEstimate(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeWorker(SortState *node, shm_toc *toc); +extern void ExecSortRetrieveInstrumentation(SortState *node); + +#endif /* NODESORT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1fdf29fe..cedcf547 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1965,6 +1965,16 @@ typedef struct ReDistributeState + ReDistributeBufferTotalSize * numWorkers * numWorkers) #endif +/* ---------------- + * Shared memory container for per-worker sort information + * ---------------- + */ +typedef struct SharedSortInfo +{ + int num_workers; + TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedSortInfo; + /* ---------------- * SortState information * ---------------- @@ -1979,6 +1989,8 @@ typedef struct SortState bool bounded_Done; /* value of bounded we did the sort with */ int64 bound_Done; /* value of bound we did the sort with */ void *tuplesortstate; /* private state of tuplesort.c */ + bool am_worker; /* are we a worker? */ + SharedSortInfo *shared_info; /* one entry per worker */ #ifdef __TBASE__ Size stateLen; ReDistributeState *state; diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 1b5ada2c..f3e81c70 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tuplesort.h - * Generalized tuple sorting routines. + * Generalized tuple sorting routines. * * This module handles sorting of heap tuples, index tuples, or single * Datums (and could easily support other kinds of sortable objects, @@ -35,6 +35,34 @@ struct ResponseCombiner; */ typedef struct Tuplesortstate Tuplesortstate; +/* + * Data structures for reporting sort statistics. Note that + * TuplesortInstrumentation can't contain any pointers because we + * sometimes put it in shared memory. 
+ */ +typedef enum +{ + SORT_TYPE_STILL_IN_PROGRESS = 0, + SORT_TYPE_TOP_N_HEAPSORT, + SORT_TYPE_QUICKSORT, + SORT_TYPE_EXTERNAL_SORT, + SORT_TYPE_EXTERNAL_MERGE +} TuplesortMethod; + +typedef enum +{ + SORT_SPACE_TYPE_DISK, + SORT_SPACE_TYPE_MEMORY +} TuplesortSpaceType; + +typedef struct TuplesortInstrumentation +{ + TuplesortMethod sortMethod; /* sort algorithm used */ + TuplesortSpaceType spaceType; /* type of space spaceUsed represents */ + long spaceUsed; /* space consumption, in kB */ +} TuplesortInstrumentation; + + /* * We provide multiple interfaces to what is essentially the same code, * since different callers have different data to be sorted and want to @@ -63,66 +91,66 @@ typedef struct Tuplesortstate Tuplesortstate; */ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess); + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess); + Relation indexRel, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess); + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 high_mask, - uint32 low_mask, - uint32 max_buckets, - int workMem, bool randomAccess); + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, - Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess); + Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess); #ifdef PGXC extern Tuplesortstate *tuplesort_begin_merge(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, - struct ResponseCombiner *combiner, - int workMem); + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, + struct ResponseCombiner *combiner, + int workMem); #endif extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); extern void tuplesort_puttupleslot(Tuplesortstate *state, - TupleTableSlot *slot); + TupleTableSlot *slot); extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup); extern void tuplesort_putindextuplevalues(Tuplesortstate *state, - Relation rel, ItemPointer self, - Datum *values, bool *isnull); + Relation rel, ItemPointer self, + Datum *values, bool *isnull); extern void tuplesort_putdatum(Tuplesortstate *state, Datum val, - bool isNull); + bool isNull); extern void tuplesort_performsort(Tuplesortstate *state); extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, - bool copy, TupleTableSlot *slot, Datum *abbrev); + bool copy, TupleTableSlot *slot, Datum *abbrev); extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward); extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward); extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, - Datum *val, bool *isNull, Datum *abbrev); + Datum *val, bool *isNull, Datum *abbrev); extern bool 
tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, - bool forward); + bool forward); extern void tuplesort_end(Tuplesortstate *state); extern void tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed); + TuplesortInstrumentation *stats); +extern const char *tuplesort_method_name(TuplesortMethod m); +extern const char *tuplesort_space_type_name(TuplesortSpaceType t); -extern int tuplesort_merge_order(int64 allowedMem); +extern int tuplesort_merge_order(int64 allowedMem); /* * These routines may only be called if randomAccess was specified 'true'. @@ -134,4 +162,4 @@ extern void tuplesort_rescan(Tuplesortstate *state); extern void tuplesort_markpos(Tuplesortstate *state); extern void tuplesort_restorepos(Tuplesortstate *state); -#endif /* TUPLESORT_H */ +#endif /* TUPLESORT_H */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c9dc3101..c573fbda 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1781,6 +1781,36 @@ DROP TABLE sub_t2; DROP TABLE sub_interfere1; DROP TABLE sub_interfere2; set enable_pullup_subquery to false; +-- Test that LIMIT can be pushed to SORT through a subquery that just projects +-- columns. We check for that having happened by looking to see if EXPLAIN +-- ANALYZE shows that a top-N sort was used. We must suppress or filter away +-- all the non-invariant parts of the EXPLAIN ANALYZE output. +-- +create table sq_limit (pk int primary key, c1 int, c2 int); +insert into sq_limit values + (1, 1, 1), + (2, 2, 2), + (3, 3, 3), + (4, 4, 4), + (5, 1, 1), + (6, 2, 2), + (7, 3, 3), + (8, 4, 4); +create function explain_sq_limit() returns setof text language plpgsql as +$$ +declare ln text; +begin + for ln in + explain (analyze, summary off, timing off, costs off) + select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 + loop + ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); + -- this case might occur if force_parallel_mode is on: + ln := regexp_replace(ln, 'Worker 0: Sort Method', 'Sort Method'); + return next ln; + end loop; +end; +$$; -- -- Tests for CTE inlining behavior -- @@ -2097,3 +2127,21 @@ from date_dim (0 rows) drop table catalog_sales, catalog_returns, date_dim; +-- not in optimization +create table notin_t1 (id1 int, num1 int not null); +create table notin_t2 (id2 int, num2 int not null); +explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on notin_t1 + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on notin_t2 +(6 rows) + +drop table notin_t1; +drop table notin_t2; +drop function explain_sq_limit(); +drop table sq_limit; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 256ddefa..8b5db9a2 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -728,6 +728,37 @@ DROP TABLE sub_t2; DROP TABLE sub_interfere1; DROP TABLE sub_interfere2; set enable_pullup_subquery to false; +-- Test that LIMIT can be pushed to SORT through a subquery that just projects +-- columns. We check for that having happened by looking to see if EXPLAIN +-- ANALYZE shows that a top-N sort was used. 
We must suppress or filter away +-- all the non-invariant parts of the EXPLAIN ANALYZE output. +-- +create table sq_limit (pk int primary key, c1 int, c2 int); +insert into sq_limit values + (1, 1, 1), + (2, 2, 2), + (3, 3, 3), + (4, 4, 4), + (5, 1, 1), + (6, 2, 2), + (7, 3, 3), + (8, 4, 4); + +create function explain_sq_limit() returns setof text language plpgsql as +$$ +declare ln text; +begin + for ln in + explain (analyze, summary off, timing off, costs off) + select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 + loop + ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); + -- this case might occur if force_parallel_mode is on: + ln := regexp_replace(ln, 'Worker 0: Sort Method', 'Sort Method'); + return next ln; + end loop; +end; +$$; -- -- Tests for CTE inlining behavior @@ -848,4 +879,14 @@ with cs as select 1 from date_dim join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); -drop table catalog_sales, catalog_returns, date_dim; \ No newline at end of file +drop table catalog_sales, catalog_returns, date_dim; + +-- not in optimization +create table notin_t1 (id1 int, num1 int not null); +create table notin_t2 (id2 int, num2 int not null); +explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); +drop table notin_t1; +drop table notin_t2; +drop function explain_sq_limit(); + +drop table sq_limit; From 283e8ce8559ce6ae0834e17b8acfefd898159b0b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Aug 2017 16:18:41 +0300 Subject: [PATCH 135/578] Use atomic ops to hand out pages to scan in parallel scan. With a lot of CPUs, the spinlock that protects the current scan location in a parallel scan can become a bottleneck. Use an atomic fetch-and-add instruction instead. David Rowley Discussion: https://www.postgresql.org/message-id/CAKJS1f9tgsPhqBcoPjv9_KUPZvTLCZ4jy%3DB%3DbhqgaKn7cYzm-w@mail.gmail.com --- src/backend/access/heap/heapam.c | 104 ++++++++++++++++++------------- src/include/access/relscan.h | 5 +- 2 files changed, 63 insertions(+), 46 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ab7920d8..5f6eb658 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -59,6 +59,7 @@ #include "catalog/namespace.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/atomics.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -106,6 +107,7 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool is_bitmapscan, bool is_samplescan, bool temp_snap); +static void heap_parallelscan_startblock_init(HeapScanDesc scan); static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -549,6 +551,8 @@ heapgettup(HeapScanDesc scan, } if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); /* Other processes might have already finished the scan. */ @@ -929,6 +933,8 @@ heapgettup_pagemode(HeapScanDesc scan, } if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); /* Other processes might have already finished the scan. 
*/ @@ -1744,14 +1750,10 @@ heap_rescan(HeapScanDesc scan, /* * Caller is responsible for making sure that all workers have - * finished the scan before calling this, so it really shouldn't be - * necessary to acquire the mutex at all. We acquire it anyway, just - * to be tidy. + * finished the scan before calling this. */ parallel_scan = scan->rs_parallel; - SpinLockAcquire(¶llel_scan->phs_mutex); - parallel_scan->phs_cblock = parallel_scan->phs_startblock; - SpinLockRelease(¶llel_scan->phs_mutex); + pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); } } @@ -1909,8 +1911,8 @@ heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation, !RelationUsesLocalBuffers(relation) && target->phs_nblocks > NBuffers / 4; SpinLockInit(&target->phs_mutex); - target->phs_cblock = InvalidBlockNumber; target->phs_startblock = InvalidBlockNumber; + pg_atomic_write_u64(&target->phs_nallocated, 0); SerializeSnapshot(snapshot, target->phs_snapshot_data); } @@ -1934,20 +1936,17 @@ heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan) } /* ---------------- - * heap_parallelscan_nextpage - get the next page to scan + * heap_parallelscan_startblock_init - find and set the scan's startblock * - * Get the next page to scan. Even if there are no pages left to scan, - * another backend could have grabbed a page to scan and not yet finished - * looking at it, so it doesn't follow that the scan is done when the - * first backend gets an InvalidBlockNumber return. + * Determine where the parallel seq scan should start. This function may + * be called many times, once by each parallel worker. We must be careful + * only to set the startblock once. * ---------------- */ -static BlockNumber -heap_parallelscan_nextpage(HeapScanDesc scan) -{// #lizard forgives - BlockNumber page = InvalidBlockNumber; +static void +heap_parallelscan_startblock_init(HeapScanDesc scan) +{ BlockNumber sync_startpage = InvalidBlockNumber; - BlockNumber report_page = InvalidBlockNumber; ParallelHeapScanDesc parallel_scan; Assert(scan->rs_parallel); @@ -1979,46 +1978,63 @@ heap_parallelscan_nextpage(HeapScanDesc scan) sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks); goto retry; } - parallel_scan->phs_cblock = parallel_scan->phs_startblock; + } + SpinLockRelease(¶llel_scan->phs_mutex); } - /* - * The current block number is the next one that needs to be scanned, - * unless it's InvalidBlockNumber already, in which case there are no more - * blocks to scan. After remembering the current value, we must advance - * it so that the next call to this function returns the next block to be - * scanned. +/* ---------------- + * heap_parallelscan_nextpage - get the next page to scan + * + * Get the next page to scan. Even if there are no pages left to scan, + * another backend could have grabbed a page to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the + * first backend gets an InvalidBlockNumber return. + * ---------------- */ - page = parallel_scan->phs_cblock; - if (page != InvalidBlockNumber) - { - parallel_scan->phs_cblock++; - if (parallel_scan->phs_cblock >= scan->rs_nblocks) - parallel_scan->phs_cblock = 0; - if (parallel_scan->phs_cblock == parallel_scan->phs_startblock) +static BlockNumber +heap_parallelscan_nextpage(HeapScanDesc scan) { - parallel_scan->phs_cblock = InvalidBlockNumber; - report_page = parallel_scan->phs_startblock; - } - } - - /* Release the lock. 
*/ - SpinLockRelease(¶llel_scan->phs_mutex); + BlockNumber page; + ParallelHeapScanDesc parallel_scan; + uint64 nallocated; + + Assert(scan->rs_parallel); + parallel_scan = scan->rs_parallel; + + /* + * phs_nallocated tracks how many pages have been allocated to workers + * already. When phs_nallocated >= rs_nblocks, all blocks have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_nallocated counter will exceed rs_nblocks, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_nblocks is close + * to 2^32. + * + * The actual page to return is calculated by adding the counter to the + * starting block number, modulo nblocks. + */ + nallocated = pg_atomic_fetch_add_u64(¶llel_scan->phs_nallocated, 1); + if (nallocated >= scan->rs_nblocks) + page = InvalidBlockNumber; /* all blocks have been allocated */ + else + page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks; /* * Report scan location. Normally, we report the current page number. * When we reach the end of the scan, though, we report the starting page, * not the ending page, just so the starting positions for later scans * doesn't slew backwards. We only report the position at the end of the - * scan once, though: subsequent callers will have report nothing, since - * they will have page == InvalidBlockNumber. + * scan once, though: subsequent callers will report nothing. */ if (scan->rs_syncscan) { - if (report_page == InvalidBlockNumber) - report_page = page; - if (report_page != InvalidBlockNumber) - ss_report_location(scan->rs_rd, report_page); + if (page != InvalidBlockNumber) + ss_report_location(scan->rs_rd, page); + else if (nallocated == scan->rs_nblocks) + ss_report_location(scan->rs_rd, parallel_scan->phs_startblock); } return page; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 35cf3f10..79e8cab7 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -97,9 +97,10 @@ typedef struct ParallelHeapScanDescData Oid phs_relid; /* OID of relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ BlockNumber phs_nblocks; /* # blocks in relation at start of scan */ - slock_t phs_mutex; /* mutual exclusion for block number fields */ + slock_t phs_mutex; /* mutual exclusion for setting startblock */ BlockNumber phs_startblock; /* starting block number */ - BlockNumber phs_cblock; /* current block number */ + pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to + * workers so far. */ char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelHeapScanDescData; From 80ceed7e71a5349323ad54d4325f104058e6e2cf Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 16 Nov 2017 17:28:11 -0800 Subject: [PATCH 136/578] Provide DSM segment to ExecXXXInitializeWorker functions. Previously, executor nodes running in parallel worker processes didn't have access to the dsm_segment object used for parallel execution. In order to support resource management based on DSM segment lifetime, they need that. So create a ParallelWorkerContext object to hold it and pass it to all InitializeWorker functions. 
Author: Thomas Munro Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com --- src/backend/executor/execParallel.c | 44 ++- src/backend/executor/nodeAgg.c | 3 +- src/backend/executor/nodeBitmapHeapscan.c | 5 +- src/backend/executor/nodeCustom.c | 284 +++++++------- src/backend/executor/nodeForeignscan.c | 7 +- src/backend/executor/nodeHashjoin.c | 6 +- src/backend/executor/nodeIndexonlyscan.c | 5 +- src/backend/executor/nodeIndexscan.c | 5 +- src/backend/executor/nodeSeqscan.c | 445 +++++++++++----------- src/backend/executor/nodeSort.c | 4 +- src/backend/pgxc/pool/execRemote.c | 8 +- src/include/access/parallel.h | 6 + src/include/executor/nodeAgg.h | 2 +- src/include/executor/nodeBitmapHeapscan.h | 8 +- src/include/executor/nodeCustom.h | 10 +- src/include/executor/nodeForeignscan.h | 8 +- src/include/executor/nodeHashjoin.h | 2 +- src/include/executor/nodeIndexonlyscan.h | 8 +- src/include/executor/nodeIndexscan.h | 18 +- src/include/executor/nodeSeqscan.h | 6 +- src/include/executor/nodeSort.h | 3 +- src/include/pgxc/execRemote.h | 4 +- src/tools/pgindent/typedefs.list | 1 + 23 files changed, 466 insertions(+), 426 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 45dc56fb..75a82009 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1093,8 +1093,8 @@ ExecParallelReportInstrumentation(PlanState *planstate, * is allocated and initialized by executor; that is, after ExecutorStart(). */ static bool -ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) -{// #lizard forgives +ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) +{ if (planstate == NULL) return false; @@ -1102,49 +1102,50 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) { case T_SeqScanState: if (planstate->plan->parallel_aware) - ExecSeqScanInitializeWorker((SeqScanState *) planstate, toc); + ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt); break; case T_IndexScanState: if (planstate->plan->parallel_aware) - ExecIndexScanInitializeWorker((IndexScanState *) planstate, toc); + ExecIndexScanInitializeWorker((IndexScanState *) planstate, + pwcxt); break; case T_IndexOnlyScanState: if (planstate->plan->parallel_aware) - ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, toc); + ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, + pwcxt); break; case T_ForeignScanState: if (planstate->plan->parallel_aware) ExecForeignScanInitializeWorker((ForeignScanState *) planstate, - toc); + pwcxt); break; case T_CustomScanState: if (planstate->plan->parallel_aware) ExecCustomScanInitializeWorker((CustomScanState *) planstate, - toc); + pwcxt); break; case T_BitmapHeapScanState: if (planstate->plan->parallel_aware) - ExecBitmapHeapInitializeWorker( - (BitmapHeapScanState *) planstate, toc); + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, pwcxt); break; case T_SortState: /* even when not parallel-aware */ - ExecSortInitializeWorker((SortState *) planstate, toc); + ExecSortInitializeWorker((SortState *) planstate, pwcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) - ReDistributeInitializeWorker(planstate, toc); + ReDistributeInitializeWorker(planstate, pwcxt); break; case T_RemoteQueryState: if (planstate->plan->parallel_aware) - ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, toc); + 
ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, pwcxt); break; case T_RemoteSubplanState: if (planstate->plan->parallel_aware) - ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, toc); - break; - case T_HashJoinState: - if (planstate->plan->parallel_aware) - ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); + ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecParallelHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); break; case T_AggState: if (planstate->plan->parallel_aware) @@ -1152,7 +1153,7 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) AggState *aggstate = (AggState *)planstate; if (aggstate->aggstrategy == AGG_HASHED) - ReDistributeInitializeWorker(planstate, toc); + ReDistributeInitializeWorker(planstate, pwcxt); } break; #endif @@ -1160,7 +1161,8 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) break; } - return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc); + return planstate_tree_walker(planstate, ExecParallelInitializeWorker, + pwcxt); } /* @@ -1189,6 +1191,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) int instrument_options = 0; void *area_space; dsa_area *area; + ParallelWorkerContext pwcxt; #ifdef __TBASE__ int i = 0; int nWorkers = 0; @@ -1320,6 +1323,9 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) } } #endif + pwcxt.toc = toc; + pwcxt.seg = seg; + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); /* Start up the executor */ ExecutorStart(queryDesc, 0); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d771a28e..9f5678f5 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -5435,10 +5435,11 @@ ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt) } void -ReDistributeInitializeWorker(PlanState *node, shm_toc *toc) +ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt) { int offset = 0; int i = 0; + shm_toc *toc = pwcxt->toc; ReDistributeState *state = NULL; ReDistributeState *rd_state = NULL; volatile ParallelWorkerStatus *numParallelWorkers = NULL; diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 2230a9c4..511dab7f 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -1061,12 +1061,13 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, * ---------------------------------------------------------------- */ void -ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc) +ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, + ParallelWorkerContext *pwcxt) { ParallelBitmapHeapState *pstate; Snapshot snapshot; - pstate = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->pstate = pstate; snapshot = RestoreSnapshot(pstate->phs_snapshot_data); diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c index 7f7d78a2..4640093c 100644 --- a/src/backend/executor/nodeCustom.c +++ b/src/backend/executor/nodeCustom.c @@ -1,7 +1,7 @@ /* ------------------------------------------------------------------------ * * nodeCustom.c - * Routines to handle execution of custom scan node + * Routines to handle execution of custom scan node * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development 
Group * Portions Copyright (c) 1994, Regents of the University of California @@ -31,51 +31,51 @@ static TupleTableSlot *ExecCustomScan(PlanState *pstate); CustomScanState * ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) { - CustomScanState *css; - Relation scan_rel = NULL; - Index scanrelid = cscan->scan.scanrelid; - Index tlistvarno; - - /* - * Allocate the CustomScanState object. We let the custom scan provider - * do the palloc, in case it wants to make a larger object that embeds - * CustomScanState as the first field. It must set the node tag and the - * methods field correctly at this time. Other standard fields should be - * set to zero. - */ - css = castNode(CustomScanState, - cscan->methods->CreateCustomScanState(cscan)); - - /* ensure flags is filled correctly */ - css->flags = cscan->flags; - - /* fill up fields of ScanState */ - css->ss.ps.plan = &cscan->scan.plan; - css->ss.ps.state = estate; - css->ss.ps.ExecProcNode = ExecCustomScan; - - /* create expression context for node */ - ExecAssignExprContext(estate, &css->ss.ps); - - /* initialize child expressions */ - css->ss.ps.qual = - ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); - - /* tuple table initialization */ - ExecInitScanTupleSlot(estate, &css->ss); - ExecInitResultTupleSlot(estate, &css->ss.ps); - - /* - * open the base relation, if any, and acquire an appropriate lock on it - */ - if (scanrelid > 0) - { - scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); - css->ss.ss_currentRelation = scan_rel; + CustomScanState *css; + Relation scan_rel = NULL; + Index scanrelid = cscan->scan.scanrelid; + Index tlistvarno; + + /* + * Allocate the CustomScanState object. We let the custom scan provider + * do the palloc, in case it wants to make a larger object that embeds + * CustomScanState as the first field. It must set the node tag and the + * methods field correctly at this time. Other standard fields should be + * set to zero. + */ + css = castNode(CustomScanState, + cscan->methods->CreateCustomScanState(cscan)); + + /* ensure flags is filled correctly */ + css->flags = cscan->flags; + + /* fill up fields of ScanState */ + css->ss.ps.plan = &cscan->scan.plan; + css->ss.ps.state = estate; + css->ss.ps.ExecProcNode = ExecCustomScan; + + /* create expression context for node */ + ExecAssignExprContext(estate, &css->ss.ps); + + /* initialize child expressions */ + css->ss.ps.qual = + ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); + + /* tuple table initialization */ + ExecInitScanTupleSlot(estate, &css->ss); + ExecInitResultTupleSlot(estate, &css->ss.ps); + + /* + * open the base relation, if any, and acquire an appropriate lock on it + */ + if (scanrelid > 0) + { + scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); + css->ss.ss_currentRelation = scan_rel; #ifdef _MLS_ mls_check_datamask_need_passby((ScanState*)css, scan_rel->rd_id); #endif - } + } else { #ifdef _MLS_ @@ -83,149 +83,165 @@ ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) #endif } - /* - * Determine the scan tuple type. If the custom scan provider provided a - * targetlist describing the scan tuples, use that; else use base - * relation's rowtype. 
- */ - if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) - { - TupleDesc scan_tupdesc; - - scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist, false); - ExecAssignScanType(&css->ss, scan_tupdesc); - /* Node's targetlist will contain Vars with varno = INDEX_VAR */ - tlistvarno = INDEX_VAR; - } - else - { - ExecAssignScanType(&css->ss, RelationGetDescr(scan_rel)); - /* Node's targetlist will contain Vars with varno = scanrelid */ - tlistvarno = scanrelid; - } - - /* - * Initialize result tuple type and projection info. - */ - ExecAssignResultTypeFromTL(&css->ss.ps); - ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); - - /* - * The callback of custom-scan provider applies the final initialization - * of the custom-scan-state node according to its logic. - */ - css->methods->BeginCustomScan(css, estate, eflags); - - return css; + /* + * Determine the scan tuple type. If the custom scan provider provided a + * targetlist describing the scan tuples, use that; else use base + * relation's rowtype. + */ + if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist, false); + ExecAssignScanType(&css->ss, scan_tupdesc); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + ExecAssignScanType(&css->ss, RelationGetDescr(scan_rel)); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&css->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); + + /* + * The callback of custom-scan provider applies the final initialization + * of the custom-scan-state node according to its logic. 
+ */ + css->methods->BeginCustomScan(css, estate, eflags); + + return css; } static TupleTableSlot * ExecCustomScan(PlanState *pstate) { - CustomScanState *node = castNode(CustomScanState, pstate); + CustomScanState *node = castNode(CustomScanState, pstate); - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - Assert(node->methods->ExecCustomScan != NULL); - return node->methods->ExecCustomScan(node); + Assert(node->methods->ExecCustomScan != NULL); + return node->methods->ExecCustomScan(node); } void ExecEndCustomScan(CustomScanState *node) { - Assert(node->methods->EndCustomScan != NULL); - node->methods->EndCustomScan(node); + Assert(node->methods->EndCustomScan != NULL); + node->methods->EndCustomScan(node); - /* Free the exprcontext */ - ExecFreeExprContext(&node->ss.ps); + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); - /* Clean out the tuple table */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* Clean out the tuple table */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); - /* Close the heap relation */ - if (node->ss.ss_currentRelation) - ExecCloseScanRelation(node->ss.ss_currentRelation); + /* Close the heap relation */ + if (node->ss.ss_currentRelation) + ExecCloseScanRelation(node->ss.ss_currentRelation); } void ExecReScanCustomScan(CustomScanState *node) { - Assert(node->methods->ReScanCustomScan != NULL); - node->methods->ReScanCustomScan(node); + Assert(node->methods->ReScanCustomScan != NULL); + node->methods->ReScanCustomScan(node); } void ExecCustomMarkPos(CustomScanState *node) { - if (!node->methods->MarkPosCustomScan) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("custom scan \"%s\" does not support MarkPos", - node->methods->CustomName))); - node->methods->MarkPosCustomScan(node); + if (!node->methods->MarkPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->MarkPosCustomScan(node); } void ExecCustomRestrPos(CustomScanState *node) { - if (!node->methods->RestrPosCustomScan) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("custom scan \"%s\" does not support MarkPos", - node->methods->CustomName))); - node->methods->RestrPosCustomScan(node); + if (!node->methods->RestrPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->RestrPosCustomScan(node); } void ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; - - if (methods->EstimateDSMCustomScan) - { - node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); - shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); - shm_toc_estimate_keys(&pcxt->estimator, 1); - } + const CustomExecMethods *methods = node->methods; + + if (methods->EstimateDSMCustomScan) + { + node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } } void ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->InitializeDSMCustomScan) - { - int plan_node_id = node->ss.ps.plan->plan_node_id; - void *coordinate; + if 
(methods->InitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; - coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); - methods->InitializeDSMCustomScan(node, pcxt, coordinate); - shm_toc_insert(pcxt->toc, plan_node_id, coordinate); - } + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + methods->InitializeDSMCustomScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } } void -ExecCustomScanInitializeWorker(CustomScanState *node, shm_toc *toc) +ExecCustomScanReInitializeDSM(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->InitializeWorkerCustomScan) - { - int plan_node_id = node->ss.ps.plan->plan_node_id; - void *coordinate; + if (methods->ReInitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; - coordinate = shm_toc_lookup(toc, plan_node_id, false); - methods->InitializeWorkerCustomScan(node, toc, coordinate); - } + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + methods->ReInitializeDSMCustomScan(node, pcxt, coordinate); + } +} + +void +ExecCustomScanInitializeWorker(CustomScanState *node, + ParallelWorkerContext *pwcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeWorkerCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + methods->InitializeWorkerCustomScan(node, pwcxt->toc, coordinate); + } } void ExecShutdownCustomScan(CustomScanState *node) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->ShutdownCustomScan) - methods->ShutdownCustomScan(node); + if (methods->ShutdownCustomScan) + methods->ShutdownCustomScan(node); } diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 07da53e3..69eeda5c 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -356,7 +356,8 @@ ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) * ---------------------------------------------------------------- */ void -ExecForeignScanInitializeWorker(ForeignScanState *node, shm_toc *toc) +ExecForeignScanInitializeWorker(ForeignScanState *node, + ParallelWorkerContext *pwcxt) { FdwRoutine *fdwroutine = node->fdwroutine; @@ -365,8 +366,8 @@ ExecForeignScanInitializeWorker(ForeignScanState *node, shm_toc *toc) int plan_node_id = node->ss.ps.plan->plan_node_id; void *coordinate; - coordinate = shm_toc_lookup(toc, plan_node_id, false); - fdwroutine->InitializeWorkerForeignScan(node, toc, coordinate); + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + fdwroutine->InitializeWorkerForeignScan(node, pwcxt->toc, coordinate); } } diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 6d57ae37..c6446cad 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -1422,14 +1422,14 @@ ExecParallelHashJoinInitializeDSM(HashJoinState *node, * ---------------------------------------------------------------- */ void -ExecParallelHashJoinInitWorker(HashJoinState *node, shm_toc *toc) +ExecParallelHashJoinInitWorker(HashJoinState *node, ParallelWorkerContext *pwcxt) { int offset = 0; ParallelHashJoinState *parallelState = NULL; volatile ParallelWorkerStatus 
*numParallelWorkers = NULL; - parallelState = shm_toc_lookup(toc, node->js.ps.plan->plan_node_id, false); - numParallelWorkers = GetParallelWorkerStatusInfo(toc); + parallelState = shm_toc_lookup(pwcxt->toc, node->js.ps.plan->plan_node_id, false); + numParallelWorkers = GetParallelWorkerStatusInfo(pwcxt->toc); node->hj_parallelState = (ParallelHashJoinState *)palloc0(sizeof(ParallelHashJoinState)); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 1ae02cf5..73df1306 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -734,11 +734,12 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, * ---------------------------------------------------------------- */ void -ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, shm_toc *toc) +ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, + ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; - piscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 9cda2201..23dfff75 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -1777,11 +1777,12 @@ ExecIndexScanInitializeDSM(IndexScanState *node, * ---------------------------------------------------------------- */ void -ExecIndexScanInitializeWorker(IndexScanState *node, shm_toc *toc) +ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; - piscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 42e2313f..e04a2be9 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -1,28 +1,28 @@ /*------------------------------------------------------------------------- * * nodeSeqscan.c - * Support routines for sequential scans of relations. + * Support routines for sequential scans of relations. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/backend/executor/nodeSeqscan.c + * src/backend/executor/nodeSeqscan.c * *------------------------------------------------------------------------- */ /* * INTERFACE ROUTINES - * ExecSeqScan sequentially scans a relation. - * ExecSeqNext retrieve next tuple in sequential order. - * ExecInitSeqScan creates and initializes a seqscan node. - * ExecEndSeqScan releases any storage allocated. - * ExecReScanSeqScan rescans the relation + * ExecSeqScan sequentially scans a relation. + * ExecSeqNext retrieve next tuple in sequential order. + * ExecInitSeqScan creates and initializes a seqscan node. + * ExecEndSeqScan releases any storage allocated. 
+ * ExecReScanSeqScan rescans the relation * - * ExecSeqScanEstimate estimates DSM space needed for parallel scan - * ExecSeqScanInitializeDSM initialize DSM for parallel scan - * ExecSeqScanInitializeWorker attach to DSM info in parallel worker + * ExecSeqScanEstimate estimates DSM space needed for parallel scan + * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -43,82 +43,82 @@ static bool InitScanRelation(SeqScanState *node, EState *estate, int eflags); static TupleTableSlot *SeqNext(SeqScanState *node); /* ---------------------------------------------------------------- - * Scan Support + * Scan Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * SeqNext + * SeqNext * - * This is a workhorse for ExecSeqScan + * This is a workhorse for ExecSeqScan * ---------------------------------------------------------------- */ static TupleTableSlot * SeqNext(SeqScanState *node) { - HeapTuple tuple; - HeapScanDesc scandesc; - EState *estate; - ScanDirection direction; - TupleTableSlot *slot; - - /* - * get information from the estate and scan state - */ - scandesc = node->ss.ss_currentScanDesc; - estate = node->ss.ps.state; - direction = estate->es_direction; - slot = node->ss.ss_ScanTupleSlot; - - if (scandesc == NULL) - { - /* - * We reach here if the scan is not parallel, or if we're executing a - * scan that was intended to be parallel serially. - */ - scandesc = heap_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); - if(enable_distri_print) - { - elog(LOG, "seq scan snapshot local %d start ts "INT64_FORMAT " rel %s", estate->es_snapshot->local, - estate->es_snapshot->start_ts, RelationGetRelationName(node->ss.ss_currentRelation)); - } - node->ss.ss_currentScanDesc = scandesc; - } + HeapTuple tuple; + HeapScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * get information from the estate and scan state + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the scan is not parallel, or if we're executing a + * scan that was intended to be parallel serially. + */ + scandesc = heap_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + if(enable_distri_print) + { + elog(LOG, "seq scan snapshot local %d start ts "INT64_FORMAT " rel %s", estate->es_snapshot->local, + estate->es_snapshot->start_ts, RelationGetRelationName(node->ss.ss_currentRelation)); + } + node->ss.ss_currentScanDesc = scandesc; + } - /* - * get the next tuple from the table - */ - tuple = heap_getnext(scandesc, direction); + /* + * get the next tuple from the table + */ + tuple = heap_getnext(scandesc, direction); - if(enable_distri_debug) - { - if(tuple) - { - scandesc->rs_scan_number++; - } - } + if(enable_distri_debug) + { + if(tuple) + { + scandesc->rs_scan_number++; + } + } - /* - * save the tuple and the buffer returned to us by the access methods in - * our scan tuple slot and return the slot. Note: we pass 'false' because - * tuples returned by heap_getnext() are pointers onto disk pages and were - * not created with palloc() and so should not be pfree()'d. 
Note also - * that ExecStoreTuple will increment the refcount of the buffer; the - * refcount will not be dropped until the tuple table slot is cleared. - */ - if (tuple) - ExecStoreTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - scandesc->rs_cbuf, /* buffer associated with this - * tuple */ - false); /* don't pfree this pointer */ - else - ExecClearTuple(slot); - - return slot; + /* + * save the tuple and the buffer returned to us by the access methods in + * our scan tuple slot and return the slot. Note: we pass 'false' because + * tuples returned by heap_getnext() are pointers onto disk pages and were + * not created with palloc() and so should not be pfree()'d. Note also + * that ExecStoreTuple will increment the refcount of the buffer; the + * refcount will not be dropped until the tuple table slot is cleared. + */ + if (tuple) + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->rs_cbuf, /* buffer associated with this + * tuple */ + false); /* don't pfree this pointer */ + else + ExecClearTuple(slot); + + return slot; } /* @@ -127,64 +127,64 @@ SeqNext(SeqScanState *node) static bool SeqRecheck(SeqScanState *node, TupleTableSlot *slot) { - /* - * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan - * (and this is very bad) - so, here we do not check are keys ok or not. - */ - return true; + /* + * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan + * (and this is very bad) - so, here we do not check are keys ok or not. + */ + return true; } /* ---------------------------------------------------------------- - * ExecSeqScan(node) + * ExecSeqScan(node) * - * Scans the relation sequentially and returns the next qualifying - * tuple. - * We call the ExecScan() routine and pass it the appropriate - * access method functions. + * Scans the relation sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecSeqScan(PlanState *pstate) { - SeqScanState *node = castNode(SeqScanState, pstate); + SeqScanState *node = castNode(SeqScanState, pstate); - return ExecScan(&node->ss, - (ExecScanAccessMtd) SeqNext, - (ExecScanRecheckMtd) SeqRecheck); + return ExecScan(&node->ss, + (ExecScanAccessMtd) SeqNext, + (ExecScanRecheckMtd) SeqRecheck); } /* ---------------------------------------------------------------- - * InitScanRelation + * InitScanRelation * - * Set up to access the scan relation. + * Set up to access the scan relation. * ---------------------------------------------------------------- */ static bool InitScanRelation(SeqScanState *node, EState *estate, int eflags) { - Relation currentRelation; + Relation currentRelation; - /* - * get the relation object id from the relid'th entry in the range table, - * open that relation and acquire appropriate lock on it. - */ + /* + * get the relation object id from the relid'th entry in the range table, + * open that relation and acquire appropriate lock on it. 
+ */ #ifdef __TBASE__ - /* if interval partition, scan child table instead */ - if(((SeqScan *) node->ss.ps.plan)->ispartchild) - { - currentRelation = ExecOpenScanRelationPartition(estate, - ((SeqScan *) node->ss.ps.plan)->scanrelid, - eflags, - ((SeqScan *) node->ss.ps.plan)->childidx); - } - else - { + /* if interval partition, scan child table instead */ + if(((SeqScan *) node->ss.ps.plan)->ispartchild) + { + currentRelation = ExecOpenScanRelationPartition(estate, + ((SeqScan *) node->ss.ps.plan)->scanrelid, + eflags, + ((SeqScan *) node->ss.ps.plan)->childidx); + } + else + { #endif - currentRelation = ExecOpenScanRelation(estate, - ((SeqScan *) node->ss.ps.plan)->scanrelid, - eflags); + currentRelation = ExecOpenScanRelation(estate, + ((SeqScan *) node->ss.ps.plan)->scanrelid, + eflags); #ifdef __TBASE__ - } + } #endif if (!currentRelation) @@ -196,56 +196,56 @@ InitScanRelation(SeqScanState *node, EState *estate, int eflags) mls_check_datamask_need_passby((ScanState*)node, currentRelation->rd_id); #endif - node->ss.ss_currentRelation = currentRelation; + node->ss.ss_currentRelation = currentRelation; - /* and report the scan tuple slot's rowtype */ - ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); + /* and report the scan tuple slot's rowtype */ + ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); return true; } /* ---------------------------------------------------------------- - * ExecInitSeqScan + * ExecInitSeqScan * --------------------------------------------------------------- */ SeqScanState * ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) { - SeqScanState *scanstate; + SeqScanState *scanstate; bool init_ret = true; #ifdef __AUDIT_FGA__ ListCell *item; #endif - /* - * Once upon a time it was possible to have an outerPlan of a SeqScan, but - * not any more. - */ - Assert(outerPlan(node) == NULL); - Assert(innerPlan(node) == NULL); - - /* - * create state structure - */ - scanstate = makeNode(SeqScanState); - scanstate->ss.ps.plan = (Plan *) node; - scanstate->ss.ps.state = estate; - scanstate->ss.ps.ExecProcNode = ExecSeqScan; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &scanstate->ss.ps); - - /* - * initialize child expressions - */ - scanstate->ss.ps.qual = - ExecInitQual(node->plan.qual, (PlanState *) scanstate); + /* + * Once upon a time it was possible to have an outerPlan of a SeqScan, but + * not any more. 
+ */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SeqScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSeqScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) scanstate); #ifdef __AUDIT_FGA__ if (enable_fga) @@ -268,15 +268,15 @@ ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) } #endif - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &scanstate->ss.ps); - ExecInitScanTupleSlot(estate, &scanstate->ss); + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &scanstate->ss.ps); + ExecInitScanTupleSlot(estate, &scanstate->ss); - /* - * initialize scan relation - */ + /* + * initialize scan relation + */ init_ret = InitScanRelation(scanstate, estate, eflags); if (!init_ret) { @@ -284,137 +284,138 @@ ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) return NULL; } - /* - * Initialize result tuple type and projection info. - */ - ExecAssignResultTypeFromTL(&scanstate->ss.ps); - ExecAssignScanProjectionInfo(&scanstate->ss); + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); - return scanstate; + return scanstate; } /* ---------------------------------------------------------------- - * ExecEndSeqScan + * ExecEndSeqScan * - * frees any storage allocated through C routines. + * frees any storage allocated through C routines. * ---------------------------------------------------------------- */ void ExecEndSeqScan(SeqScanState *node) { - Relation relation; - HeapScanDesc scanDesc; - - /* - * get information from node - */ - relation = node->ss.ss_currentRelation; - scanDesc = node->ss.ss_currentScanDesc; - - /* - * Free the exprcontext - */ - ExecFreeExprContext(&node->ss.ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); - - /* - * close heap scan - */ - if (scanDesc != NULL) - heap_endscan(scanDesc); - - /* - * close the heap relation. - */ - ExecCloseScanRelation(relation); + Relation relation; + HeapScanDesc scanDesc; + + /* + * get information from node + */ + relation = node->ss.ss_currentRelation; + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (scanDesc != NULL) + heap_endscan(scanDesc); + + /* + * close the heap relation. + */ + ExecCloseScanRelation(relation); } /* ---------------------------------------------------------------- - * Join Support + * Join Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * ExecReScanSeqScan + * ExecReScanSeqScan * - * Rescans the relation. + * Rescans the relation. 
* ---------------------------------------------------------------- */ void ExecReScanSeqScan(SeqScanState *node) { - HeapScanDesc scan; + HeapScanDesc scan; - scan = node->ss.ss_currentScanDesc; + scan = node->ss.ss_currentScanDesc; - if (scan != NULL) - heap_rescan(scan, /* scan desc */ - NULL); /* new scan keys */ + if (scan != NULL) + heap_rescan(scan, /* scan desc */ + NULL); /* new scan keys */ - ExecScanReScan((ScanState *) node); + ExecScanReScan((ScanState *) node); } /* ---------------------------------------------------------------- - * Parallel Scan Support + * Parallel Scan Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * ExecSeqScanEstimate + * ExecSeqScanEstimate * - * estimates the space required to serialize seqscan node. + * estimates the space required to serialize seqscan node. * ---------------------------------------------------------------- */ void ExecSeqScanEstimate(SeqScanState *node, - ParallelContext *pcxt) + ParallelContext *pcxt) { - EState *estate = node->ss.ps.state; + EState *estate = node->ss.ps.state; - node->pscan_len = heap_parallelscan_estimate(estate->es_snapshot); - shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); - shm_toc_estimate_keys(&pcxt->estimator, 1); + node->pscan_len = heap_parallelscan_estimate(estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); } /* ---------------------------------------------------------------- - * ExecSeqScanInitializeDSM + * ExecSeqScanInitializeDSM * - * Set up a parallel heap scan descriptor. + * Set up a parallel heap scan descriptor. * ---------------------------------------------------------------- */ void ExecSeqScanInitializeDSM(SeqScanState *node, - ParallelContext *pcxt) + ParallelContext *pcxt) { - EState *estate = node->ss.ps.state; - ParallelHeapScanDesc pscan; - - pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); - heap_parallelscan_initialize(pscan, - node->ss.ss_currentRelation, - estate->es_snapshot); - shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); - node->ss.ss_currentScanDesc = - heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); + EState *estate = node->ss.ps.state; + ParallelHeapScanDesc pscan; + + pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); + heap_parallelscan_initialize(pscan, + node->ss.ss_currentRelation, + estate->es_snapshot); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = + heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } /* ---------------------------------------------------------------- - * ExecSeqScanInitializeWorker + * ExecSeqScanInitializeWorker * - * Copy relevant information from TOC into planstate. + * Copy relevant information from TOC into planstate. 
* ---------------------------------------------------------------- */ void -ExecSeqScanInitializeWorker(SeqScanState *node, shm_toc *toc) +ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt) { - ParallelHeapScanDesc pscan; + ParallelHeapScanDesc pscan; - pscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); - node->ss.ss_currentScanDesc = - heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); + pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ss.ss_currentScanDesc = + heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 3c35d902..d891a645 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -513,10 +513,10 @@ ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) * ---------------------------------------------------------------- */ void -ExecSortInitializeWorker(SortState *node, shm_toc *toc) +ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt) { node->shared_info = - shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, true); + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); node->am_worker = true; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ab99d828..e3be03b8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12021,7 +12021,7 @@ ExecRemoteQueryInitializeDSM(RemoteQueryState *node, void ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, - shm_toc *toc) + ParallelWorkerContext *pwcxt) { int32 i = 0; int32 length = 0; @@ -12033,7 +12033,7 @@ ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, List *locla_exec_nodes = NULL; ListCell *node_list_item = NULL; - worker_status = GetParallelWorkerStatusInfo(toc); + worker_status = GetParallelWorkerStatusInfo(pwcxt->toc); worker_num = ExecGetForWorkerNumber(worker_status); node->parallel_status = worker_status; if (node->execOnAll) @@ -12083,7 +12083,7 @@ ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, } void ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, - shm_toc *toc) + ParallelWorkerContext *pwcxt) { int32 worker_num = 0; ParallelWorkerStatus *worker_status = NULL; @@ -12092,7 +12092,7 @@ ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, combiner = (ResponseCombiner *) node; step = (RemoteQuery *) combiner->ss.ps.plan; - worker_status = GetParallelWorkerStatusInfo(toc); + worker_status = GetParallelWorkerStatusInfo(pwcxt->toc); worker_num = ExecGetForWorkerNumber(worker_status); node->parallel_status = worker_status; worker_num = worker_num; /* keep compiler quiet. 
*/ diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 7f8c75be..2e258ca4 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -118,6 +118,12 @@ typedef struct ParallelWorkerStatus } ParallelWorkerStatus; #endif +typedef struct ParallelWorkerContext +{ + dsm_segment *seg; + shm_toc *toc; +} ParallelWorkerContext; + extern volatile bool ParallelMessagePending; extern int ParallelWorkerNumber; extern bool InitializingParallelWorker; diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index 1f5022aa..7336aaf9 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -38,7 +38,7 @@ extern void ReDistributeEstimate(PlanState *node, ParallelContext *pcxt); extern void ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt); -extern void ReDistributeInitializeWorker(PlanState *node, shm_toc *toc); +extern void ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt); extern void InitializeReDistribute(ReDistributeState *state, BufFile ***file); diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 81d2d40b..ab98a23b 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -21,10 +21,10 @@ extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node); extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node); extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); -#endif /* NODEBITMAPHEAPSCAN_H */ +#endif /* NODEBITMAPHEAPSCAN_H */ diff --git a/src/include/executor/nodeCustom.h b/src/include/executor/nodeCustom.h index 743b0bb4..ef99c01b 100644 --- a/src/include/executor/nodeCustom.h +++ b/src/include/executor/nodeCustom.h @@ -20,7 +20,7 @@ * General executor code */ extern CustomScanState *ExecInitCustomScan(CustomScan *custom_scan, - EState *estate, int eflags); + EState *estate, int eflags); extern void ExecEndCustomScan(CustomScanState *node); extern void ExecReScanCustomScan(CustomScanState *node); @@ -31,11 +31,11 @@ extern void ExecCustomRestrPos(CustomScanState *node); * Parallel execution support */ extern void ExecCustomScanEstimate(CustomScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecCustomScanInitializeDSM(CustomScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecCustomScanInitializeWorker(CustomScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecShutdownCustomScan(CustomScanState *node); -#endif /* NODECUSTOM_H */ +#endif /* NODECUSTOM_H */ diff --git a/src/include/executor/nodeForeignscan.h b/src/include/executor/nodeForeignscan.h index 6498b632..663bdf77 100644 --- a/src/include/executor/nodeForeignscan.h +++ b/src/include/executor/nodeForeignscan.h @@ -22,11 +22,11 @@ extern void ExecEndForeignScan(ForeignScanState *node); extern void ExecReScanForeignScan(ForeignScanState *node); extern void ExecForeignScanEstimate(ForeignScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecForeignScanInitializeDSM(ForeignScanState *node, - ParallelContext 
*pcxt); + ParallelContext *pcxt); extern void ExecForeignScanInitializeWorker(ForeignScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecShutdownForeignScan(ForeignScanState *node); -#endif /* NODEFOREIGNSCAN_H */ +#endif /* NODEFOREIGNSCAN_H */ diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h index 19a9a5d4..49b04cd3 100644 --- a/src/include/executor/nodeHashjoin.h +++ b/src/include/executor/nodeHashjoin.h @@ -31,7 +31,7 @@ extern void ExecParallelHashJoinEstimate(HashJoinState *node, ParallelContext *p extern void ExecParallelHashJoinInitializeDSM(HashJoinState *node, ParallelContext *pcxt); -extern void ExecParallelHashJoinInitWorker(HashJoinState *node, shm_toc *toc); +extern void ExecParallelHashJoinInitWorker(HashJoinState *node, ParallelWorkerContext *pwcxt); extern void ParallelHashJoinEreport(void); #endif diff --git a/src/include/executor/nodeIndexonlyscan.h b/src/include/executor/nodeIndexonlyscan.h index 7c904f25..8bb3a65c 100644 --- a/src/include/executor/nodeIndexonlyscan.h +++ b/src/include/executor/nodeIndexonlyscan.h @@ -25,10 +25,10 @@ extern void ExecReScanIndexOnlyScan(IndexOnlyScanState *node); /* Support functions for parallel index-only scans */ extern void ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); -#endif /* NODEINDEXONLYSCAN_H */ +#endif /* NODEINDEXONLYSCAN_H */ diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index 17390c2e..ae0f4480 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -24,21 +24,23 @@ extern void ExecIndexRestrPos(IndexScanState *node); extern void ExecReScanIndexScan(IndexScanState *node); extern void ExecIndexScanEstimate(IndexScanState *node, ParallelContext *pcxt); extern void ExecIndexScanInitializeDSM(IndexScanState *node, ParallelContext *pcxt); -extern void ExecIndexScanInitializeWorker(IndexScanState *node, shm_toc *toc); +extern void ExecIndexScanReInitializeDSM(IndexScanState *node, ParallelContext *pcxt); +extern void ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt); /* * These routines are exported to share code with nodeIndexonlyscan.c and * nodeBitmapIndexscan.c */ extern void ExecIndexBuildScanKeys(PlanState *planstate, Relation index, - List *quals, bool isorderby, - ScanKey *scanKeys, int *numScanKeys, - IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, - IndexArrayKeyInfo **arrayKeys, int *numArrayKeys); + List *quals, bool isorderby, + ScanKey *scanKeys, int *numScanKeys, + IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys); extern void ExecIndexEvalRuntimeKeys(ExprContext *econtext, - IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys); + IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys); extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, - IndexArrayKeyInfo *arrayKeys, int numArrayKeys); + IndexArrayKeyInfo *arrayKeys, int numArrayKeys); extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys); -#endif /* NODEINDEXSCAN_H */ +#endif /* NODEINDEXSCAN_H */ diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h 
index aa2653c1..ee3b1a0b 100644 --- a/src/include/executor/nodeSeqscan.h +++ b/src/include/executor/nodeSeqscan.h @@ -24,6 +24,8 @@ extern void ExecReScanSeqScan(SeqScanState *node); /* parallel scan support */ extern void ExecSeqScanEstimate(SeqScanState *node, ParallelContext *pcxt); extern void ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt); -extern void ExecSeqScanInitializeWorker(SeqScanState *node, shm_toc *toc); +extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt); +extern void ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt); -#endif /* NODESEQSCAN_H */ +#endif /* NODESEQSCAN_H */ diff --git a/src/include/executor/nodeSort.h b/src/include/executor/nodeSort.h index 77ac0659..cc61a9db 100644 --- a/src/include/executor/nodeSort.h +++ b/src/include/executor/nodeSort.h @@ -26,7 +26,8 @@ extern void ExecReScanSort(SortState *node); /* parallel instrumentation support */ extern void ExecSortEstimate(SortState *node, ParallelContext *pcxt); extern void ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt); -extern void ExecSortInitializeWorker(SortState *node, shm_toc *toc); +extern void ExecSortReInitializeDSM(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt); extern void ExecSortRetrieveInstrumentation(SortState *node); #endif /* NODESORT_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 20e73f2e..03b16a62 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -413,9 +413,9 @@ extern void set_dbcleanup_callback(xact_callback function, void *paraminfo, int extern void ExecRemoteSubPlanInitializeDSM(RemoteSubplanState *node, ParallelContext *pcxt); extern void ExecRemoteQueryInitializeDSM(RemoteQueryState *node, ParallelContext *pcxt); extern void ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern bool ExecRemoteDML(ModifyTableState *mtstate, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *planSlot, EState *estate, EPQState *epqstate, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8166d86c..dedefbdf 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1530,6 +1530,7 @@ ParallelHeapScanDesc ParallelIndexScanDesc ParallelSlot ParallelState +ParallelWorkerContext ParallelWorkerInfo Param ParamExecData From 6b440a79956653a382210e675233b2df4145131c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 30 Aug 2017 13:18:16 -0400 Subject: [PATCH 137/578] Separate reinitialization of shared parallel-scan state from ExecReScan. Previously, the parallel executor logic did reinitialization of shared state within the ExecReScan code for parallel-aware scan nodes. This is problematic, because it means that the ExecReScan call has to occur synchronously (ie, during the parent Gather node's ReScan call). That is swimming very much against the tide so far as the ExecReScan machinery is concerned; the fact that it works at all today depends on a lot of fragile assumptions, such as that no plan node between Gather and a parallel-aware scan node is parameterized. 
Another objection is that because ExecReScan might be called in workers as well as the leader, hacky extra tests are needed in some places to prevent unwanted shared-state resets. Hence, let's separate this code into two functions, a ReInitializeDSM call and the ReScan call proper. ReInitializeDSM is called only in the leader and is guaranteed to run before we start new workers. ReScan is returned to its traditional function of resetting only local state, which means that ExecReScan's usual habits of delaying or eliminating child rescan calls are safe again. As with the preceding commit 7df2c1f8d, it doesn't seem to be necessary to make these changes in 9.6, which is a good thing because the FDW and CustomScan APIs are impacted. Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com --- doc/src/sgml/custom-scan.sgml | 29 ++- doc/src/sgml/fdwhandler.sgml | 43 +++- src/backend/access/heap/heapam.c | 28 ++- src/backend/executor/execParallel.c | 97 ++++++-- src/backend/executor/nodeBitmapHeapscan.c | 42 ++-- src/backend/executor/nodeForeignscan.c | 23 +- src/backend/executor/nodeGather.c | 42 +++- src/backend/executor/nodeGatherMerge.c | 41 +++- src/backend/executor/nodeIndexonlyscan.c | 29 ++- src/backend/executor/nodeIndexscan.c | 40 ++-- src/backend/executor/nodeSeqscan.c | 16 ++ src/backend/executor/nodeSort.c | 17 ++ src/include/access/heapam.h | 113 +++++----- src/include/executor/execParallel.h | 25 ++- src/include/executor/nodeBitmapHeapscan.h | 2 + src/include/executor/nodeCustom.h | 2 + src/include/executor/nodeForeignscan.h | 2 + src/include/executor/nodeIndexonlyscan.h | 2 + src/include/foreign/fdwapi.h | 260 +++++++++++----------- src/include/nodes/extensible.h | 121 +++++----- 20 files changed, 606 insertions(+), 368 deletions(-) diff --git a/doc/src/sgml/custom-scan.sgml b/doc/src/sgml/custom-scan.sgml index 6159c3a2..9d1ca7bf 100644 --- a/doc/src/sgml/custom-scan.sgml +++ b/doc/src/sgml/custom-scan.sgml @@ -320,22 +320,39 @@ void (*InitializeDSMCustomScan) (CustomScanState *node, void *coordinate); Initialize the dynamic shared memory that will be required for parallel - operation; coordinate points to an amount of allocated space - equal to the return value of EstimateDSMCustomScan. + operation. coordinate points to a shared memory area of + size equal to the return value of EstimateDSMCustomScan. This callback is optional, and need only be supplied if this custom scan provider supports parallel execution. +void (*ReInitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + + Re-initialize the dynamic shared memory required for parallel operation + when the custom-scan plan node is about to be re-scanned. + This callback is optional, and need only be supplied if this custom + scan provider supports parallel execution. + Recommended practice is that this callback reset only shared state, + while the ReScanCustomScan callback resets only local + state. Currently, this callback will be called + before ReScanCustomScan, but it's best not to rely on + that ordering. + + + + void (*InitializeWorkerCustomScan) (CustomScanState *node, shm_toc *toc, void *coordinate); - Initialize a parallel worker's custom state based on the shared state - set up in the leader by InitializeDSMCustomScan. - This callback is optional, and needs only be supplied if this - custom path supports parallel execution. 
+ Initialize a parallel worker's local state based on the shared state + set up by the leader during InitializeDSMCustomScan. + This callback is optional, and need only be supplied if this custom + scan provider supports parallel execution. diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index dbeaab55..cfa68084 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -1191,12 +1191,12 @@ ImportForeignSchema (ImportForeignSchemaStmt *stmt, Oid serverOid); A ForeignScan node can, optionally, support parallel execution. A parallel ForeignScan will be executed - in multiple processes and should return each row only once across + in multiple processes and must return each row exactly once across all cooperating processes. To do this, processes can coordinate through - fixed size chunks of dynamic shared memory. This shared memory is not - guaranteed to be mapped at the same address in every process, so pointers - may not be used. The following callbacks are all optional in general, - but required if parallel execution is to be supported. + fixed-size chunks of dynamic shared memory. This shared memory is not + guaranteed to be mapped at the same address in every process, so it + must not contain pointers. The following functions are all optional, + but most are required if parallel execution is to be supported. @@ -1215,7 +1215,7 @@ IsForeignScanParallelSafe(PlannerInfo *root, RelOptInfo *rel, - If this callback is not defined, it is assumed that the scan must take + If this function is not defined, it is assumed that the scan must take place within the parallel leader. Note that returning true does not mean that the scan itself can be done in parallel, only that the scan can be performed within a parallel worker. Therefore, it can be useful to define @@ -1230,6 +1230,9 @@ EstimateDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt); Estimate the amount of dynamic shared memory that will be required for parallel operation. This may be higher than the amount that will actually be used, but it must not be lower. The return value is in bytes. + This function is optional, and can be omitted if not needed; but if it + is omitted, the next three functions must be omitted as well, because + no shared memory will be allocated for the FDW's use. @@ -1239,8 +1242,25 @@ InitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt, void *coordinate); Initialize the dynamic shared memory that will be required for parallel - operation; coordinate points to an amount of allocated space - equal to the return value of EstimateDSMForeignScan. + operation. coordinate points to a shared memory area of + size equal to the return value of EstimateDSMForeignScan. + This function is optional, and can be omitted if not needed. + + + + +void +ReInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt, + void *coordinate); + + Re-initialize the dynamic shared memory required for parallel operation + when the foreign-scan plan node is about to be re-scanned. + This function is optional, and can be omitted if not needed. + Recommended practice is that this function reset only shared state, + while the ReScanForeignScan function resets only local + state. Currently, this function will be called + before ReScanForeignScan, but it's best not to rely on + that ordering. 
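For concreteness, the four FDW callbacks documented in this hunk might be wired up as in the minimal sketch below, where the workers hand out chunks of a remote result through one shared atomic counter. Only the callback signatures come from fdwapi.h; the DemoFdwParallelState struct, the next_chunk field, and the demo* function names are hypothetical, and a real FDW would of course carry more state.

#include "postgres.h"

#include "access/parallel.h"
#include "foreign/fdwapi.h"
#include "port/atomics.h"
#include "storage/shm_toc.h"

/* Hypothetical shared state placed in the DSM area sized below. */
typedef struct DemoFdwParallelState
{
    pg_atomic_uint64 next_chunk;    /* next chunk of the remote result to hand out */
} DemoFdwParallelState;

static Size
demoEstimateDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt)
{
    return sizeof(DemoFdwParallelState);
}

static void
demoInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt,
                             void *coordinate)
{
    DemoFdwParallelState *pstate = (DemoFdwParallelState *) coordinate;

    pg_atomic_init_u64(&pstate->next_chunk, 0);
    node->fdw_state = pstate;           /* leader also scans through the shared state */
}

static void
demoReInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt,
                               void *coordinate)
{
    /* Reset only shared state; local state is reset by ReScanForeignScan. */
    DemoFdwParallelState *pstate = (DemoFdwParallelState *) coordinate;

    pg_atomic_write_u64(&pstate->next_chunk, 0);
}

static void
demoInitializeWorkerForeignScan(ForeignScanState *node, shm_toc *toc,
                                void *coordinate)
{
    /* Attach the worker's local executor state to the leader-created area. */
    node->fdw_state = coordinate;
}

The worker-side callback (documented in the next hunk) deliberately does no resetting of its own: by the time a worker attaches, the leader has already run ReInitializeDSMForeignScan if this is a rescan.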
@@ -1249,10 +1269,9 @@ void InitializeWorkerForeignScan(ForeignScanState *node, shm_toc *toc, void *coordinate); - Initialize a parallel worker's custom state based on the shared state - set up in the leader by InitializeDSMForeignScan. - This callback is optional, and needs only be supplied if this - custom path supports parallel execution. + Initialize a parallel worker's local state based on the shared state + set up by the leader during InitializeDSMForeignScan. + This function is optional, and can be omitted if not needed. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5f6eb658..97064050 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1740,21 +1740,6 @@ heap_rescan(HeapScanDesc scan, * reinitialize scan descriptor */ initscan(scan, key, true); - - /* - * reset parallel scan, if present - */ - if (scan->rs_parallel != NULL) - { - ParallelHeapScanDesc parallel_scan; - - /* - * Caller is responsible for making sure that all workers have - * finished the scan before calling this. - */ - parallel_scan = scan->rs_parallel; - pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); - } } /* ---------------- @@ -1916,6 +1901,19 @@ heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation, SerializeSnapshot(snapshot, target->phs_snapshot_data); } +/* ---------------- + * heap_parallelscan_reinitialize - reset a parallel scan + * + * Call this in the leader process. Caller is responsible for + * making sure that all workers have finished the scan beforehand. + * ---------------- + */ +void +heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan) +{ + pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); +} + /* ---------------- * heap_beginscan_parallel - join a parallel scan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 75a82009..5ec13c1a 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -144,6 +144,8 @@ static bool ExecParallelInitializeDSM(PlanState *node, ExecParallelInitializeDSMContext *d); static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize); +static bool ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt); static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation); @@ -415,7 +417,6 @@ ExecParallelInitializeDSM(PlanState *planstate, } break; #endif - default: break; } @@ -475,18 +476,6 @@ ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize) return responseq; } -/* - * Re-initialize the parallel executor info such that it can be reused by - * workers. - */ -void -ExecParallelReinitialize(ParallelExecutorInfo *pei) -{ - ReinitializeParallelDSM(pei->pcxt); - pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); - pei->finished = false; -} - /* * Sets up the required infrastructure for backend workers to perform * execution and return results to the main backend. @@ -875,7 +864,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) ExecParallelInitializeDSM(planstate, &d); /* - * Make sure that the world hasn't shifted under our feat. This could + * Make sure that the world hasn't shifted under our feet. This could * probably just be an Assert(), but let's be conservative for now. 
*/ if (e.nnodes != d.nnodes) @@ -885,6 +874,82 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) return pei; } +/* + * Re-initialize the parallel executor shared memory state before launching + * a fresh batch of workers. + */ +void +ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei) +{ + /* Old workers must already be shut down */ + Assert(pei->finished); + + ReinitializeParallelDSM(pei->pcxt); + pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); + pei->finished = false; + + /* Traverse plan tree and let each child node reset associated state. */ + ExecParallelReInitializeDSM(planstate, pei->pcxt); +} + +/* + * Traverse plan tree to reinitialize per-node dynamic shared memory state + */ +static bool +ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt) +{ + if (planstate == NULL) + return false; + + /* + * Call reinitializers for DSM-using plan nodes. + */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanReInitializeDSM((SeqScanState *) planstate, + pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanReInitializeDSM((IndexScanState *) planstate, + pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanReInitializeDSM((IndexOnlyScanState *) planstate, + pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanReInitializeDSM((ForeignScanState *) planstate, + pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanReInitializeDSM((CustomScanState *) planstate, + pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, + pcxt); + break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortReInitializeDSM((SortState *) planstate, pcxt); + break; + + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt); +} + /* * Copy instrumentation information about this node and its descendants from * dynamic shared memory. 
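To make the new control flow concrete, the leader-side pattern that the Gather and GatherMerge hunks below adopt boils down to the following sketch. It is condensed and illustrative only: TBase-specific arguments, instrumentation, and error handling are omitted, and the variable names (gatherstate, gather, estate) stand for whatever the calling node has in scope.

    /* First ExecProcNode call on the Gather node after a (re)scan: */
    if (gatherstate->pei == NULL)
        gatherstate->pei = ExecInitParallelPlan(outerPlanState(gatherstate),
                                                estate,
                                                gather->num_workers);
    else
        ExecParallelReinitialize(outerPlanState(gatherstate),
                                 gatherstate->pei);

    LaunchParallelWorkers(gatherstate->pei->pcxt);

    /*
     * ExecParallelReinitialize() resets the parallel context and tuple
     * queues, then walks the plan tree so every parallel-aware node can
     * reset its *shared* DSM state through its ReInitializeDSM callback.
     * Each node's ordinary ExecReScan continues to reset only
     * backend-local state, so the two halves can no longer interfere.
     */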
@@ -1325,14 +1390,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) #endif pwcxt.toc = toc; pwcxt.seg = seg; - ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); - /* Start up the executor */ ExecutorStart(queryDesc, 0); /* Special executor initialization steps for parallel workers */ queryDesc->planstate->state->es_query_dsa = area; - ExecParallelInitializeWorker(queryDesc->planstate, toc); + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); /* Run the plan */ ExecutorRun(queryDesc, ForwardScanDirection, 0L, true); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 511dab7f..7973d3b2 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -716,23 +716,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->shared_tbmiterator = NULL; node->shared_prefetch_iterator = NULL; - /* Reset parallel bitmap state, if present */ - if (node->pstate) - { - dsa_area *dsa = node->ss.ps.state->es_query_dsa; - - node->pstate->state = BM_INITIAL; - - if (DsaPointerIsValid(node->pstate->tbmiterator)) - tbm_free_shared_area(dsa, node->pstate->tbmiterator); - - if (DsaPointerIsValid(node->pstate->prefetch_iterator)) - tbm_free_shared_area(dsa, node->pstate->prefetch_iterator); - - node->pstate->tbmiterator = InvalidDsaPointer; - node->pstate->prefetch_iterator = InvalidDsaPointer; - } - ExecScanReScan(&node->ss); /* @@ -1054,6 +1037,31 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, node->pstate = pstate; } +/* ---------------------------------------------------------------- + * ExecBitmapHeapReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + pstate->state = BM_INITIAL; + + if (DsaPointerIsValid(pstate->tbmiterator)) + tbm_free_shared_area(dsa, pstate->tbmiterator); + + if (DsaPointerIsValid(pstate->prefetch_iterator)) + tbm_free_shared_area(dsa, pstate->prefetch_iterator); + + pstate->tbmiterator = InvalidDsaPointer; + pstate->prefetch_iterator = InvalidDsaPointer; +} + /* ---------------------------------------------------------------- * ExecBitmapHeapInitializeWorker * diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 69eeda5c..314ab881 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -350,7 +350,28 @@ ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) } /* ---------------------------------------------------------------- - * ExecForeignScanInitializeDSM + * ExecForeignScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. 
+ * ---------------------------------------------------------------- + */ +void +ExecForeignScanReInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ReInitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + fdwroutine->ReInitializeDSMForeignScan(node, pcxt, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeWorker * * Initialization according to the parallel coordination information * ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c index 4154b5d3..9c63e4eb 100644 --- a/src/backend/executor/nodeGather.c +++ b/src/backend/executor/nodeGather.c @@ -177,7 +177,7 @@ ExecGather(PlanState *pstate) ParallelContext *pcxt; ParallelWorkerStatus *num_parallel_workers = NULL; - /* Initialize the workers required to execute Gather node. */ + /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) #ifdef __TBASE__ node->pei = ExecInitParallelPlan(node->ps.lefttree, @@ -189,6 +189,10 @@ ExecGather(PlanState *pstate) estate, gather->num_workers); #endif + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei); + /* * Register backend workers. We might not get as many as we * requested, or indeed any at all. @@ -527,7 +531,7 @@ ExecShutdownGather(GatherState *node) /* ---------------------------------------------------------------- * ExecReScanGather * - * Re-initialize the workers and rescans a relation via them. + * Prepare to re-scan the result of a Gather. * ---------------------------------------------------------------- */ void @@ -539,14 +543,46 @@ ExecReScanGather(GatherState *node) * to propagate any error or other information to master backend before * dying. Parallel context will be reused for rescan. */ + Gather *gather = (Gather *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherWorkers(node); + /* Mark node so that shared state will be rebuilt at next call */ node->initialized = false; if (node->pei) - ExecParallelReinitialize(node->pei); + ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); +#if 0 + ======= + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gather->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gather->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. 
+ */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +>>>>>>> 41b0dd987d... Separate reinitialization of shared parallel-scan state from ExecReScan. +#endif } #ifdef __TBASE__ void diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c index 120cbc91..291cf644 100644 --- a/src/backend/executor/nodeGatherMerge.c +++ b/src/backend/executor/nodeGatherMerge.c @@ -197,7 +197,7 @@ ExecGatherMerge(PlanState *pstate) ParallelWorkerStatus *num_parallel_workers = NULL; #endif - /* Initialize data structures for workers. */ + /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) #ifdef __TBASE__ node->pei = ExecInitParallelPlan(node->ps.lefttree, @@ -209,6 +209,9 @@ ExecGatherMerge(PlanState *pstate) estate, gm->num_workers); #endif + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei); /* Try to launch workers. */ pcxt = node->pei->pcxt; @@ -390,7 +393,7 @@ ExecShutdownGatherMergeWorkers(GatherMergeState *node) /* ---------------------------------------------------------------- * ExecReScanGatherMerge * - * Re-initialize the workers and rescans a relation via them. + * Prepare to re-scan the result of a GatherMerge. * ---------------------------------------------------------------- */ void @@ -402,14 +405,46 @@ ExecReScanGatherMerge(GatherMergeState *node) * to propagate any error or other information to master backend before * dying. Parallel context will be reused for rescan. */ + GatherMerge *gm = (GatherMerge *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherMergeWorkers(node); + /* Mark node so that shared state will be rebuilt at next call */ node->initialized = false; if (node->pei) - ExecParallelReinitialize(node->pei); + ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); +#if 0 +======= + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gm->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gm->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +>>>>>>> 41b0dd987d... Separate reinitialization of shared parallel-scan state from ExecReScan. 
+#endif } /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 73df1306..2b6d4d61 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -25,6 +25,7 @@ * parallel index-only scan * ExecIndexOnlyScanInitializeDSM initialize DSM for parallel * index-only scan + * ExecIndexOnlyScanReInitializeDSM reinitialize DSM for fresh scan * ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -347,16 +348,6 @@ ExecIndexOnlyScan(PlanState *pstate) void ExecReScanIndexOnlyScan(IndexOnlyScanState *node) { - bool reset_parallel_scan = true; - - /* - * If we are here to just update the scan keys, then don't reset parallel - * scan. For detailed reason behind this look in the comments for - * ExecReScanIndexScan. - */ - if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) - reset_parallel_scan = false; - /* * If we are doing runtime key calculations (ie, any of the index key * values weren't simple Consts), compute the new key values. But first, @@ -377,15 +368,10 @@ ExecReScanIndexOnlyScan(IndexOnlyScanState *node) /* reset index scan */ if (node->ioss_ScanDesc) - { - index_rescan(node->ioss_ScanDesc, node->ioss_ScanKeys, node->ioss_NumScanKeys, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); - if (reset_parallel_scan && node->ioss_ScanDesc->parallel_scan) - index_parallelrescan(node->ioss_ScanDesc); - } ExecScanReScan(&node->ss); } @@ -727,6 +713,19 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->ioss_ScanDesc); +} + /* ---------------------------------------------------------------- * ExecIndexOnlyScanInitializeWorker * diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 23dfff75..9e0307a1 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -24,6 +24,7 @@ * ExecIndexRestrPos restores scan position. * ExecIndexScanEstimate estimates DSM space needed for parallel index scan * ExecIndexScanInitializeDSM initialize DSM for parallel indexscan + * ExecIndexScanReInitializeDSM reinitialize DSM for fresh scan * ExecIndexScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -588,19 +589,7 @@ ExecIndexScan(PlanState *pstate) */ void ExecReScanIndexScan(IndexScanState *node) -{// #lizard forgives - bool reset_parallel_scan = true; - - /* - * If we are here to just update the scan keys, then don't reset parallel - * scan. We don't want each of the participating process in the parallel - * scan to update the shared parallel scan state at the start of the scan. - * It is quite possible that one of the participants has already begun - * scanning the index when another has yet to start it. - */ - if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady) - reset_parallel_scan = false; - +{ /* * If we are doing runtime key calculations (ie, any of the index key * values weren't simple Consts), compute the new key values. 
But first, @@ -626,21 +615,11 @@ ExecReScanIndexScan(IndexScanState *node) reorderqueue_pop(node); } - /* - * Reset (parallel) index scan. For parallel-aware nodes, the scan - * descriptor is initialized during actual execution of node and we can - * reach here before that (ex. during execution of nest loop join). So, - * avoid updating the scan descriptor at that time. - */ + /* reset index scan */ if (node->iss_ScanDesc) - { index_rescan(node->iss_ScanDesc, node->iss_ScanKeys, node->iss_NumScanKeys, node->iss_OrderByKeys, node->iss_NumOrderByKeys); - - if (reset_parallel_scan && node->iss_ScanDesc->parallel_scan) - index_parallelrescan(node->iss_ScanDesc); - } node->iss_ReachedEnd = false; ExecScanReScan(&node->ss); @@ -1770,6 +1749,19 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_OrderByKeys, node->iss_NumOrderByKeys); } +/* ---------------------------------------------------------------- + * ExecIndexScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanReInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->iss_ScanDesc); +} + /* ---------------------------------------------------------------- * ExecIndexScanInitializeWorker * diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index e04a2be9..a55b9cbd 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -22,6 +22,7 @@ * * ExecSeqScanEstimate estimates DSM space needed for parallel scan * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanReInitializeDSM reinitialize DSM for fresh parallel scan * ExecSeqScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -403,6 +404,21 @@ ExecSeqScanInitializeDSM(SeqScanState *node, heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } +/* ---------------------------------------------------------------- + * ExecSeqScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanReInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + HeapScanDesc scan = node->ss.ss_currentScanDesc; + + heap_parallelscan_reinitialize(scan->rs_parallel); +} + /* ---------------------------------------------------------------- * ExecSeqScanInitializeWorker * diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index d891a645..2dd4bf89 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -506,6 +506,23 @@ ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) node->shared_info); } +/* ---------------------------------------------------------------- + * ExecSortReInitializeDSM + * + * Reset shared state before beginning a fresh scan. 
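The seq scan hook above relies on the new heap_parallelscan_reinitialize(), declared in heapam.h later in this patch but with its body outside these hunks. Against the stock PostgreSQL 10 heap code it presumably just rewinds the shared block allocator, roughly as sketched here (phs_cblock and phs_startblock are the field names in the unmodified ParallelHeapScanDescData; the actual TBase implementation may differ):

void
heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
{
    /*
     * Rewind the shared position so the next batch of workers starts the
     * scan over.  No spinlock is taken: the leader calls this only after
     * the previous workers have exited and before new ones are launched.
     */
    parallel_scan->phs_cblock = parallel_scan->phs_startblock;
}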
+ * ---------------------------------------------------------------- + */ +void +ExecSortReInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + /* If there's any instrumentation space, clear it for next time */ + if (node->shared_info != NULL) + { + memset(node->shared_info->sinstrument, 0, + node->shared_info->num_workers * sizeof(TuplesortInstrumentation)); + } +} + /* ---------------------------------------------------------------- * ExecSortInitializeWorker * diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 33914a93..6cec82fb 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * heapam.h - * POSTGRES heap access method definitions. + * POSTGRES heap access method definitions. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -25,9 +25,9 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL 0x0001 -#define HEAP_INSERT_SKIP_FSM 0x0002 -#define HEAP_INSERT_FROZEN 0x0004 +#define HEAP_INSERT_SKIP_WAL 0x0001 +#define HEAP_INSERT_SKIP_FSM 0x0002 +#define HEAP_INSERT_FROZEN 0x0004 #define HEAP_INSERT_SPECULATIVE 0x0008 typedef struct BulkInsertStateData *BulkInsertState; @@ -37,17 +37,17 @@ typedef struct BulkInsertStateData *BulkInsertState; */ typedef enum LockTupleMode { - /* SELECT FOR KEY SHARE */ - LockTupleKeyShare, - /* SELECT FOR SHARE */ - LockTupleShare, - /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */ - LockTupleNoKeyExclusive, - /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */ - LockTupleExclusive + /* SELECT FOR KEY SHARE */ + LockTupleKeyShare, + /* SELECT FOR SHARE */ + LockTupleShare, + /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */ + LockTupleNoKeyExclusive, + /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */ + LockTupleExclusive } LockTupleMode; -#define MaxLockTupleMode LockTupleExclusive +#define MaxLockTupleMode LockTupleExclusive /* * When heap_update, heap_delete, or heap_lock_tuple fail because the target @@ -67,14 +67,14 @@ typedef enum LockTupleMode */ typedef struct HeapUpdateFailureData { - ItemPointerData ctid; - TransactionId xmax; - CommandId cmax; + ItemPointerData ctid; + TransactionId xmax; + CommandId cmax; } HeapUpdateFailureData; /* ---------------- - * function prototypes for heap access method + * function prototypes for heap access method * * heap_create, heap_create_with_catalog, and heap_drop_with_catalog * are declared in catalog/heap.h @@ -86,13 +86,13 @@ extern Relation relation_open(Oid relationId, LOCKMODE lockmode); extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode); extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode); extern Relation relation_openrv_extended(const RangeVar *relation, - LOCKMODE lockmode, bool missing_ok); + LOCKMODE lockmode, bool missing_ok); extern void relation_close(Relation relation, LOCKMODE lockmode); extern Relation heap_open(Oid relationId, LOCKMODE lockmode); extern Relation heap_openrv(const RangeVar *relation, LOCKMODE lockmode); extern Relation heap_openrv_extended(const RangeVar *relation, - LOCKMODE lockmode, bool missing_ok); + LOCKMODE lockmode, bool missing_ok); #define heap_close(r,l) relation_close(r,l) @@ -102,47 +102,48 @@ typedef struct ParallelHeapScanDescData *ParallelHeapScanDesc; /* * HeapScanIsValid - * True iff the heap scan is valid. 
+ * True iff the heap scan is valid. */ #define HeapScanIsValid(scan) PointerIsValid(scan) extern HeapScanDesc heap_beginscan(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key); + int nkeys, ScanKey key); extern HeapScanDesc heap_beginscan_catalog(Relation relation, int nkeys, - ScanKey key); + ScanKey key); extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key, - bool allow_strat, bool allow_sync); + int nkeys, ScanKey key, + bool allow_strat, bool allow_sync); extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key); + int nkeys, ScanKey key); extern HeapScanDesc heap_beginscan_sampling(Relation relation, - Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode); + Snapshot snapshot, int nkeys, ScanKey key, + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, - BlockNumber endBlk); + BlockNumber endBlk); extern void heapgetpage(HeapScanDesc scan, BlockNumber page); extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_rescan_set_params(HeapScanDesc scan, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode); + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); extern Size heap_parallelscan_estimate(Snapshot snapshot); extern void heap_parallelscan_initialize(ParallelHeapScanDesc target, - Relation relation, Snapshot snapshot); + Relation relation, Snapshot snapshot); +extern void heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan); extern HeapScanDesc heap_beginscan_parallel(Relation, ParallelHeapScanDesc); extern bool heap_fetch(Relation relation, Snapshot snapshot, - HeapTuple tuple, Buffer *userbuf, bool keep_buf, - Relation stats_relation); + HeapTuple tuple, Buffer *userbuf, bool keep_buf, + Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, - Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, - ItemPointer tid); + ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern BulkInsertState GetBulkInsertState(void); @@ -153,33 +154,33 @@ extern void FreeBulkInsertState(BulkInsertState); extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, - int options, BulkInsertState bistate); + int options, BulkInsertState bistate); extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, - CommandId cid, int options, BulkInsertState bistate); + CommandId cid, int options, BulkInsertState bistate); extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd); + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); extern void heap_finish_speculative(Relation relation, HeapTuple tuple); extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result 
heap_update(Relation relation, ItemPointer otid, - HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_update, - Buffer *buffer, HeapUpdateFailureData *hufd); + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_update, + Buffer *buffer, HeapUpdateFailureData *hufd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, - TransactionId cutoff_multi); + TransactionId cutoff_multi); extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, - MultiXactId cutoff_multi, Buffer buf); + MultiXactId cutoff_multi, Buffer buf); extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); -extern Oid simple_heap_insert(Relation relation, HeapTuple tup); +extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, - HeapTuple tup); + HeapTuple tup); extern void heap_sync(Relation relation); extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); @@ -187,12 +188,12 @@ extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, - TransactionId OldestXmin, - bool report_stats, TransactionId *latestRemovedXid); + TransactionId OldestXmin, + bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, - OffsetNumber *redirected, int nredirected, - OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused); + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused); extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ @@ -204,4 +205,4 @@ extern Size SyncScanShmemSize(void); extern void mls_enable_update_rolpassword(void); extern void mls_disable_update_rolpassword(void); #endif -#endif /* HEAPAM_H */ +#endif /* HEAPAM_H */ diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h index ebd06bfb..3614fc6a 100644 --- a/src/include/executor/execParallel.h +++ b/src/include/executor/execParallel.h @@ -1,12 +1,12 @@ /*-------------------------------------------------------------------- * execParallel.h - * POSTGRES parallel execution interface + * POSTGRES parallel execution interface * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/include/executor/execParallel.h + * src/include/executor/execParallel.h *-------------------------------------------------------------------- */ @@ -23,15 +23,15 @@ typedef struct SharedExecutorInstrumentation SharedExecutorInstrumentation; typedef struct ParallelExecutorInfo { - PlanState *planstate; - ParallelContext *pcxt; - BufferUsage *buffer_usage; - SharedExecutorInstrumentation *instrumentation; - shm_mq_handle **tqueue; - dsa_area *area; - bool finished; 
+ PlanState *planstate; + ParallelContext *pcxt; + BufferUsage *buffer_usage; + SharedExecutorInstrumentation *instrumentation; + shm_mq_handle **tqueue; + dsa_area *area; + bool finished; #ifdef __TBASE__ - bool *executor_done; + bool *executor_done; #endif } ParallelExecutorInfo; @@ -48,7 +48,8 @@ extern ParallelExecutorInfo *ExecInitParallelPlan(PlanState *planstate, extern void ExecParallelFinish(ParallelExecutorInfo *pei); extern void ExecParallelCleanup(ParallelExecutorInfo *pei); -extern void ExecParallelReinitialize(ParallelExecutorInfo *pei); +extern void ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei); extern void ParallelQueryMain(dsm_segment *seg, shm_toc *toc); #ifdef __TBASE__ @@ -63,4 +64,4 @@ extern bool ParallelError(void); extern void HandleParallelExecutionError(void); #endif -#endif /* EXECPARALLEL_H */ +#endif /* EXECPARALLEL_H */ diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index ab98a23b..7907ecc3 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -24,6 +24,8 @@ extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt); extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt); +extern void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, ParallelWorkerContext *pwcxt); diff --git a/src/include/executor/nodeCustom.h b/src/include/executor/nodeCustom.h index ef99c01b..d7dcf3b8 100644 --- a/src/include/executor/nodeCustom.h +++ b/src/include/executor/nodeCustom.h @@ -34,6 +34,8 @@ extern void ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt); extern void ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt); +extern void ExecCustomScanReInitializeDSM(CustomScanState *node, + ParallelContext *pcxt); extern void ExecCustomScanInitializeWorker(CustomScanState *node, ParallelWorkerContext *pwcxt); extern void ExecShutdownCustomScan(CustomScanState *node); diff --git a/src/include/executor/nodeForeignscan.h b/src/include/executor/nodeForeignscan.h index 663bdf77..152abf02 100644 --- a/src/include/executor/nodeForeignscan.h +++ b/src/include/executor/nodeForeignscan.h @@ -25,6 +25,8 @@ extern void ExecForeignScanEstimate(ForeignScanState *node, ParallelContext *pcxt); extern void ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt); +extern void ExecForeignScanReInitializeDSM(ForeignScanState *node, + ParallelContext *pcxt); extern void ExecForeignScanInitializeWorker(ForeignScanState *node, ParallelWorkerContext *pwcxt); extern void ExecShutdownForeignScan(ForeignScanState *node); diff --git a/src/include/executor/nodeIndexonlyscan.h b/src/include/executor/nodeIndexonlyscan.h index 8bb3a65c..c5344a8d 100644 --- a/src/include/executor/nodeIndexonlyscan.h +++ b/src/include/executor/nodeIndexonlyscan.h @@ -28,6 +28,8 @@ extern void ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, ParallelContext *pcxt); +extern void ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, ParallelWorkerContext *pwcxt); diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 8cdedb5d..ef0fbe6f 100644 --- 
a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * fdwapi.h - * API for foreign-data wrappers + * API for foreign-data wrappers * * Copyright (c) 2010-2017, PostgreSQL Global Development Group * @@ -25,136 +25,139 @@ struct ExplainState; */ typedef void (*GetForeignRelSize_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid); + RelOptInfo *baserel, + Oid foreigntableid); typedef void (*GetForeignPaths_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid); + RelOptInfo *baserel, + Oid foreigntableid); typedef ForeignScan *(*GetForeignPlan_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid, - ForeignPath *best_path, - List *tlist, - List *scan_clauses, - Plan *outer_plan); + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses, + Plan *outer_plan); typedef void (*BeginForeignScan_function) (ForeignScanState *node, - int eflags); + int eflags); typedef TupleTableSlot *(*IterateForeignScan_function) (ForeignScanState *node); typedef bool (*RecheckForeignScan_function) (ForeignScanState *node, - TupleTableSlot *slot); + TupleTableSlot *slot); typedef void (*ReScanForeignScan_function) (ForeignScanState *node); typedef void (*EndForeignScan_function) (ForeignScanState *node); typedef void (*GetForeignJoinPaths_function) (PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); typedef void (*GetForeignUpperPaths_function) (PlannerInfo *root, - UpperRelationKind stage, - RelOptInfo *input_rel, - RelOptInfo *output_rel); + UpperRelationKind stage, + RelOptInfo *input_rel, + RelOptInfo *output_rel); typedef void (*AddForeignUpdateTargets_function) (Query *parsetree, - RangeTblEntry *target_rte, - Relation target_relation); + RangeTblEntry *target_rte, + Relation target_relation); typedef List *(*PlanForeignModify_function) (PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplan_index); + ModifyTable *plan, + Index resultRelation, + int subplan_index); typedef void (*BeginForeignModify_function) (ModifyTableState *mtstate, - ResultRelInfo *rinfo, - List *fdw_private, - int subplan_index, - int eflags); + ResultRelInfo *rinfo, + List *fdw_private, + int subplan_index, + int eflags); typedef TupleTableSlot *(*ExecForeignInsert_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef TupleTableSlot *(*ExecForeignUpdate_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef TupleTableSlot *(*ExecForeignDelete_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef void (*EndForeignModify_function) (EState *estate, - ResultRelInfo *rinfo); + ResultRelInfo *rinfo); typedef int (*IsForeignRelUpdatable_function) (Relation rel); typedef bool (*PlanDirectModify_function) (PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplan_index); + ModifyTable 
*plan, + Index resultRelation, + int subplan_index); typedef void (*BeginDirectModify_function) (ForeignScanState *node, - int eflags); + int eflags); typedef TupleTableSlot *(*IterateDirectModify_function) (ForeignScanState *node); typedef void (*EndDirectModify_function) (ForeignScanState *node); typedef RowMarkType (*GetForeignRowMarkType_function) (RangeTblEntry *rte, - LockClauseStrength strength); + LockClauseStrength strength); typedef HeapTuple (*RefetchForeignRow_function) (EState *estate, - ExecRowMark *erm, - Datum rowid, - bool *updated); + ExecRowMark *erm, + Datum rowid, + bool *updated); typedef void (*ExplainForeignScan_function) (ForeignScanState *node, - struct ExplainState *es); + struct ExplainState *es); typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate, - ResultRelInfo *rinfo, - List *fdw_private, - int subplan_index, - struct ExplainState *es); + ResultRelInfo *rinfo, + List *fdw_private, + int subplan_index, + struct ExplainState *es); typedef void (*ExplainDirectModify_function) (ForeignScanState *node, - struct ExplainState *es); + struct ExplainState *es); typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); typedef bool (*AnalyzeForeignTable_function) (Relation relation, - AcquireSampleRowsFunc *func, - BlockNumber *totalpages); + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); typedef List *(*ImportForeignSchema_function) (ImportForeignSchemaStmt *stmt, - Oid serverOid); + Oid serverOid); typedef Size (*EstimateDSMForeignScan_function) (ForeignScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); typedef void (*InitializeDSMForeignScan_function) (ForeignScanState *node, - ParallelContext *pcxt, - void *coordinate); + ParallelContext *pcxt, + void *coordinate); +typedef void (*ReInitializeDSMForeignScan_function) (ForeignScanState *node, + ParallelContext *pcxt, + void *coordinate); typedef void (*InitializeWorkerForeignScan_function) (ForeignScanState *node, - shm_toc *toc, - void *coordinate); + shm_toc *toc, + void *coordinate); typedef void (*ShutdownForeignScan_function) (ForeignScanState *node); typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, - RelOptInfo *rel, - RangeTblEntry *rte); + RelOptInfo *rel, + RangeTblEntry *rte); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -168,75 +171,76 @@ typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, */ typedef struct FdwRoutine { - NodeTag type; - - /* Functions for scanning foreign tables */ - GetForeignRelSize_function GetForeignRelSize; - GetForeignPaths_function GetForeignPaths; - GetForeignPlan_function GetForeignPlan; - BeginForeignScan_function BeginForeignScan; - IterateForeignScan_function IterateForeignScan; - ReScanForeignScan_function ReScanForeignScan; - EndForeignScan_function EndForeignScan; - - /* - * Remaining functions are optional. Set the pointer to NULL for any that - * are not provided. 
- */ - - /* Functions for remote-join planning */ - GetForeignJoinPaths_function GetForeignJoinPaths; - - /* Functions for remote upper-relation (post scan/join) planning */ - GetForeignUpperPaths_function GetForeignUpperPaths; - - /* Functions for updating foreign tables */ - AddForeignUpdateTargets_function AddForeignUpdateTargets; - PlanForeignModify_function PlanForeignModify; - BeginForeignModify_function BeginForeignModify; - ExecForeignInsert_function ExecForeignInsert; - ExecForeignUpdate_function ExecForeignUpdate; - ExecForeignDelete_function ExecForeignDelete; - EndForeignModify_function EndForeignModify; - IsForeignRelUpdatable_function IsForeignRelUpdatable; - PlanDirectModify_function PlanDirectModify; - BeginDirectModify_function BeginDirectModify; - IterateDirectModify_function IterateDirectModify; - EndDirectModify_function EndDirectModify; - - /* Functions for SELECT FOR UPDATE/SHARE row locking */ - GetForeignRowMarkType_function GetForeignRowMarkType; - RefetchForeignRow_function RefetchForeignRow; - RecheckForeignScan_function RecheckForeignScan; - - /* Support functions for EXPLAIN */ - ExplainForeignScan_function ExplainForeignScan; - ExplainForeignModify_function ExplainForeignModify; - ExplainDirectModify_function ExplainDirectModify; - - /* Support functions for ANALYZE */ - AnalyzeForeignTable_function AnalyzeForeignTable; - - /* Support functions for IMPORT FOREIGN SCHEMA */ - ImportForeignSchema_function ImportForeignSchema; - - /* Support functions for parallelism under Gather node */ - IsForeignScanParallelSafe_function IsForeignScanParallelSafe; - EstimateDSMForeignScan_function EstimateDSMForeignScan; - InitializeDSMForeignScan_function InitializeDSMForeignScan; - InitializeWorkerForeignScan_function InitializeWorkerForeignScan; - ShutdownForeignScan_function ShutdownForeignScan; + NodeTag type; + + /* Functions for scanning foreign tables */ + GetForeignRelSize_function GetForeignRelSize; + GetForeignPaths_function GetForeignPaths; + GetForeignPlan_function GetForeignPlan; + BeginForeignScan_function BeginForeignScan; + IterateForeignScan_function IterateForeignScan; + ReScanForeignScan_function ReScanForeignScan; + EndForeignScan_function EndForeignScan; + + /* + * Remaining functions are optional. Set the pointer to NULL for any that + * are not provided. 
+ */ + + /* Functions for remote-join planning */ + GetForeignJoinPaths_function GetForeignJoinPaths; + + /* Functions for remote upper-relation (post scan/join) planning */ + GetForeignUpperPaths_function GetForeignUpperPaths; + + /* Functions for updating foreign tables */ + AddForeignUpdateTargets_function AddForeignUpdateTargets; + PlanForeignModify_function PlanForeignModify; + BeginForeignModify_function BeginForeignModify; + ExecForeignInsert_function ExecForeignInsert; + ExecForeignUpdate_function ExecForeignUpdate; + ExecForeignDelete_function ExecForeignDelete; + EndForeignModify_function EndForeignModify; + IsForeignRelUpdatable_function IsForeignRelUpdatable; + PlanDirectModify_function PlanDirectModify; + BeginDirectModify_function BeginDirectModify; + IterateDirectModify_function IterateDirectModify; + EndDirectModify_function EndDirectModify; + + /* Functions for SELECT FOR UPDATE/SHARE row locking */ + GetForeignRowMarkType_function GetForeignRowMarkType; + RefetchForeignRow_function RefetchForeignRow; + RecheckForeignScan_function RecheckForeignScan; + + /* Support functions for EXPLAIN */ + ExplainForeignScan_function ExplainForeignScan; + ExplainForeignModify_function ExplainForeignModify; + ExplainDirectModify_function ExplainDirectModify; + + /* Support functions for ANALYZE */ + AnalyzeForeignTable_function AnalyzeForeignTable; + + /* Support functions for IMPORT FOREIGN SCHEMA */ + ImportForeignSchema_function ImportForeignSchema; + + /* Support functions for parallelism under Gather node */ + IsForeignScanParallelSafe_function IsForeignScanParallelSafe; + EstimateDSMForeignScan_function EstimateDSMForeignScan; + InitializeDSMForeignScan_function InitializeDSMForeignScan; + ReInitializeDSMForeignScan_function ReInitializeDSMForeignScan; + InitializeWorkerForeignScan_function InitializeWorkerForeignScan; + ShutdownForeignScan_function ShutdownForeignScan; } FdwRoutine; /* Functions in foreign/foreign.c */ extern FdwRoutine *GetFdwRoutine(Oid fdwhandler); -extern Oid GetForeignServerIdByRelId(Oid relid); +extern Oid GetForeignServerIdByRelId(Oid relid); extern FdwRoutine *GetFdwRoutineByServerId(Oid serverid); extern FdwRoutine *GetFdwRoutineByRelId(Oid relid); extern FdwRoutine *GetFdwRoutineForRelation(Relation relation, bool makecopy); extern bool IsImportableForeignTable(const char *tablename, - ImportForeignSchemaStmt *stmt); + ImportForeignSchemaStmt *stmt); extern Path *GetExistingLocalJoinPath(RelOptInfo *joinrel); -#endif /* FDWAPI_H */ +#endif /* FDWAPI_H */ diff --git a/src/include/nodes/extensible.h b/src/include/nodes/extensible.h index bc4e07d8..0654e79c 100644 --- a/src/include/nodes/extensible.h +++ b/src/include/nodes/extensible.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * extensible.h - * Definitions for extensible nodes and custom scans + * Definitions for extensible nodes and custom scans * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,7 +21,7 @@ #include "nodes/relation.h" /* maximum length of an extensible node identifier */ -#define EXTNODENAME_MAX_LEN 64 +#define EXTNODENAME_MAX_LEN 64 /* * An extensible node is a new type of node defined by an extension. 
The @@ -31,8 +31,8 @@ */ typedef struct ExtensibleNode { - NodeTag type; - const char *extnodename; /* identifier of ExtensibleNodeMethods */ + NodeTag type; + const char *extnodename; /* identifier of ExtensibleNodeMethods */ } ExtensibleNode; /* @@ -59,27 +59,27 @@ typedef struct ExtensibleNode */ typedef struct ExtensibleNodeMethods { - const char *extnodename; - Size node_size; - void (*nodeCopy) (struct ExtensibleNode *newnode, - const struct ExtensibleNode *oldnode); - bool (*nodeEqual) (const struct ExtensibleNode *a, - const struct ExtensibleNode *b); - void (*nodeOut) (struct StringInfoData *str, - const struct ExtensibleNode *node); - void (*nodeRead) (struct ExtensibleNode *node); + const char *extnodename; + Size node_size; + void (*nodeCopy) (struct ExtensibleNode *newnode, + const struct ExtensibleNode *oldnode); + bool (*nodeEqual) (const struct ExtensibleNode *a, + const struct ExtensibleNode *b); + void (*nodeOut) (struct StringInfoData *str, + const struct ExtensibleNode *node); + void (*nodeRead) (struct ExtensibleNode *node); } ExtensibleNodeMethods; extern void RegisterExtensibleNodeMethods(const ExtensibleNodeMethods *method); extern const ExtensibleNodeMethods *GetExtensibleNodeMethods(const char *name, - bool missing_ok); + bool missing_ok); /* * Flags for custom paths, indicating what capabilities the resulting scan * will have. */ -#define CUSTOMPATH_SUPPORT_BACKWARD_SCAN 0x0001 -#define CUSTOMPATH_SUPPORT_MARK_RESTORE 0x0002 +#define CUSTOMPATH_SUPPORT_BACKWARD_SCAN 0x0001 +#define CUSTOMPATH_SUPPORT_MARK_RESTORE 0x0002 /* * Custom path methods. Mostly, we just need to know how to convert a @@ -87,16 +87,16 @@ extern const ExtensibleNodeMethods *GetExtensibleNodeMethods(const char *name, */ typedef struct CustomPathMethods { - const char *CustomName; + const char *CustomName; - /* Convert Path to a Plan */ - struct Plan *(*PlanCustomPath) (PlannerInfo *root, - RelOptInfo *rel, - struct CustomPath *best_path, - List *tlist, - List *clauses, - List *custom_plans); -} CustomPathMethods; + /* Convert Path to a Plan */ + struct Plan *(*PlanCustomPath) (PlannerInfo *root, + RelOptInfo *rel, + struct CustomPath *best_path, + List *tlist, + List *clauses, + List *custom_plans); +} CustomPathMethods; /* * Custom scan. 
Here again, there's not much to do: we need to be able to @@ -104,10 +104,10 @@ typedef struct CustomPathMethods */ typedef struct CustomScanMethods { - const char *CustomName; + const char *CustomName; - /* Create execution state (CustomScanState) from a CustomScan plan node */ - Node *(*CreateCustomScanState) (CustomScan *cscan); + /* Create execution state (CustomScanState) from a CustomScan plan node */ + Node *(*CreateCustomScanState) (CustomScan *cscan); } CustomScanMethods; /* @@ -116,39 +116,42 @@ typedef struct CustomScanMethods */ typedef struct CustomExecMethods { - const char *CustomName; - - /* Required executor methods */ - void (*BeginCustomScan) (CustomScanState *node, - EState *estate, - int eflags); - TupleTableSlot *(*ExecCustomScan) (CustomScanState *node); - void (*EndCustomScan) (CustomScanState *node); - void (*ReScanCustomScan) (CustomScanState *node); - - /* Optional methods: needed if mark/restore is supported */ - void (*MarkPosCustomScan) (CustomScanState *node); - void (*RestrPosCustomScan) (CustomScanState *node); - - /* Optional methods: needed if parallel execution is supported */ - Size (*EstimateDSMCustomScan) (CustomScanState *node, - ParallelContext *pcxt); - void (*InitializeDSMCustomScan) (CustomScanState *node, - ParallelContext *pcxt, - void *coordinate); - void (*InitializeWorkerCustomScan) (CustomScanState *node, - shm_toc *toc, - void *coordinate); - void (*ShutdownCustomScan) (CustomScanState *node); - - /* Optional: print additional information in EXPLAIN */ - void (*ExplainCustomScan) (CustomScanState *node, - List *ancestors, - ExplainState *es); + const char *CustomName; + + /* Required executor methods */ + void (*BeginCustomScan) (CustomScanState *node, + EState *estate, + int eflags); + TupleTableSlot *(*ExecCustomScan) (CustomScanState *node); + void (*EndCustomScan) (CustomScanState *node); + void (*ReScanCustomScan) (CustomScanState *node); + + /* Optional methods: needed if mark/restore is supported */ + void (*MarkPosCustomScan) (CustomScanState *node); + void (*RestrPosCustomScan) (CustomScanState *node); + + /* Optional methods: needed if parallel execution is supported */ + Size (*EstimateDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt); + void (*InitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + void (*ReInitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + void (*InitializeWorkerCustomScan) (CustomScanState *node, + shm_toc *toc, + void *coordinate); + void (*ShutdownCustomScan) (CustomScanState *node); + + /* Optional: print additional information in EXPLAIN */ + void (*ExplainCustomScan) (CustomScanState *node, + List *ancestors, + ExplainState *es); } CustomExecMethods; extern void RegisterCustomScanMethods(const CustomScanMethods *methods); extern const CustomScanMethods *GetCustomScanMethods(const char *CustomName, - bool missing_ok); + bool missing_ok); -#endif /* EXTENSIBLE_H */ +#endif /* EXTENSIBLE_H */ From 46dd3624a6e495b72aa4175e8816bd7f9720fe90 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 5 Dec 2017 10:55:56 -0800 Subject: [PATCH 138/578] Fix EXPLAIN ANALYZE of hash join when the leader doesn't participate. If a hash join appears in a parallel query, there may be no hash table available for explain.c to inspect even though a hash table may have been built in other processes. 
This could happen either because parallel_leader_participation was set to off or because the leader happened to hit the end of the outer relation immediately (even though the complete relation is not empty) and decided not to build the hash table. Commit bf11e7ee introduced a way for workers to exchange instrumentation via the DSM segment for Sort nodes even though they are not parallel-aware. This commit does the same for Hash nodes, so that explain.c has a way to find instrumentation data from an arbitrary participant that actually built the hash table. Author: Thomas Munro Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAEepm%3D3DUQC2-z252N55eOcZBer6DPdM%3DFzrxH9dZc5vYLsjaA%40mail.gmail.com --- src/backend/commands/explain.c | 60 +++++-- src/backend/executor/execParallel.c | 41 +++-- src/backend/executor/execProcnode.c | 3 + src/backend/executor/nodeHash.c | 104 ++++++++++++ src/include/executor/nodeHash.h | 47 +++--- src/include/nodes/execnodes.h | 26 +++ src/test/regress/sql/join.sql | 241 ++++++++++++++++++++++++++++ 7 files changed, 478 insertions(+), 44 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 4ba4dc81..4ec68401 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -20,7 +20,7 @@ #include "commands/createas.h" #include "commands/defrem.h" #include "commands/prepare.h" -#include "executor/hashjoin.h" +#include "executor/nodeHash.h" #include "foreign/fdwapi.h" #include "nodes/extensible.h" #include "nodes/nodeFuncs.h" @@ -2616,34 +2616,62 @@ show_sort_info(SortState *sortstate, ExplainState *es) static void show_hash_info(HashState *hashstate, ExplainState *es) { - HashJoinTable hashtable; + HashInstrumentation *hinstrument = NULL; - hashtable = hashstate->hashtable; + /* + * In a parallel query, the leader process may or may not have run the + * hash join, and even if it did it may not have built a hash table due to + * timing (if it started late it might have seen no tuples in the outer + * relation and skipped building the hash table). Therefore we have to be + * prepared to get instrumentation data from a worker if there is no hash + * table. + */ + if (hashstate->hashtable) + { + hinstrument = (HashInstrumentation *) + palloc(sizeof(HashInstrumentation)); + ExecHashGetInstrumentation(hinstrument, hashstate->hashtable); + } + else if (hashstate->shared_info) + { + SharedHashInfo *shared_info = hashstate->shared_info; + int i; + + /* Find the first worker that built a hash table. 
*/ + for (i = 0; i < shared_info->num_workers; ++i) + { + if (shared_info->hinstrument[i].nbatch > 0) + { + hinstrument = &shared_info->hinstrument[i]; + break; + } + } + } - if (hashtable) + if (hinstrument) { - long spacePeakKb = (hashtable->spacePeak + 1023) / 1024; + long spacePeakKb = (hinstrument->space_peak + 1023) / 1024; if (es->format != EXPLAIN_FORMAT_TEXT) { - ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es); + ExplainPropertyLong("Hash Buckets", hinstrument->nbuckets, es); ExplainPropertyLong("Original Hash Buckets", - hashtable->nbuckets_original, es); - ExplainPropertyLong("Hash Batches", hashtable->nbatch, es); + hinstrument->nbuckets_original, es); + ExplainPropertyLong("Hash Batches", hinstrument->nbatch, es); ExplainPropertyLong("Original Hash Batches", - hashtable->nbatch_original, es); + hinstrument->nbatch_original, es); ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es); } - else if (hashtable->nbatch_original != hashtable->nbatch || - hashtable->nbuckets_original != hashtable->nbuckets) + else if (hinstrument->nbatch_original != hinstrument->nbatch || + hinstrument->nbuckets_original != hinstrument->nbuckets) { appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", - hashtable->nbuckets, - hashtable->nbuckets_original, - hashtable->nbatch, - hashtable->nbatch_original, + hinstrument->nbuckets, + hinstrument->nbuckets_original, + hinstrument->nbatch, + hinstrument->nbatch_original, spacePeakKb); } else @@ -2651,7 +2679,7 @@ show_hash_info(HashState *hashstate, ExplainState *es) appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, "Buckets: %d Batches: %d Memory Usage: %ldkB\n", - hashtable->nbuckets, hashtable->nbatch, + hinstrument->nbuckets, hinstrument->nbatch, spacePeakKb); } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 5ec13c1a..7aa46865 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -28,6 +28,7 @@ #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" #include "executor/nodeForeignscan.h" +#include "executor/nodeHash.h" #include "executor/nodeIndexscan.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeSeqscan.h" @@ -292,8 +293,12 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, e->pcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashEstimate((HashState *) planstate, e->pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortEstimate((SortState *) planstate, e->pcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) @@ -385,8 +390,12 @@ ExecParallelInitializeDSM(PlanState *planstate, ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, d->pcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeDSM((HashState *) planstate, d->pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeDSM((SortState *) planstate, d->pcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) @@ -938,8 +947,12 @@ ExecParallelReInitializeDSM(PlanState *planstate, ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, pcxt); break; + 
case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashReInitializeDSM((HashState *) planstate, pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortReInitializeDSM((SortState *) planstate, pcxt); break; @@ -994,12 +1007,18 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); - /* - * Perform any node-type-specific work that needs to be done. Currently, - * only Sort nodes need to do anything here. - */ - if (IsA(planstate, SortState)) + /* Perform any node-type-specific work that needs to be done. */ + switch (nodeTag(planstate)) + { + case T_SortState: ExecSortRetrieveInstrumentation((SortState *) planstate); + break; + case T_HashState: + ExecHashRetrieveInstrumentation((HashState *) planstate); + break; + default: + break; + } return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, instrumentation); @@ -1193,8 +1212,12 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) if (planstate->plan->parallel_aware) ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, pwcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeWorker((HashState *) planstate, pwcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeWorker((SortState *) planstate, pwcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index f8f15db2..eb5df5b9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -869,6 +869,9 @@ ExecShutdownNode(PlanState *node) case T_RemoteSubplanState: ExecShutdownRemoteSubplan((RemoteSubplanState *) node); break; + case T_HashState: + ExecShutdownHash((HashState *) node); + break; default: break; } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 3b9684f7..c81eb2fa 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -1822,6 +1822,110 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) } } +/* + * Reserve space in the DSM segment for instrumentation data. + */ +void +ExecHashEstimate(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); + size = add_size(size, offsetof(SharedHashInfo, hinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up a space in the DSM for all workers to record instrumentation data + * about their hash table. + */ +void +ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + size = offsetof(SharedHashInfo, hinstrument) + + pcxt->nworkers * sizeof(HashInstrumentation); + node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, + node->shared_info); +} + +/* + * Reset shared state before beginning a fresh scan. 
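The two allocation sites above size the per-worker array with PostgreSQL's usual flexible-array idiom, using the overflow-checked add_size()/mul_size() helpers on the estimate side. Reduced to its core, the pattern looks like this hypothetical example (PerWorkerStats and the helper name are invented; SharedHashInfo in this patch has the same shape):

typedef struct PerWorkerStats
{
    int     num_workers;
    int     counters[FLEXIBLE_ARRAY_MEMBER];    /* one slot per worker */
} PerWorkerStats;

static Size
per_worker_stats_size(int nworkers)
{
    /* header size plus one fixed-size slot per worker, overflow-checked */
    return add_size(offsetof(PerWorkerStats, counters),
                    mul_size(nworkers, sizeof(int)));
}

Each worker then writes only to its own slot, indexed by ParallelWorkerNumber, so no locking is needed around the instrumentation data.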
+ */ +void +ExecHashReInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + if (node->shared_info != NULL) + { + memset(node->shared_info->hinstrument, 0, + node->shared_info->num_workers * sizeof(HashInstrumentation)); + } +} + +/* + * Locate the DSM space for hash table instrumentation data that we'll write + * to at shutdown time. + */ +void +ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) +{ + SharedHashInfo *shared_info; + + shared_info = (SharedHashInfo *) + shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, true); + node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +} + +/* + * Copy instrumentation data from this worker's hash table (if it built one) + * to DSM memory so the leader can retrieve it. This must be done in an + * ExecShutdownHash() rather than ExecEndHash() because the latter runs after + * we've detached from the DSM segment. + */ +void +ExecShutdownHash(HashState *node) +{ + if (node->hinstrument && node->hashtable) + ExecHashGetInstrumentation(node->hinstrument, node->hashtable); +} + +/* + * Retrieve instrumentation data from workers before the DSM segment is + * detached, so that EXPLAIN can access it. + */ +void +ExecHashRetrieveInstrumentation(HashState *node) +{ + SharedHashInfo *shared_info = node->shared_info; + size_t size; + + /* Replace node->shared_info with a copy in backend-local memory. */ + size = offsetof(SharedHashInfo, hinstrument) + + shared_info->num_workers * sizeof(HashInstrumentation); + node->shared_info = palloc(size); + memcpy(node->shared_info, shared_info, size); +} + +/* + * Copy the instrumentation data from 'hashtable' into a HashInstrumentation + * struct. + */ +void +ExecHashGetInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable) +{ + instrument->nbuckets = hashtable->nbuckets; + instrument->nbuckets_original = hashtable->nbuckets_original; + instrument->nbatch = hashtable->nbatch; + instrument->nbatch_original = hashtable->nbatch_original; + instrument->space_peak = hashtable->spacePeak; +} + /* * Allocate 'size' bytes from the currently active HashMemoryChunk */ diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index f52d3dc8..025f2a33 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * nodeHash.h - * prototypes for nodeHash.c + * prototypes for nodeHash.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -14,6 +14,7 @@ #ifndef NODEHASH_H #define NODEHASH_H +#include "access/parallel.h" #include "nodes/execnodes.h" extern HashState *ExecInitHash(Hash *node, EState *estate, int eflags); @@ -22,37 +23,45 @@ extern void ExecEndHash(HashState *node); extern void ExecReScanHash(HashState *node); extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators, - bool keepNulls); + bool keepNulls); #ifdef __TBASE__ extern HashJoinTable ExecShmHashTableCreate(Hash *node, List *hashOperators, - bool keepNulls); + bool keepNulls); extern Node *MultiExecShmHash(HashState *node); #endif extern void ExecHashTableDestroy(HashJoinTable hashtable); extern void ExecHashTableInsert(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue); + TupleTableSlot *slot, + uint32 hashvalue); extern bool ExecHashGetHashValue(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys, - bool outer_tuple, - bool keep_nulls, - uint32 *hashvalue); + ExprContext *econtext, + List 
*hashkeys, + bool outer_tuple, + bool keep_nulls, + uint32 *hashvalue); extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable, - uint32 hashvalue, - int *bucketno, - int *batchno); + uint32 hashvalue, + int *bucketno, + int *batchno); extern bool ExecScanHashBucket(HashJoinState *hjstate, ExprContext *econtext); extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate); extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate, - ExprContext *econtext); + ExprContext *econtext); extern void ExecHashTableReset(HashJoinTable hashtable); extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable); extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, - int *numbuckets, - int *numbatches, - int *num_skew_mcvs); -extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); + int *numbuckets, + int *numbatches, + int *num_skew_mcvs); +extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); +extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt); +extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt); +extern void ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt); +extern void ExecHashReInitializeDSM(HashState *node, ParallelContext *pcxt); +extern void ExecHashRetrieveInstrumentation(HashState *node); +extern void ExecShutdownHash(HashState *node); +extern void ExecHashGetInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable); -#endif /* NODEHASH_H */ +#endif /* NODEHASH_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cedcf547..20c14341 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2222,6 +2222,29 @@ typedef struct GatherMergeState struct GMReaderTupleBuffer *gm_tuple_buffers; /* tuple buffer per reader */ } GatherMergeState; +/* ---------------- + * Values displayed by EXPLAIN ANALYZE + * ---------------- + */ +typedef struct HashInstrumentation +{ + int nbuckets; /* number of buckets at end of execution */ + int nbuckets_original; /* planned number of buckets */ + int nbatch; /* number of batches at end of execution */ + int nbatch_original; /* planned number of batches */ + size_t space_peak; /* speak memory usage in bytes */ +} HashInstrumentation; + +/* ---------------- + * Shared memory container for per-worker hash information + * ---------------- + */ +typedef struct SharedHashInfo +{ + int num_workers; + HashInstrumentation hinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedHashInfo; + /* ---------------- * HashState information * ---------------- @@ -2232,6 +2255,9 @@ typedef struct HashState HashJoinTable hashtable; /* hash table for the hashjoin */ List *hashkeys; /* list of ExprState nodes */ /* hashkeys is same as parent's hj_InnerHashKeys */ + + SharedHashInfo *shared_info; /* one entry per worker */ + HashInstrumentation *hinstrument; /* this worker's entry */ } HashState; /* ---------------- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 16e8dd0b..31475056 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -2004,6 +2004,247 @@ create index idx_nestloop_suppression1_b on nestloop_suppression1(b); analyze nestloop_suppression1; analyze nestloop_suppression2; analyze nestloop_suppression3; +begin; + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; + +-- Extract bucket and batch counts from an explain analyze plan. 
In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; + +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; + +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; + +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. 
+create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; + +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = 
'128kB'; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- A couple of other hash join tests unrelated to work_mem management. + +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- A full outer join where every record is matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious outer hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- An full outer join where every record is not matched. set enable_hashjoin = false; explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 From a8d69a8f0e3dcab441059ba8d3dc94f9e360baf4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Mar 2021 14:14:23 +0800 Subject: [PATCH 139/578] Fix bug after cherry-pick PG commit 8d4af49f32 --- src/backend/executor/nodeAgg.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 9f5678f5..d36c3df4 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -550,6 +550,20 @@ typedef struct AggStatePerHashData Agg *aggnode; /* original Agg node, for numGroups etc. 
*/
 } AggStatePerHashData;
 
+#ifdef __TBASE__
+/*
+ * Used in ReDistributeInitializeDSM and ReDistributeInitializeWorker
+ * to distinguish keys between shared memory for parallel and
+ * shared memory for redistributed optimization: for parallel it is
+ * plan_node_id, the same as PG; for redistributed optimization we
+ * use plan_node_id + this macro.
+ *
+ * Note: refer to execParallel.c, only the first 4 bits have been occupied
+ * for specific use, so we have to use an extra bit, but that is fine
+ * since plan_node_id is only a 32-bit integer.
+ */
+#define PARALLEL_REDISTRIBUTE_OFFSET UINT64CONST(0xE800000000000000)
+#endif
 
 static void select_current_set(AggState *aggstate, int setno, bool is_hash);
 static void initialize_phase(AggState *aggstate, int newphase);
@@ -5430,7 +5444,7 @@ ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt)
         state->buf[i]->dataType = DT_None;
     }
 
-    shm_toc_insert(pcxt->toc, node->plan->plan_node_id, state);
+    shm_toc_insert(pcxt->toc, node->plan->plan_node_id + PARALLEL_REDISTRIBUTE_OFFSET, state);
 
     *state_ptr = state;
 }
@@ -5444,7 +5458,7 @@ ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt)
     ReDistributeState *rd_state = NULL;
     volatile ParallelWorkerStatus *numParallelWorkers = NULL;
 
-    state = shm_toc_lookup(toc, node->plan->plan_node_id, false);
+    state = shm_toc_lookup(toc, node->plan->plan_node_id + PARALLEL_REDISTRIBUTE_OFFSET, false);
 
     numParallelWorkers = GetParallelWorkerStatusInfo(toc);
     rd_state = (ReDistributeState *)palloc0(sizeof(ReDistributeState));

From d2d1f1f3ca4c75092c4ef6044029dec1092fdd9a Mon Sep 17 00:00:00 2001
From: andrelin
Date: Fri, 5 Mar 2021 15:04:58 +0800
Subject: [PATCH 140/578] Support sort and hash state, and show info of all
 datanodes if "verbose"

cherry-pick explain analyze tests from V3
---
 src/backend/commands/explain.c              |  31 +
 src/backend/commands/explain_dist.c         | 651 +++++++++++++++++---
 src/backend/executor/execParallel.c         | 227 ++++++-
 src/backend/executor/execProcnode.c         |   3 +
 src/backend/executor/nodeGather.c           |   3 +-
 src/backend/executor/nodeGatherMerge.c      |   4 +-
 src/backend/executor/nodeHash.c             |  23 +
 src/backend/executor/nodeHashjoin.c         |   9 +
 src/backend/executor/nodeSort.c             |   3 +
 src/backend/pgxc/pool/execRemote.c          |  56 +-
 src/backend/utils/cache/lsyscache.c         |  25 +
 src/include/commands/explain_dist.h         |  35 +-
 src/include/executor/instrument.h           |  90 +--
 src/include/nodes/execnodes.h               |   4 +
 src/include/pgxc/execRemote.h               |   2 +-
 src/include/utils/lsyscache.h               | 195 +++---
 src/test/regress/expected/join_3.out        | 440 +++++++++++++
 src/test/regress/expected/tbase_explain.out | 362 +++++++++++
 src/test/regress/parallel_schedule          |   6 +
 src/test/regress/sql/tbase_explain.sql      |  37 ++
 20 files changed, 1932 insertions(+), 274 deletions(-)
 create mode 100644 src/test/regress/expected/tbase_explain.out
 create mode 100644 src/test/regress/sql/tbase_explain.sql

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 4ec68401..7b56c3d3 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -47,6 +47,7 @@
 #include "pgxc/execRemote.h"
 #endif
 #ifdef __TBASE__
+#include "commands/explain_dist.h"
 #include "commands/vacuum.h"
 #endif
 
@@ -1492,6 +1493,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			ExplainPropertyFloat("Actual Loops", nloops, 0, es);
 		}
 	}
+	else if (es->analyze && planstate->dn_instrument)
+	{
+		ExplainCommonRemoteInstr(planstate, es);
+	}
 	else if (es->analyze)
 	{
 		if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -2608,6 +2613,32 @@ show_sort_info(SortState
*sortstate, ExplainState *es) if (opened_group) ExplainCloseGroup("Workers", "Workers", false, es); } +#ifdef __TBASE__ + else if (sortstate->instrument.spaceType != -1) + { + /* try our cached distributed instrument */ + /* same logic above */ + const char *sortMethod = tuplesort_method_name(sortstate->instrument.sortMethod); + const char *spaceType = tuplesort_space_type_name(sortstate->instrument.spaceType); + long spaceUsed = sortstate->instrument.spaceUsed; + + /* -1 means invalid value, indicate that this node executed by ourself */ + Assert(sortstate->instrument.sortMethod != -1); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "Sort Method: %s %s: %ldkB\n", + sortMethod, spaceType, spaceUsed); + } + else + { + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyLong("Sort Space Used", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + } + } +#endif } /* diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 41b7c5a1..65d9fed8 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -20,6 +20,8 @@ #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "nodes/nodeFuncs.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" /* Read instrument field */ #define INSTR_READ_FIELD(fldname) \ @@ -34,6 +36,15 @@ do { \ target->fldname = Max(src->fldname, target->fldname); \ } while(0) +/* Tools for max/min */ +#define SET_MIN_MAX(min, max, tmp) \ +do { \ + if (min > tmp) \ + min = tmp; \ + if (max < tmp) \ + max = tmp; \ +} while(0) + /* Serialize state */ typedef struct { @@ -47,16 +58,16 @@ typedef struct * InstrOut * * Serialize Instrumentation structure with the format - * "nodetype-plan_node_id{val,val,...,val}". + * "nodetype-plan_node_id-node_oid{val,val,...,val}". * * NOTE: The function should be modified if the structure of Instrumentation * or its relevant members has been changed. 
*/ static void -InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) +InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr, int current_node_id) { /* nodeTag for varify */ - appendStringInfo(buf, "%hd-%d{", nodeTag(plan), plan->plan_node_id); + appendStringInfo(buf, "%hd-%d-%d{", nodeTag(plan), plan->plan_node_id, current_node_id); /* bool */ /* running should be false after InstrEndLoop */ @@ -111,7 +122,7 @@ InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) appendStringInfo(buf, "%ld,", instr->bufusage.blk_write_time.tv_sec); appendStringInfo(buf, "%ld}", instr->bufusage.blk_write_time.tv_nsec); - elog(DEBUG1, "InstrOut: plan_node_id %d, nloops %.0f", plan->plan_node_id, instr->nloops); + elog(DEBUG1, "InstrOut: plan_node_id %d, node %d, nloops %.0f", plan->plan_node_id, current_node_id, instr->nloops); } /* @@ -174,7 +185,6 @@ SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) ((GatherMergeState *) planstate)->nworkers_launched); } break; -#if 0 case T_Sort: { /* according to RemoteSortState and show_sort_info */ @@ -182,39 +192,113 @@ SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) if (sortstate->sort_Done && sortstate->tuplesortstate) { - Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; - char *sortMethod; - char *spaceType; - long spaceUsed; - - tuplesort_get_stats(state, (const char **) &sortMethod, (const char **) &spaceType, &spaceUsed); - appendStringInfo(buf, "1<%s,%s,%ld>", - sortMethod, spaceType, spaceUsed); + Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + TuplesortInstrumentation stats; + tuplesort_get_stats(state, &stats); + Assert(stats.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); + appendStringInfo(buf, "1<%hd,%hd,%ld>", + stats.sortMethod, stats.spaceType, stats.spaceUsed); + } + else if (sortstate->instrument.sortMethod != -1) + { + Assert(sortstate->instrument.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); + Assert(sortstate->instrument.spaceType != -1); + appendStringInfo(buf, "1<%hd,%hd,%ld>", + sortstate->instrument.sortMethod, + sortstate->instrument.spaceType, + sortstate->instrument.spaceUsed); + } + else + { + appendStringInfo(buf, "0>"); + } + + if (sortstate->shared_info) + { + int n; + appendStringInfo(buf, "%d>", sortstate->shared_info->num_workers); + for (n = 0; n < sortstate->shared_info->num_workers; n++) + { + TuplesortInstrumentation *w_stats; + w_stats = &sortstate->shared_info->sinstrument[n]; + if (w_stats->sortMethod == SORT_TYPE_STILL_IN_PROGRESS) + { + appendStringInfo(buf, "0>"); + } + else + appendStringInfo(buf, "%hd,%hd,%ld>", + w_stats->sortMethod, + w_stats->spaceType, w_stats->spaceUsed); + elog(DEBUG1, "send out parallel sort %d info: %d %d %ld", + planstate->plan->plan_node_id, + w_stats->sortMethod, + w_stats->spaceType, + w_stats->spaceUsed); + } } else appendStringInfo(buf, "0>"); } break; - case T_Hash: { - /* according to RemoteHashState and show_hash_info */ + /* according to show_hash_info */ HashState *hashstate = castNode(HashState, planstate); HashJoinTable hashtable = hashstate->hashtable; + int nbuckets = 0; + int nbuckets_original = 0; + int nbatch = 0; + int nbatch_original = 0; + Size spacePeak = 0; + bool valid = true; + if (hashtable) { - hashtable->nbuckets = 0; + nbuckets = hashtable->nbuckets; + nbuckets_original = hashtable->nbuckets_original; + nbatch = hashtable->nbatch; + nbatch_original = hashtable->nbatch_original; + spacePeak = hashtable->spacePeak; + } + else if (hashstate->shared_info) + { + int n; + for 
(n = 0; n < hashstate->shared_info->num_workers; n++) + { + HashInstrumentation *w_stats = &hashstate->shared_info->hinstrument[n]; + /* Find the first worker that built a hash table. same logic in show_hash_info */ + if (w_stats->nbatch > 0) + { + nbuckets = w_stats->nbuckets; + nbuckets_original = w_stats->nbuckets_original; + nbatch = w_stats->nbatch; + nbatch_original = w_stats->nbatch_original; + spacePeak = w_stats->space_peak; + break; + } + } + } + else + { + Assert(hashstate->hinstrument == NULL); + valid = false; + } + + if (valid) + { + elog(DEBUG1, "send out hash %d peak %zu", planstate->plan->plan_node_id, + spacePeak); appendStringInfo(buf, "1<%d,%d,%d,%d,%ld>", - hashtable->nbuckets, hashtable->nbuckets_original, - hashtable->nbatch, hashtable->nbatch_original, - (hashtable->spacePeak + 1023) / 1024); + nbuckets, nbuckets_original, + nbatch, nbatch_original, + spacePeak); } else appendStringInfo(buf, "0>"); } break; -#endif + default: break; } @@ -238,7 +322,9 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) /* verify nodetype and plan_node_id */ rinstr->nodeTag = strtol(tmp_head, &tmp_pos, 0); tmp_head = tmp_pos + 1; - rinstr->id = (int) strtol(tmp_head, &tmp_pos, 0); + rinstr->key.plan_node_id = (int) strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + rinstr->key.node_id = strtol(tmp_head, &tmp_pos, 0); tmp_head = tmp_pos + 1; /* read values */ @@ -291,7 +377,7 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) INSTR_READ_FIELD(bufusage.blk_write_time.tv_sec); INSTR_READ_FIELD(bufusage.blk_write_time.tv_nsec); - elog(DEBUG1, "InstrIn: plan_node_id %d, nloops %.0f", rinstr->id, instr->nloops); + elog(DEBUG1, "InstrIn: plan_node_id %d, node %d, nloops %.0f", rinstr->key.plan_node_id, rinstr->key.node_id, instr->nloops); /* tmp_head points to next instrument's nodetype or '\0' already */ str->cursor = tmp_head - &str->data[0]; @@ -303,75 +389,67 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) * DeSerialize of specific instrument info of current node. 
*/ static void -SpecInstrIn(StringInfo str, RemoteInstr *rinstr) +SpecInstrIn(StringInfo str, RemoteInstr *instr) { char *tmp_pos; char *tmp_head = &str->data[str->cursor]; - switch(rinstr->nodeTag) + switch(instr->nodeTag) { case T_Gather: case T_GatherMerge: { - rinstr->nworkers_launched = (int) strtod(tmp_head, &tmp_pos); - tmp_head = tmp_pos + 1; + INSTR_READ_FIELD(nworkers_launched); } break; -#if 0 case T_Sort: { - RemoteSortState *instr = (RemoteSortState *)palloc0( - sizeof(RemoteSortState)); /* either stat or w_stat is valid */ - INSTR_READ_FIELD(rs.isvalid); - if (instr->rs.isvalid) + bool isvalid = (bool) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (isvalid) { - INSTR_READ_FIELD(stat.sortMethod); - INSTR_READ_FIELD(stat.spaceType); - INSTR_READ_FIELD(stat.spaceUsed); + INSTR_READ_FIELD(sort_stat.sortMethod); + INSTR_READ_FIELD(sort_stat.spaceType); + INSTR_READ_FIELD(sort_stat.spaceUsed); + Assert(instr->sort_stat.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); } - INSTR_READ_FIELD(rs.num_workers); - if (instr->rs.num_workers > 0) + INSTR_READ_FIELD(nworkers_launched); + if (instr->nworkers_launched > 0) { int n; - Size size; - - size = mul_size(sizeof(TuplesortInstrumentation), - instr->rs.num_workers); - instr->w_stats = (TuplesortInstrumentation *)palloc0(size); + instr->w_sort_stats = (TuplesortInstrumentation *) palloc0(instr->nworkers_launched * sizeof(TuplesortInstrumentation)); - for (n = 0; n < instr->rs.num_workers; n++) + for (n = 0; n < instr->nworkers_launched; n++) { - INSTR_READ_FIELD(w_stats[n].sortMethod); - if (instr->w_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) + INSTR_READ_FIELD(w_sort_stats[n].sortMethod); + if (instr->w_sort_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) { - INSTR_READ_FIELD(w_stats[n].spaceType); - INSTR_READ_FIELD(w_stats[n].spaceUsed); + INSTR_READ_FIELD(w_sort_stats[n].spaceType); + INSTR_READ_FIELD(w_sort_stats[n].spaceUsed); } } } - remote_instr->state = (RemoteState *) instr; } break; - case T_Hash: { - RemoteHashState *instr = (RemoteHashState *)palloc0( - sizeof(RemoteHashState)); - INSTR_READ_FIELD(rs.isvalid); - if (instr->rs.isvalid) + bool isvalid = (bool) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (isvalid) { - INSTR_READ_FIELD(nbuckets); - INSTR_READ_FIELD(nbuckets_original); - INSTR_READ_FIELD(nbatch); - INSTR_READ_FIELD(nbatch_original); - INSTR_READ_FIELD(spacePeakKb); + INSTR_READ_FIELD(hash_stat.nbuckets); + INSTR_READ_FIELD(hash_stat.nbuckets_original); + INSTR_READ_FIELD(hash_stat.nbatch); + INSTR_READ_FIELD(hash_stat.nbatch_original); + INSTR_READ_FIELD(hash_stat.space_peak); } - remote_instr->state = (RemoteState *) instr; } break; -#endif + default: break; } @@ -406,9 +484,33 @@ SerializeLocalInstr(PlanState *planstate, SerializeState *ss) { /* clean up the instrumentation state as in ExplainNode */ InstrEndLoop(planstate->instrument); - InstrOut(&ss->buf, planstate->plan, planstate->instrument); - //WorkerInstrOut(&ss->buf, planstate->worker_instrument); - SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + if (planstate->dn_instrument) + { + /* re-send our received remote instr to upstream. 
*/ + int n; + for (n = 0; n < planstate->dn_instrument->nnode; n++) + { + Instrumentation *instrument = &(planstate->dn_instrument->instrument[n].instr); + int node_id = planstate->dn_instrument->instrument[n].nodeid; + + /* instrument valid only if node_oid set */ + if (node_id != 0) + { + InstrOut(&ss->buf, planstate->plan, instrument, node_id); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } + else + { + elog(DEBUG1, "can't send instr out plan_node_id %d not attached", plan_node_id); + } + } + } + else + { + /* send our own instr */ + InstrOut(&ss->buf, planstate->plan, planstate->instrument, 0); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } } else { @@ -439,6 +541,89 @@ SendLocalInstr(PlanState *planstate) pq_flush(); } +static void +combineSpecRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) +{ + int i; + /* specific instrument */ + switch (rsrc->nodeTag) + { + case T_Gather: + case T_GatherMerge: + { + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + } + break; + case T_Sort: + { + if (rsrc->sort_stat.sortMethod != SORT_TYPE_STILL_IN_PROGRESS && + rsrc->sort_stat.sortMethod != -1) + { + /* TODO: figure out which sortMethod is worse */ + rtarget->sort_stat.sortMethod = rsrc->sort_stat.sortMethod; + if (rtarget->sort_stat.spaceType == rsrc->sort_stat.spaceType) + { + /* same space type, just compare space used */ + rtarget->sort_stat.spaceUsed = Max(rtarget->sort_stat.spaceUsed, rsrc->sort_stat.spaceUsed); + } + else if (rtarget->sort_stat.spaceType > rsrc->sort_stat.spaceType) + { + /* invalid > memory > disk */ + rtarget->sort_stat.spaceType = rsrc->sort_stat.spaceType; + rtarget->sort_stat.spaceUsed = rsrc->sort_stat.spaceUsed; + } + } + + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + if (rtarget->w_sort_stats == NULL) + { + rtarget->w_sort_stats = palloc0(rtarget->nworkers_launched * sizeof(TuplesortInstrumentation)); + for (i = 0; i < rtarget->nworkers_launched; i++) + rtarget->w_sort_stats[i].spaceType = -1; + } + for (i = 0; i < rtarget->nworkers_launched; i++) + { + if (rsrc->w_sort_stats[i].sortMethod == SORT_TYPE_STILL_IN_PROGRESS || + rsrc->w_sort_stats[i].sortMethod == -1) + continue; + + /* same logic above */ + /* TODO: figure out which sortMethod is worse */ + rtarget->w_sort_stats[i].sortMethod = rsrc->w_sort_stats[i].sortMethod; + if (rtarget->w_sort_stats[i].spaceType == rsrc->w_sort_stats[i].spaceType) + { + /* same space type, just compare space used */ + rtarget->w_sort_stats[i].spaceUsed = Max(rtarget->w_sort_stats[i].spaceUsed, rsrc->w_sort_stats[i].spaceUsed); + } + else if (rtarget->w_sort_stats[i].spaceType > rsrc->w_sort_stats[i].spaceType) + { + /* invalid > memory > disk */ + rtarget->w_sort_stats[i].spaceType = rsrc->w_sort_stats[i].spaceType; + rtarget->w_sort_stats[i].spaceUsed = rsrc->w_sort_stats[i].spaceUsed; + } + + elog(DEBUG1, "combine parallel plan %d sort state %d %d %ld", + rtarget->key.plan_node_id, + rtarget->w_sort_stats[i].sortMethod, + rtarget->w_sort_stats[i].spaceType, + rtarget->w_sort_stats[i].spaceUsed); + } + } + break; + case T_Hash: + { + rtarget->hash_stat.nbuckets = Max(rtarget->hash_stat.nbuckets, rsrc->hash_stat.nbuckets); + rtarget->hash_stat.nbuckets_original = Max(rtarget->hash_stat.nbuckets_original, rsrc->hash_stat.nbuckets_original); + rtarget->hash_stat.nbatch = Max(rtarget->hash_stat.nbatch, rsrc->hash_stat.nbatch); + rtarget->hash_stat.nbatch_original = Max(rtarget->hash_stat.nbatch_original, 
rsrc->hash_stat.nbatch_original); + rtarget->hash_stat.space_peak = Max(rtarget->hash_stat.space_peak, rsrc->hash_stat.space_peak); + } + break; + default: + break; + } +} + /* * combineRemoteInstr * @@ -451,9 +636,11 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) Instrumentation *target = &rtarget->instr; Instrumentation *src = &rsrc->instr; - Assert(rtarget->id == rsrc->id); + Assert(rtarget->key.node_id == rsrc->key.node_id); + Assert(rtarget->key.plan_node_id == rsrc->key.plan_node_id); Assert(rtarget->nodeTag == rsrc->nodeTag); + /* regular instrument */ INSTR_MAX_FIELD(need_timer); INSTR_MAX_FIELD(need_bufusage); INSTR_MAX_FIELD(running); @@ -503,7 +690,7 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) INSTR_MAX_FIELD(bufusage.blk_write_time.tv_sec); INSTR_MAX_FIELD(bufusage.blk_write_time.tv_nsec); - rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + combineSpecRemoteInstr(rtarget, rsrc); } /* @@ -512,28 +699,38 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) * Handle remote instrument message and save it by plan_node_id. */ void -HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner) +HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *combiner) { RemoteInstr recv_instr; StringInfo recv_str; bool found; RemoteInstr *cur_instr; + /* must doing this under per query context */ + MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + if (combiner->recv_instr_htbl == NULL) { elog(ERROR, "combiner is not prepared for instrumentation"); } - elog(DEBUG1, "Handle remote instrument: nodeoid %d", nodeoid); + elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); while(recv_str->cursor < recv_str->len) { + memset(&recv_instr, 0, sizeof(RemoteInstr)); + recv_instr.sort_stat.sortMethod = -1; + recv_instr.sort_stat.spaceType = -1; InstrIn(recv_str, &recv_instr); SpecInstrIn(recv_str, &recv_instr); + + if (recv_instr.key.node_id == 0) + recv_instr.key.node_id = nodeid; + cur_instr = (RemoteInstr *) hash_search(combiner->recv_instr_htbl, - (void *) &recv_instr.id, + (void *) &recv_instr.key, HASH_ENTER, &found); if (found) { @@ -541,9 +738,21 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *com } else { + elog(DEBUG1, "remote instr hashtable enter plan_node_id %d node %d", + recv_instr.key.plan_node_id, recv_instr.key.node_id); + memcpy(cur_instr, &recv_instr, sizeof(RemoteInstr)); + if (recv_instr.nodeTag == T_Sort && recv_instr.nworkers_launched > 0) + { + Size size = sizeof(TuplesortInstrumentation) * recv_instr.nworkers_launched; + + cur_instr->w_sort_stats = palloc(size); + memcpy(cur_instr->w_sort_stats, recv_instr.w_sort_stats, size); + } } } + + MemoryContextSwitchTo(oldcontext); } /* @@ -552,23 +761,83 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *com * Attach specific information in planstate. 
*/ static void -attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) +attachRemoteSpecificInstr(PlanState *planstate, RemoteInstr *rinstr) { int nodeTag = nodeTag(planstate->plan); + int nworkers = rinstr->nworkers_launched; switch(nodeTag) { case T_Gather: + { + GatherState *gs = (GatherState *) planstate; + gs->nworkers_launched = nworkers; + } + break; + case T_GatherMerge: + { + GatherMergeState *gms = (GatherMergeState *) planstate; + gms->nworkers_launched = nworkers; + } + break; + case T_Sort: + { + SortState *ss = (SortState *) planstate; + ss->instrument.sortMethod = rinstr->sort_stat.sortMethod; + ss->instrument.spaceType = rinstr->sort_stat.spaceType; + ss->instrument.spaceUsed = rinstr->sort_stat.spaceUsed; + elog(DEBUG1, "attach sort nworkers %d", nworkers); + + if (nworkers > 0) { - GatherState *gs = (GatherState *) planstate; - gs->nworkers_launched = rinstr->nworkers_launched; + int i; + if (ss->shared_info == NULL) + { + Size size = offsetof(SharedSortInfo, sinstrument) + + nworkers * sizeof(TuplesortInstrumentation); + ss->shared_info = palloc0(size); + } + + ss->shared_info->num_workers = nworkers; + for (i = 0; i < nworkers; i++) + { + ss->shared_info->sinstrument[i].sortMethod = rinstr->w_sort_stats[i].sortMethod; + ss->shared_info->sinstrument[i].spaceType = rinstr->w_sort_stats[i].spaceType; + ss->shared_info->sinstrument[i].spaceUsed = rinstr->w_sort_stats[i].spaceUsed; + elog(DEBUG1, "attach parallel sort %d, info: %d %d %ld", + planstate->plan->plan_node_id, + ss->shared_info->sinstrument[i].sortMethod, + ss->shared_info->sinstrument[i].spaceType, + ss->shared_info->sinstrument[i].spaceUsed); + } } + } break; - case T_GatherMerge: + case T_Hash: + { + HashState *hs = (HashState *) planstate; + if (IsParallelWorker()) + { + Assert(hs->hinstrument != NULL); + Assert(hs->shared_info != NULL); + Assert(hs->hashtable == NULL); + /* copy into first instrument */ + memcpy(&hs->shared_info->hinstrument[0], &rinstr->hash_stat, sizeof(HashInstrumentation)); + elog(DEBUG1, "parallel worker attach hash state plan %d peak %zu", + planstate->plan->plan_node_id, hs->hinstrument->space_peak); + } + else { - GatherMergeState *gms = (GatherMergeState *) planstate; - gms->nworkers_launched = rinstr->nworkers_launched; + if (hs->hashtable == NULL) + hs->hashtable = palloc(sizeof(HashJoinTableData)); + + hs->hashtable->nbuckets = rinstr->hash_stat.nbuckets; + hs->hashtable->nbuckets_original = rinstr->hash_stat.nbuckets_original; + hs->hashtable->nbatch = rinstr->hash_stat.nbatch; + hs->hashtable->nbatch_original = rinstr->hash_stat.nbatch_original; + hs->hashtable->spacePeak = rinstr->hash_stat.space_peak; } + } break; default: break; @@ -581,43 +850,81 @@ attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) * Attach instrument information in planstate from saved info in combiner. 
*/ bool -AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) +AttachRemoteInstr(PlanState *planstate, AttachRemoteInstrContext *ctx) { int plan_node_id = planstate->plan->plan_node_id; - if (bms_is_member(plan_node_id, combiner->printed_nodes)) + + if (bms_is_member(plan_node_id, ctx->printed_nodes)) return false; else - combiner->printed_nodes = bms_add_member(combiner->printed_nodes, plan_node_id); + ctx->printed_nodes = bms_add_member(ctx->printed_nodes, plan_node_id); - if (IsA(planstate, RemoteSubplanState) && NULL == planstate->lefttree) + if (IsA(planstate, RemoteSubplanState) && planstate->lefttree == NULL) { - Plan *plan = planstate->plan; - PlanState *remote_ps; - EState *estate = planstate->state; - - remote_ps = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); - planstate->lefttree = remote_ps; + /* subplan could be here, init it's child too */ + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); } if (planstate->instrument) { - bool found; - RemoteInstr *rinstr= (RemoteInstr *) hash_search(combiner->recv_instr_htbl, - (void *) &plan_node_id, - HASH_FIND, &found); - if (!found) + RemoteInstrKey key; + bool found; + RemoteInstr *rinstr; + RemoteInstr rinstr_final; /* for specific instrument */ + bool spec_need_attach = false; + ListCell *lc; + + int n = 0; + int nnode = list_length(ctx->node_idx_List); + + key.plan_node_id = plan_node_id; + memset(&rinstr_final, 0, sizeof(RemoteInstr)); + rinstr_final.sort_stat.sortMethod = -1; + rinstr_final.sort_stat.spaceType = -1; + + /* This is for non-parallel case. If parallel, we init dn_instrument in dsm. */ + if (planstate->dn_instrument == NULL) { - elog(DEBUG1, "AttachRemoteInstr: remote instrumentation not found, tag %d id %d", - nodeTag(planstate->plan), plan_node_id); + Size size = offsetof(DatanodeInstrumentation, instrument) + + mul_size(nnode, sizeof(RemoteInstrumentation)); + Assert(!IsParallelWorker()); + planstate->dn_instrument = palloc0(size); + planstate->dn_instrument->nnode = nnode; } - else + + foreach(lc, ctx->node_idx_List) { - Assert(rinstr->nodeTag == nodeTag(planstate->plan)); - Assert(rinstr->id == plan_node_id); + key.node_id = get_pgxc_node_id(get_nodeoid_from_nodeid(lfirst_int(lc), PGXC_NODE_DATANODE)); + elog(DEBUG1, "attach node %d, plan_node_id %d", key.node_id, key.plan_node_id); + rinstr = (RemoteInstr *) hash_search(ctx->htab, + (void *) &key, + HASH_FIND, &found); - memcpy(planstate->instrument, &rinstr->instr, sizeof(Instrumentation)); - attachRemoteSpecialInstr(planstate, rinstr); + if (found) + { + Assert(rinstr->nodeTag == nodeTag(planstate->plan)); + Assert(rinstr->key.plan_node_id == plan_node_id); + + elog(DEBUG1, "instr attach plan_node_id %d node %d index %d", plan_node_id, key.node_id, n); + planstate->dn_instrument->instrument[n].nodeid = key.node_id; + memcpy(&planstate->dn_instrument->instrument[n].instr, &rinstr->instr, sizeof(Instrumentation)); + /* TODO attach all nodes' remote specific instr */ + rinstr_final.nodeTag = rinstr->nodeTag; + rinstr_final.key = rinstr->key; + combineSpecRemoteInstr(&rinstr_final, rinstr); + spec_need_attach = true; + } + else + { + elog(DEBUG1, "failed to find remote instr of plan_node_id %d node %d", plan_node_id, key.node_id); + } + n++; } + /* TODO attach all nodes' remote specific instr */ + if (spec_need_attach) + attachRemoteSpecificInstr(planstate, &rinstr_final); } else { @@ -626,5 +933,157 @@ AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) 
nodeTag(planstate), plan_node_id); } - return planstate_tree_walker(planstate, AttachRemoteInstr, combiner); + return planstate_tree_walker(planstate, AttachRemoteInstr, ctx); +} + +/* + * ExplainCommonRemoteInstr + * + * Explain remote instruments for common info of current node. + */ +void +ExplainCommonRemoteInstr(PlanState *planstate, ExplainState *es) +{ + int i; + int nnode = planstate->dn_instrument->nnode; + + RemoteInstrumentation *rinstr = planstate->dn_instrument->instrument; + /* for min/max display */ + double nloops_min, nloops_max, nloops; + double startup_sec_min, startup_sec_max, startup_sec; + double total_sec_min, total_sec_max, total_sec; + double rows_min, rows_max, rows; + /* for verbose */ + StringInfoData buf; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoChar(es->str, '\n'); + appendStringInfoSpaces(es->str, es->indent * 2); + } + + /* give min max a startup value */ + for (i = 0; i < nnode; i++) + { + Instrumentation *instr = &rinstr[i].instr; + if (instr->nloops != 0) + { + nloops_min = nloops_max = instr->nloops; + startup_sec_min = startup_sec_max = 1000.0 * instr->startup / nloops_min; + total_sec_min = total_sec_max = 1000.0 * instr->total / nloops_min; + rows_min = rows_max = instr->ntuples / nloops_min; + break; + } + } + if (i == nnode) + { + appendStringInfo(es->str, "DN (never executed)"); + return; + } + + if (es->verbose) + initStringInfo(&buf); + + for (i = 0; i < nnode; i++) + { + Instrumentation *instr = &rinstr[i].instr; + int node_id = rinstr[i].nodeid; + char *dnname; + + if (node_id == 0) + continue; + + dnname = get_pgxc_nodename_from_identifier(node_id); + nloops = instr->nloops; + startup_sec = 1000.0 * instr->startup / nloops; + total_sec = 1000.0 * instr->total / nloops; + rows = instr->ntuples / nloops; + + SET_MIN_MAX(nloops_min, nloops_max, nloops); + SET_MIN_MAX(startup_sec_min, startup_sec_max, startup_sec); + SET_MIN_MAX(total_sec_min, total_sec_max, total_sec); + SET_MIN_MAX(rows_min, rows_max, rows); + + /* one line for each dn if verbose */ + if (es->verbose) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoChar(&buf, '\n'); + appendStringInfoSpaces(&buf, es->indent * 2); + if (nloops <= 0) + { + appendStringInfo(&buf, "- %s (never executed)", dnname); + } + else + { + if (es->timing) + appendStringInfo(&buf, + "- %s (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + dnname, startup_sec, total_sec, rows, nloops); + else + appendStringInfo(&buf, + "- %s (actual rows=%.0f loops=%.0f)", + dnname, rows, nloops); + } + } + else + { + ExplainPropertyText("Data Node", dnname, es); + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", rows, 0, es); + ExplainPropertyFloat("Actual Loops", nloops, 0, es); + } + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + bool show_verbose = true; + + if (nloops_max <= 0) + { + show_verbose = false; + appendStringInfo(es->str, "DN (never executed)"); + } + else + { + if (es->timing) + appendStringInfo(es->str, + "DN (actual startup time=%.3f..%.3f total time=%.3f..%.3f rows=%.0f..%.0f loops=%.0f..%.0f)", + startup_sec_min, startup_sec_max, + total_sec_min, total_sec_max, rows_min, rows_max, + nloops_min, nloops_max); + else + appendStringInfo(es->str, + "DN (actual rows=%.0f..%.0f loops=%.0f..%.0f)", + rows_min, rows_max, nloops_min, nloops_max); + } + + if (es->verbose) + { + if (show_verbose) + appendStringInfo(es->str, 
"%s", buf.data); + pfree(buf.data); + } + } + else + { + ExplainPropertyText("Data Node", "ALL", es); + if (es->timing) + { + ExplainPropertyFloat("Actual Min Startup Time", startup_sec_min, 3, es); + ExplainPropertyFloat("Actual Max Startup Time", startup_sec_max, 3, es); + ExplainPropertyFloat("Actual Min Total Time", total_sec_min, 3, es); + ExplainPropertyFloat("Actual Max Total Time", total_sec_max, 3, es); + } + ExplainPropertyFloat("Actual Min Rows", rows_min, 0, es); + ExplainPropertyFloat("Actual Max Rows", rows_max, 0, es); + ExplainPropertyFloat("Actual Min Loops", nloops_min, 0, es); + ExplainPropertyFloat("Actual Max Loops", nloops_max, 0, es); + } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 7aa46865..db2cdf60 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -82,6 +82,7 @@ #define PARALLEL_KEY_EXEC_ERROR UINT64CONST(0xE0000000000000B1) #define PARALLEL_KEY_EXEC_DONE UINT64CONST(0xE0000000000000B2) +#define PARALLEL_REMOTEINSTR_OFFSET UINT64CONST(0xEC00000000000000) #endif #define PARALLEL_TUPLE_QUEUE_SIZE 65536 @@ -137,6 +138,15 @@ typedef struct ExecParallelInitializeDSMContext int nnodes; } ExecParallelInitializeDSMContext; +#ifdef __TBASE__ +/* Context object for ExecParallelInitializeRemoteInstr. */ +typedef struct ExecParallelRemoteInstrContext +{ + ParallelContext *pcxt; + int ndatanode; +} ExecParallelRemoteInstrContext; +#endif + /* Helper functions that run in the parallel leader. */ static char *ExecSerializePlan(Plan *plan, EState *estate); static bool ExecParallelEstimate(PlanState *node, @@ -149,7 +159,13 @@ static bool ExecParallelReInitializeDSM(PlanState *planstate, ParallelContext *pcxt); static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation); - +#ifdef __TBASE__ +static bool ExecParallelEstimateRemoteInstr(PlanState *planstate, + ExecParallelRemoteInstrContext *ri); +static bool ExecParallelInitRemoteInstrDSM(PlanState *planstate, + ExecParallelRemoteInstrContext *ri); +static bool ExecInitializeWorkerRemoteInstr(PlanState *planstate, ParallelWorkerContext *pcxt); +#endif /* Helper function that runs in the parallel worker. */ static DestReceiver *ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc); @@ -241,26 +257,16 @@ ExecSerializePlan(Plan *plan, EState *estate) */ static bool ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) -{// #lizard forgives +{ +#ifdef __TBASE__ + int previous_nworkers; +#endif if (planstate == NULL) return false; /* Count this node. */ e->nnodes++; - /* - * if we are running with instrument option, must init - * full plantree here, to ensure e->nnodes correct. - */ - if (planstate->instrument && - IsA(planstate, RemoteSubplanState) && - NULL == planstate->lefttree) - { - planstate->lefttree = ExecInitNode(planstate->plan->lefttree, - planstate->state, - EXEC_FLAG_EXPLAIN_ONLY); - } - switch (nodeTag(planstate)) { case T_SeqScanState: @@ -306,7 +312,27 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) break; /* For remote query and remote subplan, there is no need for shared storage. */ case T_RemoteQueryState: + break; case T_RemoteSubplanState: + /* + * If we are running with instrument option, must init full plantree here, + * to ensure e->nnodes correct. Further, we estimate per node instrument + * for remote instrumentation. 
+ */ + if (planstate->instrument && NULL == planstate->lefttree) + { + ExecParallelRemoteInstrContext ri; + RemoteSubplanState *node = (RemoteSubplanState *) planstate; + + ri.ndatanode = list_length(((RemoteSubplan *)planstate->plan)->nodeList); + ri.pcxt = e->pcxt; + + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); + planstate_tree_walker(planstate, ExecParallelEstimateRemoteInstr, &ri); + node->combiner.remote_parallel_estimated = true; + } break; case T_HashJoinState: if (planstate->plan->parallel_aware) @@ -322,12 +348,24 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ReDistributeEstimate(planstate, e->pcxt); } break; + case T_GatherState: + previous_nworkers = e->pcxt->nworkers; + e->pcxt->nworkers = ((Gather *) planstate->plan)->num_workers; #endif default: break; } +#ifdef __TBASE__ + planstate_tree_walker(planstate, ExecParallelEstimate, e); + + if (IsA(planstate, GatherState)) + e->pcxt->nworkers = previous_nworkers; + + return false; +#else return planstate_tree_walker(planstate, ExecParallelEstimate, e); +#endif } /* @@ -337,7 +375,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) static bool ExecParallelInitializeDSM(PlanState *planstate, ExecParallelInitializeDSMContext *d) -{// #lizard forgives +{ +#ifdef __TBASE__ + int previous_nworkers; +#endif if (planstate == NULL) return false; @@ -407,9 +448,21 @@ ExecParallelInitializeDSM(PlanState *planstate, d->pcxt); break; case T_RemoteSubplanState: + { + RemoteSubplanState *node = (RemoteSubplanState *) planstate; + if (node->combiner.remote_parallel_estimated) + { + ExecParallelRemoteInstrContext ri; + + ri.ndatanode = list_length(((RemoteSubplan *)planstate->plan)->nodeList); + ri.pcxt = d->pcxt; + + planstate_tree_walker(planstate, ExecParallelInitRemoteInstrDSM, &ri); + } if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitializeDSM((RemoteSubplanState *)planstate, d->pcxt); + } break; case T_HashJoinState: if (planstate->plan->parallel_aware) @@ -425,12 +478,24 @@ ExecParallelInitializeDSM(PlanState *planstate, ReDistributeInitializeDSM(planstate, d->pcxt); } break; + case T_GatherState: + previous_nworkers = d->pcxt->nworkers; + d->pcxt->nworkers = ((Gather *) planstate->plan)->num_workers; #endif default: break; } +#ifdef __TBASE__ + planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); + + if (IsA(planstate, GatherState)) + d->pcxt->nworkers = previous_nworkers; + + return false; +#else return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); +#endif } /* @@ -1002,7 +1067,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); planstate->worker_instrument = palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); +#ifndef __TBASE__ MemoryContextSwitchTo(oldcontext); +#endif planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); @@ -1019,6 +1086,26 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, default: break; } +#ifdef __TBASE__ + /* also retrieve instrumentation from remote */ + if (planstate->dn_instrument != NULL) + { + DatanodeInstrumentation *tmp_instrument = planstate->dn_instrument; + int nnode = planstate->dn_instrument->nnode; + Size size = offsetof(DatanodeInstrumentation, instrument) + + mul_size(nnode, sizeof(RemoteInstrumentation)); + + elog(DEBUG1, "retrieve downstream 
instrumentation, plan_node_id %d nnode %d", plan_node_id, nnode);
+
+        planstate->dn_instrument = palloc0(size);
+        memcpy(planstate->dn_instrument, tmp_instrument, size);
+    }
+    /*
+     * TBase switches the memory context later to keep the retrieved
+     * instrumentation alive until it is sent back to the upstream node.
+     */
+    MemoryContextSwitchTo(oldcontext);
+#endif
 
 	return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
 								 instrumentation);
@@ -1228,6 +1315,15 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, pwcxt);
 			break;
 		case T_RemoteSubplanState:
+			if (planstate->instrument && NULL == planstate->lefttree)
+			{
+				/* if instrumentation is needed, init the full plan tree in the worker */
+				planstate->lefttree = ExecInitNode(planstate->plan->lefttree,
+				                                   planstate->state,
+				                                   EXEC_FLAG_EXPLAIN_ONLY);
+				/* attach shared memory for its child */
+				planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr, pwcxt);
+			}
 			if (planstate->plan->parallel_aware)
 				ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt);
 			break;
@@ -1253,6 +1349,105 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 								 pwcxt);
 }
 
+#ifdef __TBASE__
+/*
+ * Estimate shared memory space for plan nodes executed remotely; they contain
+ * instruments from all datanodes involved, and only the leader worker receives
+ * these instruments.
+ */
+static bool
+ExecParallelEstimateRemoteInstr(PlanState *node, ExecParallelRemoteInstrContext *ri)
+{
+    ParallelContext *pcxt = ri->pcxt;
+    Size size = mul_size(ri->ndatanode, sizeof(RemoteInstrumentation));
+    size = add_size(size, offsetof(DatanodeInstrumentation, instrument));
+
+    if (node == NULL)
+        return false;
+
+    /*
+     * Only remote plan nodes can reach here; we need to disable parallelism
+     * for these nodes to prevent them from initializing other shared memory
+     * for execution. They don't need that, only the shared memory for
+     * instrument collection.
+     */
+    node->plan->parallel_aware = false;
+
+    shm_toc_estimate_chunk(&pcxt->estimator, size);
+    shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+    /* for sub-plan */
+    if (IsA(node, RemoteSubplanState) && node->lefttree == NULL)
+    {
+        node->lefttree = ExecInitNode(node->plan->lefttree,
+                                      node->state,
+                                      EXEC_FLAG_EXPLAIN_ONLY);
+    }
+
+    elog(DEBUG1, "parallel estimate shm remote instrument for plan node %d", node->plan->plan_node_id);
+
+    return planstate_tree_walker(node, ExecParallelEstimateRemoteInstr,
+                                 ri);
+}
+
+/*
+ * Allocate shared memory space for plan nodes executed remotely; they contain
+ * instruments from all datanodes involved, and only the leader worker receives
+ * these instruments.  Use plan_node_id + offset as a unique key.
+ */
+static bool
+ExecParallelInitRemoteInstrDSM(PlanState *node, ExecParallelRemoteInstrContext *ri)
+{
+    ParallelContext *pcxt = ri->pcxt;
+    Size size = mul_size(ri->ndatanode, sizeof(RemoteInstrumentation));
+    size = add_size(size, offsetof(DatanodeInstrumentation, instrument));
+
+    if (node == NULL)
+        return false;
+
+    node->dn_instrument = shm_toc_allocate(pcxt->toc, size);
+    memset(node->dn_instrument, 0, size);
+    node->dn_instrument->nnode = ri->ndatanode;
+    shm_toc_insert(pcxt->toc, node->plan->plan_node_id + PARALLEL_REMOTEINSTR_OFFSET,
+                   node->dn_instrument);
+
+    elog(DEBUG1, "parallel allocate shm remote instrument for plan node %d", node->plan->plan_node_id);
+
+    return planstate_tree_walker(node, ExecParallelInitRemoteInstrDSM,
+                                 ri);
+}
+
+/*
+ * Fetch the shared memory for plan nodes executed remotely; it will be filled
+ * with instruments during the RemoteSubplan node's execution.  Use
+ * plan_node_id + offset as the unique key.
+ */
+static bool
+ExecInitializeWorkerRemoteInstr(PlanState *planstate, ParallelWorkerContext *pwcxt)
+{
+    /*
+     * Only remote plan nodes can reach here; we need to disable parallelism
+     * for these nodes to prevent them from initializing other shared memory
+     * for execution. They don't need that, only the shared memory for
+     * instrument collection.
+     */
+    planstate->plan->parallel_aware = false;
+    planstate->dn_instrument = shm_toc_lookup(pwcxt->toc,
+                                              planstate->plan->plan_node_id + PARALLEL_REMOTEINSTR_OFFSET,
+                                              false);
+
+    /* for sub-plan */
+    if (IsA(planstate, RemoteSubplanState) && planstate->lefttree == NULL)
+    {
+        planstate->lefttree = ExecInitNode(planstate->plan->lefttree,
+                                           planstate->state,
+                                           EXEC_FLAG_EXPLAIN_ONLY);
+    }
+
+    elog(DEBUG1, "parallel init worker remote instrument for plan node %d", planstate->plan->plan_node_id);
+
+    return planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr,
+                                 pwcxt);
+}
+#endif
+
 /*
  * Main entrypoint for parallel query worker processes.
  *
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index eb5df5b9..cec4400d 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -429,6 +429,9 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 	/* Set up instrumentation for this node if requested */
 	if (estate->es_instrument)
 		result->instrument = InstrAlloc(1, estate->es_instrument);
+#ifdef __TBASE__
+	result->dn_instrument = NULL;
+#endif
 
 	return result;
 }
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 9c63e4eb..55686429 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -543,9 +543,10 @@ ExecReScanGather(GatherState *node)
 	 * to propagate any error or other information to master backend before
 	 * dying.  Parallel context will be reused for rescan.
 	 */
+#if 0 /* pg latest code, disabled for now */
 	Gather	   *gather = (Gather *) node->ps.plan;
 	PlanState  *outerPlan = outerPlanState(node);
-
+#endif
 	/* Make sure any existing workers are gracefully shut down */
 	ExecShutdownGatherWorkers(node);
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
index 291cf644..6f94db2b 100644
--- a/src/backend/executor/nodeGatherMerge.c
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -405,8 +405,10 @@ ExecReScanGatherMerge(GatherMergeState *node)
 	 * to propagate any error or other information to master backend before
 	 * dying.  Parallel context will be reused for rescan.
*/ +#if 0 /* postgresql latest code */ GatherMerge *gm = (GatherMerge *) node->ps.plan; PlanState *outerPlan = outerPlanState(node); +#endif /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherMergeWorkers(node); @@ -418,7 +420,7 @@ ExecReScanGatherMerge(GatherMergeState *node) ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); -#if 0 +#if 0 /* postgresql latest code */ ======= /* * Set child node's chgParam to tell it that the next scan might deliver a diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index c81eb2fa..1c4d148d 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -184,6 +184,9 @@ ExecInitHash(Hash *node, EState *estate, int eflags) hashstate->ps.ExecProcNode = ExecHash; hashstate->hashtable = NULL; hashstate->hashkeys = NIL; /* will be set by parent HashJoin */ +#ifdef __TBASE__ + hashstate->shared_info = NULL; +#endif /* * Miscellaneous initialization @@ -1830,6 +1833,10 @@ ExecHashEstimate(HashState *node, ParallelContext *pcxt) { size_t size; + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); size = add_size(size, offsetof(SharedHashInfo, hinstrument)); shm_toc_estimate_chunk(&pcxt->estimator, size); @@ -1845,6 +1852,10 @@ ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) { size_t size; + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + size = offsetof(SharedHashInfo, hinstrument) + pcxt->nworkers * sizeof(HashInstrumentation); node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); @@ -1876,9 +1887,17 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) { SharedHashInfo *shared_info; + /* don't need this if not instrumenting */ + if (!node->ps.instrument) + return; + shared_info = (SharedHashInfo *) shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, true); node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +#ifdef __TBASE__ + /* set node->shared_info for distributed instrument */ + node->shared_info = shared_info; +#endif } /* @@ -1890,6 +1909,7 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) void ExecShutdownHash(HashState *node) { + /* Now accumulate data for the current (final) hash table */ if (node->hinstrument && node->hashtable) ExecHashGetInstrumentation(node->hinstrument, node->hashtable); } @@ -1904,6 +1924,9 @@ ExecHashRetrieveInstrumentation(HashState *node) SharedHashInfo *shared_info = node->shared_info; size_t size; + if (shared_info == NULL) + return; + /* Replace node->shared_info with a copy in backend-local memory. 
*/ size = offsetof(SharedHashInfo, hinstrument) + shared_info->num_workers * sizeof(HashInstrumentation); diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index c6446cad..5aab09e0 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -287,6 +287,8 @@ ExecHashJoin(PlanState *pstate) node->hj_HashOperators, HJ_FILL_INNER(node)); node->hj_HashTable = hashtable; + /* copy into hashNode too, for instrumentation */ + hashNode->hashtable = hashtable; parallelState->statusParallelWorker[ParallelWorkerNumber] = ParallelHashJoin_MergeShmHashTableDone; } else @@ -1651,6 +1653,7 @@ ExecMergeShmHashTable(HashJoinState * hjstate, volatile ParallelHashJoinState *p } ht->totalTuples = ht->totalTuples + mergeHashtable->totalTuples; + ht->spacePeak = Max(ht->spacePeak, mergeHashtable->spacePeak); /* merge hashtable */ for(indexbucket = 0; indexbucket < ht->nbuckets; indexbucket++) @@ -1803,6 +1806,12 @@ ExecMergeShmHashTable(HashJoinState * hjstate, volatile ParallelHashJoinState *p hashtable->totalTuples = ht->totalTuples; hashtable->skewEnabled = false; hashtable->growEnabled = false; + /* copy instrumentation too */ + hashtable->nbuckets = ht->nbuckets; + hashtable->nbuckets_original = ht->nbuckets_original; + hashtable->nbatch = ht->nbatch; + hashtable->nbatch_original = ht->nbatch_original; + hashtable->spacePeak = ht->spacePeak; } #if 0 diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 2dd4bf89..40ae3ac0 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -299,6 +299,9 @@ ExecInitSort(Sort *node, EState *estate, int eflags) sortstate->state = NULL; sortstate->file = NULL; sortstate->dataslot = NULL; + sortstate->instrument.sortMethod = -1; + sortstate->instrument.spaceType = -1; + sortstate->instrument.spaceUsed = 0; #endif /* diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e3be03b8..40f3f655 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -298,7 +298,6 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->recv_datarows = 0; combiner->prerowBuffers = NULL; combiner->is_abort = false; - combiner->printed_nodes = NULL; combiner->recv_instr_htbl = NULL; #endif } @@ -1107,11 +1106,6 @@ CloseCombiner(ResponseCombiner *combiner) hash_destroy(combiner->recv_instr_htbl); combiner->recv_instr_htbl = NULL; } - if (combiner->printed_nodes) - { - bms_free(combiner->printed_nodes); - combiner->printed_nodes = NULL; - } #endif } @@ -1900,7 +1894,15 @@ FetchTuple(ResponseCombiner *combiner) * Case if we run local subplan. * We do not have remote connections, so just get local tuple and return it */ - if (outerPlanState(combiner)) + if (outerPlanState(combiner) +#ifdef __TBASE__ + /* + * if dn_instrument is not null, means this node is initialized for recv + * instrument from remote, not execute it locally too. + */ + && ((outerPlanState(combiner))->dn_instrument == NULL) +#endif + ) { RemoteSubplanState *planstate = (RemoteSubplanState *) combiner; RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; @@ -2686,10 +2688,6 @@ FetchTuple(ResponseCombiner *combiner) { /* Do nothing. It must have been handled in handle_response() */ } - else if (res == RESPONSE_INSTR) - { - /* Do nothing. It must have been handled in handle_response() */ - } else { // Can not get here? 
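For readers of this patch, a minimal illustrative sketch (not part of the change itself) of how an Instrumentation received from a datanode could be filed under the (plan_node_id, node_id) key used by the recv_instr_htbl hash table that ExecInitRemoteSubplan creates below with HASH_ELEM | HASH_BLOBS. The helper name record_remote_instr is hypothetical; the actual receive-and-merge logic lives in HandleRemoteInstr in explain_dist.c.

/*
 * Hypothetical helper, shown only to illustrate the (plan_node_id, node_id)
 * keying scheme of recv_instr_htbl.  Assumes utils/hsearch.h,
 * executor/instrument.h and commands/explain_dist.h are included.
 */
static void
record_remote_instr(HTAB *htab, int plan_node_id, int node_id,
					Instrumentation *recv)
{
	RemoteInstrKey	key;
	RemoteInstr	   *entry;
	bool			found;

	/* HASH_BLOBS hashes the raw key bytes, so clear any padding first */
	memset(&key, 0, sizeof(key));
	key.plan_node_id = plan_node_id;
	key.node_id = node_id;

	/* find or create the per-(plan node, datanode) entry */
	entry = (RemoteInstr *) hash_search(htab, &key, HASH_ENTER, &found);
	if (!found)
	{
		entry->nodeTag = 0;
		memset(&entry->instr, 0, sizeof(entry->instr));
	}

	/* merge the received counters into the per-node entry */
	InstrAggNode(&entry->instr, recv);
}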
@@ -3328,8 +3326,9 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) #ifdef __TBASE__ case 'i': /* Remote Instrument */ if (msg_len > 0) - HandleRemoteInstr(msg, msg_len, conn->nodeoid, combiner); - return RESPONSE_INSTR; + HandleRemoteInstr(msg, msg_len, conn->nodeid, combiner); + /* just break to return EOF. */ + break; #endif default: /* sync lost? */ @@ -9907,17 +9906,19 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) combiner->ss.ps.plan = (Plan *) node; combiner->ss.ps.state = estate; combiner->ss.ps.ExecProcNode = ExecRemoteSubplan; - +#ifdef __TBASE__ if (estate->es_instrument) { HASHCTL ctl; - ctl.keysize = sizeof(int); + ctl.keysize = sizeof(RemoteInstrKey); ctl.entrysize = sizeof(RemoteInstr); - combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + combiner->recv_instr_htbl = hash_create("Remote Instrument", 8 * NumDataNodes, + &ctl, HASH_ELEM | HASH_BLOBS); } - + combiner->remote_parallel_estimated = false; +#endif combiner->ss.ps.qual = NULL; combiner->request_type = REQUEST_TYPE_QUERY; @@ -10718,6 +10719,9 @@ ExecRemoteSubplan(PlanState *pstate) int count = 0; #endif #ifdef __TBASE__ + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return NULL; + if (!node->local_exec && (!node->finish_init) && (!(node->eflags & EXEC_FLAG_SUBPLAN))) { if(node->execNodes) @@ -11149,12 +11153,25 @@ ExecShutdownRemoteSubplan(RemoteSubplanState *node) Plan *plan = ps->plan; EState *estate = ps->state; + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return; + + elog(DEBUG1, "shutdown remote subplan worker %d, plan_node_id %d", ParallelWorkerNumber, plan->plan_node_id); + if (estate->es_instrument) { + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + AttachRemoteInstrContext ctx; + if (!ps->lefttree) ps->lefttree = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); - AttachRemoteInstr(ps->lefttree, combiner); + ctx.htab = combiner->recv_instr_htbl; + ctx.node_idx_List = ((RemoteSubplan *) plan)->nodeList; + ctx.printed_nodes = NULL; + AttachRemoteInstr(ps->lefttree, &ctx); + + MemoryContextSwitchTo(oldcontext); } } @@ -11168,6 +11185,9 @@ ExecFinishRemoteSubplan(RemoteSubplanState *node) int *dn_list = NULL; char cursor[NAMEDATALEN]; + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return; + if (!node->bound) { if (g_DataPumpDebug) diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 46e81611..5f55c35d 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -2640,6 +2640,31 @@ is_pgxc_nodeprimary(Oid nodeid) return result; } +#ifdef __TBASE__ +/* + * get_pgxc_nodename + * Get node name for given identifier + */ +char * +get_pgxc_nodename_from_identifier(int id) +{ + HeapTuple tuple; + Form_pgxc_node nodeForm; + char *result; + + tuple = SearchSysCache1(PGXCNODEIDENTIFIER, Int32GetDatum(id)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for identifier %d", id); + + nodeForm = (Form_pgxc_node) GETSTRUCT(tuple); + result = pstrdup(NameStr(nodeForm->node_name)); + ReleaseSysCache(tuple); + + return result; +} +#endif + /* * get_pgxc_groupoid * Obtain PGXC Group Oid for given group name diff --git a/src/include/commands/explain_dist.h b/src/include/commands/explain_dist.h index fe682bda..0c49bc66 100644 --- a/src/include/commands/explain_dist.h +++ b/src/include/commands/explain_dist.h @@ -16,21 +16,42 @@ #include "commands/explain.h" #include "pgxc/execRemote.h" +/* Key of 
hash table entry */ +typedef struct RemoteInstrKey +{ + int plan_node_id; /* unique id of current plan node */ + int node_id; /* node id */ +} RemoteInstrKey; + /* Hash table entry */ -typedef struct +typedef struct RemoteInstr { - int id; /* unique id of current plan node */ + RemoteInstrKey key; + int nodeTag; /* type of current plan node */ Instrumentation instr; /* instrument of current plan node */ - /* for Gather */ - int nworkers_launched; /* worker num of gather */ + /* for Gather and Sort */ + int nworkers_launched; /* worker num of gather or sort */ - /* for Hash: */ + /* for Sort */ + TuplesortInstrumentation sort_stat; /* instrument if no parallel */ + TuplesortInstrumentation *w_sort_stats; /* instrument of parallel workers */ + + /* for Hash */ + HashInstrumentation hash_stat; } RemoteInstr; +typedef struct AttachRemoteInstrContext +{ + List *node_idx_List; /* list of node index in dn_handles */ + HTAB *htab; /* htab from combiner, stored remote instr */ + Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ +} AttachRemoteInstrContext; + extern void SendLocalInstr(PlanState *planstate); -extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner); -extern bool AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner); +extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *combiner); +extern bool AttachRemoteInstr(PlanState *planstate, AttachRemoteInstrContext *ctx); +extern void ExplainCommonRemoteInstr(PlanState *planstate, ExplainState *es); #endif /* EXPLAINDIST_H */ \ No newline at end of file diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 072f7f5a..1f9af7fe 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * instrument.h - * definitions for run-time statistics collection + * definitions for run-time statistics collection * * * Copyright (c) 2001-2017, PostgreSQL Global Development Group @@ -18,57 +18,71 @@ typedef struct BufferUsage { - long shared_blks_hit; /* # of shared buffer hits */ - long shared_blks_read; /* # of shared disk blocks read */ - long shared_blks_dirtied; /* # of shared blocks dirtied */ - long shared_blks_written; /* # of shared disk blocks written */ - long local_blks_hit; /* # of local buffer hits */ - long local_blks_read; /* # of local disk blocks read */ - long local_blks_dirtied; /* # of shared blocks dirtied */ - long local_blks_written; /* # of local disk blocks written */ - long temp_blks_read; /* # of temp blocks read */ - long temp_blks_written; /* # of temp blocks written */ - instr_time blk_read_time; /* time spent reading */ - instr_time blk_write_time; /* time spent writing */ + long shared_blks_hit; /* # of shared buffer hits */ + long shared_blks_read; /* # of shared disk blocks read */ + long shared_blks_dirtied; /* # of shared blocks dirtied */ + long shared_blks_written; /* # of shared disk blocks written */ + long local_blks_hit; /* # of local buffer hits */ + long local_blks_read; /* # of local disk blocks read */ + long local_blks_dirtied; /* # of shared blocks dirtied */ + long local_blks_written; /* # of local disk blocks written */ + long temp_blks_read; /* # of temp blocks read */ + long temp_blks_written; /* # of temp blocks written */ + instr_time blk_read_time; /* time spent reading */ + instr_time blk_write_time; /* time spent writing */ } BufferUsage; /* 
Flag bits included in InstrAlloc's instrument_options bitmask */ typedef enum InstrumentOption { - INSTRUMENT_TIMER = 1 << 0, /* needs timer (and row counts) */ - INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ - INSTRUMENT_ROWS = 1 << 2, /* needs row count */ - INSTRUMENT_ALL = PG_INT32_MAX + INSTRUMENT_TIMER = 1 << 0, /* needs timer (and row counts) */ + INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ + INSTRUMENT_ROWS = 1 << 2, /* needs row count */ + INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; typedef struct Instrumentation { - /* Parameters set at node creation: */ - bool need_timer; /* TRUE if we need timer data */ - bool need_bufusage; /* TRUE if we need buffer usage data */ - /* Info about current plan cycle: */ - bool running; /* TRUE if we've completed first tuple */ - instr_time starttime; /* Start time of current iteration of node */ - instr_time counter; /* Accumulated runtime for this node */ - double firsttuple; /* Time for first tuple of this cycle */ - double tuplecount; /* Tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* Buffer usage at start */ - /* Accumulated statistics across all completed cycles: */ - double startup; /* Total startup time (in seconds) */ - double total; /* Total total time (in seconds) */ - double ntuples; /* Total tuples produced */ - double nloops; /* # of run cycles for this node */ - double nfiltered1; /* # tuples removed by scanqual or joinqual */ - double nfiltered2; /* # tuples removed by "other" quals */ - BufferUsage bufusage; /* Total buffer usage */ + /* Parameters set at node creation: */ + bool need_timer; /* TRUE if we need timer data */ + bool need_bufusage; /* TRUE if we need buffer usage data */ + /* Info about current plan cycle: */ + bool running; /* TRUE if we've completed first tuple */ + instr_time starttime; /* Start time of current iteration of node */ + instr_time counter; /* Accumulated runtime for this node */ + double firsttuple; /* Time for first tuple of this cycle */ + double tuplecount; /* Tuples emitted so far this cycle */ + BufferUsage bufusage_start; /* Buffer usage at start */ + /* Accumulated statistics across all completed cycles: */ + double startup; /* Total startup time (in seconds) */ + double total; /* Total total time (in seconds) */ + double ntuples; /* Total tuples produced */ + double nloops; /* # of run cycles for this node */ + double nfiltered1; /* # tuples removed by scanqual or joinqual */ + double nfiltered2; /* # tuples removed by "other" quals */ + BufferUsage bufusage; /* Total buffer usage */ } Instrumentation; typedef struct WorkerInstrumentation { - int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; + int num_workers; /* # of structures that follow */ + Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerInstrumentation; +#ifdef __TBASE__ +typedef struct RemoteInstrumentation +{ + int nodeid; /* which datanode the instrument comes from */ + Instrumentation instr; /* the instrumentation */ +} RemoteInstrumentation; + +typedef struct DatanodeInstrumentation +{ + int nnode; /* how many datanodes this node has been executed */ + RemoteInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} DatanodeInstrumentation; +#endif + extern PGDLLIMPORT BufferUsage pgBufferUsage; extern Instrumentation *InstrAlloc(int n, int instrument_options); @@ -81,4 +95,4 @@ extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *result); extern void InstrAccumParallelQuery(BufferUsage 
*result); -#endif /* INSTRUMENT_H */ +#endif /* INSTRUMENT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 20c14341..087b2223 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -928,6 +928,9 @@ typedef struct PlanState * wrapper */ Instrumentation *instrument; /* Optional runtime stats for this node */ +#ifdef __TBASE__ + DatanodeInstrumentation *dn_instrument; /* per-datanode instrumentation */ +#endif WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ /* @@ -1992,6 +1995,7 @@ typedef struct SortState bool am_worker; /* are we a worker? */ SharedSortInfo *shared_info; /* one entry per worker */ #ifdef __TBASE__ + TuplesortInstrumentation instrument; /* cached instrument from distributed nodes */ Size stateLen; ReDistributeState *state; BufFile **file; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 03b16a62..8332a217 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -177,8 +177,8 @@ typedef struct ResponseCombiner uint64 recv_datarows; /* for remote instrument */ - Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ HTAB *recv_instr_htbl; /* received str hash table for each plan_node_id */ + bool remote_parallel_estimated; /* hint for remote instrument in parallel mode */ #endif } ResponseCombiner; diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index c00fcb61..e0d757b0 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * lsyscache.h - * Convenience routines for common queries in the system catalog cache. + * Convenience routines for common queries in the system catalog cache. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,41 +21,41 @@ /* Result list element for get_op_btree_interpretation */ typedef struct OpBtreeInterpretation { - Oid opfamily_id; /* btree opfamily containing operator */ - int strategy; /* its strategy number */ - Oid oplefttype; /* declared left input datatype */ - Oid oprighttype; /* declared right input datatype */ + Oid opfamily_id; /* btree opfamily containing operator */ + int strategy; /* its strategy number */ + Oid oplefttype; /* declared left input datatype */ + Oid oprighttype; /* declared right input datatype */ } OpBtreeInterpretation; /* I/O function selector for get_type_io_data */ typedef enum IOFuncSelector { - IOFunc_input, - IOFunc_output, - IOFunc_receive, - IOFunc_send + IOFunc_input, + IOFunc_output, + IOFunc_receive, + IOFunc_send } IOFuncSelector; /* Flag bits for get_attstatsslot */ -#define ATTSTATSSLOT_VALUES 0x01 -#define ATTSTATSSLOT_NUMBERS 0x02 +#define ATTSTATSSLOT_VALUES 0x01 +#define ATTSTATSSLOT_NUMBERS 0x02 /* Result struct for get_attstatsslot */ typedef struct AttStatsSlot { - /* Always filled: */ - Oid staop; /* Actual staop for the found slot */ - /* Filled if ATTSTATSSLOT_VALUES is specified: */ - Oid valuetype; /* Actual datatype of the values */ - Datum *values; /* slot's "values" array, or NULL if none */ - int nvalues; /* length of values[], or 0 */ - /* Filled if ATTSTATSSLOT_NUMBERS is specified: */ - float4 *numbers; /* slot's "numbers" array, or NULL if none */ - int nnumbers; /* length of numbers[], or 0 */ + /* Always filled: */ + Oid staop; /* Actual staop for the found slot */ + /* Filled if ATTSTATSSLOT_VALUES is specified: */ + Oid valuetype; /* Actual datatype of the values */ + Datum *values; /* slot's "values" array, or NULL if none */ + int nvalues; /* length of values[], or 0 */ + /* Filled if ATTSTATSSLOT_NUMBERS is specified: */ + float4 *numbers; /* slot's "numbers" array, or NULL if none */ + int nnumbers; /* length of numbers[], or 0 */ - /* Remaining fields are private to get_attstatsslot/free_attstatsslot */ - void *values_arr; /* palloc'd values array, if any */ - void *numbers_arr; /* palloc'd numbers array, if any */ + /* Remaining fields are private to get_attstatsslot/free_attstatsslot */ + void *values_arr; /* palloc'd values array, if any */ + void *numbers_arr; /* palloc'd numbers array, if any */ } AttStatsSlot; /* Hook for plugins to get control in get_attavgwidth() */ @@ -63,40 +63,40 @@ typedef int32 (*get_attavgwidth_hook_type) (Oid relid, AttrNumber attnum); extern PGDLLIMPORT get_attavgwidth_hook_type get_attavgwidth_hook; extern bool op_in_opfamily(Oid opno, Oid opfamily); -extern int get_op_opfamily_strategy(Oid opno, Oid opfamily); -extern Oid get_op_opfamily_sortfamily(Oid opno, Oid opfamily); +extern int get_op_opfamily_strategy(Oid opno, Oid opfamily); +extern Oid get_op_opfamily_sortfamily(Oid opno, Oid opfamily); extern void get_op_opfamily_properties(Oid opno, Oid opfamily, bool ordering_op, - int *strategy, - Oid *lefttype, - Oid *righttype); + int *strategy, + Oid *lefttype, + Oid *righttype); extern Oid get_opfamily_member(Oid opfamily, Oid lefttype, Oid righttype, - int16 strategy); + int16 strategy); extern bool get_ordering_op_properties(Oid opno, - Oid *opfamily, Oid *opcintype, int16 *strategy); -extern Oid get_equality_op_for_ordering_op(Oid opno, bool *reverse); -extern Oid get_ordering_op_for_equality_op(Oid opno, bool use_lhs_type); + Oid *opfamily, Oid *opcintype, int16 *strategy); +extern Oid 
get_equality_op_for_ordering_op(Oid opno, bool *reverse); +extern Oid get_ordering_op_for_equality_op(Oid opno, bool use_lhs_type); extern List *get_mergejoin_opfamilies(Oid opno); extern bool get_compatible_hash_operators(Oid opno, - Oid *lhs_opno, Oid *rhs_opno); + Oid *lhs_opno, Oid *rhs_opno); extern bool get_op_hash_functions(Oid opno, - RegProcedure *lhs_procno, RegProcedure *rhs_procno); + RegProcedure *lhs_procno, RegProcedure *rhs_procno); extern List *get_op_btree_interpretation(Oid opno); extern bool equality_ops_are_compatible(Oid opno1, Oid opno2); extern Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, - int16 procnum); + int16 procnum); extern char *get_attname(Oid relid, AttrNumber attnum); extern char *get_relid_attribute_name(Oid relid, AttrNumber attnum); extern AttrNumber get_attnum(Oid relid, const char *attname); extern char get_attidentity(Oid relid, AttrNumber attnum); -extern Oid get_atttype(Oid relid, AttrNumber attnum); +extern Oid get_atttype(Oid relid, AttrNumber attnum); extern int32 get_atttypmod(Oid relid, AttrNumber attnum); extern void get_atttypetypmodcoll(Oid relid, AttrNumber attnum, - Oid *typid, int32 *typmod, Oid *collid); + Oid *typid, int32 *typmod, Oid *collid); extern char *get_collation_name(Oid colloid); #ifdef XCP -extern Oid get_collation_namespace(Oid colloid); +extern Oid get_collation_namespace(Oid colloid); extern int32 get_collation_encoding(Oid colloid); -extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp); +extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp); #endif extern char *get_constraint_name(Oid conoid); #ifdef __TBASE__ @@ -106,26 +106,26 @@ extern Oid get_rel_filenode(Oid relid); extern bool get_rel_stat(Oid relid, int *pages, float *tuples, int *all_visible_pages); #endif extern char *get_language_name(Oid langoid, bool missing_ok); -extern Oid get_opclass_family(Oid opclass); -extern Oid get_opclass_input_type(Oid opclass); +extern Oid get_opclass_family(Oid opclass); +extern Oid get_opclass_input_type(Oid opclass); extern RegProcedure get_opcode(Oid opno); extern char *get_opname(Oid opno); -extern Oid get_op_rettype(Oid opno); +extern Oid get_op_rettype(Oid opno); extern void op_input_types(Oid opno, Oid *lefttype, Oid *righttype); extern bool op_mergejoinable(Oid opno, Oid inputtype); extern bool op_hashjoinable(Oid opno, Oid inputtype); extern bool op_strict(Oid opno); extern char op_volatile(Oid opno); -extern Oid get_commutator(Oid opno); -extern Oid get_negator(Oid opno); +extern Oid get_commutator(Oid opno); +extern Oid get_negator(Oid opno); extern RegProcedure get_oprrest(Oid opno); extern RegProcedure get_oprjoin(Oid opno); extern char *get_func_name(Oid funcid); -extern Oid get_func_namespace(Oid funcid); -extern Oid get_func_rettype(Oid funcid); -extern int get_func_nargs(Oid funcid); -extern Oid get_func_signature(Oid funcid, Oid **argtypes, int *nargs); -extern Oid get_func_variadictype(Oid funcid); +extern Oid get_func_namespace(Oid funcid); +extern Oid get_func_rettype(Oid funcid); +extern int get_func_nargs(Oid funcid); +extern Oid get_func_signature(Oid funcid, Oid **argtypes, int *nargs); +extern Oid get_func_variadictype(Oid funcid); extern bool get_func_retset(Oid funcid); extern bool func_strict(Oid funcid); extern char func_volatile(Oid funcid); @@ -133,33 +133,33 @@ extern char func_parallel(Oid funcid); extern bool get_func_leakproof(Oid funcid); extern float4 get_func_cost(Oid funcid); extern float4 get_func_rows(Oid funcid); -extern 
Oid get_relname_relid(const char *relname, Oid relnamespace); +extern Oid get_relname_relid(const char *relname, Oid relnamespace); #ifdef PGXC -extern int get_relnatts(Oid relid); +extern int get_relnatts(Oid relid); #endif extern char *get_rel_name(Oid relid); -extern Oid get_rel_namespace(Oid relid); -extern Oid get_rel_type_id(Oid relid); +extern Oid get_rel_namespace(Oid relid); +extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); -extern Oid get_rel_tablespace(Oid relid); +extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); -extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); -extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); +extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); +extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); extern bool get_typisdefined(Oid typid); extern int16 get_typlen(Oid typid); extern bool get_typbyval(Oid typid); extern void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval); extern void get_typlenbyvalalign(Oid typid, int16 *typlen, bool *typbyval, - char *typalign); -extern Oid getTypeIOParam(HeapTuple typeTuple); + char *typalign); +extern Oid getTypeIOParam(HeapTuple typeTuple); extern void get_type_io_data(Oid typid, - IOFuncSelector which_func, - int16 *typlen, - bool *typbyval, - char *typalign, - char *typdelim, - Oid *typioparam, - Oid *func); + IOFuncSelector which_func, + int16 *typlen, + bool *typbyval, + char *typalign, + char *typdelim, + Oid *typioparam, + Oid *func); extern char get_typstorage(Oid typid); extern Node *get_typdefault(Oid typid); extern char get_typtype(Oid typid); @@ -167,65 +167,68 @@ extern bool type_is_rowtype(Oid typid); extern bool type_is_enum(Oid typid); extern bool type_is_range(Oid typid); extern void get_type_category_preferred(Oid typid, - char *typcategory, - bool *typispreferred); -extern Oid get_typ_typrelid(Oid typid); -extern Oid get_element_type(Oid typid); -extern Oid get_array_type(Oid typid); -extern Oid get_promoted_array_type(Oid typid); -extern Oid get_base_element_type(Oid typid); + char *typcategory, + bool *typispreferred); +extern Oid get_typ_typrelid(Oid typid); +extern Oid get_element_type(Oid typid); +extern Oid get_array_type(Oid typid); +extern Oid get_promoted_array_type(Oid typid); +extern Oid get_base_element_type(Oid typid); extern void getTypeInputInfo(Oid type, Oid *typInput, Oid *typIOParam); extern void getTypeOutputInfo(Oid type, Oid *typOutput, bool *typIsVarlena); extern void getTypeBinaryInputInfo(Oid type, Oid *typReceive, Oid *typIOParam); extern void getTypeBinaryOutputInfo(Oid type, Oid *typSend, bool *typIsVarlena); -extern Oid get_typmodin(Oid typid); -extern Oid get_typcollation(Oid typid); +extern Oid get_typmodin(Oid typid); +extern Oid get_typcollation(Oid typid); extern bool type_is_collatable(Oid typid); -extern Oid getBaseType(Oid typid); -extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod); +extern Oid getBaseType(Oid typid); +extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod); #ifdef PGXC extern char *get_typename(Oid typid); extern char * get_typenamespace_typename(Oid typid); extern char *get_pgxc_nodename(Oid nodeoid); -extern Oid get_pgxc_nodeoid_extend(const char *nodename, const char *clustername); +extern Oid get_pgxc_nodeoid_extend(const char *nodename, const char *clustername); #define get_pgxc_nodeoid(nodename) get_pgxc_nodeoid_extend((nodename), (PGXCClusterName)) -extern uint32 
get_pgxc_node_id(Oid nodeid); -extern char get_pgxc_nodetype(Oid nodeid); -extern int get_pgxc_nodeport(Oid nodeid); +extern uint32 get_pgxc_node_id(Oid nodeid); +extern char get_pgxc_nodetype(Oid nodeid); +extern int get_pgxc_nodeport(Oid nodeid); extern char *get_pgxc_nodehost(Oid nodeid); -extern bool is_pgxc_nodepreferred(Oid nodeid); -extern bool is_pgxc_nodeprimary(Oid nodeid); -extern Oid get_pgxc_groupoid(const char *groupname); -extern int get_pgxc_groupmembers(Oid groupid, Oid **members); -extern int get_pgxc_classnodes(Oid tableid, Oid **nodes); +extern bool is_pgxc_nodepreferred(Oid nodeid); +extern bool is_pgxc_nodeprimary(Oid nodeid); +#ifdef __TBASE__ +extern char *get_pgxc_nodename_from_identifier(int id); +#endif +extern Oid get_pgxc_groupoid(const char *groupname); +extern int get_pgxc_groupmembers(Oid groupid, Oid **members); +extern int get_pgxc_classnodes(Oid tableid, Oid **nodes); extern char * get_pgxc_groupname(Oid groupid); #endif extern int32 get_typavgwidth(Oid typid, int32 typmod); extern int32 get_attavgwidth(Oid relid, AttrNumber attnum); extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, - int reqkind, Oid reqop, int flags); + int reqkind, Oid reqop, int flags); extern void free_attstatsslot(AttStatsSlot *sslot); extern char *get_namespace_name(Oid nspid); #ifdef XCP -extern Oid get_namespaceid(const char *nspname); +extern Oid get_namespaceid(const char *nspname); extern char *get_typ_name(Oid typid); -extern Oid get_typ_namespace(Oid typid); -extern Oid get_typname_typid(const char *typname, Oid typnamespace); -extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); -extern Oid get_opnamespace(Oid opno); -extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); +extern Oid get_typ_namespace(Oid typid); +extern Oid get_typname_typid(const char *typname, Oid typnamespace); +extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); +extern Oid get_opnamespace(Oid opno); +extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); #endif extern char *get_namespace_name_or_temp(Oid nspid); -extern Oid get_range_subtype(Oid rangeOid); +extern Oid get_range_subtype(Oid rangeOid); #ifdef XCP -extern Oid get_tablesample_method_id(const char *methodname); +extern Oid get_tablesample_method_id(const char *methodname); #endif #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ #define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid) -#define TypeIsToastable(typid) (get_typstorage(typid) != 'p') +#define TypeIsToastable(typid) (get_typstorage(typid) != 'p') -#endif /* LSYSCACHE_H */ +#endif /* LSYSCACHE_H */ diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index a1c6c31b..761f5a90 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -6169,6 +6169,446 @@ create index idx_nestloop_suppression1_b on nestloop_suppression1(b); analyze nestloop_suppression1; analyze nestloop_suppression2; analyze nestloop_suppression3; +begin; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. 
+create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN 
+-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + QUERY PLAN +------------------------------------------------------------------ + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> 
Hash + -> Seq Scan on bigger_than_it_looks s +(8 rows) + +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +--------------------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on bigger_than_it_looks s +(10 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... 
+-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +----------------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on extremely_skewed s +(10 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 1 +(1 row) + +rollback to settings; +-- A couple of other hash join tests unrelated to work_mem management. +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local parallel_leader_participation = off; +ERROR: unrecognized configuration parameter "parallel_leader_participation" +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +ERROR: current transaction is aborted, commands ignored until end of transaction block +rollback to settings; +-- A full outer join where every record is matched. 
+-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious outer hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- An full outer join where every record is not matched. set enable_hashjoin = false; explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out new file mode 100644 index 00000000..691d1bb5 --- /dev/null +++ b/src/test/regress/expected/tbase_explain.out @@ -0,0 +1,362 @@ +--explain analyze +create table a1(id int, num int, name text); +create table a2(id int, num int, name text); +insert into a1 values(1,generate_series(1,100),'a'); +insert into a1 values(2,generate_series(1,100),'b'); +insert into a1 values(3,generate_series(1,100),'c'); +insert into a2 select * from a1; +--normal cases +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1; + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: id, num, name +(14 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select num, count(*) cnt from a2 group by num order by cnt; + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=100 loops=1) + Output: num, count(*) + Sort Key: count(*) + -> Sort + DN (actual rows=42..58 loops=1..1) + - datanode_1 (actual rows=42 loops=1) + - datanode_2 (actual rows=58 loops=1) + Output: num, (count(*)) + Sort Key: (count(*)) + Sort Method: quicksort Memory: 28kB + -> Finalize HashAggregate + DN (actual rows=42..58 loops=1..1) + - datanode_1 (actual rows=42 loops=1) + - datanode_2 (actual 
rows=58 loops=1) + Output: num, count(*) + Group Key: a2.num + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=84..116 loops=1..1) + - datanode_1 (actual rows=84 loops=1) + - datanode_2 (actual rows=116 loops=1) + Output: num, PARTIAL count(*) + Distribute results by H: num + -> Partial HashAggregate + DN (actual rows=100..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: num, PARTIAL count(*) + Group Key: a2.num + -> Seq Scan on public.a2 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: id, num, name +(33 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1, a2 where a1.num = a2.num; + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=900 loops=1) + Output: a1.id, a1.num, a1.name, a2.id, a2.num, a2.name + -> Hash Join + DN (actual rows=378..522 loops=1..1) + - datanode_1 (actual rows=378 loops=1) + - datanode_2 (actual rows=522 loops=1) + Output: a1.id, a1.num, a1.name, a2.id, a2.num, a2.name + Hash Cond: (a1.num = a2.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a1.id, a1.num, a1.name + Distribute results by H: num + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Hash + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a2.id, a2.num, a2.name + Buckets: 1024 Batches: 1 Memory Usage: 16kB + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a2.id, a2.num, a2.name + Distribute results by H: num + -> Seq Scan on public.a2 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a2.id, a2.num, a2.name +(36 rows) + +--append +explain (costs off,timing off,summary off,analyze,verbose) +select max(num) from a1 union select min(num) from a1 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Unique (actual rows=2 loops=1) + Output: (max(a1.num)) + -> Sort (actual rows=2 loops=1) + Output: (max(a1.num)) + Sort Key: (max(a1.num)) + Sort Method: quicksort Memory: 25kB + -> Append (actual rows=2 loops=1) + -> Finalize Aggregate (actual rows=1 loops=1) + Output: max(a1.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL max(a1.num) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL max(a1.num) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Finalize Aggregate (actual rows=1 loops=1) + Output: min(a1_1.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL min(a1_1.num) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + 
- datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL min(a1_1.num) + -> Seq Scan on public.a1 a1_1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1_1.id, a1_1.num, a1_1.name +(35 rows) + +--subplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where id in (select count(*) from a2 where a1.num=a2.num); + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (SubPlan 1) + SubPlan 1 + -> Finalize Aggregate + DN (actual rows=1..1 loops=100..200) + - datanode_1 (actual rows=1 loops=200) + - datanode_2 (actual rows=1 loops=100) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=100..200) + - datanode_1 (actual rows=2 loops=200) + - datanode_2 (actual rows=2 loops=100) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a1.num = a2.num) +(30 rows) + +--initplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='a'); + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) + InitPlan 1 (returns $0) + -> Finalize Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'a'::text) +(30 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='b') order by id; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: a1.id, a1.num, a1.name + Sort Key: a1.id + -> Sort + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Sort Key: a1.id + Sort Method: quicksort Memory: 25kB + InitPlan 1 (returns $0) + -> Finalize 
Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'b'::text) + -> Seq Scan on public.a1 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) +(38 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; + QUERY PLAN +------------------------------------------------------------------------------------------- + Limit (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + InitPlan 1 (returns $0) + -> Finalize Aggregate (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'c'::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + -> Limit + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) +(31 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: count(*), a1.name + -> Finalize HashAggregate + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*), a1.name + Group Key: a1.name + Filter: (count(*) = $0) + InitPlan 1 (returns $0) + -> Finalize Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + 
Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'a'::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.name, PARTIAL count(*) + Distribute results by H: name + -> Partial HashAggregate + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.name, PARTIAL count(*) + Group Key: a1.name + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name +(48 rows) + +--cleanup +drop table a1, a2; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index ebd01715..c0ccc373 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -161,3 +161,9 @@ test: xc_notrans_block # This runs XL specific tests test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table xl_distribution_column_types_modulo xl_plan_pushdown xl_functions xl_limitations xl_user_defined_functions xl_join xl_distributed_xact xl_create_table + +# This runs TBase specific tests +test: tbase_explain + +test: redistribute_custom_types +test: nestloop_by_shard diff --git a/src/test/regress/sql/tbase_explain.sql b/src/test/regress/sql/tbase_explain.sql new file mode 100644 index 00000000..7e212bc7 --- /dev/null +++ b/src/test/regress/sql/tbase_explain.sql @@ -0,0 +1,37 @@ +--explain analyze +create table a1(id int, num int, name text); +create table a2(id int, num int, name text); +insert into a1 values(1,generate_series(1,100),'a'); +insert into a1 values(2,generate_series(1,100),'b'); +insert into a1 values(3,generate_series(1,100),'c'); +insert into a2 select * from a1; + +--normal cases +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1; +explain (costs off,timing off,summary off,analyze,verbose) +select num, count(*) cnt from a2 group by num order by cnt; +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1, a2 where a1.num = a2.num; + +--append +explain (costs off,timing off,summary off,analyze,verbose) +select max(num) from a1 union select min(num) from a1 order by 1; + +--subplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where id in (select count(*) from a2 where a1.num=a2.num); + +--initplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='a'); +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='b') order by id; +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); + +--cleanup +drop table a1, a2; + From 799266e603f90f920e9152a33deeb68abd3ea039 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 10 Mar 2021 11:22:13 +0800 Subject: [PATCH 141/578] Fix a compile warning --- src/backend/access/common/printtup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index a9b0b09b..dfd64707 100644 --- a/src/backend/access/common/printtup.c +++ 
b/src/backend/access/common/printtup.c @@ -447,6 +447,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) } else { + int len = strlen(outputstr); #ifdef __TBASE__ if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { @@ -482,7 +483,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self) pfree(tupdesc_data.data); } #endif - int len = strlen(outputstr); pq_sendint(&buf, len, 4); appendBinaryStringInfo(&buf, outputstr, len); } From d0dc3c6c72d1918ef025484b076c5a18cb767314 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 10 Mar 2021 14:35:33 +0800 Subject: [PATCH 142/578] fix bug in parallel sort and parallel hash redistributing data http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083333949 --- src/backend/executor/nodeAgg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d36c3df4..8b1695a6 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -6060,7 +6060,7 @@ GetReDistributeData(ReDistributeState *state, BufFile *file, TupleTableSlot **sl } else { - data = (char *)palloc0(nread); + data = (char *)palloc0(dataLen); } READ_DATA: From ae830ca4141a8e4ba78387615b3e936acc59018d Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 16 Mar 2021 12:03:31 +0800 Subject: [PATCH 143/578] Support explain analyze for INSERT INTO SELECT statement tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131862892295 --- src/backend/commands/explain_dist.c | 6 +++--- src/backend/tcop/pquery.c | 26 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 65d9fed8..1f2d50e4 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -706,15 +706,15 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb bool found; RemoteInstr *cur_instr; - /* must doing this under per query context */ - MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); - if (combiner->recv_instr_htbl == NULL) { elog(ERROR, "combiner is not prepared for instrumentation"); } elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); + /* must doing this under per query context */ + MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index c1eadf2c..5d358337 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -45,6 +45,7 @@ #include "optimizer/planner.h" #include "executor/execParallel.h" #include "commands/defrem.h" +#include "commands/explain_dist.h" #include "commands/vacuum.h" #include "postmaster/postmaster.h" #include "optimizer/planmain.h" @@ -65,7 +66,8 @@ static void ProcessQuery(PlannedStmt *plan, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, - char *completionTag); + char *completionTag, + int instrument); static void FillPortalStore(Portal portal, bool isTopLevel); static uint64 RunFromStore(Portal portal, ScanDirection direction, uint64 count, DestReceiver *dest); @@ -179,8 +181,9 @@ ProcessQuery(PlannedStmt *plan, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, - char *completionTag) -{// #lizard forgives + char *completionTag, + int instrument) +{ QueryDesc *queryDesc; /* @@ -191,13 +194,13 @@ 
ProcessQuery(PlannedStmt *plan, { queryDesc = CreateQueryDesc(plan, sourceText, InvalidSnapshot, InvalidSnapshot, - dest, params, queryEnv, 0); + dest, params, queryEnv, instrument); } else #endif queryDesc = CreateQueryDesc(plan, sourceText, GetActiveSnapshot(), InvalidSnapshot, - dest, params, queryEnv, 0); + dest, params, queryEnv, instrument); /* * Call ExecutorStart to prepare the plan for execution @@ -248,6 +251,13 @@ ProcessQuery(PlannedStmt *plan, } } +#ifdef __TBASE__ + if (instrument && queryDesc->planstate) + { + SendLocalInstr(queryDesc->planstate); + } +#endif + /* * Now, we close down all the scans and free allocated resources. */ @@ -2090,7 +2100,8 @@ PortalRunMulti(Portal portal, portal->sourceText, portal->portalParams, portal->queryEnv, - dest, completionTag); + dest, completionTag, + portal->up_instrument); #ifdef PGXC /* it's special for INSERT */ if (IS_PGXC_COORDINATOR && @@ -2106,7 +2117,8 @@ PortalRunMulti(Portal portal, portal->sourceText, portal->portalParams, portal->queryEnv, - altdest, NULL); + altdest, NULL, + portal->up_instrument); } if (log_executor_stats) From 1ceb1a4eba88297cdb128a3b796995e6acbcdfaf Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 16 Mar 2021 15:52:10 +0800 Subject: [PATCH 144/578] Make cost_xxx functions more readable cherry-pick same refactor from V3 commit: ce5a1e72 http://tapd.oa.com/pgxz/prong/stories/view/1010092131862621757 --- src/backend/optimizer/path/costsize.c | 323 +++++----------------- src/backend/optimizer/util/pathnode.c | 2 +- src/test/regress/expected/create_view.out | 6 +- 3 files changed, 68 insertions(+), 263 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 06e9a7c3..491d9d40 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -179,7 +179,6 @@ static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); -#ifdef __TBASE__ /* * In PostgreSQL, the row count estimate of a base rel scan, like a Seq Scan * or an Index Scan, can be directly copied from RelOptInfo->rows/tuples. In @@ -211,72 +210,14 @@ static double get_parallel_divisor(Path *path); * the original RelOptInfo, you'll get a compiler error. That's good: it forces * you to think whether the value needs to be divided by nDNs or not. */ -typedef struct -{ - /* Values copied from RelOptInfo as is, for convenience */ - Index relid; - RTEKind rtekind; /* RELATION, SUBQUERY, or FUNCTION */ - Oid reltablespace; /* containing tablespace */ - double allvisfrac; - - /* Values adjusted from RelOptInfo, by dividing by number of DNs */ - double rows; - BlockNumber pages; - double tuples; - - /* the original RelOptInfo */ - RelOptInfo *orig; -} RelOptInfoDataNode; +#define PAGES_PER_DN(pages) \ + (ceil((double) (pages) / num_nodes)) -/* ParamPathInfoDataNode is a similar proxy for ParamPathInfo. 
*/ -typedef struct -{ - double ppi_rows; /* estimated number of result tuples */ - List *ppi_clauses; /* join clauses available from outer rels */ - - ParamPathInfo *orig; -} ParamPathInfoDataNode; - -static ParamPathInfoDataNode * -adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, RelOptInfo *baserel_orig, - ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) -{ - double nodes = path_count_datanodes(path); - - basescan->relid = baserel_orig->relid; - basescan->rtekind = baserel_orig->rtekind; - basescan->reltablespace = baserel_orig->reltablespace; - basescan->allvisfrac = baserel_orig->allvisfrac; - - basescan->rows = clamp_row_est(baserel_orig->rows / nodes); - basescan->tuples = clamp_row_est(baserel_orig->tuples / nodes); - basescan->pages = ceil((double) baserel_orig->pages / nodes); - - basescan->orig = baserel_orig; - - if (param_info_orig) - { - param_info->ppi_rows = clamp_row_est(param_info_orig->ppi_rows / nodes); - param_info->ppi_clauses = param_info_orig->ppi_clauses; - param_info->orig = param_info_orig; - return param_info; - } - else - return NULL; -} - -/* - * ADJUST_BASESCAN initializes the proxy structs for RelOptInfo and ParamPathInfo, - * adjusting them by # of data nodes as needed. - */ -#define ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info) \ - RelOptInfoDataNode baserel_adjusted; \ - ParamPathInfoDataNode param_info_adjusted; \ - RelOptInfoDataNode *baserel = &baserel_adjusted; \ - ParamPathInfoDataNode *param_info = adjust_reloptinfo(path, &baserel_adjusted, baserel_orig, \ - ¶m_info_adjusted, param_info_orig) -#endif +#define ROWS_PER_DN(rows) \ + (clamp_row_est((rows) / num_nodes)) +#define TUPLES_PER_DN(tuples) \ + (clamp_row_est((tuples) / num_nodes)) /* * clamp_row_est @@ -298,7 +239,6 @@ clamp_row_est(double nrows) return nrows; } - /* * cost_seqscan * Determines and returns the cost of scanning a relation sequentially. 
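The three *_PER_DN macros above are the whole of the per-datanode adjustment: row, tuple
and page estimates taken from a RelOptInfo are divided by the number of datanodes the path
touches before they feed the usual cost formulas. A minimal standalone sketch of that
arithmetic (the clamp_row_est here is a simplified stand-in for the planner's own function,
and the two-datanode figures are made up for illustration):

#include <math.h>
#include <stdio.h>

/* simplified stand-in for the planner's clamp_row_est(): round and keep >= 1 */
static double
clamp_row_est(double nrows)
{
    return (nrows <= 1.0) ? 1.0 : rint(nrows);
}

#define PAGES_PER_DN(pages)   (ceil((double) (pages) / num_nodes))
#define ROWS_PER_DN(rows)     (clamp_row_est((rows) / num_nodes))
#define TUPLES_PER_DN(tuples) (clamp_row_est((tuples) / num_nodes))

int
main(void)
{
    double num_nodes = 2.0;      /* what path_count_datanodes() would return */
    double rel_tuples = 10000.0; /* RelOptInfo->tuples for the whole table */
    double rel_pages = 55.0;     /* RelOptInfo->pages for the whole table */

    /* each datanode scans only its share, so the per-DN cost inputs shrink */
    printf("tuples per DN: %.0f\n", TUPLES_PER_DN(rel_tuples)); /* 5000 */
    printf("pages per DN:  %.0f\n", PAGES_PER_DN(rel_pages));   /* 28 */
    return 0;
}

cost_seqscan() and friends then charge spc_seq_page_cost and cpu_tuple_cost against these
per-DN figures instead of the whole-table numbers, which is what the rewritten hunks below do.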
@@ -308,20 +248,15 @@ clamp_row_est(double nrows) */ void cost_seqscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost cpu_run_cost; Cost disk_run_cost; double spc_seq_page_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations */ Assert(baserel->relid > 0); @@ -329,9 +264,9 @@ cost_seqscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); if (!enable_seqscan) startup_cost += disable_cost; @@ -344,18 +279,14 @@ cost_seqscan(Path *path, PlannerInfo *root, /* * disk costs */ - disk_run_cost = spc_seq_page_cost * baserel->pages; + disk_run_cost = spc_seq_page_cost * PAGES_PER_DN(baserel->pages); /* CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - cpu_run_cost = cpu_per_tuple * baserel->tuples; + cpu_run_cost = cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; @@ -395,14 +326,8 @@ cost_seqscan(Path *path, PlannerInfo *root, */ void cost_samplescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; RangeTblEntry *rte; @@ -413,6 +338,7 @@ cost_samplescan(Path *path, PlannerInfo *root, spc_page_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations with tablesample clauses */ Assert(baserel->relid > 0); @@ -424,9 +350,9 @@ cost_samplescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* fetch estimated page cost for tablespace containing table */ get_tablespace_page_costs(baserel->reltablespace, @@ -441,7 +367,7 @@ cost_samplescan(Path *path, PlannerInfo *root, * disk costs (recall that baserel->pages has already been set to the * number of pages the sampling method will visit) */ - run_cost += spc_page_cost * baserel->pages; + run_cost += spc_page_cost * PAGES_PER_DN(baserel->pages); /* * CPU costs (recall that baserel->tuples has already been set to the @@ -451,15 +377,11 @@ cost_samplescan(Path *path, PlannerInfo *root, * simple constants anyway. We also don't charge anything for the * calculations the sampling method might do internally. 
*/ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; run_cost += path->pathtarget->cost.per_tuple * path->rows; @@ -480,26 +402,20 @@ cost_samplescan(Path *path, PlannerInfo *root, */ void cost_gather(GatherPath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, -#else RelOptInfo *rel, ParamPathInfo *param_info, -#endif double *rows) { -#ifdef __TBASE__ - ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); -#endif Cost startup_cost = 0; Cost run_cost = 0; + double num_nodes = path_count_datanodes((Path *) path); /* Mark the path with the correct row estimate */ if (rows) path->path.rows = *rows; else if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = rel->rows; + path->path.rows = ROWS_PER_DN(rel->rows); startup_cost = path->subpath->startup_cost; @@ -537,30 +453,24 @@ reset_cost_gather(GatherPath *path) */ void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, -#else RelOptInfo *rel, ParamPathInfo *param_info, -#endif Cost input_startup_cost, Cost input_total_cost, double *rows) { -#ifdef __TBASE__ - ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); -#endif Cost startup_cost = 0; Cost run_cost = 0; Cost comparison_cost; double N; double logN; + double num_nodes = path_count_datanodes((Path *) path); /* Mark the path with the correct row estimate */ if (rows) path->path.rows = *rows; else if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = rel->rows; + path->path.rows = ROWS_PER_DN(rel->rows); if (!enable_gathermerge) startup_cost += disable_cost; @@ -622,12 +532,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, bool partial_path) {// #lizard forgives IndexOptInfo *index = path->indexinfo; -#ifdef __TBASE__ - RelOptInfo *baserel_orig = index->rel; - ADJUST_BASESCAN(&path->path, baserel_orig, baserel, path->path.param_info, param_info); -#else RelOptInfo *baserel = index->rel; -#endif bool indexonly = (path->path.pathtype == T_IndexOnlyScan); amcostestimate_function amcostestimate; List *qpquals; @@ -650,17 +555,13 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double rand_heap_pages; double index_pages; double nodes = 1; + double index_pages_per_dn; + double baserel_pages_per_dn; + double num_nodes = path_count_datanodes((Path *) path); -#ifdef __TBASE__ - nodes = path_count_datanodes(&path->path); - /* Should only be applied to base relations */ - Assert(IsA(baserel_orig, RelOptInfo) && - IsA(index, IndexOptInfo)); -#else /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && IsA(index, IndexOptInfo)); -#endif Assert(baserel->relid > 0); Assert(baserel->rtekind == RTE_RELATION); @@ -671,21 +572,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * baserestrictinfo as the list of relevant restriction clauses for 
the * rel. */ -#ifdef __TBASE__ - if (param_info) - { - path->path.rows = param_info->ppi_rows; - /* qpquals come from the rel's restriction clauses and ppi_clauses */ - qpquals = list_concat( - extract_nonindex_conditions(path->indexinfo->indrestrictinfo, - path->indexquals), - extract_nonindex_conditions(param_info->ppi_clauses, - path->indexquals)); - } -#else if (path->path.param_info) { - path->path.rows = path->path.param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(path->path.param_info->ppi_rows); /* qpquals come from the rel's restriction clauses and ppi_clauses */ qpquals = list_concat( extract_nonindex_conditions(path->indexinfo->indrestrictinfo, @@ -693,10 +582,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, extract_nonindex_conditions(path->path.param_info->ppi_clauses, path->indexquals)); } -#endif else { - path->path.rows = baserel->rows; + path->path.rows = ROWS_PER_DN(baserel->rows); /* qpquals come from just the rel's restriction clauses */ qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo, path->indexquals); @@ -720,7 +608,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, &index_pages); /* The index pages should be divided among all the data nodes like baserel dose. */ - index_pages = ceil(index_pages / nodes); + index_pages_per_dn = PAGES_PER_DN(index_pages); + baserel_pages_per_dn = PAGES_PER_DN(baserel->pages); /* * Save amcostestimate's results for possible use in bitmap scan planning. @@ -735,7 +624,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, run_cost += indexTotalCost - indexStartupCost; /* estimate number of main-table tuples fetched */ - tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); + tuples_fetched = clamp_row_est(indexSelectivity * TUPLES_PER_DN(baserel->tuples)); /* fetch estimated page costs for tablespace containing table */ get_tablespace_page_costs(baserel->reltablespace, @@ -780,12 +669,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * fetches are random accesses. */ pages_fetched = index_pages_fetched(tuples_fetched * loop_count, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -805,15 +690,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * where such a plan is actually interesting, only one page would get * fetched per scan anyway, so it shouldn't matter much.) */ - pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + pages_fetched = ceil(indexSelectivity * (double) PAGES_PER_DN(baserel->pages)); pages_fetched = index_pages_fetched(pages_fetched * loop_count, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -828,12 +709,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * interpolate between that and the correlation-derived result. 
*/ pages_fetched = index_pages_fetched(tuples_fetched, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -845,7 +722,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, max_IO_cost = pages_fetched * spc_random_page_cost; /* min_IO_cost is for the perfectly correlated case (csquared=1) */ - pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + pages_fetched = ceil(indexSelectivity * (double) baserel_pages_per_dn); if (indexonly) pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); @@ -876,13 +753,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * sequential as for parallel scans the pages are accessed in random * order. */ -#ifdef __TBASE__ - path->path.parallel_workers = compute_parallel_worker(baserel_orig, - rand_heap_pages, index_pages); -#else path->path.parallel_workers = compute_parallel_worker(baserel, - rand_heap_pages, index_pages); -#endif + rand_heap_pages, index_pages_per_dn); /* * Fall out if workers can't be assigned for parallel scan, because in @@ -1379,14 +1251,8 @@ cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) */ void cost_tidscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, List *tidquals, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; bool isCurrentOf = false; @@ -1396,6 +1262,7 @@ cost_tidscan(Path *path, PlannerInfo *root, int ntuples; ListCell *l; double spc_random_page_cost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations */ Assert(baserel->relid > 0); @@ -1403,9 +1270,9 @@ cost_tidscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Count how many tuples we expect to retrieve */ ntuples = 0; @@ -1442,11 +1309,7 @@ cost_tidscan(Path *path, PlannerInfo *root, */ if (isCurrentOf) { -#ifdef __TBASE__ - Assert(baserel->orig->baserestrictcost.startup >= disable_cost); -#else Assert(baserel->baserestrictcost.startup >= disable_cost); -#endif startup_cost -= disable_cost; } else if (!enable_tidscan) @@ -1467,11 +1330,7 @@ cost_tidscan(Path *path, PlannerInfo *root, run_cost += spc_random_page_cost * ntuples; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif /* XXX currently we assume TID quals are a subset of qpquals */ startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; @@ -1496,18 +1355,13 @@ cost_tidscan(Path *path, PlannerInfo *root, */ void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(&path->path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost; Cost run_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes((Path *)path); /* Should only be applied to base relations that are subqueries */ 
Assert(baserel->relid > 0); @@ -1515,9 +1369,9 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = baserel->rows; + path->path.rows = ROWS_PER_DN(baserel->rows); /* * Cost of path is cost of evaluating the subplan, plus cost of evaluating @@ -1528,11 +1382,7 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, path->path.startup_cost = path->subpath->startup_cost; path->path.total_cost = path->subpath->total_cost; -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost = qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1555,20 +1405,15 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, */ void cost_functionscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; RangeTblEntry *rte; QualCost exprcost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are functions */ Assert(baserel->relid > 0); @@ -1577,9 +1422,9 @@ cost_functionscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * Estimate costs of executing the function expression(s). @@ -1599,11 +1444,7 @@ cost_functionscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1626,20 +1467,15 @@ cost_functionscan(Path *path, PlannerInfo *root, */ void cost_tablefuncscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; RangeTblEntry *rte; QualCost exprcost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are functions */ Assert(baserel->relid > 0); @@ -1648,9 +1484,9 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * Estimate costs of executing the table func expression(s). 
@@ -1665,15 +1501,11 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1692,18 +1524,13 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, */ void cost_valuesscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are values lists */ Assert(baserel->relid > 0); @@ -1711,9 +1538,9 @@ cost_valuesscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * For now, estimate list evaluation cost at one operator eval per list @@ -1722,15 +1549,11 @@ cost_valuesscan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_operator_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1752,18 +1575,13 @@ cost_valuesscan(Path *path, PlannerInfo *root, */ void cost_ctescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are CTEs */ Assert(baserel->relid > 0); @@ -1771,23 +1589,19 @@ cost_ctescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Charge one CPU tuple cost per row for tuplestore manipulation */ cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * 
baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1803,18 +1617,13 @@ cost_ctescan(Path *path, PlannerInfo *root, */ void cost_namedtuplestorescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are Tuplestores */ Assert(baserel->relid > 0); @@ -1822,23 +1631,19 @@ cost_namedtuplestorescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Charge one CPU tuple cost per row for tuplestore manipulation */ cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 690adbfd..c0847553 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7001,7 +7001,7 @@ reparameterize_path(PlannerInfo *root, Path *path, #ifdef __TBASE__ /* - * count datanode number for given path, consider replication table as 1 + * Count datanode number for given path, consider replication table as 1 * because we use this function to figure out how many parts that data * had been separated into, when we estimating costs of a plan. Therefore * to get more accurate estimating result as in a distributed system. 
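Note that the divisor used by ROWS_PER_DN/TUPLES_PER_DN/PAGES_PER_DN is the number of parts
the data has been split into, not the number of nodes that run the scan, which is why
path_count_datanodes() reports 1 for a replicated table. A small illustration, with
hypothetical table names and row counts:

/*
 * t_hash is HASH-distributed across two datanodes, 10000 rows in total:
 *   path_count_datanodes() -> 2, so the cost model sees ~5000 tuples per DN.
 * t_repl is REPLICATED on the same two datanodes, 10000 rows on each:
 *   path_count_datanodes() -> 1, so the cost model still sees all 10000 tuples;
 *   every datanode holds a full copy, and dividing by two would make a scan of
 *   the replicated table look cheaper than it really is.
 */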
diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 57376793..b8836c0d 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -59,7 +59,7 @@ SELECT * FROM viewtest; EXPLAIN SELECT a FROM viewtest; QUERY PLAN ------------------------------------------------------------------------------------------------- - Subquery Scan on viewtest (cost=22.23..25.04 rows=225 width=4) + Subquery Scan on viewtest (cost=22.23..27.29 rows=225 width=4) -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) -> Sort (cost=22.23..22.79 rows=225 width=8) Sort Key: viewtest_tbl.b DESC @@ -78,8 +78,8 @@ SELECT a FROM viewtest; EXPLAIN SELECT * FROM viewtest ORDER BY a; QUERY PLAN ------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=33.83..34.39 rows=225 width=8) - -> Sort (cost=33.83..34.39 rows=225 width=8) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36.08..36.64 rows=225 width=8) + -> Sort (cost=36.08..36.64 rows=225 width=8) Sort Key: viewtest_tbl.a -> Sort (cost=22.23..22.79 rows=225 width=8) Sort Key: viewtest_tbl.b DESC From 0f2f19866f7776adc4a758bc300733f9c65d34b3 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 18 Mar 2021 15:00:14 +0800 Subject: [PATCH 145/578] Fix two compile warnings --- src/backend/commands/explain_dist.c | 3 ++- src/backend/optimizer/path/costsize.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 1f2d50e4..81fedadc 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -705,6 +705,7 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb StringInfo recv_str; bool found; RemoteInstr *cur_instr; + MemoryContext oldcontext; if (combiner->recv_instr_htbl == NULL) { @@ -713,7 +714,7 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); /* must doing this under per query context */ - MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 491d9d40..c0fa9bdf 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -554,7 +554,6 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double pages_fetched; double rand_heap_pages; double index_pages; - double nodes = 1; double index_pages_per_dn; double baserel_pages_per_dn; double num_nodes = path_count_datanodes((Path *) path); From e60f8bde529bbfa2d045303ca55c7166f2efaed0 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 12 Mar 2021 14:14:52 +0800 Subject: [PATCH 146/578] Support hash varbit and bit for V2 we do not add meta data, just add a function for locator tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082800841 --- src/backend/pgxc/locator/locator.c | 4 +++ src/backend/utils/adt/varbit.c | 9 +++++++ src/include/utils/varbit.h | 40 ++++++++++++++++-------------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c 
b/src/backend/pgxc/locator/locator.c index fdc500fa..431e5bc3 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -38,6 +38,7 @@ #include "utils/relcache.h" #include "utils/tqual.h" #include "utils/syscache.h" +#include "utils/varbit.h" #include "nodes/nodes.h" #include "optimizer/clauses.h" #include "parser/parse_coerce.h" @@ -1020,6 +1021,9 @@ hash_func_ptr(Oid dataType) return hash_numeric; case UUIDOID: return uuid_hash; + case BITOID: + case VARBITOID: + return bithash; default: return NULL; } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index bde80471..933ad9ff 100644 --- a/src/backend/utils/adt/varbit.c +++ b/src/backend/utils/adt/varbit.c @@ -16,6 +16,7 @@ #include "postgres.h" +#include "access/hash.h" #include "access/htup_details.h" #include "libpq/pqformat.h" #include "nodes/nodeFuncs.h" @@ -1871,3 +1872,11 @@ bitgetbit(PG_FUNCTION_ARGS) else PG_RETURN_INT32(0); } + +Datum +bithash(PG_FUNCTION_ARGS) +{ + VarBit *arg1 = PG_GETARG_VARBIT_P(0); + + return hash_any(VARBITS(arg1), VARBITBYTES(arg1)); +} diff --git a/src/include/utils/varbit.h b/src/include/utils/varbit.h index f82f3aec..53c4f080 100644 --- a/src/include/utils/varbit.h +++ b/src/include/utils/varbit.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * varbit.h - * Functions for the SQL datatypes BIT() and BIT VARYING(). + * Functions for the SQL datatypes BIT() and BIT VARYING(). * * Code originally contributed by Adriaan Joubert. * @@ -24,10 +24,10 @@ */ typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) */ - int32 bit_len; /* number of valid bits */ - bits8 bit_dat[FLEXIBLE_ARRAY_MEMBER]; /* bit string, most sig. byte - * first */ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int32 bit_len; /* number of valid bits */ + bits8 bit_dat[FLEXIBLE_ARRAY_MEMBER]; /* bit string, most sig. byte + * first */ } VarBit; /* @@ -36,34 +36,36 @@ typedef struct * BIT and BIT VARYING are toastable varlena types. They are the same * as far as representation goes, so we just have one set of macros. 
*/ -#define DatumGetVarBitP(X) ((VarBit *) PG_DETOAST_DATUM(X)) -#define DatumGetVarBitPCopy(X) ((VarBit *) PG_DETOAST_DATUM_COPY(X)) -#define VarBitPGetDatum(X) PointerGetDatum(X) -#define PG_GETARG_VARBIT_P(n) DatumGetVarBitP(PG_GETARG_DATUM(n)) +#define DatumGetVarBitP(X) ((VarBit *) PG_DETOAST_DATUM(X)) +#define DatumGetVarBitPCopy(X) ((VarBit *) PG_DETOAST_DATUM_COPY(X)) +#define VarBitPGetDatum(X) PointerGetDatum(X) +#define PG_GETARG_VARBIT_P(n) DatumGetVarBitP(PG_GETARG_DATUM(n)) #define PG_GETARG_VARBIT_P_COPY(n) DatumGetVarBitPCopy(PG_GETARG_DATUM(n)) -#define PG_RETURN_VARBIT_P(x) return VarBitPGetDatum(x) +#define PG_RETURN_VARBIT_P(x) return VarBitPGetDatum(x) /* Header overhead *in addition to* VARHDRSZ */ -#define VARBITHDRSZ sizeof(int32) +#define VARBITHDRSZ sizeof(int32) /* Number of bits in this bit string */ -#define VARBITLEN(PTR) (((VarBit *) (PTR))->bit_len) +#define VARBITLEN(PTR) (((VarBit *) (PTR))->bit_len) /* Pointer to the first byte containing bit string data */ -#define VARBITS(PTR) (((VarBit *) (PTR))->bit_dat) +#define VARBITS(PTR) (((VarBit *) (PTR))->bit_dat) /* Number of bytes in the data section of a bit string */ -#define VARBITBYTES(PTR) (VARSIZE(PTR) - VARHDRSZ - VARBITHDRSZ) +#define VARBITBYTES(PTR) (VARSIZE(PTR) - VARHDRSZ - VARBITHDRSZ) /* Padding of the bit string at the end (in bits) */ -#define VARBITPAD(PTR) (VARBITBYTES(PTR)*BITS_PER_BYTE - VARBITLEN(PTR)) +#define VARBITPAD(PTR) (VARBITBYTES(PTR)*BITS_PER_BYTE - VARBITLEN(PTR)) /* Number of bytes needed to store a bit string of a given length */ -#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE + \ - VARHDRSZ + VARBITHDRSZ) +#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE + \ + VARHDRSZ + VARBITHDRSZ) /* * Maximum number of bits. Several code sites assume no overflow from * computing bitlen + X; VARBITTOTALLEN() has the largest such X. */ -#define VARBITMAXLEN (INT_MAX - BITS_PER_BYTE + 1) +#define VARBITMAXLEN (INT_MAX - BITS_PER_BYTE + 1) /* pointer beyond the end of the bit string (like end() in STL containers) */ -#define VARBITEND(PTR) (((bits8 *) (PTR)) + VARSIZE(PTR)) +#define VARBITEND(PTR) (((bits8 *) (PTR)) + VARSIZE(PTR)) /* Mask that will cover exactly one byte, i.e. 
BITS_PER_BYTE bits */ #define BITMASK 0xFF +extern Datum bithash(PG_FUNCTION_ARGS); + #endif From 7ad70a744ff6974bcc69450d71bd2f3ada5ba9af Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 9 Mar 2021 15:20:53 +0800 Subject: [PATCH 147/578] fqs insert when distribute key's func returns a single result --- src/backend/commands/copy.c | 3 + src/backend/commands/explain.c | 2 +- src/backend/commands/prepare.c | 14 +- src/backend/executor/execMain.c | 95 +++++++ src/backend/executor/execUtils.c | 19 +- src/backend/executor/functions.c | 4 + src/backend/executor/spi.c | 5 + src/backend/nodes/copyfuncs.c | 2 + src/backend/optimizer/util/clauses.c | 30 +++ src/backend/optimizer/util/pgxcship.c | 38 ++- src/backend/pgxc/locator/locator.c | 17 ++ src/backend/pgxc/plan/planner.c | 8 + src/backend/pgxc/pool/execRemote.c | 17 +- src/backend/pgxc/pool/pgxcnode.c | 32 ++- src/backend/tcop/postgres.c | 17 +- src/backend/tcop/pquery.c | 9 +- src/backend/utils/adt/ruleutils.c | 30 ++- src/include/commands/prepare.h | 3 +- src/include/executor/executor.h | 1 + src/include/optimizer/clauses.h | 2 + src/include/pgxc/locator.h | 141 ++++++----- src/include/pgxc/planner.h | 1 + src/test/regress/expected/fast_default.out | 20 +- .../regress/expected/insert_conflict_1.out | 62 ++--- src/test/regress/expected/prepare.out | 235 ++++++++++++++++++ src/test/regress/expected/rules.out | 5 +- src/test/regress/output/constraints_3.source | 2 +- src/test/regress/sql/prepare.sql | 129 ++++++++++ 28 files changed, 798 insertions(+), 145 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 0753302f..9e5aec9f 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1873,6 +1873,9 @@ BeginCopy(ParseState *pstate, * * ExecutorStart computes a result tupdesc for us */ + if (query->returningList != NIL) + ExecutorStart(cstate->queryDesc, EXEC_FLAG_RETURNING); + else ExecutorStart(cstate->queryDesc, 0); tupDesc = cstate->queryDesc->tupDesc; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7b56c3d3..2722d951 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3988,7 +3988,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp estate = planstate->state; oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - node = ExecInitRemoteQuery(step, estate, 0); + node = ExecInitRemoteQuery(step, estate, EXEC_FLAG_EXPLAIN_ONLY); MemoryContextSwitchTo(oldcontext); result = ExecRemoteQuery((PlanState *) node); while (result != NULL && !TupIsNull(result)) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index d4729433..7cf29f5f 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -201,7 +201,8 @@ PrepareQuery(PrepareStmt *stmt, const char *queryString, StorePreparedStatement(stmt->name, plansource, true, - false); + false, + 'N'); } /* @@ -584,7 +585,8 @@ void StorePreparedStatement(const char *stmt_name, CachedPlanSource *plansource, bool from_sql, - bool use_resowner) + bool use_resowner, + const char need_rewrite) { PreparedStatement *entry; TimestampTz cur_ts = GetCurrentStatementStartTimestamp(); @@ -603,7 +605,13 @@ StorePreparedStatement(const char *stmt_name, /* Shouldn't get a duplicate entry */ if (found) { - if (!(plansource->commandTag == entry->plansource->commandTag && + if (need_rewrite == 'Y' && + plansource->commandTag == entry->plansource->commandTag && + strcmp(plansource->query_string, 
entry->plansource->query_string) != 0) + { + entry->plansource->query_string = plansource->query_string; + } + else if (!(plansource->commandTag == entry->plansource->commandTag && strcmp(plansource->query_string, entry->plansource->query_string) == 0)) { ereport(ERROR, diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 472bec42..d516a94c 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -62,6 +62,7 @@ #include "storage/lmgr.h" #include "tcop/utility.h" #include "utils/acl.h" +#include "utils/builtins.h" #ifdef _MLS_ #include "utils/mls.h" #endif @@ -1899,6 +1900,95 @@ ExecEndPlan(PlanState *planstate, EState *estate) } } +/* + * RewriteForSql + * We must caculate the result of distribute key's function to know + * which datanode will execute the sql command. After we get the result, + * we should use the result to replace distribute key's function to + * generate a new sql that will be shipped to datanode. + * Note: for replication table, we should caculate all the results of + * functions before ship the sql. Otherwise the value may not be same + * in different datanodes. + */ +static void +RewriteForSql(RemoteQuery *plan, Query *query, + char *distribcol, bool isreplic) +{ + ListCell *lc_deparse = NULL; + TargetEntry *entry_deparse = NULL; + bool find_target = false; + StringInfoData buf; + + foreach(lc_deparse, query->targetList) + { + entry_deparse = lfirst(lc_deparse); + if (isreplic) + { + entry_deparse->expr = (Expr *)replace_distribkey_func( + (Node *)entry_deparse->expr); + find_target = true; + } + else if (strcmp(entry_deparse->resname, distribcol) == 0) + { + entry_deparse->expr = (Expr *)replace_distribkey_func( + (Node *)entry_deparse->expr); + plan->exec_nodes->en_expr = entry_deparse->expr; + find_target = true; + break; + } + } + + if (find_target) + { + initStringInfo(&buf); + /* + * We always finalise aggregates on datanodes for FQS. + * Use the expressions for ORDER BY or GROUP BY clauses. + */ + deparse_query(query, &buf, NIL, true, false); + plan->sql_statement = pstrdup(buf.data); + pfree(buf.data); + } +} + +/* + * RewriteFuncNode + * We ship the insert sql whose distribute key's value contains function. + * So we must rewrite the func node by caculating result of the function. + */ +static void +RewriteFuncNode(PlanState *planstate) +{ + RemoteQuery *plan = (RemoteQuery *)planstate->plan; + ExecNodes *exec_nodes = plan->exec_nodes; + Query *query = copyObject(plan->forDeparse); + RelationLocInfo *rel_loc_info = NULL; + char *distribcol = NULL; + + if ((!exec_nodes) || (!exec_nodes->need_rewrite)) + return; + + /* + * For replicated table, we need to execute func + * and then ship to datanode + */ + if (IsExecNodesReplicated(exec_nodes)) + { + RewriteForSql(plan, query, NULL, true); + return; + } + + if (exec_nodes->en_relid == InvalidOid || (!exec_nodes->en_expr)) + return; + + rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + if (!rel_loc_info) + return; + + distribcol = GetRelationDistribColumn(rel_loc_info); + RewriteForSql(plan, query, distribcol, false); +} + /* ---------------------------------------------------------------- * ExecutePlan * @@ -1947,6 +2037,11 @@ ExecutePlan(EState *estate, if (use_parallel_mode) EnterParallelMode(); + if (operation == CMD_INSERT && planstate->plan->type == T_RemoteQuery) + { + RewriteFuncNode(planstate); + } + /* * Loop until we've processed the proper number of tuples from the plan. 
*/ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index c9c06f89..79c629be 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -119,6 +119,7 @@ #include "utils/ruleutils.h" #endif +#include "pgxc/execRemote.h" static void ShutdownExprContext(ExprContext *econtext, bool isCommit); @@ -509,6 +510,7 @@ ExecAssignResultTypeFromTL(PlanState *planstate) { bool hasoid; TupleDesc tupDesc; + List *targetList = NIL; if (ExecContextForcesOids(planstate, &hasoid)) { @@ -521,11 +523,26 @@ ExecAssignResultTypeFromTL(PlanState *planstate) } /* + * If the command with returning syntax, the tupDesc's info should + * be maked up of returningList + */ + if (IsA(planstate, RemoteQueryState) && + (((((RemoteQueryState *)planstate)->eflags) & EXEC_FLAG_RETURNING) != 0)) + { + if (planstate->state && planstate->state->es_plannedstmt && + planstate->state->es_plannedstmt->parseTree && + planstate->state->es_plannedstmt->parseTree->returningList) + targetList = planstate->state->es_plannedstmt->parseTree->returningList; + } + if (targetList == NIL) + targetList = planstate->plan->targetlist; + + /* * ExecTypeFromTL needs the parse-time representation of the tlist, not a * list of ExprStates. This is good because some plan nodes don't bother * to set up planstate->targetlist ... */ - tupDesc = ExecTypeFromTL(planstate->plan->targetlist, hasoid); + tupDesc = ExecTypeFromTL(targetList, hasoid); ExecAssignResultType(planstate, tupDesc); } diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 08a35bd5..f35db8bf 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -853,6 +853,10 @@ postquel_start(execution_state *es, SQLFunctionCachePtr fcache) eflags = EXEC_FLAG_SKIP_TRIGGERS; else eflags = 0; /* default run-to-completion flags */ + + if (es->qd->plannedstmt->hasReturning) + eflags |= EXEC_FLAG_RETURNING; + ExecutorStart(es->qd, eflags); } diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index b64f03f6..808c75f8 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2403,6 +2403,11 @@ _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount) else eflags = EXEC_FLAG_SKIP_TRIGGERS; + if (queryDesc->plannedstmt->hasReturning) + { + eflags |= EXEC_FLAG_RETURNING; + } + ExecutorStart(queryDesc, eflags); ExecutorRun(queryDesc, ForwardScanDirection, tcount, true); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 49ad080a..780ad8aa 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1315,6 +1315,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_NODE_FIELD(query_var_tlist); COPY_SCALAR_FIELD(is_temp); #ifdef __TBASE__ + COPY_NODE_FIELD(forDeparse); COPY_STRING_FIELD(sql_select); COPY_STRING_FIELD(sql_select_base); COPY_SCALAR_FIELD(forUpadte); @@ -1361,6 +1362,7 @@ _copyExecNodes(const ExecNodes *from) #endif COPY_SCALAR_FIELD(en_relid); COPY_SCALAR_FIELD(accesstype); + COPY_SCALAR_FIELD(need_rewrite); return newnode; } diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index d840206d..82a9f2ba 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5257,6 +5257,36 @@ bool find_sublink_walker(Node *node, List **list) return expression_tree_walker(node, find_sublink_walker, list); } +/* + * replace_distribkey_func: + * evaluate the result of a function that returns only + * one value and replace as 
certain value. + */ +Node* +replace_distribkey_func(Node *node) +{ + if (node == NULL) + return NULL; + + if (node->type == T_FuncExpr) + { + FuncExpr *func = (FuncExpr *) node; + + if (!func->funcretset) + { + Node *evalNode = (Node *) evaluate_expr((Expr *) func, + func->funcresulttype, + exprTypmod(node), + func->funccollid); + return evalNode; + } + } + + return expression_tree_mutator(node, + replace_distribkey_func, + NULL); +} + /* * replace_eval_sql_value_function: * eval SQLValueFunction and replace as Const value. diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index bfbc9e99..d294de0b 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -103,7 +103,10 @@ typedef enum SS_HAS_AGG_EXPR, /* it has aggregate expressions */ SS_UNSHIPPABLE_TYPE, /* the type of expression is unshippable */ SS_UNSHIPPABLE_TRIGGER, /* the type of trigger is unshippable */ - SS_UPDATES_DISTRIBUTION_COLUMN /* query updates the distribution column */ + SS_UPDATES_DISTRIBUTION_COLUMN, /* query updates the distribution column */ + SS_NEED_FUNC_REWRITE /* exist func expression of distribution column, + * we should rewrite the expr for FQS + */ } ShippabilityStat; extern void PoolPingNodes(void); @@ -1249,13 +1252,18 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) * can be shipped to the Datanode and what can not be. */ if (!pgxc_is_func_shippable(funcexpr->funcid)) + { + /* Ship insert if function doesn't return set */ + if (sc_context->sc_query && + sc_context->sc_query->commandType == CMD_INSERT && + !(funcexpr->funcretset && sc_context->sc_for_expr)) + { + pgxc_set_shippability_reason(sc_context, SS_NEED_FUNC_REWRITE); + } + else pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } - /* - * If this is a stand alone expression and the function returns a - * set of rows, we need to handle it along with the final result of - * other expressions. So, it can not be shippable. - */ if (funcexpr->funcretset && sc_context->sc_for_expr) pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); @@ -1318,10 +1326,6 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) Query *query = (Query *)node; ExecNodes *exec_nodes = NULL; - /* PGXCTODO : If the query has a returning list, it is not shippable as of now */ - if (query->returningList) - pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); - /* A stand-alone expression containing Query is not shippable */ if (sc_context->sc_for_expr) { @@ -2018,6 +2022,20 @@ pgxc_is_query_shippable(Query *query, int query_level) */ shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR); + /* + * If an insert sql command whose distribute key's value is a + * function, we allow it to be shipped to datanode. But we must + * must know the function's result before real execute. So set + * the flag to identify rewrite in ExecutePlan. 
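+ *
+ * A hypothetical example of the case this covers:
+ *     CREATE TABLE t1 (id int, v text) DISTRIBUTE BY HASH (id);
+ *     INSERT INTO t1 VALUES (nextval('t1_seq'), 'x');
+ * nextval() is volatile and therefore not shippable by itself, but it returns
+ * a single value, so the INSERT is still marked FQS-shippable here; ExecutePlan
+ * later evaluates the function, substitutes the constant into the deparsed
+ * statement, and only then picks the target datanode.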
+ */ + if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability)) + { + exec_nodes->need_rewrite = true; + shippability = bms_del_member(shippability, SS_NEED_FUNC_REWRITE); + } + else + exec_nodes->need_rewrite = false; + /* Can not ship the query for some reason */ if (!bms_is_empty(shippability)) canShip = false; diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 431e5bc3..a6933ce2 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2281,6 +2281,23 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, return exec_nodes; } +/* + * GetRelationNodesForExplain + * This is just for explain statement, just pick one datanode. + * The returned List is a copy, so it should be freed when finished. + */ +ExecNodes * +GetRelationNodesForExplain(RelationLocInfo *rel_loc_info, + RelationAccessType accessType) +{ + ExecNodes *exec_nodes; + exec_nodes = makeNode(ExecNodes); + exec_nodes->baselocatortype = rel_loc_info->locatorType; + exec_nodes->accesstype = accessType; + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, 1); + return exec_nodes; +} + /* * GetRelationNodesByQuals * A wrapper around GetRelationNodes to reduce the node list by looking at the diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 9bc141ad..2692e307 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -348,6 +348,7 @@ pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->relationOids = glob->relationOids; result->invalItems = glob->invalItems; result->rowMarks = glob->finalrowmarks; + result->hasReturning = (query->returningList != NULL); return result; } @@ -390,6 +391,13 @@ pgxc_FQS_create_remote_plan(Query *query, ExecNodes *exec_nodes, bool is_exec_di pfree(buf.data); } + if (query_step->exec_nodes && + query_step->exec_nodes->need_rewrite && + query->commandType == CMD_INSERT) + { + query_step->forDeparse = copyObject(query); + } + /* Optimize multi-node handling */ query_step->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate); query_step->has_row_marks = query->hasForUpdate; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 40f3f655..d95a772a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6287,13 +6287,16 @@ get_exec_connections(RemoteQueryState *planstate, /* execution time determining of target Datanodes */ bool isnull; ExecNodes *nodes; + Datum partvalue; #ifdef __COLD_HOT__ bool secisnull; Datum secValue; #endif ExprState *estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); - Datum partvalue = ExecEvalExpr(estate, + /* For explain, no need to execute expr. */ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); @@ -6303,6 +6306,8 @@ get_exec_connections(RemoteQueryState *planstate, { estate = ExecInitExpr(exec_nodes->sec_en_expr, (PlanState *) planstate); + /* For explain, no need to execute expr. 
*/ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) secValue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &secisnull); @@ -6314,6 +6319,10 @@ get_exec_connections(RemoteQueryState *planstate, } #endif + if (planstate->eflags == EXEC_FLAG_EXPLAIN_ONLY) + nodes = GetRelationNodesForExplain(rel_loc_info, + exec_nodes->accesstype); + else /* PGXCTODO what is the type of partvalue here */ nodes = GetRelationNodes(rel_loc_info, partvalue, @@ -6567,10 +6576,13 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, int fetch = 0; bool prepared = false; char nodetype = PGXC_NODE_DATANODE; + ExecNodes *exec_nodes = step->exec_nodes; /* if prepared statement is referenced see if it is already * exist */ - if (step->statement) + if (exec_nodes && exec_nodes->need_rewrite == true) + prepared = false; + else if (step->statement) prepared = ActivateDatanodeStatementOnNode(step->statement, PGXCNodeGetNodeId(connection->nodeoid, @@ -8799,6 +8811,7 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) ResponseCombiner *combiner; remotestate = makeNode(RemoteQueryState); + remotestate->eflags = eflags; combiner = (ResponseCombiner *) remotestate; InitResponseCombiner(combiner, 0, node->combine_type); combiner->ss.ps.plan = (Plan *) node; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 3baad358..81027676 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1991,6 +1991,10 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, size_t old_outEnd = handle->outEnd; #endif + ResponseCombiner *combiner = handle->combiner; + bool need_rewrite = false; + int rewriteLen = 1; + /* if there are parameters, param_types should exist */ Assert(num_params <= 0 || param_types); /* 2 bytes for number of parameters, preceding the type names */ @@ -2010,8 +2014,8 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, paramTypeLen += strlen(paramTypes[cnt_params]) + 1; } - /* size + stmtLen + strlen + paramTypeLen */ - msgLen = 4 + stmtLen + strLen + paramTypeLen; + /* size + rewriteLen + stmtLen + strlen + paramTypeLen */ + msgLen = 4 + rewriteLen + stmtLen + strLen + paramTypeLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2025,6 +2029,7 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, msgLen = htonl(msgLen); memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); handle->outEnd += 4; + /* statement name */ if (statement) { @@ -2053,6 +2058,29 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, pfree(paramTypes[cnt_params]); } pfree(paramTypes); + + /* + * If the extended query contains an insert sql command whose + * distribute key's value is a function, we caculte the function + * and rewrite the insert sql with the const result. So after send + * the sql to datanode, it will be cached, However, the sql command + * changes as the result of the function, so datanode should use + * the new sql instead of cached sql. The we send a 'need_rewrite' + * flag to tell the datanode to use new sql. 
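+ *
+ * Illustration (statement text and literals are hypothetical): the first
+ * execution of "INSERT INTO t1 VALUES (nextval('t1_seq'), 'x')" might reach
+ * the datanode rewritten as "INSERT INTO t1 VALUES (101, 'x')"; the next
+ * execution produces a different literal, so the Parse message carries a
+ * trailing 'Y' byte and exec_parse_message on the datanode replaces the
+ * cached statement text instead of reusing it.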
+ */ + if (IsA((combiner->ss.ps.plan), RemoteQuery)) + { + RemoteQuery *plan = (RemoteQuery *)(combiner->ss.ps.plan); + ExecNodes *exec_nodes = plan->exec_nodes; + if (exec_nodes && exec_nodes->need_rewrite) + { + handle->outBuffer[handle->outEnd++] = 'Y'; + need_rewrite = true; + } + } + if (!need_rewrite) + handle->outBuffer[handle->outEnd++] = 'N'; + Assert(old_outEnd + ntohl(msgLen) + 1 == handle->outEnd); return 0; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c8bdc980..db91d32d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1648,8 +1648,9 @@ exec_parse_message(const char *query_string, /* string to execute */ const char *stmt_name, /* name for prepared stmt */ Oid *paramTypes, /* parameter types */ char **paramTypeNames, /* parameter type names */ - int numParams) /* number of parameters */ -{// #lizard forgives + int numParams, /* number of parameters */ + const char need_rewrite) /* plancache need to be rewritted */ +{ MemoryContext unnamed_stmt_context = NULL; MemoryContext oldcontext; List *parsetree_list; @@ -1929,11 +1930,11 @@ exec_parse_message(const char *query_string, /* string to execute */ #ifdef __TBASE__ if (use_resowner) { - StorePreparedStatement(stmt_name, psrc, false, true); + StorePreparedStatement(stmt_name, psrc, false, true, need_rewrite); } else #endif - StorePreparedStatement(stmt_name, psrc, false, false); + StorePreparedStatement(stmt_name, psrc, false, false, need_rewrite); } else { @@ -2093,7 +2094,7 @@ exec_plan_message(const char *query_string, /* source of the query */ /* * Store the query as a prepared statement. See above comments. */ - StorePreparedStatement(stmt_name, psrc, false, true); + StorePreparedStatement(stmt_name, psrc, false, true, 'N'); SetRemoteSubplan(psrc, plan_string); /* set instrument_options, default 0 */ @@ -5460,6 +5461,7 @@ PostgresMain(int argc, char *argv[], int numParams; Oid *paramTypes = NULL; char **paramTypeNames = NULL; + char need_rewrite = 'N'; forbidden_in_wal_sender(firstchar); @@ -5479,6 +5481,8 @@ PostgresMain(int argc, char *argv[], paramTypeNames = (char **)palloc(numParams * sizeof(char *)); for (i = 0; i < numParams; i++) paramTypeNames[i] = (char *)pq_getmsgstring(&input_message); + + need_rewrite = pq_getmsgbyte(&input_message); } else #endif /* PGXC */ @@ -5490,7 +5494,8 @@ PostgresMain(int argc, char *argv[], pq_getmsgend(&input_message); exec_parse_message(query_string, stmt_name, - paramTypes, paramTypeNames, numParams); + paramTypes, paramTypeNames, + numParams, need_rewrite); } break; diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 5d358337..b73224fe 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1092,11 +1092,16 @@ PortalStart(Portal portal, ParamListInfo params, */ { PlannedStmt *pstmt; + List *list = NIL; pstmt = PortalGetPrimaryStmt(portal); + if (portal->strategy == PORTAL_ONE_RETURNING && + pstmt->parseTree && pstmt->parseTree->returningList) + list = pstmt->parseTree->returningList; + else + list = pstmt->planTree->targetlist; portal->tupDesc = - ExecCleanTypeFromTL(pstmt->planTree->targetlist, - false); + ExecCleanTypeFromTL(list, false); } /* diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index fe4df6e7..2ce117ab 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -6637,6 +6637,8 @@ get_update_query_targetlist_def(Query *query, List *targetList, { TargetEntry *tle = (TargetEntry *) lfirst(l); Node *expr; + 
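(Editorial aside, not part of the patch.) The ruleutils.c hunk that begins here re-expands repeated array-subscript assignments to the same target column when deparsing an UPDATE. A rough illustration of the statement shape involved, with a hypothetical table name:

-- Both assignments target the same column, so the parser folds them into a
-- single targetlist entry whose expression is a chain of nested ArrayRef
-- nodes; the deparse loop in this hunk walks that chain and prints one
-- "arr[i] = value" item per level instead of losing the extra assignments.
UPDATE t SET arr[1] = 10, arr[2] = 20 WHERE id = 1;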
Node *aexpr = (Node*)tle->expr; + const char *attr_str; if (tle->resjunk) continue; /* ignore junk entries */ @@ -6707,16 +6709,17 @@ get_update_query_targetlist_def(Query *query, List *targetList, * Put out name of target column; look in the catalogs, not at * tle->resname, since resname will fail to track RENAME. */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); + attr_str = quote_identifier( + get_relid_attribute_name(rte->relid, tle->resno)); + appendStringInfoString(buf, attr_str); + for (;;) + { /* * Print any indirection needed (subfields or subscripts), and strip * off the top-level nodes representing the indirection assignments. */ - expr = processIndirection((Node *) tle->expr, context); - + expr = processIndirection(aexpr, context); /* * If we're in a multiassignment, skip printing anything more, unless * this is the last column; in which case, what we print should be the @@ -6725,7 +6728,7 @@ get_update_query_targetlist_def(Query *query, List *targetList, if (cur_ma_sublink != NULL) { if (--remaining_ma_columns > 0) - continue; /* not the last column of multiassignment */ + break; /* not the last column of multiassignment */ appendStringInfoChar(buf, ')'); expr = (Node *) cur_ma_sublink; cur_ma_sublink = NULL; @@ -6734,6 +6737,21 @@ get_update_query_targetlist_def(Query *query, List *targetList, appendStringInfoString(buf, " = "); get_rule_expr(expr, context, false); + + /* + * expand multiple entries for the same target attribute if need. + * if this is the last one, we don't append sep and column msg. + */ + if (IsA(aexpr, ArrayRef) && + IsA(((ArrayRef*)aexpr)->refexpr, ArrayRef)) + { + appendStringInfoString(buf, sep); + appendStringInfoString(buf, attr_str); + aexpr = (Node*)((ArrayRef*)aexpr)->refexpr; + } + else + break; + } } } diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index 57a72d94..53fbdede 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -121,7 +121,8 @@ extern void ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, extern void StorePreparedStatement(const char *stmt_name, CachedPlanSource *plansource, bool from_sql, - bool use_resowner); + bool use_resowner, + const char need_rewrite); extern PreparedStatement *FetchPreparedStatement(const char *stmt_name, bool throwError); extern void DropPreparedStatement(const char *stmt_name, bool showError); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 5262c42e..7fb94908 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -64,6 +64,7 @@ #define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */ #define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */ #define EXEC_FLAG_WITH_NO_DATA 0x0080 /* rel scannability doesn't matter */ +#define EXEC_FLAG_RETURNING 0x0800 /* returning tuples */ #ifdef XCP /* distributed executor may never execute the plan on this node */ #define EXEC_FLAG_SUBPLAN 0x0100 diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index fddeb132..e55c6033 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -91,6 +91,8 @@ extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, Node *node); extern bool find_sublink_walker(Node *node, List **list); +extern Node *replace_distribkey_func(Node *node); + extern Node *replace_eval_sql_value_function(Node *node); #endif /* CLAUSES_H */ diff --git 
a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index c6218522..bef6e6a0 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * locator.h - * Externally declared locator functions + * Externally declared locator functions * * * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -22,9 +22,9 @@ #define LOCATOR_TYPE_CUSTOM 'C' #define LOCATOR_TYPE_MODULO 'M' #define LOCATOR_TYPE_NONE 'O' -#define LOCATOR_TYPE_DISTRIBUTED 'D' /* for distributed table without specific - * scheme, e.g. result of JOIN of - * replicated and distributed table */ +#define LOCATOR_TYPE_DISTRIBUTED 'D' /* for distributed table without specific + * scheme, e.g. result of JOIN of + * replicated and distributed table */ #ifdef _MIGRATE_ #define LOCATOR_TYPE_SHARD 'S' @@ -40,14 +40,14 @@ #define IsLocatorNone(x) (x == LOCATOR_TYPE_NONE) #define IsLocatorReplicated(x) (x == LOCATOR_TYPE_REPLICATED) #define IsLocatorColumnDistributed(x) (x == LOCATOR_TYPE_HASH || \ - x == LOCATOR_TYPE_RROBIN || \ - x == LOCATOR_TYPE_MODULO || \ - x == LOCATOR_TYPE_DISTRIBUTED || \ - x == LOCATOR_TYPE_SHARD) + x == LOCATOR_TYPE_RROBIN || \ + x == LOCATOR_TYPE_MODULO || \ + x == LOCATOR_TYPE_DISTRIBUTED || \ + x == LOCATOR_TYPE_SHARD) #define IsLocatorDistributedByValue(x) (x == LOCATOR_TYPE_HASH || \ - x == LOCATOR_TYPE_MODULO || \ - x == LOCATOR_TYPE_RANGE || \ - x == LOCATOR_TYPE_SHARD) + x == LOCATOR_TYPE_MODULO || \ + x == LOCATOR_TYPE_RANGE || \ + x == LOCATOR_TYPE_SHARD) #include "nodes/primnodes.h" #include "utils/relcache.h" @@ -59,35 +59,35 @@ typedef int PartAttrNumber; */ typedef enum { - RELATION_ACCESS_READ, /* SELECT */ - RELATION_ACCESS_READ_FQS, /* SELECT for FQS */ - RELATION_ACCESS_READ_FOR_UPDATE, /* SELECT FOR UPDATE */ - RELATION_ACCESS_UPDATE, /* UPDATE OR DELETE */ - RELATION_ACCESS_INSERT /* INSERT */ + RELATION_ACCESS_READ, /* SELECT */ + RELATION_ACCESS_READ_FQS, /* SELECT for FQS */ + RELATION_ACCESS_READ_FOR_UPDATE, /* SELECT FOR UPDATE */ + RELATION_ACCESS_UPDATE, /* UPDATE OR DELETE */ + RELATION_ACCESS_INSERT /* INSERT */ } RelationAccessType; typedef struct { - Oid relid; - char locatorType; - PartAttrNumber partAttrNum; /* if partitioned */ - char *partAttrName; /* if partitioned */ + Oid relid; + char locatorType; + PartAttrNumber partAttrNum; /* if partitioned */ + char *partAttrName; /* if partitioned */ #ifdef _MIGRATE_ - Oid groupId; /* distribute group */ + Oid groupId; /* distribute group */ #endif #ifdef __COLD_HOT__ - /* used for table in cold-hot group */ - Oid coldGroupId; /* cold group oid if exist */ - AttrNumber secAttrNum; /* second distributed column's attribute number */ - char *secAttrName; /* second distributed column's name */ + /* used for table in cold-hot group */ + Oid coldGroupId; /* cold group oid if exist */ + AttrNumber secAttrNum; /* second distributed column's attribute number */ + char *secAttrName; /* second distributed column's name */ #endif - List *rl_nodeList; /* Node Indices */ - ListCell *roundRobinNode; /* index of the next one to use */ + List *rl_nodeList; /* Node Indices */ + ListCell *roundRobinNode; /* index of the next one to use */ } RelationLocInfo; -#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType) -#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType) -#define IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType) 
+#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType) +#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType) +#define IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType) /* * Nodes to execute on * primarynodelist is for replicated table writes, where to execute first. @@ -103,9 +103,9 @@ typedef struct Expr *en_expr; /* expression to evaluate at execution time if planner * can not determine execution nodes */ #ifdef __COLD_HOT__ - Expr *sec_en_expr; /* Sec Expression to evaluate at execution time - * if planner can not determine execution - * nodes */ + Expr *sec_en_expr; /* Sec Expression to evaluate at execution time + * if planner can not determine execution + * nodes */ #endif Oid en_relid; /* Relation to determine execution nodes */ RelationAccessType accesstype; /* Access type to determine execution nodes */ @@ -113,6 +113,7 @@ typedef struct bool restrict_shippable; /* The ExecNode is choose by join qual on distribute column */ bool const_subquery; /* The subquery rte only got constant values */ #endif + bool need_rewrite; /* exists func, need to be rewritted when execute plan */ } ExecNodes; @@ -122,17 +123,17 @@ typedef struct typedef enum { - LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1, - * value of nodeList ignored and can be NULL */ - LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from - * the array is returned */ - LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from - * the array is returned */ - LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **), - * value from the array is returned */ - LOCATOR_LIST_LIST, /* node list is a list, item type is determined by - * list type (integer, oid or pointer). NodeCount - * is ignored */ + LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1, + * value of nodeList ignored and can be NULL */ + LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from + * the array is returned */ + LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from + * the array is returned */ + LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **), + * value from the array is returned */ + LOCATOR_LIST_LIST, /* node list is a list, item type is determined by + * list type (integer, oid or pointer). NodeCount + * is ignored */ } LocatorListType; typedef Datum (*LocatorHashFunc) (PG_FUNCTION_ARGS); @@ -152,35 +153,35 @@ typedef struct _Locator Locator; * accessType - see RelationAccessType enum * dataType - actual data type of values provided to determine nodes * listType - defines how nodeList parameter is interpreted, see - * LocatorListType enum for more details + * LocatorListType enum for more details * nodeCount - number of nodes to distribute - * nodeList - detailed info about relation nodes. Either List or array or NULL - * result - returned address of the array where locator will output node - * references. Type of array items (int, Oid or pointer (void *)) - * depends on listType. - * primary - set to true if caller ever wants to determine primary node. + * nodeList - detailed info about relation nodes. Either List or array or NULL + * result - returned address of the array where locator will output node + * references. Type of array items (int, Oid or pointer (void *)) + * depends on listType. + * primary - set to true if caller ever wants to determine primary node. 
* Primary node will be returned as the first element of the - * result array + * result array */ #ifdef _MIGRATE_ extern Locator *createLocator(char locatorType, RelationAccessType accessType, - Oid dataType, LocatorListType listType, int nodeCount, - void *nodeList, void **result, bool primary, Oid groupid, - Oid coldGroupId, Oid secDataType, AttrNumber secAttrNum, - Oid relid); + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary, Oid groupid, + Oid coldGroupId, Oid secDataType, AttrNumber secAttrNum, + Oid relid); #else extern Locator *createLocator(char locatorType, RelationAccessType accessType, - Oid dataType, LocatorListType listType, int nodeCount, - void *nodeList, void **result, bool primary); + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary); #endif extern void freeLocator(Locator *locator); extern int GET_NODES(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ - Datum secValue, bool secIsNull, + Datum secValue, bool secIsNull, #endif - bool *hasprimary); + bool *hasprimary); extern void *getLocatorResults(Locator *self); extern void *getLocatorNodeMap(Locator *self); extern int getLocatorNodeCount(Locator *self); @@ -200,20 +201,22 @@ extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); extern char GetRelationLocType(Oid relid); extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info); extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2); -extern int GetRoundRobinNode(Oid relid); +extern int GetRoundRobinNode(Oid relid); extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, - bool isValueNull, + bool isValueNull, #ifdef __COLD_HOT__ - Datum valueForSecDistCol, bool isSecValueNull, + Datum valueForSecDistCol, bool isSecValueNull, #endif - RelationAccessType accessType); + RelationAccessType accessType); +extern ExecNodes *GetRelationNodesForExplain(RelationLocInfo *rel_loc_info, + RelationAccessType accessType); extern ExecNodes *GetRelationNodesByQuals(Oid reloid, - RelationLocInfo *rel_loc_info, - Index varno, - Node *quals, - RelationAccessType relaccess, - Node **dis_qual, - Node **sec_quals); + RelationLocInfo *rel_loc_info, + Index varno, + Node *quals, + RelationAccessType relaccess, + Node **dis_qual, + Node **sec_quals); extern bool IsTypeHashDistributable(Oid col_type); extern List *GetAllDataNodes(void); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index f08c4fce..cb221759 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -207,6 +207,7 @@ typedef struct * triggers. In order to make triggers work, we separate UPSERT into INSERT and * UPDATE. 
*/ + Query *forDeparse; /* function statement */ char *sql_select; /* select statement */ char *sql_select_base; bool forUpadte; diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index 16c60821..d390a452 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -452,16 +452,20 @@ DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; EXPLAIN (VERBOSE TRUE, COSTS FALSE) DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; - QUERY PLAN -------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: pk, c_bigint, c_text + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: t.xc_node_id, t.ctid, t.shardid, t.pk + Node/s: datanode_1, datanode_2 + Remote query: DELETE FROM t WHERE ((pk >= 10) AND (pk <= 20)) RETURNING pk, c_bigint, c_text -> Delete on fast_default.t Output: pk, c_bigint, c_text - -> Seq Scan on fast_default.t - Output: xc_node_id, ctid, shardid, pk - Filter: ((t.pk >= 10) AND (t.pk <= 20)) -(7 rows) + -> Bitmap Heap Scan on fast_default.t + Output: ctid, shardid + Recheck Cond: ((t.pk >= 10) AND (t.pk <= 20)) + -> Bitmap Index Scan on t_pkey + Index Cond: ((t.pk >= 10) AND (t.pk <= 20)) +(11 rows) -- UPDATE UPDATE T SET c_text = '"' || c_text || '"' WHERE pk < 10; diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 1dce5ece..1a544406 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -219,37 +219,37 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con -- Does the same, but JSON format shows "Conflict Arbiter Index" as JSON array: explain (costs off, format json) insert into insertconflicttest values (0, 'Bilberry') on conflict (key) do update set fruit = excluded.fruit where insertconflicttest.fruit != 'Lime' returning *; - QUERY PLAN ----------------------------------------------------------------------------- - [ + - { + - "Plan": { + - "Node Type": "Remote Subquery Scan", + - "Parallel Aware": false, + - "Replicated": "no", + - "Node List": ["datanode_2"], + - "Plans": [ + - { + - "Node Type": "ModifyTable", + - "Operation": "Insert", + - "Parent Relationship": "Outer", + - "Parallel Aware": false, + - "Relation Name": "insertconflicttest", + - "Alias": "insertconflicttest", + - "Conflict Resolution": "UPDATE", + - "Conflict Arbiter Indexes": ["key_index"], + - "Conflict Filter": "(insertconflicttest.fruit <> 'Lime'::text)",+ - "Plans": [ + - { + - "Node Type": "Result", + - "Parent Relationship": "Member", + - "Parallel Aware": false + - } + - ] + - } + - ] + - } + - } + + QUERY PLAN +------------------------------------------------------------------------------ + [ + + { + + "Plan": { + + "Node Type": "Remote Fast Query Execution", + + "Parallel Aware": false, + + "Node expr": "0" + + "Remote plan": [ + + { + + "Plan": { + + "Node Type": "ModifyTable", + + "Operation": "Insert", + + "Parallel Aware": false, + + "Relation Name": "insertconflicttest", + + "Alias": "insertconflicttest", + + "Conflict Resolution": "UPDATE", + + "Conflict Arbiter Indexes": ["key_index"], + + "Conflict Filter": "(insertconflicttest.fruit <> 'Lime'::text)",+ + "Plans": [ + + { + + "Node Type": "Result", + + "Parent Relationship": "Member", + + "Parallel Aware": false + + } + + ] + + } + 
+ } + + ] + + } + + } + ] (1 row) diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 787b242c..7dd52d9a 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -162,3 +162,238 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements ------+-----------+----------------- (0 rows) +-- +-- search_path test +-- +CREATE DATABASE search_path_db; +\c search_path_db +CREATE TABLE tbl_test( + id int primary key, + name varchar(30) +); +INSERT INTO tbl_test VALUES (1, 'public 01'); +INSERT INTO tbl_test VALUES (2, 'public 02'); +INSERT INTO tbl_test VALUES (3, 'public 03'); +select * from tbl_test order by id; + id | name +----+----------- + 1 | public 01 + 2 | public 02 + 3 | public 03 +(3 rows) + +-- create schema +CREATE SCHEMA sch01; +CREATE SCHEMA sch02; +-- set schema to sch01 +SET search_path TO sch01; +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); +BEGIN; +INSERT INTO tbl_test VALUES (11, 'sch01 11'); +INSERT INTO tbl_test VALUES (12, 'sch01 12'); +INSERT INTO tbl_test VALUES (13, 'sch01 13'); +COMMIT; +select * from tbl_test order by id; + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 +(3 rows) + +-- set schema to sch02 +SET search_path TO sch02; +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); +BEGIN; +INSERT INTO tbl_test VALUES (21, 'sch02 21'); +INSERT INTO tbl_test VALUES (22, 'sch02 22'); +INSERT INTO tbl_test VALUES (23, 'sch02 23'); +ROLLBACK; +select * from tbl_test order by id; + id | name +----+------ +(0 rows) + +-- set schema to sch01 +SET search_path = sch01; +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +PREPARE ps_test_insert (int, varchar) AS INSERT INTO tbl_test VALUES ($1, $2);; +PREPARE ps_test_select (int) AS select * from tbl_test where id < $1 order by id; +BEGIN; +EXECUTE ps_test_insert(14, 'sch01 14'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 14 | sch01 14 +(4 rows) + +ROLLBACK; +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 +(3 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +BEGIN; +EXECUTE ps_test_insert(15, 'sch01 15'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 +(4 rows) + +COMMIT; +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 +(4 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +EXECUTE ps_test_insert(16, 'sch01 16'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 + 16 | sch01 16 +(5 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +DEALLOCATE PREPARE ps_test_insert; +DEALLOCATE PREPARE ps_test_select; +-- test insert fqs in prepare +CREATE TABLE insert_fsq_test(id serial primary key, name varchar(30)); +PREPARE ps_test_insert (varchar) AS INSERT INTO insert_fsq_test (name) VALUES ($1); +EXECUTE ps_test_insert('1'); +EXECUTE ps_test_insert('2'); +EXECUTE ps_test_insert('3'); +EXECUTE ps_test_insert('4'); +EXECUTE ps_test_insert('5'); +SELECT * from insert_fsq_test order by id; + id | name +----+------ + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +DEALLOCATE PREPARE ps_test_insert; +DROP TABLE insert_fsq_test 
cascade; +-- +-- gb18030 test +-- +CREATE DATABASE gb18030_db template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c gb18030_db; +-- set client_encoding +SET client_encoding = utf8; +CREATE TABLE tbl_test(id int primary key, name varchar(3)); +INSERT INTO tbl_test VALUES (3, '张三'); +BEGIN; +INSERT INTO tbl_test VALUES (4, '李四'); +INSERT INTO tbl_test VALUES (5, '王五'); +COMMIT; +BEGIN; +INSERT INTO tbl_test VALUES (6, '丁六'); +INSERT INTO tbl_test VALUES (7, '方七'); +ROLLBACK; +SELECT * FROM tbl_test ORDER BY id; + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +PREPARE ps_test (int) AS select * from tbl_test where id < $1 order by id; +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +DEALLOCATE PREPARE ps_test; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 89552269..14660970 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3156,13 +3156,14 @@ SELECT tablename, rulename, definition FROM pg_rules explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; QUERY PLAN ------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1) + Remote Fast Query Execution + Node expr: 'h8'::bpchar -> Insert on hat_data Conflict Resolution: UPDATE Conflict Arbiter Indexes: hat_data_unique_idx Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) -> Result -(6 rows) +(7 rows) -- ensure upserting into a rule, with a CTE (different offsets!) works WITH data(hat_name, hat_color) AS MATERIALIZED ( diff --git a/src/test/regress/output/constraints_3.source b/src/test/regress/output/constraints_3.source index e19ef775..568efec7 100644 --- a/src/test/regress/output/constraints_3.source +++ b/src/test/regress/output/constraints_3.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). 
SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 7 + eight | 8 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/sql/prepare.sql b/src/test/regress/sql/prepare.sql index 507c0668..9a465ab3 100644 --- a/src/test/regress/sql/prepare.sql +++ b/src/test/regress/sql/prepare.sql @@ -75,3 +75,132 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements DEALLOCATE ALL; SELECT name, statement, parameter_types FROM pg_prepared_statements ORDER BY name; + +-- +-- search_path test +-- +CREATE DATABASE search_path_db; +\c search_path_db + +CREATE TABLE tbl_test( + id int primary key, + name varchar(30) +); + +INSERT INTO tbl_test VALUES (1, 'public 01'); +INSERT INTO tbl_test VALUES (2, 'public 02'); +INSERT INTO tbl_test VALUES (3, 'public 03'); + +select * from tbl_test order by id; + +-- create schema +CREATE SCHEMA sch01; +CREATE SCHEMA sch02; + +-- set schema to sch01 +SET search_path TO sch01; + +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); + +BEGIN; +INSERT INTO tbl_test VALUES (11, 'sch01 11'); +INSERT INTO tbl_test VALUES (12, 'sch01 12'); +INSERT INTO tbl_test VALUES (13, 'sch01 13'); +COMMIT; + +select * from tbl_test order by id; + +-- set schema to sch02 +SET search_path TO sch02; + +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); + +BEGIN; +INSERT INTO tbl_test VALUES (21, 'sch02 21'); +INSERT INTO tbl_test VALUES (22, 'sch02 22'); +INSERT INTO tbl_test VALUES (23, 'sch02 23'); +ROLLBACK; + +select * from tbl_test order by id; + +-- set schema to sch01 +SET search_path = sch01; +SHOW search_path; + +PREPARE ps_test_insert (int, varchar) AS INSERT INTO tbl_test VALUES ($1, $2);; +PREPARE ps_test_select (int) AS select * from tbl_test where id < $1 order by id; + +BEGIN; +EXECUTE ps_test_insert(14, 'sch01 14'); +EXECUTE ps_test_select(50); +ROLLBACK; +EXECUTE ps_test_select(50); + +SHOW search_path; + +BEGIN; +EXECUTE ps_test_insert(15, 'sch01 15'); +EXECUTE ps_test_select(50); +COMMIT; +EXECUTE ps_test_select(50); + +SHOW search_path; + +EXECUTE ps_test_insert(16, 'sch01 16'); +EXECUTE ps_test_select(50); + +SHOW search_path; + +DEALLOCATE PREPARE ps_test_insert; +DEALLOCATE PREPARE ps_test_select; + +-- test insert fqs in prepare +CREATE TABLE insert_fsq_test(id serial primary key, name varchar(30)); +PREPARE ps_test_insert (varchar) AS INSERT INTO insert_fsq_test (name) VALUES ($1); +EXECUTE ps_test_insert('1'); +EXECUTE ps_test_insert('2'); +EXECUTE ps_test_insert('3'); +EXECUTE ps_test_insert('4'); +EXECUTE ps_test_insert('5'); +SELECT * from insert_fsq_test order by id; +DEALLOCATE PREPARE ps_test_insert; +DROP TABLE insert_fsq_test cascade; + +-- +-- gb18030 test +-- +CREATE DATABASE gb18030_db template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c gb18030_db; + +-- set client_encoding +SET client_encoding = utf8; + +CREATE TABLE tbl_test(id int primary key, name varchar(3)); + +INSERT INTO tbl_test VALUES (3, '张三'); +BEGIN; +INSERT INTO tbl_test VALUES (4, '李四'); +INSERT INTO tbl_test VALUES (5, '王五'); +COMMIT; +BEGIN; +INSERT INTO tbl_test VALUES (6, '丁六'); +INSERT INTO tbl_test VALUES (7, '方七'); +ROLLBACK; +SELECT * FROM tbl_test ORDER BY id; + +SHOW client_encoding; + +PREPARE ps_test (int) AS select * from tbl_test where id < $1 order by id; +EXECUTE ps_test(20); +SHOW client_encoding; +EXECUTE ps_test(20); +SHOW client_encoding; +EXECUTE ps_test(20); 
+SHOW client_encoding; +DEALLOCATE PREPARE ps_test; From 8496d5d717807b3e119994ca85fe1d2a335fbc6b Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 29 May 2021 14:17:34 +0800 Subject: [PATCH 148/578] get exec_nodes's func value in rewrite for sql http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131088058267&url_cache_key=3ba5cc9f7d4408eb8cb3e14319eb688f --- src/backend/executor/execMain.c | 30 +++++++++++++++++++++++++----- src/backend/nodes/copyfuncs.c | 3 +++ src/backend/pgxc/pool/execRemote.c | 15 +++++++++++++-- src/include/pgxc/locator.h | 3 +++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d516a94c..9ec3add2 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1911,13 +1911,19 @@ ExecEndPlan(PlanState *planstate, EState *estate) * in different datanodes. */ static void -RewriteForSql(RemoteQuery *plan, Query *query, +RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, char *distribcol, bool isreplic) { + Query *query = copyObject(plan->forDeparse); ListCell *lc_deparse = NULL; TargetEntry *entry_deparse = NULL; bool find_target = false; StringInfoData buf; + bool isnull; + Datum partvalue; + ExprState *estate = NULL; + + plan->exec_nodes->rewrite_done = false; foreach(lc_deparse, query->targetList) { @@ -1932,7 +1938,21 @@ RewriteForSql(RemoteQuery *plan, Query *query, { entry_deparse->expr = (Expr *)replace_distribkey_func( (Node *)entry_deparse->expr); - plan->exec_nodes->en_expr = entry_deparse->expr; + + /* + * Get expr value here to avoid executing function again + * in get_exec_connections. + */ + estate = ExecInitExpr(entry_deparse->expr, + (PlanState *) planstate); + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, + planstate->combiner.ss.ps.ps_ExprContext, + &isnull); + + plan->exec_nodes->rewrite_value = partvalue; + plan->exec_nodes->isnull = isnull; + plan->exec_nodes->rewrite_done = true; find_target = true; break; } @@ -1961,9 +1981,9 @@ RewriteFuncNode(PlanState *planstate) { RemoteQuery *plan = (RemoteQuery *)planstate->plan; ExecNodes *exec_nodes = plan->exec_nodes; - Query *query = copyObject(plan->forDeparse); RelationLocInfo *rel_loc_info = NULL; char *distribcol = NULL; + RemoteQueryState *node = castNode(RemoteQueryState, planstate); if ((!exec_nodes) || (!exec_nodes->need_rewrite)) return; @@ -1974,7 +1994,7 @@ RewriteFuncNode(PlanState *planstate) */ if (IsExecNodesReplicated(exec_nodes)) { - RewriteForSql(plan, query, NULL, true); + RewriteForSql(node, plan, NULL, true); return; } @@ -1986,7 +2006,7 @@ RewriteFuncNode(PlanState *planstate) return; distribcol = GetRelationDistribColumn(rel_loc_info); - RewriteForSql(plan, query, distribcol, false); + RewriteForSql(node, plan, distribcol, false); } /* ---------------------------------------------------------------- diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 780ad8aa..3d4e8a68 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1363,6 +1363,9 @@ _copyExecNodes(const ExecNodes *from) COPY_SCALAR_FIELD(en_relid); COPY_SCALAR_FIELD(accesstype); COPY_SCALAR_FIELD(need_rewrite); + COPY_SCALAR_FIELD(rewrite_value); + COPY_SCALAR_FIELD(isnull); + COPY_SCALAR_FIELD(rewrite_done); return newnode; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d95a772a..491e33d3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ 
b/src/backend/pgxc/pool/execRemote.c @@ -6288,18 +6288,29 @@ get_exec_connections(RemoteQueryState *planstate, bool isnull; ExecNodes *nodes; Datum partvalue; + ExprState *estate; #ifdef __COLD_HOT__ bool secisnull; Datum secValue; #endif - ExprState *estate = ExecInitExpr(exec_nodes->en_expr, + RelationLocInfo *rel_loc_info; + if (exec_nodes->rewrite_done) + { + partvalue = exec_nodes->rewrite_value; + isnull = exec_nodes->isnull; + } + else + { + estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); /* For explain, no need to execute expr. */ if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); - RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + } + + rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); #ifdef __COLD_HOT__ if (exec_nodes->sec_en_expr) diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index bef6e6a0..4e692237 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -114,6 +114,9 @@ typedef struct bool const_subquery; /* The subquery rte only got constant values */ #endif bool need_rewrite; /* exists func, need to be rewritted when execute plan */ + Datum rewrite_value; /* function evaluate result */ + bool isnull; + bool rewrite_done; /* function rewritted */ } ExecNodes; From 7b40013a7d0a12b441eba460f92b3fdc1d470b4b Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 23 Apr 2021 15:56:28 +0800 Subject: [PATCH 149/578] Introduce global session view (merge request !276) Squash merge branch 'andrelin/global_session' into 'Tbase_v2.15.19' * Minor adjustment after review by YoungXie * Should skip explain remote query when report planstate * Introduce pg_cancel_session and pg_terminate_session to send signals to all backends of one session * Add more comments * Transport session id to parallel workers * Copy backend id into local backend status to support cluster stat collect * fix format * Support multi query strategy * Support collect backend stat from remote nodes * Support EXEC_ON_ALL_NODES if sending NIL nodelist in ExecRemoteQuery * fix a warning * Add hook before PortalDrop and after PortalStart * Allocate session id from gtm with format nodename_pid_timestamp in CN * Initial commit of global session view * fix http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131086872963 --- contrib/Makefile | 3 +- contrib/pg_stat_cluster_activity/Makefile | 27 + .../pg_stat_cluster_activity--1.0.sql | 60 + ...stat_cluster_activity--unpackaged--1.0.sql | 8 + .../pg_stat_cluster_activity.c | 1069 ++++++++++++++ .../pg_stat_cluster_activity.conf | 1 + .../pg_stat_cluster_activity.control | 5 + src/backend/access/transam/parallel.c | 16 +- src/backend/commands/explain.c | 9 + src/backend/pgxc/pool/execRemote.c | 34 +- src/backend/pgxc/pool/pgxcnode.c | 123 +- src/backend/pgxc/squeue/squeue.c | 6 + src/backend/postmaster/pgstat.c | 1 + src/backend/postmaster/postmaster.c | 3 + src/backend/tcop/postgres.c | 22 +- src/backend/tcop/pquery.c | 7 + src/backend/utils/mmgr/portalmem.c | 6 + src/include/commands/explain.h | 3 + src/include/pgstat.h | 1273 +++++++++-------- src/include/pgxc/pgxc.h | 3 + src/include/pgxc/pgxcnode.h | 11 +- src/include/pgxc/squeue.h | 83 +- src/include/utils/portal.h | 6 + 23 files changed, 2069 insertions(+), 710 deletions(-) create mode 100644 contrib/pg_stat_cluster_activity/Makefile create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql create mode 100644 
contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control diff --git a/contrib/Makefile b/contrib/Makefile index 1d0dcd37..494da1e1 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -54,7 +54,8 @@ SUBDIRS = \ unaccent \ vacuumlo \ stormstats \ - tbase_pooler_stat + tbase_pooler_stat \ + pg_stat_cluster_activity ifeq ($(with_openssl),yes) SUBDIRS += sslinfo diff --git a/contrib/pg_stat_cluster_activity/Makefile b/contrib/pg_stat_cluster_activity/Makefile new file mode 100644 index 00000000..a12ef09e --- /dev/null +++ b/contrib/pg_stat_cluster_activity/Makefile @@ -0,0 +1,27 @@ +# contrib/pg_stat_cluster_activity/Makefile + +MODULE_big = pg_stat_cluster_activity +OBJS = pg_stat_cluster_activity.o $(WIN32RES) + +EXTENSION = pg_stat_cluster_activity +DATA = pg_stat_cluster_activity--1.0.sql +PGFILEDESC = "pg_stat_cluster_activity - execution of cluster statistics" + +LDFLAGS_SL += $(filter -lm, $(LIBS)) + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf +REGRESS = pg_stat_cluster_activity +# Disabled because these tests require "shared_preload_libraries=pg_stat_cluster_activity", +# which typical installcheck users do not have (e.g. buildfarm clients). +NO_INSTALLCHECK = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_stat_cluster_activity +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql new file mode 100644 index 00000000..9f524816 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -0,0 +1,60 @@ +/* contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_stat_cluster_activity" to load this file. 
\quit + +/* Now redefine */ +CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( + sessionid text, + coordonly bool, + localonly bool, + OUT sessionid text, + OUT pid integer, + OUT client_addr inet, + OUT client_hostname text, + OUT client_port integer, + OUT nodename text, + OUT role text, + OUT datid oid, + OUT usesysid oid, + OUT wait_event_type text, + OUT wait_event text, + OUT state text, + OUT sqname text, + OUT sqdone bool, + OUT query text, + OUT planstate text, + OUT portal text, + OUT cursors text, + OUT backend_start timestamp, + OUT xact_start timestamp, + OUT query_start timestamp, + OUT state_change timestamp +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_signal_session(text, integer, bool) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_terminate_session(text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_cancel_session(text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE VIEW pg_stat_cluster_activity AS + SELECT * FROM pg_stat_get_cluster_activity(NULL, false, false); + +CREATE OR REPLACE VIEW pg_stat_cluster_activity_cn AS + SELECT * FROM pg_stat_get_cluster_activity(NULL, true, false); + +GRANT SELECT ON pg_stat_cluster_activity TO PUBLIC; +GRANT SELECT ON pg_stat_cluster_activity_cn TO PUBLIC; diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql new file mode 100644 index 00000000..33f68860 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql @@ -0,0 +1,8 @@ +/* contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_stat_cluster_activity" to load this file. \quit + +ALTER EXTENSION pg_stat_cluster_activity ADD function pg_stat_cluster_get_activity(); +ALTER EXTENSION pg_stat_statements ADD view pg_stat_cluster_activity; +ALTER EXTENSION pg_stat_statements ADD view pg_stat_cluster_activity_cn; diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c new file mode 100644 index 00000000..ff748ae0 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -0,0 +1,1069 @@ +#include "postgres.h" + +#include "catalog/pg_authid.h" +#include "catalog/pg_type.h" +#include "commands/explain.h" +#include "common/ip.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxc.h" +#include "pgxc/squeue.h" +#include "port/atomics.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/portal.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +PG_MODULE_MAGIC; + +#define PG_STAT_GET_ClUSTER_ACTIVITY_COLS 22 + +/* ---------- + * Total number of backends including auxiliary + * + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) MaxBackends + * includes autovacuum workers and background workers as well. 
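(Editorial aside, not part of the patch.) A rough usage sketch for the views and functions installed by the SQL script above; the session id literal is made up, but the view, column, and function names come from that script, and the preload requirement is stated in the extension's conf file and error hints.

-- Requires shared_preload_libraries = 'pg_stat_cluster_activity'.
CREATE EXTENSION pg_stat_cluster_activity;

-- One row per backend participating in each cluster-wide session.
SELECT sessionid, pid, nodename, role, state, query
  FROM pg_stat_cluster_activity
 ORDER BY sessionid, nodename;

-- Cancel or terminate every backend belonging to one global session.
SELECT pg_cancel_session('cn001_12345_1621234567');
SELECT pg_terminate_session('cn001_12345_1621234567');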
+ * ---------- + */ +#define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES) + +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + +/* + * PgClusterStatus is something like PgBackendStatus (see pgstat.c) but it + * contains information that a query executed in a cluster database system. + * Each PgClusterStatus stands for a backend process forked by postmaster, + * the same way PgBackendStatus does, like extended fields of PgBackendStatus. + * We show it in view pg_stat_cluster_activity, still, one tuple for an entry. + */ +typedef struct PgClusterStatus +{ + /* + * To avoid locking overhead, we use the following protocol: a backend + * increments changecount before modifying its entry, and again after + * finishing a modification. A would-be reader should note the value of + * changecount, copy the entry into private memory, then check + * changecount again. If the value hasn't changed, and if it's even, + * the copy is valid; otherwise start over. This makes updates cheap + * while reads are potentially expensive, but that's the tradeoff we want. + * + * The above protocol needs the memory barriers to ensure that the + * apparent order of execution is as it desires. Otherwise, for example, + * the CPU might rearrange the code so that changecount is incremented + * twice before the modification on a machine with weak memory ordering. + * This surprising result can lead to bugs. + */ + int changecount; + + bool valid; /* don't show this entry if false */ + + /* fields that will be shown in pg_stat_cluster_activity */ + char sessionid[NAMEDATALEN]; /* global session id in a cluster, one for a session */ + char nodename[NAMEDATALEN]; /* nodename, determined after process started */ + char role[NAMEDATALEN]; /* coord, datanode, producer or consumer */ + + /* portal_name or portal_name_unique */ + char sqname[NAMEDATALEN]; + /* true if sharequeue end, but currently change when query ends in this backend */ + bool sqdone; + /* part of plantree this backend is processing, OR last processed if backend is idle */ + char planstate[4096]; + + /* + * portal name: the name of current portal, given by upper node of processing query + * cursor name: contained in planstate this backend is querying, which would be + * portal name of next layer of nodes bellow this backend + * + * Note: with these two fields plus nodename, we can build a backend tree of executing query + * in whole distributed system. + */ + char portal[NAMEDATALEN]; + char cursors[NAMEDATALEN * 64]; +} PgClusterStatus; + +static PgClusterStatus *ClusterStatusArray = NULL; +static PgClusterStatus *MyCSEntry = NULL; + +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static PortalStart_hook_type prev_PortalStart = NULL; +static PortalDrop_hook_type prev_PortalDrop = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; + +/* + * Macros to load and store st_changecount with the memory barriers. + * + * increment_changecount_before() and + * increment_changecount_after() need to be called before and after + * entries are modified, respectively. This makes sure that st_changecount + * is incremented around the modification. + * + * Also save_changecount_before() and save_changecount_after() + * need to be called before and after entries are copied into private memory + * respectively. 
+ */ +#define increment_changecount_before(status) \ + do { \ + status->changecount++; \ + pg_write_barrier(); \ + } while (0) + +#define increment_changecount_after(status) \ + do { \ + pg_write_barrier(); \ + status->changecount++; \ + Assert((status->changecount & 1) == 0); \ + } while (0) + +#define save_changecount_before(status, save_changecount) \ + do { \ + save_changecount = status->changecount; \ + pg_read_barrier(); \ + } while (0) + +#define save_changecount_after(status, save_changecount) \ + do { \ + pg_read_barrier(); \ + save_changecount = status->changecount; \ + } while (0) + +Datum pg_stat_get_cluster_activity(PG_FUNCTION_ARGS); +Datum pg_signal_session(PG_FUNCTION_ARGS); +Datum pg_terminate_session(PG_FUNCTION_ARGS); +Datum pg_cancel_session(PG_FUNCTION_ARGS); + +void _PG_init(void); +void _PG_fini(void); + +PG_FUNCTION_INFO_V1(pg_stat_get_cluster_activity); +PG_FUNCTION_INFO_V1(pg_signal_session); +PG_FUNCTION_INFO_V1(pg_terminate_session); +PG_FUNCTION_INFO_V1(pg_cancel_session); + +/* + * walk through planstate tree and gets cursors it contains in + * RemoteSubplan node, formed as a single string delimited each + * cursor by a space (one cursor stands for a RemoteSubplan node). + */ +static bool +cursorCollectWalker(PlanState *planstate, StringInfo str) +{ + if (IsA(planstate, RemoteSubplanState)) + { + RemoteSubplan *plan = (RemoteSubplan *) planstate->plan; + if (plan->cursor != NULL) + { + appendStringInfoString(str, plan->cursor); + if (plan->unique) + appendStringInfo(str, "_%d", plan->unique); + /* add a space as delimiter */ + appendStringInfoString(str, " "); + } + } + + return planstate_tree_walker(planstate, cursorCollectWalker, str); +} + +/* + * Initialize the shared status array and several string buffers + * during postmaster startup. + */ +static void +CreateSharedClusterStatus(void) +{ + Size size; + bool found; + + /* Create or attach to the shared array */ + size = mul_size(sizeof(PgClusterStatus), NumBackendStatSlots); + ClusterStatusArray = (PgClusterStatus *) + ShmemInitStruct("Cluster Status Array", size, &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + MemSet(ClusterStatusArray, 0, size); + } +} + +/* + * Shut down a single backend's statistics reporting at process exit. + * + * Flush any remaining statistics counts out to the collector. + * Without this, operations triggered during backend exit (such as + * temp table deletions) won't be counted. + * + * Lastly, clear out our entry in the PgBackendStatus array. + */ +static void +pgcs_shutdown_hook(int code, Datum arg) +{ + volatile PgClusterStatus *entry = MyCSEntry; + + /* + * Clear my status entry, following the protocol of bumping st_changecount + * before and after. We use a volatile pointer here to ensure the + * compiler doesn't try to get cute. + */ + increment_changecount_before(entry); + + entry->valid = false; /* mark invalid to hide this entry */ + + increment_changecount_after(entry); +} + +/* ---------- + * pgcs_entry_initialize() - + * + * Initialize my cluster status entry, and set up our on-proc-exit hook. + * as an extension but we don't have hook during process startup, so called + * each time the backend try to report something. 
+ * ---------- + */ +static void +pgcs_entry_initialize(void) +{ + /* already initialized */ + if (MyCSEntry != NULL) + return; + + if (ClusterStatusArray == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("shared memory for pg_stat_cluster_activity is not prepared"), + errhint("maybe you need to set shared_preload_libraries in postgresql.conf file"))); + return; + } + + /* Initialize MyCSEntry */ + if (MyBackendId != InvalidBackendId) + { + Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends); + MyCSEntry = &ClusterStatusArray[MyBackendId - 1]; + } + else + { + /* Must be an auxiliary process */ + Assert(MyAuxProcType != NotAnAuxProcess); + + /* + * Assign the MyCSEntry for an auxiliary process. Since it doesn't + * have a BackendId, the slot is statically allocated based on the + * auxiliary process type (MyAuxProcType). Backends use slots indexed + * in the range from 1 to MaxBackends (inclusive), so we use + * MaxBackends + AuxBackendType + 1 as the index of the slot for an + * auxiliary process. + */ + MyCSEntry = &ClusterStatusArray[MaxBackends + MyAuxProcType]; + } + + /* also set nodename here, it won't change anyway */ + memcpy(MyCSEntry->nodename, PGXCNodeName, strlen(PGXCNodeName) + 1); + + /* Set up a process-exit hook to clean up */ + on_shmem_exit(pgcs_shutdown_hook, 0); +} + +/* ---------- + * pgcs_report_common + * + * Report common fileds of cluster backend status activity, + * called by pgcs_report_query_activity and pgcs_report_activity. + * report role, sqname, also if this backend become consumer, remove + * previous planstate and cursor. + * ---------- + */ +static void +pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) +{ + strncpy((char *) entry->sessionid, PGXCSessionId, NAMEDATALEN); + + entry->sqdone = false; + entry->valid = true; + + /* fields need queryDesc */ + if (IS_PGXC_DATANODE) + { + if (desc != NULL && desc->squeue) + { + strncpy((char *) entry->sqname, SqueueName(desc->squeue), NAMEDATALEN); + if (IsSqueueProducer()) + { + strncpy((char *) entry->role, "producer", NAMEDATALEN); + } + else if (IsSqueueConsumer()) + { + strncpy((char *) entry->role, "consumer", NAMEDATALEN); + /* consumer does not know of planstate */ + entry->planstate[0] = '\0'; + entry->cursors[0] = '\0'; + } + else + { + /* do not support */ + entry->role[0] = '\0'; + } + } + else if (IsParallelWorker()) + { + strncpy((char *) entry->role, "parallel worker", NAMEDATALEN); + } + else + { + strncpy((char *) entry->role, "datanode", NAMEDATALEN); + } + } + else if (IS_PGXC_COORDINATOR) + { + strncpy((char *) entry->role, "coordinator", NAMEDATALEN); + } + else + { + /* do not support */ + entry->role[0] = '\0'; + } +} + +/* ---------- + * pgcs_report_query_activity + * + * Report fileds of per-query referred, hooked as ExecutorStart_hook + * report planstate, cursors and common fields. 
+ * ---------- + */ +static void +pgcs_report_query_activity(QueryDesc *desc, int eflags) +{ + volatile PgClusterStatus *entry; + StringInfo planstate_str = NULL; + StringInfo cursors = NULL; + + if (prev_ExecutorStart) + prev_ExecutorStart(desc, eflags); + else + standard_ExecutorStart(desc, eflags); + + pgcs_entry_initialize(); + entry = MyCSEntry; + + if (!desc) + return; + + /* if query already done, just report sqdone and return */ + if (desc->already_executed) + { + increment_changecount_before(entry); + entry->sqdone = true; + increment_changecount_after(entry); + return; + } + + if (desc->planstate != NULL) + { + ExplainState *es = NewExplainState(); + + /* make planstate text tree */ + es->costs = false; + /* we don't want plan->targetlist been changed */ + es->skip_remote_query = true; + + ExplainBeginOutput(es); + ExplainPrintPlan(es, desc); + ExplainEndOutput(es); + /* remove last '\n' */ + if (es->str->len > 1) + es->str->data[--es->str->len] = '\0'; + planstate_str = es->str; + + /* find name of RemoteSubplan to show as cursors */ + cursors = makeStringInfo(); + cursorCollectWalker(desc->planstate, cursors); + } + + increment_changecount_before(entry); + + if (planstate_str != NULL && planstate_str->len > 0) + memcpy((char *) entry->planstate, planstate_str->data, Min(planstate_str->len + 1, 4096)); + if (cursors != NULL && cursors->len > 0) + memcpy((char *) entry->cursors, cursors->data, Min(cursors->len + 1, NAMEDATALEN * 64)); + + pgcs_report_common((PgClusterStatus *) entry, desc); + + increment_changecount_after(entry); +} + +/* ---------- + * pgcs_report_activity + * + * Report fileds of per-portal referred, hooked as PortalStart_hook + * report portal name and common fields. + * ---------- + */ +static void +pgcs_report_activity(Portal portal) +{ + volatile PgClusterStatus *entry; + QueryDesc *desc = portal->queryDesc; + + pgcs_entry_initialize(); + entry = MyCSEntry; + + /* if query already done, just report sqdone and return */ + if (desc != NULL && desc->already_executed) + { + increment_changecount_before(entry); + entry->sqdone = true; + increment_changecount_after(entry); + return; + } + + increment_changecount_before(entry); + + strncpy((char *) entry->portal, portal->name, NAMEDATALEN); + pgcs_report_common((PgClusterStatus *) entry, desc); + + increment_changecount_after(entry); +} + +/* ---------- + * pgstat_fetch_stat_local_csentry + * + * Given a backend id, find particular cluster status entry, copy valid + * entry into local memory, loop around changecount to ensure concurrency. + * ---------- + */ +static PgClusterStatus * +pgstat_fetch_stat_local_csentry(int beid) +{ + PgClusterStatus *csentry; + PgClusterStatus *local = palloc(sizeof(PgClusterStatus)); + local->valid = false; + + if (ClusterStatusArray == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("shared memory for pg_stat_cluster_activity is not prepared"), + errhint("maybe you need to set shared_preload_libraries in postgresql.conf"))); + return NULL; + } + + if (beid < 1) + return NULL; + + csentry = &ClusterStatusArray[beid - 1]; + + for (;;) + { + int before_changecount; + int after_changecount; + + save_changecount_before(csentry, before_changecount); + if (csentry->valid) + { + memcpy(local, csentry, sizeof(PgClusterStatus)); + } + save_changecount_after(csentry, after_changecount); + if (before_changecount == after_changecount && + (before_changecount & 1) == 0) + break; + + /* Make sure we can break out of loop if stuck... 
*/ + CHECK_FOR_INTERRUPTS(); + } + + return local; +} + +/* ---------- + * pg_stat_get_remote_activity + * + * Execute pg_stat_get_cluster_activity query remotely and save + * results in tuplestore. + * ---------- + */ +static void +pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore) +{ +#define QUERY_LEN 1024 + char query[QUERY_LEN]; + int i; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + Var *dummy; + TupleTableSlot *result = NULL; + + /* + * Here we call pg_stat_get_cluster_activity in remote with args: + * coordonly = false, localonly = true, to prevent recursive calls in remote nodes. + */ + if (sessionid == NULL) + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity(NULL, false, true)"); + else + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity('%s', false, true)", sessionid); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + /* + * set exec_nodes to NULL makes ExecRemoteQuery send query to all nodes + * (local CN nodes won't recieved query again). + */ + plan->exec_nodes = NULL; + plan->exec_type = EXEC_ON_ALL_NODES; + plan->sql_statement = (char *) query; + plan->force_autocommit = false; + + if (coordonly) + { + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_nodes->nodeList = GetAllCoordNodes(); + plan->exec_type = EXEC_ON_COORDS; + } + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (i = 1; i <= PG_STAT_GET_ClUSTER_ACTIVITY_COLS; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + tuplestore_puttupleslot(tupstore, result); + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + return; +} + +/* ---------- + * pg_stat_get_cluster_activity + * + * Internal SRF function of this extension, access sharememory to find + * every live backend which executed or executing query. copy to local + * and show status. also we collect some fields from PGBackendStatus + * + * arguments: sessionid -- global unique id for a session, generated by CN + * coordonly -- only dispatch to other cn if true. + * localonly -- collect local entries status if true. + * + * Note: since we also collect PGBackendStatus, get them first and use + * backend id to access particular cluster status entry to narrow down + * loop search range from all backend slots to localNumBackends (see pgstat.c) + * ---------- + */ +Datum +pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) +{ + int num_backends = pgstat_fetch_stat_numbackends(); + int curr_backend; + bool with_sessionid = !PG_ARGISNULL(0); + bool coordonly = PG_ARGISNULL(1) ? false : PG_GETARG_BOOL(1); + bool localonly = PG_ARGISNULL(2) ? false : PG_GETARG_BOOL(2); + const char *sessionid = with_sessionid ? 
text_to_cstring(PG_GETARG_TEXT_P(0)) : NULL; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* switch to query's memory context to save results during execution */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* dispatch query to remote if needed */ + if (!localonly && IS_PGXC_COORDINATOR) + pg_stat_get_remote_activity(sessionid, coordonly, tupstore); + + /* 1-based index */ + for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) + { + /* for each row */ + Datum values[PG_STAT_GET_ClUSTER_ACTIVITY_COLS]; + bool nulls[PG_STAT_GET_ClUSTER_ACTIVITY_COLS]; + + /* same as pg_stat_get_activity */ + LocalPgBackendStatus *local_beentry; + PgBackendStatus *beentry; + PGPROC *proc; + const char *wait_event_type = NULL; + const char *wait_event = NULL; + + /* cluster information */ + PgClusterStatus *local_csentry; + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + /* Get the next one in the list */ + local_beentry = pgstat_fetch_stat_local_beentry(curr_backend); + local_csentry = pgstat_fetch_stat_local_csentry(local_beentry->backend_id); + if (!local_beentry || !local_csentry) + { + int i; + + /* Ignore missing entries if looking for specific sessionid */ + if (with_sessionid) + continue; + + for (i = 0; i < lengthof(nulls); i++) + nulls[i] = true; + + nulls[13] = false; + values[13] = CStringGetTextDatum(""); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + continue; + } + + if (!local_csentry->valid) + continue; + + beentry = &local_beentry->backendStatus; + /* If looking for specific sessionid, ignore all the others */ + if (with_sessionid && strcmp(sessionid, local_csentry->sessionid) != 0) + continue; + + /* Values available to all callers */ + values[0] = CStringGetTextDatum(local_csentry->sessionid); + values[1] = Int32GetDatum(beentry->st_procpid); + + if (beentry->st_databaseid != InvalidOid) + values[7] = ObjectIdGetDatum(beentry->st_databaseid); + else + nulls[7] = true; + + if (beentry->st_userid != InvalidOid) + values[8] = ObjectIdGetDatum(beentry->st_userid); + else + nulls[8] = true; + + /* Values only available to owner or superuser or pg_read_all_stats */ + if (has_privs_of_role(GetUserId(), beentry->st_userid) || + is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS)) + { + SockAddr zero_clientaddr; + + /* A zeroed client addr means we don't know */ + memset(&zero_clientaddr, 0, sizeof(zero_clientaddr)); + if (memcmp(&(beentry->st_clientaddr), &zero_clientaddr, + sizeof(zero_clientaddr)) == 0) + { + 
nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + else + { + if (beentry->st_clientaddr.addr.ss_family == AF_INET +#ifdef HAVE_IPV6 + || beentry->st_clientaddr.addr.ss_family == AF_INET6 +#endif + ) + { + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + int ret; + + remote_host[0] = '\0'; + remote_port[0] = '\0'; + ret = pg_getnameinfo_all(&beentry->st_clientaddr.addr, + beentry->st_clientaddr.salen, + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret == 0) + { + clean_ipv6_addr(beentry->st_clientaddr.addr.ss_family, remote_host); + values[2] = DirectFunctionCall1(inet_in, + CStringGetDatum(remote_host)); + if (beentry->st_clienthostname && + beentry->st_clienthostname[0]) + values[3] = CStringGetTextDatum(beentry->st_clienthostname); + else + nulls[3] = true; + values[4] = Int32GetDatum(atoi(remote_port)); + } + else + { + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + } + else if (beentry->st_clientaddr.addr.ss_family == AF_UNIX) + { + /* + * Unix sockets always reports NULL for host and -1 for + * port, so it's possible to tell the difference to + * connections we have no permissions to view, or with + * errors. + */ + nulls[2] = true; + nulls[3] = true; + values[4] = DatumGetInt32(-1); + } + else + { + /* Unknown address type, should never happen */ + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + } + + values[5] = CStringGetTextDatum(local_csentry->nodename); + values[6] = CStringGetTextDatum(local_csentry->role); + + proc = BackendPidGetProc(beentry->st_procpid); + if (proc != NULL) + { + uint32 raw_wait_event; + + raw_wait_event = UINT32_ACCESS_ONCE(proc->wait_event_info); + wait_event_type = pgstat_get_wait_event_type(raw_wait_event); + wait_event = pgstat_get_wait_event(raw_wait_event); + } + else if (beentry->st_backendType != B_BACKEND) + { + /* + * For an auxiliary process, retrieve process info from + * AuxiliaryProcs stored in shared-memory. 
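+ * AuxiliaryPidGetProc() returns NULL when the PID does not belong
+ * to an auxiliary process; in that case no wait event information
+ * is reported for this entry.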
+ */ + proc = AuxiliaryPidGetProc(beentry->st_procpid); + + if (proc != NULL) + { + uint32 raw_wait_event; + + raw_wait_event = + UINT32_ACCESS_ONCE(proc->wait_event_info); + wait_event_type = + pgstat_get_wait_event_type(raw_wait_event); + wait_event = pgstat_get_wait_event(raw_wait_event); + } + } + + if (wait_event_type) + values[9] = CStringGetTextDatum(wait_event_type); + else + nulls[9] = true; + + if (wait_event) + values[10] = CStringGetTextDatum(wait_event); + else + nulls[10] = true; + + switch (beentry->st_state) + { + case STATE_IDLE: + values[11] = CStringGetTextDatum("idle"); + break; + case STATE_RUNNING: + values[11] = CStringGetTextDatum("active"); + break; + case STATE_IDLEINTRANSACTION: + values[11] = CStringGetTextDatum("idle in transaction"); + break; + case STATE_FASTPATH: + values[11] = CStringGetTextDatum("fastpath function call"); + break; + case STATE_IDLEINTRANSACTION_ABORTED: + values[11] = CStringGetTextDatum("idle in transaction (aborted)"); + break; + case STATE_DISABLED: + values[11] = CStringGetTextDatum("disabled"); + break; + case STATE_UNDEFINED: + nulls[11] = true; + break; + } + + values[12] = CStringGetTextDatum(local_csentry->sqname); + values[13] = BoolGetDatum(local_csentry->sqdone); + values[14] = CStringGetTextDatum(beentry->st_activity); + values[15] = CStringGetTextDatum(local_csentry->planstate); + values[16] = CStringGetTextDatum(local_csentry->portal); + values[17] = CStringGetTextDatum(local_csentry->cursors); + + if (beentry->st_proc_start_timestamp != 0) + values[18] = TimestampTzGetDatum(beentry->st_proc_start_timestamp); + else + nulls[18] = true; + + if (beentry->st_xact_start_timestamp != 0) + values[19] = TimestampTzGetDatum(beentry->st_xact_start_timestamp); + else + nulls[19] = true; + + if (beentry->st_activity_start_timestamp != 0) + values[20] = TimestampTzGetDatum(beentry->st_activity_start_timestamp); + else + nulls[20] = true; + + if (beentry->st_state_start_timestamp != 0) + values[21] = TimestampTzGetDatum(beentry->st_state_start_timestamp); + else + nulls[21] = true; + } + else + { + values[14] = CStringGetTextDatum(""); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[9] = true; + nulls[10] = true; + nulls[11] = true; + nulls[12] = true; + nulls[13] = true; + nulls[15] = true; + nulls[16] = true; + nulls[17] = true; + nulls[18] = true; + nulls[19] = true; + nulls[20] = true; + nulls[21] = true; + } + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +static bool +pgcs_signal_session_remote(const char *sessionid, int signal) +{ +#define QUERY_LEN 1024 + char query[QUERY_LEN]; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + Var *dummy; + TupleTableSlot *result = NULL; + + snprintf(query, QUERY_LEN, "select pg_signal_session('%s', %d, true)", sessionid, signal); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + /* + * set exec_nodes to NULL makes ExecRemoteQuery send query to all nodes + * (local CN nodes won't recieved query again). + */ + plan->exec_nodes = NULL; + plan->exec_type = EXEC_ON_ALL_NODES; + plan->sql_statement = (char *) query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
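+ * The target list is used here only to derive the result tuple
+ * descriptor, so the dummy TEXTOID Var need not match the real
+ * function expression being executed remotely.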
+ */ + dummy = makeVar(1, 1, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, 1, NULL, false)); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + ExecEndRemoteQuery(pstate); + if (TupIsNull(result)) + { + elog(ERROR, "result of pg_signal_session executed remotely is NULL"); + return false; + } + + return true; +} + +static bool +pgcs_signal_session(const char *sessionid, int signal) +{ + int num_backends = pgstat_fetch_stat_numbackends(); + int curr_backend; + const char *funcname; + LocalPgBackendStatus *local_beentry; + PgClusterStatus *local_csentry; + PgBackendStatus *beentry; + + if (signal == SIGTERM) + funcname = "pg_terminate_backend"; + else if (signal == SIGINT) + funcname = "pg_cancel_backend"; + else + elog(ERROR, "pgcs_signal_session only support SIGTERM and SIGINT, not %d", signal); + + /* 1-based index */ + for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) + { + /* Get the next one in the list */ + local_beentry = pgstat_fetch_stat_local_beentry(curr_backend); + local_csentry = pgstat_fetch_stat_local_csentry(local_beentry->backend_id); + + if (local_csentry->valid && strcmp(local_csentry->sessionid, sessionid) == 0) + { + beentry = &local_beentry->backendStatus; + OidFunctionCall1(fmgr_internal_function(funcname), + Int32GetDatum(beentry->st_procpid)); + } + } + + return true; +} + +Datum +pg_signal_session(PG_FUNCTION_ARGS) +{ + const char *sessionid = text_to_cstring(PG_GETARG_TEXT_P(0)); + int signal = PG_GETARG_INT32(1); + bool localonly = PG_ARGISNULL(2) ? false : PG_GETARG_BOOL(2); + bool result; + + result = pgcs_signal_session(sessionid, signal); + if (result && !localonly) + result = pgcs_signal_session_remote(sessionid, signal); + + return BoolGetDatum(result); +} + +Datum +pg_terminate_session(PG_FUNCTION_ARGS) +{ + return DirectFunctionCall3(pg_signal_session, + PG_GETARG_DATUM(0), + Int32GetDatum(SIGTERM), + BoolGetDatum(false)); +} + +Datum +pg_cancel_session(PG_FUNCTION_ARGS) +{ + return DirectFunctionCall3(pg_signal_session, + PG_GETARG_DATUM(0), + Int32GetDatum(SIGINT), + BoolGetDatum(false)); +} + +/* + * Hooked as shmem_startup_hook + */ +static void +pgcs_shmem_startup(void) +{ + CreateSharedClusterStatus(); +} + +/* + * Estimate shared memory space needed. + */ +static Size +pgcs_memsize(void) +{ + return mul_size(sizeof(PgClusterStatus), NumBackendStatSlots); +} + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + /* + * Request additional shared resources. (These are no-ops if we're not in + * the postmaster process.) We'll allocate or attach to the shared + * resources in pgcs_shmem_startup(). + */ + RequestAddinShmemSpace(pgcs_memsize()); + + /* + * Install hooks. + */ + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pgcs_shmem_startup; + prev_PortalStart = PortalStart_hook; + PortalStart_hook = pgcs_report_activity; + prev_PortalDrop = PortalDrop_hook; + PortalDrop_hook = pgcs_report_activity; + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pgcs_report_query_activity; +} + +/* + * Module unload callback + */ +void +_PG_fini(void) +{ + /* Uninstall hooks. 
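+ * Restoring the previously saved hook values is only safe when no
+ * other module has installed its own hooks after this one, which is
+ * the usual assumption made by contrib-style extensions.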
*/ + shmem_startup_hook = prev_shmem_startup_hook; + PortalStart_hook = prev_PortalStart; + PortalDrop_hook = prev_PortalDrop; + ExecutorStart_hook = prev_ExecutorStart; +} diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf new file mode 100644 index 00000000..91c61803 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_stat_cluster_activity' diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control new file mode 100644 index 00000000..dacd5262 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control @@ -0,0 +1,5 @@ +# pg_stat_cluster_activity extension +comment = 'track execution statistics in whole cluster scope' +default_version = '1.0' +module_pathname = '$libdir/pg_stat_cluster_activity' +relocatable = true diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 84256fe1..b67873e8 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -69,6 +69,7 @@ #define PARALLEL_KEY_GLOBALXID UINT64CONST(0xFFFFFFFFFFFF0010) #endif #define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSIONID UINT64CONST(0xFFFFFFFFFFFF0011) @@ -205,6 +206,7 @@ InitializeParallelDSM(ParallelContext *pcxt) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ Size gxidlen = 0; #endif + Size sidlen = 0; Size segsize = 0; int i; FixedParallelState *fps; @@ -241,8 +243,10 @@ InitializeParallelDSM(ParallelContext *pcxt) gxidlen = EstimateGlobalXidSpace(); shm_toc_estimate_chunk(&pcxt->estimator, gxidlen); #endif + sidlen = PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1; + shm_toc_estimate_chunk(&pcxt->estimator, sidlen); /* If you add more chunks here, you probably need to add keys. */ - shm_toc_estimate_keys(&pcxt->estimator, 7); + shm_toc_estimate_keys(&pcxt->estimator, 8); /* Estimate space need for error queues. */ StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == @@ -312,6 +316,7 @@ InitializeParallelDSM(ParallelContext *pcxt) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ char *gxidspace; #endif + char *sidspace; char *error_queue_space; char *entrypointstate; Size lnamelen; @@ -351,6 +356,10 @@ InitializeParallelDSM(ParallelContext *pcxt) SerializeGlobalXid(gxidlen, gxidspace); shm_toc_insert(pcxt->toc, PARALLEL_KEY_GLOBALXID, gxidspace); #endif + /* global session id */ + sidspace = shm_toc_allocate(pcxt->toc, sidlen); + SerializeSessionId(sidlen, sidspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSIONID, sidspace); /* Allocate space for worker information. */ pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); @@ -982,6 +991,7 @@ ParallelWorkerMain(Datum main_arg) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ char *gxidspace; #endif + char *sidspace; StringInfoData msgbuf; /* Set flag to indicate that we're initializing a parallel worker. */ @@ -1115,6 +1125,10 @@ ParallelWorkerMain(Datum main_arg) StartParallelWorkerGlobalXid(gxidspace); #endif + /* Restore session id */ + sidspace = shm_toc_lookup(toc, PARALLEL_KEY_SESSIONID, false); + StartParallelWorkerSessionId(sidspace); + /* Restore combo CID state. 
*/ combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false); RestoreComboCIDState(combocidspace); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 2722d951..09617d73 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3870,6 +3870,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp {// #lizard forgives ExecNodes *en = plan->exec_nodes; /* add names of the nodes if they exist */ + if (en && es->nodes) { StringInfo node_names = makeStringInfo(); @@ -3914,6 +3915,14 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp } } + /* + * if required, skip executing remote query, this + * is happened when a backend report planstate it + * processing, shouldn't execute it again. + */ + if (es->skip_remote_query) + return; + if (en && en->en_expr) show_expression((Node *)en->en_expr, "Node expr", planstate, ancestors, es->verbose, es); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 491e33d3..86cafb0f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3444,6 +3444,13 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if (connections[i]->state == DN_CONNECTION_STATE_QUERY) BufferConnection(connections[i]); + /* Send global session id */ + if (pgxc_node_send_sessionid(connections[i])) + { + elog(WARNING, "pgxc_node_begin sending session id failed"); + return EOF; + } + /* Send GXID and check for errors */ if (pgxc_node_send_gxid(connections[i], gxid)) { @@ -6229,6 +6236,8 @@ get_exec_connections_all_dn(bool is_global_session) /* * Get Node connections depending on the connection type: * Datanodes Only, Coordinators only or both types + * If exec_nodes is NIL and exec_type is EXEC_ON_ALL_NODES + * connect to all nodes except myself */ static PGXCNodeAllHandles * get_exec_connections(RemoteQueryState *planstate, @@ -6303,9 +6312,9 @@ get_exec_connections(RemoteQueryState *planstate, { estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); - /* For explain, no need to execute expr. */ - if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) - partvalue = ExecEvalExpr(estate, + /* For explain, no need to execute expr. 
*/ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); } @@ -6447,6 +6456,12 @@ get_exec_connections(RemoteQueryState *planstate, co_conn_count = 0; } + if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES)) + { + nodelist = GetAllDataNodes(); + dn_conn_count = NumDataNodes; + } + #ifdef __TBASE__ if (IsParallelWorker()) { @@ -8933,6 +8948,19 @@ ExecRemoteQuery(PlanState *pstate) need_global_snapshot = true; #endif } + else if (step->exec_type == EXEC_ON_ALL_NODES) + { + total_conn_count = regular_conn_count = + pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; + + connections = palloc(mul_size(total_conn_count, sizeof(PGXCNodeHandle *))); + memcpy(connections, pgxc_connections->datanode_handles, + pgxc_connections->dn_conn_count * sizeof(PGXCNodeHandle *)); + memcpy(connections + pgxc_connections->dn_conn_count, pgxc_connections->coord_handles, + pgxc_connections->co_conn_count * sizeof(PGXCNodeHandle *)); + + need_global_snapshot = g_set_global_snapshot; + } #ifdef __TBASE__ /* set snapshot as needed */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 81027676..233fd0e2 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -378,6 +378,8 @@ InitMultinodeExecutor(bool is_force) MemoryContextSwitchTo(oldcontext); + PGXCSessionId[0] = '\0'; + if (IS_PGXC_COORDINATOR) { for (count = 0; count < NumCoords; count++) @@ -386,6 +388,8 @@ InitMultinodeExecutor(bool is_force) get_pgxc_nodename(co_handles[count].nodeoid)) == 0) PGXCNodeId = count + 1; } + + sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); } else /* DataNode */ { @@ -410,7 +414,8 @@ InitMultinodeExecutor(bool is_force) } -Oid get_nodeoid_from_nodeid(int nodeid, char node_type) +Oid +get_nodeoid_from_nodeid(int nodeid, char node_type) { if (PGXC_NODE_COORDINATOR == node_type) { @@ -524,7 +529,8 @@ PGXCNodeConnect(char *connstr) return (NODE_CONNECTION *) conn; } -int PGXCNodePing(const char *connstr) +int +PGXCNodePing(const char *connstr) { if (connstr[0]) { @@ -943,8 +949,9 @@ pgxc_node_receive(const int conn_count, } -void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset) -{// #lizard forgives +void +pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset) +{ char *msg; int32 ret; //DNConnectionState estate = 0; @@ -1517,8 +1524,9 @@ release_handles(bool force) /* * Check whether there bad connections to remote nodes when abort transactions. */ -bool validate_handles(void) -{// #lizard forgives +bool +validate_handles(void) +{ int i; int ret; @@ -2462,7 +2470,8 @@ pgxc_node_send_sync(PGXCNodeHandle * handle) /* * Send logical apply message down to the Datanode */ -int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict) +int +pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict) { int msgLen = 0; @@ -3340,29 +3349,59 @@ pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId return 0; } -inline void pgxc_set_coordinator_proc_pid(int proc_pid) +void +pgxc_set_coordinator_proc_pid(int proc_pid) { pgxc_coordinator_proc_pid = (IS_PGXC_COORDINATOR ? MyProcPid : proc_pid); } -inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid) +void +pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid) { TransactionId lxid = (MyProc != NULL ? 
MyProc->lxid : InvalidTransactionId); pgxc_coordinator_proc_vxid = (IS_PGXC_COORDINATOR ? lxid : proc_vxid); } -inline int pgxc_get_coordinator_proc_pid(void) +int +pgxc_get_coordinator_proc_pid(void) { return (IS_PGXC_COORDINATOR ? MyProcPid : pgxc_coordinator_proc_pid); } -inline TransactionId pgxc_get_coordinator_proc_vxid(void) +TransactionId +pgxc_get_coordinator_proc_vxid(void) { TransactionId lxid = (MyProc != NULL ? MyProc->lxid : InvalidTransactionId); return (IS_PGXC_COORDINATOR ? lxid : pgxc_coordinator_proc_vxid); } + +int +pgxc_node_send_sessionid(PGXCNodeHandle * handle) +{ + int msgLen = 0; + + /* size + sessionid_str + '\0' */ + msgLen = 4 + strlen(PGXCSessionId) + 1; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "pgxc_node_send_sessionid out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'o'; /* session id */ + + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + memcpy(handle->outBuffer + handle->outEnd, PGXCSessionId, strlen(PGXCSessionId) + 1); + handle->outEnd += strlen(PGXCSessionId) + 1; + return 0; +} #endif /* @@ -3416,8 +3455,9 @@ add_error_message(PGXCNodeHandle *handle, const char *message) } } #ifdef __TBASE__ -void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input) -{// #lizard forgives +void +add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input) +{ ResponseCombiner *combiner; combiner = (ResponseCombiner*)combiner_input; @@ -4205,8 +4245,8 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) } /* Do translation for non-main cluster */ - -Oid PGXCGetLocalNodeOid(Oid nodeoid) +Oid +PGXCGetLocalNodeOid(Oid nodeoid) { if(false == IsPGXCMainCluster) @@ -4224,7 +4264,8 @@ Oid PGXCGetLocalNodeOid(Oid nodeoid) return nodeoid; } -Oid PGXCGetMainNodeOid(Oid nodeoid) +Oid +PGXCGetMainNodeOid(Oid nodeoid) { if(false == IsPGXCMainCluster) @@ -4420,7 +4461,9 @@ paramlist_delete_param(List *param_list, const char *name) return param_list; } -static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name) + +static ParamEntry * +paramlist_get_paramentry(List *param_list, const char *name) { ListCell *cur_item; @@ -4439,7 +4482,9 @@ static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name) return NULL; } -static ParamEntry * paramentry_copy(ParamEntry * src_entry) + +static ParamEntry * +paramentry_copy(ParamEntry * src_entry) { ParamEntry *dst_entry = NULL; if (src_entry) @@ -5432,7 +5477,8 @@ PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command, char *errms return error ? 
-1 : 0; } -bool node_ready_for_query(PGXCNodeHandle *conn) +bool +node_ready_for_query(PGXCNodeHandle *conn) { return ('Z' == (conn)->last_command); } @@ -5611,7 +5657,8 @@ void PGXCGetCoordOidOthers(Oid **nodelist) } -void PGXCGetAllDnOid(Oid *nodelist) +void +PGXCGetAllDnOid(Oid *nodelist) { Oid node_oid; int i; @@ -5663,4 +5710,40 @@ is_ddl_leader_cn(char *first_cn) } #endif +/* + * SerializeSessionId + * Dumps the serialized session id onto the memory location at + * start_address for parallel workers + */ +void +SerializeSessionId(Size maxsize, char *start_address) +{ + + if(PGXCSessionId[0] == '\0') + { + *(int *) start_address = 0; + } + else + { + int len = strlen(PGXCSessionId) + 1; + + *(int *) start_address = len; + memcpy(start_address + sizeof(int), PGXCSessionId, len); + } +} + +/* + * StartParallelWorkerSessionId + * Reads the serialized session id and set it on parallel workers + */ +void +StartParallelWorkerSessionId(char *address) +{ + char *sidspace = address + sizeof(int); + + if (*(int *) address == 0) /* len */ + PGXCSessionId[0] = '\0'; + else + strncpy((char *) PGXCSessionId, sidspace, NAMEDATALEN); +} #endif diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index e006d2c8..a4deed0d 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -9443,3 +9443,9 @@ int PipeLength(PGPipe *pPipe) } #endif + +const char * +SqueueName(SharedQueue sq) +{ + return sq->sq_key; +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index cfe20c82..0d77754a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3379,6 +3379,7 @@ pgstat_read_current_status(void) BackendIdGetTransactionIds(i, &localentry->backend_xid, &localentry->backend_xmin); + localentry->backend_id = i; localentry++; localappname += NAMEDATALEN; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index f7ed9637..c3fe228d 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -466,6 +466,9 @@ char *PGXCClusterName = NULL; char *PGXCMainClusterName = NULL; bool IsPGXCMainCluster = false; int PGXCNodeId = 0; +#ifdef __TBASE__ +char PGXCSessionId[NAMEDATALEN]; +#endif /* * When a particular node starts up, store the node identifier in this variable * so that we dont have to calculate it OR do a search in cache any where else diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index db91d32d..7020932d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -655,6 +655,7 @@ SocketBackend(StringInfo inBuf) #ifdef __TBASE__ case 'N': case 'U': /* coord info: coord_pid and top_xid */ + case 'o': /* global session id */ #endif case 'M': /* Command ID */ case 'g': /* GXID */ @@ -2838,6 +2839,8 @@ exec_execute_message(const char *portal_name, long max_rows) bool execute_is_fetch; bool was_logged = false; char msec_str[32]; + int instrument; + QueryDesc *desc; /* Adjust destination to tell printtup.c what to do */ dest = whereToSendOutput; @@ -3005,6 +3008,9 @@ exec_execute_message(const char *portal_name, long max_rows) portal->cplan->stmt_list = portal->cplan->stmt_list_backup; portal->cplan->stmt_list_backup = NULL; } + + desc = PortalGetQueryDesc(portal); + instrument = portal->up_instrument; #endif #ifdef __AUDIT__ @@ -3033,13 +3039,12 @@ exec_execute_message(const char *portal_name, long max_rows) CommandCounterIncrement(); } - #ifdef __TBASE__ - if (portal->up_instrument && - 
portal->queryDesc && - portal->queryDesc->myindex == -1) + if (instrument && + desc != NULL && + desc->myindex == -1) { - SendLocalInstr(portal->queryDesc->planstate); + SendLocalInstr(desc->planstate); } #endif /* Send appropriate CommandComplete to client */ @@ -5728,6 +5733,13 @@ PostgresMain(int argc, char *argv[], elog(DEBUG5, "Received coord_pid: %d, coord_vxid: %u", coord_pid, coord_vxid); } break; + case 'o': /* session id */ + { + const char *sessionid = pq_getmsgstring(&input_message); + pq_getmsgend(&input_message); + strncpy((char *) PGXCSessionId, sessionid, NAMEDATALEN); + } + break; #endif /* * 'X' means that the frontend is closing down the socket. EOF diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index b73224fe..2064e594 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -54,6 +54,10 @@ #ifdef __TBASE__ bool paramPassDown = false; #endif + +/* Hooks for plugins to get control in PortalStart */ +PortalStart_hook_type PortalStart_hook = NULL; + /* * ActivePortal is the currently executing Portal (the most closely nested, * if there are several). @@ -1138,6 +1142,9 @@ PortalStart(Portal portal, ParamListInfo params, portal->tupDesc = NULL; break; } + + if (PortalStart_hook) + PortalStart_hook(portal); } PG_CATCH(); { diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 567737b6..6fb83b8d 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -96,6 +96,9 @@ do { \ elog(WARNING, "trying to delete portal name that does not exist"); \ } while(0) +/* Hooks for plugins to get control in PortalDrop */ +PortalDrop_hook_type PortalDrop_hook = NULL; + static MemoryContext PortalMemory = NULL; @@ -564,6 +567,9 @@ PortalDrop(Portal portal, bool isTopCommit) (errcode(ERRCODE_INVALID_CURSOR_STATE), errmsg("cannot drop active portal \"%s\"", portal->name))); + if (PortalDrop_hook) + PortalDrop_hook(portal); + /* * Allow portalcmds.c to clean up the state it knows about, in particular * shutting down the executor if still active. This step potentially runs diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 0f8c2765..19c744a4 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -37,6 +37,9 @@ typedef struct ExplainState bool nodes; /* print nodes in RemoteQuery node */ bool num_nodes; /* print number of nodes in RemoteQuery node */ #endif /* PGXC */ +#ifdef __TBASE__ + bool skip_remote_query; /* skip execute remote query */ +#endif bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ ExplainFormat format; /* output format */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 8d0fb02e..15dd8b59 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1,12 +1,12 @@ /* ---------- - * pgstat.h + * pgstat.h * - * Definitions for the PostgreSQL statistics collector daemon. + * Definitions for the PostgreSQL statistics collector daemon. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Copyright (c) 2001-2017, PostgreSQL Global Development Group + * Copyright (c) 2001-2017, PostgreSQL Global Development Group * - * src/include/pgstat.h + * src/include/pgstat.h * ---------- */ #ifndef PGSTAT_H @@ -27,20 +27,20 @@ * Paths for the statistics files (relative to installation's $PGDATA). 
* ---------- */ -#define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" -#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" -#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" +#define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" +#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" /* Default directory to store temporary statistics data in */ -#define PG_STAT_TMP_DIR "pg_stat_tmp" +#define PG_STAT_TMP_DIR "pg_stat_tmp" /* Values for track_functions GUC variable --- order is significant! */ typedef enum TrackFunctionsLevel { - TRACK_FUNC_OFF, - TRACK_FUNC_PL, - TRACK_FUNC_ALL -} TrackFunctionsLevel; + TRACK_FUNC_OFF, + TRACK_FUNC_PL, + TRACK_FUNC_ALL +} TrackFunctionsLevel; /* ---------- * The types of backend -> collector messages @@ -48,24 +48,24 @@ typedef enum TrackFunctionsLevel */ typedef enum StatMsgType { - PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, - PGSTAT_MTYPE_TABSTAT, - PGSTAT_MTYPE_TABPURGE, - PGSTAT_MTYPE_DROPDB, - PGSTAT_MTYPE_RESETCOUNTER, - PGSTAT_MTYPE_RESETSHAREDCOUNTER, - PGSTAT_MTYPE_RESETSINGLECOUNTER, - PGSTAT_MTYPE_AUTOVAC_START, - PGSTAT_MTYPE_VACUUM, - PGSTAT_MTYPE_ANALYZE, - PGSTAT_MTYPE_ARCHIVER, - PGSTAT_MTYPE_BGWRITER, - PGSTAT_MTYPE_FUNCSTAT, - PGSTAT_MTYPE_FUNCPURGE, - PGSTAT_MTYPE_RECOVERYCONFLICT, - PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK + PGSTAT_MTYPE_DUMMY, + PGSTAT_MTYPE_INQUIRY, + PGSTAT_MTYPE_TABSTAT, + PGSTAT_MTYPE_TABPURGE, + PGSTAT_MTYPE_DROPDB, + PGSTAT_MTYPE_RESETCOUNTER, + PGSTAT_MTYPE_RESETSHAREDCOUNTER, + PGSTAT_MTYPE_RESETSINGLECOUNTER, + PGSTAT_MTYPE_AUTOVAC_START, + PGSTAT_MTYPE_VACUUM, + PGSTAT_MTYPE_ANALYZE, + PGSTAT_MTYPE_ARCHIVER, + PGSTAT_MTYPE_BGWRITER, + PGSTAT_MTYPE_FUNCSTAT, + PGSTAT_MTYPE_FUNCPURGE, + PGSTAT_MTYPE_RECOVERYCONFLICT, + PGSTAT_MTYPE_TEMPFILE, + PGSTAT_MTYPE_DEADLOCK } StatMsgType; /* ---------- @@ -75,7 +75,7 @@ typedef enum StatMsgType typedef int64 PgStat_Counter; /* ---------- - * PgStat_TableCounts The actual per-table counts kept by a backend + * PgStat_TableCounts The actual per-table counts kept by a backend * * This struct should contain only actual event counters, because we memcmp * it against zeroes to detect whether there are any counts to transmit. 
@@ -97,37 +97,37 @@ typedef int64 PgStat_Counter; */ typedef struct PgStat_TableCounts { - PgStat_Counter t_numscans; + PgStat_Counter t_numscans; - PgStat_Counter t_tuples_returned; - PgStat_Counter t_tuples_fetched; + PgStat_Counter t_tuples_returned; + PgStat_Counter t_tuples_fetched; - PgStat_Counter t_tuples_inserted; - PgStat_Counter t_tuples_updated; - PgStat_Counter t_tuples_deleted; - PgStat_Counter t_tuples_hot_updated; - bool t_truncated; + PgStat_Counter t_tuples_inserted; + PgStat_Counter t_tuples_updated; + PgStat_Counter t_tuples_deleted; + PgStat_Counter t_tuples_hot_updated; + bool t_truncated; - PgStat_Counter t_delta_live_tuples; - PgStat_Counter t_delta_dead_tuples; - PgStat_Counter t_changed_tuples; + PgStat_Counter t_delta_live_tuples; + PgStat_Counter t_delta_dead_tuples; + PgStat_Counter t_changed_tuples; - PgStat_Counter t_blocks_fetched; - PgStat_Counter t_blocks_hit; + PgStat_Counter t_blocks_fetched; + PgStat_Counter t_blocks_hit; } PgStat_TableCounts; /* Possible targets for resetting cluster-wide shared values */ typedef enum PgStat_Shared_Reset_Target { - RESET_ARCHIVER, - RESET_BGWRITER + RESET_ARCHIVER, + RESET_BGWRITER } PgStat_Shared_Reset_Target; /* Possible object types for resetting single counters */ typedef enum PgStat_Single_Reset_Type { - RESET_TABLE, - RESET_FUNCTION + RESET_TABLE, + RESET_FUNCTION } PgStat_Single_Reset_Type; /* ------------------------------------------------------------ @@ -137,7 +137,7 @@ typedef enum PgStat_Single_Reset_Type /* ---------- - * PgStat_TableStatus Per-table status within a backend + * PgStat_TableStatus Per-table status within a backend * * Many of the event counters are nontransactional, ie, we count events * in committed and aborted transactions alike. For these, we just count @@ -153,34 +153,34 @@ typedef enum PgStat_Single_Reset_Type */ typedef struct PgStat_TableStatus { - Oid t_id; /* table's OID */ + Oid t_id; /* table's OID */ #ifdef __TBASE__ Oid t_parent_id; /* parent's OID for interval child table, of InvalidOid */ #endif - bool t_shared; /* is it a shared catalog? */ - struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ - PgStat_TableCounts t_counts; /* event counts to be sent */ + bool t_shared; /* is it a shared catalog? 
*/ + struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ + PgStat_TableCounts t_counts; /* event counts to be sent */ } PgStat_TableStatus; /* ---------- - * PgStat_TableXactStatus Per-table, per-subtransaction status + * PgStat_TableXactStatus Per-table, per-subtransaction status * ---------- */ typedef struct PgStat_TableXactStatus { - PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */ - PgStat_Counter tuples_updated; /* tuples updated in (sub)xact */ - PgStat_Counter tuples_deleted; /* tuples deleted in (sub)xact */ - bool truncated; /* relation truncated in this (sub)xact */ - PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */ - PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */ - PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */ - int nest_level; /* subtransaction nest level */ - /* links to other structs for same relation: */ - struct PgStat_TableXactStatus *upper; /* next higher subxact if any */ - PgStat_TableStatus *parent; /* per-table status */ - /* structs of same subxact level are linked here: */ - struct PgStat_TableXactStatus *next; /* next of same subxact */ + PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */ + PgStat_Counter tuples_updated; /* tuples updated in (sub)xact */ + PgStat_Counter tuples_deleted; /* tuples deleted in (sub)xact */ + bool truncated; /* relation truncated in this (sub)xact */ + PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */ + PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */ + PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */ + int nest_level; /* subtransaction nest level */ + /* links to other structs for same relation: */ + struct PgStat_TableXactStatus *upper; /* next higher subxact if any */ + PgStat_TableStatus *parent; /* per-table status */ + /* structs of same subxact level are linked here: */ + struct PgStat_TableXactStatus *next; /* next of same subxact */ } PgStat_TableXactStatus; @@ -191,13 +191,13 @@ typedef struct PgStat_TableXactStatus /* ---------- - * PgStat_MsgHdr The common message header + * PgStat_MsgHdr The common message header * ---------- */ typedef struct PgStat_MsgHdr { - StatMsgType m_type; - int m_size; + StatMsgType m_type; + int m_size; } PgStat_MsgHdr; /* ---------- @@ -208,22 +208,22 @@ typedef struct PgStat_MsgHdr * ---------- */ #define PGSTAT_MAX_MSG_SIZE 1000 -#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) +#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) /* ---------- - * PgStat_MsgDummy A dummy message, ignored by the collector + * PgStat_MsgDummy A dummy message, ignored by the collector * ---------- */ typedef struct PgStat_MsgDummy { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; } PgStat_MsgDummy; /* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). + * PgStat_MsgInquiry Sent by a backend to ask the collector + * to write the stats file(s). 
* * Ordinarily, an inquiry message prompts writing of the global stats file, * the stats file for shared catalogs, and the stats file for the specified @@ -242,219 +242,219 @@ typedef struct PgStat_MsgDummy typedef struct PgStat_MsgInquiry { - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ + PgStat_MsgHdr m_hdr; + TimestampTz clock_time; /* observed local clock time */ + TimestampTz cutoff_time; /* minimum acceptable file timestamp */ + Oid databaseid; /* requested DB (InvalidOid => shared only) */ } PgStat_MsgInquiry; /* ---------- - * PgStat_TableEntry Per-table info in a MsgTabstat + * PgStat_TableEntry Per-table info in a MsgTabstat * ---------- */ typedef struct PgStat_TableEntry { - Oid t_id; + Oid t_id; #ifdef __TBASE__ Oid t_parent_id; #endif - PgStat_TableCounts t_counts; + PgStat_TableCounts t_counts; } PgStat_TableEntry; /* ---------- - * PgStat_MsgTabstat Sent by the backend to report table - * and buffer access statistics. + * PgStat_MsgTabstat Sent by the backend to report table + * and buffer access statistics. * ---------- */ #define PGSTAT_NUM_TABENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \ - / sizeof(PgStat_TableEntry)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \ + / sizeof(PgStat_TableEntry)) typedef struct PgStat_MsgTabstat { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - int m_xact_commit; - int m_xact_rollback; - PgStat_Counter m_block_read_time; /* times in microseconds */ - PgStat_Counter m_block_write_time; - PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + int m_xact_commit; + int m_xact_rollback; + PgStat_Counter m_block_read_time; /* times in microseconds */ + PgStat_Counter m_block_write_time; + PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; } PgStat_MsgTabstat; /* ---------- - * PgStat_MsgTabpurge Sent by the backend to tell the collector - * about dead tables. + * PgStat_MsgTabpurge Sent by the backend to tell the collector + * about dead tables. 
* ---------- */ #define PGSTAT_NUM_TABPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(Oid)) typedef struct PgStat_MsgTabpurge { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_tableid[PGSTAT_NUM_TABPURGE]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + Oid m_tableid[PGSTAT_NUM_TABPURGE]; } PgStat_MsgTabpurge; /* ---------- - * PgStat_MsgDropdb Sent by the backend to tell the collector - * about a dropped database + * PgStat_MsgDropdb Sent by the backend to tell the collector + * about a dropped database * ---------- */ typedef struct PgStat_MsgDropdb { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgDropdb; /* ---------- - * PgStat_MsgResetcounter Sent by the backend to tell the collector - * to reset counters + * PgStat_MsgResetcounter Sent by the backend to tell the collector + * to reset counters * ---------- */ typedef struct PgStat_MsgResetcounter { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgResetcounter; /* ---------- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector - * to reset a shared counter + * to reset a shared counter * ---------- */ typedef struct PgStat_MsgResetsharedcounter { - PgStat_MsgHdr m_hdr; - PgStat_Shared_Reset_Target m_resettarget; + PgStat_MsgHdr m_hdr; + PgStat_Shared_Reset_Target m_resettarget; } PgStat_MsgResetsharedcounter; /* ---------- * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector - * to reset a single counter + * to reset a single counter * ---------- */ typedef struct PgStat_MsgResetsinglecounter { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - PgStat_Single_Reset_Type m_resettype; - Oid m_objectid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + PgStat_Single_Reset_Type m_resettype; + Oid m_objectid; } PgStat_MsgResetsinglecounter; /* ---------- - * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal - * that a database is going to be processed + * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal + * that a database is going to be processed * ---------- */ typedef struct PgStat_MsgAutovacStart { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - TimestampTz m_start_time; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + TimestampTz m_start_time; } PgStat_MsgAutovacStart; /* ---------- - * PgStat_MsgVacuum Sent by the backend or autovacuum daemon - * after VACUUM + * PgStat_MsgVacuum Sent by the backend or autovacuum daemon + * after VACUUM * ---------- */ typedef struct PgStat_MsgVacuum { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - TimestampTz m_vacuumtime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + Oid m_tableoid; + bool m_autovacuum; + TimestampTz m_vacuumtime; + PgStat_Counter m_live_tuples; + PgStat_Counter m_dead_tuples; } PgStat_MsgVacuum; /* ---------- - * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon - * after ANALYZE + * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon + * after ANALYZE * ---------- */ typedef struct PgStat_MsgAnalyze { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - bool m_resetcounter; - TimestampTz m_analyzetime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + Oid m_tableoid; + bool m_autovacuum; + bool 
m_resetcounter; + TimestampTz m_analyzetime; + PgStat_Counter m_live_tuples; + PgStat_Counter m_dead_tuples; } PgStat_MsgAnalyze; /* ---------- - * PgStat_MsgArchiver Sent by the archiver to update statistics. + * PgStat_MsgArchiver Sent by the archiver to update statistics. * ---------- */ typedef struct PgStat_MsgArchiver { - PgStat_MsgHdr m_hdr; - bool m_failed; /* Failed attempt */ - char m_xlog[MAX_XFN_CHARS + 1]; - TimestampTz m_timestamp; + PgStat_MsgHdr m_hdr; + bool m_failed; /* Failed attempt */ + char m_xlog[MAX_XFN_CHARS + 1]; + TimestampTz m_timestamp; } PgStat_MsgArchiver; /* ---------- - * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. + * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. * ---------- */ typedef struct PgStat_MsgBgWriter { - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_timed_checkpoints; - PgStat_Counter m_requested_checkpoints; - PgStat_Counter m_buf_written_checkpoints; - PgStat_Counter m_buf_written_clean; - PgStat_Counter m_maxwritten_clean; - PgStat_Counter m_buf_written_backend; - PgStat_Counter m_buf_fsync_backend; - PgStat_Counter m_buf_alloc; - PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter m_checkpoint_sync_time; + PgStat_MsgHdr m_hdr; + + PgStat_Counter m_timed_checkpoints; + PgStat_Counter m_requested_checkpoints; + PgStat_Counter m_buf_written_checkpoints; + PgStat_Counter m_buf_written_clean; + PgStat_Counter m_maxwritten_clean; + PgStat_Counter m_buf_written_backend; + PgStat_Counter m_buf_fsync_backend; + PgStat_Counter m_buf_alloc; + PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ + PgStat_Counter m_checkpoint_sync_time; } PgStat_MsgBgWriter; /* ---------- - * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict + * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict * ---------- */ typedef struct PgStat_MsgRecoveryConflict { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_reason; + Oid m_databaseid; + int m_reason; } PgStat_MsgRecoveryConflict; /* ---------- - * PgStat_MsgTempFile Sent by the backend upon creating a temp file + * PgStat_MsgTempFile Sent by the backend upon creating a temp file * ---------- */ typedef struct PgStat_MsgTempFile { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; - Oid m_databaseid; - size_t m_filesize; + Oid m_databaseid; + size_t m_filesize; } PgStat_MsgTempFile; /* ---------- - * PgStat_FunctionCounts The actual per-function counts kept by a backend + * PgStat_FunctionCounts The actual per-function counts kept by a backend * * This struct should contain only actual event counters, because we memcmp * it against zeroes to detect whether there are any counts to transmit. 
@@ -465,103 +465,103 @@ typedef struct PgStat_MsgTempFile */ typedef struct PgStat_FunctionCounts { - PgStat_Counter f_numcalls; - instr_time f_total_time; - instr_time f_self_time; + PgStat_Counter f_numcalls; + instr_time f_total_time; + instr_time f_self_time; } PgStat_FunctionCounts; /* ---------- - * PgStat_BackendFunctionEntry Entry in backend's per-function hash table + * PgStat_BackendFunctionEntry Entry in backend's per-function hash table * ---------- */ typedef struct PgStat_BackendFunctionEntry { - Oid f_id; - PgStat_FunctionCounts f_counts; + Oid f_id; + PgStat_FunctionCounts f_counts; } PgStat_BackendFunctionEntry; /* ---------- - * PgStat_FunctionEntry Per-function info in a MsgFuncstat + * PgStat_FunctionEntry Per-function info in a MsgFuncstat * ---------- */ typedef struct PgStat_FunctionEntry { - Oid f_id; - PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; + Oid f_id; + PgStat_Counter f_numcalls; + PgStat_Counter f_total_time; /* times in microseconds */ + PgStat_Counter f_self_time; } PgStat_FunctionEntry; /* ---------- - * PgStat_MsgFuncstat Sent by the backend to report function - * usage statistics. + * PgStat_MsgFuncstat Sent by the backend to report function + * usage statistics. * ---------- */ -#define PGSTAT_NUM_FUNCENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(PgStat_FunctionEntry)) +#define PGSTAT_NUM_FUNCENTRIES \ + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(PgStat_FunctionEntry)) typedef struct PgStat_MsgFuncstat { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; } PgStat_MsgFuncstat; /* ---------- - * PgStat_MsgFuncpurge Sent by the backend to tell the collector - * about dead functions. + * PgStat_MsgFuncpurge Sent by the backend to tell the collector + * about dead functions. * ---------- */ #define PGSTAT_NUM_FUNCPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(Oid)) typedef struct PgStat_MsgFuncpurge { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; } PgStat_MsgFuncpurge; /* ---------- - * PgStat_MsgDeadlock Sent by the backend to tell the collector - * about a deadlock that occurred. + * PgStat_MsgDeadlock Sent by the backend to tell the collector + * about a deadlock that occurred. * ---------- */ typedef struct PgStat_MsgDeadlock { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgDeadlock; /* ---------- - * PgStat_Msg Union over all possible messages. + * PgStat_Msg Union over all possible messages. 
* ---------- */ typedef union PgStat_Msg { - PgStat_MsgHdr msg_hdr; - PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; - PgStat_MsgTabstat msg_tabstat; - PgStat_MsgTabpurge msg_tabpurge; - PgStat_MsgDropdb msg_dropdb; - PgStat_MsgResetcounter msg_resetcounter; - PgStat_MsgResetsharedcounter msg_resetsharedcounter; - PgStat_MsgResetsinglecounter msg_resetsinglecounter; - PgStat_MsgAutovacStart msg_autovacuum; - PgStat_MsgVacuum msg_vacuum; - PgStat_MsgAnalyze msg_analyze; - PgStat_MsgArchiver msg_archiver; - PgStat_MsgBgWriter msg_bgwriter; - PgStat_MsgFuncstat msg_funcstat; - PgStat_MsgFuncpurge msg_funcpurge; - PgStat_MsgRecoveryConflict msg_recoveryconflict; - PgStat_MsgDeadlock msg_deadlock; + PgStat_MsgHdr msg_hdr; + PgStat_MsgDummy msg_dummy; + PgStat_MsgInquiry msg_inquiry; + PgStat_MsgTabstat msg_tabstat; + PgStat_MsgTabpurge msg_tabpurge; + PgStat_MsgDropdb msg_dropdb; + PgStat_MsgResetcounter msg_resetcounter; + PgStat_MsgResetsharedcounter msg_resetsharedcounter; + PgStat_MsgResetsinglecounter msg_resetsinglecounter; + PgStat_MsgAutovacStart msg_autovacuum; + PgStat_MsgVacuum msg_vacuum; + PgStat_MsgAnalyze msg_analyze; + PgStat_MsgArchiver msg_archiver; + PgStat_MsgBgWriter msg_bgwriter; + PgStat_MsgFuncstat msg_funcstat; + PgStat_MsgFuncpurge msg_funcpurge; + PgStat_MsgRecoveryConflict msg_recoveryconflict; + PgStat_MsgDeadlock msg_deadlock; } PgStat_Msg; @@ -573,96 +573,96 @@ typedef union PgStat_Msg * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BC9D +#define PGSTAT_FILE_FORMAT_ID 0x01A5BC9D /* ---------- - * PgStat_StatDBEntry The collector's data per database + * PgStat_StatDBEntry The collector's data per database * ---------- */ typedef struct PgStat_StatDBEntry { - Oid databaseid; - PgStat_Counter n_xact_commit; - PgStat_Counter n_xact_rollback; - PgStat_Counter n_blocks_fetched; - PgStat_Counter n_blocks_hit; - PgStat_Counter n_tuples_returned; - PgStat_Counter n_tuples_fetched; - PgStat_Counter n_tuples_inserted; - PgStat_Counter n_tuples_updated; - PgStat_Counter n_tuples_deleted; - TimestampTz last_autovac_time; - PgStat_Counter n_conflict_tablespace; - PgStat_Counter n_conflict_lock; - PgStat_Counter n_conflict_snapshot; - PgStat_Counter n_conflict_bufferpin; - PgStat_Counter n_conflict_startup_deadlock; - PgStat_Counter n_temp_files; - PgStat_Counter n_temp_bytes; - PgStat_Counter n_deadlocks; - PgStat_Counter n_block_read_time; /* times in microseconds */ - PgStat_Counter n_block_write_time; - - TimestampTz stat_reset_timestamp; - TimestampTz stats_timestamp; /* time of db stats file update */ - - /* - * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. 
- */ - HTAB *tables; - HTAB *functions; + Oid databaseid; + PgStat_Counter n_xact_commit; + PgStat_Counter n_xact_rollback; + PgStat_Counter n_blocks_fetched; + PgStat_Counter n_blocks_hit; + PgStat_Counter n_tuples_returned; + PgStat_Counter n_tuples_fetched; + PgStat_Counter n_tuples_inserted; + PgStat_Counter n_tuples_updated; + PgStat_Counter n_tuples_deleted; + TimestampTz last_autovac_time; + PgStat_Counter n_conflict_tablespace; + PgStat_Counter n_conflict_lock; + PgStat_Counter n_conflict_snapshot; + PgStat_Counter n_conflict_bufferpin; + PgStat_Counter n_conflict_startup_deadlock; + PgStat_Counter n_temp_files; + PgStat_Counter n_temp_bytes; + PgStat_Counter n_deadlocks; + PgStat_Counter n_block_read_time; /* times in microseconds */ + PgStat_Counter n_block_write_time; + + TimestampTz stat_reset_timestamp; + TimestampTz stats_timestamp; /* time of db stats file update */ + + /* + * tables and functions must be last in the struct, because we don't write + * the pointers out to the stats file. + */ + HTAB *tables; + HTAB *functions; } PgStat_StatDBEntry; /* ---------- - * PgStat_StatTabEntry The collector's data per table (or index) + * PgStat_StatTabEntry The collector's data per table (or index) * ---------- */ typedef struct PgStat_StatTabEntry { - Oid tableid; + Oid tableid; - PgStat_Counter numscans; + PgStat_Counter numscans; - PgStat_Counter tuples_returned; - PgStat_Counter tuples_fetched; + PgStat_Counter tuples_returned; + PgStat_Counter tuples_fetched; - PgStat_Counter tuples_inserted; - PgStat_Counter tuples_updated; - PgStat_Counter tuples_deleted; - PgStat_Counter tuples_hot_updated; + PgStat_Counter tuples_inserted; + PgStat_Counter tuples_updated; + PgStat_Counter tuples_deleted; + PgStat_Counter tuples_hot_updated; - PgStat_Counter n_live_tuples; - PgStat_Counter n_dead_tuples; - PgStat_Counter changes_since_analyze; + PgStat_Counter n_live_tuples; + PgStat_Counter n_dead_tuples; + PgStat_Counter changes_since_analyze; - PgStat_Counter blocks_fetched; - PgStat_Counter blocks_hit; + PgStat_Counter blocks_fetched; + PgStat_Counter blocks_hit; - TimestampTz vacuum_timestamp; /* user initiated vacuum */ - PgStat_Counter vacuum_count; - TimestampTz autovac_vacuum_timestamp; /* autovacuum initiated */ - PgStat_Counter autovac_vacuum_count; - TimestampTz analyze_timestamp; /* user initiated */ - PgStat_Counter analyze_count; - TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */ - PgStat_Counter autovac_analyze_count; + TimestampTz vacuum_timestamp; /* user initiated vacuum */ + PgStat_Counter vacuum_count; + TimestampTz autovac_vacuum_timestamp; /* autovacuum initiated */ + PgStat_Counter autovac_vacuum_count; + TimestampTz analyze_timestamp; /* user initiated */ + PgStat_Counter analyze_count; + TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */ + PgStat_Counter autovac_analyze_count; } PgStat_StatTabEntry; /* ---------- - * PgStat_StatFuncEntry The collector's data per function + * PgStat_StatFuncEntry The collector's data per function * ---------- */ typedef struct PgStat_StatFuncEntry { - Oid functionid; + Oid functionid; - PgStat_Counter f_numcalls; + PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; + PgStat_Counter f_total_time; /* times in microseconds */ + PgStat_Counter f_self_time; } PgStat_StatFuncEntry; @@ -671,15 +671,15 @@ typedef struct PgStat_StatFuncEntry */ typedef struct PgStat_ArchiverStats { - PgStat_Counter archived_count; /* archival successes */ - char 
last_archived_wal[MAX_XFN_CHARS + 1]; /* last WAL file - * archived */ - TimestampTz last_archived_timestamp; /* last archival success time */ - PgStat_Counter failed_count; /* failed archival attempts */ - char last_failed_wal[MAX_XFN_CHARS + 1]; /* WAL file involved in - * last failure */ - TimestampTz last_failed_timestamp; /* last archival failure time */ - TimestampTz stat_reset_timestamp; + PgStat_Counter archived_count; /* archival successes */ + char last_archived_wal[MAX_XFN_CHARS + 1]; /* last WAL file + * archived */ + TimestampTz last_archived_timestamp; /* last archival success time */ + PgStat_Counter failed_count; /* failed archival attempts */ + char last_failed_wal[MAX_XFN_CHARS + 1]; /* WAL file involved in + * last failure */ + TimestampTz last_failed_timestamp; /* last archival failure time */ + TimestampTz stat_reset_timestamp; } PgStat_ArchiverStats; /* @@ -687,18 +687,18 @@ typedef struct PgStat_ArchiverStats */ typedef struct PgStat_GlobalStats { - TimestampTz stats_timestamp; /* time of stats file update */ - PgStat_Counter timed_checkpoints; - PgStat_Counter requested_checkpoints; - PgStat_Counter checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter checkpoint_sync_time; - PgStat_Counter buf_written_checkpoints; - PgStat_Counter buf_written_clean; - PgStat_Counter maxwritten_clean; - PgStat_Counter buf_written_backend; - PgStat_Counter buf_fsync_backend; - PgStat_Counter buf_alloc; - TimestampTz stat_reset_timestamp; + TimestampTz stats_timestamp; /* time of stats file update */ + PgStat_Counter timed_checkpoints; + PgStat_Counter requested_checkpoints; + PgStat_Counter checkpoint_write_time; /* times in milliseconds */ + PgStat_Counter checkpoint_sync_time; + PgStat_Counter buf_written_checkpoints; + PgStat_Counter buf_written_clean; + PgStat_Counter maxwritten_clean; + PgStat_Counter buf_written_backend; + PgStat_Counter buf_fsync_backend; + PgStat_Counter buf_alloc; + TimestampTz stat_reset_timestamp; } PgStat_GlobalStats; @@ -708,18 +708,18 @@ typedef struct PgStat_GlobalStats */ typedef enum BackendType { - B_AUTOVAC_LAUNCHER, - B_AUTOVAC_WORKER, - B_BACKEND, - B_BG_WORKER, - B_BG_WRITER, - B_CHECKPOINTER, - B_STARTUP, - B_WAL_RECEIVER, - B_WAL_SENDER, - B_WAL_WRITER, - B_PGXL_CLUSTER_MONITOR, - B_PGXL_POOLER + B_AUTOVAC_LAUNCHER, + B_AUTOVAC_WORKER, + B_BACKEND, + B_BG_WORKER, + B_BG_WRITER, + B_CHECKPOINTER, + B_STARTUP, + B_WAL_RECEIVER, + B_WAL_SENDER, + B_WAL_WRITER, + B_PGXL_CLUSTER_MONITOR, + B_PGXL_POOLER } BackendType; @@ -729,13 +729,13 @@ typedef enum BackendType */ typedef enum BackendState { - STATE_UNDEFINED, - STATE_IDLE, - STATE_RUNNING, - STATE_IDLEINTRANSACTION, - STATE_FASTPATH, - STATE_IDLEINTRANSACTION_ABORTED, - STATE_DISABLED + STATE_UNDEFINED, + STATE_IDLE, + STATE_RUNNING, + STATE_IDLEINTRANSACTION, + STATE_FASTPATH, + STATE_IDLEINTRANSACTION_ABORTED, + STATE_DISABLED } BackendState; @@ -743,15 +743,15 @@ typedef enum BackendState * Wait Classes * ---------- */ -#define PG_WAIT_LWLOCK 0x01000000U -#define PG_WAIT_LOCK 0x03000000U -#define PG_WAIT_BUFFER_PIN 0x04000000U -#define PG_WAIT_ACTIVITY 0x05000000U -#define PG_WAIT_CLIENT 0x06000000U -#define PG_WAIT_EXTENSION 0x07000000U -#define PG_WAIT_IPC 0x08000000U -#define PG_WAIT_TIMEOUT 0x09000000U -#define PG_WAIT_IO 0x0A000000U +#define PG_WAIT_LWLOCK 0x01000000U +#define PG_WAIT_LOCK 0x03000000U +#define PG_WAIT_BUFFER_PIN 0x04000000U +#define PG_WAIT_ACTIVITY 0x05000000U +#define PG_WAIT_CLIENT 0x06000000U +#define PG_WAIT_EXTENSION 0x07000000U +#define PG_WAIT_IPC 
0x08000000U +#define PG_WAIT_TIMEOUT 0x09000000U +#define PG_WAIT_IO 0x0A000000U /* ---------- * Wait Events - Activity @@ -763,27 +763,27 @@ typedef enum BackendState */ typedef enum { - WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY, - WAIT_EVENT_AUTOVACUUM_MAIN, - WAIT_EVENT_BGWRITER_HIBERNATE, - WAIT_EVENT_BGWRITER_MAIN, - WAIT_EVENT_CHECKPOINTER_MAIN, - WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, - WAIT_EVENT_LOGICAL_APPLY_MAIN, - WAIT_EVENT_PGSTAT_MAIN, - WAIT_EVENT_RECOVERY_WAL_ALL, - WAIT_EVENT_RECOVERY_WAL_STREAM, - WAIT_EVENT_SYSLOGGER_MAIN, + WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY, + WAIT_EVENT_AUTOVACUUM_MAIN, + WAIT_EVENT_BGWRITER_HIBERNATE, + WAIT_EVENT_BGWRITER_MAIN, + WAIT_EVENT_CHECKPOINTER_MAIN, + WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, + WAIT_EVENT_LOGICAL_APPLY_MAIN, + WAIT_EVENT_PGSTAT_MAIN, + WAIT_EVENT_RECOVERY_WAL_ALL, + WAIT_EVENT_RECOVERY_WAL_STREAM, + WAIT_EVENT_SYSLOGGER_MAIN, #ifdef __AUDIT__ - WAIT_EVENT_AUDIT_LOGGER_MAIN, + WAIT_EVENT_AUDIT_LOGGER_MAIN, #endif - WAIT_EVENT_WAL_RECEIVER_MAIN, - WAIT_EVENT_WAL_SENDER_MAIN, - WAIT_EVENT_WAL_WRITER_MAIN, + WAIT_EVENT_WAL_RECEIVER_MAIN, + WAIT_EVENT_WAL_SENDER_MAIN, + WAIT_EVENT_WAL_WRITER_MAIN, #ifdef __AUDIT_FGA__ WAIT_EVENT_AUDIT_FGA_MAIN, #endif - WAIT_EVENT_CLUSTER_MONITOR_MAIN + WAIT_EVENT_CLUSTER_MONITOR_MAIN } WaitEventActivity; /* ---------- @@ -796,14 +796,14 @@ typedef enum */ typedef enum { - WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT, - WAIT_EVENT_CLIENT_WRITE, - WAIT_EVENT_LIBPQWALRECEIVER_CONNECT, - WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE, - WAIT_EVENT_SSL_OPEN_SERVER, - WAIT_EVENT_WAL_RECEIVER_WAIT_START, - WAIT_EVENT_WAL_SENDER_WAIT_WAL, - WAIT_EVENT_WAL_SENDER_WRITE_DATA + WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT, + WAIT_EVENT_CLIENT_WRITE, + WAIT_EVENT_LIBPQWALRECEIVER_CONNECT, + WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE, + WAIT_EVENT_SSL_OPEN_SERVER, + WAIT_EVENT_WAL_RECEIVER_WAIT_START, + WAIT_EVENT_WAL_SENDER_WAIT_WAL, + WAIT_EVENT_WAL_SENDER_WRITE_DATA } WaitEventClient; /* ---------- @@ -815,23 +815,23 @@ typedef enum */ typedef enum { - WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC, - WAIT_EVENT_BGWORKER_STARTUP, - WAIT_EVENT_BTREE_PAGE, - WAIT_EVENT_EXECUTE_GATHER, - WAIT_EVENT_LOGICAL_SYNC_DATA, - WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE, - WAIT_EVENT_MQ_INTERNAL, - WAIT_EVENT_MQ_PUT_MESSAGE, - WAIT_EVENT_MQ_RECEIVE, - WAIT_EVENT_MQ_SEND, - WAIT_EVENT_PARALLEL_FINISH, - WAIT_EVENT_PARALLEL_BITMAP_SCAN, - WAIT_EVENT_PROCARRAY_GROUP_UPDATE, - WAIT_EVENT_REPLICATION_ORIGIN_DROP, - WAIT_EVENT_REPLICATION_SLOT_DROP, - WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC, + WAIT_EVENT_BGWORKER_STARTUP, + WAIT_EVENT_BTREE_PAGE, + WAIT_EVENT_EXECUTE_GATHER, + WAIT_EVENT_LOGICAL_SYNC_DATA, + WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE, + WAIT_EVENT_MQ_INTERNAL, + WAIT_EVENT_MQ_PUT_MESSAGE, + WAIT_EVENT_MQ_RECEIVE, + WAIT_EVENT_MQ_SEND, + WAIT_EVENT_PARALLEL_FINISH, + WAIT_EVENT_PARALLEL_BITMAP_SCAN, + WAIT_EVENT_PROCARRAY_GROUP_UPDATE, + WAIT_EVENT_REPLICATION_ORIGIN_DROP, + WAIT_EVENT_REPLICATION_SLOT_DROP, + WAIT_EVENT_SAFE_SNAPSHOT, + WAIT_EVENT_SYNC_REP } WaitEventIPC; /* ---------- @@ -842,9 +842,9 @@ typedef enum */ typedef enum { - WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT, - WAIT_EVENT_PG_SLEEP, - WAIT_EVENT_RECOVERY_APPLY_DELAY + WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT, + WAIT_EVENT_PG_SLEEP, + WAIT_EVENT_RECOVERY_APPLY_DELAY } WaitEventTimeout; /* ---------- @@ -855,86 +855,86 @@ typedef enum */ typedef enum { - WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, - 
WAIT_EVENT_BUFFILE_WRITE, - WAIT_EVENT_CONTROL_FILE_READ, - WAIT_EVENT_CONTROL_FILE_SYNC, - WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, - WAIT_EVENT_CONTROL_FILE_WRITE, - WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, - WAIT_EVENT_COPY_FILE_READ, - WAIT_EVENT_COPY_FILE_WRITE, + WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, + WAIT_EVENT_BUFFILE_WRITE, + WAIT_EVENT_CONTROL_FILE_READ, + WAIT_EVENT_CONTROL_FILE_SYNC, + WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, + WAIT_EVENT_CONTROL_FILE_WRITE, + WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, + WAIT_EVENT_COPY_FILE_READ, + WAIT_EVENT_COPY_FILE_WRITE, #ifdef _MLS_ WAIT_EVENT_CRYPT_KEY_MAP_READ, WAIT_EVENT_CRYPT_KEY_MAP_SYNC, WAIT_EVENT_CRYPT_KEY_MAP_WRITE, -#endif - WAIT_EVENT_DATA_FILE_EXTEND, - WAIT_EVENT_DATA_FILE_FLUSH, - WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC, - WAIT_EVENT_DATA_FILE_PREFETCH, - WAIT_EVENT_DATA_FILE_READ, - WAIT_EVENT_DATA_FILE_SYNC, - WAIT_EVENT_DATA_FILE_TRUNCATE, +#endif + WAIT_EVENT_DATA_FILE_EXTEND, + WAIT_EVENT_DATA_FILE_FLUSH, + WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC, + WAIT_EVENT_DATA_FILE_PREFETCH, + WAIT_EVENT_DATA_FILE_READ, + WAIT_EVENT_DATA_FILE_SYNC, + WAIT_EVENT_DATA_FILE_TRUNCATE, #ifdef _SHARDING_ - WAIT_EVENT_DATA_FILE_DEALLOC, + WAIT_EVENT_DATA_FILE_DEALLOC, #endif - WAIT_EVENT_DATA_FILE_WRITE, - WAIT_EVENT_DSM_FILL_ZERO_WRITE, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, - WAIT_EVENT_LOCK_FILE_CREATE_READ, - WAIT_EVENT_LOCK_FILE_CREATE_SYNC, - WAIT_EVENT_LOCK_FILE_CREATE_WRITE, - WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ, - WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE, - WAIT_EVENT_LOGICAL_REWRITE_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE, - WAIT_EVENT_LOGICAL_REWRITE_WRITE, + WAIT_EVENT_DATA_FILE_WRITE, + WAIT_EVENT_DSM_FILL_ZERO_WRITE, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, + WAIT_EVENT_LOCK_FILE_CREATE_READ, + WAIT_EVENT_LOCK_FILE_CREATE_SYNC, + WAIT_EVENT_LOCK_FILE_CREATE_WRITE, + WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ, + WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE, + WAIT_EVENT_LOGICAL_REWRITE_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE, + WAIT_EVENT_LOGICAL_REWRITE_WRITE, #ifdef _MLS_ WAIT_EVENT_REL_CRYPT_MAP_READ, WAIT_EVENT_REL_CRYPT_MAP_SYNC, WAIT_EVENT_REL_CRYPT_MAP_WRITE, #endif - WAIT_EVENT_RELATION_MAP_READ, - WAIT_EVENT_RELATION_MAP_SYNC, - WAIT_EVENT_RELATION_MAP_WRITE, - WAIT_EVENT_REORDER_BUFFER_READ, - WAIT_EVENT_REORDER_BUFFER_WRITE, - WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ, - WAIT_EVENT_REPLICATION_SLOT_READ, - WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, - WAIT_EVENT_REPLICATION_SLOT_SYNC, - WAIT_EVENT_REPLICATION_SLOT_WRITE, - WAIT_EVENT_SLRU_FLUSH_SYNC, - WAIT_EVENT_SLRU_READ, - WAIT_EVENT_SLRU_SYNC, - WAIT_EVENT_SLRU_WRITE, - WAIT_EVENT_SNAPBUILD_READ, - WAIT_EVENT_SNAPBUILD_SYNC, - WAIT_EVENT_SNAPBUILD_WRITE, - WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC, - WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE, - WAIT_EVENT_TIMELINE_HISTORY_READ, - WAIT_EVENT_TIMELINE_HISTORY_SYNC, - WAIT_EVENT_TIMELINE_HISTORY_WRITE, - WAIT_EVENT_TWOPHASE_FILE_READ, - WAIT_EVENT_TWOPHASE_FILE_SYNC, - WAIT_EVENT_TWOPHASE_FILE_WRITE, - WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, - WAIT_EVENT_WAL_BOOTSTRAP_SYNC, - WAIT_EVENT_WAL_BOOTSTRAP_WRITE, - WAIT_EVENT_WAL_COPY_READ, - WAIT_EVENT_WAL_COPY_SYNC, - WAIT_EVENT_WAL_COPY_WRITE, 
- WAIT_EVENT_WAL_INIT_SYNC, - WAIT_EVENT_WAL_INIT_WRITE, - WAIT_EVENT_WAL_READ, - WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, - WAIT_EVENT_WAL_WRITE + WAIT_EVENT_RELATION_MAP_READ, + WAIT_EVENT_RELATION_MAP_SYNC, + WAIT_EVENT_RELATION_MAP_WRITE, + WAIT_EVENT_REORDER_BUFFER_READ, + WAIT_EVENT_REORDER_BUFFER_WRITE, + WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ, + WAIT_EVENT_REPLICATION_SLOT_READ, + WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, + WAIT_EVENT_REPLICATION_SLOT_SYNC, + WAIT_EVENT_REPLICATION_SLOT_WRITE, + WAIT_EVENT_SLRU_FLUSH_SYNC, + WAIT_EVENT_SLRU_READ, + WAIT_EVENT_SLRU_SYNC, + WAIT_EVENT_SLRU_WRITE, + WAIT_EVENT_SNAPBUILD_READ, + WAIT_EVENT_SNAPBUILD_SYNC, + WAIT_EVENT_SNAPBUILD_WRITE, + WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC, + WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE, + WAIT_EVENT_TIMELINE_HISTORY_READ, + WAIT_EVENT_TIMELINE_HISTORY_SYNC, + WAIT_EVENT_TIMELINE_HISTORY_WRITE, + WAIT_EVENT_TWOPHASE_FILE_READ, + WAIT_EVENT_TWOPHASE_FILE_SYNC, + WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, + WAIT_EVENT_WAL_BOOTSTRAP_SYNC, + WAIT_EVENT_WAL_BOOTSTRAP_WRITE, + WAIT_EVENT_WAL_COPY_READ, + WAIT_EVENT_WAL_COPY_SYNC, + WAIT_EVENT_WAL_COPY_WRITE, + WAIT_EVENT_WAL_INIT_SYNC, + WAIT_EVENT_WAL_INIT_WRITE, + WAIT_EVENT_WAL_READ, + WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, + WAIT_EVENT_WAL_WRITE } WaitEventIO; /* ---------- @@ -943,11 +943,11 @@ typedef enum */ typedef enum ProgressCommandType { - PROGRESS_COMMAND_INVALID, - PROGRESS_COMMAND_VACUUM + PROGRESS_COMMAND_INVALID, + PROGRESS_COMMAND_VACUUM } ProgressCommandType; -#define PGSTAT_NUM_PROGRESS_PARAM 10 +#define PGSTAT_NUM_PROGRESS_PARAM 10 /* ---------- * Shared-memory data structures @@ -963,12 +963,12 @@ typedef enum ProgressCommandType */ typedef struct PgBackendSSLStatus { - /* Information about SSL connection */ - int ssl_bits; - bool ssl_compression; - char ssl_version[NAMEDATALEN]; /* MUST be null-terminated */ - char ssl_cipher[NAMEDATALEN]; /* MUST be null-terminated */ - char ssl_clientdn[NAMEDATALEN]; /* MUST be null-terminated */ + /* Information about SSL connection */ + int ssl_bits; + bool ssl_compression; + char ssl_version[NAMEDATALEN]; /* MUST be null-terminated */ + char ssl_cipher[NAMEDATALEN]; /* MUST be null-terminated */ + char ssl_clientdn[NAMEDATALEN]; /* MUST be null-terminated */ } PgBackendSSLStatus; @@ -986,66 +986,66 @@ typedef struct PgBackendSSLStatus */ typedef struct PgBackendStatus { - /* - * To avoid locking overhead, we use the following protocol: a backend - * increments st_changecount before modifying its entry, and again after - * finishing a modification. A would-be reader should note the value of - * st_changecount, copy the entry into private memory, then check - * st_changecount again. If the value hasn't changed, and if it's even, - * the copy is valid; otherwise start over. This makes updates cheap - * while reads are potentially expensive, but that's the tradeoff we want. - * - * The above protocol needs the memory barriers to ensure that the - * apparent order of execution is as it desires. Otherwise, for example, - * the CPU might rearrange the code so that st_changecount is incremented - * twice before the modification on a machine with weak memory ordering. - * This surprising result can lead to bugs. 
- */ - int st_changecount; - - /* The entry is valid iff st_procpid > 0, unused if st_procpid == 0 */ - int st_procpid; - - /* Type of backends */ - BackendType st_backendType; - - /* Times when current backend, transaction, and activity started */ - TimestampTz st_proc_start_timestamp; - TimestampTz st_xact_start_timestamp; - TimestampTz st_activity_start_timestamp; - TimestampTz st_state_start_timestamp; - - /* Database OID, owning user's OID, connection client address */ - Oid st_databaseid; - Oid st_userid; - SockAddr st_clientaddr; - char *st_clienthostname; /* MUST be null-terminated */ - - /* Information about SSL connection */ - bool st_ssl; - PgBackendSSLStatus *st_sslstatus; - - /* current state */ - BackendState st_state; - - /* application name; MUST be null-terminated */ - char *st_appname; - - /* current command string; MUST be null-terminated */ - char *st_activity; - - /* - * Command progress reporting. Any command which wishes can advertise - * that it is running by setting st_progress_command, - * st_progress_command_target, and st_progress_param[]. - * st_progress_command_target should be the OID of the relation which the - * command targets (we assume there's just one, as this is meant for - * utility commands), but the meaning of each element in the - * st_progress_param array is command-specific. - */ - ProgressCommandType st_progress_command; - Oid st_progress_command_target; - int64 st_progress_param[PGSTAT_NUM_PROGRESS_PARAM]; + /* + * To avoid locking overhead, we use the following protocol: a backend + * increments st_changecount before modifying its entry, and again after + * finishing a modification. A would-be reader should note the value of + * st_changecount, copy the entry into private memory, then check + * st_changecount again. If the value hasn't changed, and if it's even, + * the copy is valid; otherwise start over. This makes updates cheap + * while reads are potentially expensive, but that's the tradeoff we want. + * + * The above protocol needs the memory barriers to ensure that the + * apparent order of execution is as it desires. Otherwise, for example, + * the CPU might rearrange the code so that st_changecount is incremented + * twice before the modification on a machine with weak memory ordering. + * This surprising result can lead to bugs. + */ + int st_changecount; + + /* The entry is valid if st_procpid > 0, unused if st_procpid == 0 */ + int st_procpid; + + /* Type of backends */ + BackendType st_backendType; + + /* Times when current backend, transaction, and activity started */ + TimestampTz st_proc_start_timestamp; + TimestampTz st_xact_start_timestamp; + TimestampTz st_activity_start_timestamp; + TimestampTz st_state_start_timestamp; + + /* Database OID, owning user's OID, connection client address */ + Oid st_databaseid; + Oid st_userid; + SockAddr st_clientaddr; + char *st_clienthostname; /* MUST be null-terminated */ + + /* Information about SSL connection */ + bool st_ssl; + PgBackendSSLStatus *st_sslstatus; + + /* current state */ + BackendState st_state; + + /* application name; MUST be null-terminated */ + char *st_appname; + + /* current command string; MUST be null-terminated */ + char *st_activity; + + /* + * Command progress reporting. Any command which wishes can advertise + * that it is running by setting st_progress_command, + * st_progress_command_target, and st_progress_param[]. 
+ * st_progress_command_target should be the OID of the relation which the + * command targets (we assume there's just one, as this is meant for + * utility commands), but the meaning of each element in the + * st_progress_param array is command-specific. + */ + ProgressCommandType st_progress_command; + Oid st_progress_command_target; + int64 st_progress_param[PGSTAT_NUM_PROGRESS_PARAM]; } PgBackendStatus; /* @@ -1060,30 +1060,30 @@ typedef struct PgBackendStatus * need to be called before and after PgBackendStatus entries are copied into * private memory, respectively. */ -#define pgstat_increment_changecount_before(beentry) \ - do { \ - beentry->st_changecount++; \ - pg_write_barrier(); \ - } while (0) +#define pgstat_increment_changecount_before(beentry) \ + do { \ + beentry->st_changecount++; \ + pg_write_barrier(); \ + } while (0) #define pgstat_increment_changecount_after(beentry) \ - do { \ - pg_write_barrier(); \ - beentry->st_changecount++; \ - Assert((beentry->st_changecount & 1) == 0); \ - } while (0) - -#define pgstat_save_changecount_before(beentry, save_changecount) \ - do { \ - save_changecount = beentry->st_changecount; \ - pg_read_barrier(); \ - } while (0) - -#define pgstat_save_changecount_after(beentry, save_changecount) \ - do { \ - pg_read_barrier(); \ - save_changecount = beentry->st_changecount; \ - } while (0) + do { \ + pg_write_barrier(); \ + beentry->st_changecount++; \ + Assert((beentry->st_changecount & 1) == 0); \ + } while (0) + +#define pgstat_save_changecount_before(beentry, save_changecount) \ + do { \ + save_changecount = beentry->st_changecount; \ + pg_read_barrier(); \ + } while (0) + +#define pgstat_save_changecount_after(beentry, save_changecount) \ + do { \ + pg_read_barrier(); \ + save_changecount = beentry->st_changecount; \ + } while (0) /* ---------- * LocalPgBackendStatus @@ -1095,22 +1095,25 @@ typedef struct PgBackendStatus */ typedef struct LocalPgBackendStatus { - /* - * Local version of the backend status entry. - */ - PgBackendStatus backendStatus; - - /* - * The xid of the current transaction if available, InvalidTransactionId - * if not. - */ - TransactionId backend_xid; - - /* - * The xmin of the current session if available, InvalidTransactionId if - * not. - */ - TransactionId backend_xmin; + /* + * Local version of the backend status entry. + */ + PgBackendStatus backendStatus; + + /* + * The xid of the current transaction if available, InvalidTransactionId + * if not. + */ + TransactionId backend_xid; + + /* + * The xmin of the current session if available, InvalidTransactionId if + * not. + */ + TransactionId backend_xmin; + + /* copy of backend id */ + BackendId backend_id; } LocalPgBackendStatus; /* @@ -1118,15 +1121,15 @@ typedef struct LocalPgBackendStatus */ typedef struct PgStat_FunctionCallUsage { - /* Link to function's hashtable entry (must still be there at exit!) */ - /* NULL means we are not tracking the current function call */ - PgStat_FunctionCounts *fs; - /* Total time previously charged to function, as of function start */ - instr_time save_f_total_time; - /* Backend-wide total time as of function start */ - instr_time save_total; - /* system clock as of function start */ - instr_time f_start; + /* Link to function's hashtable entry (must still be there at exit!) 
*/ + /* NULL means we are not tracking the current function call */ + PgStat_FunctionCounts *fs; + /* Total time previously charged to function, as of function start */ + instr_time save_f_total_time; + /* Backend-wide total time as of function start */ + instr_time save_total; + /* system clock as of function start */ + instr_time f_start; } PgStat_FunctionCallUsage; @@ -1136,7 +1139,7 @@ typedef struct PgStat_FunctionCallUsage */ extern bool pgstat_track_activities; extern bool pgstat_track_counts; -extern int pgstat_track_functions; +extern int pgstat_track_functions; extern PGDLLIMPORT int pgstat_track_activity_query_size; extern char *pgstat_stat_directory; extern char *pgstat_stat_tmpname; @@ -1161,7 +1164,7 @@ extern Size BackendStatusShmemSize(void); extern void CreateSharedBackendStatus(void); extern void pgstat_init(void); -extern int pgstat_start(void); +extern int pgstat_start(void); extern void pgstat_reset_all(void); extern void allow_immediate_pgstat_restart(void); @@ -1187,10 +1190,10 @@ extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type t extern void pgstat_report_autovac(Oid dboid); extern void pgstat_report_vacuum(Oid tableoid, bool shared, - PgStat_Counter livetuples, PgStat_Counter deadtuples); + PgStat_Counter livetuples, PgStat_Counter deadtuples); extern void pgstat_report_analyze(Relation rel, - PgStat_Counter livetuples, PgStat_Counter deadtuples, - bool resetcounter); + PgStat_Counter livetuples, PgStat_Counter deadtuples, + bool resetcounter); extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); @@ -1206,14 +1209,14 @@ extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser); extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer, - int buflen); + int buflen); extern const char *pgstat_get_backend_desc(BackendType backendType); extern void pgstat_progress_start_command(ProgressCommandType cmdtype, - Oid relid); + Oid relid); extern void pgstat_progress_update_param(int index, int64 val); extern void pgstat_progress_update_multi_param(int nparam, const int *index, - const int64 *val); + const int64 *val); extern void pgstat_progress_end_command(void); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); @@ -1224,13 +1227,13 @@ extern void pgstat_initstats(Relation rel); /* ---------- * pgstat_report_wait_start() - * - * Called from places where server process needs to wait. This is called - * to report wait event information. The wait information is stored - * as 4-bytes where first byte represents the wait event class (type of - * wait, for different types of wait, refer WaitClass) and the next - * 3-bytes represent the actual wait event. Currently 2-bytes are used - * for wait event which is sufficient for current usage, 1-byte is - * reserved for future usage. + * Called from places where server process needs to wait. This is called + * to report wait event information. The wait information is stored + * as 4-bytes where first byte represents the wait event class (type of + * wait, for different types of wait, refer WaitClass) and the next + * 3-bytes represent the actual wait event. Currently 2-bytes are used + * for wait event which is sufficient for current usage, 1-byte is + * reserved for future usage. * * NB: this *must* be able to survive being called before MyProc has been * initialized. 
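/*
 * A minimal sketch (illustrative only, not part of this patch) of how the
 * wait_event_info encoding described above can be decomposed, assuming the
 * usual c.h integer typedefs.  The high byte carries the wait class (e.g.
 * 0x0A000000U for PG_WAIT_IO), the low two bytes identify the event within
 * its class, and one byte is reserved.  The helper names are hypothetical.
 */
static inline uint32
wait_event_class(uint32 wait_event_info)
{
	return wait_event_info & 0xFF000000U;	/* e.g. PG_WAIT_IO */
}

static inline uint16
wait_event_id(uint32 wait_event_info)
{
	return (uint16) (wait_event_info & 0x0000FFFFU);
}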
@@ -1239,22 +1242,22 @@ extern void pgstat_initstats(Relation rel); static inline void pgstat_report_wait_start(uint32 wait_event_info) { - volatile PGPROC *proc = MyProc; + volatile PGPROC *proc = MyProc; - if (!pgstat_track_activities || !proc) - return; + if (!pgstat_track_activities || !proc) + return; - /* - * Since this is a four-byte field which is always read and written as - * four-bytes, updates are atomic. - */ - proc->wait_event_info = wait_event_info; + /* + * Since this is a four-byte field which is always read and written as + * four-bytes, updates are atomic. + */ + proc->wait_event_info = wait_event_info; } /* ---------- * pgstat_report_wait_end() - * - * Called to report end of a wait. + * Called to report end of a wait. * * NB: this *must* be able to survive being called before MyProc has been * initialized. @@ -1263,59 +1266,59 @@ pgstat_report_wait_start(uint32 wait_event_info) static inline void pgstat_report_wait_end(void) { - volatile PGPROC *proc = MyProc; + volatile PGPROC *proc = MyProc; - if (!pgstat_track_activities || !proc) - return; + if (!pgstat_track_activities || !proc) + return; - /* - * Since this is a four-byte field which is always read and written as - * four-bytes, updates are atomic. - */ - proc->wait_event_info = 0; + /* + * Since this is a four-byte field which is always read and written as + * four-bytes, updates are atomic. + */ + proc->wait_event_info = 0; } /* nontransactional event counts are simple enough to inline */ -#define pgstat_count_heap_scan(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_numscans++; \ - } while (0) -#define pgstat_count_heap_getnext(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_returned++; \ - } while (0) -#define pgstat_count_heap_fetch(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_fetched++; \ - } while (0) -#define pgstat_count_index_scan(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_numscans++; \ - } while (0) -#define pgstat_count_index_tuples(rel, n) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_returned += (n); \ - } while (0) -#define pgstat_count_buffer_read(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_blocks_fetched++; \ - } while (0) -#define pgstat_count_buffer_hit(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_blocks_hit++; \ - } while (0) -#define pgstat_count_buffer_read_time(n) \ - (pgStatBlockReadTime += (n)) -#define pgstat_count_buffer_write_time(n) \ - (pgStatBlockWriteTime += (n)) +#define pgstat_count_heap_scan(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_numscans++; \ + } while (0) +#define pgstat_count_heap_getnext(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_returned++; \ + } while (0) +#define pgstat_count_heap_fetch(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_fetched++; \ + } while (0) +#define pgstat_count_index_scan(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_numscans++; \ + } while (0) +#define pgstat_count_index_tuples(rel, n) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_returned += (n); \ + } while (0) +#define pgstat_count_buffer_read(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + 
(rel)->pgstat_info->t_counts.t_blocks_fetched++; \ + } while (0) +#define pgstat_count_buffer_hit(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_blocks_hit++; \ + } while (0) +#define pgstat_count_buffer_read_time(n) \ + (pgStatBlockReadTime += (n)) +#define pgstat_count_buffer_write_time(n) \ + (pgStatBlockWriteTime += (n)) extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); @@ -1329,9 +1332,9 @@ extern void pgstat_count_remote_delete(Relation rel, int n); #endif extern void pgstat_init_function_usage(FunctionCallInfoData *fcinfo, - PgStat_FunctionCallUsage *fcu); + PgStat_FunctionCallUsage *fcu); extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, - bool finalize); + bool finalize); extern void AtEOXact_PgStat(bool isCommit); extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth); @@ -1340,9 +1343,9 @@ extern void AtPrepare_PgStat(void); extern void PostPrepare_PgStat(void); extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, - void *recdata, uint32 len); + void *recdata, uint32 len); extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, - void *recdata, uint32 len); + void *recdata, uint32 len); extern void pgstat_send_archiver(const char *xlog, bool failed); extern void pgstat_send_bgwriter(void); @@ -1357,8 +1360,8 @@ extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid); extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid); extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); -extern int pgstat_fetch_stat_numbackends(void); +extern int pgstat_fetch_stat_numbackends(void); extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); extern PgStat_GlobalStats *pgstat_fetch_global(void); -#endif /* PGSTAT_H */ +#endif /* PGSTAT_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index b69d747b..6c5abcf4 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -105,6 +105,9 @@ extern uint32 PGXCNodeIdentifier; extern char *PGXCClusterName; extern char *PGXCMainClusterName; extern char *PGXCDefaultClusterName; +#ifdef __TBASE__ +extern char PGXCSessionId[NAMEDATALEN]; +#endif extern Datum xc_lockForBackupKey1; diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 15c4ef46..f79bc2b8 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -285,12 +285,15 @@ extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); #ifdef __TBASE__ void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input); -inline void pgxc_set_coordinator_proc_pid(int proc_pid); -inline int pgxc_get_coordinator_proc_pid(void); -inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); -inline TransactionId pgxc_get_coordinator_proc_vxid(void); +void pgxc_set_coordinator_proc_pid(int proc_pid); +int pgxc_get_coordinator_proc_pid(void); +void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); +TransactionId pgxc_get_coordinator_proc_vxid(void); inline char* find_ddl_leader_cn(void); inline bool is_ddl_leader_cn(char *leader_cn); +extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); +extern void SerializeSessionId(Size maxsize, char *start_address); +extern void StartParallelWorkerSessionId(char *address); #endif #ifdef __AUDIT__ diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h index 
a48ba09b..3f0a6408 100644 --- a/src/include/pgxc/squeue.h +++ b/src/include/pgxc/squeue.h @@ -2,13 +2,13 @@ * * barrier.h * - * Definitions for the shared queue handling + * Definitions for the shared queue handling * * * Copyright (c) 2012-2014, TransLattice, Inc. * * IDENTIFICATION - * $$ + * $$ * *------------------------------------------------------------------------- */ @@ -74,24 +74,24 @@ extern void SharedQueueAcquire(const char *sqname, int ncons, bool parallelSend, extern void SharedQueueAcquire(const char *sqname, int ncons); #endif extern SharedQueue SharedQueueBind(const char *sqname, List *consNodes, - List *distNodes, int *myindex, int *consMap + List *distNodes, int *myindex, int *consMap #ifdef __TBASE__ - , - DataPumpSender *sender + , + DataPumpSender *sender #endif - ); + ); extern void SharedQueueUnBind(SharedQueue squeue, bool failed); extern void SharedQueueRelease(const char *sqname); extern void SharedQueuesCleanup(int code, Datum arg); -extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, - Tuplestorestate **tuplestore); +extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, + Tuplestorestate **tuplestore); extern void SharedQueueWrite(SharedQueue squeue, int consumerIdx, - TupleTableSlot *slot, Tuplestorestate **tuplestore, - MemoryContext tmpcxt); + TupleTableSlot *slot, Tuplestorestate **tuplestore, + MemoryContext tmpcxt); extern bool SharedQueueRead(SharedQueue squeue, int consumerIdx, - TupleTableSlot *slot, bool canwait); + TupleTableSlot *slot, bool canwait); extern void SharedQueueDisconnectConsumer(const char *sqname); extern void SharedQueueReset(SharedQueue squeue, int consumerIdx); extern void SharedQueueResetNotConnected(SharedQueue squeue); @@ -100,35 +100,35 @@ extern bool SharedQueueWaitOnProducerLatch(SharedQueue squeue, long timeout); #ifdef __TBASE__ typedef enum { - DataPumpOK = 0, - DataPumpSndError_no_socket = -1, - DataPumpSndError_no_space = -2, - DataPumpSndError_io_error = -3, - DataPumpSndError_node_error = -4, - DataPumpSndError_bad_status = -5, - DataPumpSndError_unreachable_node = -6, - DataPumpConvert_error = -7 + DataPumpOK = 0, + DataPumpSndError_no_socket = -1, + DataPumpSndError_no_space = -2, + DataPumpSndError_io_error = -3, + DataPumpSndError_node_error = -4, + DataPumpSndError_bad_status = -5, + DataPumpSndError_unreachable_node = -6, + DataPumpConvert_error = -7 }DataPumpSndError; #define DATAPUMP_UNREACHABLE_NODE_FD (-2) typedef enum { - ConvertRunning, - ConvertListenError, - ConvertAcceptError, - ConvertRecvNodeidError, - ConvertRecvNodeindexError, - ConvertRecvSockfdError, - ConvertSetSockfdError, - ConvertExit + ConvertRunning, + ConvertListenError, + ConvertAcceptError, + ConvertRecvNodeidError, + ConvertRecvNodeindexError, + ConvertRecvSockfdError, + ConvertSetSockfdError, + ConvertExit }ConvertStatus; typedef enum { - Squeue_Consumer, - Squeue_Producer, - Squeue_None + Squeue_Consumer, + Squeue_Producer, + Squeue_None } SqueueRole; extern bool IsSqueueProducer(void); @@ -196,15 +196,15 @@ extern void RemoteSubplanSigusr2Handler(SIGNAL_ARGS); #ifdef __TBASE__ enum MT_thr_detach { - MT_THR_JOINABLE, - MT_THR_DETACHED + MT_THR_JOINABLE, + MT_THR_DETACHED }; typedef struct { - int m_cnt; - pthread_mutex_t m_mutex; - pthread_cond_t m_cond; + int m_cnt; + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; }ThreadSema; extern void ThreadSemaInit(ThreadSema *sema, int32 init); @@ -214,11 +214,11 @@ extern void ThreadSemaUp(ThreadSema *sema); typedef struct { - void **m_List; /*循环队列数组*/ - uint32 
m_Length; /*队列队列长度*/ - slock_t m_lock; /*保护下面的两个变量*/ - volatile uint32 m_Head; /*队列头部,数据插入往头部插入,头部加一等于尾则队列满*/ - volatile uint32 m_Tail; /*队列尾部,尾部等于头部,则队列为空*/ + void **m_List; /*循环队列数组*/ + uint32 m_Length; /*队列队列长度*/ + slock_t m_lock; /*保护下面的两个变量*/ + volatile uint32 m_Head; /*队列头部,数据插入往头部插入,头部加一等于尾则队列满*/ + volatile uint32 m_Tail; /*队列尾部,尾部等于头部,则队列为空*/ }PGPipe; extern PGPipe* CreatePipe(uint32 size); extern void DestoryPipe(PGPipe *pPipe); @@ -226,10 +226,11 @@ extern void *PipeGet(PGPipe *pPipe); extern int PipePut(PGPipe *pPipe, void *p); extern bool PipeIsFull(PGPipe *pPipe); extern bool IsEmpty(PGPipe *pPipe); -extern int PipeLength(PGPipe *pPipe); +extern int PipeLength(PGPipe *pPipe); extern int32 CreateThread(void *(*f) (void *), void *arg, int32 mode); +extern const char *SqueueName(SharedQueue sq); #endif diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 5d039875..d662e3f6 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -282,6 +282,12 @@ typedef struct PortalData #define PortalGetQueryDesc(portal) ((portal)->queryDesc) #define PortalGetHeapMemory(portal) ((portal)->heap) +/* Hook for plugins to get control after PortalStart() */ +typedef void (*PortalStart_hook_type) (Portal portal); +extern PGDLLIMPORT PortalStart_hook_type PortalStart_hook; +/* Hook for plugins to get control before PortalDrop() */ +typedef void (*PortalDrop_hook_type) (Portal portal); +extern PGDLLIMPORT PortalDrop_hook_type PortalDrop_hook; /* Prototypes for functions in utils/mmgr/portalmem.c */ extern void EnablePortalManager(void); From d4ba06ee4d12b3c901d4a857411a653c14a265c5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 26 Apr 2021 11:12:01 +0800 Subject: [PATCH 150/578] Support showing xc_node_id in parallel scan by setting PGXCNodeIdentifier for parallel workers when they started same operation like PGXCNodeId. TAPD: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696085995445 --- src/backend/access/transam/parallel.c | 11 +++++++++++ .../regress/expected/select_parallel_4.out | 19 +++++++++++++++++++ src/test/regress/sql/select_parallel.sql | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index b67873e8..d6108a3a 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -38,7 +38,10 @@ #include "utils/resowner.h" #include "utils/snapmgr.h" #ifdef __TBASE__ +#include "catalog/pg_collation.h" #include "pgxc/squeue.h" +#include "utils/formatting.h" +#include "utils/lsyscache.h" #endif /* @@ -1102,6 +1105,14 @@ ParallelWorkerMain(Datum main_arg) StartTransactionCommand(); /* Initialize XL executor. This must be done inside a transaction block. 
*/ InitMultinodeExecutor(false); + /* set PGXCNodeIdentifier for workers */ + if (PGXCNodeIdentifier == 0) + { + char *node_name; + node_name = str_tolower(PGXCNodeName, strlen(PGXCNodeName), DEFAULT_COLLATION_OID); + PGXCNodeIdentifier = get_pgxc_node_id(get_pgxc_nodeoid(node_name)); + pfree(node_name); + } CommitTransactionCommand(); /* diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 4d264b26..6bc02325 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -360,6 +360,25 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; Node/s: datanode_1, datanode_2 (2 rows) +-- make sure identifier was set in workers +CREATE TABLE t_worker_identifier (a int); +INSERT INTO t_worker_identifier values(1); +EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Gather + Workers Planned: 3 + -> Parallel Seq Scan on t_worker_identifier +(5 rows) + +SELECT xc_node_id != 0 FROM t_worker_identifier; + ?column? +---------- + t +(1 row) + -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 70d0f0fb..d2cca20f 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -136,6 +136,12 @@ EXPLAIN (timing off, summary off, costs off) SELECT * FROM tenk1; EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; +-- make sure identifier was set in workers +CREATE TABLE t_worker_identifier (a int); +INSERT INTO t_worker_identifier values(1); +EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; +SELECT xc_node_id != 0 FROM t_worker_identifier; + -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; From 0f174f369abdef3a11c6156df7c565379887c135 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 27 Apr 2021 17:04:18 +0800 Subject: [PATCH 151/578] Remote subplan should distinguish params from initplan ro subplan (merge request !296) Squash merge branch 'andrelin/params' into 'Tbase_v2.15.19' * Use enum to make code more readable * Remote subplan should distinguish params from initplan or subplan --- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/plan/planner.c | 1 + src/backend/optimizer/plan/subselect.c | 85 ++++++++++++++++++++++++++ src/backend/pgxc/pool/execRemote.c | 8 ++- src/backend/tcop/pquery.c | 21 +++++-- src/include/optimizer/subselect.h | 1 + src/include/pgxc/execRemote.h | 10 +++ src/include/pgxc/planner.h | 2 + 10 files changed, 124 insertions(+), 7 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 3d4e8a68..c2885241 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1420,6 +1420,7 @@ _copyRemoteSubplan(const RemoteSubplan *from) COPY_SCALAR_FIELD(unique); #ifdef __TBASE__ COPY_SCALAR_FIELD(parallelWorkerSendTuple); + COPY_BITMAPSET_FIELD(initPlanParams); #endif return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7d7a9704..f550ed23 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1697,6 +1697,7 
@@ _outRemoteSubplan(StringInfo str, const RemoteSubplan *node) WRITE_STRING_FIELD(cursor); WRITE_INT_FIELD(unique); WRITE_BOOL_FIELD(parallelWorkerSendTuple); + WRITE_BITMAPSET_FIELD(initPlanParams); #ifdef __TBASE__ if (IS_PGXC_COORDINATOR && !g_set_global_snapshot) diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 7207a98c..d653fbf3 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3776,6 +3776,7 @@ _readRemoteSubplan(void) READ_STRING_FIELD(cursor); READ_INT_FIELD(unique); READ_BOOL_FIELD(parallelWorkerSendTuple); + READ_BITMAPSET_FIELD(initPlanParams); READ_DONE(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b909ad6a..8c62e688 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -403,6 +403,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) top_plan = (Plan *) make_remotesubplan(root, top_plan, NULL, root->distribution, root->sort_pathkeys); + SS_remote_attach_initplans(root, top_plan); remote_subplan_depth--; } #endif diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3e3339f8..0f24fa9d 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -5598,3 +5598,88 @@ SS_make_initplan_from_plan(PlannerInfo *root, /* Set costs of SubPlan using info from the plan tree */ cost_subplan(subroot, node, plan); } +/* + * SS_remote_attach_initplans + * + * recursively look into a plantree, find any RemoteSubplan and + * attach params id that generated from init-plan of this query. + */ +void +SS_remote_attach_initplans(PlannerInfo *root, Plan *plan) +{ + ListCell *lc; + + if (plan == NULL) + return; + + if (IsA(plan, RemoteSubplan)) + { + ListCell *plan_lc, *param_lc; + RemoteSubplan *rsplan = (RemoteSubplan *) plan; + Assert(rsplan->initPlanParams == NULL); + foreach(plan_lc, root->init_plans) + { + SubPlan *initplan = (SubPlan *) lfirst(plan_lc); + foreach(param_lc, initplan->setParam) + { + rsplan->initPlanParams = + bms_add_member(rsplan->initPlanParams, lfirst_int(param_lc)); + } + } + } + + switch (nodeTag(plan)) + { + case T_SubqueryScan: + { + SubqueryScan *sscan = (SubqueryScan *) plan; + RelOptInfo *rel; + + rel = find_base_rel(root, sscan->scan.scanrelid); + SS_remote_attach_initplans(rel->subroot, sscan->subplan); + } + break; + case T_CustomScan: + { + foreach(lc, ((CustomScan *) plan)->custom_plans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_ModifyTable: + { + foreach(lc, ((ModifyTable *) plan)->plans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_Append: + { + foreach(lc, ((Append *) plan)->appendplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_MergeAppend: + { + foreach(lc, ((MergeAppend *) plan)->mergeplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_BitmapAnd: + { + foreach(lc, ((BitmapAnd *) plan)->bitmapplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_BitmapOr: + { + foreach(lc, ((BitmapOr *) plan)->bitmapplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + default: + break; + } + + /* Process left and right child plans, if any */ + SS_remote_attach_initplans(root, plan->lefttree); + SS_remote_attach_initplans(root, plan->righttree); +} \ No newline at end of file diff --git 
a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 86cafb0f..52850951 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10208,12 +10208,13 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) */ if (OidIsValid(param->ptype)) { - rstmt.remoteparams[paramno].paramused = 1; + rstmt.remoteparams[paramno].paramused = + bms_is_member(i, node->initPlanParams) ? REMOTE_PARAM_INITPLAN : REMOTE_PARAM_SUBPLAN; rstmt.remoteparams[paramno].paramtype = param->ptype; } else { - rstmt.remoteparams[paramno].paramused = 0; + rstmt.remoteparams[paramno].paramused = REMOTE_PARAM_UNUSED; rstmt.remoteparams[paramno].paramtype = INT4OID; } @@ -10237,7 +10238,8 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) rstmt.remoteparams[paramno].paramkind = PARAM_EXEC; rstmt.remoteparams[paramno].paramid = i; rstmt.remoteparams[paramno].paramtype = prmdata->ptype; - rstmt.remoteparams[paramno].paramused = 1; + rstmt.remoteparams[paramno].paramused = + bms_is_member(i, node->initPlanParams) ? REMOTE_PARAM_INITPLAN : REMOTE_PARAM_SUBPLAN; /* Will scan plan tree to find out data type of the param */ if (prmdata->ptype == InvalidOid) defineParams = bms_add_member(defineParams, i); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 2064e594..29c53160 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -676,6 +676,10 @@ PortalStart(Portal portal, ParamListInfo params, { #ifdef XCP case PORTAL_DISTRIBUTED: + { + int i; + bool paramNeedPassDown = false; + /* No special ability is needed */ eflags = 0; /* Must set snapshot before starting executor. */ @@ -726,9 +730,18 @@ PortalStart(Portal portal, ParamListInfo params, * is not supported in SharedQueue mode. Force to do it traditionally. 
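 *
 * For illustration (hypothetical tables t1 and t2, not from the regression
 * suite): a param produced by an uncorrelated initplan is computed once and
 * can still be shipped under the SharedQueue model, e.g.
 *     SELECT * FROM t1 WHERE a > (SELECT max(a) FROM t2);
 * whereas a PARAM_EXEC param coming from a correlated subplan changes for
 * every outer row and forces the traditional pass-down path, e.g.
 *     SELECT * FROM t1 WHERE a > (SELECT max(a) FROM t2 WHERE t2.b = t1.b);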
*/ #ifdef __TBASE__ - if ((!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) || - queryDesc->epqContext != NULL) + for (i = 0; i < queryDesc->plannedstmt->nParamRemote; i++) + { + RemoteParam *rparam = &queryDesc->plannedstmt->remoteparams[i]; + if (rparam->paramkind == PARAM_EXEC && + rparam->paramused != REMOTE_PARAM_INITPLAN) /* if it's from initplan, still work with shared queue */ + { + paramNeedPassDown = true; + break; + } + } + + if ((!paramPassDown && paramNeedPassDown) || queryDesc->epqContext != NULL) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) @@ -737,7 +750,6 @@ PortalStart(Portal portal, ParamListInfo params, int *consMap; int len; ListCell *lc; - int i; Locator *locator; Oid keytype; DestReceiver *dest; @@ -983,6 +995,7 @@ PortalStart(Portal portal, ParamListInfo params, portal->portalPos = 0; PopActiveSnapshot(); + } break; #endif diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index 47ba77f5..bfba85b3 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -120,6 +120,7 @@ extern Param *assign_nestloop_param_placeholdervar(PlannerInfo *root, PlaceHolderVar *phv); extern int SS_assign_special_param(PlannerInfo *root); +extern void SS_remote_attach_initplans(PlannerInfo *root, Plan *plan); #ifdef __TBASE__ extern bool has_correlation_in_funcexpr_rte(List *rtable); #endif diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 8332a217..60910919 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -88,6 +88,16 @@ typedef enum REMOTE_COPY_TUPLESTORE /* Store data in tuplestore */ } RemoteCopyType; +/* + * Type of remote param from init-plan or subplan + */ +typedef enum +{ + REMOTE_PARAM_UNUSED, + REMOTE_PARAM_INITPLAN, + REMOTE_PARAM_SUBPLAN +} RemoteParamType; + /* Combines results of INSERT statements using multiple values */ typedef struct CombineTag { diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index cb221759..c868136a 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -253,6 +253,8 @@ typedef struct * directly without gather node? */ bool parallelWorkerSendTuple; + /* params that generated by initplan */ + Bitmapset *initPlanParams; #endif } RemoteSubplan; From 89038e41a8b3f1b1a9d215b6c47ec463ce14a0d8 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 1 Jun 2021 20:59:18 +0800 Subject: [PATCH 152/578] Support tablesample for partition table tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086883685 --- src/backend/executor/nodeSamplescan.c | 15 +++++++++++++++ src/backend/optimizer/plan/createplan.c | 1 + 2 files changed, 16 insertions(+) diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 4738beb1..9be86efe 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -128,9 +128,24 @@ InitScanRelation(SampleScanState *node, EState *estate, int eflags) * get the relation object id from the relid'th entry in the range table, * open that relation and acquire appropriate lock on it. 
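 *
 * For example (hypothetical interval-partitioned table, shown only to
 * illustrate the branch added below): for a statement such as
 *     SELECT count(*) FROM orders_part TABLESAMPLE SYSTEM (1);
 * the sample scan opens the selected child partition
 * (via ExecOpenScanRelationPartition) instead of the parent relation.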
 */
+#ifdef __TBASE__
+	/* if interval partition, scan child table instead */
+	if(((SeqScan *) node->ss.ps.plan)->ispartchild)
+	{
+		currentRelation = ExecOpenScanRelationPartition(estate,
+						  ((SeqScan *) node->ss.ps.plan)->scanrelid,
+						  eflags,
+						  ((SeqScan *) node->ss.ps.plan)->childidx);
+	}
+	else
+	{
+#endif
 	currentRelation = ExecOpenScanRelation(estate,
 						  ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
 						  eflags);
+#ifdef __TBASE__
+	}
+#endif
 #ifdef _MLS_
 	mls_check_datamask_need_passby((ScanState*)node, currentRelation->rd_id);
 #endif
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index e4ccad0c..1924a389 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -920,6 +920,7 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags)
 	switch(nodeTag(child))
 	{
 		case T_SeqScan:
+		case T_SampleScan:
 			break;
 		case T_IndexScan:
 		{

From 275a095b4a78945188c9c20147a3ebe08feb7263 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Wed, 5 May 2021 14:12:00 +0800
Subject: [PATCH 153/578] Fix some mistakes in remote subplan cost calculation

1. Remove the parallel workers factor when calculating the cost of a remote
subplan.

A bug was introduced when the parallel cost of a remote subplan was taken
into account: parallel_workers is 0 when parallelism is not enabled for the
node, which leads to a remote subplan run cost of 0 in non-parallel mode.
Since this is a TBase-specific execution model and we will refactor all
parallel optimization/execution logic later to catch up with Postgres, we
simply remove the nworkers factor here.

2. Consider the tuple replication factor for a better estimate of the
distribution cost.

Currently the path->rows popped up from the baserel is divided into
per-datanode scope. In this module, the rows of a replicated distribution
node should be multiplied by the number of nodes.

There is another big issue: we mixed up the distribution type usage between
LOCATOR_TYPE_REPLICATED (for replicated distribution) and LOCATOR_TYPE_NONE
(for CN gather distribution). Will refactor this later.
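As a rough standalone illustration of the revised arithmetic (all numbers,
including network_byte_cost, are made up for this example; cpu_operator_cost
uses the usual planner default):

    #include <stdio.h>

    int
    main(void)
    {
        double cpu_operator_cost = 0.0025;  /* default planner cost parameter */
        double network_byte_cost = 0.001;   /* assumed value, illustration only */
        double tuples = 1000.0;
        int    width = 40;
        int    replication = 2;             /* replicated across two datanodes */

        /* rows now reflect the replication factor */
        double rows = tuples * replication;

        /* run cost no longer multiplies by nworkers */
        double run_cost = 2 * cpu_operator_cost * tuples
                        + network_byte_cost * tuples * width * replication;

        printf("rows=%.0f run_cost=%.2f\n", rows, run_cost);
        return 0;
    }

With these numbers the remote subplan keeps a non-zero run cost (85.00) even
in non-parallel mode, and its row estimate becomes 2000.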
http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087396641 --- src/backend/executor/execParallel.c | 4 +- src/backend/optimizer/path/costsize.c | 9 +- src/backend/optimizer/util/pathnode.c | 108 +++--------------- src/backend/pgxc/locator/locator.c | 19 +++ src/include/optimizer/cost.h | 3 +- src/include/pgxc/locator.h | 1 + src/test/regress/expected/foreign_key_2.out | 13 ++- src/test/regress/expected/join_3.out | 37 +++--- src/test/regress/expected/rowsecurity_1.out | 79 +++++++------ src/test/regress/expected/rules.out | 3 + .../regress/expected/select_parallel_4.out | 18 +-- src/test/regress/expected/stats_ext_2.out | 16 +-- src/test/regress/expected/subselect.out | 6 +- src/test/regress/expected/tbase_explain.out | 10 +- src/test/regress/expected/xc_FQS_2.out | 8 +- src/test/regress/expected/xc_FQS_join_1.out | 34 +++--- 16 files changed, 162 insertions(+), 206 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index db2cdf60..920bc32e 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1325,11 +1325,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr, pwcxt); } if (planstate->plan->parallel_aware) - ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt); + ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, pwcxt); break; case T_HashJoinState: if (planstate->plan->parallel_aware) - ExecParallelHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); + ExecParallelHashJoinInitWorker((HashJoinState *) planstate, pwcxt); break; case T_AggState: if (planstate->plan->parallel_aware) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c0fa9bdf..9d82cec9 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5352,23 +5352,22 @@ page_size(double tuples, int width) void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication, - int nworkers) + double tuples, int width, int replication) { Cost startup_cost = input_startup_cost + remote_query_cost; Cost run_cost = input_total_cost - input_startup_cost; - path->rows = tuples; + path->rows = tuples * replication; /* * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead. */ - run_cost += 2 * cpu_operator_cost * tuples * nworkers; + run_cost += 2 * cpu_operator_cost * tuples; /* * Estimate cost of sending data over network */ - run_cost += network_byte_cost * tuples * width * replication * nworkers; + run_cost += network_byte_cost * tuples * width * replication; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c0847553..17546a77 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,47 +1404,6 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } } -#ifdef __TBASE__ -/* - * implementation for create_remotesubplan_path, besides regular creation of remote subplan, - * we need it when redistributing join rel. 
- */ -static Path * -create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, - Distribution *distribution, RelOptInfo *rel, - ParamPathInfo *param_info, List *pathkeys, - PathTarget *pathtarget, int replication, - Cost additional_startup_cost, - Cost additional_total_cost) -{ - RemoteSubPath *pathnode; - - //if (IsA(subpath, GatherPath)) - //reset_cost_gather((GatherPath *) subpath); - - pathnode = makeNode(RemoteSubPath); - pathnode->path.pathtype = T_RemoteSubplan; - pathnode->path.parent = rel; - pathnode->path.param_info = param_info; - pathnode->path.pathkeys = pathkeys; - pathnode->subpath = subpath; - pathnode->path.distribution = (Distribution *) copyObject(distribution); - - /* We don't want to run subplains in parallel workers */ - pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = false; - - pathnode->path.pathtarget = pathtarget; - - cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, - subpath->total_cost + additional_total_cost, subpath->rows, - rel->reltarget->width, replication, subpath->parallel_workers); - - return (Path *) pathnode; -} -#endif - - /* * create_remotesubplan_path * Redistribute the data to match the distribution. @@ -1458,15 +1417,8 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, { RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; - Distribution *subdistribution = subpath->distribution; + Distribution *subDist = subpath->distribution; -#ifdef __TBASE__ - return create_remotesubplan_path_internal(root, subpath, distribution, - rel, subpath->param_info, - subpath->pathkeys, subpath->pathtarget, - (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? - bms_num_members(subdistribution->nodes) : 1, 0, 0); -#else pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1483,11 +1435,9 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, - (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? - bms_num_members(subdistribution->nodes) : 1); + subDist ? calcDistReplications(subDist->distributionType, subDist->nodes) : 1); return (Path *) pathnode; -#endif } /* @@ -1506,13 +1456,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, Distribution *distribution = NULL; RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; -#ifdef __TBASE__ - int num_replication; - - /* IsLocatorNone() also indicates we are replicating through input nodes */ - num_replication = (IsLocatorReplicated(distributionType) || - IsLocatorNone(distributionType)) ? 
bms_num_members(nodes) : 1; -#endif if (distributionType != LOCATOR_TYPE_NONE) { @@ -1530,20 +1473,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, if (IsA(subpath, MaterialPath)) { MaterialPath *mpath = (MaterialPath *) subpath; -#ifdef __TBASE__ - if (IsA(mpath->subpath, RemoteSubPath)) - { - pathnode = (RemoteSubPath *) mpath->subpath; - pathnode->path.distribution = (Distribution *) copyObject(distribution); - } - else - { - pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, mpath->subpath, - distribution, rel, subpath->param_info, - subpath->pathkeys, rel->reltarget, - num_replication, 0, 0); - } -#else /* If subpath is already a RemoteSubPath, just replace distribution */ if (IsA(mpath->subpath, RemoteSubPath)) { @@ -1569,11 +1498,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, subpath = pathnode->subpath; pathnode->path.distribution = distribution; /* (re)calculate costs */ - cost_remote_subplan((Path *) pathnode, subpath->startup_cost, - subpath->total_cost, subpath->rows, rel->reltarget->width, - IsLocatorReplicated(distributionType) ? - bms_num_members(nodes) : 1); -#endif + cost_remote_subplan((Path *) pathnode, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + rel->reltarget->width, + calcDistReplications(distributionType, nodes)); + mpath->path.distribution = (Distribution *) copyObject(distribution); mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, @@ -1587,7 +1518,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, { Cost input_startup_cost = 0; Cost input_total_cost = 0; -#ifndef __TBASE__ + pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1595,7 +1526,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.param_info = subpath->param_info; pathnode->path.pathkeys = pathkeys ? pathkeys : subpath->pathkeys; pathnode->path.distribution = distribution; -#endif + /* * If we need to insert a Sort node, add it here, so that it gets * pushed down to the remote node. @@ -1628,14 +1559,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, input_startup_cost += sort_path.startup_cost; input_total_cost += sort_path.total_cost; } -#ifdef __TBASE__ - pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, subpath, - distribution, rel, subpath->param_info, - pathkeys ? 
pathkeys : subpath->pathkeys, - rel->reltarget, num_replication, - input_startup_cost - subpath->startup_cost, - input_total_cost - subpath->total_cost); -#else pathnode->subpath = subpath; /* We don't want to run subplains in parallel workers */ @@ -1643,10 +1566,11 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.parallel_safe = false; cost_remote_subplan((Path *) pathnode, - input_startup_cost, input_total_cost, - subpath->rows, rel->reltarget->width, - num_replication); -#endif + input_startup_cost, + input_total_cost, + subpath->rows, + rel->reltarget->width, + calcDistReplications(distributionType, nodes)); return (Path *) pathnode; } } diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index a6933ce2..76c72d85 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2160,6 +2160,25 @@ IsDistributedColumn(AttrNumber attr, RelationLocInfo *relation_loc_info) return result; } + +/* + * Calculate the tuple replication times based on replication type and number + * of target nodes. + */ +int +calcDistReplications(char distributionType, Bitmapset *nodes) +{ + if (!nodes) + return 1; + + if (IsLocatorReplicated(distributionType) || + IsLocatorNone(distributionType)) + { + return bms_num_members(nodes); + } + + return 1; +} #endif void * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 358e83b9..2198c9db 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -177,8 +177,7 @@ extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication, - int nworkers); + double tuples, int width, int replication); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, RelOptInfo *outerrel, diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 4e692237..3fd1f6b9 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -249,6 +249,7 @@ extern bool IsTypeDistributable(Oid col_type); extern char getLocatorDisType(Locator *self); extern bool prefer_olap; extern bool IsDistributedColumn(AttrNumber attr, RelationLocInfo *relation_loc_info); +extern int calcDistReplications(char distributionType, Bitmapset *nodes); #endif #ifdef _MLS_ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index ec92a35b..8b8ac8ac 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,23 +1373,24 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1) -> Delete on t2 -> Nested Loop - -> Remote Subquery Scan on all (datanode_1) - -> Index Scan using t1_pkey on t1 - Index Cond: (a = 1) -> Seq Scan on t2 Filter: (b = 1) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) Remote Fast Query Execution Node/s: datanode_1 -> Delete on t1 -> Index Scan using t1_pkey on t1 Index Cond: (a = 1) -(14 
rows) +(15 rows) delete from t1 where a = 1; drop rule r1 on t1; diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 761f5a90..0da9548e 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1876,20 +1876,23 @@ where exists(select * from tenk1 b where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) and i4.f1 = a.tenthous; QUERY PLAN ------------------------------------------------------------------------------ +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Join - Hash Cond: (a.tenthous = i4.f1) -> Hash Semi Join Hash Cond: (a.twothousand = b.twothousand) Join Filter: (a.fivethous <> b.fivethous) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: twothousand + -> Hash Join + Hash Cond: (a.tenthous = i4.f1) -> Seq Scan on tenk1 a -> Hash + -> Seq Scan on int4_tbl i4 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: twothousand -> Seq Scan on tenk1 b - -> Hash - -> Seq Scan on int4_tbl i4 -(12 rows) +(15 rows) -- -- More complicated constructs @@ -4718,17 +4721,17 @@ explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; QUERY PLAN ------------------------------------------------------------------- +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (a.unique1 = b.unique2) - -> Seq Scan on tenk1 a - -> Hash + Hash Cond: (b.unique2 = a.unique1) -> Remote Subquery Scan on all Distribute results by H: unique2 -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on tenk1 a (10 rows) select count(*) from tenk1 a, @@ -6614,17 +6617,17 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..401.93 rows=33 width=4) + Nested Loop (cost=200.16..402.39 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.68 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.69 rows=1 width=4) -> Nested Loop (cost=0.16..180.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) - -> Materialize (cost=100.00..120.62 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Materialize (cost=100.00..121.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (12 rows) @@ -6633,7 +6636,7 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN 
------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..414.44 rows=33 width=4) + Nested Loop (cost=200.16..414.89 rows=33 width=4) Join Filter: (t3.b > t2.a) -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) -> Nested Loop (cost=0.16..193.19 rows=1 width=4) @@ -6643,8 +6646,8 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..120.62 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Materialize (cost=100.00..121.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (13 rows) diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 237482a1..7ea346ae 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1577,43 +1577,47 @@ WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); Remote Subquery Scan on all (datanode_2) -> Update on t2 -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Seq Scan on t3 Filter: ((a = 2) AND f_leak(b)) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -(8 rows) +(9 rows) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); EXPLAIN (COSTS OFF) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Update on t1 Update on t1 Update on t2 t2_1 Update on t3 -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t1 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t3 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) -(23 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(26 rows) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); @@ -1643,18 +1647,19 @@ WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); EXPLAIN (COSTS OFF) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a 
AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Update on t2 t2_1 -> Nested Loop Join Filter: (t2_1.b = t2_2.b) - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 t2_2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -(9 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 t2_2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(10 rows) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b @@ -2060,15 +2065,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 0) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 0) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2112,15 +2117,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 0) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 0) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2164,15 +2169,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 1) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 1) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2216,15 +2221,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 1) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 1) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 14660970..f8f574f9 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2397,6 +2397,9 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; 
+zv1| SELECT zt1.f1, + 'dummy'::text AS junk + FROM pg_temp_17.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 6bc02325..85e23f84 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -82,16 +82,16 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; QUERY PLAN ------------------------------------------------------------------ - Sort - Sort Key: stringu1 - -> Finalize HashAggregate +----------------------------------------------------------- + Finalize GroupAggregate Group Key: stringu1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 + -> Sort + Sort Key: stringu1 -> Partial HashAggregate Group Key: stringu1 + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (10 rows) @@ -99,16 +99,16 @@ explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------- Sort Sort Key: (count(stringu1)) -> Finalize HashAggregate Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial HashAggregate + Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) -> Gather Workers Planned: 4 - -> Partial HashAggregate - Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Parallel Seq Scan on tenk1 (10 rows) diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 315bcbc7..e058f176 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -659,8 +659,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -680,8 +680,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 
rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -698,8 +698,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) @@ -722,8 +722,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c573fbda..78e554cc 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1712,11 +1712,11 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN --------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..136.36 rows=112 width=8) - -> Hash Semi Join (cost=120.19..136.36 rows=112 width=8) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..145.13 rows=112 width=8) + -> Hash Semi Join (cost=120.19..145.13 rows=112 width=8) Hash Cond: (a.b = b.a) Join Filter: (b.b > a.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..111.75 rows=675 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) Distribute results by H: b -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) -> Hash (cost=11.75..11.75 rows=675 width=8) diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index 691d1bb5..e51bb895 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -292,17 +292,13 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(31 rows) +(27 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select 
count(*) from a2 where name='a'); diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index c4e07fc5..9b35d802 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,13 +1641,13 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.06 rows=1 width=40) - -> Nested Loop Semi Join (cost=100.00..121.06 rows=1 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.16 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..121.16 rows=1 width=40) Join Filter: (t1.c = t2.c) -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..110.51 rows=2 width=4) - -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.50 rows=2 width=4) + -> Materialize (cost=100.00..110.55 rows=4 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.53 rows=4 width=4) -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) Filter: (id = 1) (9 rows) diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 57ff7524..dc995cb5 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -697,20 +697,23 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Update on public.tab1_mod -> Merge Join Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid - Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Sort + Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 + Sort Key: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 + -> Materialize + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Sort - Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -(17 rows) +(20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; @@ -720,20 +723,23 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Delete on public.tab1_mod -> Merge Join Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid - Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Sort + Output: tab1_mod.xc_node_id, tab1_mod.ctid, 
tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 + Sort Key: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 + -> Materialize + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Sort - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -(17 rows) +(20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; From d8272a5422047cc9be21eb904cb33a6ca6002d54 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 3 Jun 2021 19:31:23 +0800 Subject: [PATCH 154/578] fix compile warnings --- src/backend/commands/explain_dist.c | 2 ++ src/backend/optimizer/plan/createplan.c | 1 - src/backend/optimizer/plan/planner.c | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 81fedadc..2b37ba0d 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -125,6 +125,7 @@ InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr, int current_node_id elog(DEBUG1, "InstrOut: plan_node_id %d, node %d, nloops %.0f", plan->plan_node_id, current_node_id, instr->nloops); } +#if 0 /* * WorkerInstrOut * @@ -156,6 +157,7 @@ WorkerInstrOut(StringInfo buf, WorkerInstrumentation *worker_instr) instr->startup, instr->total, instr->ntuples, instr->nloops); } } +#endif /* * SpecInstrOut diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 1924a389..cbd3bef4 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -689,7 +689,6 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) /* find is there any tables located in more than one group */ if ((rel->reloptkind == RELOPT_BASEREL || rel->reloptkind == RELOPT_OTHER_MEMBER_REL) && rel->rtekind == RTE_RELATION) { - bool error = false; rte = root->simple_rte_array[rel->relid]; relation = heap_open(rte->relid, NoLock); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8c62e688..a3af8985 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4280,12 +4280,13 @@ create_grouping_paths(PlannerInfo *root, { #ifdef __TBASE__ bool try_redistribute_grouping = false; + double dNumLocalGroups; PathTarget * local_grouping_target = make_partial_grouping_target(root, target); grouped_rel->reltarget = local_grouping_target; /* Estimate number of partial groups. 
*/ - double dNumLocalGroups = get_number_of_groups(root, + dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, gd); #endif From 594b715db8acaa3f613fededad8543c3172888a1 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 10:22:30 +0800 Subject: [PATCH 155/578] fix regress expected errors --- src/test/regress/expected/create_view.out | 49 -------- src/test/regress/expected/join_3.out | 106 +++++++----------- src/test/regress/expected/rules.out | 3 - .../regress/expected/select_parallel_4.out | 16 +-- src/test/regress/expected/subselect.out | 38 +++---- src/test/regress/expected/tbase_explain.out | 10 +- src/test/regress/expected/xc_FQS_join_1.out | 40 +++---- src/test/regress/expected/xc_groupby_1.out | 24 ++-- src/test/regress/expected/xc_having_1.out | 14 +-- src/test/regress/parallel_schedule | 3 - 10 files changed, 115 insertions(+), 188 deletions(-) diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index b8836c0d..56e73b4e 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -38,55 +38,6 @@ SELECT * FROM viewtest ORDER BY a; CREATE OR REPLACE VIEW viewtest AS SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC; -EXPLAIN SELECT * FROM viewtest; - QUERY PLAN -------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(5 rows) - -SELECT * FROM viewtest; - a | b -----+---- - 20 | 25 - 15 | 20 - 10 | 15 -(3 rows) - -EXPLAIN SELECT a FROM viewtest; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Subquery Scan on viewtest (cost=22.23..27.29 rows=225 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(6 rows) - -SELECT a FROM viewtest; - a ----- - 20 - 15 - 10 -(3 rows) - -EXPLAIN SELECT * FROM viewtest ORDER BY a; - QUERY PLAN -------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36.08..36.64 rows=225 width=8) - -> Sort (cost=36.08..36.64 rows=225 width=8) - Sort Key: viewtest_tbl.a - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(7 rows) - SELECT * FROM viewtest ORDER BY a; a | b ----+---- diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 0da9548e..53a75d2f 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1867,33 +1867,6 @@ SELECT '' AS "xxx", * | 1 | 4 | one | -1 (1 row) --- --- semijoin selectivity for <> --- -explain (costs off) -select * from int4_tbl i4, tenk1 a -where exists(select * from tenk1 b - where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) - and i4.f1 = a.tenthous; - QUERY PLAN ------------------------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Semi Join - Hash Cond: (a.twothousand = 
b.twothousand) - Join Filter: (a.fivethous <> b.fivethous) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: twothousand - -> Hash Join - Hash Cond: (a.tenthous = i4.f1) - -> Seq Scan on tenk1 a - -> Hash - -> Seq Scan on int4_tbl i4 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: twothousand - -> Seq Scan on tenk1 b -(15 rows) - -- -- More complicated constructs -- @@ -2447,7 +2420,7 @@ select count(*) from tenk1 a, tenk1 b -> Partial Aggregate -> Hash Join Hash Cond: (a.hundred = b.thousand) - -> Seq Scan on tenk1 a + -> Index Only Scan using tenk1_hundred on tenk1 a -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 b @@ -3318,7 +3291,7 @@ select count(*) from Join Filter: (a.unique2 = b.unique1) -> Remote Subquery Scan on all Distribute results by H: thousand - -> Seq Scan on tenk1 c + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand @@ -3363,7 +3336,7 @@ select b.unique1 from Join Filter: (b.unique1 = 42) -> Remote Subquery Scan on all Distribute results by H: 42 - -> Seq Scan on tenk1 c + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 @@ -3468,17 +3441,19 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Hash Right Join - Hash Cond: (b.unique2 = a.f1) + -> Merge Right Join + Merge Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Seq Scan on tenk1 b - -> Hash + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Materialize -> Remote Subquery Scan on all Distribute results by H: f1 - -> Seq Scan on int4_tbl a -(11 rows) + -> Sort + Sort Key: a.f1 + -> Seq Scan on int4_tbl a +(13 rows) select f1, unique2, case when unique2 is null then f1 else 0 end from int4_tbl a left join tenk1 b on f1 = unique2 @@ -3537,33 +3512,37 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------- Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i1.f1, 666 - -> Hash Right Join + -> Merge Right Join Output: i1.f1, 666 - Hash Cond: (i2.unique2 = i1.f1) + Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 Distribute results by H: unique2 - -> Seq Scan on public.tenk1 i2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 Output: i2.unique2 - -> Hash + -> Materialize Output: i1.f1 -> Remote Subquery Scan on all (datanode_1) Output: i1.f1 Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 + -> Sort Output: i1.f1 + Sort Key: i1.f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 -> Hash Output: "*VALUES*".column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(24 rows) +(28 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -4720,19 +4699,18 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select 
count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Hash Join - Hash Cond: (b.unique2 = a.unique1) - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b - -> Hash - -> Seq Scan on tenk1 a -(10 rows) + -> Merge Join + Merge Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Index Only Scan using tenk1_unique1 on tenk1 a +(9 rows) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; @@ -4745,18 +4723,18 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------ Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) -> Nested Loop -> Remote Subquery Scan on all - -> Seq Scan on tenk1 a + -> Index Only Scan using tenk1_unique1 on tenk1 a -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Seq Scan on tenk1 b + -> Index Only Scan using tenk1_unique2 on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -6085,8 +6063,8 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6101,13 +6079,13 @@ where exists (select 1 from tenk1 t3 Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Output: t3.thousand, t3.tenthous Distribute results by H: thousand -> HashAggregate - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous - -> Seq Scan on public.tenk1 t3 - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 + Output: t3.thousand, t3.tenthous -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f8f574f9..14660970 100644 --- a/src/test/regress/expected/rules.out +++ 
b/src/test/regress/expected/rules.out @@ -2397,9 +2397,6 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; -zv1| SELECT zt1.f1, - 'dummy'::text AS junk - FROM pg_temp_17.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 85e23f84..0b6353b7 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -81,15 +81,15 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------- Finalize GroupAggregate - Group Key: stringu1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Group Key: stringu1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: stringu1 - -> Partial HashAggregate - Group Key: stringu1 + -> Partial HashAggregate + Group Key: stringu1 -> Gather Workers Planned: 4 -> Parallel Seq Scan on tenk1 @@ -98,7 +98,7 @@ explain (costs off) explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; - QUERY PLAN + QUERY PLAN -------------------------------------------------------------------------------------------------------------------- Sort Sort Key: (count(stringu1)) @@ -107,8 +107,8 @@ explain (costs off) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial HashAggregate Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) - -> Gather - Workers Planned: 4 + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (10 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 78e554cc..6e607200 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -877,7 +877,7 @@ select * from int4_tbl where SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: a.unique1 - -> Seq Scan on public.tenk1 a + -> Index Only Scan using tenk1_unique1 on public.tenk1 a Output: a.unique1 (26 rows) @@ -1173,7 +1173,7 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) (cost=3923.54..3924.39 rows=338 width=8) -> Sort (cost=3923.54..3924.39 rows=338 width=8) @@ -1710,7 +1710,7 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..145.13 rows=112 width=8) -> Hash Semi Join (cost=120.19..145.13 rows=112 width=8) @@ -1749,31 +1749,31 @@ where t2.a = ( where t1.a = t2.a ); - QUERY PLAN + QUERY PLAN 
----------------------------------------------------------------------------------------------------------- Hash Join Hash Cond: (t2.a = "EXPR_subquery".min) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on sub_t2 t2 -> Hash - -> Hash Left Join - Hash Cond: (t1.a = "EXPR_subquery".a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t1 t1 - -> Hash - -> Subquery Scan on "EXPR_subquery" - -> HashAggregate - Group Key: t2_1.a - -> Nested Loop + -> Hash Left Join + Hash Cond: (t1.a = "EXPR_subquery".a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t1 t1 + -> Hash + -> Subquery Scan on "EXPR_subquery" + -> HashAggregate + Group Key: t2_1.a -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t2 t2_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere1 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_interfere1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_interfere2 + -> Seq Scan on sub_interfere2 (23 rows) DROP TABLE sub_t1; diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index e51bb895..691d1bb5 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -292,13 +292,17 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (never executed) + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (never executed) + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(27 rows) +(31 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index dc995cb5..18836c1e 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -391,11 +391,11 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Hash Join Output: tab1_mod.val, tab1_mod.val2 Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) - -> Remote Subquery Scan on all + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) + Filter: (tab1_mod.val > 2) -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all @@ -426,9 +426,9 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t -> Hash Join Output: tab1_mod.val, tab1_mod.val2 Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, 
tab1_mod.val2 + Filter: (tab1_mod.val > 2) -> Hash Output: tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all @@ -590,10 +590,10 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val = 1) + Filter: (tab1_mod.val = 1) -> Materialize Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all @@ -696,7 +696,7 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 @@ -704,15 +704,15 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod @@ -722,7 +722,7 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 @@ -730,15 +730,15 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, 
tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index b33bfcf0..c411847d 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -4057,12 +4057,12 @@ explain (verbose true, costs false, nodes false) select count(*) + sum(val) + av Remote Subquery Scan on all Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 Sort Key: xc_groupby_tab1.val2 - -> Sort - Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 - Sort Key: xc_groupby_tab1.val2 - -> Finalize HashAggregate - Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 - Group Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 -> Remote Subquery Scan on all Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) Distribute results by H: val2 @@ -4106,12 +4106,12 @@ explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * Remote Subquery Scan on all Output: sum(val), avg(val), (2 * val2) Sort Key: (2 * xc_groupby_tab1.val2) - -> Sort - Output: (sum(val)), (avg(val)), ((2 * val2)) - Sort Key: ((2 * xc_groupby_tab1.val2)) - -> Finalize HashAggregate - Output: sum(val), avg(val), ((2 * val2)) - Group Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) -> Remote Subquery Scan on all Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) Distribute results by H: (2 * val2) diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index 9d914a2a..dd87a084 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -611,17 +611,17 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ -> Hash Join Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) - Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) - -> Remote Subquery Scan on all - Output: xc_having_tab1.val, xc_having_tab1.val2 - Distribute results by H: val2 + Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) + -> Remote Subquery Scan on all + Output: xc_having_tab1.val, xc_having_tab1.val2 + Distribute results by H: val2 -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 -> Hash - Output: xc_having_tab2.val, xc_having_tab2.val2 - -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 - Distribute results by H: val2 + -> Remote Subquery Scan on all + Output: xc_having_tab2.val, xc_having_tab2.val2 + Distribute results by H: val2 -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 (24 rows) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index c0ccc373..31e0b077 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -164,6 +164,3 @@ test: xl_primary_key xl_foreign_key 
xl_distribution_column_types xl_alter_table # This runs TBase specific tests test: tbase_explain - -test: redistribute_custom_types -test: nestloop_by_shard From b467bf42642e489bffb86f6c6299c1c77294deb1 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 8 Jan 2021 16:34:22 +0800 Subject: [PATCH 156/578] [Bugfix] gb18030 decode failed when use prepared statement, ID84482999 (merge request !82) Squash merge branch 'Tbase_v5.05.2_bugfix_gb18030' into 'Tbase_v5.05.2' * bugfix: gb18030 decode failed when use prepared statement, ID84482999, add regress cases * bugfix: gb18030 decode failed when use prepared statement, ID84482999 (cherry picked from commit 10aa5b2b) 005c001e bugfix: gb18030 decode failed when use prepared statement, ID84482999, add regress cases 6c195741 bugfix: gb18030 decode failed when use prepared statement, ID84482999 --- src/backend/pgxc/pool/execRemote.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 52850951..5a7d8b4d 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3664,6 +3664,12 @@ pgxc_node_remote_cleanup_all(void) return; } + /* Do not cleanup connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* * Send down snapshot followed by DISCARD ALL command. */ From ebe449a5689077b3e718c69cab9c6d5b1557bc04 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 30 Mar 2021 14:39:55 +0800 Subject: [PATCH 157/578] only rewrite for distribute key --- src/backend/executor/execMain.c | 28 ++++++--------------------- src/backend/optimizer/util/pgxcship.c | 4 +++- src/test/regress/expected/prepare.out | 21 ++++++++++++++++++++ src/test/regress/sql/prepare.sql | 12 ++++++++++++ 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 9ec3add2..571f9473 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1906,13 +1906,10 @@ ExecEndPlan(PlanState *planstate, EState *estate) * which datanode will execute the sql command. After we get the result, * we should use the result to replace distribute key's function to * generate a new sql that will be shipped to datanode. - * Note: for replication table, we should caculate all the results of - * functions before ship the sql. Otherwise the value may not be same - * in different datanodes. */ static void RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, - char *distribcol, bool isreplic) + char *distribcol) { Query *query = copyObject(plan->forDeparse); ListCell *lc_deparse = NULL; @@ -1928,13 +1925,10 @@ RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, foreach(lc_deparse, query->targetList) { entry_deparse = lfirst(lc_deparse); - if (isreplic) - { - entry_deparse->expr = (Expr *)replace_distribkey_func( - (Node *)entry_deparse->expr); - find_target = true; - } - else if (strcmp(entry_deparse->resname, distribcol) == 0) + + /* Only rewrite distribute key's function. 
*/ + if (strcmp(entry_deparse->resname, distribcol) == 0 && + !pgxc_is_expr_shippable(entry_deparse->expr, NULL)) { entry_deparse->expr = (Expr *)replace_distribkey_func( (Node *)entry_deparse->expr); @@ -1988,16 +1982,6 @@ RewriteFuncNode(PlanState *planstate) if ((!exec_nodes) || (!exec_nodes->need_rewrite)) return; - /* - * For replicated table, we need to execute func - * and then ship to datanode - */ - if (IsExecNodesReplicated(exec_nodes)) - { - RewriteForSql(node, plan, NULL, true); - return; - } - if (exec_nodes->en_relid == InvalidOid || (!exec_nodes->en_expr)) return; @@ -2006,7 +1990,7 @@ RewriteFuncNode(PlanState *planstate) return; distribcol = GetRelationDistribColumn(rel_loc_info); - RewriteForSql(node, plan, distribcol, false); + RewriteForSql(node, plan, distribcol); } /* ---------------------------------------------------------------- diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index d294de0b..7edfd4be 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -2028,7 +2028,9 @@ pgxc_is_query_shippable(Query *query, int query_level) * must know the function's result before real execute. So set * the flag to identify rewrite in ExecutePlan. */ - if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability)) + if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability) && + (IsLocatorColumnDistributed(exec_nodes->baselocatortype) || + IsLocatorDistributedByValue(exec_nodes->baselocatortype))) { exec_nodes->need_rewrite = true; shippability = bms_del_member(shippability, SS_NEED_FUNC_REWRITE); diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 7dd52d9a..1e93e2ad 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -322,6 +322,27 @@ SELECT * from insert_fsq_test order by id; DEALLOCATE PREPARE ps_test_insert; DROP TABLE insert_fsq_test cascade; +-- test non-distribute key with function, no need rewrite +CREATE TABLE insert_fsq_test1(v int, w int); +CREATE SEQUENCE test_seq; +PREPARE ps_test_insert1(int) AS INSERT INTO insert_fsq_test1 values($1, nextval('test_seq')); +EXECUTE ps_test_insert1(1); +EXECUTE ps_test_insert1(2); +EXECUTE ps_test_insert1(3); +EXECUTE ps_test_insert1(4); +EXECUTE ps_test_insert1(5); +SELECT * from insert_fsq_test1 order by v; + v | w +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +DEALLOCATE PREPARE ps_test_insert1; +DROP TABLE insert_fsq_test1 cascade; -- -- gb18030 test -- diff --git a/src/test/regress/sql/prepare.sql b/src/test/regress/sql/prepare.sql index 9a465ab3..f20b20e8 100644 --- a/src/test/regress/sql/prepare.sql +++ b/src/test/regress/sql/prepare.sql @@ -171,6 +171,18 @@ EXECUTE ps_test_insert('5'); SELECT * from insert_fsq_test order by id; DEALLOCATE PREPARE ps_test_insert; DROP TABLE insert_fsq_test cascade; +-- test non-distribute key with function, no need rewrite +CREATE TABLE insert_fsq_test1(v int, w int); +CREATE SEQUENCE test_seq; +PREPARE ps_test_insert1(int) AS INSERT INTO insert_fsq_test1 values($1, nextval('test_seq')); +EXECUTE ps_test_insert1(1); +EXECUTE ps_test_insert1(2); +EXECUTE ps_test_insert1(3); +EXECUTE ps_test_insert1(4); +EXECUTE ps_test_insert1(5); +SELECT * from insert_fsq_test1 order by v; +DEALLOCATE PREPARE ps_test_insert1; +DROP TABLE insert_fsq_test1 cascade; -- -- gb18030 test From 3756cf349fbec83e4adbc25e98bdd299e3ae50f8 Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 27 Jan 2021 20:53:23 +0800 Subject: [PATCH 158/578] 
select cmd ship to datanode when single datanode --- src/backend/commands/explain.c | 86 +++++++++++++++++++++++++++ src/backend/optimizer/util/pgxcship.c | 6 ++ 2 files changed, 92 insertions(+) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 09617d73..2f7ea8e7 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -147,6 +147,7 @@ static void ExplainDummyGroup(const char *objtype, const char *labelname, static void ExplainExecNodes(ExecNodes *en, ExplainState *es); static void ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, ExplainState *es); +static char **StrSplit(const char *str, const char *delimiter, int *n); #endif static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); @@ -3493,6 +3494,74 @@ ExplainPropertyListNested(const char *qlabel, List *data, ExplainState *es) } } +/* split a string based on a delimiter */ +static char ** +StrSplit(const char *str, const char *delimiter, int *n) +{ + char *tmp = NULL; + char **rtn = NULL; + char *token = NULL; + + *n = 0; + if (!str) + return NULL; + + /* copy str to tmp as strtok will mangle the string */ + tmp = pstrdup(str); + + if (!strlen(tmp) || !delimiter || !strlen(delimiter)) + { + *n = 1; + rtn = (char **) palloc(*n * sizeof(char *)); + rtn[0] = pstrdup(tmp); + pfree(tmp); + return rtn; + } + + token = strtok(tmp, delimiter); + while (token != NULL) + { + if (*n < 1) + { + rtn = (char **) palloc(sizeof(char *)); + } + else + { + rtn = (char **) repalloc(rtn, (*n + 1) * sizeof(char *)); + } + + rtn[*n] = NULL; + rtn[*n] = pstrdup(token); + *n = *n + 1; + + token = strtok(NULL, delimiter); + } + + pfree(tmp); + return rtn; +} + +static void +DealRemoteJson(StringInfo explainResult, const char *value, int spaceLen) +{ + int i = 0; + int num = 0; + char **result = NULL; + result = StrSplit(value, "\n", &num); + for (i = 0; i < num; i++) + { + if (i > 0) + { + appendStringInfo(explainResult, "\n"); + appendStringInfoSpaces(explainResult, spaceLen); + } + appendStringInfo(explainResult, "%s", result[i]); + pfree(result[i]); + } + if (result) + pfree(result); +} + /* * Explain a simple property. 
* @@ -4007,9 +4076,18 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp value = slot_getattr(result, 1, &isnull); if (!isnull) { + if (es->format == EXPLAIN_FORMAT_JSON) + { + if (!firstline) + appendStringInfo(&explainResult, "\n"); + DealRemoteJson(&explainResult, TextDatumGetCString(value), 2 * es->indent); + } + else + { if (!firstline) appendStringInfoSpaces(&explainResult, 2 * es->indent); appendStringInfo(&explainResult, "%s\n", TextDatumGetCString(value)); + } firstline = false; } @@ -4020,10 +4098,18 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfo(es->str, "%s", explainResult.data); + else if (es->format == EXPLAIN_FORMAT_JSON) + { + appendStringInfoChar(es->str, '\n'); + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "%s: %s", "\"Remote plan\"", explainResult.data); + } else + { ExplainPropertyText("Remote plan", explainResult.data, es); } } +} #endif /* diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7edfd4be..df8aa3d5 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1916,6 +1916,12 @@ pgxc_is_query_shippable(Query *query, int query_level) pgxc_shippability_walker((Node *)query, &sc_context); exec_nodes = sc_context.sc_exec_nodes; + + /* For single datanode and select command, we ship it directly. */ + if (NumDataNodes == 1 && query->commandType == CMD_SELECT && + !bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + return exec_nodes; + /* * The shippability context contains two ExecNodes, one for the subLinks * involved in the Query and other for the relation involved in FromClause. 
From 8a94c0b1220238afb4aae28070757b732ca5f967 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 16:54:33 +0800 Subject: [PATCH 159/578] fix compile warnning:pgxc_is_expr_shippable implicit declaration --- src/backend/executor/execMain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 571f9473..7424b45a 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -88,6 +88,7 @@ #include "optimizer/planmain.h" #include "pgxc/squeue.h" #include "utils/relfilenodemap.h" +#include "optimizer/pgxcship.h" #endif #ifdef __AUDIT__ From 40179d321f20435db553fc62955b40ae8754a8fe Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 4 Jun 2021 16:40:56 +0800 Subject: [PATCH 160/578] Pooler support not raising error when fail to connect to nodes Only used in pg_stat_cluster_activity extension for now tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088425671 --- contrib/pg_clean/pg_clean.c | 3198 +++++++++++++++++ .../pg_stat_cluster_activity.c | 3 +- src/backend/pgxc/barrier/barrier.c | 626 ++-- src/backend/pgxc/cluster/pause.c | 4 +- src/backend/pgxc/pool/execRemote.c | 59 +- src/backend/pgxc/pool/pgxcnode.c | 22 +- src/backend/pgxc/pool/poolmgr.c | 74 +- src/backend/replication/logical/worker.c | 2 +- src/include/pgxc/locator.h | 1 + src/include/pgxc/pgxcnode.h | 3 +- src/include/pgxc/poolmgr.h | 3 +- 11 files changed, 3648 insertions(+), 347 deletions(-) create mode 100644 contrib/pg_clean/pg_clean.c diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c new file mode 100644 index 00000000..08a189f9 --- /dev/null +++ b/contrib/pg_clean/pg_clean.c @@ -0,0 +1,3198 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" + +#include +#include +#include +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" + +#include "access/gtm.h" +#include "datatype/timestamp.h" +#include "access/xact.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "utils/timestamp.h" +#include "catalog/pg_control.h" +#include "commands/dbcommands.h" + +#include "utils/memutils.h" +#include "nodes/memnodes.h" + +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +#include "storage/fd.h" +#include "pgstat.h" +#include "access/xact.h" +#include "access/twophase.h" +#include "access/hash.h" + +/*hash_create hash_search*/ +#include "utils/hsearch.h" + +#define TWOPHASE_RECORD_DIR "pg_2pc" +int transaction_threshold = 200000; +#define MAXIMUM_CLEAR_FILE 10000 +#define MAXIMUM_OUTPUT_FILE 1000 +#define XIDPREFIX "_$XC$" +#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ +#endif +GlobalTimestamp 
clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; + + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define GET_START_XID "startxid:" +#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" +#define GET_START_NODE "startnode:" +#define GET_NODE "nodes:" +#define GET_XID "\nxid:" +#define GET_READONLY "readonly" +#define GIDSIZE (200 + 24) +#define MAX_TWOPC_TXN 1000 +#define STRING_BUFF_LEN 1024 + +#define MAX_CMD_LENGTH 120 + +#define XIDFOUND 1 +#define XIDNOTFOUND -1 +#define XIDEXECFAIL -2 + +#define FILEFOUND 1 +#define FILEUNKOWN -1 +#define FILENOTFOUND -2 + +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define ENUM_TOCHAR_CASE(x) case x: return(#x); + +/*data structures*/ +typedef enum TXN_STATUS +{ + TXN_STATUS_INITIAL = 0, /* Initial */ + TXN_STATUS_PREPARED, + TXN_STATUS_COMMITTED, + TXN_STATUS_ABORTED, + TXN_STATUS_INPROGRESS, + TXN_STATUS_FAILED, /* Error detected while interacting with the node */ + TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ +} TXN_STATUS; + + +typedef enum +{ + UNDO = 0, + ABORT, + COMMIT +} OPERATION; + +typedef enum +{ + TWOPHASE_FILE_EXISTS = 0, + TWOPHASE_FILE_NOT_EXISTS, + TWOPHASE_FILE_OLD, + TWOPHASE_FILE_ERROR +}TWOPHASE_FILE_STATUS; + +typedef struct txn_info +{ + char gid[MAX_GID]; + uint32 *xid; /* xid used in prepare */ + TimestampTz *prepare_timestamp; + char *owner; + char *participants; + Oid origcoord; /* Original coordinator who initiated the txn */ + bool after_first_phase; + uint32 startxid; /* xid in Original coordinator */ + bool isorigcoord_part; /* Is original coordinator a + participant? */ + int num_dnparts; /* Number of participant datanodes */ + int num_coordparts; /* Number of participant coordinators */ + int *dnparts; /* Whether a node was participant in the txn */ + int *coordparts; + TXN_STATUS *txn_stat; /* Array for each nodes */ + char *msg; /* Notice message for this txn. 
*/ + GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + + TXN_STATUS global_txn_stat; + OPERATION op; + bool op_issuccess; + bool is_readonly; + bool belong_abnormal_node; +}txn_info; + +typedef struct database_info +{ + struct database_info *next; + char *database_name; + + HTAB *all_txn_info; +#if 0 + txn_info *head_txn_info; + txn_info *last_txn_info; +#endif +} database_info; + +typedef struct +{ + int index; + txn_info **txn; + int txn_count; + int txn_size; + MemoryContext mycontext; +} print_txn_info; + +typedef struct +{ + int index; + int count; + char **gid; + int gid_count; + int gid_size; + char **database; + int database_count; + int database_size; + char **global_status; + int global_status_count; + int global_status_size; + char **status; + int status_count; + int status_size; + MemoryContext mycontext; +} print_status; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +/*global variable*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num = 0; +static int dn_nodes_num = 0; +static int pgxc_clean_node_count = 0; +static Oid my_nodeoid; +static +database_info *head_database_info = NULL; +static +database_info *last_database_info = NULL; +bool execute = false; +int total_twopc_txn = 0; + +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +char *abnormal_nodename = NULL; +Oid abnormal_nodeoid = InvalidOid; +bool clear_2pc_belong_node = false; + + +/*function list*/ + /*plugin entry function*/ + +static bool check_node_health(Oid node_oid); +static Datum + execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void DestroyTxnHash(void); +static void ResetGlobalVariables(void); + +static Oid + getMyNodeoid(void); +static void + getDatabaseList(void); +static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * +Slots); +static void + getTxnInfoOnNodesAll(void); +void getTxnInfoOnNode(Oid node); +void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, + TimestampTz prepared_time, TXN_STATUS status); +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); +static txn_info * + find_txn(char *gid); +txn_info* + make_txn_info(char * dbname, char * gid, char * owner); +database_info* + find_database_info(char *database_name); +database_info* + add_database_info(char *database_name); +int find_node_index(Oid node_oid); +Oid find_node_oid(int node_idx); +void getTxnInfoOnOtherNodesAll(void); +void getTxnInfoOnOtherNodesForDatabase(database_info *database); +void getTxnInfoOnOtherNodes(txn_info *txn); +int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); + +char *get2PCInfo(const char *tid); + +void getTxnStatus(txn_info * txn, int node_idx); +void recover2PCForDatabaseAll(void); +void recover2PCForDatabase(database_info * db_info); +#if 0 +static bool + setMaintenanceMode(bool status); +#endif +bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); +bool check_2pc_belong_node(txn_info * txn); +bool check_node_participate(txn_info * txn, int node_idx); + +void recover2PC(txn_info * 
txn); +TXN_STATUS + check_txn_global_status(txn_info *txn); +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); +bool clean_2PC_files(txn_info *txn); +void Init_print_txn_info(print_txn_info *print_txn); +void Init_print_stats_all(print_status *pstatus); +void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); +static const char * + txn_status_to_string(TXN_STATUS status); +static const char * + txn_op_to_string(OPERATION op); +static void + CheckFirstPhase(txn_info *txn); +static void + get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); +static void + get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); + +Datum pg_clean_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute); +Datum pg_clean_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + + //MemoryContextDelete(print_txn->mycontext); + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +/* + * clear 2pc after oss detect abnormal node and restart it , + * only clear 2pc belong the abnormal node and before the abnormal time + */ +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clear_2pc_belong_node = true; + + abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); + if (InvalidOid == abnormal_nodeoid) + { + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + } + abnormal_time = PG_GETARG_INT64(1); + current_time = GetCurrentTimestamp(); + if (abnormal_time >= current_time) + { + elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DestroyTxnHash(); + pfree(abnormal_nodename); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + + +Datum pg_clean_check_txn(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_check_txn); +Datum pg_clean_check_txn(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_status *pstatus = NULL; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + execute = false; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + MemoryContext mycontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); + pstatus = (print_status *) funcctx->user_fctx; + pstatus->index = pstatus->count = 0; + pstatus->gid = NULL; + pstatus->global_status = pstatus->status = (char **)NULL; + pstatus->database = NULL; + pstatus->mycontext = NULL; + + + MemoryContextSwitchTo(oldcontext); + + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + Init_print_stats_all(pstatus); + + pstatus->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + pstatus = (print_status *) funcctx->user_fctx; + + if (pstatus->index < pstatus->count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); + values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); + values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); + values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + pstatus->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + /* + MemoryContextDelete(pstatus->mycontext); + DropDatabaseInfo(); + */ + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +void DestroyTxnHash(void) +{ + database_info *dbinfo = head_database_info; + while (dbinfo) + { + hash_destroy(dbinfo->all_txn_info); + dbinfo = dbinfo->next; + } +} + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + pgxc_clean_node_count = 0; + execute = false; + total_twopc_txn = 0; + + head_database_info = last_database_info = NULL; + + current_time = 0; + abnormal_time = InvalidGlobalTimestamp; + abnormal_nodename = NULL; + abnormal_nodeoid = InvalidOid; + clear_2pc_belong_node = false; + +} + +static Oid getMyNodeoid(void) +{ + return get_pgxc_nodeoid(PGXCNodeName); +} + +/* + * execute_query_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + int ii; + bool issuccess = false; + + /*check health of node*/ + bool ishealthy = check_node_health(node); + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype = PGXC_NODE_NONE; + + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char *)query; + plan->force_autocommit 
= false; + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + int i_tuple = 0; + int i_attnum = 0; + issuccess = true; + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + /*if (result->tts_values[i_attnum] != (Datum)0)*/ + if (result->tts_isnull[i_attnum] == false) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#endif + return issuccess == true ? (Datum) 1 : (Datum) 0; +} + +static bool check_node_health(Oid node_oid) +{ + int i; + bool ishealthy = false; + + PoolPingNodeRecheck(node_oid); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, + &cn_nodes_num, &dn_nodes_num, + cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node_oid) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node_oid) + { + ishealthy = dn_health_map[i]; + } + } + } + return ishealthy; +} + +static void getDatabaseList(void) +{ + int i; + TupleTableSlots result_db; + const char *query_db = "select datname::text from pg_database;"; + /*add datname into tail of head_database_info*/ + if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) + { + for (i = 0; i < result_db.slot_count; i++) + { + if (TTSgetvalue(&result_db, i, 0)) + { + add_database_info(TTSgetvalue(&result_db, i, 0)); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); + } + DropTupleTableSlots(&result_db); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * +Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +static void getTxnInfoOnNodesAll(void) +{ + int i; + current_time = GetCurrentTimestamp(); + /*upload 2PC transaction from CN*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + 
getTxnInfoOnNode(cn_node_list[i]); + } + + /*upload 2PC transaction from DN*/ + for (i = 0; i < dn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + getTxnInfoOnNode(dn_node_list[i]); + } +} + +void getTxnInfoOnNode(Oid node) +{ + int i; + TupleTableSlots result_txn; + Datum execute_res; + char query_execute[1024]; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; + snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); + + if (execute) + execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); + else + execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); + + if (execute_res == (Datum) 1) + { + for (i = 0; i < result_txn.slot_count; i++) + { + uint32 xid; + char* gid; + char* owner; + char* datname; + TimestampTz prepared_time; + + /*read results from each tuple*/ + xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); + gid = TTSgetvalue(&result_txn, i, 1); + owner = TTSgetvalue(&result_txn, i, 2); + datname = TTSgetvalue(&result_txn, i, 3); + prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + + /*add txn to database*/ + add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); + if (total_twopc_txn >= MAX_TWOPC_TXN) + { + break; + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); + } + DropTupleTableSlots(&result_txn); +} + +void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, + char * owner, TimestampTz prepared_time, TXN_STATUS status) +{ + txn_info *txn = NULL; + int nodeidx; + + if ((txn = find_txn(gid)) == NULL) + { + txn = make_txn_info(dbname, gid, owner); + total_twopc_txn++; + if (txn == NULL) + { + /*no more memory*/ + elog(ERROR, "there is no more memory for palloc a 2PC transaction"); + } + } + nodeidx = find_node_index(node_oid); + txn->txn_stat[nodeidx] = status; + txn->xid[nodeidx] = xid; + txn->prepare_timestamp[nodeidx] = prepared_time; + if (nodeidx < cn_nodes_num) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + else + { + txn->dnparts[nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + } + return; +} + +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) +{ + /*get all the participates and initiate to each transactions*/ + TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; + TupleTableSlots result; + char *partnodes = NULL; + char *startnode = NULL; + char *file_content = NULL; + uint32 startxid = 0; + char *str_startxid = NULL; + char *str_timestamp = NULL; + char *temp = NULL; + Oid temp_nodeoid; + char temp_nodetype; + int temp_nodeidx; + char stmt[1024]; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + + if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) + { + if (result.slot_count && TTSgetvalue(&result, 0, 0)) +#if 0 + TTSgetvalue(&result, 0, 0) && + TTSgetvalue(&result, 0, 1) && + TTSgetvalue(&result, 0, 2)) +#endif + { + file_content = TTSgetvalue(&result, 0, 
0); + + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) + { + txn->is_readonly = true; + txn->global_txn_stat = TXN_STATUS_COMMITTED; + DropTupleTableSlots(&result); + return TWOPHASE_FILE_EXISTS; + } + startnode = strstr(file_content, GET_START_NODE); + str_startxid = strstr(file_content, GET_START_XID); + partnodes = strstr(file_content, GET_NODE); + temp = strstr(file_content, GET_COMMIT_TIMESTAMP); + + /* get the last global_commit_timestamp */ + while (temp) + { + str_timestamp = temp; + temp += strlen(GET_COMMIT_TIMESTAMP); + temp = strstr(temp, GET_COMMIT_TIMESTAMP); + } + + if (startnode) + { + startnode += strlen(GET_START_NODE); + startnode = strtok(startnode, "\n"); + txn->origcoord = get_pgxc_nodeoid(startnode); + } + + if (str_startxid) + { + str_startxid += strlen(GET_START_XID); + str_startxid = strtok(str_startxid, "\n"); + startxid = strtoul(str_startxid, NULL, 10); + txn->startxid = startxid; + } + + if (partnodes) + { + partnodes += strlen(GET_NODE); + partnodes = strtok(partnodes, "\n"); + txn->participants = (char *) palloc0(strlen(partnodes) + 1); + strncpy(txn->participants, partnodes, strlen(partnodes) + 1); + } + + if (NULL == startnode || NULL == str_startxid) + { + res = TWOPHASE_FILE_OLD; + DropTupleTableSlots(&result); + return res; + } + + if (NULL == partnodes) + { + res = TWOPHASE_FILE_ERROR; + DropTupleTableSlots(&result); + return res; + } + + if (str_timestamp) + { + str_timestamp += strlen(GET_COMMIT_TIMESTAMP); + str_timestamp = strtok(str_timestamp, "\n"); + txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); + } + + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); + /* in explicit transaction startnode participate the transaction */ + if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) + { + txn->isorigcoord_part = true; + } + else + { + txn->isorigcoord_part = false; + } + + res = TWOPHASE_FILE_EXISTS; + txn->num_coordparts = 0; + txn->num_dnparts = 0; + temp = strtok(partnodes,", "); + while(temp) + { + /*check node type*/ + temp_nodeoid = get_pgxc_nodeoid(temp); + if (temp_nodeoid == InvalidOid) + { + res = TWOPHASE_FILE_ERROR; + break; + } + temp_nodetype = get_pgxc_nodetype(temp_nodeoid); + temp_nodeidx = find_node_index(temp_nodeoid); + + switch (temp_nodetype) + { + case 'C': + txn->coordparts[temp_nodeidx] = 1; + txn->num_coordparts++; + break; + case 'D': + txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + break; + default: + elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); + break; + } + temp = strtok(NULL,", "); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); + res = TWOPHASE_FILE_ERROR; + } + DropTupleTableSlots(&result); + return res; +} + +static txn_info *find_txn(char *gid) +{ + bool found; + database_info *cur_db; + txn_info *txn; + + for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) + { +#if 0 + for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + if (0 == strcmp(cur_txn->gid, gid)) + return cur_txn; + } +#endif + txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); + if (found) + return txn; + } + return NULL; +} + +txn_info* make_txn_info(char* dbname, char* gid, char* owner) +{ + bool found; + txn_info *txn_insert_pos = NULL; + database_info 
*dbinfo; + txn_info *txn; + + dbinfo = add_database_info(dbname); + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + return NULL; + //txn->next = NULL; + + //txn->gid = (char *)palloc0(strlen(gid)+1); + strncpy(txn->gid, gid, strlen(gid)+1); + txn->owner = (char *)palloc0(strlen(owner)+1); + strncpy(txn->owner, owner, strlen(owner)+1); + + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL + || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) + { + pfree(txn); + return(NULL); + } + + txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, + (void *)txn->gid, HASH_ENTER, &found); + if (!found) + memcpy(txn_insert_pos, txn, sizeof(txn_info)); + +#if 0 + if (dbinfo->head_txn_info == NULL) + { + dbinfo->head_txn_info = dbinfo->last_txn_info = txn; + } + else + { + dbinfo->last_txn_info->next = txn; + dbinfo->last_txn_info = txn; + } +#endif + + return txn_insert_pos; +} + +database_info *find_database_info(char *database_name) +{ + database_info *cur_database_info = head_database_info; + + for (;cur_database_info; cur_database_info = cur_database_info->next) + { + if(cur_database_info->database_name && + database_name && + strcmp(cur_database_info->database_name, database_name) == 0) + return(cur_database_info); + } + return(NULL); +} + +database_info *add_database_info(char *database_name) +{ + database_info *rv; + HASHCTL txn_ctl; + char tabname[STRING_BUFF_LEN]; + + if ((rv = find_database_info(database_name)) != NULL) + return rv; /* Already in the list */ + rv = (database_info *)palloc0(sizeof(database_info)); + if (rv == NULL) + return NULL; + rv->next = NULL; + rv->database_name = (char *)palloc0(strlen(database_name) + 1); + strncpy(rv->database_name, database_name, strlen(database_name) + 1); + if (rv->database_name == NULL) + { + pfree(rv); + return NULL; + } +#if 0 + rv->head_txn_info = NULL; + rv->last_txn_info = NULL; +#endif + + snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); + txn_ctl.keysize = MAX_GID; + txn_ctl.entrysize = sizeof(txn_info); + rv->all_txn_info = hash_create(tabname, 64, + &txn_ctl, HASH_ELEM); + if (head_database_info == NULL) + { + head_database_info = last_database_info = rv; + return rv; + } + else + { + last_database_info->next = rv; + last_database_info = rv; + return rv; + } +} + +int find_node_index(Oid node_oid) +{ + int res = -1; + int i; + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (node_oid == cn_node_list[i]) + { + res = i; + break; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (node_oid == dn_node_list[i]) + { + res = i+cn_nodes_num; + break; + } + } + } + return res; +} + +Oid find_node_oid(int node_idx) +{ + return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : + dn_node_list[node_idx-cn_nodes_num]; +} + +void getTxnInfoOnOtherNodesAll(void) +{ + database_info *cur_database; + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + getTxnInfoOnOtherNodesForDatabase(cur_database); + } +} + +void getTxnInfoOnOtherNodesForDatabase(database_info *database) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = database->all_txn_info; + hash_seq_init(&status, txn); + + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#if 0 + for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#endif +} + +void getTxnInfoOnOtherNodes(txn_info *txn) +{ + int ii; + int ret; + char node_type; + TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; + Oid node_oid; + uint32 transactionid = 0; + char gid[MAX_GID]; + char *ptr = NULL; + + if (IsXidImplicit(txn->gid)) + { + strncpy(gid, txn->gid, strlen(txn->gid)+1); + ptr = strtok(gid, ":"); + ptr = strtok(NULL, ":"); + node_oid = get_pgxc_nodeoid(ptr); + status = GetTransactionPartNodes(txn, node_oid); + } + else + { + for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) + { + if (ii < cn_nodes_num) + { + status = GetTransactionPartNodes(txn, cn_node_list[ii]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = cn_node_list[ii]; + break; + } + } + else + { + status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = dn_node_list[ii - cn_nodes_num]; + break; + } + } + } + + /* since there may be explicit readonly twophase transactions */ + if (txn->is_readonly) + { + return; + } + if (TWOPHASE_FILE_EXISTS == status && + InvalidGlobalTimestamp == txn->global_commit_timestamp && + node_oid != txn->origcoord) + { + status = GetTransactionPartNodes(txn, txn->origcoord); + } + + } + + if (TWOPHASE_FILE_EXISTS != status) + { + /* + * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, + * + */ + txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
+ TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; + return; + } + + + /* judge the range of global status */ + CheckFirstPhase(txn); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + { + /*check node ii is 'C' or 'D'*/ + node_oid = find_node_oid(ii); + if (node_oid == txn->origcoord) + continue; + node_type = get_pgxc_nodetype(node_oid); + if (node_type == 'C' && txn->coordparts[ii] != 1) + continue; + if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) + continue; + /*check coordparts or dnparts*/ + if (txn->xid[ii] == 0) + { + ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); + if (ret == XIDFOUND) + { + txn->xid[ii] = transactionid; + if (txn->xid[ii] > 0) + getTxnStatus(txn, ii); + } + else if (ret == XIDNOTFOUND) + { + if (txn->after_first_phase) + txn->txn_stat[ii] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; + + } + } + } +} + +/*get xid by gid on node_oid*/ +int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) +{ + int ret = XIDFOUND; + TupleTableSlots result; + uint32 xid = 0; + static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (TTSgetvalue(&result, 0, 0)) + { + xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); + *transactionid = xid; + if (xid == 0) + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDEXECFAIL; + DropTupleTableSlots(&result); + return ret; +} + +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) +{ + int ret = FILEFOUND; + TupleTableSlots result; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (!TTSgetvalue(&result, 0, 0)) + { + ret = FILENOTFOUND; + } + else + { + ret = FILEFOUND; + } + } + else + ret = FILENOTFOUND; + } + else + ret = FILEUNKOWN; + DropTupleTableSlots(&result); + return ret; +} + + +void getTxnStatus(txn_info *txn, int node_idx) +{ + Oid node_oid; + char stmt[1024]; + char *att1; + TupleTableSlots result; + + static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + + node_oid = find_node_oid(node_idx); + if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) + { + att1 = TTSgetvalue(&result, 0, 0); + + if (att1) + { + if (strcmp(att1, "true") == 0) + { + txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; + } + else + { + txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; + } + } + else + txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; + DropTupleTableSlots(&result); +} + +char *get2PCInfo(const char *tid) +{ + char *result = NULL; + char *info = NULL; + int size = 0; + File fd = -1; + int ret = -1; + struct stat filestate; + char path[MAXPGPATH]; + + info = get_2pc_info_from_cache(tid); + if (NULL != info) + { + size = strlen(info); + result = (char *)palloc0(size + 1); + memcpy(result, info, size); + return result; + } + + elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + 
if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + size = filestate.st_size; + + if (0 == size) + { + return NULL; + } + + result = (char *)palloc0(size + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + pfree(result); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + pfree(result); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + return result; + } + + return NULL; +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + t_result = cstring_to_text(result); + pfree(result); + return PointerGetDatum(t_result); + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + nodename = strstr(result, GET_NODE); + if (NULL != nodename) + { + nodename += strlen(GET_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + pfree(result); + return PointerGetDatum(t_result); + } + } + + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + nodename = strstr(result, GET_START_NODE); + if (NULL != nodename) + { + nodename += strlen(GET_START_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + pfree(result); + return PointerGetDatum(t_result); + + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *startxid = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + startxid = strstr(result, GET_START_XID); + if (NULL != startxid) + { + startxid += strlen(GET_START_XID); + startxid = strtok(startxid, "\n"); + t_result = cstring_to_text(startxid); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *commit_timestamp = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); + if (NULL != commit_timestamp) + { + commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); + commit_timestamp = 
strtok(commit_timestamp, "\n"); + t_result = cstring_to_text(commit_timestamp); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + + +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *str_xid = NULL; + GlobalTransactionId xid; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + str_xid = strstr(result, GET_XID); + if (NULL != str_xid) + { + str_xid += strlen(GET_XID); + str_xid = strtok(str_xid, "\n"); + xid = strtoul(str_xid, NULL, 10); + pfree(result); + PG_RETURN_UINT32(xid); + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) +{ + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + remove_2pc_records(tid, true); + pfree(tid); + PG_RETURN_BOOL(true); +} + +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) +{ + MemoryContext oldcontext; + MemoryContext mycontext; + + int i = 0; + int count = 0; + TupleTableSlots *result; + TupleTableSlots clear_result; + const char *query = "select pgxc_get_record_list()::text"; + const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; + char clear_query[100]; + char *twopcfiles = NULL; + char *ptr = NULL; + bool res = true; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + mycontext = AllocSetContextCreate(CurrentMemoryContext, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); + + /*collect the 2pc file in nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + } + + for (i = 0; i < dn_nodes_num; i++) + { + (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + } + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } + + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (0 == result[i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + 
{ + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < dn_nodes_num; i++) + { + if (0 == result[cn_nodes_num+i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < pgxc_clean_node_count; i++) + DropTupleTableSlots(result+i); + + DestroyTxnHash(); + ResetGlobalVariables(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mycontext); + + + PG_RETURN_BOOL(res); +} + +Datum pgxc_get_record_list(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_record_list); +Datum pgxc_get_record_list(PG_FUNCTION_ARGS) +{ + int count = 0; + DIR *dir = NULL; + struct dirent *ptr = NULL; + char *recordList = NULL; + text *t_recordList = NULL; + + /* get from hash table */ + recordList = get_2pc_list_from_cache(&count); + if (count >= MAXIMUM_OUTPUT_FILE) + { + Assert(NULL != recordList); + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + /* get from disk */ + if(!(dir = opendir(TWOPHASE_RECORD_DIR))) + { + if(NULL == recordList) + { + PG_RETURN_NULL(); + } + + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + while((ptr = readdir(dir)) != NULL) + { + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + if (count >= MAXIMUM_OUTPUT_FILE) + { + break; + } + + if(!recordList) + { + recordList = (char *)palloc0(strlen(ptr->d_name) + 1); + sprintf(recordList, "%s", ptr->d_name); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(ptr->d_name) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, ptr->d_name); + } + count++; + } + + closedir(dir); + + if(!recordList) + { + PG_RETURN_NULL(); + } + else + { + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } +} + +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_commit_on_node); +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[MAX_CMD_LENGTH]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info 
*)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + getTxnInfoOnOtherNodes(txn); + snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); + + + if (InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + if (!txn->is_readonly) + { + elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); + } + else + { + txn->global_commit_timestamp = GetGlobalTimestampGTM(); + } + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_abort_on_node); +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[MAX_CMD_LENGTH]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num 
* sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + getTxnInfoOnOtherNodes(txn); + snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); +#if 0 + if (!setMaintenanceMode(true)) + { + elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); + } +#endif + + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + + + +void recover2PCForDatabaseAll(void) +{ + database_info *cur_db = head_database_info; + while (cur_db) + { + recover2PCForDatabase(cur_db); + cur_db = cur_db->next; + } + //clean_old_2PC_files(); +} + +void recover2PCForDatabase(database_info * db_info) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = db_info->all_txn_info; + + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + recover2PC(cur_txn); + } +} + +bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) +{ +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_SEND_QUERY >= twophase_exception_case) + { + twophase_in = IN_PG_CLEAN; + } +#endif + if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && + TXN_STATUS_COMMITTED == txn->global_txn_stat && + !txn->is_readonly) + return false; + + if (pgxc_node_send_clean(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + if (txn->is_readonly && pgxc_node_send_readonly(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + return false; + } + + /* + * only transaction finished in commit prepared/rollback prepared phase send timestamp + * partial prepared transaction has no need to send other information + */ + if (InvalidGlobalTimestamp != txn->global_commit_timestamp && + pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send global committs for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + if (!txn->is_readonly) + { + if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start node for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start xid for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send participants for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send query for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + return true; +} + +bool check_2pc_belong_node(txn_info * txn) +{ + int node_index = 0; + char node_type; + node_index = find_node_index(abnormal_nodeoid); + if (abnormal_nodeoid == txn->origcoord) + { + txn->belong_abnormal_node = true; + return true; + } + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'C' && txn->coordparts[node_index] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + txn->belong_abnormal_node = false; + return false; +} + +bool check_node_participate(txn_info * txn, int node_idx) +{ + char node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (PGXC_NODE_COORDINATOR == node_type) + { + return txn->coordparts[node_idx] == 1 ? true : false; + } else if (PGXC_NODE_DATANODE == node_type) + { + return txn->dnparts[node_idx] == 1 ? 
true : false; + } + return false; +} + +void recover2PC(txn_info * txn) +{ + TXN_STATUS txn_stat; + txn_stat = check_txn_global_status(txn); + txn->global_txn_stat = txn_stat; + +#ifdef DEBUG_EXECABORT + txn_stat = TXN_STATUS_ABORTED; +#endif + + switch (txn_stat) + { + case TXN_STATUS_FAILED: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_UNKNOWN: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_PREPARED: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_COMMITTED: + if (InvalidOid == txn->origcoord || txn->is_readonly) + { + txn->op = UNDO; + txn->op_issuccess = true; + } + else + { + txn->op = COMMIT; + /* check whether all nodes can commit prepared */ + if (!clean_2PC_iscommit(txn, true, true)) + { + txn->op_issuccess = false; + elog(LOG, "check commit 2PC transaction %s failed", txn->gid); + return; + } + /* send commit prepared to all nodes */ + if (!clean_2PC_iscommit(txn, true, false)) + { + txn->op_issuccess = false; + elog(LOG, "commit 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + } + break; + + case TXN_STATUS_ABORTED: + txn->op = ABORT; + /* check whether all nodes can rollback prepared */ + if (!clean_2PC_iscommit(txn, false, true)) + { + txn->op_issuccess = false; + elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); + return; + } + /* send rollback prepared to all nodes */ + if (!clean_2PC_iscommit(txn, false, false)) + { + txn->op_issuccess = false; + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + break; + + case TXN_STATUS_INPROGRESS: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + default: + elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); + break; + } + return; +} + +TXN_STATUS check_txn_global_status(txn_info *txn) +{ +#define TXN_PREPARED 0x0001 +#define TXN_COMMITTED 0x0002 +#define TXN_ABORTED 0x0004 +#define TXN_UNKNOWN 0x0008 +#define TXN_INITIAL 0x0010 +#define TXN_INPROGRESS 0X0020 + int ii; + int check_flag = 0; + int node_idx = 0; + TimestampTz prepared_time = 0; + TimestampTz time_gap = clean_time_interval; + + if (!IsXidImplicit(txn->gid) && txn->is_readonly) + { + return TXN_STATUS_COMMITTED; + } + if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) + { + check_flag |= TXN_UNKNOWN; + } + if (txn->global_txn_stat == TXN_STATUS_ABORTED) + { + check_flag |= TXN_ABORTED; + } + + /*check dn participates*/ + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (txn->dnparts[ii] == 1) + { + if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) + check_flag |= TXN_INITIAL; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
+ txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; + } + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + /*check cn participates*/ + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (txn->coordparts[ii] == 1) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + check_flag |= TXN_ABORTED; + else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii] > prepared_time ? + txn->prepare_timestamp[ii] : prepared_time; + } + else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + + /* + * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not + * if not, check the commit timestamp explicit trans within the time_gap or not + */ +#if 0 + if ((check_flag & TXN_INPROGRESS) || + (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || + (!IsXidImplicit(txn->gid) && + ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || + (txn->after_first_phase && + (InvalidGlobalTimestamp != commit_time && + current_time - commit_time <= time_gap))))) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } +#endif + if (clear_2pc_belong_node) + { + node_idx = find_node_index(abnormal_nodeoid); + if (!check_2pc_belong_node(txn) || + !check_node_participate(txn, node_idx) || + abnormal_time < txn->prepare_timestamp[node_idx]) + { + return TXN_STATUS_INPROGRESS; + } + } + else + { + if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } + } + + + if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) + { + return TXN_STATUS_PREPARED; + } + + if (check_flag & TXN_UNKNOWN) + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) + /* Mix of committed and aborted. This should not happen. */ + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_PREPARED) == 0) + /* Should be at least one "prepared statement" in nodes */ + return TXN_STATUS_FAILED; + + if (check_flag & TXN_COMMITTED) + /* Some 2PC transactions are committed. Need to commit others. */ + return TXN_STATUS_COMMITTED; + /* All the transactions remain prepared. No need to recover. 
*/ + return TXN_STATUS_ABORTED; +} + +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) +{ + int ii; + static const char *STMT_FORM = "%s prepared '%s';"; + static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; + char command[MAX_CMD_LENGTH]; + int node_idx; + Oid node_oid; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + + if (is_commit) + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); + } + } + else + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); + } + } + if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); + if (connections == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory for connections"))); + } + get_transaction_handles(&pgxc_handles, txn); + //pgxc_handles = get_handles(nodelist, coordlist, false, true); +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count = 0; + } +#endif + for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) + { + node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->datanode_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + } + + for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) + { + node_oid = pgxc_handles->coord_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if 
(pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + if (enable_distri_print) + { + for (ii = 0; ii < conn_count; ii++) + { + if (DN_CONNECTION_STATE_IDLE != connections[ii]->state) + { + elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); + } + } + } + conn_count = 0; + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + + /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ + if (txn->origcoord != InvalidOid) + { + node_idx = find_node_index(txn->origcoord); + if (txn->coordparts[node_idx] == 1) + { + /*send global timestamp to dn_node_list[ii]*/ + + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) + { + get_node_handles(&pgxc_handles, txn->origcoord); + if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[0]; + } + } + } + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*free hash record from gtm*/ + FinishGIDGTM(txn->gid); + + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + return true; +} + +bool clean_2PC_files(txn_info * txn) +{ + int ii; + TupleTableSlots result; + bool issuccess = true; + static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; + char query[MAX_CMD_LENGTH]; + + snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); + + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->dnparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->coordparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + 
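/*
 * Reaching this point means "select pgxc_remove_2pc_records('<gid>')::text"
 * completed successfully on every datanode and coordinator; any failure above
 * has already been logged for the offending node and returned false early.
 */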
return true; +} + +void Init_print_txn_info(print_txn_info * print_txn) +{ + database_info *cur_database = head_database_info; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + print_txn->index = 0; + INIT(print_txn->txn); + + for (; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + if (clear_2pc_belong_node && !cur_txn->belong_abnormal_node) + { + continue; + } + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } + +#if 0 + cur_txn = cur_database->head_txn_info; + for (; cur_txn; cur_txn = cur_txn->next) + { + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } +#endif + } +} + +void Init_print_stats_all(print_status *pstatus) +{ + database_info *cur_database; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + pstatus->index = 0; + pstatus->count = 0; + INIT(pstatus->gid); + INIT(pstatus->global_status); + INIT(pstatus->status); + INIT(pstatus->database); + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#if 0 + for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#endif + } +} + +void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) +{ + int ii; + StringInfoData query; + initStringInfo(&query); + + RPALLOC(pstatus->gid); + RPALLOC(pstatus->global_status); + RPALLOC(pstatus->status); + RPALLOC(pstatus->database); + + pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + + strncpy(pstatus->gid[pstatus->count], txn->gid, 100); + strncpy(pstatus->database[pstatus->count], database, 100); + strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), + txn_status_to_string(txn->txn_stat[ii])); + if (ii < pgxc_clean_node_count - 1) + { + appendStringInfoChar(&query, '\n'); + } + } + + pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); + strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); + pstatus->gid_count++; + pstatus->database_count++; + pstatus->global_status_count++; + pstatus->status_count++; + pstatus->count++; +} + +static const char *txn_status_to_string(TXN_STATUS status) +{ + switch (status) + { + ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) + ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) + ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) + ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) + ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) + ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) + ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) + } + return NULL; +} + +static const char 
*txn_op_to_string(OPERATION op) +{ + switch (op) + { + ENUM_TOCHAR_CASE(UNDO) + ENUM_TOCHAR_CASE(ABORT) + ENUM_TOCHAR_CASE(COMMIT) + } + return NULL; +} + + +static void +CheckFirstPhase(txn_info *txn) +{ +// int ret; + Oid orignode = txn->origcoord; + uint32 startxid = txn->startxid; +// uint32 transactionid; + int nodeidx; + + /* + * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. + */ + if (InvalidOid == orignode) + { + return; + } + nodeidx = find_node_index(orignode); + if (0 == txn->xid[nodeidx]) + { + txn->xid[nodeidx] = startxid; + } + /* start node participate */ + if (txn->isorigcoord_part) + { + if (0 == txn->coordparts[nodeidx]) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) + { + /*select * from pgxc_is_committed...*/ + getTxnStatus(txn, nodeidx); + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } + } + /* start node node participate */ + else + { +#if 0 + ret = Get2PCFile(orignode, txn->gid, &transactionid); + if (ret == FILENOTFOUND) + txn->after_first_phase = false; + else if (ret == FILEUNKOWN) + txn->global_txn_stat = TXN_STATUS_UNKNOWN; + else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) + txn->after_first_phase = true; +#endif + if (txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } else { + txn->after_first_phase = false; + } + } +} + +void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) +{ + int dn_index = 0; + int cn_index = 0; + int nodeIndex; + char nodetype; + List *coordlist = NIL; + List *nodelist = NIL; + + while (dn_index < dn_nodes_num) + { + + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) + { + dn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(dn_node_list[dn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_DATANODE) + { + nodelist = lappend_int(nodelist, nodeIndex); + } + dn_index++; + + } + + while (cn_index < cn_nodes_num) + { + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) + { + cn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(cn_node_list[cn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + cn_index++; + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + +void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) +{ + char nodetype = PGXC_NODE_NONE; + int nodeIndex; + List *coordlist = NIL; + List *nodelist = NIL; + + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + 
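/* any node that is not a coordinator goes on the datanode list */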
else + { + nodelist = lappend_int(nodelist, nodeIndex); + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index ff748ae0..5efecaf6 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -526,10 +526,11 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta plan->exec_type = EXEC_ON_ALL_NODES; plan->sql_statement = (char *) query; plan->force_autocommit = false; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_nodes->missing_ok = true; if (coordonly) { - plan->exec_nodes = makeNode(ExecNodes); plan->exec_nodes->nodeList = GetAllCoordNodes(); plan->exec_type = EXEC_ON_COORDS; } diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index c73a13dd..a4ea113c 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -2,14 +2,14 @@ * * barrier.c * - * Barrier handling for PITR + * Barrier handling for PITR * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * * IDENTIFICATION - * $$ + * $$ * *------------------------------------------------------------------------- */ @@ -53,25 +53,25 @@ static void EndBarrier(PGXCNodeAllHandles *handles, const char *id); void ProcessCreateBarrierPrepare(const char *id) { - StringInfoData buf; + StringInfoData buf; - if (!IS_PGXC_REMOTE_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER PREPARE message is expected to " - "arrive at a Coordinator from another Coordinator"))); + if (!IS_PGXC_REMOTE_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER PREPARE message is expected to " + "arrive at a Coordinator from another Coordinator"))); - LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); - /* - * TODO Start a timer to terminate the pending barrier after a specified - * timeout - */ + /* + * TODO Start a timer to terminate the pending barrier after a specified + * timeout + */ } /* @@ -81,24 +81,24 @@ ProcessCreateBarrierPrepare(const char *id) void ProcessCreateBarrierEnd(const char *id) { - StringInfoData buf; + StringInfoData buf; - if (!IS_PGXC_REMOTE_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER END message is expected to " - "arrive at a Coordinator from another Coordinator"))); + if (!IS_PGXC_REMOTE_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER END message is expected to " + "arrive at a Coordinator from another Coordinator"))); - LWLockRelease(BarrierLock); + LWLockRelease(BarrierLock); - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); - /* - * TODO Stop the timer - */ + /* + * TODO Stop the timer + */ } /* @@ -109,186 +109,186 @@ ProcessCreateBarrierEnd(const char *id) void ProcessCreateBarrierExecute(const char *id) { - StringInfoData buf; - - if (!IsConnFromCoord()) - ereport(ERROR, - 
(errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER EXECUTE message is expected to " - "arrive from a Coordinator"))); - { - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterData((char *) &id, strlen(id) + 1); - recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); - XLogFlush(recptr); - } - - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + StringInfoData buf; + + if (!IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER EXECUTE message is expected to " + "arrive from a Coordinator"))); + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &id, strlen(id) + 1); + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); + XLogFlush(recptr); + } + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); } static const char * generate_barrier_id(const char *id) { - char genid[1024]; - TimestampTz ts; + char genid[1024]; + TimestampTz ts; - /* - * If the caller can passed a NULL value, generate an id which is - * guaranteed to be unique across the cluster. We use a combination of - * the Coordinator node id and current timestamp. - */ + /* + * If the caller can passed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We use a combination of + * the Coordinator node id and current timestamp. + */ - if (id) - return id; + if (id) + return id; - ts = GetCurrentTimestamp(); + ts = GetCurrentTimestamp(); #ifdef HAVE_INT64_TIMESTAMP - sprintf(genid, "%s_"INT64_FORMAT, PGXCNodeName, ts); + sprintf(genid, "%s_"INT64_FORMAT, PGXCNodeName, ts); #else - sprintf(genid, "%s_%.0f", PGXCNodeName, ts); + sprintf(genid, "%s_%.0f", PGXCNodeName, ts); #endif - return pstrdup(genid); + return pstrdup(genid); } static PGXCNodeAllHandles * SendBarrierPrepareRequest(List *coords, const char *id) { - PGXCNodeAllHandles *coord_handles; - int conn; - int msglen; - int barrier_idlen; + PGXCNodeAllHandles *coord_handles; + int conn; + int msglen; + int barrier_idlen; - coord_handles = get_handles(NIL, coords, true, true); + coord_handles = get_handles(NIL, coords, true, true, true); - for (conn = 0; conn < coord_handles->co_conn_count; conn++) - { - PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " - "to the node"))); + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); - barrier_idlen = strlen(id) + 1; + barrier_idlen = strlen(id) + 1; - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + 
(errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } + pgxc_node_flush(handle); + } - return coord_handles; + return coord_handles; } static void CheckBarrierCommandStatus(PGXCNodeAllHandles *conn_handles, const char *id, - const char *command) + const char *command) { - int conn; - int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; - - elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); - - for (conn = 0; conn < count; conn++) - { - PGXCNodeHandle *handle; - - if (conn < conn_handles->co_conn_count) - handle = conn_handles->coord_handles[conn]; - else - handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; - - if (pgxc_node_receive(1, &handle, NULL)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to receive response from the remote side"))); - - if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER PREPARE command failed " - "with error %s", handle->error))); - } - - elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " - "all nodes", id, command); + int conn; + int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; + + elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); + + for (conn = 0; conn < count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + if (pgxc_node_receive(1, &handle, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive response from the remote side"))); + + if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER PREPARE command failed " + "with error %s", handle->error))); + } + + elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " + "all nodes", id, command); } static void SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) { - int conn; - int msglen; - int barrier_idlen; + int conn; + int msglen; + int barrier_idlen; - elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all Coordinators", id); + elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all Coordinators", id); - for (conn = 0; conn < coord_handles->co_conn_count; conn++) - { - PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - 
ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " - "to the node"))); + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); - barrier_idlen = strlen(id) + 1; + barrier_idlen = strlen(id) + 1; - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + pgxc_node_flush(handle); + } } @@ -306,35 +306,35 @@ SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) static PGXCNodeAllHandles * PrepareBarrier(const char *id) { - PGXCNodeAllHandles *coord_handles; + PGXCNodeAllHandles *coord_handles; - elog(DEBUG2, "Preparing Coordinators for BARRIER"); + elog(DEBUG2, "Preparing Coordinators for BARRIER"); - /* - * Send a CREATE BARRIER PREPARE message to all the Coordinators. We should - * send an asynchronous request so that we can disable local commits and - * then wait for the remote Coordinators to finish the work - */ - coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); + /* + * Send a CREATE BARRIER PREPARE message to all the Coordinators. We should + * send an asynchronous request so that we can disable local commits and + * then wait for the remote Coordinators to finish the work + */ + coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); - /* - * Disable local commits - */ - LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + /* + * Disable local commits + */ + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - elog(DEBUG2, "Disabled 2PC commits originating at the driving Coordinator"); + elog(DEBUG2, "Disabled 2PC commits originating at the driving Coordinator"); - /* - * TODO Start a timer to cancel the barrier request in case of a timeout - */ + /* + * TODO Start a timer to cancel the barrier request in case of a timeout + */ - /* - * Local in-flight commits are now over. Check status of the remote - * Coordinators - */ - CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); + /* + * Local in-flight commits are now over. 
Check status of the remote + * Coordinators + */ + CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); - return coord_handles; + return coord_handles; } /* @@ -344,80 +344,80 @@ PrepareBarrier(const char *id) static void ExecuteBarrier(const char *id) { - List *barrierDataNodeList = GetAllDataNodes(); - List *barrierCoordList = GetAllCoordNodes(); - PGXCNodeAllHandles *conn_handles; - int conn; - int msglen; - int barrier_idlen; - - conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true); - - elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " - "Datanodes and Coordinator", id); - /* - * Send a CREATE BARRIER request to all the Datanodes and the Coordinators - */ - for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) - { - PGXCNodeHandle *handle; - - if (conn < conn_handles->co_conn_count) - handle = conn_handles->coord_handles[conn]; - else - handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; - - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER EXECUTE request " - "to the node"))); - - barrier_idlen = strlen(id) + 1; - - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } - - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; - - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; - - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; - - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } - - CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); - - pfree_pgxc_all_handles(conn_handles); - - /* - * Also WAL log the BARRIER locally and flush the WAL buffers to disk - */ - { - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterData((char *) &id, strlen(id) + 1); - - recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); - XLogFlush(recptr); - } + List *barrierDataNodeList = GetAllDataNodes(); + List *barrierCoordList = GetAllCoordNodes(); + PGXCNodeAllHandles *conn_handles; + int conn; + int msglen; + int barrier_idlen; + + conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true, true); + + elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " + "Datanodes and Coordinator", id); + /* + * Send a CREATE BARRIER request to all the Datanodes and the Coordinators + */ + for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER EXECUTE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if 
(ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + pgxc_node_flush(handle); + } + + CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); + + pfree_pgxc_all_handles(conn_handles); + + /* + * Also WAL log the BARRIER locally and flush the WAL buffers to disk + */ + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &id, strlen(id) + 1); + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); + XLogFlush(recptr); + } } /* @@ -426,70 +426,70 @@ ExecuteBarrier(const char *id) static void EndBarrier(PGXCNodeAllHandles *prepared_handles, const char *id) { - /* Resume 2PC locally */ - LWLockRelease(BarrierLock); + /* Resume 2PC locally */ + LWLockRelease(BarrierLock); - SendBarrierEndRequest(prepared_handles, id); + SendBarrierEndRequest(prepared_handles, id); - CheckBarrierCommandStatus(prepared_handles, id, "END"); + CheckBarrierCommandStatus(prepared_handles, id, "END"); } void RequestBarrier(const char *id, char *completionTag) { - PGXCNodeAllHandles *prepared_handles; - const char *barrier_id; - - elog(DEBUG2, "CREATE BARRIER request received"); - /* - * Ensure that we are a Coordinator and the request is not from another - * coordinator - */ - if (!IS_PGXC_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER command must be sent to a Coordinator"))); - - if (IsConnFromCoord()) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER command is not expected from another Coordinator"))); - - /* - * Get a barrier id if the user has not supplied it - */ - barrier_id = generate_barrier_id(id); - - elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); - - /* - * Step One. Prepare all Coordinators for upcoming barrier request - */ - prepared_handles = PrepareBarrier(barrier_id); - - /* - * Step two. Issue BARRIER command to all involved components, including - * Coordinators and Datanodes - */ - ExecuteBarrier(barrier_id); - - /* - * Step three. Inform Coordinators about a successfully completed barrier - */ - EndBarrier(prepared_handles, barrier_id); - /* Finally report the barrier to GTM to backup its restart point */ - ReportBarrierGTM(barrier_id); - - /* Free the handles */ - pfree_pgxc_all_handles(prepared_handles); - - if (completionTag) - snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); + PGXCNodeAllHandles *prepared_handles; + const char *barrier_id; + + elog(DEBUG2, "CREATE BARRIER request received"); + /* + * Ensure that we are a Coordinator and the request is not from another + * coordinator + */ + if (!IS_PGXC_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command must be sent to a Coordinator"))); + + if (IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command is not expected from another Coordinator"))); + + /* + * Get a barrier id if the user has not supplied it + */ + barrier_id = generate_barrier_id(id); + + elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); + + /* + * Step One. 
Prepare all Coordinators for upcoming barrier request + */ + prepared_handles = PrepareBarrier(barrier_id); + + /* + * Step two. Issue BARRIER command to all involved components, including + * Coordinators and Datanodes + */ + ExecuteBarrier(barrier_id); + + /* + * Step three. Inform Coordinators about a successfully completed barrier + */ + EndBarrier(prepared_handles, barrier_id); + /* Finally report the barrier to GTM to backup its restart point */ + ReportBarrierGTM(barrier_id); + + /* Free the handles */ + pfree_pgxc_all_handles(prepared_handles); + + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); } void barrier_redo(XLogReaderState *record) { - /* Nothing to do */ - return; + /* Nothing to do */ + return; } diff --git a/src/backend/pgxc/cluster/pause.c b/src/backend/pgxc/cluster/pause.c index cf2433cb..66c6d5e1 100644 --- a/src/backend/pgxc/cluster/pause.c +++ b/src/backend/pgxc/cluster/pause.c @@ -122,7 +122,7 @@ HandleClusterPause(bool pause, bool initiator) * coordinators to respond back */ - coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true); + coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true, true); for (conn = 0; conn < coord_handles->co_conn_count; conn++) { @@ -309,7 +309,7 @@ PGXCCleanClusterLock(int code, Datum arg) if (IsConnFromCoord()) return; - coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true); + coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true, true); /* Try best-effort to UNPAUSE other coordinators now */ for (conn = 0; conn < coord_handles->co_conn_count; conn++) { diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 5a7d8b4d..38f5bb3a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -5865,7 +5865,7 @@ DataNodeCopyBegin(RemoteCopyData *rcstate) else { PGXCNodeAllHandles *pgxc_handles; - pgxc_handles = get_handles(nodelist, NULL, false, true); + pgxc_handles = get_handles(nodelist, NULL, false, true, true); connections = pgxc_handles->datanode_handles; Assert(pgxc_handles->dn_conn_count == conn_count); pfree(pgxc_handles); @@ -6261,6 +6261,7 @@ get_exec_connections(RemoteQueryState *planstate, int co_conn_count, dn_conn_count; bool is_query_coord_only = false; PGXCNodeAllHandles *pgxc_handles = NULL; + bool missing_ok = (exec_nodes ? 
exec_nodes->missing_ok : false); #ifdef __TBASE__ if (IsParallelWorker()) @@ -6527,7 +6528,7 @@ get_exec_connections(RemoteQueryState *planstate, #endif /* Get other connections (non-primary) */ - pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session); + pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session, !missing_ok); if (!pgxc_handles) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6538,7 +6539,7 @@ get_exec_connections(RemoteQueryState *planstate, { /* Let's assume primary connection is always a Datanode connection for the moment */ PGXCNodeAllHandles *pgxc_conn_res; - pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session); + pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session, true); /* primary connection is unique */ primaryconnection = pgxc_conn_res->datanode_handles[0]; @@ -6552,6 +6553,50 @@ get_exec_connections(RemoteQueryState *planstate, pgxc_handles->primary_handle = primaryconnection; } + if (missing_ok) + { + /* compact handle list exclude missing nodes */ + int i = 0; + while (dn_conn_count && i < dn_conn_count) + { + if (DN_CONNECTION_STATE_ERROR(pgxc_handles->datanode_handles[i])) + { + /* find last healthy handle */ + while (dn_conn_count - 1 > i && + DN_CONNECTION_STATE_ERROR(pgxc_handles->datanode_handles[dn_conn_count - 1])) + dn_conn_count--; + + /* replace bad handle with last healthy handle */ + pgxc_handles->datanode_handles[i] = + pgxc_handles->datanode_handles[dn_conn_count - 1]; + /* exclude bad handle */ + pgxc_handles->datanode_handles[dn_conn_count - 1] = NULL; + dn_conn_count--; + } + i++; + } + + i = 0; + while (co_conn_count && i < co_conn_count) + { + if (DN_CONNECTION_STATE_ERROR(pgxc_handles->coord_handles[i])) + { + /* find last healthy handle */ + while (co_conn_count - 1 > i && + DN_CONNECTION_STATE_ERROR(pgxc_handles->coord_handles[co_conn_count - 1])) + co_conn_count--; + + /* replace bad handle with last healthy handle */ + pgxc_handles->coord_handles[i] = + pgxc_handles->coord_handles[co_conn_count - 1]; + /* exclude bad handle */ + pgxc_handles->coord_handles[co_conn_count - 1] = NULL; + co_conn_count--; + } + i++; + } + } + /* Depending on the execution type, we still need to save the initial node counts */ pgxc_handles->dn_conn_count = dn_conn_count; pgxc_handles->co_conn_count = co_conn_count; @@ -7168,7 +7213,7 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) return; /* get needed Datanode connections */ - all_handles = get_handles(nodelist, NIL, false, true); + all_handles = get_handles(nodelist, NIL, false, true, true); conn_count = all_handles->dn_conn_count; connections = all_handles->datanode_handles; @@ -8188,7 +8233,7 @@ pgxc_node_remote_prefinish(char *prepareGID, char *nodestring) if (nodelist == NIL && coordlist == NIL) return false; - pgxc_handles = get_handles(nodelist, coordlist, false, true); + pgxc_handles = get_handles(nodelist, coordlist, false, true, true); for (i = 0; i < pgxc_handles->dn_conn_count; i++) { @@ -8548,7 +8593,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, return prepared_local; - pgxc_handles = get_handles(nodelist, coordlist, false, true); + pgxc_handles = get_handles(nodelist, coordlist, false, true, true); #ifdef __TWO_PHASE_TRANS__ SetLocalTwoPhaseStateHandles(pgxc_handles); #endif @@ -10466,7 +10511,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) if (node->execOnAll) { PGXCNodeAllHandles *pgxc_connections; - pgxc_connections = 
get_handles(node->execNodes, NIL, false, true); + pgxc_connections = get_handles(node->execNodes, NIL, false, true, true); combiner->conn_count = pgxc_connections->dn_conn_count; combiner->connections = pgxc_connections->datanode_handles; combiner->current_conn = 0; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 233fd0e2..d37a0b48 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3615,7 +3615,7 @@ get_any_handle(List *datanodelist) //char *init_str = NULL; List *allocate = list_make1_int(node); int *pids; - int *fds = PoolManagerGetConnections(allocate, NIL, + int *fds = PoolManagerGetConnections(allocate, NIL, true, &pids); PGXCNodeHandle *node_handle; @@ -3685,8 +3685,8 @@ get_any_handle(List *datanodelist) * Coordinator fds is returned only if transaction uses a DDL */ PGXCNodeAllHandles * -get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session) -{// #lizard forgives +get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session, bool raise_error) +{ PGXCNodeAllHandles *result; ListCell *node_list_item; List *dn_allocate = NIL; @@ -3864,7 +3864,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool { int j = 0; int *pids; - int *fds = PoolManagerGetConnections(dn_allocate, co_allocate, &pids); + int *fds = PoolManagerGetConnections(dn_allocate, co_allocate, raise_error, &pids); if (!fds) { @@ -3927,6 +3927,13 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool } node_handle = &dn_handles[node]; + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); dn_handles[node] = *node_handle; datanode_count++; @@ -3985,6 +3992,13 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool } node_handle = &co_handles[node]; + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); co_handles[node] = *node_handle; coord_count++; diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 4e1da81f..7a688879 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -489,7 +489,8 @@ static void insert_database_pool(DatabasePool *pool); static void reload_database_pools(PoolAgent *agent); static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions); -static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int32 *num, int **fd_result, int **pid_result); +static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, + bool raise_error, int32 *num, int **fd_result, int **pid_result); static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist); static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int signal); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid node, bool bCoord); @@ -1797,13 +1798,15 @@ PoolManagerDisconnect(void) * Get pooled connections */ int * -PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids) -{// #lizard forgives +PoolManagerGetConnections(List *datanodelist, List 
*coordlist, bool raise_error, int **pids) +{ int i; ListCell *nodelist_item; int *fds; int totlen = list_length(datanodelist) + list_length(coordlist); + int totsize = sizeof(int) * (totlen + 2) + 1; /* sizeof nodes list + raise_error flag */ int nodes[totlen + 2]; + char *msg; int pool_recvpids_num; int pool_recvfds_ret; @@ -1850,7 +1853,11 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids) errmsg(POOL_MGR_PREFIX"out of memory"))); } - pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2)); + msg = palloc(totsize); + memcpy(msg, (char *) nodes, totsize - 1); + msg[totsize - 1] = (char) raise_error; + pool_putmessage(&poolHandle->port, 'g', msg, totsize); + pfree(msg); if (PoolConnectDebugPrint) { @@ -2913,8 +2920,9 @@ agent_set_command(PoolAgent *agent, * return 0 : when fd_result and pid_result is not NULL, acquire connection is done(acquire from freeslot in pool). */ static int -agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int32 *num, int **fd_result, int **pid_result) -{// #lizard forgives +agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, + bool raise_error, int32 *num, int **fd_result, int **pid_result) +{ int32 i = 0; int32 acquire_seq = 0; int node = 0; @@ -3101,6 +3109,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, elog(LOG, POOL_MGR_PREFIX"[agent_acquire_connections]going to acquire conn by sync thread for node:%s.", nodePool->node_name); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch build connection request */ succeed = dispatch_connection_request(asyncTaskCtl, false, @@ -3153,6 +3162,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch set param request */ succeed = dispatch_connection_request(asyncTaskCtl, false, @@ -3233,6 +3243,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch build connection request */ succeed = dispatch_connection_request(asyncTaskCtl, true, @@ -3287,6 +3298,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, { asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch set param request */ succeed = dispatch_connection_request(asyncTaskCtl, true, @@ -7509,17 +7522,28 @@ void *pooler_sync_remote_operator_thread(void *arg) PGXCNodeClose(slot->conn); slot->conn = NULL; } - request->current_status = PoolConnectStaus_error; + + finish_task_request(request->taskControl); + + if (request->taskControl->m_missing_ok) + { + request->current_status = PoolConnectStaus_done; + break; + } + else + { + request->current_status = PoolConnectStaus_error; #ifdef __TBASE__ - SpinLockAcquire(&request->agent->port.lock); - request->agent->port.error_code = POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD; - snprintf(request->agent->port.err_msg, POOL_ERR_MSG_LEN, "%s, connection info [%s]", poolErrorMsg[POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD], - request->nodepool->connstr); - SpinLockRelease(&request->agent->port.lock); + SpinLockAcquire(&request->agent->port.lock); + request->agent->port.error_code = POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD; 
+ snprintf(request->agent->port.err_msg, POOL_ERR_MSG_LEN, "%s, connection info [%s]", poolErrorMsg[POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD], + request->nodepool->connstr); + SpinLockRelease(&request->agent->port.lock); #endif - set_task_status(request->taskControl, PoolAyncCtlStaus_error); - finish_task_request(request->taskControl); - break; + set_task_status(request->taskControl, PoolAyncCtlStaus_error); + pooler_thread_logger(LOG, "connection not connect for node:[%s] failed errno %d", request->nodepool->connstr, errno); + break; + } } slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); @@ -7710,6 +7734,12 @@ void *pooler_sync_remote_operator_thread(void *arg) #endif node_number++; } + else if (request->taskControl->m_missing_ok) + { + request->taskControl->m_result[node_number] = 0; + request->taskControl->m_pidresult[node_number] = 0; + node_number++; + } } /* Save then in the array fds for Coordinators */ @@ -7726,6 +7756,12 @@ void *pooler_sync_remote_operator_thread(void *arg) #endif node_number++; } + else + { + request->taskControl->m_result[node_number] = 0; + request->taskControl->m_pidresult[node_number] = 0; + node_number++; + } } #ifdef _POOLER_CHECK_ @@ -8687,7 +8723,8 @@ static inline bool get_acquire_success_status(PGXCASyncTaskCtl *taskControl) { bool bsucceed; SpinLockAcquire(&taskControl->m_lock); - bsucceed = taskControl->m_number_done == taskControl->m_number_succeed; + bsucceed = taskControl->m_number_done == taskControl->m_number_succeed || + taskControl->m_missing_ok; SpinLockRelease(&taskControl->m_lock); return bsucceed; } @@ -10241,6 +10278,7 @@ handle_get_connections(PoolAgent * agent, StringInfo s) List *datanodelist = NIL; List *coordlist = NIL; int connect_num = 0; + bool raise_error = true; /* * Length of message is caused by: * - Message header = 4bytes @@ -10273,6 +10311,8 @@ handle_get_connections(PoolAgent * agent, StringInfo s) { elog(LOG, POOL_MGR_PREFIX"backend required %d coordinator connections, pid:%d", coordcount, agent->pid); } + + raise_error = pq_getmsgbyte(s); pq_getmsgend(s); if(!is_pool_locked) @@ -10282,7 +10322,7 @@ handle_get_connections(PoolAgent * agent, StringInfo s) * In case of error agent_acquire_connections will log * the error and return -1 */ - ret = agent_acquire_connections(agent, datanodelist, coordlist, &connect_num, &fds, &pids); + ret = agent_acquire_connections(agent, datanodelist, coordlist, raise_error, &connect_num, &fds, &pids); /* async acquire connection will be done in parallel threads */ if (0 == ret && fds && pids) { diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 6b8994dc..ba16fbbb 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -661,7 +661,7 @@ apply_exec_on_nodes(StringInfo s, char *nspname, char *relname, ExecNodes * exec return; /* send apply message to DN and wait response */ - all_handles = get_handles(exec_nodes->nodeList, NIL, false, true); + all_handles = get_handles(exec_nodes->nodeList, NIL, false, true, true); /* send insert/update/delete to DN and wait exec finish */ apply_exec_on_dn_nodes(s, nspname, relname, all_handles); diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 3fd1f6b9..30926928 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -117,6 +117,7 @@ typedef struct Datum rewrite_value; /* function evaluate result */ bool isnull; bool rewrite_done; /* function rewritted */ + bool missing_ok; } ExecNodes; diff 
--git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f79bc2b8..4e64e650 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -171,7 +171,8 @@ extern Oid PGXCGetMainNodeOid(Oid nodeoid); extern int PGXCNodeGetNodeIdFromName(char *node_name, char *node_type); extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type); -extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only, bool is_global_session); +extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, + bool is_query_coord_only, bool is_global_session, bool raise_error); extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index c0d996d8..1f2f57b3 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -174,6 +174,7 @@ typedef struct PGXCASyncTaskCtl CommandId m_max_command_id; /* errmsg and error status. */ + bool m_missing_ok; int32 m_error_offset; char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; }PGXCASyncTaskCtl; @@ -328,7 +329,7 @@ extern int PoolManagerSetCommand(PGXCNodeHandle **connections, int32 count, Pool const char *set_command); /* Get pooled connections */ -extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids); +extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int **pids); /* Clean pool connections */ extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username); From 458585a25f229338da2df3685b02c2cac0a571a1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 8 Apr 2021 15:54:18 +0800 Subject: [PATCH 161/578] for pooler log http://tapd.oa.com/pgxz/prong/stories/view/1010092131863477681 (merge request !268) --- src/backend/pgxc/pool/poolmgr.c | 187 ++++++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 9 ++ src/include/pgxc/poolmgr.h | 1 + 3 files changed, 197 insertions(+) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 7a688879..c4bcc94e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -100,6 +100,7 @@ char *g_unpooled_user = "mls_admin"; bool PoolConnectDebugPrint = false; /* Pooler connect debug print */ bool PoolerStuckExit = true; /* Pooler exit when stucked */ +bool PoolSubThreadLogPrint = true; /* Pooler sub thread log print */ #define POOL_ASYN_WARM_PIPE_LEN 32 /* length of asyn warm pipe */ #define POOL_ASYN_WARN_NUM 1 /* how many connections to warm once maintaince per node pool */ @@ -397,6 +398,20 @@ typedef struct pg_time_t cmd_end_time; /* command end time */ }PGXCPoolAsyncReq; +static void pooler_subthread_write_log(int elevel, int lineno, const char *filename, const char *funcname, const char *fmt, ...)__attribute__((format(printf, 5, 6))); + +/* Use this macro when a sub thread needs to print logs */ +#define pooler_thread_logger(elevel, ...) 
\ + do { \ + pooler_subthread_write_log(elevel, __LINE__, __FILE__, PG_FUNCNAME_MACRO, __VA_ARGS__); \ + } while(0) + +#define FORMATTED_TS_LEN (128) /* format timestamp buf length */ +#define POOLER_WRITE_LOG_ONCE_LIMIT (5) /* number of logs written at a time */ +#define MAX_THREAD_LOG_PIPE_LEN (2 * 1024) /* length of thread log pipe */ +#define DEFAULT_LOG_BUF_LEN (1024) /* length of thread log length */ +PGPipe *g_ThreadLogQueue = NULL; + static inline void RebuildAgentIndex(void); static inline PGXCASyncTaskCtl* create_task_control(List *datanodelist, List *coordlist, int32 *fd_result, int32 *pid_result); @@ -5134,6 +5149,170 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) } } +/* + * setup current log time + */ +static void +setup_formatted_current_log_time(char* formatted_current_log_time) +{ + pg_time_t stamp_time; + char msbuf[13]; + struct timeval timeval; + + gettimeofday(&timeval, NULL); + stamp_time = (pg_time_t) timeval.tv_sec; + + /* + * Note: we expect that guc.c will ensure that log_timezone is set up (at + * least with a minimal GMT value) before Log_line_prefix can become + * nonempty or CSV mode can be selected. + */ + pg_strftime(formatted_current_log_time, FORMATTED_TS_LEN, + /* leave room for milliseconds... */ + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + + /* 'paste' milliseconds into place... */ + sprintf(msbuf, ".%03d", (int) (timeval.tv_usec / 1000)); + memcpy(formatted_current_log_time + 19, msbuf, 4); +} + +/* + * write pooler's subthread log into thread log queue + * only call by pooler's subthread in elog + */ +static void +pooler_subthread_write_log(int elevel, int lineno, const char *filename, const char *funcname, const char *fmt, ...) +{ + char *buf = NULL; + int buf_len = 0; + int offset = 0; + char formatted_current_log_time[FORMATTED_TS_LEN]; + + if (!PoolSubThreadLogPrint) + { + /* not enable sun thread log print, return */ + return; + } + + if (PipeIsFull(g_ThreadLogQueue)) + { + return; + } + + /* use malloc in sub thread */ + buf_len = strlen(filename) + strlen(funcname) + DEFAULT_LOG_BUF_LEN; + buf = (char*)malloc(buf_len); + if (buf == NULL) + { + /* no log */ + return; + } + + /* construction log, format: elevel | lineno | filename | funcname | log content */ + *(int*)(buf + offset) = elevel; + offset += sizeof(elevel); + *(int*)(buf + offset) = lineno; + offset += sizeof(lineno); + memcpy(buf + offset, filename, strlen(filename) + 1); + offset += (strlen(filename) + 1); + memcpy(buf + offset, funcname, strlen(funcname) + 1); + offset += (strlen(funcname) + 1); + + /* + * because the main thread writes the log of the sub thread asynchronously, + * record the actual log writing time here + */ + setup_formatted_current_log_time(formatted_current_log_time); + memcpy(buf + offset, formatted_current_log_time, strlen(formatted_current_log_time)); + offset += strlen(formatted_current_log_time); + *(char*)(buf + offset) = ' '; + offset += sizeof(char); + + /* Generate actual output --- have to use appendStringInfoVA */ + for (;;) + { + va_list args; + int avail; + int nprinted; + + avail = buf_len - offset - 1; + va_start(args, fmt); + nprinted = vsnprintf(buf + offset, avail, fmt, args); + va_end(args); + if (nprinted >= 0 && nprinted < avail - 1) + { + offset += nprinted; + *(char*)(buf + offset) = '\0'; + offset += sizeof(char); + break; + } + + buf_len = (buf_len * 2 > (int) MaxAllocSize) ? 
MaxAllocSize : buf_len * 2; + buf = (char *) realloc(buf, buf_len); + if (buf == NULL) + { + /* no log */ + return; + } + } + + /* put log into thread log queue, drop log if queue is full */ + if (-1 == PipePut(g_ThreadLogQueue, buf)) + { + free(buf); + } +} + +/* + * write subthread log in main thread + */ +static void +pooler_handle_subthread_log(bool is_pooler_exit) +{ + int write_log_cnt = 0; + int offset = 0; + int elevel = LOG; + int lineno = 0; + char *log_buf = NULL; + char *filename = NULL; + char *funcname = NULL; + char *log_content = NULL; + + while ((log_buf = (char*)PipeGet(g_ThreadLogQueue)) != NULL) + { + /* elevel | lineno | filename | funcname | log content */ + elevel = *(int*)log_buf; + offset = sizeof(elevel); + lineno = *(int*)(log_buf + offset); + offset += sizeof(lineno); + filename = log_buf + offset; + offset += (strlen(filename) + 1); + funcname = log_buf + offset; + offset += (strlen(funcname) + 1); + log_content = log_buf + offset; + + /* write log here */ + elog_start(filename, lineno, +#ifdef USE_MODULE_MSGIDS + PGXL_MSG_MODULE, PGXL_MSG_FILEID, __COUNTER__, +#endif + funcname); + elog_finish(elevel, "%s", log_content); + + free(log_buf); + + /* + * if the number of logs written at one time exceeds POOLER_WRITE_LOG_ONCE_LIMIT, + * in order not to block the main thread, return here + */ + if (write_log_cnt++ >= POOLER_WRITE_LOG_ONCE_LIMIT && !is_pooler_exit) + { + return; + } + } +} + /* * Main handling loop */ @@ -5200,6 +5379,9 @@ PoolerLoop(void) } #endif + /* create log queue */ + g_ThreadLogQueue = CreatePipe(MAX_THREAD_LOG_PIPE_LEN); + /* create utility thread */ g_AsynUtilityPipeSender = CreatePipe(POOL_ASYN_WARM_PIPE_LEN); ThreadSemaInit(&g_AsnyUtilitysem, 0); @@ -5282,6 +5464,7 @@ PoolerLoop(void) */ if (!PostmasterIsAlive()) { + pooler_handle_subthread_log(true); exit(1); } @@ -5309,6 +5492,7 @@ PoolerLoop(void) * Just close the socket and exit. Linux will help to release the resouces. 
*/ close(server_fd); + pooler_handle_subthread_log(true); exit(0); } @@ -5420,6 +5604,9 @@ PoolerLoop(void) check_duplicate_allocated_conn(); #endif print_pooler_statistics(); + + /* handle sub thread's log */ + pooler_handle_subthread_log(false); } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 35c4981d..2260027e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2423,6 +2423,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_pooler_thread_log_print", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("enable pooler manager sub thread log print"), + NULL + }, + &PoolSubThreadLogPrint, + true, + NULL, NULL, NULL + }, + { {"enable_plpgsql_debug_print", PGC_SUSET, CUSTOM_OPTIONS, gettext_noop("enable plpgsql debug infomation print"), NULL diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 1f2f57b3..9fff0445 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -268,6 +268,7 @@ extern int PoolDNSetTimeout; extern int PoolCheckSlotTimeout; extern int PoolPrintStatTimeout; extern bool PoolConnectDebugPrint; +extern bool PoolSubThreadLogPrint; /* Status inquiry functions */ extern void PGXCPoolerProcessIam(void); extern bool IsPGXCPoolerProcess(void); From f955f8a51f2cc200bdcf236cd18ff7cd5468960b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 21:20:30 +0800 Subject: [PATCH 162/578] fix sysviews expected info --- src/test/regress/expected/sysviews.out | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 7a478711..7f805e93 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -117,6 +117,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_plpgsql_debug_print | off enable_pooler_debug_print | on enable_pooler_stuck_exit | off + enable_pooler_thread_log_print | on enable_pullup_subquery | on enable_replication_slot_debug | off enable_sampling_analyze | on @@ -128,7 +129,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(56 rows) +(57 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 614d2598ab608f4ab2602cb46f0da010ba6adcfd Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 4 Jun 2021 21:25:27 +0800 Subject: [PATCH 163/578] Receive 1 more byte for raise error flag in pooler --- src/backend/pgxc/pool/poolmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index c4bcc94e..6efe045c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -10473,10 +10473,11 @@ handle_get_connections(PoolAgent * agent, StringInfo s) * - List of Coordinators = NumPoolCoords * 4bytes (max) * - Number of Datanodes sent = 4bytes * - Number of Coordinators sent = 4bytes + * - Raise error flag = 1byte * It is better to send in a same message the list of Co and Dn at the same * time, this permits to reduce interactions between postmaster and pooler */ - pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12); + pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 13); datanodecount = pq_getmsgint(s, 4); for (i = 0; i < datanodecount; i++) { From 06918bf23dab631a9c621b3f75090099538e8d8f Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 28 Dec 2020 11:37:25 +0800 Subject: [PATCH 164/578] jacky/feature/PersistentDatanodeConnection_Tbase_v2.15 (merge request !54) Squash merge branch 'jacky/feature/PersistentDatanodeConnection_Tbase_v2.15' into 'Tbase_v2.15' * fixed merged bugs. * jacky/feature/PersistentDatanodeConnection_Tbase_v2.15.16 (merge request !15) --- src/backend/pgxc/pool/execRemote.c | 162 ++++++++---------- src/backend/pgxc/pool/pgxcnode.c | 128 ++++++++++++-- src/backend/pgxc/shard/shardmap.c | 19 +- src/include/pgxc/pgxcnode.h | 2 +- src/test/regress/expected/mls_check.out | 14 +- src/test/regress/expected/namespace.out | 42 +++++ .../regress/expected/xl_limitations_1.out | 16 +- src/test/regress/sql/mls_check.sql | 4 +- src/test/regress/sql/namespace.sql | 28 +++ src/test/regress/sql/xl_limitations.sql | 12 +- 10 files changed, 294 insertions(+), 133 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 38f5bb3a..a06692a6 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3454,11 +3454,11 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, /* Send GXID and check for errors */ if (pgxc_node_send_gxid(connections[i], gxid)) { - elog(WARNING, "pgxc_node_begin gxid is invalid."); + elog(WARNING, "pgxc_node_begin gxid %u is invalid.", gxid); return EOF; } - /* Send timestamp and check for errors */ + /* Send timestamp and check for errors */ if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) { @@ -3565,21 +3565,6 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, connections[i]->nodename, connections[i]->backend_pid); new_connections[new_count++] = connections[i]; } - -#if 0 - /* Send BEGIN if not already in transaction */ - if (need_tran_block && connections[i]->transaction_status == 'I') - { - /* Send the BEGIN TRANSACTION command and check for errors */ - if (pgxc_node_send_query(connections[i], cmd)) - { - return EOF; - } - - elog(LOG, "pgxc_node_begin send BEGIN to node %s, pid:%d", connections[i]->nodename, connections[i]->backend_pid); - new_connections[new_count++] = connections[i]; - } -#endif } /* @@ -3627,6 +3612,8 @@ pgxc_node_begin(int conn_count, 
PGXCNodeHandle **connections, for (i = 0; i < new_count; i++) { pgxc_node_set_query(new_connections[i], init_str); + elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, + new_connections[i]->nodename, new_connections[i]->backend_pid); } } @@ -3869,6 +3856,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); + #ifdef __TWO_PHASE_TESTS__ if (PART_PREPARE_GET_TIMESTAMP == twophase_exception_case) { @@ -4397,15 +4385,6 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) clear_handles(); pfree_pgxc_all_handles(handles); - -#if 0 - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(); - } -#endif } pfree(prepare_cmd); @@ -4505,6 +4484,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } g_twophase_state.datanode_state[conn_state_index].state = TWO_PHASE_ABORTTING; #endif + /* Send down abort prepared command */ #ifdef __USE_GLOBAL_SNAPSHOT__ if (pgxc_node_send_gxid(conn, auxXid)) @@ -4512,7 +4492,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #ifdef __TWO_PHASE_TRANS__ g_twophase_state.datanode_state[conn_state_index].conn_state = TWO_PHASE_SEND_GXID_ERROR; - g_twophase_state.datanode_state[conn_state_index].state = TWO_PHASE_ABORT_ERROR; + g_twophase_state.datanode_state[conn_state_index].state = + TWO_PHASE_ABORT_ERROR; #endif /* * Prepared transaction is left on the node, but we can not @@ -4520,10 +4501,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) */ ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send xid to " - "the node %u", conn->nodeoid))); + errmsg("failed to send xid %u to " + "the node %u", auxXid, conn->nodeoid))); } #endif + if (pgxc_node_send_query(conn, abort_cmd)) { #ifdef __TWO_PHASE_TRANS__ @@ -4598,7 +4580,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #ifdef __TWO_PHASE_TRANS__ g_twophase_state.coord_state[conn_state_index].conn_state = TWO_PHASE_SEND_GXID_ERROR; - g_twophase_state.coord_state[conn_state_index].state = TWO_PHASE_ABORT_ERROR; + g_twophase_state.coord_state[conn_state_index].state = + TWO_PHASE_ABORT_ERROR; #endif /* * Prepared transaction is left on the node, but we can not @@ -4606,10 +4589,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) */ ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send xid to " - "the node %u", conn->nodeoid))); + errmsg("failed to send xid %u to " + "the node %u", auxXid, conn->nodeoid))); } #endif + if (pgxc_node_send_query(conn, abort_cmd)) { #ifdef __TWO_PHASE_TRANS__ @@ -4662,12 +4646,20 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) else elog(ERROR, "failed to PREPARE transaction on one or more nodes"); - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); + + if (PersistentConnections) + { + reset_handles(); + } + else + { release_handles(false); } + } clear_handles(); @@ -4700,7 +4692,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Release remote connection after completion. * * For DDL, DN will commit before CN does. 
- * Because DDLs normally have conflict locks, when CN gets committed, + * Because DDL normally has conflict locks, when CN gets committed, * DNs will be in a consistent state for blocked user transactions. */ static void @@ -4722,22 +4714,21 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); - release_handles(false); - } + + if (need_release_handle) + { + if (PersistentConnections) + { + reset_handles(); } else { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); + release_handles(false); + } } } @@ -4783,7 +4774,6 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) if(IS_PGXC_COORDINATOR) { - global_committs = GetGlobalTimestampGTM(); if(!GlobalTimestampIsValid(global_committs)){ ereport(ERROR, @@ -4955,23 +4945,21 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); if (need_release_handle) { - if (!temp_object_included && !PersistentConnections) + if (PersistentConnections) { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(false); - } + reset_handles(); } else { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); + release_handles(false); + } } } @@ -4992,7 +4980,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } /* - * Set the node begein transaction in plpgsql function + * Set the node begin transaction in plpgsql function */ static void SetPlpgsqlTransactionBegin(PGXCNodeHandle *conn) @@ -5802,22 +5790,21 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - if (need_release_handle) - { + /* + * Drop the connections to ensure aborts are handled properly. + * + * XXX We should really be consulting PersistentConnections parameter and + * keep the connections if its set. But as a short term measure, to address + * certain issues for aborted transactions, we drop the connections. + * Revisit and fix the issue + */ if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included) + if (need_release_handle) { - /* Clean up remote sessions without release handles. 
*/ - pgxc_node_remote_cleanup_all(); + release_handles(false); } } @@ -6647,7 +6634,9 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) return false; - if (step->statement || step->cursor || remotestate->rqs_num_params) + if ((step->statement && step->statement[0] != '\0') || + step->cursor || + remotestate->rqs_num_params) { /* need to use Extended Query Protocol */ int fetch = 0; @@ -7878,29 +7867,6 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_abort(txn_type, need_release_handle); - /* - * Drop the connections to ensure aborts are handled properly. - * - * XXX We should really be consulting PersistentConnections parameter and - * keep the connections if its set. But as a short term measure, to address - * certain issues for aborted transactions, we drop the connections. - * Revisit and fix the issue - */ - elog(DEBUG5, "temp_object_included %d", temp_object_included); - /* cleanup and release handles is already done in pgxc_node_remote_abort */ -#if 0 - if (release_handle) - { - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(); - } - } - - clear_handles(); -#endif pfree_pgxc_all_handles(all_handles); if (log_gtm_stats) @@ -8844,12 +8810,19 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } #endif - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); + if (PersistentConnections) + { + reset_handles(); + } + else + { release_handles(false); } + } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); pfree(finish_cmd); @@ -9050,8 +9023,9 @@ ExecRemoteQuery(PlanState *pstate) if (step->force_autocommit) need_tran_block = false; else - need_tran_block = step->cursor || - step->statement || node->rqs_num_params || + need_tran_block = (step->statement && step->statement[0] != '\0') || + step->cursor || + node->rqs_num_params || (!step->read_only && total_conn_count > 1) || (TransactionBlockStatusCode() == 'T'); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index d37a0b48..eb400fb4 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -316,6 +316,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = dn_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter primary datanode nodeoid: %d", + node_handle_ent->nodeoid); } #endif @@ -342,6 +346,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = sdn_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter slave datanode nodeoid: %d", + node_handle_ent->nodeoid); } #endif @@ -367,6 +375,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = co_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter coordinator nodeoid: %d", + node_handle_ent->nodeoid); } #endif } @@ -655,7 +667,7 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) #ifdef DN_CONNECTION_DEBUG handle->have_row_desc = false; #endif - memset(handle->error, 0X00, MAX_ERROR_MSG_LENGTH); + handle->error[0] = '\0'; handle->outEnd = 0; handle->inStart = 0; handle->inEnd = 0; @@ -1405,7 +1417,7 @@ release_handles(bool force) { /* * Connections at this point should be completely inactive, - * otherwise abaandon them. 
We can not allow not cleaned up + * otherwise abandon them. We can not allow not cleaned up * connection is returned to pool. */ if (handle->state != DN_CONNECTION_STATE_IDLE || @@ -1442,7 +1454,7 @@ release_handles(bool force) { /* * Connections at this point should be completely inactive, - * otherwise abaandon them. We can not allow not cleaned up + * otherwise abandon them. We can not allow not cleaned up * connection is returned to pool. */ if (handle->state != DN_CONNECTION_STATE_IDLE || @@ -1521,6 +1533,67 @@ release_handles(bool force) slavedatanode_count = 0; } +/* + * Reset all Datanode and Coordinator connections occupied memory. + */ +void +reset_handles(void) +{ + int i; + + /* don't reset connection if holding a cluster lock */ + if (cluster_ex_lock_held) + { + return; + } + + if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) + { + return; + } + + /* Do not reset connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + + /* Reset Datanodes handles occupied memory */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + + for (i = 0; i < NumSlaveDataNodes; i++) + { + PGXCNodeHandle *handle = &sdn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + + if (IS_PGXC_COORDINATOR) + { + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + } +} + /* * Check whether there bad connections to remote nodes when abort transactions. 
*/ @@ -3052,8 +3125,6 @@ int pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot) {// #lizard forgives int msglen PG_USED_FOR_ASSERTS_ONLY; - int nval PG_USED_FOR_ASSERTS_ONLY; - int i PG_USED_FOR_ASSERTS_ONLY; /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) @@ -3272,7 +3343,10 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) { - elog(WARNING, "pgxc_node_send_timestamp datanode:%u invalid stauts:%d, no need to send data, return NOW", handle->nodeoid, handle->state); + elog(WARNING, + "pgxc_node_send_timestamp datanode:%u invalid stauts:%d, " + "no need to send data, return NOW", + handle->nodeoid, handle->state); return EOF; } @@ -3311,7 +3385,7 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) /* * Send the Coordinator info down to the PGXC node at the beginning of transaction, * In this way, Datanode can print this Coordinator info into logfile, - * and those infos can be found in Datanode logifile if needed during debugging + * and those infos can be found in Datanode logfile if needed during debugging */ int pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId coord_vxid) @@ -4249,13 +4323,23 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) #endif if (pgxc_handles->primary_handle) + { pfree(pgxc_handles->primary_handle); + pgxc_handles->primary_handle = NULL; + } if (pgxc_handles->datanode_handles) + { pfree(pgxc_handles->datanode_handles); + pgxc_handles->datanode_handles = NULL; + } if (pgxc_handles->coord_handles) + { pfree(pgxc_handles->coord_handles); + pgxc_handles->coord_handles = NULL; + } pfree(pgxc_handles); + pgxc_handles = NULL; } /* Do translation for non-main cluster */ @@ -4314,13 +4398,15 @@ PGXCNodeGetNodeId(Oid nodeoid, char *node_type) if (NULL == node_handles_hash) { + elog(DEBUG5, "node_handles_hash is null."); goto NOT_FOUND; } - nodeoid = PGXCGetLocalNodeOid(nodeoid); + nodeoid = PGXCGetLocalNodeOid(nodeoid); entry = (PGXCNodeHandlesLookupEnt *) hash_search(node_handles_hash, &nodeoid, HASH_FIND, &found); if (false == found) { + elog(DEBUG5, "node_handles_hash does not has %d", nodeoid); goto NOT_FOUND; } @@ -4721,11 +4807,23 @@ get_set_command(List *param_list, StringInfo command, bool local) { search_path_value[index++] = '"'; } + + if ((char *) strstr(search_path_value, "public") || + (char *) strstr(search_path_value, "PUBLIC")) + { appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "", NameStr(entry->name), search_path_value); } else { + appendStringInfo(command, "SET %s %s TO %s, public;", local ? "LOCAL" : "", + NameStr(entry->name), search_path_value); + } + + elog(DEBUG5, "get_set_command: %s", command->data); + } + else + { appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "", NameStr(entry->name), value); } @@ -4735,7 +4833,7 @@ get_set_command(List *param_list, StringInfo command, bool local) /* * Returns SET commands needed to initialize remote session. - * The command may already be biult and valid, return it right away if the case. + * The command may already be built and valid, return it right away if the case. * Otherwise build it up. * To support Distributed Session machinery coordinator should generate and * send a distributed session identifier to remote nodes. Generate it here. 
@@ -4777,7 +4875,7 @@ PGXCNodeGetSessionParamStr(void) /* * Returns SET commands needed to initialize transaction on a remote session. - * The command may already be biult and valid, return it right away if the case. + * The command may already be built and valid, return it right away if the case. * Otherwise build it up. */ char * @@ -5214,8 +5312,16 @@ PgxcNodeDiffBackendHandles(List **nodes_alter, Oid nodeoid; char ntype = PGXC_NODE_NONE; - if(enable_multi_cluster && strcmp(NameStr(nodeForm->node_cluster_name), PGXCClusterName)) + if (enable_multi_cluster && + strcmp(NameStr(nodeForm->node_cluster_name), PGXCClusterName)) + { + continue; + } + + if (PGXC_NODE_GTM == nodeForm->node_type) + { continue; + } nodeoid = HeapTupleGetOid(tuple); catoids = lappend_oid(catoids, nodeoid); diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 38b5044a..6583be1c 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -889,7 +889,9 @@ static void InsertShardMap_CN(int32 map, Form_pgxc_shard_map record) nodeindex = PGXCNodeGetNodeId(record->primarycopy, &node_type); if (nodeindex < 0) { - elog(ERROR, " get node:%u for index failed", record->primarycopy); + elog(ERROR, + "InsertShardMap_CN get node:%u for index failed", + record->primarycopy); } g_GroupShardingMgr->members[map]->shmemshardmap[record->shardgroupid].primarycopy = record->primarycopy; @@ -898,7 +900,11 @@ static void InsertShardMap_CN(int32 map, Form_pgxc_shard_map record) } else { - elog(ERROR, " invalid pgxc_shard_map record with shardgroupid:%d", record->shardgroupid); + elog(ERROR, + "invalid pgxc_shard_map record with shardgroupid: %d, map %d " + "and shmemNum: %d", + record->shardgroupid, map, + g_GroupShardingMgr->members[map]->shmemNumShardGroups); } } } @@ -918,7 +924,9 @@ static void InsertShardMap_DN(Form_pgxc_shard_map record) nodeindex = PGXCNodeGetNodeId(record->primarycopy, &node_type); if (nodeindex < 0) { - elog(ERROR, " get node:%u for index failed", record->primarycopy); + elog(ERROR, + "InsertShardMap_DN get node:%u for index failed", + record->primarycopy); } g_GroupShardingMgr_DN->members->shmemshardmap[record->shardgroupid].primarycopy = record->primarycopy; @@ -927,7 +935,10 @@ static void InsertShardMap_DN(Form_pgxc_shard_map record) } else { - elog(ERROR, "[InsertShardMap_DN]invalid pgxc_shard_map record with shardgroupid:%d", record->shardgroupid); + elog(ERROR, + "InsertShardMap_DN has invalid pgxc_shard_map record with shardgroupid: " + "%d and shmemNum: %d", + record->shardgroupid, g_GroupShardingMgr_DN->members->shmemNumShardGroups); } } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 4e64e650..f15515c3 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -183,7 +183,7 @@ extern PGXCNodeAllHandles * get_sock_fatal_handles(void); extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); extern void release_handles(bool force); - +extern void reset_handles(void); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 496d0b8c..fd8a30b5 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -5573,10 +5573,10 @@ drop table lala; drop table lala2; drop table lala3; \c - mls_admin -select * from pg_cls_table; - polid | attnum | relid | enable | nspname | tblname | reloptions 
--------+--------+-------+--------+---------+---------+------------ - 99 | 3 | 17061 | t | public | xixi | +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; + polid | attnum | enable | nspname | tblname | reloptions +-------+--------+--------+---------+---------+------------ + 99 | 3 | t | public | xixi | (1 row) select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); @@ -5585,9 +5585,9 @@ select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); t (1 row) -select * from pg_cls_table; - polid | attnum | relid | enable | nspname | tblname | reloptions --------+--------+-------+--------+---------+---------+------------ +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; + polid | attnum | enable | nspname | tblname | reloptions +-------+--------+--------+---------+---------+------------ (0 rows) --everything is done diff --git a/src/test/regress/expected/namespace.out b/src/test/regress/expected/namespace.out index b081c977..1d2ecfee 100644 --- a/src/test/regress/expected/namespace.out +++ b/src/test/regress/expected/namespace.out @@ -69,3 +69,45 @@ SELECT COUNT(*) FROM pg_class WHERE relnamespace = 0 (1 row) +CREATE SCHEMA test_schema_2 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +CREATE SCHEMA test_schema_3; +CREATE SCHEMA test_schema_4 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +INSERT INTO test_schema_2.ab(b) VALUES(1); +INSERT INTO test_schema_2.ab(b) VALUES(2); +SELECT * FROM test_schema_2.ab ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 2 +(2 rows) + +INSERT INTO test_schema_3.ab(b) VALUES(3); +ERROR: relation "test_schema_3.ab" does not exist +LINE 1: INSERT INTO test_schema_3.ab(b) VALUES(3); + ^ +SELECT * FROM test_schema_3.ab ORDER BY a, b; +ERROR: relation "test_schema_3.ab" does not exist +LINE 1: SELECT * FROM test_schema_3.ab ORDER BY a, b; + ^ +INSERT INTO test_schema_4.ab(b) VALUES(4); +INSERT INTO test_schema_4.ab(b) VALUES(5); +SELECT * FROM test_schema_4.ab ORDER BY a, b; + a | b +---+--- + 1 | 4 + 2 | 5 +(2 rows) + +DROP SCHEMA test_schema_2 CASCADE; +NOTICE: drop cascades to table test_schema_2.ab +DROP SCHEMA test_schema_3 CASCADE; +DROP SCHEMA test_schema_4 CASCADE; +NOTICE: drop cascades to table test_schema_4.ab diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index c44f0d64..161cd7b4 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -514,7 +514,7 @@ INSERT INTO xl_names("name", "name1")VALUES ('W', 'W1'); INSERT INTO xl_names("name", "name1")VALUES ('X', 'X1'); INSERT INTO xl_names("name", "name1")VALUES ('Y', 'Y1'); INSERT INTO xl_names("name", "name1")VALUES ('Z', 'Z1'); -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | A @@ -523,7 +523,7 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | D (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- datanode_1 | 1 | Z @@ -552,7 +552,7 @@ where xl_t.no = T1.no1; update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select 
xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | Z @@ -561,13 +561,13 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | W (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 2 | Y1 datanode_1 | 1 | Z1 - datanode_2 | 4 | W1 + datanode_1 | 2 | Y1 datanode_2 | 3 | X1 + datanode_2 | 4 | W1 (4 rows) --testing correlated delete: @@ -578,7 +578,7 @@ where xl_t.no in (select no1 from xl_t1 where name1 in ('Z', 'X')) delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | Z @@ -587,7 +587,7 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | W (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- datanode_1 | 2 | Y1 diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 83e4027c..0b96a0c6 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -2168,9 +2168,9 @@ drop table lala2; drop table lala3; \c - mls_admin -select * from pg_cls_table; +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); -select * from pg_cls_table; +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; --everything is done \c - godlike diff --git a/src/test/regress/sql/namespace.sql b/src/test/regress/sql/namespace.sql index ade2e5e3..77444aea 100644 --- a/src/test/regress/sql/namespace.sql +++ b/src/test/regress/sql/namespace.sql @@ -42,3 +42,31 @@ DROP SCHEMA test_schema_renamed CASCADE; -- verify that the objects were dropped SELECT COUNT(*) FROM pg_class WHERE relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'test_schema_renamed'); + + +CREATE SCHEMA test_schema_2 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +CREATE SCHEMA test_schema_3; +CREATE SCHEMA test_schema_4 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); + +INSERT INTO test_schema_2.ab(b) VALUES(1); +INSERT INTO test_schema_2.ab(b) VALUES(2); +SELECT * FROM test_schema_2.ab ORDER BY a, b; + +INSERT INTO test_schema_3.ab(b) VALUES(3); +SELECT * FROM test_schema_3.ab ORDER BY a, b; + +INSERT INTO test_schema_4.ab(b) VALUES(4); +INSERT INTO test_schema_4.ab(b) VALUES(5); +SELECT * FROM test_schema_4.ab ORDER BY a, b; + +DROP SCHEMA test_schema_2 CASCADE; +DROP SCHEMA test_schema_3 CASCADE; +DROP SCHEMA test_schema_4 CASCADE; diff --git a/src/test/regress/sql/xl_limitations.sql b/src/test/regress/sql/xl_limitations.sql index a75bfcdb..3f9e7779 100644 --- a/src/test/regress/sql/xl_limitations.sql +++ b/src/test/regress/sql/xl_limitations.sql @@ -267,9 +267,9 @@ INSERT INTO xl_names("name", "name1")VALUES ('X', 'X1'); INSERT INTO xl_names("name", "name1")VALUES ('Y', 'Y1'); INSERT INTO xl_names("name", "name1")VALUES ('Z', 'Z1'); -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select 
xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; select xl_nodename_from_id(xc_node_id), * from xl_names order by name; @@ -282,8 +282,8 @@ update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; --testing correlated delete: delete from xl_t @@ -295,9 +295,9 @@ delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; drop table xl_t; drop table xl_t1; From 250e3deb58705a81ebd2d6e7cd715299cebbf81e Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 4 Jan 2021 11:35:26 +0800 Subject: [PATCH 165/578] jacky/feature/MemoryProtect_Tbase_v2.15.16 (merge request !13) Squash merge branch 'jacky/feature/MemoryProtect_Tbase_v2.15.16' into 'Tbase_v2.15.16' * modified according to codeview suggestions. * rafactor code * modified according to codereview suggestions. * fixed an index_insert error * modified guc para values * refactor code * rollback * modified for enable_buffer_mprotect * enable memory protection while doing command: make check * refactor code * fixed MLS bugs for enable_buffer_memory_protect * refactor code * fixed merged bugs. * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * enable_buffer_memory_protect * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * delete nouse head file. * enable_xlog_memory_protect = on, make check successfully. * fixed coredump for xlog memory protect. * enable_clog_memory_protect = on, make check successfully. * enable_tlog_memory_protect = on, make check successfully. * add GUC para: * fixed a error of cherry-pick. * convert blank to tab. * adjust code according to mr. * Fix format. * Bug fix in using mprotect. * add mlock before mprotect to shard memory as required. * Add memory protect for tlog. 
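
The mechanism behind enable_buffer_mprotect and the related clog/tlog/xlog
protection GUCs is to keep shared pages mapped read-only via mprotect() and to
lift the protection only around intended writes, so a stray write coredumps at
the offending instruction instead of silently corrupting a page. The sketch
below is illustrative only, assuming a POSIX mprotect(); the helper and
variable names are hypothetical and are not the functions added by this patch.

    /* minimal sketch of the read-only page guard technique (hypothetical names) */
    #include <sys/mman.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    static void
    buf_set_readonly(void *page, size_t len)
    {
        /* page must be aligned to the OS page size for mprotect() */
        if (mprotect(page, len, PROT_READ) != 0)
        {
            perror("mprotect(PROT_READ)");
            exit(1);
        }
    }

    static void
    buf_set_readwrite(void *page, size_t len)
    {
        if (mprotect(page, len, PROT_READ | PROT_WRITE) != 0)
        {
            perror("mprotect(PROT_READ|PROT_WRITE)");
            exit(1);
        }
    }

    int
    main(void)
    {
        size_t  pagesz = (size_t) sysconf(_SC_PAGESIZE);
        void   *page;

        /* simulate one shared buffer page, aligned to the OS page size */
        if (posix_memalign(&page, pagesz, pagesz) != 0)
            return 1;

        buf_set_readonly(page, pagesz);
        /* ((char *) page)[0] = 'x';  -- would SIGSEGV while read-only */
        buf_set_readwrite(page, pagesz);
        ((char *) page)[0] = 'x';     /* fine after lifting protection */
        free(page);
        return 0;
    }

This is also why the patch switches the index kill-item paths (gist, hash,
nbtree) to exclusive/write buffer locks when enable_buffer_mprotect is on:
those paths write the page (e.g. ItemIdMarkDead()), so the protection has to
be lifted under an exclusive lock before the store.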
--- src/backend/access/gist/gistget.c | 10 ++ src/backend/access/hash/hash.c | 15 ++ src/backend/access/hash/hashsearch.c | 23 +++ src/backend/access/nbtree/nbtinsert.c | 9 + src/backend/access/nbtree/nbtutils.c | 15 ++ src/backend/access/transam/clog.c | 4 + src/backend/access/transam/commit_ts.c | 12 +- src/backend/access/transam/lru.c | 106 ++++++++++- src/backend/access/transam/multixact.c | 2 + src/backend/access/transam/slru.c | 45 +++++ src/backend/access/transam/subtrans.c | 2 + src/backend/access/transam/xlog.c | 44 +++++ src/backend/commands/async.c | 2 + src/backend/commands/sequence.c | 6 +- src/backend/storage/buffer/buf_init.c | 13 ++ src/backend/storage/buffer/bufmgr.c | 109 ++++++++++- src/backend/storage/freespace/freespace.c | 11 +- src/backend/storage/freespace/fsmpage.c | 6 + src/backend/utils/misc/guc.c | 57 ++++++ src/backend/utils/misc/mls.c | 13 ++ src/backend/utils/misc/postgresql.conf.sample | 8 + src/backend/utils/time/tqual.c | 6 +- src/include/access/lru.h | 5 +- src/include/access/slru.h | 169 +++++++++--------- src/include/access/xlog.h | 3 + src/include/c.h | 1 + src/include/storage/buf_internals.h | 3 + src/include/utils/guc.h | 4 + src/test/regress/expected/oracle.out | 1 + src/test/regress/expected/sysviews.out | 6 +- .../regress/output/xc_notrans_block.source | 1 - .../regress/output/xc_notrans_block_1.source | 1 - 32 files changed, 608 insertions(+), 104 deletions(-) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 17494242..8a18e693 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -21,6 +21,7 @@ #include "pgstat.h" #include "lib/pairingheap.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -52,7 +53,16 @@ gistkillitems(IndexScanDesc scan) if (!BufferIsValid(buffer)) return; + if (enable_buffer_mprotect) + { + /* ItemIdMarkDead() will write pages */ + LockBuffer(buffer, GIST_EXCLUSIVE); + } + else + { LockBuffer(buffer, GIST_SHARE); + } + gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 636c21cf..7d04dd80 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -26,6 +26,7 @@ #include "miscadmin.h" #include "optimizer/plancat.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" #include "miscadmin.h" @@ -482,7 +483,14 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, */ if (so->numKilled > 0) { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } _hash_kill_items(scan); LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); } @@ -520,7 +528,14 @@ hashendscan(IndexScanDesc scan) */ if (so->numKilled > 0) { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } _hash_kill_items(scan); LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); } diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index cea3e835..e4ea31de 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -18,6 +18,7 @@ #include "access/relscan.h" #include "miscadmin.h" #include "pgstat.h" +#include "utils/guc.h" #include "utils/rel.h" @@ -467,7 +468,18 @@ 
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + _hash_kill_items(scan); + } + else + { _hash_kill_items(scan); + } + } /* * ran off the end of this page, try the next @@ -524,7 +536,18 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + _hash_kill_items(scan); + } + else + { _hash_kill_items(scan); + } + } /* * ran off the end of this page, try the next diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 65936677..4d102b2f 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -23,6 +23,7 @@ #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/guc.h" #include "utils/tqual.h" @@ -472,7 +473,15 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, for (;;) { nblkno = opaque->btpo_next; + if (enable_buffer_mprotect) + { + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_WRITE); + } + else + { nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + } + page = BufferGetPage(nbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index a2ee713b..a031db5b 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -22,6 +22,7 @@ #include "access/relscan.h" #include "miscadmin.h" #include "utils/array.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -1756,7 +1757,14 @@ _bt_killitems(IndexScanDesc scan) * re-use of any TID on the page, so there is no need to check the * LSN. */ + if (enable_buffer_mprotect) + { + LockBuffer(so->currPos.buf, BT_WRITE); + } + else + { LockBuffer(so->currPos.buf, BT_READ); + } page = BufferGetPage(so->currPos.buf); } @@ -1765,7 +1773,14 @@ _bt_killitems(IndexScanDesc scan) Buffer buf; /* Attempt to re-read the buffer, getting pin and lock. */ + if (enable_buffer_mprotect) + { + buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_WRITE); + } + else + { buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + } /* It might not exist anymore; in which case we can't hint it. */ if (!BufferIsValid(buf)) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 7cfbec69..98f1f7d1 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -365,7 +365,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i byteval = *byteptr; byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); byteval |= (status << bshift); + SlruClogDisableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); *byteptr = byteval; + SlruClogEnableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); /* * Update the group LSN if the transaction completion LSN is higher. 
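Note that with enable_clog_mprotect on, every status-bit update in TransactionIdSetStatusBit() above now brackets the single byte store with a SlruClogDisableMemoryProtection()/SlruClogEnableMemoryProtection() pair, i.e. two extra mprotect(2) system calls. A rough, hypothetical micro-benchmark (not from the patch; loop count and BLCKSZ are illustrative) gives a feel for that per-store overhead, which is presumably why the GUCs added later in this patch default to off outside _PG_REGRESS_ builds:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <time.h>

    #define BLCKSZ 8192
    #define LOOPS  100000L

    int
    main(void)
    {
        char           *page;
        struct timespec t0, t1;
        long            i;

        if (posix_memalign((void **) &page, BLCKSZ, BLCKSZ) != 0)
            return 1;
        memset(page, 0, BLCKSZ);
        mprotect(page, BLCKSZ, PROT_READ);                   /* start out guarded */

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < LOOPS; i++)
        {
            mprotect(page, BLCKSZ, PROT_READ | PROT_WRITE);  /* drop guard    */
            page[i % BLCKSZ] = (char) i;                     /* the real work */
            mprotect(page, BLCKSZ, PROT_READ);               /* raise guard   */
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("%.0f ns per guarded store\n",
               ((t1.tv_sec - t0.tv_sec) * 1e9 +
                (t1.tv_nsec - t0.tv_nsec)) / LOOPS);
        return 0;
    }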
@@ -570,9 +572,11 @@ TrimCLOG(void) byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; /* Zero so-far-unused positions in the current byte */ + SlruClogDisableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); *byteptr &= (1 << bshift) - 1; /* Zero the rest of the page */ MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + SlruClogEnableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); ClogCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 0a950156..e971e3c1 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -24,6 +24,9 @@ */ #include "postgres.h" +#include +#include + #include "access/commit_ts.h" #include "access/htup_details.h" #include "access/lru.h" @@ -330,9 +333,11 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz gts, TimestampTz ts, entry.time = ts; entry.nodeid = nodeid; + LruTlogDisableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, &entry, SizeOfCommitTimestampEntry); + LruTlogEnableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); #ifdef __TBASE__ /* @@ -868,7 +873,7 @@ TrimCommitTs(void) CommitTsCtl->global_shared->latest_page_number = pageno; LWLockRelease(CommitTsControlLock); - elog(LOG, "Trim committs next xid %d latest page number %d entryno %d", xid, pageno, entryno); + elog(DEBUG10, "Trim committs next xid %d latest page number %d entryno %d", xid, pageno, entryno); /* @@ -902,9 +907,12 @@ TrimCommitTs(void) byteptr = CommitTsCtl->shared[partitionno]->page_buffer[slotno] + byteno; + LruTlogDisableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); /* Zero the rest of the page */ MemSet(byteptr, 0, BLCKSZ - byteno); - elog(LOG, "zero out the remaining page starting from byteno %d len BLCKSZ -byteno %d entryno %d sizeofentry %lu", + LruTlogEnableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); + + elog(DEBUG10, "zero out the remaining page starting from byteno %d len BLCKSZ -byteno %d entryno %d sizeofentry %lu", byteno, BLCKSZ - byteno, entryno, SizeOfCommitTimestampEntry); CommitTsCtl->shared[partitionno]->page_dirty[slotno] = true; diff --git a/src/backend/access/transam/lru.c b/src/backend/access/transam/lru.c index 6276f8d7..0c772617 100644 --- a/src/backend/access/transam/lru.c +++ b/src/backend/access/transam/lru.c @@ -109,6 +109,7 @@ #include #include +#include #include #include "access/lru.h" @@ -117,6 +118,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "utils/guc.h" #include "miscadmin.h" @@ -184,6 +186,9 @@ static LruErrorCause lru_errcause; static int lru_errno; +bool enable_tlog_mprotect = false; + + static void LruZeroLSNs(LruCtl ctl, int partitionno, int slotno); static void LruWaitIO(LruCtl ctl, int partitionno, int slotno); static void LruInternalWritePage(LruCtl ctl, int partitionno, int slotno, LruFlushPt fdata); @@ -197,6 +202,71 @@ static bool LruScanDirCbDeleteCutoff(LruCtl ctl, char *filename, int segpage, void *data); static void LruInternalDeleteSegment(LruCtl ctl, char *filename); +/* + * Set a page [ptr, ptr + BLCKSZ - 1] with read only constraint. + * + * Coredump when being written without setting it writable. 
+ */ +void +SetPageReadOnly(char *address) +{ + /* prevent wild pointer */ + if (((uint64) address) % BLCKSZ != 0) + { + elog(PANIC, "address %p is not aligned with page", address); + } + + /* set page read only with syscall */ + if (mprotect(address, BLCKSZ, PROT_READ) != 0) + { + elog(PANIC, "mprotect failed %s at %p", strerror(errno), address); + } +} + +/* + * Set a page [ptr, ptr + BLCKSZ - 1] with writable attribute + * which cooperates with SetPageReadOnly. + */ +void +SetPageReadWrite(char *address) +{ + /* prevent wild pointer */ + if (((uint64) address) % BLCKSZ != 0) + { + elog(PANIC, "address %p is not aligned with page", address); + } + + /* set page read write with syscall */ + if (mprotect(address, BLCKSZ, PROT_WRITE | PROT_READ) != 0) + { + elog(PANIC, "mprotect failed %s at %p", strerror(errno), address); + } +} + +/* + * enable tlog memory protection + */ +inline void +LruTlogEnableMemoryProtection(char *address) +{ + if (enable_tlog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable tlog memory protection + */ +inline void +LruTlogDisableMemoryProtection(char *address) +{ + if (enable_tlog_mprotect) + { + SetPageReadWrite(address); + } +} + /* * Initialization of shared memory */ @@ -218,6 +288,12 @@ LruShmemSize(int nslots, int nlsns) if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + if (enable_tlog_mprotect) + { + /* add BLCKSZ for memory protect */ + return BUFFERALIGN(sz) + BLCKSZ + BLCKSZ * nslots; + } + return BUFFERALIGN(sz) + BLCKSZ * nslots; } @@ -397,8 +473,12 @@ LruInit(LruCtl ctl, const char *name, int nslots, int nlsns, int nbufs, global_shared->ControlLock = ctllock; global_shared->latest_page_number = 0; - }else + } + else + { Assert(found); + } + ctl->global_shared = global_shared; for(partitionno = 0; partitionno < NUM_PARTITIONS; partitionno++){ snprintf(full_name, 64, "%s:%d", name, partitionno); @@ -451,13 +531,21 @@ LruInit(LruCtl ctl, const char *name, int nslots, int nlsns, int nbufs, strlcpy(shared->lwlock_tranche_name, name, LRU_MAX_NAME_LENGTH); shared->lwlock_tranche_id = tranche_id; - ptr += BUFFERALIGN(offset); + if (enable_tlog_mprotect) + { + ptr = (char *) BLOCKALIGN(ptr); + } for (slotno = 0; slotno < nslots; slotno++) { LWLockInitialize(&shared->buffer_locks[slotno].lock, shared->lwlock_tranche_id); + if (enable_tlog_mprotect) + { + /* protect page */ + SetPageReadOnly(ptr); + } shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = LRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; @@ -531,8 +619,10 @@ LruZeroPage(LruCtl ctl, int partitionno, int pageno) shared->page_dirty[slotno] = true; LruRecentlyUsed(shared, slotno); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); /* Set the buffer to zeroes */ MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); /* Set the LSNs for this new page to zero */ LruZeroLSNs(ctl, partitionno, slotno); @@ -1056,7 +1146,9 @@ LruPhysicalReadPage(LruCtl ctl, int partitionno, int pageno, int slotno) ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); return true; } @@ -1070,16 +1162,20 @@ LruPhysicalReadPage(LruCtl ctl, int partitionno, int pageno, int slotno) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); 
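    /*
     * The page image has to be writable while read(2) fills it; the
     * read-only guard is put back on both the error path inside the
     * if-block and on the success path right after it.
     */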
if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { - elog(ERROR, "read fails path %s partitionno %d slotno %d pageno %d ", - path, partitionno, slotno, pageno); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); lru_errcause = LRU_READ_FAILED; lru_errno = errno; CloseTransientFile(fd); - return false; + elog(ERROR, "read fails path %s partitionno %d slotno %d pageno %d ", + path, partitionno, slotno, pageno); } + + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); + pgstat_report_wait_end(); if (CloseTransientFile(fd)) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 1b2b7694..264384b4 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -2044,7 +2044,9 @@ TrimMultiXact(void) offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; + SlruClogDisableMemoryProtection(MultiXactOffsetCtl->shared->page_buffer[slotno]); MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + SlruClogEnableMemoryProtection(MultiXactOffsetCtl->shared->page_buffer[slotno]); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 3d3c8c84..acf0562d 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -51,6 +51,7 @@ #include #include +#include "access/lru.h" #include "access/slru.h" #include "access/transam.h" #include "access/xlog.h" @@ -123,6 +124,7 @@ typedef enum static SlruErrorCause slru_errcause; static int slru_errno; +bool enable_clog_mprotect = false; static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); @@ -158,6 +160,12 @@ SimpleLruShmemSize(int nslots, int nlsns) if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + if (enable_clog_mprotect) + { + /* add BLCKSZ for memory protect */ + return BUFFERALIGN(sz) + BLCKSZ + BLCKSZ * nslots; + } + return BUFFERALIGN(sz) + BLCKSZ * nslots; } @@ -220,11 +228,16 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, shared->lwlock_tranche_id = tranche_id; ptr += BUFFERALIGN(offset); + if (enable_clog_mprotect) + { + ptr = (char *) BLOCKALIGN(ptr); + } for (slotno = 0; slotno < nslots; slotno++) { LWLockInitialize(&shared->buffer_locks[slotno].lock, shared->lwlock_tranche_id); + SlruClogEnableMemoryProtection(ptr); shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = SLRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; @@ -278,8 +291,10 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) shared->page_dirty[slotno] = true; SlruRecentlyUsed(shared, slotno); + SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); /* Set the buffer to zeroes */ MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); /* Set the LSNs for this new page to zero */ SimpleLruZeroLSNs(ctl, slotno); @@ -681,7 +696,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); + SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); return true; } @@ -695,14 +712,18 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + 
SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } + + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); if (CloseTransientFile(fd)) @@ -1420,3 +1441,27 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) return retval; } + +/* + * enable clog memory protection + */ +inline void +SlruClogEnableMemoryProtection(char *address) +{ + if (enable_clog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable clog memory protection + */ +inline void +SlruClogDisableMemoryProtection(char *address) +{ + if (enable_clog_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 93051191..4180f71c 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -95,7 +95,9 @@ SubTransSetParent(TransactionId xid, TransactionId parent) if (*ptr != parent) { Assert(*ptr == InvalidTransactionId); + SlruClogDisableMemoryProtection(SubTransCtl->shared->page_buffer[slotno]); *ptr = parent; + SlruClogEnableMemoryProtection(SubTransCtl->shared->page_buffer[slotno]); SubTransCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 14e26640..405e9bac 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -24,6 +24,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/lru.h" #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/subtrans.h" @@ -245,6 +246,8 @@ bool InRecovery = false; /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */ HotStandbyState standbyState = STANDBY_DISABLED; +bool enable_xlog_mprotect = false; + static XLogRecPtr LastRec; /* Local copy of WalRcv->receivedUpto */ @@ -1575,7 +1578,9 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, * Write what fits on this page, and continue on the next page. 
*/ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); memcpy(currpos, rdata_data, freespace); + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); rdata_data += freespace; rdata_len -= freespace; written += freespace; @@ -1592,8 +1597,10 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, */ currpos = GetXLogBuffer(CurrPos); pagehdr = (XLogPageHeader) currpos; + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); pagehdr->xlp_rem_len = write_len - written; pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); /* skip over the page header */ if (CurrPos % XLogSegSize == 0) @@ -1623,7 +1630,9 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, #endif Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); memcpy(currpos, rdata_data, rdata_len); + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); currpos += rdata_len; CurrPos += rdata_len; freespace -= rdata_len; @@ -2247,6 +2256,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) * Be sure to re-zero the buffer so that bytes beyond what we've * written will look like zeroes and not valid XLOG records... */ + XlogDisableMemoryProtection((char *) NewPage); MemSet((char *) NewPage, 0, XLOG_BLCKSZ); /* @@ -2289,6 +2299,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) NewPage->xlp_info |= XLP_LONG_HEADER; } + XlogEnableMemoryProtection((char *) NewPage); + /* * Make sure the initialization of the page becomes visible to others * before the xlblocks update. GetXLogBuffer() reads xlblocks without @@ -5067,6 +5079,13 @@ XLOGShmemInit(void) allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); XLogCtl->pages = allocptr; memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + if (enable_xlog_mprotect) + { + for (i = 0; i < XLOGbuffers; i++) + { + XlogEnableMemoryProtection(XLogCtl->pages + XLOG_BLCKSZ * i); + } + } /* * Do basic initialization of XLogCtl shared data. 
(StartupXLOG will fill @@ -7968,8 +7987,10 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; + XlogDisableMemoryProtection(page); memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); + XlogEnableMemoryProtection(page); XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; @@ -13267,3 +13288,26 @@ void wal_reset_stream(void) #endif +/* + * enable xlog memory protection + */ +inline void +XlogEnableMemoryProtection(char *address) +{ + if (enable_xlog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable xlog memory protection + */ +inline void +XlogDisableMemoryProtection(char *address) +{ + if (enable_xlog_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index c8fa6541..517f1262 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -1374,9 +1374,11 @@ asyncQueueAddEntries(ListCell *nextNotify) } /* Now copy qe into the shared buffer page */ + SlruClogDisableMemoryProtection(AsyncCtl->shared->page_buffer[slotno]); memcpy(AsyncCtl->shared->page_buffer[slotno] + offset, &qe, qe.length); + SlruClogEnableMemoryProtection(AsyncCtl->shared->page_buffer[slotno]); /* Advance queue_head appropriately, and detect if page is full */ if (asyncQueueAdvance(&(queue_head), qe.length)) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 2557db35..3d522795 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -490,16 +490,14 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) buf = ReadBuffer(rel, P_NEW); Assert(BufferGetBlockNumber(buf) == 0); + /* Now insert sequence tuple */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; - /* Now insert sequence tuple */ - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. 
Otherwise it would become diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 88b2dde4..66d18518 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -16,6 +16,7 @@ #include "storage/bufmgr.h" #include "storage/buf_internals.h" +#include "utils/guc.h" BufferDescPadded *BufferDescriptors; @@ -80,9 +81,19 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), &foundDescs); + if (enable_buffer_mprotect) + { + BufferBlocks = (char *) + ShmemInitStruct("Buffer Blocks", + NBuffers * (Size) BLCKSZ + BLCKSZ, &foundBufs); + BufferBlocks = (char *) BLOCKALIGN(BufferBlocks); + } + else + { BufferBlocks = (char *) ShmemInitStruct("Buffer Blocks", NBuffers * (Size) BLCKSZ, &foundBufs); + } /* Align lwlocks to cacheline boundary */ BufferIOLWLockArray = (LWLockMinimallyPadded *) @@ -135,6 +146,8 @@ InitBufferPool(void) */ buf->freeNext = i + 1; + BufEnableMemoryProtection(BufferBlocks + i * BLCKSZ, false); + LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5c2f3bd2..56276b4a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -34,6 +34,7 @@ #include #include +#include "access/lru.h" #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/storage.h" @@ -125,6 +126,8 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; +bool enable_buffer_mprotect = false; + /* * How many buffers PrefetchBuffer callers should try to stay ahead of their * ReadBuffer calls by. This is maintained by the assign hook for @@ -818,8 +821,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (!isLocalBuf) { if (mode == RBM_ZERO_AND_LOCK) + { + BufDisableMemoryProtection(BufferGetPage( + BufferDescriptorGetBuffer(bufHdr)), isLocalBuf); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + } else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); } @@ -899,7 +906,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (isExtend) { /* new buffers are zero-filled */ + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); + /* don't set checksum for all-zero page */ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); @@ -917,7 +927,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * just wants us to allocate a buffer. 
*/ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); + } else { instr_time io_start, @@ -926,7 +940,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); + BufDisableMemoryProtection(bufBlock, isLocalBuf); smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + BufEnableMemoryProtection(bufBlock, isLocalBuf); if (track_io_timing) { @@ -944,7 +960,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, { if (algo_id == smgr->smgr_relcrypt.algo_id) { + BufDisableMemoryProtection(bufBlock, isLocalBuf); rel_crypt_page_decrypt(&(smgr->smgr_relcrypt), (Page)bufBlock); + BufEnableMemoryProtection(bufBlock, isLocalBuf); } else { @@ -967,7 +985,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, errmsg("invalid page in block %u of relation %s; zeroing out page", blockNum, relpath(smgr->smgr_rnode, forkNum)))); + + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); } else { @@ -995,6 +1016,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && !isLocalBuf) { + BufDisableMemoryProtection(BufferGetPage( + BufferDescriptorGetBuffer(bufHdr)), isLocalBuf); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); } @@ -1180,8 +1203,26 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * happens to be trying to split the page the first one got from * StrategyGetBuffer.) 
*/ - if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), - LW_SHARED)) + bool ret = false; + + if (enable_buffer_mprotect) + { + /* Encrypting buffer needs LW_EXCLUSIVE */ + ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE); + if (ret) + { + BufDisableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); + } + } + else + { + ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED); + } + + if (ret) { /* * If using a nondefault strategy, and writing the buffer @@ -1202,6 +1243,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (XLogNeedsFlush(lsn) && StrategyRejectBuffer(strategy, buf)) { + BufEnableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); /* Drop lock/pin and loop around for another buffer */ LWLockRelease(BufferDescriptorGetContentLock(buf)); UnpinBuffer(buf, true); @@ -1216,6 +1259,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgr->smgr_rnode.node.relNode); FlushBuffer(buf, NULL); + BufEnableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); LWLockRelease(BufferDescriptorGetContentLock(buf)); ScheduleBufferTagForWriteback(&BackendWritebackContext, @@ -3844,14 +3889,29 @@ LockBuffer(Buffer buffer, int mode) buf = GetBufferDescriptor(buffer - 1); if (mode == BUFFER_LOCK_UNLOCK) + { + if (enable_buffer_mprotect && + LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE)) + { + BufEnableMemoryProtection(BufferGetPage(buffer), false); + } + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } else if (mode == BUFFER_LOCK_SHARE) + { LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); + } else if (mode == BUFFER_LOCK_EXCLUSIVE) + { + BufDisableMemoryProtection(BufferGetPage(buffer), false); LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + } else + { elog(ERROR, "unrecognized buffer lock mode: %d", mode); } +} /* * Acquire the content_lock for the buffer, but only if we don't have to wait. @@ -3867,6 +3927,7 @@ ConditionalLockBuffer(Buffer buffer) if (BufferIsLocal(buffer)) return true; /* act as though we got it */ + BufDisableMemoryProtection(BufferGetPage(buffer), false); buf = GetBufferDescriptor(buffer - 1); return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), @@ -4800,7 +4861,16 @@ static int SyncBufferPrePhase1(int buf_id) * try to lock the buffer, returning false means other process(start or backend) having lock the buffer in LW_EXCLUSIVE, * so, we skip this buffer, it would be treated in next checkpoint round. 
*/ - ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + if (enable_buffer_mprotect && !BufferIsLocal(buf_id) && BufferIsValid(buf_id)) + { + ret = LWLockConditionalAcquire( + BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + } + else + { + ret = LWLockConditionalAcquire( + BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + } if (false == ret) { return SYNC_BUF_LWLOCK_CONFLICT; @@ -5052,7 +5122,6 @@ static List* SyncBufferPostPhase1(List * buf_id_list, WritebackContext *wb_conte info = (SyncBufIdInfo *) lfirst(l); buf = GetBufferDescriptor(info->buf_id); - LWLockRelease(BufferDescriptorGetContentLock(buf)); tag = buf->tag; @@ -5234,4 +5303,36 @@ char * BufHdrGetBlockFunc(BufferDesc * buf) #endif +/* + * enable buffer memory protection + */ +inline void +BufEnableMemoryProtection(char *address, bool localbuffer) +{ + if (localbuffer) + { + return; + } + + if (enable_buffer_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable buffer memory protection + */ +inline void +BufDisableMemoryProtection(char *address, bool localbuffer) +{ + if (localbuffer) + { + return; + } + if (enable_buffer_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 21e16a02..a08b4a7d 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -31,6 +31,7 @@ #include "storage/extentmapping.h" #include "storage/lmgr.h" #include "storage/smgr.h" +#include "utils/guc.h" /* @@ -779,7 +780,6 @@ fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); - if (fsm_set_avail_extent(page, slot, newValue, &root_modified, &new_root, &old_root)) MarkBufferDirtyHint(buf, false); @@ -978,7 +978,16 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) * pages, increasing the chances that a later vacuum can truncate the * relation. */ + if (enable_buffer_mprotect) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + { + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + } ReleaseBuffer(buf); diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index 4ec1dcaa..62868bf7 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -24,6 +24,7 @@ #include "storage/bufmgr.h" #include "storage/fsm_internals.h" +#include "utils/guc.h" /* Macros to navigate the tree within a page. Root has index zero. */ #define leftchild(x) (2 * (x) + 1) @@ -325,6 +326,11 @@ fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, * * Wrap-around is handled at the beginning of this function. */ + if (enable_buffer_mprotect && !exclusive_lock_held) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + } fsmpage->fp_next_slot = slot + (advancenext ? 
1 : 0); return slot; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2260027e..97a0b26f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -39,6 +39,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" #include "access/heapam_xlog.h" +#include "access/lru.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" #include "commands/async.h" @@ -2698,6 +2699,62 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_buffer_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for share buffer"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_buffer_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_clog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for clog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_clog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_tlog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for tlog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_tlog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_xlog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for xlog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_xlog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether print cold hot router."), NULL diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 8902bcaf..014e81b2 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -717,6 +717,8 @@ static void* mls_crypt_worker(void * input) for (;;) { + bool need_mprotect = false; + if (false == g_crypt_parellel_main_running) { break; @@ -747,7 +749,18 @@ static void* mls_crypt_worker(void * input) buf_need_encrypt = page_new + BLCKSZ; /* 2.2 do the encrypt */ + need_mprotect = enable_buffer_mprotect && + !BufferIsLocal(encrypt_element.buf_id) && + BufferIsValid(encrypt_element.buf_id); + if (need_mprotect) + { + BufDisableMemoryProtection(buf, false); + } ret = rel_crypt_page_encrypting_parellel(encrypt_element.algo_id, buf, buf_need_encrypt, page_new, encrypt_element.cryptkey, workerid); + if (need_mprotect) + { + BufEnableMemoryProtection(buf, false); + } /* 3. put it to crypted queue */ while (QueueIsFull(crypted_queue)) diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e5cb868a..12d19740 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -485,6 +485,14 @@ #update_process_title = on +# - Memory Protection - + +#enable_buffer_mprotect = off +#enable_clog_mprotect = off +#enable_tlog_mprotect = off +#enable_xlog_mprotect = off + + # - Maintain GTS - #gts_maintain_option = 0 # range: 0-2. the default is 0. 
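The tqual.c hunk that follows is a consequence of the buffer guard: hint bits are normally set while the caller holds only a shared content lock (or just a pin), but a share-locked page stays read-only under enable_buffer_mprotect, so SetHintBits() has to release the lock and re-take it in exclusive mode before it may write. The sketch below (illustrative only, plain pthreads rather than the buffer manager; all names are made up) models that drop-and-reacquire pattern and its main caveat: it is not an atomic upgrade, so the decision made under the shared lock should be re-checked once the exclusive lock is held.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t content_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int hint_bit = 0;              /* stand-in for a hint bit on a page */

    static void
    set_hint(int value)
    {
        pthread_rwlock_rdlock(&content_lock);
        if (hint_bit != value)            /* decided under the shared lock */
        {
            /* page is read-only while share-locked: upgrade before writing */
            pthread_rwlock_unlock(&content_lock);
            pthread_rwlock_wrlock(&content_lock);
            if (hint_bit != value)        /* re-check: others ran in the gap */
                hint_bit = value;         /* guarded write */
        }
        pthread_rwlock_unlock(&content_lock);
    }

    int
    main(void)
    {
        set_hint(1);
        printf("hint_bit = %d\n", hint_bit);
        return 0;
    }

(Compile with cc -pthread.) The exclusive-lock path of LockBuffer() in the bufmgr.c hunk above also calls BufDisableMemoryProtection(), so the page is writable again by the time the store happens.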
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 9fa4ad40..247cabfe 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -256,7 +256,11 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - + if (enable_buffer_mprotect) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } if(infomask & HEAP_XMIN_COMMITTED) { diff --git a/src/include/access/lru.h b/src/include/access/lru.h index 01a05dd6..7d9980a3 100644 --- a/src/include/access/lru.h +++ b/src/include/access/lru.h @@ -215,7 +215,6 @@ typedef struct LruCtlData typedef LruCtlData *LruCtl; - #define PARTITION_LOCK_IDX(shared) ((shared)->num_slots) extern Size LruShmemSize(int nslots, int nlsns); @@ -244,6 +243,10 @@ extern bool LruScanDirCbReportPresence(LruCtl ctl, char *filename, int segpage, void *data); extern bool LruScanDirCbDeleteAll(LruCtl ctl, char *filename, int segpage, void *data); +extern void LruTlogEnableMemoryProtection(char *address); +extern void LruTlogDisableMemoryProtection(char *address); +extern void SetPageReadOnly(char *address); +extern void SetPageReadWrite(char *address); #endif /* SLRU_H */ diff --git a/src/include/access/slru.h b/src/include/access/slru.h index cfc559d0..3a38460b 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * slru.h - * Simple LRU buffering for transaction status logfiles + * Simple LRU buffering for transaction status logfiles * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -30,10 +30,10 @@ * take no explicit notice of that fact in slru.c, except when comparing * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). */ -#define SLRU_PAGES_PER_SEGMENT 32 +#define SLRU_PAGES_PER_SEGMENT 32 /* Maximum length of an SLRU name */ -#define SLRU_MAX_NAME_LENGTH 32 +#define SLRU_MAX_NAME_LENGTH 32 /* * Page status codes. Note that these do not include the "dirty" bit. @@ -43,10 +43,10 @@ */ typedef enum { - SLRU_PAGE_EMPTY, /* buffer is not in use */ - SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ - SLRU_PAGE_VALID, /* page is valid and not being written */ - SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ + SLRU_PAGE_EMPTY, /* buffer is not in use */ + SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ + SLRU_PAGE_VALID, /* page is valid and not being written */ + SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ } SlruPageStatus; /* @@ -54,54 +54,54 @@ typedef enum */ typedef struct SlruSharedData { - LWLock *ControlLock; - - /* Number of buffers managed by this SLRU structure */ - int num_slots; - - /* - * Arrays holding info for each buffer slot. Page number is undefined - * when status is EMPTY, as is page_lru_count. - */ - char **page_buffer; - SlruPageStatus *page_status; - bool *page_dirty; - int *page_number; - int *page_lru_count; - - /* - * Optional array of WAL flush LSNs associated with entries in the SLRU - * pages. If not zero/NULL, we must flush WAL before writing pages (true - * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] - * has lsn_groups_per_page entries per buffer slot, each containing the - * highest LSN known for a contiguous group of SLRU entries on that slot's - * page. 
- */ - XLogRecPtr *group_lsn; - int lsn_groups_per_page; - - /*---------- - * We mark a page "most recently used" by setting - * page_lru_count[slotno] = ++cur_lru_count; - * The oldest page is therefore the one with the highest value of - * cur_lru_count - page_lru_count[slotno] - * The counts will eventually wrap around, but this calculation still - * works as long as no page's age exceeds INT_MAX counts. - *---------- - */ - int cur_lru_count; - - /* - * latest_page_number is the page number of the current end of the log; - * this is not critical data, since we use it only to avoid swapping out - * the latest page. - */ - int latest_page_number; - - /* LWLocks */ - int lwlock_tranche_id; - char lwlock_tranche_name[SLRU_MAX_NAME_LENGTH]; - LWLockPadded *buffer_locks; + LWLock *ControlLock; + + /* Number of buffers managed by this SLRU structure */ + int num_slots; + + /* + * Arrays holding info for each buffer slot. Page number is undefined + * when status is EMPTY, as is page_lru_count. + */ + char **page_buffer; + SlruPageStatus *page_status; + bool *page_dirty; + int *page_number; + int *page_lru_count; + + /* + * Optional array of WAL flush LSNs associated with entries in the SLRU + * pages. If not zero/NULL, we must flush WAL before writing pages (true + * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] + * has lsn_groups_per_page entries per buffer slot, each containing the + * highest LSN known for a contiguous group of SLRU entries on that slot's + * page. + */ + XLogRecPtr *group_lsn; + int lsn_groups_per_page; + + /*---------- + * We mark a page "most recently used" by setting + * page_lru_count[slotno] = ++cur_lru_count; + * The oldest page is therefore the one with the highest value of + * cur_lru_count - page_lru_count[slotno] + * The counts will eventually wrap around, but this calculation still + * works as long as no page's age exceeds INT_MAX counts. + *---------- + */ + int cur_lru_count; + + /* + * latest_page_number is the page number of the current end of the log; + * this is not critical data, since we use it only to avoid swapping out + * the latest page. + */ + int latest_page_number; + + /* LWLocks */ + int lwlock_tranche_id; + char lwlock_tranche_name[SLRU_MAX_NAME_LENGTH]; + LWLockPadded *buffer_locks; } SlruSharedData; typedef SlruSharedData *SlruShared; @@ -112,26 +112,26 @@ typedef SlruSharedData *SlruShared; */ typedef struct SlruCtlData { - SlruShared shared; - - /* - * This flag tells whether to fsync writes (true for pg_xact and multixact - * stuff, false for pg_subtrans and pg_notify). - */ - bool do_fsync; - - /* - * Decide which of two page numbers is "older" for truncation purposes. We - * need to use comparison of TransactionIds here in order to do the right - * thing with wraparound XID arithmetic. - */ - bool (*PagePrecedes) (int, int); - - /* - * Dir is set during SimpleLruInit and does not change thereafter. Since - * it's always the same, it doesn't need to be in shared memory. - */ - char Dir[64]; + SlruShared shared; + + /* + * This flag tells whether to fsync writes (true for pg_xact and multixact + * stuff, false for pg_subtrans and pg_notify). + */ + bool do_fsync; + + /* + * Decide which of two page numbers is "older" for truncation purposes. We + * need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. + */ + bool (*PagePrecedes) (int, int); + + /* + * Dir is set during SimpleLruInit and does not change thereafter. 
Since + * it's always the same, it doesn't need to be in shared memory. + */ + char Dir[64]; } SlruCtlData; typedef SlruCtlData *SlruCtl; @@ -139,26 +139,29 @@ typedef SlruCtlData *SlruCtl; extern Size SimpleLruShmemSize(int nslots, int nlsns); extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id); -extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); + LWLock *ctllock, const char *subdir, int tranche_id); +extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, - TransactionId xid); + TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, - TransactionId xid); + TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, - void *data); + void *data); extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern void SlruDeleteSegment(SlruCtl ctl, int segno); /* SlruScanDirectory public callbacks */ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, - int segpage, void *data); + int segpage, void *data); extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, - void *data); + void *data); -#endif /* SLRU_H */ +extern void SlruClogEnableMemoryProtection(char *address); +extern void SlruClogDisableMemoryProtection(char *address); + +#endif /* SLRU_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 9dfcd122..a0db442b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -289,6 +289,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); extern void RemovePromoteSignalFiles(void); +extern void XlogEnableMemoryProtection(char *address); +extern void XlogDisableMemoryProtection(char *address); + extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); extern void SetWalWriterSleeping(bool sleeping); diff --git a/src/include/c.h b/src/include/c.h index d4a4033d..9e18db2b 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -651,6 +651,7 @@ typedef NameData *Name; #define MAXALIGN(LEN) TYPEALIGN(MAXIMUM_ALIGNOF, (LEN)) /* MAXALIGN covers only built-in types, not buffers */ #define BUFFERALIGN(LEN) TYPEALIGN(ALIGNOF_BUFFER, (LEN)) +#define BLOCKALIGN(LEN) TYPEALIGN(BLCKSZ, (LEN)) #define CACHELINEALIGN(LEN) TYPEALIGN(PG_CACHE_LINE_SIZE, (LEN)) #define TYPEALIGN_DOWN(ALIGNVAL,LEN) \ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index d4afa5d7..6eae2368 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -412,4 +412,7 @@ extern void AtEOXact_LocalBuffers(bool isCommit); #ifdef _MLS_ extern char * BufHdrGetBlockFunc(BufferDesc *buf); #endif + +extern void BufEnableMemoryProtection(char *address, bool localbuffer); +extern void BufDisableMemoryProtection(char *address, bool localbuffer); #endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index ac00ae2d..5df24178 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -269,6 +269,10 @@ extern int32 g_TransferSpeed; /* slicent copy from */ extern bool 
g_enable_copy_silence; extern bool g_enable_user_authority_force_check; +extern bool enable_buffer_mprotect; +extern bool enable_clog_mprotect; +extern bool enable_tlog_mprotect; +extern bool enable_xlog_mprotect; extern int query_delay; #endif extern int log_min_error_statement; diff --git a/src/test/regress/expected/oracle.out b/src/test/regress/expected/oracle.out index 9a08138a..8cc3ca0d 100644 --- a/src/test/regress/expected/oracle.out +++ b/src/test/regress/expected/oracle.out @@ -2280,6 +2280,7 @@ ERROR: failed to set the requested LC_COLLATE value [Nls_sortR = tt_RU.utf8@iq drop table test_sort; \c postgres DROP DATABASE IF EXISTS regression_sort; +ERROR: database "regression_sort" is being accessed by other users SET client_encoding to default; -- test !=- operator set enable_oracle_compatible = off; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 7f805e93..b0bd7fb4 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -76,7 +76,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_audit_warning | off enable_auditlogger_warning | off enable_bitmapscan | on + enable_buffer_mprotect | on enable_check_password | off + enable_clog_mprotect | on enable_cls | on enable_cold_hot_router_print | off enable_cold_hot_visible | off @@ -127,9 +129,11 @@ select name, setting from pg_settings where name like 'enable%'; enable_statistic | on enable_subquery_shipping | on enable_tidscan | on + enable_tlog_mprotect | on enable_transparent_crypt | on enable_user_authority_force_check | off -(57 rows) + enable_xlog_mprotect | on +(61 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/output/xc_notrans_block.source b/src/test/regress/output/xc_notrans_block.source index fc060427..388b5081 100644 --- a/src/test/regress/output/xc_notrans_block.source +++ b/src/test/regress/output/xc_notrans_block.source @@ -153,6 +153,5 @@ select * from xc_tab1 order by id; \c regression clean connection to all for database xc_db; drop database xc_db; -ERROR: database "xc_db" is being accessed by other users drop tablespace xc_testspace1; ERROR: tablespace "xc_testspace1" is not empty diff --git a/src/test/regress/output/xc_notrans_block_1.source b/src/test/regress/output/xc_notrans_block_1.source index 3e79eb70..574cdc77 100644 --- a/src/test/regress/output/xc_notrans_block_1.source +++ b/src/test/regress/output/xc_notrans_block_1.source @@ -156,5 +156,4 @@ select * from xc_tab1 order by id; \c regression clean connection to all for database xc_db; drop database xc_db; -ERROR: database "xc_db" is being accessed by other users drop tablespace xc_testspace1; From f3a496b6dfae6d7d6ccc280f8dd0a3ad44e98778 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Fri, 5 Feb 2021 21:07:15 +0800 Subject: [PATCH 166/578] jacky/feature/MemoryOptimization_Tbase_v2.15 (merge request !134) Squash merge branch 'jacky/feature/MemoryOptimization_Tbase_v2.15' into 'Tbase_v2.15' * modified addcording to suggestion. * Empty commit for add mr reviewer * fixed merged bug. * Merge branch 'Tbase_v2.15' into jacky/feature/MemoryOptimization_Tbase_v2.15 * refactor * delete debug code of catcache * refactor * delete debug code for relation LRU: RelCacheLogMemorySize is deleted. * delete debug code of locator * free Locator's memory * refator for relation LRU * delete debug code of memory details * add comment. 
* change the default value of pool_session_memory_limit t0 10 * support pool_session_memory_limit = -1 for forward compatibility. * fixed a warning * correct spelling errors * fixed bugs for number_replaced_relations * support show session_memory_size; * support number_replaced_relations * update max_relcache_relations * set enable_memory_optimization to on while doing regress test. * ignore system relation while do LRU replacing. * ignore unit * update conn->inCursor * ignore 5 bytes: * add condition of enable_memory_optimization * close enable_memory_optimization. * support enable_memory_optimization * fixed the bug for create table. * support replacing relations in RelationLRUInsert() * support create and drop table. * add RelationLRUInsert and RelationLRUDelete. * rename the name to number_replaced_relations * add cache memory optimization GUC para. * add pg_session_memory_detail() * change the unit of memory. * add tbase_memory_tools * add memory size log * add memory debug information. * delete PoolConnDeadtime * add the unit of session_memory_size * correct the unit of pool_session_memory_limit. * add debug info for memorycontext. --- contrib/Makefile | 1 + contrib/tbase_memory_tools/Makefile | 23 + .../tbase_memory_tools--1.0.sql | 29 + .../tbase_memory_tools/tbase_memory_tools.c | 277 +++++++ .../tbase_memory_tools.control | 5 + .../tbase_pooler_stat--1.0.sql | 1 - contrib/tbase_pooler_stat/tbase_pooler_stat.c | 8 +- src/backend/pgxc/locator/locator.c | 8 + src/backend/pgxc/pool/pgxcnode.c | 4 +- src/backend/pgxc/pool/poolmgr.c | 65 +- src/backend/utils/cache/catcache.c | 1 - src/backend/utils/cache/relcache.c | 105 ++- src/backend/utils/misc/guc.c | 46 +- src/backend/utils/misc/postgresql.conf.sample | 5 + src/include/utils/guc.h | 4 + src/include/utils/rel.h | 713 +++++++++--------- src/include/utils/relcache.h | 11 + src/test/regress/expected/sysviews.out | 3 +- 18 files changed, 916 insertions(+), 393 deletions(-) create mode 100644 contrib/tbase_memory_tools/Makefile create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools.c create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools.control diff --git a/contrib/Makefile b/contrib/Makefile index 494da1e1..22110f25 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -47,6 +47,7 @@ SUBDIRS = \ spi \ tablefunc \ tbase_gts_tools \ + tbase_memory_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/contrib/tbase_memory_tools/Makefile b/contrib/tbase_memory_tools/Makefile new file mode 100644 index 00000000..6fa3fbca --- /dev/null +++ b/contrib/tbase_memory_tools/Makefile @@ -0,0 +1,23 @@ +# contrib/tbase_memory_tools/Makefile +MODULES = tbase_memory_tools + +## 扩展名称; +EXTENSION = tbase_memory_tools + +## 扩展安装的SQL文件; +DATA = tbase_memory_tools--1.0.sql + +## 扩展描述; +PGFILEDESC = "tbase_memory_tools - memory wrapper for Tbase" + +### 以下为Pg构建扩展相关命令; +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) ## 环境变量参数加载; +else +subdir = contrib/tbase_memory_tools +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql b/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql new file mode 100644 index 00000000..80f9b8a3 --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql @@ -0,0 +1,29 @@ +/* contrib/tbase_memory/tbase_memory_tools--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "create EXTENSION tbase_memory_tools" to load this file. \quit + +-- +-- pg_node_memory_detail() +-- +CREATE FUNCTION pg_node_memory_detail( + OUT nodename text, + OUT pid int, + OUT memorytype text, + OUT memorykbytes int) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_node_memory_detail' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- pg_session_memory_detail() +-- +CREATE FUNCTION pg_session_memory_detail( + OUT contextname text, + OUT contextlevel int, + OUT parent text, + OUT totalsize int, + OUT freesize int) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_session_memory_detail' +LANGUAGE C STRICT PARALLEL SAFE; \ No newline at end of file diff --git a/contrib/tbase_memory_tools/tbase_memory_tools.c b/contrib/tbase_memory_tools/tbase_memory_tools.c new file mode 100644 index 00000000..d722e2a9 --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools.c @@ -0,0 +1,277 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include "catalog/namespace.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" +#include "utils/builtins.h" +#include "utils/elog.h" +#include "utils/memutils.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +#define LINUX_PAGE_SIZE 4096 +#define MAX_MEMORY_DETAIL 2048 + +typedef struct +{ + char *memory_context_name; + int level; + char *parent_name; + int parent_index; + long self_total_space; + long self_free_space; + long all_total_space; + long all_free_space; +} MemoryContextDetail; + +typedef struct +{ + int current; + int length; + MemoryContextDetail details[MAX_MEMORY_DETAIL]; +} SessionMemoryContexts; + +int get_memory_detail(MemoryContext mctx, + MemoryContext parent, + int level, + int ind_on_parent, + const int ind_on_stat, + SessionMemoryContexts *contexts); + + +/* + * pg_node_memory_detail + * + * node memory detail + */ +PG_FUNCTION_INFO_V1(pg_node_memory_detail); + +Datum +pg_node_memory_detail(PG_FUNCTION_ARGS) +{ + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use memory functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext mctx; + TupleDesc tupdesc; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + + fctx->max_calls = 1; + fctx->tuple_desc = tupdesc; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + + if (fctx->call_cntr < fctx->max_calls) + { + HeapTuple resultTuple; + Datum result; + Datum values[4]; + bool nulls[4]; + int64 size = 0; + Size totalPages = 0; + Size rssPages = 0; + Size sharePages = 0; + char file[MAXPGPATH] = {0}; + char buf[MAXPGPATH] = {0}; + FILE *handle = NULL; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + 
values[0] = CStringGetTextDatum(PGXCNodeName); + values[1] = Int64GetDatum(MyProcPid); + values[2] = CStringGetTextDatum("process_used_memory"); + + snprintf(file, MAXPGPATH, "/proc/%d/statm", MyProcPid); + handle = fopen(file, "r"); + if (handle != NULL && fgets(buf, MAXPGPATH, handle) > 0) + { + if (3 == sscanf(buf, "%lu %lu %lu", &totalPages, &rssPages, &sharePages)) + { + size = ((rssPages - sharePages) * LINUX_PAGE_SIZE) / 1024; + } + } + values[3] = Int64GetDatum(size); + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(fctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} + +/* + * pg_session_memory_detail + * + * session memory detail + */ +PG_FUNCTION_INFO_V1(pg_session_memory_detail); + +Datum +pg_session_memory_detail(PG_FUNCTION_ARGS) +{ + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use memory functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext mctx; + TupleDesc tupdesc; + SessionMemoryContexts *contexts; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + fctx->tuple_desc = tupdesc; + + contexts = (SessionMemoryContexts *) palloc0(sizeof(SessionMemoryContexts)); + contexts->current = 0; + contexts->length = 0; + contexts->details[0].memory_context_name = pstrdup("TopMemoryContext"); + contexts->details[0].level = 0; + contexts->details[0].parent_name = NULL; + contexts->details[0].parent_index = 0; + (void) get_memory_detail(TopMemoryContext, NULL, 0, 0, 0, contexts); + + fctx->user_fctx = contexts; + fctx->max_calls = contexts->length; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + + if (fctx->call_cntr < fctx->max_calls) + { + HeapTuple resultTuple; + Datum result; + Datum values[5]; + bool nulls[5]; + SessionMemoryContexts *contexts = fctx->user_fctx; + MemoryContextDetail *detail = &contexts->details[fctx->call_cntr]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = CStringGetTextDatum(detail->memory_context_name); + values[1] = Int64GetDatum(detail->level); + if (detail->parent_name == NULL) + { + nulls[2] = true; + } + else + { + values[2] = CStringGetTextDatum(detail->parent_name); + } + values[3] = Int64GetDatum(detail->all_total_space); + values[3] = Int64GetDatum(detail->all_free_space); + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(fctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} + +/* + * get memory details of self and children. + */ +int +get_memory_detail(MemoryContext mctx, + MemoryContext parent, + int level, + int ind_on_parent, + const int ind_on_stat, + SessionMemoryContexts *contexts) +{ + MemoryContext iter; + int child_index = 0; + int itr_indx_on_stat = 0; + int next_ind_on_stat = 0; + MemoryContextDetail *stat = NULL; + + if (ind_on_stat >= MAX_MEMORY_DETAIL) + { + elog(WARNING, "too many memory contexts!"); + return ind_on_stat; + } + + stat = &contexts->details[ind_on_stat]; + stat->memory_context_name = pstrdup(mctx->name); + stat->parent_name = parent ? 
pstrdup(parent->name) : NULL; + stat->parent_index = ind_on_parent; + stat->level = level; + stat->self_free_space = -1; + stat->self_total_space = -1; + if (IsA(mctx,AllocSetContext)) + { + AllocSetStats_Output(mctx, &stat->self_total_space, &stat->self_free_space); + stat->all_free_space = stat->self_free_space; + stat->all_total_space = stat->self_total_space; + } + + itr_indx_on_stat = ind_on_stat + 1; + contexts->length += 1; + child_index = 0; + iter = mctx->firstchild; + while (iter) + { + next_ind_on_stat = get_memory_detail(iter, mctx, level+1, child_index, itr_indx_on_stat, contexts); + iter = iter->nextchild; + + stat->all_free_space += contexts->details[itr_indx_on_stat].all_free_space; + stat->all_total_space += contexts->details[itr_indx_on_stat].all_total_space; + + itr_indx_on_stat = next_ind_on_stat; + + child_index++; + } + + return itr_indx_on_stat; +} diff --git a/contrib/tbase_memory_tools/tbase_memory_tools.control b/contrib/tbase_memory_tools/tbase_memory_tools.control new file mode 100644 index 00000000..0c1a3bbd --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools.control @@ -0,0 +1,5 @@ +# tbase_memory_tools extension +comment = 'memory wrapper for TBase' +default_version = '1.0' +module_pathname = '$libdir/tbase_memory_tools' +relocatable = true diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql index 5ee8e1e6..93b0be11 100644 --- a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql @@ -32,7 +32,6 @@ CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics( OUT warming_cnt int4, OUT query_cnt int4, OUT exceed_keepalive_cnt int4, - OUT exceed_deadtime_cnt int4, OUT exceed_maxlifetime_cnt int4 ) RETURNS SETOF record diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c index d85e5405..ac77982b 100644 --- a/contrib/tbase_pooler_stat/tbase_pooler_stat.c +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c @@ -182,7 +182,7 @@ tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS) Datum tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) { -#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12 +#define LIST_POOLER_CONN_STATISTICS_COLUMNS 11 FuncCallContext *funcctx = NULL; int32 ret = 0; Pooler_ConnState *status = NULL; @@ -222,9 +222,7 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_maxlifetime_cnt", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); @@ -279,7 +277,6 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; nulls[10] = true; - nulls[11] = true; } else { @@ -292,7 +289,6 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); - values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); status->node_cursor--; } diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 76c72d85..9ce0b468 100644 --- a/src/backend/pgxc/locator/locator.c +++ 
b/src/backend/pgxc/locator/locator.c @@ -915,6 +915,14 @@ FreeRelationLocInfo(RelationLocInfo *relationLocInfo) { if (relationLocInfo->partAttrName) pfree(relationLocInfo->partAttrName); + +#ifdef __COLD_HOT__ + if (relationLocInfo->secAttrName) + pfree(relationLocInfo->secAttrName); +#endif + + list_free(relationLocInfo->rl_nodeList); + pfree(relationLocInfo); } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index eb400fb4..5811f647 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5532,9 +5532,11 @@ PGXCNodeSendShowQuery(NODE_CONNECTION *conn, const char *sql_command) resStatus = PQresultStatus(result); if (resStatus == PGRES_TUPLES_OK || resStatus == PGRES_COMMAND_OK) { - snprintf(number, 128, "%s", PQgetvalue(result, 0, 0)); + /* ignore unit */ + snprintf(number, result->tuples[0][0].len, "%s", PQgetvalue(result, 0, 0)); } PQclear(result); + return number; } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 6efe045c..b83e273f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -83,7 +83,6 @@ int PoolMaintenanceTimeout = 30; int PoolSizeCheckGap = 120; /* max check memory size gap, in seconds */ int PoolConnMaxLifetime = 600; /* max lifetime of a pooled connection, in seconds */ int PoolWarmConnMaxLifetime = 7200; /* max lifetime of a warm-needed pooled connection, in seconds */ -int PoolConnDeadtime = 1800; /* a pooled connection must be closed when lifetime exceed this, in seconds */ int PoolMaxMemoryLimit = 10; int PoolConnectTimeOut = 10; int PoolScaleFactor = 2; @@ -4755,8 +4754,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, } } else if (((nodePool->freeSize > 0) && (nodePool->nwarming + nodePool->nquery) > MinFreeSize) || - (difftime(now, slot->created) >= PoolConnMaxLifetime) || - ((difftime(now, slot->created) >= PoolConnDeadtime) && (PoolConnDeadtime > PoolConnMaxLifetime))) + (difftime(now, slot->created) >= PoolConnMaxLifetime)) { force_destroy = true; if (PoolConnectDebugPrint) @@ -4777,7 +4775,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, pooler_async_warm_connection(dbPool, slot, nodePool, node); grow_pool(dbPool, nodeidx, node, bCoord); } - else { if ((difftime(now, slot->checked) >= PoolSizeCheckGap) && !IS_ASYNC_PIPE_FULL()) @@ -5318,8 +5315,8 @@ pooler_handle_subthread_log(bool is_pooler_exit) */ static void PoolerLoop(void) -{// #lizard forgives - bool warme_initd = false; +{ + bool warm_inited = false; StringInfoData input_message; int maxfd = MaxConnections + 1024; struct pollfd *pool_fd; @@ -5589,10 +5586,10 @@ PoolerLoop(void) } /* create preload database pooler */ - if (!warme_initd) + if (!warm_inited) { connect_pools(); - warme_initd = true; + warm_inited = true; } pooler_pools_warm(); @@ -5828,7 +5825,6 @@ shrink_pool(DatabasePool *pool) { /* no need to shrik warmed slot, only discard them when they use too much memroy */ if (!slot->bwarmed && ((difftime(now, slot->released) > PoolConnKeepAlive) || - (difftime(now, slot->created) > PoolConnDeadtime) || (difftime(now, slot->created) >= PoolConnMaxLifetime))) { if (PoolConnectDebugPrint) @@ -8266,13 +8262,18 @@ void *pooler_async_utility_thread(void *arg) case COMMAND_CONNECTION_WARM: { CommandId commandID = InvalidCommandId; - ret = PGXCNodeSendSetQuery((NODE_CONNECTION *)pWarmInfo->slot->conn, "set warm_shared_buffer to true;", NULL, 0, &pWarmInfo->set_query_status, &commandID); + + ret = PGXCNodeSendSetQuery( + 
(NODE_CONNECTION *) pWarmInfo->slot->conn, + "set warm_shared_buffer to true;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); /* only set warm flag when warm succeed */ if (0 == ret) { pWarmInfo->slot->bwarmed = true; } - } break; @@ -8280,14 +8281,44 @@ void *pooler_async_utility_thread(void *arg) { int mbytes = 0; char *size = NULL; - size = PGXCNodeSendShowQuery((NODE_CONNECTION *)pWarmInfo->slot->conn, "show session_memory_size;"); + CommandId commandID = InvalidCommandId; + + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to application;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + + size = PGXCNodeSendShowQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "show session_memory_size;"); pWarmInfo->cmd = COMMAND_JUDGE_CONNECTION_MEMSIZE; mbytes = atoi(size); - if (mbytes >= PoolMaxMemoryLimit) + if (PoolMaxMemoryLimit > 0 && mbytes >= PoolMaxMemoryLimit) { pWarmInfo->cmd = COMMAND_CONNECTION_NEED_CLOSE; } pWarmInfo->size = mbytes; + + if (IS_PGXC_COORDINATOR) + { + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to coordinator;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + } + else + { + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to datanode;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + } } break; @@ -11318,7 +11349,6 @@ handle_get_conn_statistics(PoolAgent *agent) uint32 total_node_cnt_offset = 0; uint32 exceed_keepalive_cnt = 0; - uint32 exceed_deadtime_cnt = 0; uint32 exceed_maxlifetime_cnt = 0; int i = 0; PGXCNodePoolSlot *slot = NULL; @@ -11357,7 +11387,6 @@ handle_get_conn_statistics(PoolAgent *agent) /* reset statistics count */ exceed_keepalive_cnt = 0; - exceed_deadtime_cnt = 0; exceed_maxlifetime_cnt = 0; /* statistical connection life cycle */ if (node_pool->slot) @@ -11370,11 +11399,6 @@ handle_get_conn_statistics(PoolAgent *agent) exceed_keepalive_cnt++; } - if (difftime(now, slot->created) > PoolConnDeadtime) - { - exceed_deadtime_cnt++; - } - if (difftime(now, slot->created) >= PoolConnMaxLifetime) { exceed_maxlifetime_cnt++; @@ -11383,7 +11407,6 @@ handle_get_conn_statistics(PoolAgent *agent) } pq_sendint(&buf, exceed_keepalive_cnt, sizeof(uint32)); - pq_sendint(&buf, exceed_deadtime_cnt, sizeof(uint32)); pq_sendint(&buf, exceed_maxlifetime_cnt, sizeof(uint32)); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index bbe4710e..541174b0 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -91,7 +91,6 @@ static CatCTup *CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, bool negative); static HeapTuple build_dummy_tuple(CatCache *cache, int nkeys, ScanKey skeys); - /* * internal support functions */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 98acab3e..eb895929 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -114,6 +114,11 @@ #define RELCACHE_INIT_FILEMAGIC 0x573266 /* version ID value */ +bool enable_memory_optimization = false; +int max_relcache_relations = 2000; +int number_replaced_relations = 10; +RelCacheHeader *RelCacheHdr = NULL; + /* * hardcoded tuple descriptors, contents generated by genbki.pl */ @@ -212,13 +217,19 @@ do { \ Assert(replace_allowed); \ hentry->reldesc = (RELATION); \ if (RelationHasReferenceCountZero(_old_rel)) \ + { \ + RelationLRUDelete(_old_rel); \ 
RelationDestroyRelation(_old_rel, false); \ + } \ else if (!IsBootstrapProcessingMode()) \ elog(WARNING, "leaking still-referenced relcache entry for \"%s\"", \ RelationGetRelationName(_old_rel)); \ } \ else \ + { \ hentry->reldesc = (RELATION); \ + } \ + RelationLRUInsert(RELATION); \ } while(0) #define RelationIdCacheLookup(ID, RELATION) \ @@ -242,6 +253,8 @@ do { \ if (hentry == NULL) \ elog(WARNING, "failed to delete relcache entry for OID %u", \ (RELATION)->rd_id); \ + \ + RelationLRUDelete(RELATION); \ } while(0) @@ -608,10 +621,12 @@ RelationBuildTupleDesc(Relation relation) if (attp->atthasdef) { if (attrdef == NULL) + { attrdef = (AttrDefault *) MemoryContextAllocZero(CacheMemoryContext, relation->rd_rel->relnatts * sizeof(AttrDefault)); + } attrdef[ndef].adnum = attnum; attrdef[ndef].adbin = NULL; ndef++; @@ -637,10 +652,12 @@ RelationBuildTupleDesc(Relation relation) Datum missval; if (attrmiss == NULL) + { attrmiss = (AttrMissing *) MemoryContextAllocZero(CacheMemoryContext, relation->rd_rel->relnatts * sizeof(AttrMissing)); + } missval = array_get_element(missingval, 1, @@ -1523,6 +1540,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) * initialize the tuple descriptor (relation->rd_att). */ RelationBuildTupleDesc(relation); + #ifdef __TBASE__ } #endif @@ -1571,6 +1589,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) #endif #endif + if (relation->rd_rel->relrowsecurity) { /* See comments near RelationBuildRuleLocok for details */ @@ -2387,6 +2406,75 @@ RelationHasGTS(Relation rel) return has; } +/* + * Insert relation elem to RelCache LRU list. + * If the number of elems in RelCache exceeds MAX_RELCACHE_RELATIONS,it will + * replace relation elems from RelCache LRU list。 + */ +void +RelationLRUInsert(Relation rel) +{ + if (!enable_memory_optimization) + { + return; + } + + if (RelCacheHdr->rh_ntup >= RelCacheHdr->rh_maxtup) + { + dlist_head *head; + dlist_iter iter; + int replaced = 0; + + Assert(number_replaced_relations > 0); + head = &RelCacheHdr->rh_lrulist; + for (iter.end = &head->head, + iter.cur = iter.end->prev ? iter.end->prev : iter.end; + iter.cur != iter.end;) + { + Relation relation; + + relation = dlist_container(RelationData, rd_lru_list_elem, iter.cur); + iter.cur = iter.cur->prev; + + if (IsSystemRelation(relation)) + { + continue; + } + + if (relation->rd_refcnt == 0) + { + RelationCacheDelete(relation); + RelationDestroyRelation(relation, false); + + replaced += 1; + if (replaced == number_replaced_relations) + { + Assert(RelCacheHdr->rh_ntup < RelCacheHdr->rh_maxtup); + break; + } + } + } + } + + dlist_push_head(&RelCacheHdr->rh_lrulist, &rel->rd_lru_list_elem); + RelCacheHdr->rh_ntup += 1; +} + +/* + * Remove relation elem from Relcache LRU list. 
+ */ +void +RelationLRUDelete(Relation rel) +{ + if (!enable_memory_optimization) + { + return; + } + + dlist_delete(&rel->rd_lru_list_elem); + RelCacheHdr->rh_ntup -= 1; +} + /* ---------------------------------------------------------------- * cache invalidation support routines * ---------------------------------------------------------------- @@ -2650,6 +2738,10 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_partcheck); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); +#ifdef PGXC + if (relation->rd_locator_info) + FreeRelationLocInfo(relation->rd_locator_info); +#endif pfree(relation); } @@ -2915,6 +3007,10 @@ RelationClearRelation(Relation relation, bool rebuild) SWAPFIELD(MemoryContext, rd_pdcxt); } + if (enable_memory_optimization) + { + SWAPFIELD(dlist_node, rd_lru_list_elem); + } #undef SWAPFIELD /* And now we can throw away the temporary entry */ @@ -3646,7 +3742,6 @@ RelationBuildLocalRelation(const char *relname, else rel->rd_rel->relfilenode = relfilenode; - RelationInitLockInfo(rel); /* see lmgr.c */ RelationInitPhysicalAddr(rel); @@ -3838,6 +3933,14 @@ RelationCacheInitialize(void) RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE, &ctl, HASH_ELEM | HASH_BLOBS); + if (enable_memory_optimization && RelCacheHdr == NULL) + { + RelCacheHdr = (RelCacheHeader *) palloc0(sizeof(RelCacheHeader)); + RelCacheHdr->rh_ntup = 0; + RelCacheHdr->rh_maxtup = max_relcache_relations; + dlist_init(&RelCacheHdr->rh_lrulist); + } + /* * relation mapper needs to be initialized too */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 97a0b26f..8288cf36 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2763,6 +2763,19 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_memory_optimization", PGC_POSTMASTER, RESOURCES, + gettext_noop("enable session cache memory control"), + NULL + }, + &enable_memory_optimization, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, #endif @@ -3791,6 +3804,28 @@ static struct config_int ConfigureNamesInt[] = 0, 0, 31536000, NULL, NULL, NULL }, + { + {"max_relcache_relations", PGC_POSTMASTER, RESOURCES, + gettext_noop("max relcache relations per session."), + NULL + }, + &max_relcache_relations, +#ifdef _PG_REGRESS_ + 500, 500, INT_MAX, +#else + 2000, 500, INT_MAX, +#endif + NULL, NULL, NULL + }, + { + {"number_replaced_relations", PGC_POSTMASTER, RESOURCES, + gettext_noop("max relcache relations while replacing."), + NULL + }, + &number_replaced_relations, + 10, 1, 500, + NULL, NULL, NULL + }, #endif { {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE, @@ -4344,10 +4379,10 @@ static struct config_int ConfigureNamesInt[] = {"pool_session_memory_limit", PGC_SIGHUP, DATA_NODES, gettext_noop("Datanode session max memory context size."), gettext_noop("Exceed limit will be closed."), - GUC_UNIT_S + GUC_UNIT_MB }, &PoolMaxMemoryLimit, - 10, 1, 10000, + 10, -1, 10000, NULL, NULL, NULL }, { @@ -4402,7 +4437,8 @@ static struct config_int ConfigureNamesInt[] = { {"session_memory_size", PGC_USERSET, RESOURCES_MEM, gettext_noop("Used to get the total memory size of the session, in M Bytes."), - gettext_noop("Used to get the total memory size of the session, in M Bytes.") + gettext_noop("Used to get the total memory size of the session, in M Bytes."), + GUC_UNIT_MB }, &g_TotalMemorySize, 0, 0, INT_MAX, @@ -5968,7 +6004,7 @@ static struct config_enum ConfigureNamesEnum[] = #ifdef PGXC { - 
{"remotetype", PGC_BACKEND, CONN_AUTH, + {"remotetype", PGC_USERSET, CONN_AUTH, gettext_noop("Sets the type of Postgres-XL remote connection"), NULL }, @@ -13447,7 +13483,7 @@ show_total_memorysize(void) int32 size; static char buf[64]; size = get_total_memory_size(); - snprintf(buf, sizeof(buf), "%d", size); + snprintf(buf, sizeof(buf), "%dM", size); return buf; } #endif diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 12d19740..c03c59df 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -502,6 +502,11 @@ # 2: check the correctness of the GTS of tuples by referring to # tlog, and reset it if it is wrong while doing vacuum. +# - Memory Optimization - +#enable_memory_optimization = false +#max_relcache_relations = 2000 +#number_replaced_relations = 10 + #------------------------------------------------------------------------------ # RUNTIME STATISTICS #------------------------------------------------------------------------------ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 5df24178..7ae45b95 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -274,6 +274,10 @@ extern bool enable_clog_mprotect; extern bool enable_tlog_mprotect; extern bool enable_xlog_mprotect; extern int query_delay; + +extern bool enable_memory_optimization; +extern int max_relcache_relations; +extern int number_replaced_relations; #endif extern int log_min_error_statement; extern int log_min_messages; diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 01246e8c..03e6041b 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * rel.h - * POSTGRES relation descriptor (a/k/a relcache entry) definitions. + * POSTGRES relation descriptor (a/k/a relcache entry) definitions. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
@@ -41,13 +41,13 @@ typedef struct LockRelId { - Oid relId; /* a relation identifier */ - Oid dbId; /* a database identifier */ + Oid relId; /* a relation identifier */ + Oid dbId; /* a database identifier */ } LockRelId; typedef struct LockInfoData { - LockRelId lockRelId; + LockRelId lockRelId; } LockInfoData; typedef LockInfoData *LockInfo; @@ -57,28 +57,28 @@ typedef LockInfoData *LockInfo; */ typedef struct PartitionKeyData { - char strategy; /* partitioning strategy */ - int16 partnatts; /* number of columns in the partition key */ - AttrNumber *partattrs; /* attribute numbers of columns in the - * partition key */ - List *partexprs; /* list of expressions in the partitioning - * key, or NIL */ - - Oid *partopfamily; /* OIDs of operator families */ - Oid *partopcintype; /* OIDs of opclass declared input data types */ - FmgrInfo *partsupfunc; /* lookup info for support funcs */ - - /* Partitioning collation per attribute */ - Oid *partcollation; - - /* Type information per attribute */ - Oid *parttypid; - int32 *parttypmod; - int16 *parttyplen; - bool *parttypbyval; - char *parttypalign; - Oid *parttypcoll; -} PartitionKeyData; + char strategy; /* partitioning strategy */ + int16 partnatts; /* number of columns in the partition key */ + AttrNumber *partattrs; /* attribute numbers of columns in the + * partition key */ + List *partexprs; /* list of expressions in the partitioning + * key, or NIL */ + + Oid *partopfamily; /* OIDs of operator families */ + Oid *partopcintype; /* OIDs of opclass declared input data types */ + FmgrInfo *partsupfunc; /* lookup info for support funcs */ + + /* Partitioning collation per attribute */ + Oid *partcollation; + + /* Type information per attribute */ + Oid *parttypid; + int32 *parttypmod; + int16 *parttyplen; + bool *parttypbyval; + char *parttypalign; + Oid *parttypcoll; +} PartitionKeyData; typedef struct PartitionKeyData *PartitionKey; @@ -97,155 +97,156 @@ typedef struct tagClsExprStruct typedef struct RelationData { - RelFileNode rd_node; /* relation physical identifier */ - /* use "struct" here to avoid needing to include smgr.h: */ - struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */ - int rd_refcnt; /* reference count */ - BackendId rd_backend; /* owning backend id, if temporary relation */ - bool rd_islocaltemp; /* rel is a temp rel of this session */ - bool rd_isnailed; /* rel is nailed in cache */ - bool rd_isvalid; /* relcache entry is valid */ - char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = - * valid, 2 = temporarily forced */ - bool rd_statvalid; /* is rd_statlist valid? */ - - /* - * rd_createSubid is the ID of the highest subtransaction the rel has - * survived into; or zero if the rel was not created in the current top - * transaction. This can be now be relied on, whereas previously it could - * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is - * the ID of the highest subtransaction the relfilenode change has - * survived into, or zero if not changed in the current transaction (or we - * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten - * when a relation has multiple new relfilenodes within a single - * transaction, with one of them occurring in a subsequently aborted - * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t; - * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten - */ - SubTransactionId rd_createSubid; /* rel was created in current xact */ - SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in - * current xact */ - - Form_pg_class rd_rel; /* RELATION tuple */ - TupleDesc rd_att; /* tuple descriptor */ - Oid rd_id; /* relation's object id */ - LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */ - RuleLock *rd_rules; /* rewrite rules */ - MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */ - TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */ - /* use "struct" here to avoid needing to include rowsecurity.h: */ - struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */ + RelFileNode rd_node; /* relation physical identifier */ + /* use "struct" here to avoid needing to include smgr.h: */ + struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */ + int rd_refcnt; /* reference count */ + BackendId rd_backend; /* owning backend id, if temporary relation */ + bool rd_islocaltemp; /* rel is a temp rel of this session */ + bool rd_isnailed; /* rel is nailed in cache */ + bool rd_isvalid; /* relcache entry is valid */ + char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = + * valid, 2 = temporarily forced */ + bool rd_statvalid; /* is rd_statlist valid? */ + + /* + * rd_createSubid is the ID of the highest subtransaction the rel has + * survived into; or zero if the rel was not created in the current top + * transaction. This can be now be relied on, whereas previously it could + * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is + * the ID of the highest subtransaction the relfilenode change has + * survived into, or zero if not changed in the current transaction (or we + * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten + * when a relation has multiple new relfilenodes within a single + * transaction, with one of them occurring in a subsequently aborted + * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t; + * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten + */ + SubTransactionId rd_createSubid; /* rel was created in current xact */ + SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in + * current xact */ + + Form_pg_class rd_rel; /* RELATION tuple */ + TupleDesc rd_att; /* tuple descriptor */ + Oid rd_id; /* relation's object id */ + LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */ + RuleLock *rd_rules; /* rewrite rules */ + MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */ + TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */ + /* use "struct" here to avoid needing to include rowsecurity.h: */ + struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */ #ifdef _MLS_ ClsExprStruct * rd_cls_struct;/* pg_cls_check function call expr */ #endif - /* data managed by RelationGetFKeyList: */ - List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */ - bool rd_fkeyvalid; /* true if list has been computed */ - - MemoryContext rd_partkeycxt; /* private memory cxt for the below */ - struct PartitionKeyData *rd_partkey; /* partition key, or NULL */ - MemoryContext rd_pdcxt; /* private context for partdesc */ - struct PartitionDescData *rd_partdesc; /* partitions, or NULL */ - List *rd_partcheck; /* partition CHECK quals */ - - /* data managed by RelationGetIndexList: */ - List *rd_indexlist; /* list of OIDs of indexes on relation */ - Oid rd_oidindex; /* OID of unique index on OID, if any */ - Oid rd_pkindex; /* OID of primary key, if any */ - Oid rd_replidindex; /* OID of replica identity index, if any */ - - /* data managed by RelationGetStatExtList: */ - List *rd_statlist; /* list of OIDs of extended stats */ - - /* data managed by RelationGetIndexAttrBitmap: */ - Bitmapset *rd_indexattr; /* identifies columns used in indexes */ - Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ - Bitmapset *rd_pkattr; /* cols included in primary key */ - Bitmapset *rd_idattr; /* included in replica identity index */ - - PublicationActions *rd_pubactions; /* publication actions */ - - /* - * rd_options is set whenever rd_rel is loaded into the relcache entry. - * Note that you can NOT look into rd_rel for this data. NULL means "use - * defaults". - */ - bytea *rd_options; /* parsed pg_class.reloptions */ - - /* These are non-NULL only for an index relation: */ - Form_pg_index rd_index; /* pg_index tuple describing this index */ - /* use "struct" here to avoid needing to include htup.h: */ - struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */ - - /* - * index access support info (used only for an index relation) - * - * Note: only default support procs for each opclass are cached, namely - * those with lefttype and righttype equal to the opclass's opcintype. The - * arrays are indexed by support function number, which is a sufficient - * identifier given that restriction. - * - * Note: rd_amcache is available for index AMs to cache private data about - * an index. This must be just a cache since it may get reset at any time - * (in particular, it will get reset by a relcache inval message for the - * index). If used, it must point to a single memory chunk palloc'd in - * rd_indexcxt. A relcache reset will include freeing that chunk and - * setting rd_amcache = NULL. 
- */ - Oid rd_amhandler; /* OID of index AM's handler function */ - MemoryContext rd_indexcxt; /* private memory cxt for this stuff */ - /* use "struct" here to avoid needing to include amapi.h: */ - struct IndexAmRoutine *rd_amroutine; /* index AM's API struct */ - Oid *rd_opfamily; /* OIDs of op families for each index col */ - Oid *rd_opcintype; /* OIDs of opclass declared input data types */ - RegProcedure *rd_support; /* OIDs of support procedures */ - FmgrInfo *rd_supportinfo; /* lookup info for support procedures */ - int16 *rd_indoption; /* per-column AM-specific flags */ - List *rd_indexprs; /* index expression trees, if any */ - List *rd_indpred; /* index predicate tree, if any */ - Oid *rd_exclops; /* OIDs of exclusion operators, if any */ - Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */ - uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */ - void *rd_amcache; /* available for use by index AM */ - Oid *rd_indcollation; /* OIDs of index collations */ - - /* - * foreign-table support - * - * rd_fdwroutine must point to a single memory chunk palloc'd in - * CacheMemoryContext. It will be freed and reset to NULL on a relcache - * reset. - */ - - /* use "struct" here to avoid needing to include fdwapi.h: */ - struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */ - - /* - * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new - * version of a table, we need to make any toast pointers inserted into it - * have the existing toast table's OID, not the OID of the transient toast - * table. If rd_toastoid isn't InvalidOid, it is the OID to place in - * toast pointers inserted into this rel. (Note it's set on the new - * version of the main heap, not the toast table itself.) This also - * causes toast_save_datum() to try to preserve toast value OIDs. - */ - Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ - - /* use "struct" here to avoid needing to include pgstat.h: */ - struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ + /* data managed by RelationGetFKeyList: */ + List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */ + bool rd_fkeyvalid; /* true if list has been computed */ + + MemoryContext rd_partkeycxt; /* private memory cxt for the below */ + struct PartitionKeyData *rd_partkey; /* partition key, or NULL */ + MemoryContext rd_pdcxt; /* private context for partdesc */ + struct PartitionDescData *rd_partdesc; /* partitions, or NULL */ + List *rd_partcheck; /* partition CHECK quals */ + + /* data managed by RelationGetIndexList: */ + List *rd_indexlist; /* list of OIDs of indexes on relation */ + Oid rd_oidindex; /* OID of unique index on OID, if any */ + Oid rd_pkindex; /* OID of primary key, if any */ + Oid rd_replidindex; /* OID of replica identity index, if any */ + + /* data managed by RelationGetStatExtList: */ + List *rd_statlist; /* list of OIDs of extended stats */ + + /* data managed by RelationGetIndexAttrBitmap: */ + Bitmapset *rd_indexattr; /* identifies columns used in indexes */ + Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ + Bitmapset *rd_pkattr; /* cols included in primary key */ + Bitmapset *rd_idattr; /* included in replica identity index */ + + PublicationActions *rd_pubactions; /* publication actions */ + + /* + * rd_options is set whenever rd_rel is loaded into the relcache entry. + * Note that you can NOT look into rd_rel for this data. NULL means "use + * defaults". 
+ */ + bytea *rd_options; /* parsed pg_class.reloptions */ + + /* These are non-NULL only for an index relation: */ + Form_pg_index rd_index; /* pg_index tuple describing this index */ + /* use "struct" here to avoid needing to include htup.h: */ + struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */ + + /* + * index access support info (used only for an index relation) + * + * Note: only default support procs for each opclass are cached, namely + * those with lefttype and righttype equal to the opclass's opcintype. The + * arrays are indexed by support function number, which is a sufficient + * identifier given that restriction. + * + * Note: rd_amcache is available for index AMs to cache private data about + * an index. This must be just a cache since it may get reset at any time + * (in particular, it will get reset by a relcache inval message for the + * index). If used, it must point to a single memory chunk palloc'd in + * rd_indexcxt. A relcache reset will include freeing that chunk and + * setting rd_amcache = NULL. + */ + Oid rd_amhandler; /* OID of index AM's handler function */ + MemoryContext rd_indexcxt; /* private memory cxt for this stuff */ + /* use "struct" here to avoid needing to include amapi.h: */ + struct IndexAmRoutine *rd_amroutine; /* index AM's API struct */ + Oid *rd_opfamily; /* OIDs of op families for each index col */ + Oid *rd_opcintype; /* OIDs of opclass declared input data types */ + RegProcedure *rd_support; /* OIDs of support procedures */ + FmgrInfo *rd_supportinfo; /* lookup info for support procedures */ + int16 *rd_indoption; /* per-column AM-specific flags */ + List *rd_indexprs; /* index expression trees, if any */ + List *rd_indpred; /* index predicate tree, if any */ + Oid *rd_exclops; /* OIDs of exclusion operators, if any */ + Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */ + uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */ + void *rd_amcache; /* available for use by index AM */ + Oid *rd_indcollation; /* OIDs of index collations */ + + /* + * foreign-table support + * + * rd_fdwroutine must point to a single memory chunk palloc'd in + * CacheMemoryContext. It will be freed and reset to NULL on a relcache + * reset. + */ + + /* use "struct" here to avoid needing to include fdwapi.h: */ + struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */ + + /* + * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new + * version of a table, we need to make any toast pointers inserted into it + * have the existing toast table's OID, not the OID of the transient toast + * table. If rd_toastoid isn't InvalidOid, it is the OID to place in + * toast pointers inserted into this rel. (Note it's set on the new + * version of the main heap, not the toast table itself.) This also + * causes toast_save_datum() to try to preserve toast value OIDs. 
+ */ + Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ + + /* use "struct" here to avoid needing to include pgstat.h: */ + struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ #ifdef PGXC - RelationLocInfo *rd_locator_info; + RelationLocInfo *rd_locator_info; #endif #ifdef __TBASE__ - Form_pg_partition_interval rd_partitions_info; + Form_pg_partition_interval rd_partitions_info; + dlist_node rd_lru_list_elem; /* list member of LRU list */ #endif } RelationData; /* * ForeignKeyCacheInfo - * Information the relcache can cache about foreign key constraints + * Information the relcache can cache about foreign key constraints * * This is basically just an image of relevant columns from pg_constraint. * We make it a subclass of Node so that copyObject() can be used on a list @@ -259,20 +260,20 @@ typedef struct RelationData */ typedef struct ForeignKeyCacheInfo { - NodeTag type; - Oid conrelid; /* relation constrained by the foreign key */ - Oid confrelid; /* relation referenced by the foreign key */ - int nkeys; /* number of columns in the foreign key */ - /* these arrays each have nkeys valid entries: */ - AttrNumber conkey[INDEX_MAX_KEYS]; /* cols in referencing table */ - AttrNumber confkey[INDEX_MAX_KEYS]; /* cols in referenced table */ - Oid conpfeqop[INDEX_MAX_KEYS]; /* PK = FK operator OIDs */ + NodeTag type; + Oid conrelid; /* relation constrained by the foreign key */ + Oid confrelid; /* relation referenced by the foreign key */ + int nkeys; /* number of columns in the foreign key */ + /* these arrays each have nkeys valid entries: */ + AttrNumber conkey[INDEX_MAX_KEYS]; /* cols in referencing table */ + AttrNumber confkey[INDEX_MAX_KEYS]; /* cols in referenced table */ + Oid conpfeqop[INDEX_MAX_KEYS]; /* PK = FK operator OIDs */ } ForeignKeyCacheInfo; /* * StdRdOptions - * Standard contents of rd_options for heaps and generic indexes. + * Standard contents of rd_options for heaps and generic indexes. * * RelationGetFillFactor() and RelationGetTargetPageFreeSpace() can only * be applied to relations that use this format or a superset for @@ -281,134 +282,134 @@ typedef struct ForeignKeyCacheInfo /* autovacuum-related reloptions. */ typedef struct AutoVacOpts { - bool enabled; - int vacuum_threshold; - int analyze_threshold; - int vacuum_cost_delay; - int vacuum_cost_limit; - int freeze_min_age; - int freeze_max_age; - int freeze_table_age; - int multixact_freeze_min_age; - int multixact_freeze_max_age; - int multixact_freeze_table_age; - int log_min_duration; - float8 vacuum_scale_factor; - float8 analyze_scale_factor; + bool enabled; + int vacuum_threshold; + int analyze_threshold; + int vacuum_cost_delay; + int vacuum_cost_limit; + int freeze_min_age; + int freeze_max_age; + int freeze_table_age; + int multixact_freeze_min_age; + int multixact_freeze_max_age; + int multixact_freeze_table_age; + int log_min_duration; + float8 vacuum_scale_factor; + float8 analyze_scale_factor; } AutoVacOpts; typedef struct StdRdOptions { - int32 vl_len_; /* varlena header (do not touch directly!) */ - int fillfactor; /* page fill factor in percent (0..100) */ - AutoVacOpts autovacuum; /* autovacuum-related options */ - bool user_catalog_table; /* use as an additional catalog relation */ - int parallel_workers; /* max number of parallel workers */ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int fillfactor; /* page fill factor in percent (0..100) */ + AutoVacOpts autovacuum; /* autovacuum-related options */ + bool user_catalog_table; /* use as an additional catalog relation */ + int parallel_workers; /* max number of parallel workers */ } StdRdOptions; -#define HEAP_MIN_FILLFACTOR 10 -#define HEAP_DEFAULT_FILLFACTOR 100 +#define HEAP_MIN_FILLFACTOR 10 +#define HEAP_DEFAULT_FILLFACTOR 100 /* * RelationGetFillFactor - * Returns the relation's fillfactor. Note multiple eval of argument! + * Returns the relation's fillfactor. Note multiple eval of argument! */ #define RelationGetFillFactor(relation, defaultff) \ - ((relation)->rd_options ? \ - ((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff)) + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff)) /* * RelationGetTargetPageUsage - * Returns the relation's desired space usage per page in bytes. + * Returns the relation's desired space usage per page in bytes. */ #define RelationGetTargetPageUsage(relation, defaultff) \ - (BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100) + (BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100) /* * RelationGetTargetPageFreeSpace - * Returns the relation's desired freespace per page in bytes. + * Returns the relation's desired freespace per page in bytes. */ #define RelationGetTargetPageFreeSpace(relation, defaultff) \ - (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) + (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) /* * RelationIsUsedAsCatalogTable - * Returns whether the relation should be treated as a catalog table - * from the pov of logical decoding. Note multiple eval of argument! + * Returns whether the relation should be treated as a catalog table + * from the pov of logical decoding. Note multiple eval of argument! */ -#define RelationIsUsedAsCatalogTable(relation) \ - ((relation)->rd_options && \ - ((relation)->rd_rel->relkind == RELKIND_RELATION || \ - (relation)->rd_rel->relkind == RELKIND_MATVIEW) ? \ - ((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false) +#define RelationIsUsedAsCatalogTable(relation) \ + ((relation)->rd_options && \ + ((relation)->rd_rel->relkind == RELKIND_RELATION || \ + (relation)->rd_rel->relkind == RELKIND_MATVIEW) ? \ + ((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false) /* * RelationGetParallelWorkers - * Returns the relation's parallel_workers reloption setting. - * Note multiple eval of argument! + * Returns the relation's parallel_workers reloption setting. + * Note multiple eval of argument! */ #define RelationGetParallelWorkers(relation, defaultpw) \ - ((relation)->rd_options ? \ - ((StdRdOptions *) (relation)->rd_options)->parallel_workers : (defaultpw)) + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->parallel_workers : (defaultpw)) /* * ViewOptions - * Contents of rd_options for views + * Contents of rd_options for views */ typedef struct ViewOptions { - int32 vl_len_; /* varlena header (do not touch directly!) */ - bool security_barrier; - int check_option_offset; + int32 vl_len_; /* varlena header (do not touch directly!) */ + bool security_barrier; + int check_option_offset; } ViewOptions; /* * RelationIsSecurityView - * Returns whether the relation is security view, or not. Note multiple - * eval of argument! + * Returns whether the relation is security view, or not. Note multiple + * eval of argument! 
*/ -#define RelationIsSecurityView(relation) \ - ((relation)->rd_options ? \ - ((ViewOptions *) (relation)->rd_options)->security_barrier : false) +#define RelationIsSecurityView(relation) \ + ((relation)->rd_options ? \ + ((ViewOptions *) (relation)->rd_options)->security_barrier : false) /* * RelationHasCheckOption - * Returns true if the relation is a view defined with either the local - * or the cascaded check option. Note multiple eval of argument! + * Returns true if the relation is a view defined with either the local + * or the cascaded check option. Note multiple eval of argument! */ -#define RelationHasCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0) +#define RelationHasCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0) /* * RelationHasLocalCheckOption - * Returns true if the relation is a view defined with the local check - * option. Note multiple eval of argument! + * Returns true if the relation is a view defined with the local check + * option. Note multiple eval of argument! */ -#define RelationHasLocalCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ - strcmp((char *) (relation)->rd_options + \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ - "local") == 0 : false) +#define RelationHasLocalCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ + strcmp((char *) (relation)->rd_options + \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ + "local") == 0 : false) /* * RelationHasCascadedCheckOption - * Returns true if the relation is a view defined with the cascaded check - * option. Note multiple eval of argument! + * Returns true if the relation is a view defined with the cascaded check + * option. Note multiple eval of argument! */ -#define RelationHasCascadedCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ - strcmp((char *) (relation)->rd_options + \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ - "cascaded") == 0 : false) +#define RelationHasCascadedCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ + strcmp((char *) (relation)->rd_options + \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ + "cascaded") == 0 : false) /* * RelationIsValid - * True iff relation descriptor is valid. + * True iff relation descriptor is valid. */ #define RelationIsValid(relation) PointerIsValid(relation) @@ -416,263 +417,263 @@ typedef struct ViewOptions /* * RelationHasReferenceCountZero - * True iff relation reference count is zero. + * True iff relation reference count is zero. * * Note: - * Assumes relation descriptor is valid. + * Assumes relation descriptor is valid. */ #define RelationHasReferenceCountZero(relation) \ - ((bool)((relation)->rd_refcnt == 0)) + ((bool)((relation)->rd_refcnt == 0)) /* * RelationGetForm - * Returns pg_class tuple for a relation. + * Returns pg_class tuple for a relation. * * Note: - * Assumes relation descriptor is valid. + * Assumes relation descriptor is valid. 
*/ #define RelationGetForm(relation) ((relation)->rd_rel) /* * RelationGetRelid - * Returns the OID of the relation + * Returns the OID of the relation */ #define RelationGetRelid(relation) ((relation)->rd_id) /* * RelationGetNumberOfAttributes - * Returns the number of attributes in a relation. + * Returns the number of attributes in a relation. */ #define RelationGetNumberOfAttributes(relation) ((relation)->rd_rel->relnatts) /* * RelationGetDescr - * Returns tuple descriptor for a relation. + * Returns tuple descriptor for a relation. */ #define RelationGetDescr(relation) ((relation)->rd_att) /* * RelationGetRelationName - * Returns the rel's name. + * Returns the rel's name. * * Note that the name is only unique within the containing namespace. */ #define RelationGetRelationName(relation) \ - (NameStr((relation)->rd_rel->relname)) + (NameStr((relation)->rd_rel->relname)) /* * RelationGetNamespace - * Returns the rel's namespace OID. + * Returns the rel's namespace OID. */ #define RelationGetNamespace(relation) \ - ((relation)->rd_rel->relnamespace) + ((relation)->rd_rel->relnamespace) /* * RelationIsMapped - * True if the relation uses the relfilenode map. + * True if the relation uses the relfilenode map. * * NB: this is only meaningful for relkinds that have storage, else it * will misleadingly say "true". */ #define RelationIsMapped(relation) \ - ((relation)->rd_rel->relfilenode == InvalidOid) + ((relation)->rd_rel->relfilenode == InvalidOid) /* * RelationOpenSmgr - * Open the relation at the smgr level, if not already done. + * Open the relation at the smgr level, if not already done. */ #define RelationOpenSmgr(relation) \ - do { \ - if ((relation)->rd_smgr == NULL) \ - smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ - (relation)->rd_smgr->smgr_hasextent = RelationHasExtent(relation); \ - } while (0) + do { \ + if ((relation)->rd_smgr == NULL) \ + smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ + (relation)->rd_smgr->smgr_hasextent = RelationHasExtent(relation); \ + } while (0) /* * RelationCloseSmgr - * Close the relation at the smgr level, if not already done. + * Close the relation at the smgr level, if not already done. * * Note: smgrclose should unhook from owner pointer, hence the Assert. */ #define RelationCloseSmgr(relation) \ - do { \ - if ((relation)->rd_smgr != NULL) \ - { \ - smgrclose((relation)->rd_smgr); \ - Assert((relation)->rd_smgr == NULL); \ - } \ - } while (0) + do { \ + if ((relation)->rd_smgr != NULL) \ + { \ + smgrclose((relation)->rd_smgr); \ + Assert((relation)->rd_smgr == NULL); \ + } \ + } while (0) /* * RelationGetTargetBlock - * Fetch relation's current insertion target block. + * Fetch relation's current insertion target block. * * Returns InvalidBlockNumber if there is no current target block. Note * that the target block status is discarded on any smgr-level invalidation. */ #define RelationGetTargetBlock(relation) \ - ( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber ) + ( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber ) #ifdef _SHARDING_ #define RelationGetTargetBlock_Shard(relation, shardid) \ - ( (relation)->rd_smgr != NULL ? smgr_get_target_block((relation)->rd_smgr, shardid) : InvalidBlockNumber ) + ( (relation)->rd_smgr != NULL ? 
smgr_get_target_block((relation)->rd_smgr, shardid) : InvalidBlockNumber ) #endif /* * RelationSetTargetBlock - * Set relation's current insertion target block. + * Set relation's current insertion target block. */ #define RelationSetTargetBlock(relation, targblock) \ - do { \ - RelationOpenSmgr(relation); \ - (relation)->rd_smgr->smgr_targblock = (targblock); \ - } while (0) + do { \ + RelationOpenSmgr(relation); \ + (relation)->rd_smgr->smgr_targblock = (targblock); \ + } while (0) #ifdef _SHARDING_ #define RelationSetTargetBlock_Shard(relation, targblock, shardid) \ - do { \ - RelationOpenSmgr(relation); \ - smgr_set_target_block((relation)->rd_smgr, shardid, targblock); \ - } while (0) + do { \ + RelationOpenSmgr(relation); \ + smgr_set_target_block((relation)->rd_smgr, shardid, targblock); \ + } while (0) #endif /* * RelationNeedsWAL - * True if relation needs WAL. + * True if relation needs WAL. */ #define RelationNeedsWAL(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) - + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + #ifdef _SHARDING_ #if 0 #define RelationHasExtent(relation) \ - (IS_PGXC_DATANODE \ - && (relation)->rd_rel->relpersistence == 'p' \ - && ((relation)->rd_rel->relkind == RELKIND_RELATION || (relation)->rd_rel->relkind == RELKIND_TOASTVALUE)\ - && (relation)->rd_locator_info ? ((relation)->rd_locator_info->locatorType == LOCATOR_TYPE_HASH ? true : false) : false\ - && RelationGetRelid(relation) >= FirstNormalObjectId) + (IS_PGXC_DATANODE \ + && (relation)->rd_rel->relpersistence == 'p' \ + && ((relation)->rd_rel->relkind == RELKIND_RELATION || (relation)->rd_rel->relkind == RELKIND_TOASTVALUE)\ + && (relation)->rd_locator_info ? ((relation)->rd_locator_info->locatorType == LOCATOR_TYPE_HASH ? true : false) : false\ + && RelationGetRelid(relation) >= FirstNormalObjectId) #endif #define RelationHasExtent(relation) \ - ((relation)->rd_rel->relhasextent) + ((relation)->rd_rel->relhasextent) #define RelationGetDisKey(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->partAttrNum : InvalidAttrNumber) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->partAttrNum : InvalidAttrNumber) #define RelationGetSecDisKey(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->secAttrNum : InvalidAttrNumber) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->secAttrNum : InvalidAttrNumber) #define RelationIsSharded(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->locatorType == LOCATOR_TYPE_SHARD : false) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->locatorType == LOCATOR_TYPE_SHARD : false) #define RelationHasToast(relation) \ - OidIsValid((relation)->rd_toastoid) + OidIsValid((relation)->rd_toastoid) #endif /* * RelationUsesLocalBuffers - * True if relation's pages are stored in local buffers. + * True if relation's pages are stored in local buffers. 
*/ #ifdef XCP #define RelationUsesLocalBuffers(relation) \ - (!OidIsValid(MyCoordId) && \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)) + (!OidIsValid(MyCoordId) && \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)) #else #define RelationUsesLocalBuffers(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) #endif #ifdef PGXC /* * RelationGetLocInfo - * Return the location info of relation + * Return the location info of relation */ #define RelationGetLocInfo(relation) ((relation)->rd_locator_info) #endif /* * RELATION_IS_LOCAL - * If a rel is either temp or newly created in the current transaction, - * it can be assumed to be accessible only to the current backend. - * This is typically used to decide that we can skip acquiring locks. + * If a rel is either temp or newly created in the current transaction, + * it can be assumed to be accessible only to the current backend. + * This is typically used to decide that we can skip acquiring locks. * * Beware of multiple eval of argument */ #ifdef XCP #define RELATION_IS_LOCAL(relation) \ - ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \ - (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \ - ((relation)->rd_backend == MyBackendId || \ - (relation)->rd_createSubid != InvalidSubTransactionId)) + ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \ + ((relation)->rd_backend == MyBackendId || \ + (relation)->rd_createSubid != InvalidSubTransactionId)) #else #define RELATION_IS_LOCAL(relation) \ - ((relation)->rd_islocaltemp || \ - (relation)->rd_createSubid != InvalidSubTransactionId) + ((relation)->rd_islocaltemp || \ + (relation)->rd_createSubid != InvalidSubTransactionId) #endif #ifdef XCP /* * RelationGetLocatorType - * Returns the rel's locator type. + * Returns the rel's locator type. */ #define RelationGetLocatorType(relation) \ - ((relation)->rd_locator_info->locatorType) + ((relation)->rd_locator_info->locatorType) #endif /* * RELATION_IS_OTHER_TEMP - * Test for a temporary relation that belongs to some other session. + * Test for a temporary relation that belongs to some other session. 
* * Beware of multiple eval of argument */ #ifdef XCP #define RELATION_IS_OTHER_TEMP(relation) \ - (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ - (relation)->rd_backend != MyBackendId) && \ - ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \ - (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId))) + (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ + (relation)->rd_backend != MyBackendId) && \ + ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId))) #else #define RELATION_IS_OTHER_TEMP(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ - !(relation)->rd_islocaltemp) + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ + !(relation)->rd_islocaltemp) #endif #ifdef XCP /* * RELATION_IS_COORDINATOR_LOCAL - * Test for a coordinator only relation such as LOCAL TEMP table or a MATVIEW + * Test for a coordinator only relation such as LOCAL TEMP table or a MATVIEW */ #define RELATION_IS_COORDINATOR_LOCAL(relation) \ - ((RELATION_IS_LOCAL(relation) && !RelationGetLocInfo(relation))) + ((RELATION_IS_LOCAL(relation) && !RelationGetLocInfo(relation))) #endif /* * RelationIsScannable - * Currently can only be false for a materialized view which has not been - * populated by its query. This is likely to get more complicated later, - * so use a macro which looks like a function. + * Currently can only be false for a materialized view which has not been + * populated by its query. This is likely to get more complicated later, + * so use a macro which looks like a function. */ #define RelationIsScannable(relation) ((relation)->rd_rel->relispopulated) /* * RelationIsPopulated - * Currently, we don't physically distinguish the "populated" and - * "scannable" properties of matviews, but that may change later. - * Hence, use the appropriate one of these macros in code tests. + * Currently, we don't physically distinguish the "populated" and + * "scannable" properties of matviews, but that may change later. + * Hence, use the appropriate one of these macros in code tests. */ #define RelationIsPopulated(relation) ((relation)->rd_rel->relispopulated) /* * RelationIsAccessibleInLogicalDecoding - * True if we need to log enough information to have access via - * decoding snapshot. + * True if we need to log enough information to have access via + * decoding snapshot. */ #define RelationIsAccessibleInLogicalDecoding(relation) \ - (XLogLogicalInfoActive() && \ - RelationNeedsWAL(relation) && \ - (IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation))) + (XLogLogicalInfoActive() && \ + RelationNeedsWAL(relation) && \ + (IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation))) /* * RelationIsLogicallyLogged - * True if we need to log enough information to extract the data from the - * WAL stream. + * True if we need to log enough information to extract the data from the + * WAL stream. * * We don't log information for unlogged tables (since they don't WAL log * anyway) and for system tables (their content is hard to make sense of, and @@ -681,46 +682,46 @@ typedef struct ViewOptions * interesting to the user... 
*/ #define RelationIsLogicallyLogged(relation) \ - (XLogLogicalInfoActive() && \ - RelationNeedsWAL(relation) && \ - !IsCatalogRelation(relation)) + (XLogLogicalInfoActive() && \ + RelationNeedsWAL(relation) && \ + !IsCatalogRelation(relation)) /* * RelationGetPartitionKey - * Returns the PartitionKey of a relation + * Returns the PartitionKey of a relation */ #define RelationGetPartitionKey(relation) ((relation)->rd_partkey) #ifdef __TBASE__ #define RelationGetNParts(relation) \ - ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partnparts : 0) + ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partnparts : 0) #define RelationGetPartitionColumnIndex(relation) \ - ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partpartkey : InvalidAttrNumber) + ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partpartkey : InvalidAttrNumber) #define RELATION_IS_INTERVAL(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_PARENT) - //((relation)->rd_partkey && (relation)->rd_partkey->strategy == PARTITION_STRATEGY_INTERVAL) + ((relation)->rd_rel->relpartkind == RELPARTKIND_PARENT) + //((relation)->rd_partkey && (relation)->rd_partkey->strategy == PARTITION_STRATEGY_INTERVAL) #define RELATION_IS_CHILD(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_CHILD) + ((relation)->rd_rel->relpartkind == RELPARTKIND_CHILD) #define RELATION_GET_PARENT(relation) \ - ((relation)->rd_rel->relparent) + ((relation)->rd_rel->relparent) #define RELATION_IS_REGULAR(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_NONE) + ((relation)->rd_rel->relpartkind == RELPARTKIND_NONE) #define IndexGetRelationId(relation) \ - ( \ - (relation)->rd_rel->relkind == RELKIND_INDEX ? \ - (relation)->rd_index->indrelid : InvalidOid \ - ) + ( \ + (relation)->rd_rel->relkind == RELKIND_INDEX ? \ + (relation)->rd_index->indrelid : InvalidOid \ + ) #define PARTITION_KEY_IS_TIMESTAMP(partoid) \ - ((partoid) == 1114 || (partoid) == 1184) + ((partoid) == 1114 || (partoid) == 1184) extern int64 get_total_relation_size(Relation rel); #endif @@ -731,19 +732,19 @@ extern int64 get_total_relation_size(Relation rel); static inline int get_partition_strategy(PartitionKey key) { - return key->strategy; + return key->strategy; } static inline int get_partition_natts(PartitionKey key) { - return key->partnatts; + return key->partnatts; } static inline List * get_partition_exprs(PartitionKey key) { - return key->partexprs; + return key->partexprs; } /* @@ -752,24 +753,24 @@ get_partition_exprs(PartitionKey key) static inline int16 get_partition_col_attnum(PartitionKey key, int col) { - return key->partattrs[col]; + return key->partattrs[col]; } static inline Oid get_partition_col_typid(PartitionKey key, int col) { - return key->parttypid[col]; + return key->parttypid[col]; } static inline int32 get_partition_col_typmod(PartitionKey key, int col) { - return key->parttypmod[col]; + return key->parttypmod[col]; } /* * RelationGetPartitionDesc - * Returns partition descriptor for a relation. + * Returns partition descriptor for a relation. 
*/ #define RelationGetPartitionDesc(relation) ((relation)->rd_partdesc) @@ -779,4 +780,4 @@ extern void RelationDecrementReferenceCount(Relation rel); extern bool RelationHasUnloggedIndex(Relation rel); extern List *RelationGetRepsetList(Relation rel); -#endif /* REL_H */ +#endif /* REL_H */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 98adccc1..243834fa 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -15,6 +15,7 @@ #define RELCACHE_H #include "access/tupdesc.h" +#include "lib/ilist.h" #include "nodes/bitmapset.h" @@ -54,6 +55,13 @@ typedef enum IndexAttrBitmapKind INDEX_ATTR_BITMAP_IDENTITY_KEY } IndexAttrBitmapKind; +typedef struct relcacheheader +{ + int rh_ntup; /* # of tuples in relation cache */ + int rh_maxtup; /* max number of LRU relations */ + dlist_head rh_lrulist; /* LRU list, most recent first */ +} RelCacheHeader; + extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind keyAttrs); @@ -131,6 +139,9 @@ extern void RelationCacheInitFileRemove(void); extern bool RelationHasGTS(Relation rel); +extern void RelationLRUInsert(Relation rel); +extern void RelationLRUDelete(Relation rel); + /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b0bd7fb4..b624ad7c 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -107,6 +107,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_key_value | off enable_lock_account | off enable_material | on + enable_memory_optimization | on enable_mergejoin | on enable_multi_cluster | on enable_multi_cluster_print | off @@ -133,7 +134,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(61 rows) +(62 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 6d6fe64caf9699f9ecf0d8a66562e8fe342aa034 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Tue, 20 Apr 2021 21:24:43 +0800 Subject: [PATCH 167/578] =?UTF-8?q?Tbase=5Fv2.15.18=E5=90=88=E6=B5=81?= =?UTF-8?q?=E5=88=B0=E4=B8=BB=E7=BA=BFTbase=5Fv2.15=20(merge=20request=20!?= =?UTF-8?q?282)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'Tbase_v2.15.18' into 'Tbase_v2.15' * Merge branch 'Tbase_v2.15' into Tbase_v2.15.18 * merge_Tbase_v2.15.18 (merge request !280) * ExecRemoteQuery should ignore received tuple desc from DN if CN already has one * fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131086220995 (merge request !265) * Make value scan as replicated distribution. * Push value scan to datanode. 
* fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1110092131080582878
* fix Failed to get pooled connections http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083143069 (merge request !224)
* Fix the issue that no write lock was taken when initializing shared buffer (merge request !240)
* jacky/bugfix/consistent_Tbase_v2.15.18 (merge request !243)
* jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206)
* http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131084864233&url_cache_key=3ba5cc9f7d4408eb8cb3e14319eb688f
* use subroot for view sort in case of no sort operator in plan
* fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131085879935 (merge request !214)
* sync dynamic shared memory from pg
* jacky/bugfix/pooler_Tbase_v2.15.12 (merge request !163)
* Merge branch 'Tbase_v2.15.18' of http://git.code.oa.com/Tbase/PG-XL-v10 into Tbase_v2.15.18
* add error print in pg_check while pgxc_node is wrong
* fix bug in bitmap_scan_cost
* Only print local cached plan if execution got error and minor fix (merge request !182)
* create branch Tbase_v2.15.18
---
 .../tbase_pooler_stat--1.0.sql | 1 +
 contrib/tbase_pooler_stat/tbase_pooler_stat.c | 13 +-
 src/backend/access/hash/hash.c | 9 +
 src/backend/access/hash/hashsearch.c | 18 -
 src/backend/commands/analyze.c | 5 +
 src/backend/nodes/copyfuncs.c | 1 +
 src/backend/optimizer/path/allpaths.c | 56 ++-
 src/backend/optimizer/path/indxpath.c | 2 +
 src/backend/optimizer/plan/createplan.c | 16 +-
 src/backend/optimizer/util/pgxcship.c | 25 +-
 src/backend/pgxc/pool/execRemote.c | 9 +
 src/backend/pgxc/pool/poolmgr.c | 357 ++++++++++++------
 src/backend/storage/buffer/bufmgr.c | 4 +
 src/backend/storage/freespace/emapage.c | 4 +
 src/backend/storage/ipc/dsm.c | 35 +-
 src/backend/utils/misc/mls.c | 4 +-
 src/backend/utils/mmgr/dsa.c | 78 +++-
 src/backend/utils/mmgr/freepage.c | 25 +-
 src/backend/utils/time/tqual.c | 22 +-
 src/include/pgxc/planner.h | 1 +
 src/include/pgxc/poolmgr.h | 4 +-
 src/test/regress/expected/aggregates_1.out | 27 --
 src/test/regress/expected/create_view.out | 32 ++
 src/test/regress/expected/gist_1.out | 5 +-
 src/test/regress/expected/groupingsets.out | 135 +++----
 src/test/regress/expected/groupingsets_1.out | 28 +-
 src/test/regress/expected/join_3.out | 51 ++-
 src/test/regress/expected/rules.out | 3 +-
 src/test/regress/expected/subselect.out | 44 ++-
 src/test/regress/expected/tablesample_1.out | 31 +-
 src/test/regress/sql/create_view.sql | 8 +
 31 files changed, 700 insertions(+), 353 deletions(-)

diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
index 93b0be11..5ee8e1e6 100644
--- a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
+++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
@@ -32,6 +32,7 @@ CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics(
     OUT warming_cnt int4,
     OUT query_cnt int4,
     OUT exceed_keepalive_cnt int4,
+    OUT exceed_deadtime_cnt int4,
     OUT exceed_maxlifetime_cnt int4
 )
 RETURNS SETOF record
diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c
index ac77982b..d38a4755 100644
--- a/contrib/tbase_pooler_stat/tbase_pooler_stat.c
+++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c
@@ -182,7 +182,7 @@ tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS)
 Datum
 tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS)
 {
-#define LIST_POOLER_CONN_STATISTICS_COLUMNS 11
+#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12
     FuncCallContext *funcctx = NULL;
     int32 ret = 0;
     Pooler_ConnState *status = NULL;
@@ -222,7 +222,12 @@
tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_maxlifetime_cnt", + /* + * This field is reserved for compatibility + */ + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); @@ -277,6 +282,7 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; nulls[10] = true; + nulls[11] = true; } else { @@ -288,7 +294,8 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) values[7] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); - values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[10] = UInt32GetDatum(0); + values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); status->node_cursor--; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 7d04dd80..b7e21348 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -284,7 +284,16 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } + } /* * If we've already initialized this scan, we can just advance it in the diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index e4ea31de..32df3ca4 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -469,17 +469,8 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) { - if (enable_buffer_mprotect) - { - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); - _hash_kill_items(scan); - } - else - { _hash_kill_items(scan); } - } /* * ran off the end of this page, try the next @@ -537,17 +528,8 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) { - if (enable_buffer_mprotect) - { - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); - _hash_kill_items(scan); - } - else - { _hash_kill_items(scan); } - } /* * ran off the end of this page, try the next diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 54a26e55..48cf8d22 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5133,6 +5133,11 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, dummy = makeVar(1, 5, onerel->rd_rel->reltype, 0, InvalidOid, 0); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, makeTargetEntry((Expr *) dummy, 5, "rows", false)); + /* + * ANALYZE has known it's result slot desc, should + * ignore received one to avoid duplicate name issue + */ + step->ignore_tuple_desc = true; /* Execute query on the data nodes */ estate = CreateExecutorState(); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 
c2885241..6b103aa4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1341,6 +1341,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_SCALAR_FIELD(jf_xc_wholerow); COPY_BITMAPSET_FIELD(conflict_cols); COPY_SCALAR_FIELD(is_set); + COPY_SCALAR_FIELD(ignore_tuple_desc); #endif return newnode; } diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 310cec07..821c2aef 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -44,6 +44,7 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "pgxc/nodemgr.h" #ifdef PGXC #include "nodes/makefuncs.h" #include "miscadmin.h" @@ -141,6 +142,7 @@ static void recurse_push_qual(Node *setOp, Query *topquery, static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel); static void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels); +static bool check_list_contain_all_const(List *list); /* @@ -2078,6 +2080,35 @@ set_function_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) pathkeys, required_outer)); } +/* + * check_list_contain_all_const + * Check the list is contain all consts. + */ +static bool +check_list_contain_all_const(List *list) +{ + ListCell *lc = NULL; + Node *node = NULL; + + foreach(lc, list) + { + node = lfirst(lc); + if (IsA(node, List)) + { + if (!check_list_contain_all_const(node)) + { + return false; + } + } + else if (!IsA(node, Const)) + { + return false; + } + } + + return true; +} + /* * set_values_pathlist * Build the (single) access path for a VALUES RTE @@ -2086,6 +2117,7 @@ static void set_values_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { Relids required_outer; + Path *new_path = NULL; /* * We don't support pushing join clauses into the quals of a values scan, @@ -2095,7 +2127,29 @@ set_values_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) required_outer = rel->lateral_relids; /* Generate appropriate path */ - add_path(rel, create_valuesscan_path(root, rel, required_outer)); + new_path = create_valuesscan_path(root, rel, required_outer); + + /* Mark scan as replicated if selected value list is all const */ + if (root->parse->commandType == CMD_SELECT && + check_list_contain_all_const((List *)rte->values_lists)) + { + Distribution *targetd = NULL; + int node_index = 0; + + targetd = makeNode(Distribution); + targetd->distributionType = LOCATOR_TYPE_REPLICATED; + targetd->nodes = NULL; + + for (node_index = 0; node_index < NumDataNodes; node_index++) + { + targetd->nodes = bms_add_member(targetd->nodes, node_index); + } + + targetd->restrictNodes = NULL; + new_path->distribution = targetd; + } + + add_path(rel, new_path); } /* diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 31f75070..4d3b3cce 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1620,6 +1620,8 @@ bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel, Path *ipath) required_outer); bpath.path.pathkeys = NIL; bpath.bitmapqual = ipath; + /* TODO: get real distribution information */ + bpath.path.distribution = NULL; /* * Check the cost of temporary path without considering parallelism. 
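
[Illustrative note, not part of the patch] The allpaths.c change above only marks a VALUES scan as replicated for SELECT queries in which check_list_contain_all_const() finds nothing but Const nodes in the VALUES lists; the path is then tagged LOCATOR_TYPE_REPLICATED over all datanodes so it can be shipped instead of being evaluated on the coordinator. A rough sketch of the kind of query affected follows; the example queries are assumptions for illustration, not taken from the regression suite, and the plan shape depends on configuration and costing.

-- every VALUES entry is a Const, so the scan may be pushed to a datanode
explain (costs off)
select * from (values (1, 'a'), (2, 'b')) as v(id, name);

-- a non-constant entry (a FuncExpr such as now()) keeps the coordinator-side scan
explain (costs off)
select * from (values (1, now())) as v(id, ts);
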
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index cbd3bef4..48589d94 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -104,6 +104,7 @@ int remote_subplan_depth = 0; List *groupOids = NULL; bool mergejoin = false; +bool child_of_gather = false; bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; #endif @@ -1955,6 +1956,14 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) Gather *gather_plan; Plan *subplan; List *tlist; + bool reset = false; + + /* if child_of_gather is false, set child_of_gather true, and reset the value before return */ + if (!child_of_gather) + { + child_of_gather = true; + reset = true; + } /* * Although the Gather node can project, we prefer to push down such work @@ -1975,6 +1984,11 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) /* use parallel mode for parallel plans. */ root->glob->parallelModeNeeded = true; + if (reset) + { + child_of_gather = false; + } + return gather_plan; } @@ -7086,7 +7100,7 @@ make_remotesubplan(PlannerInfo *root, gather_plan->parallelWorker_sendTuple = true; } - if ((IsA(lefttree, Gather) || lefttree->parallel_aware) && + if ((IsA(lefttree, Gather) || lefttree->parallel_aware || child_of_gather) && olap_optimizer) { plan->parallel_aware = true; diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index df8aa3d5..3d00fa58 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1884,6 +1884,16 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } +ExecNodes * +make_FQS_single_node() +{ + ExecNodes *exec_nodes; + exec_nodes = makeNode(ExecNodes); + exec_nodes->accesstype = RELATION_ACCESS_READ_FQS; + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, 0); + return exec_nodes; +} + /* * pgxc_is_query_shippable * This function calls the query walker to analyse the query to gather @@ -1917,10 +1927,19 @@ pgxc_is_query_shippable(Query *query, int query_level) exec_nodes = sc_context.sc_exec_nodes; - /* For single datanode and select command, we ship it directly. */ - if (NumDataNodes == 1 && query->commandType == CMD_SELECT && - !bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + /* For single datanode and select command, if we don't need coord + * and exec_nodes exists, return it directly. 
But if exec_nodes is + * NULL we make exec_nodes for FQS; + */ + if (!bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + { + if (NumDataNodes == 1 && query->commandType == CMD_SELECT) + { + if (exec_nodes) return exec_nodes; + return make_FQS_single_node(); + } + } /* * The shippability context contains two ExecNodes, one for the subLinks diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index a06692a6..d105295a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -545,6 +545,15 @@ HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len) (errcode(ERRCODE_DATA_CORRUPTED), errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type))); } + + /* should ignore received tuple desc if already got one to avoid duplicate name issue */ + if (combiner->ss.ps.plan != NULL && + IsA(combiner->ss.ps.plan, RemoteQuery) && + ((RemoteQuery *) combiner->ss.ps.plan)->ignore_tuple_desc) + { + return false; + } + /* Increment counter and check if it was first */ if (combiner->description_count == 0) { diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b83e273f..63d2c3e9 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -82,7 +82,6 @@ int PoolConnKeepAlive = 600; int PoolMaintenanceTimeout = 30; int PoolSizeCheckGap = 120; /* max check memory size gap, in seconds */ int PoolConnMaxLifetime = 600; /* max lifetime of a pooled connection, in seconds */ -int PoolWarmConnMaxLifetime = 7200; /* max lifetime of a warm-needed pooled connection, in seconds */ int PoolMaxMemoryLimit = 10; int PoolConnectTimeOut = 10; int PoolScaleFactor = 2; @@ -207,7 +206,7 @@ typedef struct Oid nodeoid; /* Node Oid related to this pool */ char *connstr; /* palloc memory, need free */ - int32 m_version; /* version of node pool */ + time_t m_version; /* version of node pool */ int32 size; /* total pool size */ int32 validSize; /* valid data element number */ bool failed; @@ -567,7 +566,7 @@ static void *pooler_async_utility_thread(void *arg); static void *pooler_async_connection_management_thread(void *arg); static void *pooler_sync_remote_operator_thread(void *arg); -static bool pooler_async_build_connection(DatabasePool *pool, int32 pool_version, int32 nodeidx, Oid node, +static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord); static BitmapMgr *BmpMgrCreate(uint32 objnum); static int BmpMgrAlloc(BitmapMgr *mgr); @@ -597,7 +596,6 @@ static void handle_clean_connection(PoolAgent * agent, StringInfo s); static void handle_get_connections(PoolAgent * agent, StringInfo s); static void handle_query_cancel(PoolAgent * agent, StringInfo s); static void handle_session_command(PoolAgent * agent, StringInfo s); -static bool remove_all_agent_references(Oid nodeoid); static int refresh_database_pools(PoolAgent *agent); static void pooler_async_ping_node(Oid node); @@ -3149,6 +3147,17 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, else { acquire_succeed_num++; + if (!slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"agent_acquire_connections acquired connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d that should be destoried.", + agent->pool->database, 
agent->pool->user_name, + nodePool->node_name, slot->backend_pid, node, + nodePool->size, nodePool->freeSize); + } if (PoolConnectDebugPrint) { /* double check, to ensure no double destory and multiple agents for one slot */ @@ -4470,7 +4479,12 @@ reload_database_pools(PoolAgent *agent) if (nodePool->size == nodePool->freeSize) #endif { - elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, size:%d, freeSize:%d, destory it now", nodePool->connstr, nodePool->size, nodePool->freeSize); + elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " + "size:%d, freeSize:%d, reload_database_pools: nodePools " + "of node (%u, %s) is removed.", + nodePool->connstr, + nodePool->size, nodePool->freeSize, + nodePool->nodeoid, nodePool->node_name); destroy_node_pool(nodePool); hash_search(databasePool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); @@ -4480,7 +4494,13 @@ reload_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version++; + nodePool->m_version = time(NULL); + elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " + "size:%d, freeSize:%d, reload_database_pools: nodePools " + "of node (%u, %s) has increased version %lu.", + nodePool->connstr, + nodePool->size, nodePool->freeSize, + nodePool->nodeoid, nodePool->node_name, nodePool->m_version); /* fresh the connect string so that new coming connection will connect to the new node */ if (connstr_chk) @@ -4687,7 +4707,11 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND, NULL); - if (nodePool == NULL) + /* + * The node pool of connections may has been created just now and the size is + * initialized to 0. This situation needs to be excluded. + */ + if ((nodePool == NULL) || (nodePool->size == 0)) { /* * The node may be altered or dropped. 
@@ -4695,8 +4719,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); - abort(); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "size:%d freeSize:%d can not find nodepool, just destory it", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } destroy_slot(nodeidx, node, slot); return; @@ -4704,7 +4734,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to nodename:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d begin to release", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "nodename:%s backend_pid:%d nodeidx:%d " + "size:%d freeSize:%d begin to release", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } /* force destroy the connection when pool not enabled */ @@ -4716,11 +4753,19 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, /* destory the slot of former nodePool */ if (slot->m_version != nodePool->m_version) { - force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d agentCount:%d size:%d freeSize:%d node version:%d slot version:%d not match", nodePool->node_name, slot->backend_pid, nodeidx, agentCount, nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d agentCount:%d " + "size:%d freeSize:%d node version:%lu slot version:%lu not match", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, agentCount, + nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); } + destroy_slot(nodeidx, node, slot); + return; } if (!force_destroy) @@ -4731,24 +4776,38 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, /* warm a connection is a hard job, when release them, we need make sure it has worked long enough. 
*/ if (slot->bwarmed) { - if (nodePool->freeSize > MinFreeSize && difftime(now, slot->created) > PoolWarmConnMaxLifetime) + if (nodePool->freeSize > MinFreeSize || difftime(now, slot->created) > PoolConnMaxLifetime) { force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"warmed connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"warmed connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } else { if (((nodePool->freeSize > 0) && (nodePool->nwarming + nodePool->nquery) > MinFreeSize) || - (difftime(now, slot->created) >= PoolWarmConnMaxLifetime)) + (difftime(now, slot->created) >= PoolConnMaxLifetime)) { force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"unwarmed connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"unwarmed connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4759,7 +4818,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4795,7 +4861,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, slot->released = now; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection return connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection return connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4805,7 +4878,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, elog(DEBUG1, POOL_MGR_PREFIX"Cleaning up connection from pool %s, closing", nodePool->connstr); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection destory connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection destory connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + 
nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } destroy_slot(nodeidx, node, slot); @@ -4819,6 +4899,18 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, grow_pool(dbPool, nodeidx, node, bCoord); } } + + if (!slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"release_connection has not destoried connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); + } } /* @@ -4872,6 +4964,11 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); MemoryContextSwitchTo(oldcontext); + + nodePool->m_version = time(NULL); + elog(LOG, + "grow_pool: nodePools of node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } /* here, we move the connection build work to async threads */ @@ -5110,6 +5207,8 @@ destroy_node_pool(PGXCNodePool *node_pool) destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); } pfree(node_pool->slot); + node_pool->size -= node_pool->freeSize; + node_pool->freeSize = 0; } } @@ -5128,8 +5227,11 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"About to destroy slots of node pool %s, agentCount is %d, node_pool version:%d current size is %d, freeSize is %d, %d connections are in use", - node_pool->connstr, node_pool->m_version, agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:%lu " + "agentCount is %d current size is %d, freeSize is %d, %d connections are in use", + node_pool->connstr, node_pool->m_version, + agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); } if (node_pool->slot) @@ -5141,8 +5243,8 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); node_pool->slot[i] = NULL; } - node_pool->freeSize = 0; node_pool->size -= node_pool->freeSize; + node_pool->freeSize = 0; } } @@ -5829,7 +5931,14 @@ shrink_pool(DatabasePool *pool) { if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"shrink_pool destroy a connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"shrink_pool destroy a connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + pool->database, pool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } /* connection is idle for long, close it */ destroy_slot(nodeidx, nodePool->nodeoid, slot); @@ -5884,9 +5993,13 @@ shrink_pool(DatabasePool *pool) { if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"close %d long time free node:%u, poolsize:%d, freeSize:%d", freeCount, nodePool->nodeoid, - nodePool->size, - nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"close %d long time free connections of " + "database:%s user:%s " + "node:%u, poolsize:%d, freeSize:%d", + freeCount, + pool->database, pool->user_name, + nodePool->nodeoid, nodePool->size, nodePool->freeSize); } /* only grow pool when pool needed. 
*/ @@ -5896,12 +6009,40 @@ shrink_pool(DatabasePool *pool) } } + if (PoolConnectDebugPrint) + { + for (i = 0; i < nodePool->freeSize; i++) + { + PGXCNodePoolSlot *slot = nodePool->slot[i]; + + if (slot && !slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"shrink_pool found connection to " + "database:%s user:%s " + "nodename:%s nodeid:%d " + "nodepool size:%d freeSize:%d that should be destoried.", + pool->database, pool->user_name, + nodePool->node_name, nodePool->nodeoid, + nodePool->size, nodePool->freeSize); + + break; + } + } + } + if (nodePool->size > 0) { empty = false; } else { + if (PoolConnectDebugPrint) + { + elog(LOG, + "shrink_pool: nodePools of node (%u, %s) is removed.", + nodePool->nodeoid, nodePool->node_name); + } destroy_node_pool(nodePool); hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); } @@ -6561,9 +6702,10 @@ static void pooler_sync_connections_to_nodepool(void) nodePool = (PGXCNodePool *) hash_search(asyncInfo->dbPool->nodePools, &asyncInfo->node, HASH_ENTER, &found); - if (!found) { + elog(WARNING, POOL_MGR_PREFIX"The nodePool has not found when the slot is warmed up."); + oldcontext = MemoryContextSwitchTo(PoolerMemoryContext); nodePool->connstr = build_node_conn_str(asyncInfo->node, asyncInfo->dbPool); if (!nodePool->connstr) @@ -6587,6 +6729,7 @@ static void pooler_sync_connections_to_nodepool(void) nodePool->coord = false; /* in this case, only datanode */ nodePool->nwarming = 0; nodePool->nquery = 0; + nodePool->m_version = time(NULL); name_str = get_node_name_by_nodeoid(asyncInfo->node); if (NULL == name_str) @@ -6632,16 +6775,18 @@ static void pooler_sync_connections_to_nodepool(void) } else { - - nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); - - /* Decrease pool size */ - DecreasePoolerSize(nodePool,__FILE__, __LINE__); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u nodeidx:%d nodepool size:%d freeSize:%d for unmatch version, slot->m_version:%d, nodePool->m_version:%d", asyncInfo->node, nodeidx, nodePool->size, nodePool->freeSize, asyncInfo->slot->m_version, nodePool->m_version); + elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u " + "nodeidx:%d nodepool size:%d freeSize:%d for unmatch " + "version, slot->m_version:%lu, nodePool->m_version:%lu", + asyncInfo->node, + nodeidx, nodePool->size, nodePool->freeSize, + asyncInfo->slot->m_version, nodePool->m_version); } + nodeidx = get_node_index_by_nodeoid(asyncInfo->node); + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + break; } if (COMMAND_CONNECTION_WARM == asyncInfo->cmd) @@ -6753,6 +6898,12 @@ static void pooler_sync_connections_to_nodepool(void) errmsg(POOL_MGR_PREFIX"get node %u name failed", connRsp->nodeoid))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); + + nodePool->m_version = now; + elog(LOG, + "pooler_sync_connections_to_nodepool: nodePools of " + "node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } /* add connection to hash table */ @@ -6801,14 +6952,12 @@ static void pooler_sync_connections_to_nodepool(void) destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, freeSize:%d, node:%u, MaxPoolSize:%d, connRsp->m_version:%d, nodePool->m_version:%d", + elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, " + "freeSize:%d, node:%u, MaxPoolSize:%d, " + 
"connRsp->m_version:%lu, nodePool->m_version:%lu", nodePool->size, - nodePool->freeSize, - nodePool->nodeoid, - MaxPoolSize, - connRsp->m_version, - nodePool->m_version - ); + nodePool->freeSize, nodePool->nodeoid, MaxPoolSize, + connRsp->m_version, nodePool->m_version); } } @@ -6998,7 +7147,7 @@ static void pooler_async_ping_node(Oid node) /* async batch connection build */ -static bool pooler_async_build_connection(DatabasePool *pool, int32 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) +static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) { int32 threadid; uint64 pipeput_loops = 0; @@ -7308,6 +7457,11 @@ preconnect_and_warm(DatabasePool *dbPool) errmsg(POOL_MGR_PREFIX"get node %u name failed", dnOids[i]))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); + + nodePool->m_version = time(NULL); + elog(LOG, + "preconnect_and_warm: nodePools of node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } while (nodePool->size < MinPoolSize || (nodePool->freeSize < MinFreeSize && nodePool->size < MaxPoolSize)) @@ -7761,8 +7915,9 @@ void *pooler_sync_remote_operator_thread(void *arg) { if (PoolConnectStaus_connected == request->final_status) { - finish_task_request(request->taskControl); + /* Increase success count first and then finish count */ acquire_command_increase_succeed(request->taskControl); + finish_task_request(request->taskControl); request->current_status = PoolConnectStaus_done; } else @@ -7847,8 +8002,9 @@ void *pooler_sync_remote_operator_thread(void *arg) { /* job succeed */ request->current_status = PoolConnectStaus_done; - finish_task_request(request->taskControl); + /* Increase success count first and then finish count */ acquire_command_increase_succeed(request->taskControl); + finish_task_request(request->taskControl); } } continue; @@ -7864,9 +8020,10 @@ void *pooler_sync_remote_operator_thread(void *arg) { int32 ret2 = 0; + /* Increase success count first and then finish count */ + acquire_command_increase_succeed(request->taskControl); /* set myself finish count */ finish_task_request(request->taskControl); - acquire_command_increase_succeed(request->taskControl); /* wait for others to finish */ while (!check_is_task_done(request->taskControl)) @@ -8452,6 +8609,9 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, /* use version to tag every slot */ slot->m_version = nodepool->m_version; + slot->created = time(NULL); + slot->checked = slot->created; + slot->released = slot->created; } @@ -9916,7 +10076,7 @@ static void print_pooler_slot(PGXCNodePoolSlot *slot) } else { - elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%d pid=%d seqnum=%d " + elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%lu pid=%d seqnum=%d " "bdestoryed=%d file=%s lineno=%d node_name=%s backend_pid=%d", slot, slot->bwarmed, slot->usecount, slot->refcount,slot->m_version,slot->pid,slot->seqnum, @@ -10744,65 +10904,6 @@ handle_session_command(PoolAgent * agent, StringInfo s) } - -static bool -remove_all_agent_references(Oid nodeoid) -{// #lizard forgives - int i, j; - bool res = true; - - /* - * Identify if it's a coordinator or datanode first - * and get its index - */ - for (i = 1; i <= agentCount; i++) - { - bool found = false; - - PoolAgent *agent = poolAgents[i - 1]; - for (j = 0; j < agent->num_dn_connections; j++) - { - if (agent->dn_conn_oids[j] == nodeoid) - { - found = true; - 
break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->dn_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); - agent->dn_connections[j] = NULL; - } - else - { - for (j = 0; j < agent->num_coord_connections; j++) - { - if (agent->coord_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->coord_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); - agent->coord_connections[j] = NULL; - } - else - { - elog(LOG, "Node not found! (%u)", nodeoid); - res = false; - } - } - } - return res; -} - - /* * refresh_database_pools * refresh information for all database pools @@ -10875,22 +10976,44 @@ refresh_database_pools(PoolAgent *agent) if (strcmp(connstr_chk, nodePool->connstr)) { - elog(LOG, "Found an altered node (%u)", nodePool->nodeoid); - /* - * Node has been altered. First remove - * all references to this node from ALL the - * agents before destroying it.. - */ - if (!remove_all_agent_references(nodePool->nodeoid)) + if (nodePool->size == nodePool->freeSize) { - res = POOL_REFRESH_FAILED; - break; - } - + elog(LOG, + "refresh_database_pools: Found an altered node (%u %s) " + "size %d freesize %d is removed. " + "connstr_chk=%s, nodePool->connstr=%s", + nodePool->nodeoid, nodePool->node_name, + nodePool->size, nodePool->freeSize, + connstr_chk, nodePool->connstr); destroy_node_pool(nodePool); hash_search(databasePool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); } + else + { + destroy_node_pool_free_slots(nodePool); + + /* increase the node pool version */ + nodePool->m_version = time(NULL); + elog(LOG, + "refresh_database_pools: Found an altered node (%u %s) " + "size %d freesize %d increased m_version %lu" + "connstr_chk=%s, nodePool->connstr=%s", + nodePool->nodeoid, nodePool->node_name, + nodePool->size, nodePool->freeSize, nodePool->m_version, + connstr_chk, nodePool->connstr); + + /* fresh the connect string so that new coming connection will connect to the new node */ + if (connstr_chk) + { + if (nodePool->connstr) + { + pfree(nodePool->connstr); + } + nodePool->connstr = pstrdup(connstr_chk); + } + } + } if (connstr_chk) pfree(connstr_chk); @@ -11181,7 +11304,7 @@ handle_close_pooled_connections(PoolAgent * agent, StringInfo s) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version++; + nodePool->m_version = time(NULL); } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 56276b4a..3f4a8089 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2966,7 +2966,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) if (REL_CRYPT_ENTRY_IS_VALID(&(reln->smgr_relcrypt)) && (MAIN_FORKNUM == buf->tag.forkNum || EXTENT_FORKNUM == buf->tag.forkNum)) { + BufDisableMemoryProtection(bufBlock, false); bufBlockEncrypt = rel_crypt_page_encrypt((RelCrypt)&(reln->smgr_relcrypt), bufToWrite); + BufEnableMemoryProtection(bufBlock, false); } else { @@ -3505,7 +3507,9 @@ FlushRelationBuffers(Relation rel) if (REL_CRYPT_ENTRY_IS_VALID(&(rel->rd_smgr->smgr_relcrypt)) && (MAIN_FORKNUM == bufHdr->tag.forkNum || EXTENT_FORKNUM == bufHdr->tag.forkNum)) { + BufDisableMemoryProtection(localpage, false); bufBlockEncrypt = rel_crypt_page_encrypt((RelCrypt)&(rel->rd_smgr->smgr_relcrypt), localpage); + BufDisableMemoryProtection(localpage, false); } else { diff --git 
a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index b288c04e..063d08f3 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -220,6 +220,7 @@ extent_readbuffer(Relation rel, BlockNumber blkno, bool extend) buf = ReadBufferExtended(rel, EXTENT_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); PageInit_shard(BufferGetPage(buf), BLCKSZ, 0, InvalidShardID, true); switch(pagetype) { @@ -236,6 +237,7 @@ extent_readbuffer(Relation rel, BlockNumber blkno, bool extend) elog(PANIC, "page type %d is not supported.", pagetype); break; } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; } @@ -324,6 +326,7 @@ extent_readbuffer_for_redo(RelFileNode rnode, BlockNumber blkno, bool extend) buf = XLogReadBufferExtended(rnode, EXTENT_FORKNUM, blkno, RBM_ZERO_ON_ERROR); if (PageIsNew(BufferGetPage(buf))) { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); PageInit_shard(BufferGetPage(buf), BLCKSZ, 0, InvalidShardID, true); switch(pagetype) { @@ -340,6 +343,7 @@ extent_readbuffer_for_redo(RelFileNode rnode, BlockNumber blkno, bool extend) elog(PANIC, "page type %d is not supported.", pagetype); break; } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; } diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index ac677b63..eaf5145f 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -512,8 +512,6 @@ dsm_create(Size size, int flags) /* Verify that we can support an additional mapping. */ if (nitems >= dsm_control->maxitems) { - if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) - { LWLockRelease(DynamicSharedMemoryControlLock); dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, &seg->mapped_address, &seg->mapped_size, WARNING); @@ -521,8 +519,10 @@ dsm_create(Size size, int flags) ResourceOwnerForgetDSM(seg->resowner, seg); dlist_delete(&seg->node); pfree(seg); + + if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) return NULL; - } + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("too many dynamic shared memory segments"))); @@ -597,22 +597,20 @@ dsm_attach(dsm_handle h) nitems = dsm_control->nitems; for (i = 0; i < nitems; ++i) { - /* If the reference count is 0, the slot is actually unused. */ - if (dsm_control->item[i].refcnt == 0) + /* + * If the reference count is 0, the slot is actually unused. If the + * reference count is 1, the slot is still in use, but the segment is + * in the process of going away; even if the handle matches, another + * slot may already have started using the same handle value by + * coincidence so we have to keep searching. + */ + if (dsm_control->item[i].refcnt <= 1) continue; /* If the handle doesn't match, it's not the slot we want. */ if (dsm_control->item[i].handle != seg->handle) continue; - /* - * If the reference count is 1, the slot is still in use, but the - * segment is in the process of going away. Treat that as if we - * didn't find a match. - */ - if (dsm_control->item[i].refcnt == 1) - break; - /* Otherwise we've found a match. */ dsm_control->item[i].refcnt++; seg->control_slot = i; @@ -728,8 +726,12 @@ dsm_detach(dsm_segment *seg) /* * Invoke registered callbacks. Just in case one of those callbacks * throws a further error that brings us back here, pop the callback - * before invoking it, to avoid infinite error recursion. + * before invoking it, to avoid infinite error recursion. 
Don't allow + * interrupts while running the individual callbacks in non-error code + * paths, to avoid leaving cleanup work unfinished if we're interrupted by + * a statement timeout or similar. */ + HOLD_INTERRUPTS(); while (!slist_is_empty(&seg->on_detach)) { slist_node *node; @@ -745,6 +747,7 @@ dsm_detach(dsm_segment *seg) function(seg, arg); } + RESUME_INTERRUPTS(); /* * Try to remove the mapping, if one exists. Normally, there will be, but @@ -906,8 +909,8 @@ dsm_unpin_segment(dsm_handle handle) LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); for (i = 0; i < dsm_control->nitems; ++i) { - /* Skip unused slots. */ - if (dsm_control->item[i].refcnt == 0) + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) continue; /* If we've found our handle, we can stop searching. */ diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 014e81b2..e55f36dd 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -749,9 +749,7 @@ static void* mls_crypt_worker(void * input) buf_need_encrypt = page_new + BLCKSZ; /* 2.2 do the encrypt */ - need_mprotect = enable_buffer_mprotect && - !BufferIsLocal(encrypt_element.buf_id) && - BufferIsValid(encrypt_element.buf_id); + need_mprotect = enable_buffer_mprotect && !BufferIsLocal(encrypt_element.buf_id); if (need_mprotect) { BufDisableMemoryProtection(buf, false); diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 1382c516..f7f11c06 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -256,7 +256,7 @@ static const uint16 dsa_size_classes[] = { * round the size of the object up to the next multiple of 8 bytes, and then * index into this array. */ -static char dsa_size_class_map[] = { +static const uint8 dsa_size_class_map[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, @@ -405,6 +405,7 @@ static dsa_area *create_internal(void *place, size_t size, static dsa_area *attach_internal(void *place, dsm_segment *segment, dsa_handle handle); static void check_for_freed_segments(dsa_area *area); +static void check_for_freed_segments_locked(dsa_area *area); /* * Create a new shared area in a new DSM segment. Further DSM segments will @@ -649,7 +650,7 @@ dsa_pin_mapping(dsa_area *area) * will result in an ERROR. * * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when - * no memory is available or a size limit establed by set_dsa_size_limit + * no memory is available or a size limit establed by dsa_set_size_limit * would be exceeded. Otherwise, such allocations will result in an ERROR. * * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the @@ -692,7 +693,16 @@ dsa_allocate_extended(dsa_area *area, Size size, int flags) /* Obtain a span object. */ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); if (!DsaPointerIsValid(span_pointer)) + { + /* Raise error unless asked not to. 
*/ + if ((flags & DSA_ALLOC_NO_OOM) == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", + size))); return InvalidDsaPointer; + } LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); @@ -1065,6 +1075,7 @@ dsa_dump(dsa_area *area) */ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); fprintf(stderr, "dsa_area handle %x:\n", area->control->handle); fprintf(stderr, " max_total_segment_size: %zu\n", area->control->max_total_segment_size); @@ -1667,13 +1678,15 @@ ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, return false; } } + + /* + * This shouldn't happen: get_best_segment() or make_new_segment() + * promised that we can successfully allocate npages. + */ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) - { - LWLockRelease(DSA_AREA_LOCK(area)); - if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) - dsa_free(area, span_pointer); - return false; - } + elog(FATAL, + "dsa_allocate could not find %zu free pages for superblock", + npages); LWLockRelease(DSA_AREA_LOCK(area)); /* Compute the start of the superblock. */ @@ -1762,6 +1775,23 @@ get_segment_by_index(dsa_area *area, dsa_segment_index index) (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); } + /* + * Callers of dsa_get_address() and dsa_free() don't hold the area lock, + * but it's a bug in the calling code and undefined behavior if the + * address is not live (ie if the segment might possibly have been freed, + * they're trying to use a dangling pointer). + * + * For dsa.c code that holds the area lock to manipulate segment_bins + * lists, it would be a bug if we ever reach a freed segment here. After + * it's marked as freed, the only thing any backend should do with it is + * unmap it, and it should always have done that in + * check_for_freed_segments_locked() before arriving here to resolve an + * index to a segment_map. + * + * Either way we can assert that we aren't returning a freed segment. + */ + Assert(!area->segment_maps[index].header->freed); + return &area->segment_maps[index]; } @@ -1778,9 +1808,6 @@ destroy_superblock(dsa_area *area, dsa_pointer span_pointer) int size_class = span->size_class; dsa_segment_map *segment_map; - segment_map = - get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); - /* Remove it from its fullness class list. */ unlink_span(area, span); @@ -1790,6 +1817,9 @@ destroy_superblock(dsa_area *area, dsa_pointer span_pointer) * could deadlock. */ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + segment_map = + get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); FreePageManagerPut(segment_map->fpm, DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, span->npages); @@ -1944,6 +1974,7 @@ get_best_segment(dsa_area *area, Size npages) Size bin; Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + check_for_freed_segments_locked(area); /* * Start searching from the first bin that *might* have enough contiguous @@ -2220,10 +2251,30 @@ check_for_freed_segments(dsa_area *area) freed_segment_counter = area->control->freed_segment_counter; if (unlikely(area->freed_segment_counter != freed_segment_counter)) { - int i; - /* Check all currently mapped segments to find what's been freed. 
*/ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + LWLockRelease(DSA_AREA_LOCK(area)); + } +} + +/* + * Workhorse for check_for_freed_segments(), and also used directly in path + * where the area lock is already held. This should be called after acquiring + * the lock but before looking up any segment by index number, to make sure we + * unmap any stale segments that might have previously had the same index as a + * current segment. + */ +static void +check_for_freed_segments_locked(dsa_area *area) +{ + size_t freed_segment_counter; + int i; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + freed_segment_counter = area->control->freed_segment_counter; + if (unlikely(area->freed_segment_counter != freed_segment_counter)) + { for (i = 0; i <= area->high_segment_index; ++i) { if (area->segment_maps[i].header != NULL && @@ -2235,7 +2286,6 @@ check_for_freed_segments(dsa_area *area) area->segment_maps[i].mapped_address = NULL; } } - LWLockRelease(DSA_AREA_LOCK(area)); area->freed_segment_counter = freed_segment_counter; } } diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c index f61c6547..aa8bc7eb 100644 --- a/src/backend/utils/mmgr/freepage.c +++ b/src/backend/utils/mmgr/freepage.c @@ -164,7 +164,7 @@ static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, static Size FreePageManagerLargestContiguous(FreePageManager *fpm); static void FreePageManagerUpdateLargest(FreePageManager *fpm); -#if FPM_EXTRA_ASSERTS +#ifdef FPM_EXTRA_ASSERTS static Size sum_free_pages(FreePageManager *fpm); #endif @@ -231,7 +231,7 @@ FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) /* * FreePageManagerGetInternal may have set contiguous_pages_dirty. - * Recompute contigous_pages if so. + * Recompute contiguous_pages if so. */ FreePageManagerUpdateLargest(fpm); @@ -455,7 +455,7 @@ FreePageManagerDump(FreePageManager *fpm) recycle = relptr_access(base, fpm->btree_recycle); if (recycle != NULL) { - appendStringInfo(&buf, "btree recycle:"); + appendStringInfoString(&buf, "btree recycle:"); FreePageManagerDumpSpans(fpm, recycle, 1, &buf); } @@ -468,7 +468,7 @@ FreePageManagerDump(FreePageManager *fpm) continue; if (!dumped_any_freelist) { - appendStringInfo(&buf, "freelists:\n"); + appendStringInfoString(&buf, "freelists:\n"); dumped_any_freelist = true; } appendStringInfo(&buf, " %zu:", f + 1); @@ -742,8 +742,8 @@ FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) /* * If we can fit our keys onto our left sibling's page, consolidate. In - * this case, we move our keys onto the other page rather than visca - * versa, to avoid having to adjust ancestor keys. + * this case, we move our keys onto the other page rather than vice versa, + * to avoid having to adjust ancestor keys. */ np = FreePageBtreeFindLeftSibling(base, btp); if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) @@ -1275,7 +1275,7 @@ FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, btp->u.leaf_key[index].first_page, btp->u.leaf_key[index].npages); } - appendStringInfo(buf, "\n"); + appendStringInfoChar(buf, '\n'); if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) { @@ -1308,7 +1308,7 @@ FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, span = relptr_access(base, span->next); } - appendStringInfo(buf, "\n"); + appendStringInfoChar(buf, '\n'); } /* @@ -1470,9 +1470,7 @@ FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) * pages; if false, do it always. 
Returns 0 if the soft flag caused the * insertion to be skipped, or otherwise the size of the contiguous span * created by the insertion. This may be larger than npages if we're able - * to consolidate with an adjacent range. *internal_pages_used is set to - * true if the btree allocated pages for internal purposes, which might - * invalidate the current largest run requiring it to be recomputed. + * to consolidate with an adjacent range. */ static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, @@ -1526,6 +1524,9 @@ FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, if (!relptr_is_null(fpm->btree_recycle)) root = FreePageBtreeGetRecycled(fpm); + /* Should not allocate if soft. */ + else if (soft) + return 0; else if (FreePageManagerGetInternal(fpm, 1, &root_page)) root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); else @@ -1692,7 +1693,7 @@ FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, /* * The act of allocating pages to recycle may have invalidated the - * results of our previous btree reserch, so repeat it. (We could + * results of our previous btree research, so repeat it. (We could * recheck whether any of our split-avoidance strategies that were * not viable before now are, but it hardly seems worthwhile, so * we don't bother. Consolidation can't be possible now if it diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 247cabfe..5efbd805 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -243,6 +243,9 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, GlobalTimestamp global_timestamp; #endif + BufferDesc *buf = NULL; + bool mprotect = false; + if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ @@ -256,10 +259,18 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if (enable_buffer_mprotect) + /* + * BUFFER_LOCK_EXCLUSIVE has made the buffer writable, but BUFFER_LOCK_SHARED + * does not, so it has to be set to be writable. + * + * After setting GTS, it needs to set the memory protection again. + */ + buf = GetBufferDescriptor(buffer - 1); + mprotect = enable_buffer_mprotect && + LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_SHARED); + if (mprotect) { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + BufDisableMemoryProtection(BufferGetPage(buffer), false); } if(infomask & HEAP_XMIN_COMMITTED) { @@ -323,6 +334,11 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, tuple->t_infomask |= infomask; MarkBufferDirtyHint(buffer, true); + + if (mprotect) + { + BufEnableMemoryProtection(BufferGetPage(buffer), false); + } } diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index c868136a..9e16886d 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -227,6 +227,7 @@ typedef struct Node *parsetree; /* to recognize subtxn cmds (savepoint, rollback to, release savepoint) */ bool is_set; /* is SET statement ? */ + bool ignore_tuple_desc; /* should ignore received tuple slot desc ? 
*/ #endif } RemoteQuery; diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 9fff0445..e5cee0b6 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -105,7 +105,7 @@ typedef struct /* trace info */ int32 refcount; /* reference count */ - int32 m_version; /* version of node slot */ + time_t m_version; /* version of node slot */ int32 pid; /* agent pid that contains the slot */ int32 seqnum; /* slot seqnum for the slot, unique for one slot */ bool bdestoryed; /* used to show whether we are destoryed */ @@ -128,7 +128,7 @@ typedef struct int size; /* total pool size */ char node_name[NAMEDATALEN]; /* name of the node.*/ - int32 m_version; /* version of node pool */ + time_t m_version; /* version of node pool */ PGXCNodePoolSlot **slot; } PGXCNodePool; diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index d967655b..9602196b 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -1872,8 +1872,6 @@ create aggregate my_sum(int4) ); -- aggregate state should be shared as aggs are the same. select my_avg(one),my_avg(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_avg --------+-------- 2 | 2 @@ -1881,8 +1879,6 @@ NOTICE: avg_transfn called with 3 -- aggregate state should be shared as transfn is the same for both aggs. select my_avg(one),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1890,8 +1886,6 @@ NOTICE: avg_transfn called with 3 -- same as previous one, but with DISTINCT, which requires sorting the input. select my_avg(distinct one),my_sum(distinct one) from (values(1),(3),(1)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1899,10 +1893,6 @@ NOTICE: avg_transfn called with 3 -- shouldn't share states due to the distinctness not matching. select my_avg(distinct one),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1910,9 +1900,6 @@ NOTICE: avg_transfn called with 3 -- shouldn't share states due to the filter clause not matching. select my_avg(one) filter (where one > 1),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 3 | 4 @@ -1920,10 +1907,6 @@ NOTICE: avg_transfn called with 3 -- this should not share the state due to different input columns. select my_avg(one),my_sum(two) from (values(1,2),(3,4)) t(one,two); -NOTICE: avg_transfn called with 2 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 4 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 6 @@ -1953,8 +1936,6 @@ create aggregate my_avg_init2(int4) ); -- state should be shared if INITCONDs are matching select my_sum_init(one),my_avg_init(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_sum_init | my_avg_init -------------+------------- 14 | 7 @@ -1962,10 +1943,6 @@ NOTICE: avg_transfn called with 3 -- Varying INITCONDs should cause the states not to be shared. 
select my_sum_init(one),my_avg_init2(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 3 my_sum_init | my_avg_init2 -------------+-------------- 14 | 4 @@ -2017,10 +1994,6 @@ create aggregate my_half_sum(int4) ); -- Agg state should be shared even though my_sum has no finalfn select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); -NOTICE: sum_transfn called with 1 -NOTICE: sum_transfn called with 2 -NOTICE: sum_transfn called with 3 -NOTICE: sum_transfn called with 4 my_sum | my_half_sum --------+------------- 10 | 5 diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 56e73b4e..d1285a50 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -1713,3 +1713,35 @@ DROP SCHEMA temp_view_test CASCADE; NOTICE: drop cascades to 27 other objects DROP SCHEMA testviewschm2 CASCADE; NOTICE: drop cascades to 62 other objects +-- check plan without sort operator, but need merge sort +set enable_seqscan = off; +create table test(v int primary key, w int) distribute by shard(v); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into test values(generate_series(1,50), generate_series(1,50)); +create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; +select * from test_sort; + v | w +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 +(19 rows) + +drop table test cascade; +NOTICE: drop cascades to view test_sort diff --git a/src/test/regress/expected/gist_1.out b/src/test/regress/expected/gist_1.out index 99b03902..0653fb98 100644 --- a/src/test/regress/expected/gist_1.out +++ b/src/test/regress/expected/gist_1.out @@ -129,7 +129,8 @@ cross join lateral QUERY PLAN -------------------------------------------------------------------------------------- Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Materialize -> Limit -> Remote Subquery Scan on all (datanode_1) @@ -137,7 +138,7 @@ cross join lateral -> Index Only Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ "*VALUES*".column1) Order By: (p <-> ("*VALUES*".column1)[0]) -(9 rows) +(10 rows) select p from (values (box(point(0,0), point(0.5,0.5))), diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index e1524f49..56a23289 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -658,20 +658,21 @@ select v.c, (select count(*) from gstest2 group by () having v.c) explain (costs off) select v.c, (select count(*) from gstest2 group by () having v.c) from (values (false),(true)) v(c) order by v.c; - QUERY PLAN -------------------------------------------------------------------------- - Sort - Sort Key: "*VALUES*".column1 - -> Values Scan on "*VALUES*" - SubPlan 1 - -> Aggregate - Group Key: () - Filter: "*VALUES*".column1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Result - One-Time Filter: "*VALUES*".column1 - -> Seq Scan on gstest2 -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) 
+ -> Sort + Sort Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + SubPlan 1 + -> Aggregate + Group Key: () + Filter: "*VALUES*".column1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Result + One-Time Filter: "*VALUES*".column1 + -> Seq Scan on gstest2 +(12 rows) -- HAVING with GROUPING queries select ten, grouping(ten) from onek @@ -885,15 +886,16 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by grouping sets ((a),(b)) order by 3,1,2; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 - -> HashAggregate - Hash Key: "*VALUES*".column1 - Hash Key: "*VALUES*".column2 - -> Values Scan on "*VALUES*" -(6 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 + -> HashAggregate + Hash Key: "*VALUES*".column1 + Hash Key: "*VALUES*".column2 + -> Values Scan on "*VALUES*" +(7 rows) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by cube(a,b) order by 3,1,2; @@ -919,34 +921,36 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by cube(a,b) order by 3,1,2; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 - -> MixedAggregate - Hash Key: "*VALUES*".column1, "*VALUES*".column2 - Hash Key: "*VALUES*".column1 - Hash Key: "*VALUES*".column2 - Group Key: () - -> Values Scan on "*VALUES*" -(8 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 + -> MixedAggregate + Hash Key: "*VALUES*".column1, "*VALUES*".column2 + Hash Key: "*VALUES*".column1 + Hash Key: "*VALUES*".column2 + Group Key: () + -> Values Scan on "*VALUES*" +(9 rows) -- shouldn't try and hash explain (costs off) select a, b, grouping(a,b), array_agg(v order by v) from gstest1 group by cube(a,b); - QUERY PLAN ----------------------------------------------------------- - GroupAggregate - Group Key: "*VALUES*".column1, "*VALUES*".column2 - Group Key: "*VALUES*".column1 - Group Key: () - Sort Key: "*VALUES*".column2 - Group Key: "*VALUES*".column2 - -> Sort - Sort Key: "*VALUES*".column1, "*VALUES*".column2 - -> Values Scan on "*VALUES*" -(9 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> GroupAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + Group Key: "*VALUES*".column1 + Group Key: () + Sort Key: "*VALUES*".column2 + Group Key: "*VALUES*".column2 + -> Sort + Sort Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" +(10 rows) -- mixed hashable/sortable cases select unhashable_col, unsortable_col, @@ -1134,15 +1138,16 @@ explain (costs off) select a, b, sum(v.x) from (values 
(1),(2)) v(x), gstest_data(v.x) group by grouping sets (a,b); - QUERY PLAN ------------------------------------------- + QUERY PLAN +------------------------------------------------------ HashAggregate Hash Key: gstest_data.a Hash Key: gstest_data.b -> Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Function Scan on gstest_data -(6 rows) +(7 rows) select * from (values (1),(2)) v(x), @@ -1188,16 +1193,17 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6; - QUERY PLAN -------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), (max("*VALUES*".column3)) - -> HashAggregate - Hash Key: "*VALUES*".column1, "*VALUES*".column2 - Hash Key: ("*VALUES*".column1 + 1), ("*VALUES*".column2 + 1) - Hash Key: ("*VALUES*".column1 + 2), ("*VALUES*".column2 + 2) - -> Values Scan on "*VALUES*" -(7 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), (max("*VALUES*".column3)) + -> HashAggregate + Hash Key: "*VALUES*".column1, "*VALUES*".column2 + Hash Key: ("*VALUES*".column1 + 1), ("*VALUES*".column2 + 1) + Hash Key: ("*VALUES*".column1 + 2), ("*VALUES*".column2 + 2) + -> Values Scan on "*VALUES*" +(8 rows) select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum from gstest2 group by cube (a,b) order by rsum, a, b; @@ -1255,8 +1261,8 @@ explain (costs off) select a, b, sum(v.x) from (values (1),(2)) v(x), gstest_data(v.x) group by cube (a,b) order by a,b; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Sort Sort Key: gstest_data.a, gstest_data.b -> MixedAggregate @@ -1265,9 +1271,10 @@ explain (costs off) Hash Key: gstest_data.b Group Key: () -> Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Function Scan on gstest_data -(10 rows) +(11 rows) -- More rescan tests select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by cube(four,ten)) s on true order by v.a,four,ten; diff --git a/src/test/regress/expected/groupingsets_1.out b/src/test/regress/expected/groupingsets_1.out index 93958dfc..e1524f49 100644 --- a/src/test/regress/expected/groupingsets_1.out +++ b/src/test/regress/expected/groupingsets_1.out @@ -1410,14 +1410,16 @@ explain (costs off) Hash Key: four Hash Key: ten Hash Key: hundred - Hash Key: thousand - Hash Key: twothousand Group Key: unique1 + Sort Key: twothousand + Group Key: twothousand + Sort Key: thousand + Group Key: thousand -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: unique1 -> Seq Scan on tenk1 -(12 rows) +(14 rows) explain (costs off) select unique1, @@ -1448,16 +1450,18 @@ explain (costs off) from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); QUERY PLAN ----------------------------------------------------------- - HashAggregate - Hash Key: unique1 - Hash Key: twothousand - Hash Key: thousand - Hash Key: hundred - Hash Key: ten - Hash Key: four + MixedAggregate Hash Key: two + Hash Key: four + Hash 
Key: ten + Hash Key: hundred + Hash Key: thousand + Hash Key: twothousand + Group Key: unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 -(10 rows) + -> Sort + Sort Key: unique1 + -> Seq Scan on tenk1 +(12 rows) -- end diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 53a75d2f..3841e3eb 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3513,13 +3513,13 @@ left join ) foo3 using (join_key); QUERY PLAN --------------------------------------------------------------------------------- - Hash Right Join - Output: "*VALUES*".column1, i1.f1, (666) - Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i1.f1, 666 - -> Merge Right Join +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: "*VALUES*".column1, i1.f1, 666 + -> Hash Right Join + Output: "*VALUES*".column1, i1.f1, (666) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Hash Right Join Output: i1.f1, 666 Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3540,9 +3540,12 @@ using (join_key); Output: i1.f1 -> Hash Output: "*VALUES*".column1 + -> Remote Subquery Scan on all (datanode_1) + Output: "*VALUES*".column1 + Distribute results by H: column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(28 rows) +(27 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -3558,8 +3561,8 @@ left join using (join_key); foo1_id | foo3_id | bug_field ---------+---------+----------- - 0 | 0 | 666 1 | | + 0 | 0 | 666 (2 rows) -- @@ -5431,15 +5434,13 @@ select * from where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; QUERY PLAN ----------------------------------------------------------------------------------------- - Nested Loop +---------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Nested Loop Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1, "*VALUES*".column2 - -> Materialize - Output: int4_tbl.f1 - -> Remote Subquery Scan on all - Output: int4_tbl.f1 -> Hash Join Output: int4_tbl.f1 Inner Unique: true @@ -5454,7 +5455,7 @@ select * from -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 Index Cond: (tenk1.unique2 = "*VALUES*".column2) -(22 rows) +(20 rows) select * from (values (0,9998), (1,1000)) v(id,x), @@ -5478,19 +5479,17 @@ lateral (select * from int8_tbl t1, and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; QUERY PLAN ------------------------------------------------------------------------------------ - Nested Loop +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 -> Nested Loop - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 -> Seq Scan on public.int8_tbl t1 Output: t1.q1, t1.q2 + -> Nested Loop + Output: "*VALUES*".column1, ss2.q1, ss2.q2 + -> Values 
Scan on "*VALUES*" + Output: "*VALUES*".column1 -> Subquery Scan on ss2 Output: ss2.q1, ss2.q2 Filter: (t1.q1 = ss2.q2) @@ -5512,7 +5511,7 @@ lateral (select * from int8_tbl t1, -> Seq Scan on public.int8_tbl t3 Output: t3.q1, t3.q2 Filter: (t3.q2 = $2) -(33 rows) +(31 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 14660970..25688014 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3197,11 +3197,12 @@ RETURNING *; Conflict Arbiter Indexes: hat_data_unique_idx Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) CTE data + -> Remote Subquery Scan on all -> Values Scan on "*VALUES*" -> Remote Subquery Scan on all Distribute results by H: hat_name -> CTE Scan on data -(10 rows) +(11 rows) SELECT * FROM hat_data WHERE hat_name IN ('h8', 'h9', 'h7') ORDER BY hat_name; hat_name | hat_color diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 6e607200..2743774f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -777,8 +777,10 @@ explain (verbose, costs off) select x, x from (select (select now()) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ---------------------------- - Values Scan on "*VALUES*" +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: $0, $1 + -> Values Scan on "*VALUES*" Output: $0, $1 InitPlan 1 (returns $0) -> Result @@ -786,28 +788,32 @@ explain (verbose, costs off) InitPlan 2 (returns $1) -> Result Output: now() -(8 rows) +(10 rows) explain (verbose, costs off) select x, x from (select (select random()) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------- - Subquery Scan on ss +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: ss.x, ss.x + -> Subquery Scan on ss Output: ss.x, ss.x -> Values Scan on "*VALUES*" Output: $0 InitPlan 1 (returns $0) -> Result Output: random() -(7 rows) +(9 rows) explain (verbose, costs off) select x, x from (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------------------------------------------- - Values Scan on "*VALUES*" +---------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: (SubPlan 1), (SubPlan 2) + -> Values Scan on "*VALUES*" Output: (SubPlan 1), (SubPlan 2) SubPlan 1 -> Result @@ -817,14 +823,16 @@ explain (verbose, costs off) -> Result Output: now() One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) -(10 rows) +(12 rows) explain (verbose, costs off) select x, x from (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------------------------------------------------- - Subquery Scan on ss +---------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: ss.x, ss.x + -> Subquery Scan on ss Output: ss.x, ss.x -> Values Scan on "*VALUES*" Output: (SubPlan 1) @@ -832,7 +840,7 @@ explain (verbose, costs off) -> Result Output: random() One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) -(8 rows) +(10 rows) -- -- Check we behave sanely in corner case of empty SELECT list (bug #8648) @@ -1955,10 +1963,12 @@ with recursive x(a) as where length(z.a 
|| z1.a) < 5)) select * from x; QUERY PLAN ----------------------------------------------------------- +---------------------------------------------------------------- CTE Scan on x Output: x.a CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: x_1.a -> Recursive Union -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 @@ -1974,7 +1984,7 @@ select * from x; Output: z1.a -> CTE Scan on z z1 Output: z1.a -(18 rows) +(20 rows) with recursive x(a) as ((values ('a'), ('b')) @@ -2018,17 +2028,19 @@ with recursive x(a) as where length(z.a || z.a) < 5)) select * from x; QUERY PLAN --------------------------------------------------------- +-------------------------------------------------------------- CTE Scan on x Output: x.a CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: x.a -> Recursive Union -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -> WorkTable Scan on x x_1 Output: (x_1.a || x_1.a) Filter: (length((x_1.a || x_1.a)) < 5) -(9 rows) +(11 rows) with recursive x(a) as ((values ('a'), ('b')) diff --git a/src/test/regress/expected/tablesample_1.out b/src/test/regress/expected/tablesample_1.out index 133927af..7f528c9b 100644 --- a/src/test/regress/expected/tablesample_1.out +++ b/src/test/regress/expected/tablesample_1.out @@ -242,17 +242,22 @@ select pct, count(unique1) from (values (0),(100)) v(pct), lateral (select * from tenk1 tablesample bernoulli (pct)) ss group by pct; - QUERY PLAN ------------------------------------------------------------------------ - HashAggregate - Group Key: "*VALUES*".column1 - -> Nested Loop - -> Values Scan on "*VALUES*" - -> Materialize + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: "*VALUES*".column1 + -> Sort + Sort Key: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sample Scan on tenk1 - Sampling: bernoulli ("*VALUES*".column1) -(8 rows) + Distribute results by H: column1 + -> Partial HashAggregate + Group Key: "*VALUES*".column1 + -> Nested Loop + -> Values Scan on "*VALUES*" + -> Sample Scan on tenk1 + Sampling: bernoulli ("*VALUES*".column1) +(13 rows) select pct, count(unique1) from (values (0),(100)) v(pct), @@ -260,7 +265,8 @@ select pct, count(unique1) from group by pct; pct | count -----+------- -(0 rows) + 100 | 10000 +(1 row) select pct, count(unique1) from (values (0),(100)) v(pct), @@ -268,7 +274,8 @@ select pct, count(unique1) from group by pct; pct | count -----+------- -(0 rows) + 100 | 10000 +(1 row) -- errors SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index 47ef2f99..cd7a3309 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -584,3 +584,11 @@ select pg_get_ruledef(oid, true) from pg_rewrite \set VERBOSITY terse \\ -- suppress cascade details DROP SCHEMA temp_view_test CASCADE; DROP SCHEMA testviewschm2 CASCADE; + +-- check plan without sort operator, but need merge sort +set enable_seqscan = off; +create table test(v int primary key, w int) distribute by shard(v); +insert into test values(generate_series(1,50), generate_series(1,50)); +create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; +select * from test_sort; +drop table test cascade; From f86b7d5f0a58be4154515647d55fe3b08e3ea9bf Mon Sep 17 00:00:00 2001 From: JennyJennyChen 
Date: Sat, 5 Jun 2021 20:29:59 +0800 Subject: [PATCH 168/578] fix compile warnings and regress expected info --- src/backend/optimizer/path/allpaths.c | 2 +- src/backend/optimizer/util/pgxcship.c | 2 +- src/test/regress/expected/create_view.out | 2 +- src/test/regress/expected/join_3.out | 92 +++++++++--------- src/test/regress/expected/rules.out | 2 +- src/test/regress/expected/subselect.out | 110 +++++++++++----------- src/test/regress/sql/create_view.sql | 2 +- 7 files changed, 106 insertions(+), 106 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 821c2aef..7a17b7d7 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2095,7 +2095,7 @@ check_list_contain_all_const(List *list) node = lfirst(lc); if (IsA(node, List)) { - if (!check_list_contain_all_const(node)) + if (!check_list_contain_all_const((List *)node)) { return false; } diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 3d00fa58..7c577339 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1884,7 +1884,7 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } -ExecNodes * +static ExecNodes * make_FQS_single_node() { ExecNodes *exec_nodes; diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index d1285a50..b0007ead 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -1719,7 +1719,7 @@ create table test(v int primary key, w int) distribute by shard(v); NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. insert into test values(generate_series(1,50), generate_series(1,50)); create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; -select * from test_sort; +select * from test_sort order by 1; v | w ----+---- 1 | 1 diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 3841e3eb..4b1d3032 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3513,13 +3513,13 @@ left join ) foo3 using (join_key); QUERY PLAN ------------------------------------------------------------------------ +-------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: "*VALUES*".column1, i1.f1, 666 -> Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Hash Right Join + -> Merge Right Join Output: i1.f1, 666 Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3538,14 +3538,14 @@ using (join_key); Sort Key: i1.f1 -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - -> Hash - Output: "*VALUES*".column1 + -> Hash + Output: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1 Distribute results by H: column1 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 -(27 rows) + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 +(31 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -5433,28 +5433,28 @@ select * from lateral (select f1 from int4_tbl where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; - QUERY PLAN + QUERY PLAN 
---------------------------------------------------------------------------------- Remote Subquery Scan on all Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Nested Loop - Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1, "*VALUES*".column2 - -> Hash Join + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1, "*VALUES*".column2 + -> Hash Join + Output: int4_tbl.f1 + Inner Unique: true + Hash Cond: (int4_tbl.f1 = tenk1.unique1) + -> Seq Scan on public.int4_tbl Output: int4_tbl.f1 - Inner Unique: true - Hash Cond: (int4_tbl.f1 = tenk1.unique1) - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 - -> Hash + -> Hash + Output: tenk1.unique1 + -> HashAggregate Output: tenk1.unique1 - -> HashAggregate + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 - Group Key: tenk1.unique1 - -> Index Scan using tenk1_unique2 on public.tenk1 - Output: tenk1.unique1 - Index Cond: (tenk1.unique2 = "*VALUES*".column2) + Index Cond: (tenk1.unique2 = "*VALUES*".column2) (20 rows) select * from @@ -5478,39 +5478,39 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Nested Loop + -> Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Seq Scan on public.int8_tbl t1 - Output: t1.q1, t1.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 -> Nested Loop Output: "*VALUES*".column1, ss2.q1, ss2.q2 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 - -> Subquery Scan on ss2 - Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result Output: t3.q2 - -> Result - Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) (31 rows) select * from (values (0), (1)) v(id), diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 25688014..f750332d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3198,7 +3198,7 @@ RETURNING *; Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) CTE data -> Remote Subquery Scan on all - -> Values Scan on "*VALUES*" + -> Values Scan on "*VALUES*" -> Remote Subquery Scan on all Distribute results by H: hat_name -> CTE Scan on data diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 2743774f..36a5f074 100644 --- 
a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -776,70 +776,70 @@ where a.thousand = b.thousand explain (verbose, costs off) select x, x from (select (select now()) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_1) Output: $0, $1 -> Values Scan on "*VALUES*" - Output: $0, $1 - InitPlan 1 (returns $0) - -> Result - Output: now() - InitPlan 2 (returns $1) - -> Result - Output: now() + Output: $0, $1 + InitPlan 1 (returns $0) + -> Result + Output: now() + InitPlan 2 (returns $1) + -> Result + Output: now() (10 rows) explain (verbose, costs off) select x, x from (select (select random()) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_1) Output: ss.x, ss.x -> Subquery Scan on ss - Output: ss.x, ss.x - -> Values Scan on "*VALUES*" - Output: $0 - InitPlan 1 (returns $0) - -> Result - Output: random() + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: $0 + InitPlan 1 (returns $0) + -> Result + Output: random() (9 rows) explain (verbose, costs off) select x, x from (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: (SubPlan 1), (SubPlan 2) -> Values Scan on "*VALUES*" - Output: (SubPlan 1), (SubPlan 2) - SubPlan 1 - -> Result - Output: now() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) - SubPlan 2 - -> Result - Output: now() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + Output: (SubPlan 1), (SubPlan 2) + SubPlan 1 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + SubPlan 2 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) (12 rows) explain (verbose, costs off) select x, x from (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: ss.x, ss.x -> Subquery Scan on ss - Output: ss.x, ss.x - -> Values Scan on "*VALUES*" - Output: (SubPlan 1) - SubPlan 1 - -> Result - Output: random() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: (SubPlan 1) + SubPlan 1 + -> Result + Output: random() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) (10 rows) -- @@ -1962,28 +1962,28 @@ with recursive x(a) as select z.a || z1.a as a from z cross join z as z1 where length(z.a || z1.a) < 5)) select * from x; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------- CTE Scan on x Output: x.a CTE x -> Remote Subquery Scan on all (datanode_1) Output: x_1.a - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Nested Loop - Output: (z.a || z1.a) - Join Filter: (length((z.a || z1.a)) < 5) - CTE z - -> WorkTable Scan on x x_1 - Output: x_1.a - -> CTE Scan on z - Output: z.a - -> Materialize - Output: z1.a - -> CTE Scan on z z1 + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> 
Materialize Output: z1.a + -> CTE Scan on z z1 + Output: z1.a (20 rows) with recursive x(a) as @@ -2027,19 +2027,19 @@ with recursive x(a) as select z.a || z.a as a from z where length(z.a || z.a) < 5)) select * from x; - QUERY PLAN + QUERY PLAN -------------------------------------------------------------- CTE Scan on x Output: x.a CTE x -> Remote Subquery Scan on all (datanode_1) Output: x.a - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> WorkTable Scan on x x_1 - Output: (x_1.a || x_1.a) - Filter: (length((x_1.a || x_1.a)) < 5) + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) (11 rows) with recursive x(a) as diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index cd7a3309..4ec3f6f5 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -590,5 +590,5 @@ set enable_seqscan = off; create table test(v int primary key, w int) distribute by shard(v); insert into test values(generate_series(1,50), generate_series(1,50)); create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; -select * from test_sort; +select * from test_sort order by 1; drop table test cascade; From 949164ce186fc66a544a8d53967dfb74db81021a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 15 Jun 2021 17:25:17 +0800 Subject: [PATCH 169/578] add TBase Community Code of Conduct --- Code-of-Conduct.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Code-of-Conduct.md diff --git a/Code-of-Conduct.md b/Code-of-Conduct.md new file mode 100644 index 00000000..a103d91a --- /dev/null +++ b/Code-of-Conduct.md @@ -0,0 +1,5 @@ +# TBase Community Code of Conduct +TBase follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). 
+ + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the TBase Code of Conduct Committee via email: TBase_Community@qq.com \ No newline at end of file From 87f8908699eb2f76a4016534b7595ccac948add6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 15 Jun 2021 18:09:53 +0800 Subject: [PATCH 170/578] Should call prev hook when startup pg_stat_cluster_activity --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 5efecaf6..304ee872 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -1015,6 +1015,9 @@ pg_cancel_session(PG_FUNCTION_ARGS) static void pgcs_shmem_startup(void) { + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + CreateSharedClusterStatus(); } From fc29b95c5110ee660bc0882d29314fc08c1a798f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 16 Jun 2021 10:46:14 +0800 Subject: [PATCH 171/578] update CONTRIBUTING --- CONTRIBUTING.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d06b4e57..63f601c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,26 +1,26 @@ # Contributing --- -如果你有好的意见或建议,欢迎给我们提 [Issues](https://github.com/Tencent/TBase/issues) 或 [Pull Requests](https://github.com/Tencent/TBase/pulls),为TBase开源社区贡献力量。TBase 持续招募贡献者,即使是在 issue 中回答问题,或者做一些简单的 bugfix ,也会给 TBase 带来很大的帮助。 +If you have good comments or suggestions, welcome to create [Issues](https://github.com/Tencent/TBase/issues) or [Pull Requests](https://github.com/Tencent/TBase/pulls),contribute to the TBase open source community.TBase continues to recruit contributors, even if it is answering questions in the issue, or doing some simple bugfixes, it will be of great help to TBase. -[腾讯开源激励计划](https://opensource.tencent.com/contribution) 鼓励开发者的参与和贡献,期待你的加入。 +[Tencent Open Source Incentive Program](https://opensource.tencent.com/contribution) Encourage developers to participate and contribute, and look forward to your joining. ## Issue -#### 对于贡献者 +#### For contributors -在提 issue 前请确保满足一下条件: +Please ensure that the following conditions are met before submitting an issue: -* 必须是一个 bug 或者功能新增 -* 已经在 issue 中搜索过,并且没有找到相似的 issue 或者解决方案 -* 新建 Issue 时请提供详细的描述、截屏或者短视频来辅助我们定位问题 +* Must be a bug or new feature +* Have searched in the issue, and did not find a similar issue or solution +* When creating a new issue, please provide a detailed description, screenshot or short video to help us locate the problem ## Pull Request -我们欢迎大家贡献代码来使我们的产品更加强大,代码团队会监控所有的 Pull request, 我们会做相应的代码检查和测试,测试通过之后我们就会接纳 PR ,但是不会立即合并到 master 分支。 +We welcome everyone to contribute code to make our product more powerful. The code team will monitor all pull requests, and we will do the corresponding code inspection and testing. After the test passes, we will accept the PR, but will not immediately merge into the master branch. -在完成一个 PR 之前请做一下确认: +Please confirm before completing a PR: -1. 从 master 分支中 fork 你自己的分支。 -2. 在修改了代码之后请修改对应的文档和注释。 -3. 在新建的文件中请加入 License 和 Copyright 申明。 -4. 确保一致的代码风格。 -5. 做充分的测试。 -6. 然后,你可以提交你的代码到 dev 分支。 \ No newline at end of file +1. Fork your own branch from the master branch. +2. Please modify the corresponding documents and comments after modifying the code. +3. 
Please add License and Copyright declarations in the newly created file. +4. Ensure a consistent code style. +5. Do adequate testing. +6. Then, you can submit your code to the dev branch. \ No newline at end of file From bf113c199b8f2da0d41f05dcd34feb8b5d467864 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 17 Jun 2021 19:40:03 +0800 Subject: [PATCH 172/578] Support NOT IN/ANY sublink pullup after cherry-picked PG lateral impl (merge request !406) Previously we skipped NOT IN/ANY sublink pull up after merged new postgres lateral related code. Here we removed the restirction. Now we also support the case that var is nullable by adding NullTest expr. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088904293 --- src/backend/nodes/makefuncs.c | 47 +++++ src/backend/optimizer/plan/subselect.c | 171 +++++++++++++++++- src/include/nodes/makefuncs.h | 4 + .../regress/expected/select_parallel_4.out | 16 +- src/test/regress/expected/subselect.out | 75 +++++++- src/test/regress/sql/subselect.sql | 14 ++ 6 files changed, 305 insertions(+), 22 deletions(-) diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c index 3748c170..30c49729 100644 --- a/src/backend/nodes/makefuncs.c +++ b/src/backend/nodes/makefuncs.c @@ -695,3 +695,50 @@ makeGroupingSet(GroupingSetKind kind, List *content, int location) n->location = location; return n; } + +#ifdef __TBASE__ +/* + * makeNullTest - + * creates a Null Test expr like "expr is (NOT) NULL" + */ +NullTest * +makeNullTest(NullTestType type, Expr *expr) +{ + NullTest *n = makeNode(NullTest); + + n->nulltesttype = type; + n->arg = expr; + + return n; +} + +/* + * makeBoolExpr - + * creates a BoolExpr tree node. + */ +Expr * +makeBoolExprTreeNode(BoolExprType boolop, List *args) +{ + Node *node = NULL; + ListCell *lc = NULL; + + foreach (lc, args) + { + BoolExpr* b = NULL; + + if (node == NULL) + { + node = (Node*)lfirst(lc); + continue; + } + + b = makeNode(BoolExpr); + b->boolop = boolop; + b->args = list_make2(node, lfirst(lc)); + b->location = 0; + node = (Node*)b; + } + + return (Expr*)node; +} +#endif diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 0f24fa9d..98ed5c26 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -152,6 +152,11 @@ static Node *convert_testexpr_mutator(Node *node, convert_testexpr_context *context); static bool subplan_is_hashable(Plan *plan); static bool testexpr_is_hashable(Node *testexpr); +#ifdef __TBASE__ +static Node *convert_joinqual_to_antiqual(Node* node, Query* parse); +static Node *convert_opexpr_to_boolexpr_for_antijoin(Node* node, Query* parse); +static bool var_is_nullable(Node *node, Query *parse); +#endif static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); @@ -1342,6 +1347,98 @@ testexpr_is_hashable(Node *testexpr) return false; } +#ifdef __TBASE__ +/* + * Rewrite qual to complete nullability check for NOT IN/ANY sublink pullup + */ +static Node* +convert_joinqual_to_antiqual(Node* node, Query* parse) +{ + Node* antiqual = NULL; + + if (node == NULL) + return NULL; + + switch (nodeTag(node)) + { + case T_OpExpr: + antiqual = convert_opexpr_to_boolexpr_for_antijoin(node, parse); + break; + case T_BoolExpr: + { + /* Not IN, should be and clause.*/ + if (and_clause(node)) + { + BoolExpr* boolexpr = (BoolExpr*)node; + List* andarglist = NIL; + ListCell* l = NULL; + + foreach (l, boolexpr->args) + { + Node* 
andarg = (Node*)lfirst(l); + Node* expr = NULL; + + /* The listcell type of args should be OpExpr. */ + expr = convert_opexpr_to_boolexpr_for_antijoin(andarg, parse); + if (expr == NULL) + return NULL; + + andarglist = lappend(andarglist, expr); + } + + antiqual = (Node*)makeBoolExpr(AND_EXPR, andarglist, boolexpr->location); + } + else + return NULL; + } + break; + case T_ScalarArrayOpExpr: + case T_RowCompareExpr: + default: + antiqual = NULL; + break; + } + + return antiqual; +} + +static Node * +convert_opexpr_to_boolexpr_for_antijoin(Node *node, Query *parse) +{ + Node *boolexpr = NULL; + List *antiqual = NIL; + OpExpr *opexpr = NULL; + Node *larg = NULL; + Node *rarg = NULL; + + if (!IsA(node, OpExpr)) + return NULL; + else + opexpr = (OpExpr*)node; + + antiqual = (List*)list_make1(opexpr); + + larg = (Node*)linitial(opexpr->args); + if (IsA(larg, RelabelType)) + larg = (Node*)((RelabelType*)larg)->arg; + if (var_is_nullable(larg, parse)) + antiqual = lappend(antiqual, makeNullTest(IS_NULL, (Expr*)copyObject(larg))); + + rarg = (Node*)lsecond(opexpr->args); + if (IsA(rarg, RelabelType)) + rarg = (Node*)((RelabelType*)rarg)->arg; + if (var_is_nullable(rarg, parse)) + antiqual = lappend(antiqual, makeNullTest(IS_NULL, (Expr*)copyObject(rarg))); + + if (list_length(antiqual) > 1) + boolexpr = (Node*)makeBoolExprTreeNode(OR_EXPR, antiqual); + else + boolexpr = (Node*)opexpr; + + return boolexpr; +} +#endif + /* * Check expression is hashable + strict * @@ -2305,10 +2402,6 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, return NULL; #ifdef __TBASE__ } - - /* TODO: Currently we do not pullup under_not */ - if (under_not) - return NULL; #endif /* @@ -2380,16 +2473,33 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, * And finally, build the JoinExpr node. */ result = makeNode(JoinExpr); + #ifdef __TBASE__ - result->jointype = under_not ? 
JOIN_ANTI : JOIN_SEMI; -#else - result->jointype = JOIN_SEMI; + /* Different logic for NOT IN/ANY sublink */ + if (under_not) + { + Node* antiquals = NULL; + + antiquals = convert_joinqual_to_antiqual(quals, parse); + + if (antiquals == NULL) + return NULL; + + result->jointype = JOIN_ANTI; + result->quals = antiquals; + } + else + { + /* Basic logic for IN/ANY sublink */ + result->jointype = JOIN_SEMI; + result->quals = quals; + } #endif + result->isNatural = false; result->larg = NULL; /* caller must fill this in */ result->rarg = (Node *) rtr; result->usingClause = NIL; - result->quals = quals; result->alias = NULL; result->rtindex = 0; /* we don't need an RTE for it */ @@ -5682,4 +5792,47 @@ SS_remote_attach_initplans(PlannerInfo *root, Plan *plan) /* Process left and right child plans, if any */ SS_remote_attach_initplans(root, plan->lefttree); SS_remote_attach_initplans(root, plan->righttree); -} \ No newline at end of file +} + +#ifdef __TBASE__ +static bool +var_is_nullable(Node *node, Query *parse) +{ + RangeTblEntry* rte; + bool result = true; + Var *var = NULL; + + if (IsA(node, Var)) + var = (Var*) node; + else + return true; + + if (IS_SPECIAL_VARNO(var->varno) || + var->varno <= 0 || var->varno > list_length(parse->rtable)) + return true; + + rte = (RangeTblEntry *)list_nth(parse->rtable, var->varno - 1); + if (rte->rtekind == RTE_RELATION) + { + HeapTuple tp; + + tp = SearchSysCache2(ATTNUM, ObjectIdGetDatum(rte->relid), Int16GetDatum(var->varattno)); + if (!HeapTupleIsValid(tp)) + return true; + result = !((Form_pg_attribute)GETSTRUCT(tp))->attnotnull; + ReleaseSysCache(tp); + } + else if (rte->rtekind == RTE_SUBQUERY) + { + if (rte->subquery->groupingSets == NIL) + { + TargetEntry *te = (TargetEntry *)list_nth(rte->subquery->targetList, var->varattno - 1); + if (IsA(te->expr, Var)) + result = var_is_nullable((Node *)te->expr, rte->subquery); + } + } + + return result; +} + +#endif diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 16390a28..6b1997ea 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -150,4 +150,8 @@ extern DefElem *makeDefElemExtended(char *nameSpace, char *name, Node *arg, extern GroupingSet *makeGroupingSet(GroupingSetKind kind, List *content, int location); +#ifdef __TBASE__ +extern NullTest *makeNullTest(NullTestType type, Expr *expr); +extern Expr *makeBoolExprTreeNode(BoolExprType boolop, List *args); +#endif #endif /* MAKEFUNC_H */ diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 0b6353b7..e5527088 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -140,19 +140,17 @@ explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); QUERY PLAN -------------------------------------------------------------------------------------- - Finalize Aggregate +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Anti Join + Join Filter: (((tenk1.two = tenk2.hundred) OR (tenk1.two IS NULL) OR (tenk2.hundred IS NULL)) AND ((tenk1.four = tenk2.thousand) OR (tenk1.four IS NULL) OR (tenk2.thousand IS NULL))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 - -> Partial Aggregate - -> Parallel Seq Scan on 
tenk1 - Filter: (NOT (hashed SubPlan 1)) - SubPlan 1 + -> Seq Scan on tenk1 + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk2 Filter: (thousand > 100) -(11 rows) +(9 rows) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 36a5f074..c3a94e88 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1736,6 +1736,69 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); ---+--- (0 rows) +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Nested Loop Anti Join (cost=200.00..6935.40 rows=1123 width=8) + Join Filter: ((b.b > a.b) AND ((a.b = b.a) OR (a.b IS NULL) OR (b.a IS NULL))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Materialize (cost=100.00..123.90 rows=675 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) +(7 rows) + +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + a | b +----+--- + 1 | 1 + 2 | 1 + 5 | 1 + 6 | 1 + 8 | 1 + 9 | 1 + 3 | 1 + 4 | 1 + 7 | 1 + 10 | 1 +(10 rows) + +drop table tbl_a; +drop table tbl_b; +-- test NOT IN/ANY with NOT NULL restriction +create table tbl_a(a int NOT NULL, b int NOT NULL); +create table tbl_b(a int NOT NULL, b int NOT NULL); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..150.77 rows=562 width=8) + -> Hash Anti Join (cost=120.19..150.77 rows=562 width=8) + Hash Cond: (a.b = b.a) + Join Filter: (b.b > a.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + Distribute results by H: b + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Hash (cost=11.75..11.75 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) +(9 rows) + +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + a | b +----+--- + 1 | 1 + 2 | 1 + 5 | 1 + 6 | 1 + 8 | 1 + 9 | 1 + 3 | 1 + 4 | 1 + 7 | 1 + 10 | 1 +(10 rows) + drop table tbl_a; drop table tbl_b; -- more RTEs in subquery @@ -2144,14 +2207,18 @@ create table notin_t1 (id1 int, num1 int not null); create table notin_t2 (id2 int, num2 int not null); explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); QUERY PLAN -------------------------------------------------------------------- +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Anti Join + Hash Cond: (notin_t1.num1 = notin_t2.num2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: num1 -> Seq Scan on notin_t1 - Filter: (NOT (hashed SubPlan 1)) - SubPlan 1 + -> Hash -> Remote Subquery Scan on 
all (datanode_1,datanode_2) + Distribute results by H: num2 -> Seq Scan on notin_t2 -(6 rows) +(10 rows) drop table notin_t1; drop table notin_t2; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 8b5db9a2..f17f38f3 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -700,6 +700,20 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + +drop table tbl_a; +drop table tbl_b; + +-- test NOT IN/ANY with NOT NULL restriction +create table tbl_a(a int NOT NULL, b int NOT NULL); +create table tbl_b(a int NOT NULL, b int NOT NULL); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; + +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; From bccd602235129be841604f85d77b8f990c893bd2 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 18 Jun 2021 12:46:28 +0800 Subject: [PATCH 173/578] fix regress subselect/select_parallel expected --- src/test/regress/expected/select_parallel_4.out | 10 +++++----- src/test/regress/expected/subselect.out | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index e5527088..684d4989 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -139,17 +139,17 @@ alter table tenk2 set (parallel_workers = 0); explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); - QUERY PLAN + QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Aggregate -> Nested Loop Anti Join Join Filter: (((tenk1.two = tenk2.hundred) OR (tenk1.two IS NULL) OR (tenk2.hundred IS NULL)) AND ((tenk1.four = tenk2.thousand) OR (tenk1.four IS NULL) OR (tenk2.thousand IS NULL))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk2 - Filter: (thousand > 100) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk2 + Filter: (thousand > 100) (9 rows) select count(*) from tenk1 where (two, four) not in diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c3a94e88..f38e79c4 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2206,18 +2206,18 @@ drop table catalog_sales, catalog_returns, date_dim; create table notin_t1 (id1 int, num1 int not null); create table notin_t2 (id2 int, num2 int not null); explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); - QUERY PLAN + QUERY PLAN 
----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Anti Join Hash Cond: (notin_t1.num1 = notin_t2.num2) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: num1 - -> Seq Scan on notin_t1 + -> Seq Scan on notin_t1 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: num2 - -> Seq Scan on notin_t2 + -> Seq Scan on notin_t2 (10 rows) drop table notin_t1; From cb7363386c0800bd25aa3db3db359be18d449976 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 9 Jun 2021 17:46:03 +0800 Subject: [PATCH 174/578] Raise warning istead of error when remote instrument recieved but htbl is not initialized this happens in cases like ABORT, CN will create a new combiner to recieve data as more as it can. But, if there is any good DN success to finish it's "EXPLAIN ANALYZE" job, remote instrument will be sent to CN, and processed by a newly created combiner, then error out and make further abortion goes wrong. Raising warning and just return here is okay enough for debug, in normal cases remote instrument definitely received with es_instrument flag on. tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088466047&jump_count=1 --- src/backend/commands/explain_dist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 2b37ba0d..dbf689df 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -711,7 +711,8 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb if (combiner->recv_instr_htbl == NULL) { - elog(ERROR, "combiner is not prepared for instrumentation"); + elog(WARNING, "combiner is not prepared for instrumentation"); + return; } elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); From 900283515e254ddfd3fd215466b30e0b008ff1cf Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 11:16:40 +0800 Subject: [PATCH 175/578] Add time zone to timestamp type output in pg_stat_cluster_activity tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131865528291 --- .../pg_stat_cluster_activity--1.0.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql index 9f524816..c5514458 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -26,10 +26,10 @@ CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( OUT planstate text, OUT portal text, OUT cursors text, - OUT backend_start timestamp, - OUT xact_start timestamp, - OUT query_start timestamp, - OUT state_change timestamp + OUT backend_start timestamp with time zone, + OUT xact_start timestamp with time zone, + OUT query_start timestamp with time zone, + OUT state_change timestamp with time zone ) RETURNS SETOF record AS 'MODULE_PATHNAME' From 3a4bb4fb791a5f4fb3dda1b1ddaa8c3e02abd663 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 9 Jun 2021 19:35:06 +0800 Subject: [PATCH 176/578] Add a guc for pg_stat_cluster_activity extension to disable showing planstate in result sets, prevent from potential corner failure when DN call it --- .../pg_stat_cluster_activity.c | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 
14 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 304ee872..0cc836d3 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -18,6 +18,7 @@ #include "storage/procarray.h" #include "storage/shmem.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -99,6 +100,8 @@ static PortalStart_hook_type prev_PortalStart = NULL; static PortalDrop_hook_type prev_PortalDrop = NULL; static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ + /* * Macros to load and store st_changecount with the memory barriers. * @@ -372,20 +375,28 @@ pgcs_report_query_activity(QueryDesc *desc, int eflags) if (desc->planstate != NULL) { - ExplainState *es = NewExplainState(); - - /* make planstate text tree */ - es->costs = false; - /* we don't want plan->targetlist been changed */ - es->skip_remote_query = true; - - ExplainBeginOutput(es); - ExplainPrintPlan(es, desc); - ExplainEndOutput(es); - /* remove last '\n' */ - if (es->str->len > 1) - es->str->data[--es->str->len] = '\0'; - planstate_str = es->str; + /* make planstate text tree if enabled */ + if (pgcs_enable_planstate) + { + ExplainState *es = NewExplainState(); + + es->costs = false; + /* we don't want plan->targetlist been changed */ + es->skip_remote_query = true; + + ExplainBeginOutput(es); + ExplainPrintPlan(es, desc); + ExplainEndOutput(es); + /* remove last '\n' */ + if (es->str->len > 1) + es->str->data[--es->str->len] = '\0'; + planstate_str = es->str; + } + else + { + planstate_str = makeStringInfo(); + appendStringInfoString(planstate_str, "disabled"); + } /* find name of RemoteSubplan to show as cursors */ cursors = makeStringInfo(); @@ -1039,6 +1050,20 @@ _PG_init(void) if (!process_shared_preload_libraries_in_progress) return; + /* + * Define (or redefine) custom GUC variables. + */ + DefineCustomBoolVariable("pg_stat_cluster_activity.enable_planstate", + "whether to show planstate in result sets.", + NULL, + &pgcs_enable_planstate, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + /* * Request additional shared resources. (These are no-ops if we're not in * the postmaster process.) We'll allocate or attach to the shared From 97d59366b0fa5235655db603c52cdd79dbb80842 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 14 Nov 2017 17:49:49 -0500 Subject: [PATCH 177/578] Prevent int128 from requiring more than MAXALIGN alignment. Our initial work with int128 neglected alignment considerations, an oversight that came back to bite us in bug #14897 from Vincent Lachenal. It is unsurprising that int128 might have a 16-byte alignment requirement; what's slightly more surprising is that even notoriously lax Intel chips sometimes enforce that. Raising MAXALIGN seems out of the question: the costs in wasted disk and memory space would be significant, and there would also be an on-disk compatibility break. Nor does it seem very practical to try to allow some data structures to have more-than-MAXALIGN alignment requirement, as we'd have to push knowledge of that throughout various code that copies data structures around. The only way out of the box is to make type int128 conform to the system's alignment assumptions. 
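A minimal standalone sketch of that idea, assuming a GCC/clang toolchain, an x86-64 MAXIMUM_ALIGNOF of 8, and the raw __attribute__((aligned())) spelling rather than the pg_attribute_aligned macro used in c.h (values and names here are illustrative, not taken from the patch):

    /*
     * Cap a 128-bit integer type at 8-byte alignment, the same trick the
     * patch applies with pg_attribute_aligned(MAXIMUM_ALIGNOF) in c.h.
     */
    #include <stdio.h>

    #define MAXIMUM_ALIGNOF 8   /* assumed here; configure computes the real value */

    typedef __int128 int128 __attribute__((aligned(MAXIMUM_ALIGNOF)));

    int
    main(void)
    {
        /* On typical x86-64 this prints 16 for the raw type and 8 for the typedef. */
        printf("__int128 alignment: %lu\n", (unsigned long) __alignof__(__int128));
        printf("int128 alignment:   %lu\n", (unsigned long) __alignof__(int128));
        return 0;
    }
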
Fortunately, gcc supports that via its __attribute__(aligned()) pragma; and since we don't currently support int128 on non-gcc-workalike compilers, we shouldn't be losing any platform support this way. Although we could have just done pg_attribute_aligned(MAXIMUM_ALIGNOF) and called it a day, I did a little bit of extra work to make the code more portable than that: it will also support int128 on compilers without __attribute__(aligned()), if the native alignment of their 128-bit-int type is no more than that of int64. Add a regression test case that exercises the one known instance of the problem, in parallel aggregation over a bigint column. Back-patch of commit 751804998. The code known to be affected only exists in 9.6 and later, but we do have some stuff using int128 in 9.5, so patch back to 9.5. Discussion: https://postgr.es/m/20171110185747.31519.28038@wrigleys.postgresql.org --- config/c-compiler.m4 | 9 +++++--- configure | 42 +++++++++++++++++++++++++++++++++-- configure.in | 7 ++++-- src/include/c.h | 27 +++++++++++++++++----- src/include/pg_config.h.in | 3 +++ src/include/pg_config.h.win32 | 3 +++ 6 files changed, 79 insertions(+), 12 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 7275ea69..8d9844ab 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -96,9 +96,11 @@ undefine([Ac_cachevar])dnl # PGAC_TYPE_128BIT_INT # --------------------- # Check if __int128 is a working 128 bit integer type, and if so -# define PG_INT128_TYPE to that typename. This currently only detects -# a GCC/clang extension, but support for different environments may be -# added in the future. +# define PG_INT128_TYPE to that typename, and define ALIGNOF_PG_INT128_TYPE +# as its alignment requirement. +# +# This currently only detects a GCC/clang extension, but support for other +# environments may be added in the future. # # For the moment we only test for support for 128bit math; support for # 128bit literals and snprintf is not required. @@ -128,6 +130,7 @@ return 1; [pgac_cv__128bit_int=no])]) if test x"$pgac_cv__128bit_int" = xyes ; then AC_DEFINE(PG_INT128_TYPE, __int128, [Define to the name of a signed 128-bit integer type.]) + AC_CHECK_ALIGNOF(PG_INT128_TYPE) fi])# PGAC_TYPE_128BIT_INT diff --git a/configure b/configure index ae61c606..26843895 100755 --- a/configure +++ b/configure @@ -14985,7 +14985,10 @@ _ACEOF # Compute maximum alignment of any basic type. # We assume long's alignment is at least as strong as char, short, or int; -# but we must check long long (if it exists) and double. +# but we must check long long (if it is being used for int64) and double. +# Note that we intentionally do not consider any types wider than 64 bits, +# as allowing MAXIMUM_ALIGNOF to exceed 8 would be too much of a penalty +# for disk and memory space. MAX_ALIGNOF=$ac_cv_alignof_long if test $MAX_ALIGNOF -lt $ac_cv_alignof_double ; then @@ -15045,7 +15048,7 @@ _ACEOF fi -# Check for extensions offering the integer scalar type __int128. +# Some compilers offer a 128-bit integer scalar type. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __int128" >&5 $as_echo_n "checking for __int128... " >&6; } if ${pgac_cv__128bit_int+:} false; then : @@ -15095,6 +15098,41 @@ if test x"$pgac_cv__128bit_int" = xyes ; then $as_echo "#define PG_INT128_TYPE __int128" >>confdefs.h + # The cast to long int works around a bug in the HP C Compiler, +# see AC_CHECK_SIZEOF for more information. 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking alignment of PG_INT128_TYPE" >&5 +$as_echo_n "checking alignment of PG_INT128_TYPE... " >&6; } +if ${ac_cv_alignof_PG_INT128_TYPE+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) offsetof (ac__type_alignof_, y)" "ac_cv_alignof_PG_INT128_TYPE" "$ac_includes_default +#ifndef offsetof +# define offsetof(type, member) ((char *) &((type *) 0)->member - (char *) 0) +#endif +typedef struct { char x; PG_INT128_TYPE y; } ac__type_alignof_;"; then : + +else + if test "$ac_cv_type_PG_INT128_TYPE" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute alignment of PG_INT128_TYPE +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_alignof_PG_INT128_TYPE=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_alignof_PG_INT128_TYPE" >&5 +$as_echo "$ac_cv_alignof_PG_INT128_TYPE" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define ALIGNOF_PG_INT128_TYPE $ac_cv_alignof_PG_INT128_TYPE +_ACEOF + + fi # Check for various atomic operations now that we have checked how to declare diff --git a/configure.in b/configure.in index a623005a..830aa103 100644 --- a/configure.in +++ b/configure.in @@ -1845,7 +1845,10 @@ AC_CHECK_ALIGNOF(double) # Compute maximum alignment of any basic type. # We assume long's alignment is at least as strong as char, short, or int; -# but we must check long long (if it exists) and double. +# but we must check long long (if it is being used for int64) and double. +# Note that we intentionally do not consider any types wider than 64 bits, +# as allowing MAXIMUM_ALIGNOF to exceed 8 would be too much of a penalty +# for disk and memory space. MAX_ALIGNOF=$ac_cv_alignof_long if test $MAX_ALIGNOF -lt $ac_cv_alignof_double ; then @@ -1862,7 +1865,7 @@ AC_DEFINE_UNQUOTED(MAXIMUM_ALIGNOF, $MAX_ALIGNOF, [Define as the maximum alignme AC_CHECK_TYPES([int8, uint8, int64, uint64], [], [], [#include ]) -# Check for extensions offering the integer scalar type __int128. +# Some compilers offer a 128-bit integer scalar type. PGAC_TYPE_128BIT_INT # Check for various atomic operations now that we have checked how to declare diff --git a/src/include/c.h b/src/include/c.h index 9e18db2b..f2c1d8c2 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -377,13 +377,30 @@ typedef unsigned long long int uint64; /* * 128-bit signed and unsigned integers - * There currently is only a limited support for the type. E.g. 128bit - * literals and snprintf are not supported; but math is. + + * There currently is only limited support for such types. + * E.g. 128bit literals and snprintf are not supported; but math is. + * Also, because we exclude such types when choosing MAXIMUM_ALIGNOF, + * it must be possible to coerce the compiler to allocate them on no + * more than MAXALIGN boundaries. 
*/ #if defined(PG_INT128_TYPE) -#define HAVE_INT128 -typedef PG_INT128_TYPE int128; -typedef unsigned PG_INT128_TYPE uint128; +#if defined(pg_attribute_aligned) || ALIGNOF_PG_INT128_TYPE <= MAXIMUM_ALIGNOF +#define HAVE_INT128 1 + +typedef PG_INT128_TYPE int128 +#if defined(pg_attribute_aligned) +pg_attribute_aligned(MAXIMUM_ALIGNOF) +#endif +; + +typedef unsigned PG_INT128_TYPE uint128 +#if defined(pg_attribute_aligned) +pg_attribute_aligned(MAXIMUM_ALIGNOF) +#endif +; + +#endif #endif /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index aa4b2974..0e32abd7 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -27,6 +27,9 @@ /* The normal alignment of `long long int', in bytes. */ #undef ALIGNOF_LONG_LONG_INT +/* The normal alignment of `PG_INT128_TYPE', in bytes. */ +#undef ALIGNOF_PG_INT128_TYPE + /* The normal alignment of `short', in bytes. */ #undef ALIGNOF_SHORT diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 20bff1c4..79bd66cd 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -34,6 +34,9 @@ /* The alignment requirement of a `long long int'. */ #define ALIGNOF_LONG_LONG_INT 8 +/* The normal alignment of `PG_INT128_TYPE', in bytes. */ +#undef ALIGNOF_PG_INT128_TYPE + /* The alignment requirement of a `short'. */ #define ALIGNOF_SHORT 2 From dbaba14440fb7abdd9c4dc2d9e8d71ae6aeb0aac Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 1 Jul 2021 11:41:49 +0800 Subject: [PATCH 178/578] Remove invalid assertion, this is useless after we introduce shard --- src/backend/optimizer/util/pgxcship.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7c577339..7bff63dd 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -953,7 +953,6 @@ pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query) return NULL; } - Assert(tle); /* We found the TargetEntry for the partition column */ list_free(rel_exec_nodes->primarynodelist); rel_exec_nodes->primarynodelist = NULL; From 949f970e5c6e714a7f1d2dd0e145759de3ccab9b Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Tue, 6 Jul 2021 21:39:33 +0800 Subject: [PATCH 179/578] fix bug: alloacate shm of wrong size for PGXCSessionId in InitializeParallelDSM http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view\?bug_id\=1020421696089628859 --- src/backend/access/transam/parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index d6108a3a..7639bc2f 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -246,7 +246,7 @@ InitializeParallelDSM(ParallelContext *pcxt) gxidlen = EstimateGlobalXidSpace(); shm_toc_estimate_chunk(&pcxt->estimator, gxidlen); #endif - sidlen = PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1; + sidlen = sizeof(int) + (PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1); shm_toc_estimate_chunk(&pcxt->estimator, sidlen); /* If you add more chunks here, you probably need to add keys. 
*/ shm_toc_estimate_keys(&pcxt->estimator, 8); From c47bebc338b01177687a3a4060d61b235bf2c03f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 1 Oct 2021 11:49:16 +0800 Subject: [PATCH 180/578] fix pgxc_ctl monitor bug http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092865125 (merge request !773) Signed-off-by: JennyJennyChen --- contrib/pgxc_ctl/monitor.c | 4 ++-- src/include/gtm/gtm_c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pgxc_ctl/monitor.c b/contrib/pgxc_ctl/monitor.c index bb540b6f..91537d0e 100644 --- a/contrib/pgxc_ctl/monitor.c +++ b/contrib/pgxc_ctl/monitor.c @@ -484,9 +484,9 @@ do_gtm_ping(char *host, int port) elog(ERROR, "ERROR: Invalid port number, %d.\n", port); return -1; } - /* Use 60s as connection timeout */ + /* Use 60s as connection timeout, use GTM_NODE_GTM_CTL as remote type here */ sprintf(connect_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=0 connect_timeout=60", - host, port, myName, GTM_NODE_COORDINATOR); + host, port, myName, GTM_NODE_GTM_CTL); if ((conn = PQconnectGTM(connect_str)) == NULL || GTMPQstatus(conn) == CONNECTION_BAD) { elog(DEBUG3, "DEBUG3: Could not connect to %s, %d\n", host, port); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index 7af3e735..b5a18302 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -58,7 +58,7 @@ typedef enum GTM_PGXCNodeType GTM_NODE_DATANODE = 4, GTM_NODE_GTM = 5, #ifdef __TBASE__ - GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister. */ + GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister, maybe used by gtm_ctl or pgxc_ctl */ #endif GTM_NODE_DEFAULT/* In case nothing is associated to connection */ } GTM_PGXCNodeType; From 285b804afb6a69f97fb6fd10cef01bf033dd4ca7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 1 Oct 2021 11:49:16 +0800 Subject: [PATCH 181/578] fix pgxc_ctl monitor bug http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092865125 (merge request !773) Signed-off-by: JennyJennyChen --- contrib/pgxc_ctl/monitor.c | 4 ++-- src/include/gtm/gtm_c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pgxc_ctl/monitor.c b/contrib/pgxc_ctl/monitor.c index 65b3af88..b8d07c72 100644 --- a/contrib/pgxc_ctl/monitor.c +++ b/contrib/pgxc_ctl/monitor.c @@ -485,9 +485,9 @@ do_gtm_ping(char *host, int port) elog(ERROR, "ERROR: Invalid port number, %d.\n", port); return -1; } - /* Use 60s as connection timeout */ + /* Use 60s as connection timeout, use GTM_NODE_GTM_CTL as remote type here */ sprintf(connect_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=0 connect_timeout=60", - host, port, myName, GTM_NODE_COORDINATOR); + host, port, myName, GTM_NODE_GTM_CTL); if ((conn = PQconnectGTM(connect_str)) == NULL || GTMPQstatus(conn) == CONNECTION_BAD) { elog(DEBUG3, "DEBUG3: Could not connect to %s, %d\n", host, port); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b2c6382b..5e52fcad 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -58,7 +58,7 @@ typedef enum GTM_PGXCNodeType GTM_NODE_DATANODE = 4, GTM_NODE_GTM = 5, #ifdef __TBASE__ - GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister. 
*/ + GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister, maybe used by gtm_ctl or pgxc_ctl */ #endif GTM_NODE_DEFAULT/* In case nothing is associated to connection */ } GTM_PGXCNodeType; From b9f4bd4adb997993a33e274cb9152043d8c06039 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 12:05:44 +0800 Subject: [PATCH 182/578] Consider es_plannedstmt NULL when calling ResetRemoteSubplanCursor --- src/backend/executor/execMain.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 7424b45a..78810446 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3176,7 +3176,10 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, /* ... and remember data that EvalPlanQualBegin will need */ epqstate->plan = copyObject(subplan); /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, estate->es_plannedstmt->subplans, "epq"); + ResetRemoteSubplanCursor(epqstate->plan, + (estate->es_plannedstmt ? + estate->es_plannedstmt->subplans : NULL), + "epq"); epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3195,7 +3198,8 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) epqstate->plan = copyObject(subplan); /* Reset cursor name of remote subplans if any */ ResetRemoteSubplanCursor(epqstate->plan, - epqstate->parentestate->es_plannedstmt->subplans, + (epqstate->parentestate->es_plannedstmt ? + epqstate->parentestate->es_plannedstmt->subplans : NULL), "epq"); /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; From 3b3fd5a46ff108e6c11b725f761b5f30606e6a92 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 28 Jun 2021 19:58:43 +0800 Subject: [PATCH 183/578] Fix wrong call of list_nth_node --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 27a1b7a3..a7ab020e 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3886,7 +3886,7 @@ plantree_walk_initplans(List *plans, foreach(lc, plans) { - Plan *splan = list_nth_node(Plan, subplans, + Plan *splan = (Plan *) list_nth(subplans, (lfirst_node(SubPlan, lc))->plan_id); if (walker(splan, context)) From 86ff9315abdb858c83db01c8a579357031c81011 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:43:03 +0800 Subject: [PATCH 184/578] Assorted preparatory refactoring for partition-wise join. --- src/backend/catalog/partition.c | 9 ++- src/backend/optimizer/path/joinpath.c | 27 +++++---- src/backend/optimizer/util/pathnode.c | 11 ++-- src/backend/optimizer/util/relnode.c | 39 +++++++----- src/backend/utils/cache/relcache.c | 4 +- src/include/catalog/partition.h | 85 ++++++++++++++------------- src/include/optimizer/pathnode.h | 7 ++- 7 files changed, 99 insertions(+), 83 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 30be04e6..9ecd77ea 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -595,7 +595,7 @@ RelationBuildPartitionDesc(Relation rel) * representation of partition bounds. 
*/ bool -partition_bounds_equal(PartitionKey key, +partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2) {// #lizard forgives int i; @@ -613,7 +613,7 @@ partition_bounds_equal(PartitionKey key, { int j; - for (j = 0; j < key->partnatts; j++) + for (j = 0; j < partnatts; j++) { /* For range partitions, the bounds might not be finite. */ if (b1->kind != NULL) @@ -639,8 +639,7 @@ partition_bounds_equal(PartitionKey key, * context. datumIsEqual() should be simple enough to be safe. */ if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], - key->parttypbyval[j], - key->parttyplen[j])) + parttypbyval[j], parttyplen[j])) return false; } @@ -649,7 +648,7 @@ partition_bounds_equal(PartitionKey key, } /* There are ndatums+1 indexes in case of range partitions */ - if (key->strategy == PARTITION_STRATEGY_RANGE && + if (b1->strategy == PARTITION_STRATEGY_RANGE && b1->indexes[i] != b2->indexes[i]) return false; diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 49852d77..72a766af 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -331,18 +331,15 @@ add_paths_to_joinrel(PlannerInfo *root, */ static inline bool allow_star_schema_join(PlannerInfo *root, - Path *outer_path, - Path *inner_path) + Relids outerrelids, + Relids inner_paramrels) { - Relids innerparams = PATH_REQ_OUTER(inner_path); - Relids outerrelids = outer_path->parent->relids; - /* * It's a star-schema case if the outer rel provides some but not all of * the inner rel's parameterization. */ - return (bms_overlap(innerparams, outerrelids) && - bms_nonempty_difference(innerparams, outerrelids)); + return (bms_overlap(inner_paramrels, outerrelids) && + bms_nonempty_difference(inner_paramrels, outerrelids)); } /* @@ -361,6 +358,12 @@ try_nestloop_path(PlannerInfo *root, { Relids required_outer; JoinCostWorkspace workspace; + RelOptInfo *innerrel = inner_path->parent; + RelOptInfo *outerrel = outer_path->parent; + Relids innerrelids = innerrel->relids; + Relids outerrelids = outerrel->relids; + Relids inner_paramrels = PATH_REQ_OUTER(inner_path); + Relids outer_paramrels = PATH_REQ_OUTER(outer_path); /* * Check to see if proposed path is still parameterized, and reject if the @@ -369,14 +372,12 @@ try_nestloop_path(PlannerInfo *root, * doesn't like the look of it, which could only happen if the nestloop is * still parameterized. 
*/ - required_outer = calc_nestloop_required_outer(outer_path, - inner_path); + required_outer = calc_nestloop_required_outer(outerrelids, outer_paramrels, + innerrelids, inner_paramrels); if (required_outer && ((!bms_overlap(required_outer, extra->param_source_rels) && - !allow_star_schema_join(root, outer_path, inner_path)) || - have_dangerous_phv(root, - outer_path->parent->relids, - PATH_REQ_OUTER(inner_path)))) + !allow_star_schema_join(root, outerrelids, inner_paramrels)) || + have_dangerous_phv(root, outerrelids, inner_paramrels))) { /* Waste no memory when we reject a path here */ bms_free(required_outer); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 17546a77..49e6658f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -5241,14 +5241,15 @@ create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, * Note: result must not share storage with either input */ Relids -calc_nestloop_required_outer(Path *outer_path, Path *inner_path) +calc_nestloop_required_outer(Relids outerrelids, + Relids outer_paramrels, + Relids innerrelids, + Relids inner_paramrels) { - Relids outer_paramrels = PATH_REQ_OUTER(outer_path); - Relids inner_paramrels = PATH_REQ_OUTER(inner_path); Relids required_outer; /* inner_path can require rels from outer path, but not vice versa */ - Assert(!bms_overlap(outer_paramrels, inner_path->parent->relids)); + Assert(!bms_overlap(outer_paramrels, innerrelids)); /* easy case if inner path is not parameterized */ if (!inner_paramrels) return bms_copy(outer_paramrels); @@ -5256,7 +5257,7 @@ calc_nestloop_required_outer(Path *outer_path, Path *inner_path) required_outer = bms_union(outer_paramrels, inner_paramrels); /* ... and remove any mention of now-satisfied outer rels */ required_outer = bms_del_members(required_outer, - outer_path->parent->relids); + outerrelids); /* maintain invariant that required_outer is exactly NULL if empty */ if (bms_is_empty(required_outer)) { diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index b4359f52..9fba700e 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -1106,12 +1106,8 @@ get_baserel_parampathinfo(PlannerInfo *root, RelOptInfo *baserel, Assert(!bms_overlap(baserel->relids, required_outer)); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, baserel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(baserel, required_outer))) return ppi; - } /* * Identify all joinclauses that are movable to this base rel given this @@ -1348,12 +1344,8 @@ get_joinrel_parampathinfo(PlannerInfo *root, RelOptInfo *joinrel, *restrict_clauses = list_concat(pclauses, *restrict_clauses); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, joinrel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(joinrel, required_outer))) return ppi; - } /* Estimate the number of rows returned by the parameterized join */ rows = get_parameterized_joinrel_size(root, joinrel, @@ -1392,7 +1384,6 @@ ParamPathInfo * get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) { ParamPathInfo *ppi; - ListCell *lc; /* Unparameterized paths have no ParamPathInfo */ if (bms_is_empty(required_outer)) @@ -1401,12 +1392,8 @@ 
get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) Assert(!bms_overlap(appendrel->relids, required_outer)); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, appendrel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(appendrel, required_outer))) return ppi; - } /* Else build the ParamPathInfo */ ppi = makeNode(ParamPathInfo); @@ -1417,3 +1404,23 @@ get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) return ppi; } + +/* + * Returns a ParamPathInfo for the parameterization given by required_outer, if + * already available in the given rel. Returns NULL otherwise. + */ +ParamPathInfo * +find_param_path_info(RelOptInfo *rel, Relids required_outer) +{ + ListCell *lc; + + foreach(lc, rel->ppilist) + { + ParamPathInfo *ppi = (ParamPathInfo *) lfirst(lc); + + if (bms_equal(ppi->ppi_req_outer, required_outer)) + return ppi; + } + + return NULL; +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index eb895929..66aebfe9 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1396,7 +1396,9 @@ equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1, if (partdesc2->boundinfo == NULL) return false; - if (!partition_bounds_equal(key, partdesc1->boundinfo, + if (!partition_bounds_equal(key->partnatts, key->parttyplen, + key->parttypbyval, + partdesc1->boundinfo, partdesc2->boundinfo)) return false; } diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 2efe3ea6..bef7a0f5 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * partition.h - * Header file for structures and utility functions related to - * partitioning + * Header file for structures and utility functions related to + * partitioning * * Copyright (c) 2007-2017, PostgreSQL Global Development Group * @@ -32,9 +32,9 @@ typedef struct PartitionBoundInfoData *PartitionBoundInfo; */ typedef struct PartitionDescData { - int nparts; /* Number of partitions */ - Oid *oids; /* OIDs of partitions */ - PartitionBoundInfo boundinfo; /* collection of partition bounds */ + int nparts; /* Number of partitions */ + Oid *oids; /* OIDs of partitions */ + PartitionBoundInfo boundinfo; /* collection of partition bounds */ } PartitionDescData; typedef struct PartitionDescData *PartitionDesc; @@ -43,60 +43,61 @@ typedef struct PartitionDescData *PartitionDesc; * PartitionDispatch - information about one partitioned table in a partition * hierarchy required to route a tuple to one of its partitions * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution 
state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * RelationGetPartitionDispatchInfo()) *----------------------- */ typedef struct PartitionDispatchData { - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; } PartitionDispatchData; typedef struct PartitionDispatchData *PartitionDispatch; extern void RelationBuildPartitionDesc(Relation relation); -extern bool partition_bounds_equal(PartitionKey key, - PartitionBoundInfo p1, PartitionBoundInfo p2); +extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, + bool *parttypbyval, PartitionBoundInfo b1, + PartitionBoundInfo b2); extern void check_new_partition_bound(char *relname, Relation parent, - PartitionBoundSpec *spec); -extern Oid get_partition_parent(Oid relid); + PartitionBoundSpec *spec); +extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, - PartitionBoundSpec *spec); + PartitionBoundSpec *spec); extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, - bool *found_whole_row); + Relation partrel, Relation parent, + bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int lockmode, int *num_parted, - List **leaf_part_oids); + int lockmode, int *num_parted, + List **leaf_part_oids); extern void FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull); + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); -#endif /* PARTITION_H */ + TupleTableSlot *slot, + EState *estate, + PartitionDispatchData **failed_at, + TupleTableSlot **failed_slot); +#endif /* PARTITION_H */ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index f1ff4710..3df87235 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -173,7 +173,10 @@ extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, Path *fdw_outerpath, List *fdw_private); -extern Relids calc_nestloop_required_outer(Path *outer_path, Path *inner_path); +extern Relids calc_nestloop_required_outer(Relids outerrelids, + Relids outer_paramrels, + Relids innerrelids, + Relids inner_paramrels); extern Relids calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path); extern NestPath *create_nestloop_path(PlannerInfo *root, @@ -349,6 +352,8 @@ extern ParamPathInfo *get_joinrel_parampathinfo(PlannerInfo *root, List 
**restrict_clauses); extern ParamPathInfo *get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer); +extern ParamPathInfo *find_param_path_info(RelOptInfo *rel, + Relids required_outer); #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, From a649b7cee68c5c525aad8a81009df5a7c3f94869 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:47:33 +0800 Subject: [PATCH 185/578] Refactor validation of new partitions a little bit --- src/backend/commands/tablecmds.c | 318 +++++++++++++++++-------------- 1 file changed, 172 insertions(+), 146 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6915458f..db41c7fe 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -538,6 +538,11 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); +static bool PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint); +static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, + List *scanrel_children, + List *partConstraint); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); @@ -16342,6 +16347,169 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, } } +/* + * PartConstraintImpliedByRelConstraint + * Does scanrel's existing constraints imply the partition constraint? + * + * Existing constraints includes its check constraints and column-level + * NOT NULL constraints and partConstraint describes the partition constraint. + */ +static bool +PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint) +{ + List *existConstraint = NIL; + TupleConstr *constr = RelationGetDescr(scanrel)->constr; + int num_check, + i; + + if (constr && constr->has_not_null) + { + int natts = scanrel->rd_att->natts; + + for (i = 1; i <= natts; i++) + { + Form_pg_attribute att = scanrel->rd_att->attrs[i - 1]; + + if (att->attnotnull && !att->attisdropped) + { + NullTest *ntest = makeNode(NullTest); + + ntest->arg = (Expr *) makeVar(1, + i, + att->atttypid, + att->atttypmod, + att->attcollation, + 0); + ntest->nulltesttype = IS_NOT_NULL; + + /* + * argisrow=false is correct even for a composite column, + * because attnotnull does not represent a SQL-spec IS NOT + * NULL test in such a case, just IS DISTINCT FROM NULL. + */ + ntest->argisrow = false; + ntest->location = -1; + existConstraint = lappend(existConstraint, ntest); + } + } + } + + num_check = (constr != NULL) ? constr->num_check : 0; + for (i = 0; i < num_check; i++) + { + Node *cexpr; + + /* + * If this constraint hasn't been fully validated yet, we must ignore + * it here. + */ + if (!constr->check[i].ccvalid) + continue; + + cexpr = stringToNode(constr->check[i].ccbin); + + /* + * Run each expression through const-simplification and + * canonicalization. It is necessary, because we will be comparing it + * to similarly-processed partition constraint expressions, and may + * fail to detect valid matches without this. 
+ */ + cexpr = eval_const_expressions(NULL, cexpr); + cexpr = (Node *) canonicalize_qual((Expr *) cexpr); + + existConstraint = list_concat(existConstraint, + make_ands_implicit((Expr *) cexpr)); + } + + if (existConstraint != NIL) + existConstraint = list_make1(make_ands_explicit(existConstraint)); + + /* And away we go ... */ + return predicate_implied_by(partConstraint, existConstraint, true); +} + +/* + * ValidatePartitionConstraints + * + * Check whether all rows in the given table obey the given partition + * constraint; if so, it can be attached as a partition.  We do this by + * scanning the table (or all of its leaf partitions) row by row, except when + * the existing constraints are sufficient to prove that the new partitioning + * constraint must already hold. + */ +static void +ValidatePartitionConstraints(List **wqueue, Relation scanrel, + List *scanrel_children, + List *partConstraint) +{ + bool found_whole_row; + ListCell *lc; + + if (partConstraint == NIL) + return; + + /* + * Based on the table's existing constraints, determine if we can skip + * scanning the table to validate the partition constraint. + */ + if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(scanrel)))); + return; + } + + /* Constraints proved insufficient, so we need to scan the table. */ + foreach(lc, scanrel_children) + { + AlteredTableInfo *tab; + Oid part_relid = lfirst_oid(lc); + Relation part_rel; + List *my_partconstr = partConstraint; + + /* Lock already taken */ + if (part_relid != RelationGetRelid(scanrel)) + part_rel = heap_open(part_relid, NoLock); + else + part_rel = scanrel; + + /* + * Skip if the partition is itself a partitioned table. We can only + * ever scan RELKIND_RELATION relations. + */ + if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + if (part_rel != scanrel) + heap_close(part_rel, NoLock); + continue; + } + + if (part_rel != scanrel) + { + /* + * Adjust the constraint for scanrel so that it matches this + * partition's attribute numbers. + */ + my_partconstr = map_partition_varattnos(my_partconstr, 1, + part_rel, scanrel, + &found_whole_row); + /* There can never be a whole-row reference here */ + if (found_whole_row) + elog(ERROR, "unexpected whole-row reference found in partition key"); + } + + /* Grab a work queue entry. */ + tab = ATGetQueueEntry(wqueue, part_rel); + tab->partition_constraint = (Expr *) linitial(my_partconstr); + + /* keep our lock until commit */ + if (part_rel != scanrel) + heap_close(part_rel, NoLock); + } +} + /* * ALTER TABLE ATTACH PARTITION FOR VALUES * @@ -16353,15 +16521,12 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) Relation attachrel, catalog; List *attachrel_children; - TupleConstr *attachrel_constr; - List *partConstraint, - *existConstraint; + List *partConstraint; SysScanDesc scan; ScanKeyData skey; AttrNumber attno; int natts; TupleDesc tupleDesc; - bool skip_validate = false; ObjectAddress address; const char *trigger_name; bool found_whole_row; @@ -16555,148 +16720,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) if (found_whole_row) elog(ERROR, "unexpected whole-row reference found in partition key"); - /* - * Check if we can do away with having to scan the table being attached to - * validate the partition constraint, by *proving* that the existing - * constraints of the table *imply* the partition predicate. 
We include - * the table's check constraints and NOT NULL constraints in the list of - * clauses passed to predicate_implied_by(). - * - * There is a case in which we cannot rely on just the result of the - * proof. - */ - attachrel_constr = tupleDesc->constr; - existConstraint = NIL; - if (attachrel_constr != NULL) - { - int num_check = attachrel_constr->num_check; - int i; - - if (attachrel_constr->has_not_null) - { - int natts = attachrel->rd_att->natts; - - for (i = 1; i <= natts; i++) - { - Form_pg_attribute att = attachrel->rd_att->attrs[i - 1]; - - if (att->attnotnull && !att->attisdropped) - { - NullTest *ntest = makeNode(NullTest); - - ntest->arg = (Expr *) makeVar(1, - i, - att->atttypid, - att->atttypmod, - att->attcollation, - 0); - ntest->nulltesttype = IS_NOT_NULL; - - /* - * argisrow=false is correct even for a composite column, - * because attnotnull does not represent a SQL-spec IS NOT - * NULL test in such a case, just IS DISTINCT FROM NULL. - */ - ntest->argisrow = false; - ntest->location = -1; - existConstraint = lappend(existConstraint, ntest); - } - } - } - - for (i = 0; i < num_check; i++) - { - Node *cexpr; - - /* - * If this constraint hasn't been fully validated yet, we must - * ignore it here. - */ - if (!attachrel_constr->check[i].ccvalid) - continue; - - cexpr = stringToNode(attachrel_constr->check[i].ccbin); - - /* - * Run each expression through const-simplification and - * canonicalization. It is necessary, because we will be - * comparing it to similarly-processed qual clauses, and may fail - * to detect valid matches without this. - */ - cexpr = eval_const_expressions(NULL, cexpr); - cexpr = (Node *) canonicalize_qual((Expr *) cexpr); - - existConstraint = list_concat(existConstraint, - make_ands_implicit((Expr *) cexpr)); - } - - existConstraint = list_make1(make_ands_explicit(existConstraint)); - - /* And away we go ... */ - if (predicate_implied_by(partConstraint, existConstraint, true)) - skip_validate = true; - } - - if (skip_validate) - { - /* No need to scan the table after all. */ - ereport(INFO, - (errmsg("partition constraint for table \"%s\" is implied by existing constraints", - RelationGetRelationName(attachrel)))); - } - else - { - /* Constraints proved insufficient, so we need to scan the table. */ - ListCell *lc; - - foreach(lc, attachrel_children) - { - AlteredTableInfo *tab; - Oid part_relid = lfirst_oid(lc); - Relation part_rel; - List *my_partconstr = partConstraint; - - /* Lock already taken */ - if (part_relid != RelationGetRelid(attachrel)) - part_rel = heap_open(part_relid, NoLock); - else - part_rel = attachrel; - - /* - * Skip if the partition is itself a partitioned table. We can - * only ever scan RELKIND_RELATION relations. - */ - if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - if (part_rel != attachrel) - heap_close(part_rel, NoLock); - continue; - } - - if (part_rel != attachrel) - { - /* - * Adjust the constraint that we constructed above for - * attachRel so that it matches this partition's attribute - * numbers. - */ - my_partconstr = map_partition_varattnos(my_partconstr, 1, - part_rel, attachrel, - &found_whole_row); - /* There can never be a whole-row reference here */ - if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); - } - - /* Grab a work queue entry. 
*/ - tab = ATGetQueueEntry(wqueue, part_rel); - tab->partition_constraint = (Expr *) linitial(my_partconstr); - - /* keep our lock until commit */ - if (part_rel != attachrel) - heap_close(part_rel, NoLock); - } - } + /* Validate partition constraints against the table being attached. */ + ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, + partConstraint); ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachrel)); From d050c3064be485fb61e5d0dcb5f1c3564f68971c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:52:51 +0800 Subject: [PATCH 186/578] Don't lock tables in RelationGetPartitionDispatchInfo --- src/backend/catalog/partition.c | 53 +++++++++++++++++---------------- src/backend/executor/execMain.c | 10 +++++-- src/include/catalog/partition.h | 3 +- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 9ecd77ea..3ea32102 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1011,12 +1011,16 @@ get_partition_qual_relid(Oid relid) * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree * - * All the partitions will be locked with lockmode, unless it is NoLock. - * A list of the OIDs of all the leaf partitions of rel is returned in - * *leaf_part_oids. + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. */ PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, int lockmode, +RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids) { PartitionDispatchData **pd; @@ -1031,14 +1035,18 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode, offset; /* - * Lock partitions and make a list of the partitioned ones to prepare - * their PartitionDispatch objects below. + * We rely on the relcache to traverse the partition tree to build both + * the leaf partition OIDs list and the array of PartitionDispatch objects + * for the partitioned tables in the tree. That means every partitioned + * table in the tree must be locked, which is fine since we require the + * caller to lock all the partitions anyway. * - * Cannot use find_all_inheritors() here, because then the order of OIDs - * in parted_rels list would be unknown, which does not help, because we - * assign indexes within individual PartitionDispatch in an order that is - * predetermined (determined by the order of OIDs in individual partition - * descriptors). + * For every partitioned table in the tree, starting with the root + * partitioned table, add its relcache entry to parted_rels, while also + * queuing its partitions (in the order in which they appear in the + * partition descriptor) to be looked at later in the same loop. This is + * a bit tricky but works because the foreach() macro doesn't fetch the + * next list element until the bottom of the loop. 
*/ *num_parted = 1; parted_rels = list_make1(rel); @@ -1047,29 +1055,24 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode, APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); forboth(lc1, all_parts, lc2, all_parents) { - Relation partrel = heap_open(lfirst_oid(lc1), lockmode); + Oid partrelid = lfirst_oid(lc1); Relation parent = lfirst(lc2); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); + if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) + { /* - * If this partition is a partitioned table, add its children to the - * end of the list, so that they are processed as well. + * Already locked by the caller. Note that it is the + * responsibility of the caller to close the below relcache entry, + * once done using the information being collected here (for + * example, in ExecEndModifyTable). */ - if (partdesc) - { + Relation partrel = heap_open(partrelid, NoLock); + (*num_parted)++; parted_rels = lappend(parted_rels, partrel); parted_rel_parents = lappend(parted_rel_parents, parent); APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); } - else - heap_close(partrel, NoLock); - - /* - * We keep the partitioned ones open until we're done using the - * information being collected here (for example, see - * ExecEndModifyTable). - */ } /* diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 78810446..776c9d41 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -44,6 +44,7 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/partition.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_publication.h" #ifdef _MLS_ #include "catalog/pg_class.h" @@ -3696,9 +3697,12 @@ ExecSetupPartitionTupleRouting(Relation rel, int i; ResultRelInfo *leaf_part_rri; - /* Get the tuple-routing information and lock partitions */ - *pd = RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, num_parted, - &leaf_parts); + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); *num_partitions = list_length(leaf_parts); *partitions = (ResultRelInfo *) palloc0(*num_partitions * sizeof(ResultRelInfo)); diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index bef7a0f5..2283c675 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -88,8 +88,7 @@ extern Expr *get_partition_qual_relid(Oid relid); /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int lockmode, int *num_parted, - List **leaf_part_oids); + int *num_parted, List **leaf_part_oids); extern void FormPartitionKeyDatum(PartitionDispatch pd, TupleTableSlot *slot, EState *estate, From 85b8156ded49910ea31425ed73238cd3361731b9 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 17:31:26 +0800 Subject: [PATCH 187/578] Expand partitioned tables in PartDesc order. 
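A minimal SQL sketch of what bound-ordered expansion looks like from the outside, assuming a plain range-partitioned table; the table and partition names are made up for illustration, and on a real TBase cluster the plan will additionally contain Remote Subquery Scan nodes and distribution details:

    -- Hypothetical example tables. With expansion driven by the PartitionDesc,
    -- the Append children should come out in partition-bound order
    -- (meas_2020 before meas_2021), not in the order the partitions were created.
    create table meas (ts date, v int) partition by range (ts);
    create table meas_2021 partition of meas
        for values from ('2021-01-01') to ('2022-01-01');
    create table meas_2020 partition of meas
        for values from ('2020-01-01') to ('2021-01-01');
    explain (costs off) select * from meas;
    drop table meas;
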
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/prep/prepunion.c | 311 +++++++++++++++++-------- src/test/regress/expected/insert.out | 4 +- 2 files changed, 215 insertions(+), 100 deletions(-) diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 68b7cd0f..ec3de76b 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -34,6 +34,7 @@ #include "access/heapam.h" #include "access/htup_details.h" #include "access/sysattr.h" +#include "catalog/partition.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_type.h" #include "miscadmin.h" @@ -100,6 +101,19 @@ static List *generate_append_tlist(List *colTypes, List *colCollations, static List *generate_setop_grouplist(SetOperationStmt *op, List *targetlist); static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti); +static void expand_partitioned_rtentry(PlannerInfo *root, + RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, PartitionDesc partdesc, + LOCKMODE lockmode, + bool *has_child, List **appinfos, + List **partitioned_child_rels); +static void expand_single_inheritance_child(PlannerInfo *root, + RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, Relation childrel, + bool *has_child, List **appinfos, + List **partitioned_child_rels); static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, @@ -1441,8 +1455,12 @@ expand_inherited_tables(PlannerInfo *root) * table, but with inh = false, to represent the parent table in its role * as a simple member of the inheritance set. * - * A childless table is never considered to be an inheritance set; therefore - * a parent RTE must always have at least two associated AppendRelInfos. +* A childless table is never considered to be an inheritance set. For +* regular inheritance, a parent RTE must always have at least two associated +* AppendRelInfos: one corresponding to the parent table as a simple member of +* inheritance set and one or more corresponding to the actual children. +* Since a partitioned table is not scanned, it might have only one associated +* AppendRelInfo. */ static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) @@ -1455,7 +1473,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) List *inhOIDs; List *appinfos; ListCell *l; - bool need_append; + bool has_child; PartitionedChildRelInfo *pcinfo; List *partitioned_child_rels = NIL; @@ -1529,14 +1547,35 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ appinfos = NIL; - need_append = false; + has_child = false; + if (RelationGetPartitionDesc(oldrelation) != NULL) + { + /* + * If this table has partitions, recursively expand them in the order + * in which they appear in the PartitionDesc. But first, expand the + * parent itself. + */ + expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, + oldrelation, + &has_child, &appinfos, + &partitioned_child_rels); + expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, + RelationGetPartitionDesc(oldrelation), + lockmode, + &has_child, &appinfos, + &partitioned_child_rels); + } + else + { + /* + * This table has no partitions. Expand any plain inheritance + * children in the order the OIDs were returned by + * find_all_inheritors. 
+ */ foreach(l, inhOIDs) { Oid childOID = lfirst_oid(l); Relation newrelation; - RangeTblEntry *childrte; - Index childRTindex; - AppendRelInfo *appinfo; /* Open rel if needed; we already have required locks */ if (childOID != parentOID) @@ -1547,8 +1586,8 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* * It is possible that the parent table has children that are temp * tables of other backends. We cannot safely access such tables - * (because of buffering issues), and the best thing to do seems to be - * to silently ignore them. + * (because of buffering issues), and the best thing to do seems + * to be to silently ignore them. */ if (childOID != parentOID && RELATION_IS_OTHER_TEMP(newrelation)) { @@ -1556,21 +1595,139 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) continue; } + expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, + newrelation, + &has_child, &appinfos, + &partitioned_child_rels); + + /* Close child relations, but keep locks */ + if (childOID != parentOID) + heap_close(newrelation, NoLock); + } + } + + heap_close(oldrelation, NoLock); + /* - * Build an RTE for the child, and attach to query's rangetable list. - * We copy most fields of the parent's RTE, but replace relation OID - * and relkind, and set inh = false. Also, set requiredPerms to zero - * since all required permissions checks are done on the original RTE. - * Likewise, set the child's securityQuals to empty, because we only - * want to apply the parent's RLS conditions regardless of what RLS - * properties individual children may have. (This is an intentional - * choice to make inherited RLS work like regular permissions checks.) - * The parent securityQuals will be propagated to children along with - * other base restriction clauses, so we don't need to do it here. + * If all the children were temp tables or a partitioned parent did not + * have any leaf partitions, pretend it's a non-inheritance situation; we + * don't need Append node in that case. The duplicate RTE we added for + * the parent table is harmless, so we don't bother to get rid of it; + * ditto for the useless PlanRowMark node. */ - childrte = copyObject(rte); + if (!has_child) + { + /* Clear flag before returning */ + rte->inh = false; + return; + } + + /* + * We keep a list of objects in root, each of which maps a partitioned + * parent RT index to the list of RT indexes of its partitioned child + * tables. When creating an Append or a ModifyTable path for the parent, + * we copy the child RT index list verbatim to the path so that it could + * be carried over to the executor so that the latter could identify the + * partitioned child tables. 
+ */ + if (partitioned_child_rels != NIL) + { + pcinfo = makeNode(PartitionedChildRelInfo); + + Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); + pcinfo->parent_relid = rti; + pcinfo->child_rels = partitioned_child_rels; + root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + } + + /* Otherwise, OK to add to root->append_rel_list */ + root->append_rel_list = list_concat(root->append_rel_list, appinfos); +} + +static void +expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, PartitionDesc partdesc, + LOCKMODE lockmode, + bool *has_child, List **appinfos, + List **partitioned_child_rels) +{ + int i; + + check_stack_depth(); + + for (i = 0; i < partdesc->nparts; i++) + { + Oid childOID = partdesc->oids[i]; + Relation childrel; + + /* Open rel; we already have required locks */ + childrel = heap_open(childOID, NoLock); + + /* As in expand_inherited_rtentry, skip non-local temp tables */ + if (RELATION_IS_OTHER_TEMP(childrel)) + { + heap_close(childrel, lockmode); + continue; + } + + expand_single_inheritance_child(root, parentrte, parentRTindex, + parentrel, parentrc, childrel, + has_child, appinfos, + partitioned_child_rels); + + /* If this child is itself partitioned, recurse */ + if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + expand_partitioned_rtentry(root, parentrte, parentRTindex, + parentrel, parentrc, + RelationGetPartitionDesc(childrel), + lockmode, + has_child, appinfos, + partitioned_child_rels); + + /* Close child relation, but keep locks */ + heap_close(childrel, NoLock); + } +} + +/* + * expand_single_inheritance_child + * Expand a single inheritance child, if needed. + * + * If this is a temp table of another backend, we'll return without doing + * anything at all. Otherwise, we'll set "has_child" to true, build a + * RangeTblEntry and either a PartitionedChildRelInfo or AppendRelInfo as + * appropriate, plus maybe a PlanRowMark. + */ +static void +expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, Relation childrel, + bool *has_child, List **appinfos, + List **partitioned_child_rels) +{ + Query *parse = root->parse; + Oid parentOID = RelationGetRelid(parentrel); + Oid childOID = RelationGetRelid(childrel); + RangeTblEntry *childrte; + Index childRTindex; + AppendRelInfo *appinfo; + + /* + * Build an RTE for the child, and attach to query's rangetable list. We + * copy most fields of the parent's RTE, but replace relation OID and + * relkind, and set inh = false. Also, set requiredPerms to zero since + * all required permissions checks are done on the original RTE. Likewise, + * set the child's securityQuals to empty, because we only want to apply + * the parent's RLS conditions regardless of what RLS properties + * individual children may have. (This is an intentional choice to make + * inherited RLS work like regular permissions checks.) The parent + * securityQuals will be propagated to children along with other base + * restriction clauses, so we don't need to do it here. 
+ */ + childrte = copyObject(parentrte); childrte->relid = childOID; - childrte->relkind = newrelation->rd_rel->relkind; + childrte->relkind = childrel->rd_rel->relkind; childrte->inh = false; childrte->requiredPerms = 0; childrte->securityQuals = NIL; @@ -1578,118 +1735,76 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) childRTindex = list_length(parse->rtable); /* - * Build an AppendRelInfo for this parent and child, unless the child - * is a partitioned table. + * Build an AppendRelInfo for this parent and child, unless the child is a + * partitioned table. */ if (childrte->relkind != RELKIND_PARTITIONED_TABLE) { - need_append = true; + /* Remember if we saw a real child. */ + if (childOID != parentOID) + *has_child = true; + appinfo = makeNode(AppendRelInfo); - appinfo->parent_relid = rti; + appinfo->parent_relid = parentRTindex; appinfo->child_relid = childRTindex; - appinfo->parent_reltype = oldrelation->rd_rel->reltype; - appinfo->child_reltype = newrelation->rd_rel->reltype; - make_inh_translation_list(oldrelation, newrelation, childRTindex, + appinfo->parent_reltype = parentrel->rd_rel->reltype; + appinfo->child_reltype = childrel->rd_rel->reltype; + make_inh_translation_list(parentrel, childrel, childRTindex, &appinfo->translated_vars); appinfo->parent_reloid = parentOID; - appinfos = lappend(appinfos, appinfo); + *appinfos = lappend(*appinfos, appinfo); /* - * Translate the column permissions bitmaps to the child's attnums - * (we have to build the translated_vars list before we can do - * this). But if this is the parent table, leave copyObject's - * result alone. + * Translate the column permissions bitmaps to the child's attnums (we + * have to build the translated_vars list before we can do this). But + * if this is the parent table, leave copyObject's result alone. * * Note: we need to do this even though the executor won't run any - * permissions checks on the child RTE. The - * insertedCols/updatedCols bitmaps may be examined for - * trigger-firing purposes. + * permissions checks on the child RTE. The insertedCols/updatedCols + * bitmaps may be examined for trigger-firing purposes. */ if (childOID != parentOID) { - childrte->selectedCols = translate_col_privs(rte->selectedCols, + childrte->selectedCols = translate_col_privs(parentrte->selectedCols, appinfo->translated_vars); - childrte->insertedCols = translate_col_privs(rte->insertedCols, + childrte->insertedCols = translate_col_privs(parentrte->insertedCols, appinfo->translated_vars); - childrte->updatedCols = translate_col_privs(rte->updatedCols, + childrte->updatedCols = translate_col_privs(parentrte->updatedCols, appinfo->translated_vars); } } else - partitioned_child_rels = lappend_int(partitioned_child_rels, + *partitioned_child_rels = lappend_int(*partitioned_child_rels, childRTindex); /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. 
*/ - if (oldrc) + if (parentrc) { - PlanRowMark *newrc = makeNode(PlanRowMark); + PlanRowMark *childrc = makeNode(PlanRowMark); - newrc->rti = childRTindex; - newrc->prti = rti; - newrc->rowmarkId = oldrc->rowmarkId; + childrc->rti = childRTindex; + childrc->prti = parentRTindex; + childrc->rowmarkId = parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ - newrc->markType = select_rowmark_type(childrte, oldrc->strength); - newrc->allMarkTypes = (1 << newrc->markType); - newrc->strength = oldrc->strength; - newrc->waitPolicy = oldrc->waitPolicy; + childrc->markType = select_rowmark_type(childrte, parentrc->strength); + childrc->allMarkTypes = (1 << childrc->markType); + childrc->strength = parentrc->strength; + childrc->waitPolicy = parentrc->waitPolicy; /* - * We mark RowMarks for partitioned child tables as parent - * RowMarks so that the executor ignores them (except their - * existence means that the child tables be locked using - * appropriate mode). + * We mark RowMarks for partitioned child tables as parent RowMarks so + * that the executor ignores them (except their existence means that + * the child tables be locked using appropriate mode). */ - newrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); + childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); /* Include child's rowmark type in parent's allMarkTypes */ - oldrc->allMarkTypes |= newrc->allMarkTypes; - - root->rowMarks = lappend(root->rowMarks, newrc); - } + parentrc->allMarkTypes |= childrc->allMarkTypes; - /* Close child relations, but keep locks */ - if (childOID != parentOID) - heap_close(newrelation, NoLock); + root->rowMarks = lappend(root->rowMarks, childrc); } - - heap_close(oldrelation, NoLock); - - /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. - */ - if (!need_append) - { - /* Clear flag before returning */ - rte->inh = false; - return; - } - - /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. 
- */ - if (partitioned_child_rels != NIL) - { - pcinfo = makeNode(PartitionedChildRelInfo); - - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); - } - - /* Otherwise, OK to add to root->append_rel_list */ - root->append_rel_list = list_concat(root->append_rel_list, appinfos); } /* diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 70a7ea2f..944336b7 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -280,12 +280,12 @@ select tableoid::regclass, * from list_parted; -------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 part_ee_ff1 | ff | 1 part_ee_ff1 | EE | 1 part_ee_ff2 | ff | 11 part_ee_ff2 | EE | 10 + part_null | | 0 + part_null | | 1 (8 rows) -- some more tests to exercise tuple-routing with multi-level partitioning From 182433b35f5647030187095efef7293146e8784a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 10:46:39 +0800 Subject: [PATCH 188/578] Introduce 64-bit hash functions with a 64-bit seed. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/xindex.sgml | 13 +- src/backend/access/hash/hashfunc.c | 372 +++++++++++++++++++- src/backend/access/hash/hashpage.c | 2 +- src/backend/access/hash/hashutil.c | 6 +- src/backend/access/hash/hashvalidate.c | 44 ++- src/backend/commands/opclasscmds.c | 28 +- src/backend/utils/adt/acl.c | 15 + src/backend/utils/adt/arrayfuncs.c | 79 +++++ src/backend/utils/adt/date.c | 21 ++ src/backend/utils/adt/jsonb_op.c | 43 +++ src/backend/utils/adt/jsonb_util.c | 43 +++ src/backend/utils/adt/mac.c | 9 + src/backend/utils/adt/mac8.c | 9 + src/backend/utils/adt/network.c | 10 + src/backend/utils/adt/numeric.c | 60 ++++ src/backend/utils/adt/pg_lsn.c | 210 +++++------ src/backend/utils/adt/rangetypes.c | 63 ++++ src/backend/utils/adt/timestamp.c | 19 + src/backend/utils/adt/uuid.c | 8 + src/backend/utils/adt/varchar.c | 18 + src/backend/utils/cache/lsyscache.c | 8 +- src/backend/utils/cache/typcache.c | 58 ++- src/include/access/hash.h | 30 +- src/include/catalog/pg_amproc.h | 37 +- src/include/catalog/pg_proc.h | 54 +++ src/include/fmgr.h | 1 + src/include/utils/jsonb.h | 316 ++++++++--------- src/include/utils/typcache.h | 208 +++++------ src/test/regress/expected/alter_generic.out | 4 +- src/test/regress/expected/hash_func.out | 300 ++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/hash_func.sql | 222 ++++++++++++ 32 files changed, 1912 insertions(+), 400 deletions(-) create mode 100644 src/test/regress/expected/hash_func.out create mode 100644 src/test/regress/sql/hash_func.sql diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 333a36c4..745b4d56 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -436,7 +436,8 @@ - Hash indexes require one support function, shown in . 
@@ -451,9 +452,17 @@ - Compute the hash value for a key + Compute the 32-bit hash value for a key 1 + + + Compute the 64-bit hash value for a key given a 64-bit salt; if + the salt is 0, the low 32 bits will match the value that would + have been computed by function 1 + + 2 + diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index ff46a854..f4959255 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -54,18 +54,36 @@ hashchar(PG_FUNCTION_ARGS) return hash_uint32((int32) PG_GETARG_CHAR(0)); } +Datum +hashcharextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1)); +} + Datum hashint2(PG_FUNCTION_ARGS) { return hash_uint32((int32) PG_GETARG_INT16(0)); } +Datum +hashint2extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1)); +} + Datum hashint4(PG_FUNCTION_ARGS) { return hash_uint32(PG_GETARG_INT32(0)); } +Datum +hashint4extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1)); +} + Datum hashint8(PG_FUNCTION_ARGS) { @@ -86,18 +104,43 @@ hashint8(PG_FUNCTION_ARGS) return hash_uint32(lohalf); } +Datum +hashint8extended(PG_FUNCTION_ARGS) +{ + /* Same approach as hashint8 */ + int64 val = PG_GETARG_INT64(0); + uint32 lohalf = (uint32) val; + uint32 hihalf = (uint32) (val >> 32); + + lohalf ^= (val >= 0) ? hihalf : ~hihalf; + + return hash_uint32_extended(lohalf, PG_GETARG_INT64(1)); +} + Datum hashoid(PG_FUNCTION_ARGS) { return hash_uint32((uint32) PG_GETARG_OID(0)); } +Datum +hashoidextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + Datum hashenum(PG_FUNCTION_ARGS) { return hash_uint32((uint32) PG_GETARG_OID(0)); } +Datum +hashenumextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + Datum hashfloat4(PG_FUNCTION_ARGS) { @@ -124,6 +167,21 @@ hashfloat4(PG_FUNCTION_ARGS) return hash_any((unsigned char *) &key8, sizeof(key8)); } +Datum +hashfloat4extended(PG_FUNCTION_ARGS) +{ + float4 key = PG_GETARG_FLOAT4(0); + uint64 seed = PG_GETARG_INT64(1); + float8 key8; + + /* Same approach as hashfloat4 */ + if (key == (float4) 0) + PG_RETURN_UINT64(seed); + key8 = key; + + return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed); +} + Datum hashfloat8(PG_FUNCTION_ARGS) { @@ -140,6 +198,19 @@ hashfloat8(PG_FUNCTION_ARGS) return hash_any((unsigned char *) &key, sizeof(key)); } +Datum +hashfloat8extended(PG_FUNCTION_ARGS) +{ + float8 key = PG_GETARG_FLOAT8(0); + uint64 seed = PG_GETARG_INT64(1); + + /* Same approach as hashfloat8 */ + if (key == (float8) 0) + PG_RETURN_UINT64(seed); + + return hash_any_extended((unsigned char *) &key, sizeof(key), seed); +} + Datum hashoidvector(PG_FUNCTION_ARGS) { @@ -148,6 +219,16 @@ hashoidvector(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); } +Datum +hashoidvectorextended(PG_FUNCTION_ARGS) +{ + oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + + return hash_any_extended((unsigned char *) key->values, + key->dim1 * sizeof(Oid), + PG_GETARG_INT64(1)); +} + Datum hashname(PG_FUNCTION_ARGS) { @@ -156,6 +237,15 @@ hashname(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, strlen(key)); } +Datum +hashnameextended(PG_FUNCTION_ARGS) +{ + char *key = NameStr(*PG_GETARG_NAME(0)); + + return hash_any_extended((unsigned char *) key, strlen(key), + PG_GETARG_INT64(1)); +} + 
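/*
 * Illustrative sketch, not part of the patch above: the pattern that each
 * pair of functions in this file follows.  "mytype", "hashmytype" and
 * "hashmytypeextended" are hypothetical names; a fixed-size type would
 * register the one-argument 32-bit hash as hash support proc 1 and the
 * two-argument seeded variant as proc 2.  Per the convention introduced
 * here, a seed of 0 makes the low 32 bits of the extended result equal to
 * the 32-bit result.
 */
Datum
hashmytype(PG_FUNCTION_ARGS)
{
    mytype     *key = (mytype *) PG_GETARG_POINTER(0);

    return hash_any((unsigned char *) key, sizeof(mytype));
}

Datum
hashmytypeextended(PG_FUNCTION_ARGS)
{
    mytype     *key = (mytype *) PG_GETARG_POINTER(0);

    return hash_any_extended((unsigned char *) key, sizeof(mytype),
                             PG_GETARG_INT64(1));
}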
Datum hashtext(PG_FUNCTION_ARGS) { @@ -176,6 +266,22 @@ hashtext(PG_FUNCTION_ARGS) return result; } +Datum +hashtextextended(PG_FUNCTION_ARGS) +{ + text *key = PG_GETARG_TEXT_PP(0); + Datum result; + + /* Same approach as hashtext */ + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} + /* * hashvarlena() can be used for any varlena datatype in which there are * no non-significant bits, ie, distinct bitpatterns never compare as equal. @@ -195,6 +301,21 @@ hashvarlena(PG_FUNCTION_ARGS) return result; } +Datum +hashvarlenaextended(PG_FUNCTION_ARGS) +{ + struct varlena *key = PG_GETARG_VARLENA_PP(0); + Datum result; + + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} + /* * This hash function was written by Bob Jenkins * (bob_jenkins@burtleburtle.net), and superficially adapted @@ -510,7 +631,227 @@ hash_any(register const unsigned char *k, register int keylen) } /* - * hash_uint32() -- hash a 32-bit value + * hash_any_extended() -- hash into a 64-bit value, using an optional seed + * k : the key (the unaligned variable-length array of bytes) + * len : the length of the key, counting by bytes + * seed : a 64-bit seed (0 means no seed) + * + * Returns a uint64 value. Otherwise similar to hash_any. + */ +Datum +hash_any_extended(register const unsigned char *k, register int keylen, + uint64 seed) +{ + register uint32 a, + b, + c, + len; + + /* Set up the internal state */ + len = keylen; + a = b = c = 0x9e3779b9 + len + 3923095; + + /* If the seed is non-zero, use it to perturb the internal state. */ + if (seed != 0) + { + /* + * In essence, the seed is treated as part of the data being hashed, + * but for simplicity, we pretend that it's padded with four bytes of + * zeroes so that the seed constitutes a 12-byte chunk. 
+ */ + a += (uint32) (seed >> 32); + b += (uint32) seed; + mix(a, b, c); + } + + /* If the source pointer is word-aligned, we use word-wide fetches */ + if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0) + { + /* Code path for aligned source data */ + register const uint32 *ka = (const uint32 *) k; + + /* handle most of the key */ + while (len >= 12) + { + a += ka[0]; + b += ka[1]; + c += ka[2]; + mix(a, b, c); + ka += 3; + len -= 12; + } + + /* handle the last 11 bytes */ + k = (const unsigned char *) ka; +#ifdef WORDS_BIGENDIAN + switch (len) + { + case 11: + c += ((uint32) k[10] << 8); + /* fall through */ + case 10: + c += ((uint32) k[9] << 16); + /* fall through */ + case 9: + c += ((uint32) k[8] << 24); + /* the lowest byte of c is reserved for the length */ + /* fall through */ + case 8: + b += ka[1]; + a += ka[0]; + break; + case 7: + b += ((uint32) k[6] << 8); + /* fall through */ + case 6: + b += ((uint32) k[5] << 16); + /* fall through */ + case 5: + b += ((uint32) k[4] << 24); + /* fall through */ + case 4: + a += ka[0]; + break; + case 3: + a += ((uint32) k[2] << 8); + /* fall through */ + case 2: + a += ((uint32) k[1] << 16); + /* fall through */ + case 1: + a += ((uint32) k[0] << 24); + /* case 0: nothing left to add */ + } +#else /* !WORDS_BIGENDIAN */ + switch (len) + { + case 11: + c += ((uint32) k[10] << 24); + /* fall through */ + case 10: + c += ((uint32) k[9] << 16); + /* fall through */ + case 9: + c += ((uint32) k[8] << 8); + /* the lowest byte of c is reserved for the length */ + /* fall through */ + case 8: + b += ka[1]; + a += ka[0]; + break; + case 7: + b += ((uint32) k[6] << 16); + /* fall through */ + case 6: + b += ((uint32) k[5] << 8); + /* fall through */ + case 5: + b += k[4]; + /* fall through */ + case 4: + a += ka[0]; + break; + case 3: + a += ((uint32) k[2] << 16); + /* fall through */ + case 2: + a += ((uint32) k[1] << 8); + /* fall through */ + case 1: + a += k[0]; + /* case 0: nothing left to add */ + } +#endif /* WORDS_BIGENDIAN */ + } + else + { + /* Code path for non-aligned source data */ + + /* handle most of the key */ + while (len >= 12) + { +#ifdef WORDS_BIGENDIAN + a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24)); + b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24)); + c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24)); +#else /* !WORDS_BIGENDIAN */ + a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24)); + b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24)); + c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24)); +#endif /* WORDS_BIGENDIAN */ + mix(a, b, c); + k += 12; + len -= 12; + } + + /* handle the last 11 bytes */ +#ifdef WORDS_BIGENDIAN + switch (len) /* all the case statements fall through */ + { + case 11: + c += ((uint32) k[10] << 8); + case 10: + c += ((uint32) k[9] << 16); + case 9: + c += ((uint32) k[8] << 24); + /* the lowest byte of c is reserved for the length */ + case 8: + b += k[7]; + case 7: + b += ((uint32) k[6] << 8); + case 6: + b += ((uint32) k[5] << 16); + case 5: + b += ((uint32) k[4] << 24); + case 4: + a += k[3]; + case 3: + a += ((uint32) k[2] << 8); + case 2: + a += ((uint32) k[1] << 16); + case 1: + a += ((uint32) k[0] << 24); + /* case 0: nothing left to add */ + } +#else /* !WORDS_BIGENDIAN */ + switch (len) /* all the case statements fall through */ + { + case 11: + c += ((uint32) k[10] << 24); + case 10: + c += ((uint32) 
k[9] << 16); + case 9: + c += ((uint32) k[8] << 8); + /* the lowest byte of c is reserved for the length */ + case 8: + b += ((uint32) k[7] << 24); + case 7: + b += ((uint32) k[6] << 16); + case 6: + b += ((uint32) k[5] << 8); + case 5: + b += k[4]; + case 4: + a += ((uint32) k[3] << 24); + case 3: + a += ((uint32) k[2] << 16); + case 2: + a += ((uint32) k[1] << 8); + case 1: + a += k[0]; + /* case 0: nothing left to add */ + } +#endif /* WORDS_BIGENDIAN */ + } + + final(a, b, c); + + /* report the result */ + PG_RETURN_UINT64(((uint64) b << 32) | c); +} + +/* + * hash_uint32() -- hash a 32-bit value to a 32-bit value * * This has the same result as * hash_any(&k, sizeof(uint32)) @@ -532,6 +873,35 @@ hash_uint32(uint32 k) return UInt32GetDatum(c); } +/* + * hash_uint32_extended() -- hash a 32-bit value to a 64-bit value, with a seed + * + * Like hash_uint32, this is a convenience function. + */ +Datum +hash_uint32_extended(uint32 k, uint64 seed) +{ + register uint32 a, + b, + c; + + a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095; + + if (seed != 0) + { + a += (uint32) (seed >> 32); + b += (uint32) seed; + mix(a, b, c); + } + + a += k; + + final(a, b, c); + + /* report the result */ + PG_RETURN_UINT64(((uint64) b << 32) | c); +} + #ifdef PGXC /* * compute_hash() diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index e592499e..eb524a73 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -373,7 +373,7 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) if (ffactor < 10) ffactor = 10; - procid = index_getprocid(rel, 1, HASHPROC); + procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 8542ae3a..15468e78 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -85,7 +85,7 @@ _hash_datum2hashkey(Relation rel, Datum key) Oid collation; /* XXX assumes index has only one attribute */ - procinfo = index_getprocinfo(rel, 1, HASHPROC); + procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC); collation = rel->rd_indcollation[0]; return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); @@ -108,10 +108,10 @@ _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) hash_proc = get_opfamily_proc(rel->rd_opfamily[0], keytype, keytype, - HASHPROC); + HASHSTANDARD_PROC); if (!RegProcedureIsValid(hash_proc)) elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"", - HASHPROC, keytype, keytype, + HASHSTANDARD_PROC, keytype, keytype, RelationGetRelationName(rel)); collation = rel->rd_indcollation[0]; diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c index 7e4364f8..a027d782 100644 --- a/src/backend/access/hash/hashvalidate.c +++ b/src/backend/access/hash/hashvalidate.c @@ -29,7 +29,7 @@ #include "utils/syscache.h" -static bool check_hash_func_signature(Oid funcid, Oid restype, Oid argtype); +static bool check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype); /* @@ -105,8 +105,9 @@ hashvalidate(Oid opclassoid) /* Check procedure numbers and function signatures */ switch (procform->amprocnum) { - case HASHPROC: - if (!check_hash_func_signature(procform->amproc, INT4OID, + case HASHSTANDARD_PROC: + case HASHEXTENDED_PROC: + if (!check_hash_func_signature(procform->amproc, procform->amprocnum, procform->amproclefttype)) { ereport(INFO, 
@@ -264,19 +265,37 @@ hashvalidate(Oid opclassoid) * hacks in the core hash opclass definitions. */ static bool -check_hash_func_signature(Oid funcid, Oid restype, Oid argtype) -{// #lizard forgives +check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype) +{ bool result = true; + Oid restype; + int16 nargs; HeapTuple tp; Form_pg_proc procform; + switch (amprocnum) + { + case HASHSTANDARD_PROC: + restype = INT4OID; + nargs = 1; + break; + + case HASHEXTENDED_PROC: + restype = INT8OID; + nargs = 2; + break; + + default: + elog(ERROR, "invalid amprocnum"); + } + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for function %u", funcid); procform = (Form_pg_proc) GETSTRUCT(tp); if (procform->prorettype != restype || procform->proretset || - procform->pronargs != 1) + procform->pronargs != nargs) result = false; if (!IsBinaryCoercible(argtype, procform->proargtypes.values[0])) @@ -290,24 +309,29 @@ check_hash_func_signature(Oid funcid, Oid restype, Oid argtype) * identity, not just its input type, because hashvarlena() takes * INTERNAL and allowing any such function seems too scary. */ - if (funcid == F_HASHINT4 && + if ((funcid == F_HASHINT4 || funcid == F_HASHINT4EXTENDED) && (argtype == DATEOID || argtype == ABSTIMEOID || argtype == RELTIMEOID || argtype == XIDOID || argtype == CIDOID)) /* okay, allowed use of hashint4() */ ; - else if (funcid == F_TIMESTAMP_HASH && + else if ((funcid == F_TIMESTAMP_HASH || + funcid == F_TIMESTAMP_HASH_EXTENDED) && argtype == TIMESTAMPTZOID) /* okay, allowed use of timestamp_hash() */ ; - else if (funcid == F_HASHCHAR && + else if ((funcid == F_HASHCHAR || funcid == F_HASHCHAREXTENDED) && argtype == BOOLOID) /* okay, allowed use of hashchar() */ ; - else if (funcid == F_HASHVARLENA && + else if ((funcid == F_HASHVARLENA || funcid == F_HASHVARLENAEXTENDED) && argtype == BYTEAOID) /* okay, allowed use of hashvarlena() */ ; else result = false; } + /* If function takes a second argument, it must be for a 64-bit salt. */ + if (nargs == 2 && procform->proargtypes.values[1] != INT8OID) + result = false; + ReleaseSysCache(tp); return result; } diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 80cbadb2..6e0f12b7 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -78,6 +78,7 @@ #include #include "access/genam.h" +#include "access/hash.h" #include "access/heapam.h" #include "access/nbtree.h" #include "access/htup_details.h" @@ -1189,7 +1190,8 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) /* * btree comparison procs must be 2-arg procs returning int4, while btree * sortsupport procs must take internal and return void. hash support - * procs must be 1-arg procs returning int4. Otherwise we don't know. + * proc 1 must be a 1-arg proc returning int4, while proc 2 must be a + * 2-arg proc returning int8. Otherwise we don't know. 
*/ if (amoid == BTREE_AM_OID) { @@ -1232,14 +1234,28 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) } else if (amoid == HASH_AM_OID) { + if (member->number == HASHSTANDARD_PROC) + { if (procform->pronargs != 1) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("hash procedures must have one argument"))); - if (procform->prorettype != INT4OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("hash procedures must return integer"))); + errmsg("hash procedure 1 must have one argument"))); + if (procform->prorettype != INT4OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 1 must return integer"))); + } + else if (member->number == HASHEXTENDED_PROC) + { + if (procform->pronargs != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 2 must have two arguments"))); + if (procform->prorettype != INT8OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 2 must return bigint"))); + } /* * If lefttype/righttype isn't specified, use the proc's input type diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index 7c290f3a..6c638e3c 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -16,6 +16,7 @@ #include +#include "access/hash.h" #include "access/htup_details.h" #include "catalog/catalog.h" #include "catalog/namespace.h" @@ -716,6 +717,20 @@ hash_aclitem(PG_FUNCTION_ARGS) PG_RETURN_UINT32((uint32) (a->ai_privs + a->ai_grantee + a->ai_grantor)); } +/* + * 64-bit hash function for aclitem. + * + * Similar to hash_aclitem, but accepts a seed and returns a uint64 value. + */ +Datum +hash_aclitem_extended(PG_FUNCTION_ARGS) +{ + AclItem *a = PG_GETARG_ACLITEM_P(0); + uint64 seed = PG_GETARG_INT64(1); + uint32 sum = (uint32) (a->ai_privs + a->ai_grantee + a->ai_grantor); + + return (seed == 0) ? UInt64GetDatum(sum) : hash_uint32_extended(sum, seed); +} /* * acldefault() --- create an ACL describing default access permissions diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 15b7a03c..06f20055 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -21,6 +21,7 @@ #endif #include +#include "access/hash.h" #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" @@ -4043,6 +4044,84 @@ hash_array(PG_FUNCTION_ARGS) PG_RETURN_UINT32(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_array. 
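+ * Element hashes are combined by multiplying the running result by 31 and
+ * adding the element's hash; NULL elements contribute zero, matching the
+ * combining rule of the 32-bit hash_array.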
+ */ +Datum +hash_array_extended(PG_FUNCTION_ARGS) +{ + AnyArrayType *array = PG_GETARG_ANY_ARRAY(0); + uint64 seed = PG_GETARG_INT64(1); + int ndims = AARR_NDIM(array); + int *dims = AARR_DIMS(array); + Oid element_type = AARR_ELEMTYPE(array); + uint64 result = 1; + int nitems; + TypeCacheEntry *typentry; + int typlen; + bool typbyval; + char typalign; + int i; + array_iter iter; + FunctionCallInfoData locfcinfo; + + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || + typentry->type_id != element_type) + { + typentry = lookup_type_cache(element_type, + TYPECACHE_HASH_EXTENDED_PROC_FINFO); + if (!OidIsValid(typentry->hash_extended_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify an extended hash function for type %s", + format_type_be(element_type)))); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + typlen = typentry->typlen; + typbyval = typentry->typbyval; + typalign = typentry->typalign; + + InitFunctionCallInfoData(locfcinfo, &typentry->hash_extended_proc_finfo, 2, + InvalidOid, NULL, NULL); + + /* Loop over source data */ + nitems = ArrayGetNItems(ndims, dims); + array_iter_setup(&iter, array); + + for (i = 0; i < nitems; i++) + { + Datum elt; + bool isnull; + uint64 elthash; + + /* Get element, checking for NULL */ + elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + + if (isnull) + { + elthash = 0; + } + else + { + /* Apply the hash function */ + locfcinfo.arg[0] = elt; + locfcinfo.arg[1] = seed; + locfcinfo.argnull[0] = false; + locfcinfo.argnull[1] = false; + locfcinfo.isnull = false; + elthash = DatumGetUInt64(FunctionCallInvoke(&locfcinfo)); + } + + result = (result << 5) - result + elthash; + } + + AARR_FREE_IF_COPY(array, 0); + + PG_RETURN_UINT64(result); +} + /*----------------------------------------------------------------------------- * array overlap/containment comparisons diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index ca7454de..cb29169f 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -1520,6 +1520,12 @@ time_hash(PG_FUNCTION_ARGS) return hashint8(fcinfo); } +Datum +time_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); +} + Datum time_larger(PG_FUNCTION_ARGS) { @@ -2225,6 +2231,21 @@ timetz_hash(PG_FUNCTION_ARGS) PG_RETURN_UINT32(thash); } +Datum +timetz_hash_extended(PG_FUNCTION_ARGS) +{ + TimeTzADT *key = PG_GETARG_TIMETZADT_P(0); + uint64 seed = PG_GETARG_DATUM(1); + uint64 thash; + + /* Same approach as timetz_hash */ + thash = DatumGetUInt64(DirectFunctionCall2(hashint8extended, + Int64GetDatumFast(key->time), + seed)); + thash ^= DatumGetUInt64(hash_uint32_extended(key->zone, seed)); + PG_RETURN_UINT64(thash); +} + Datum timetz_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c index 2ceaeef7..83219269 100644 --- a/src/backend/utils/adt/jsonb_op.c +++ b/src/backend/utils/adt/jsonb_op.c @@ -291,3 +291,46 @@ jsonb_hash(PG_FUNCTION_ARGS) PG_FREE_IF_COPY(jb, 0); PG_RETURN_INT32(hash); } + +Datum +jsonb_hash_extended(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB(0); + uint64 seed = PG_GETARG_INT64(1); + JsonbIterator *it; + JsonbValue v; + JsonbIteratorToken r; + uint64 hash = 0; + + if (JB_ROOT_COUNT(jb) == 0) + PG_RETURN_UINT64(seed); + + it = JsonbIteratorInit(&jb->root); + + while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + switch (r) + { + /* Rotation is left to JsonbHashScalarValueExtended() 
*/ + case WJB_BEGIN_ARRAY: + hash ^= ((UINT64CONST(JB_FARRAY) << 32) | UINT64CONST(JB_FARRAY)); + break; + case WJB_BEGIN_OBJECT: + hash ^= ((UINT64CONST(JB_FOBJECT) << 32) | UINT64CONST(JB_FOBJECT)); + break; + case WJB_KEY: + case WJB_VALUE: + case WJB_ELEM: + JsonbHashScalarValueExtended(&v, &hash, seed); + break; + case WJB_END_ARRAY: + case WJB_END_OBJECT: + break; + default: + elog(ERROR, "invalid JsonbIteratorNext rc: %d", (int) r); + } + } + + PG_FREE_IF_COPY(jb, 0); + PG_RETURN_UINT64(hash); +} diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 6bb335e0..91078189 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -1249,6 +1249,49 @@ JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash) *hash ^= tmp; } +/* + * Hash a value to a 64-bit value, with a seed. Otherwise, similar to + * JsonbHashScalarValue. + */ +void +JsonbHashScalarValueExtended(const JsonbValue *scalarVal, uint64 *hash, + uint64 seed) +{ + uint64 tmp; + + switch (scalarVal->type) + { + case jbvNull: + tmp = seed + 0x01; + break; + case jbvString: + tmp = DatumGetUInt64(hash_any_extended((const unsigned char *) scalarVal->val.string.val, + scalarVal->val.string.len, + seed)); + break; + case jbvNumeric: + tmp = DatumGetUInt64(DirectFunctionCall2(hash_numeric_extended, + NumericGetDatum(scalarVal->val.numeric), + UInt64GetDatum(seed))); + break; + case jbvBool: + if (seed) + tmp = DatumGetUInt64(DirectFunctionCall2(hashcharextended, + BoolGetDatum(scalarVal->val.boolean), + UInt64GetDatum(seed))); + else + tmp = scalarVal->val.boolean ? 0x02 : 0x04; + + break; + default: + elog(ERROR, "invalid jsonb scalar type"); + break; + } + + *hash = ROTATE_HIGH_AND_LOW_32BITS(*hash); + *hash ^= tmp; +} + /* * Are two scalar JsonbValues of the same type a and b equal? */ diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index 27819a01..7fc50865 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -271,6 +271,15 @@ hashmacaddr(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, sizeof(macaddr)); } +Datum +hashmacaddrextended(PG_FUNCTION_ARGS) +{ + macaddr *key = PG_GETARG_MACADDR_P(0); + + return hash_any_extended((unsigned char *) key, sizeof(macaddr), + PG_GETARG_INT64(1)); +} + /* * Arithmetic functions: bitwise NOT, AND, OR. */ diff --git a/src/backend/utils/adt/mac8.c b/src/backend/utils/adt/mac8.c index 0a239dc3..90be3efa 100644 --- a/src/backend/utils/adt/mac8.c +++ b/src/backend/utils/adt/mac8.c @@ -407,6 +407,15 @@ hashmacaddr8(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, sizeof(macaddr8)); } +Datum +hashmacaddr8extended(PG_FUNCTION_ARGS) +{ + macaddr8 *key = PG_GETARG_MACADDR8_P(0); + + return hash_any_extended((unsigned char *) key, sizeof(macaddr8), + PG_GETARG_INT64(1)); +} + /* * Arithmetic functions: bitwise NOT, AND, OR. */ diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 1514f39e..f0f339bc 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -486,6 +486,16 @@ hashinet(PG_FUNCTION_ARGS) return hash_any((unsigned char *) VARDATA_ANY(addr), addrsize + 2); } +Datum +hashinetextended(PG_FUNCTION_ARGS) +{ + inet *addr = PG_GETARG_INET_PP(0); + int addrsize = ip_addrsize(addr); + + return hash_any_extended((unsigned char *) VARDATA_ANY(addr), addrsize + 2, + PG_GETARG_INT64(1)); +} + /* * Boolean network-inclusion tests. 
*/ diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index b6aad0ae..d159c430 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2230,6 +2230,66 @@ hash_numeric(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_numeric. + */ +Datum +hash_numeric_extended(PG_FUNCTION_ARGS) +{ + Numeric key = PG_GETARG_NUMERIC(0); + uint64 seed = PG_GETARG_INT64(1); + Datum digit_hash; + Datum result; + int weight; + int start_offset; + int end_offset; + int i; + int hash_len; + NumericDigit *digits; + + if (NUMERIC_IS_NAN(key)) + PG_RETURN_UINT64(seed); + + weight = NUMERIC_WEIGHT(key); + start_offset = 0; + end_offset = 0; + + digits = NUMERIC_DIGITS(key); + for (i = 0; i < NUMERIC_NDIGITS(key); i++) + { + if (digits[i] != (NumericDigit) 0) + break; + + start_offset++; + + weight--; + } + + if (NUMERIC_NDIGITS(key) == start_offset) + PG_RETURN_UINT64(seed - 1); + + for (i = NUMERIC_NDIGITS(key) - 1; i >= 0; i--) + { + if (digits[i] != (NumericDigit) 0) + break; + + end_offset++; + } + + Assert(start_offset + end_offset < NUMERIC_NDIGITS(key)); + + hash_len = NUMERIC_NDIGITS(key) - start_offset - end_offset; + digit_hash = hash_any_extended((unsigned char *) (NUMERIC_DIGITS(key) + + start_offset), + hash_len * sizeof(NumericDigit), + seed); + + result = digit_hash ^ weight; + + PG_RETURN_DATUM(result); +} + /* ---------------------------------------------------------------------- * diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index abdd3eed..7ad30a26 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * pg_lsn.c - * Operations for the pg_lsn datatype. + * Operations for the pg_lsn datatype. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/adt/pg_lsn.c + * src/backend/utils/adt/pg_lsn.c * *------------------------------------------------------------------------- */ @@ -19,8 +19,8 @@ #include "utils/builtins.h" #include "utils/pg_lsn.h" -#define MAXPG_LSNLEN 17 -#define MAXPG_LSNCOMPONENT 8 +#define MAXPG_LSNLEN 17 +#define MAXPG_LSNCOMPONENT 8 /*---------------------------------------------------------- * Formatting and conversion routines. @@ -29,180 +29,186 @@ Datum pg_lsn_in(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); - int len1, - len2; - uint32 id, - off; - XLogRecPtr result; - - /* Sanity check input format. */ - len1 = strspn(str, "0123456789abcdefABCDEF"); - if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); - len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); - if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); - - /* Decode result. */ - id = (uint32) strtoul(str, NULL, 16); - off = (uint32) strtoul(str + len1 + 1, NULL, 16); - result = ((uint64) id << 32) | off; - - PG_RETURN_LSN(result); + char *str = PG_GETARG_CSTRING(0); + int len1, + len2; + uint32 id, + off; + XLogRecPtr result; + + /* Sanity check input format. 
*/ + len1 = strspn(str, "0123456789abcdefABCDEF"); + if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); + if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); + + /* Decode result. */ + id = (uint32) strtoul(str, NULL, 16); + off = (uint32) strtoul(str + len1 + 1, NULL, 16); + result = ((uint64) id << 32) | off; + + PG_RETURN_LSN(result); } Datum pg_lsn_out(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); - char buf[MAXPG_LSNLEN + 1]; - char *result; - uint32 id, - off; - - /* Decode ID and offset */ - id = (uint32) (lsn >> 32); - off = (uint32) lsn; - - snprintf(buf, sizeof buf, "%X/%X", id, off); - result = pstrdup(buf); - PG_RETURN_CSTRING(result); + XLogRecPtr lsn = PG_GETARG_LSN(0); + char buf[MAXPG_LSNLEN + 1]; + char *result; + uint32 id, + off; + + /* Decode ID and offset */ + id = (uint32) (lsn >> 32); + off = (uint32) lsn; + + snprintf(buf, sizeof buf, "%X/%X", id, off); + result = pstrdup(buf); + PG_RETURN_CSTRING(result); } Datum pg_lsn_recv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - XLogRecPtr result; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + XLogRecPtr result; - result = pq_getmsgint64(buf); - PG_RETURN_LSN(result); + result = pq_getmsgint64(buf); + PG_RETURN_LSN(result); } Datum pg_lsn_send(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); - StringInfoData buf; + XLogRecPtr lsn = PG_GETARG_LSN(0); + StringInfoData buf; - pq_begintypsend(&buf); - pq_sendint64(&buf, lsn); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + pq_begintypsend(&buf); + pq_sendint64(&buf, lsn); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /*---------------------------------------------------------- - * Operators for PostgreSQL LSNs + * Operators for PostgreSQL LSNs *---------------------------------------------------------*/ Datum pg_lsn_eq(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 == lsn2); + PG_RETURN_BOOL(lsn1 == lsn2); } Datum pg_lsn_ne(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 != lsn2); + PG_RETURN_BOOL(lsn1 != lsn2); } Datum pg_lsn_lt(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 < lsn2); + PG_RETURN_BOOL(lsn1 < lsn2); } Datum pg_lsn_gt(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 > lsn2); + PG_RETURN_BOOL(lsn1 > lsn2); } Datum pg_lsn_le(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 <= lsn2); + PG_RETURN_BOOL(lsn1 <= lsn2); } Datum pg_lsn_ge(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + 
XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 >= lsn2); + PG_RETURN_BOOL(lsn1 >= lsn2); } /* btree index opclass support */ Datum pg_lsn_cmp(PG_FUNCTION_ARGS) { - XLogRecPtr a = PG_GETARG_LSN(0); - XLogRecPtr b = PG_GETARG_LSN(1); - - if (a > b) - PG_RETURN_INT32(1); - else if (a == b) - PG_RETURN_INT32(0); - else - PG_RETURN_INT32(-1); + XLogRecPtr a = PG_GETARG_LSN(0); + XLogRecPtr b = PG_GETARG_LSN(1); + + if (a > b) + PG_RETURN_INT32(1); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(-1); } /* hash index opclass support */ Datum pg_lsn_hash(PG_FUNCTION_ARGS) { - /* We can use hashint8 directly */ - return hashint8(fcinfo); + /* We can use hashint8 directly */ + return hashint8(fcinfo); +} + +Datum +pg_lsn_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); } /*---------------------------------------------------------- - * Arithmetic operators on PostgreSQL LSNs. + * Arithmetic operators on PostgreSQL LSNs. *---------------------------------------------------------*/ Datum pg_lsn_mi(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); - char buf[256]; - Datum result; - - /* Output could be as large as plus or minus 2^63 - 1. */ - if (lsn1 < lsn2) - snprintf(buf, sizeof buf, "-" UINT64_FORMAT, lsn2 - lsn1); - else - snprintf(buf, sizeof buf, UINT64_FORMAT, lsn1 - lsn2); - - /* Convert to numeric. */ - result = DirectFunctionCall3(numeric_in, - CStringGetDatum(buf), - ObjectIdGetDatum(0), - Int32GetDatum(-1)); - - return result; + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); + char buf[256]; + Datum result; + + /* Output could be as large as plus or minus 2^63 - 1. */ + if (lsn1 < lsn2) + snprintf(buf, sizeof buf, "-" UINT64_FORMAT, lsn2 - lsn1); + else + snprintf(buf, sizeof buf, UINT64_FORMAT, lsn1 - lsn2); + + /* Convert to numeric. */ + result = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + + return result; } diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c index c013179b..166ebf9f 100644 --- a/src/backend/utils/adt/rangetypes.c +++ b/src/backend/utils/adt/rangetypes.c @@ -1280,6 +1280,69 @@ hash_range(PG_FUNCTION_ARGS) PG_RETURN_INT32(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_range. 
+ */ +Datum +hash_range_extended(PG_FUNCTION_ARGS) +{ + RangeType *r = PG_GETARG_RANGE(0); + uint64 seed = PG_GETARG_INT64(1); + uint64 result; + TypeCacheEntry *typcache; + TypeCacheEntry *scache; + RangeBound lower; + RangeBound upper; + bool empty; + char flags; + uint64 lower_hash; + uint64 upper_hash; + + check_stack_depth(); + + typcache = range_get_typcache(fcinfo, RangeTypeGetOid(r)); + + range_deserialize(typcache, r, &lower, &upper, &empty); + flags = range_get_flags(r); + + scache = typcache->rngelemtype; + if (!OidIsValid(scache->hash_extended_proc_finfo.fn_oid)) + { + scache = lookup_type_cache(scache->type_id, + TYPECACHE_HASH_EXTENDED_PROC_FINFO); + if (!OidIsValid(scache->hash_extended_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a hash function for type %s", + format_type_be(scache->type_id)))); + } + + if (RANGE_HAS_LBOUND(flags)) + lower_hash = DatumGetUInt64(FunctionCall2Coll(&scache->hash_extended_proc_finfo, + typcache->rng_collation, + lower.val, + seed)); + else + lower_hash = 0; + + if (RANGE_HAS_UBOUND(flags)) + upper_hash = DatumGetUInt64(FunctionCall2Coll(&scache->hash_extended_proc_finfo, + typcache->rng_collation, + upper.val, + seed)); + else + upper_hash = 0; + + /* Merge hashes of flags and bounds */ + result = hash_uint32_extended((uint32) flags, seed); + result ^= lower_hash; + result = ROTATE_HIGH_AND_LOW_32BITS(result); + result ^= upper_hash; + + PG_RETURN_UINT64(result); +} + /* *---------------------------------------------------------- * CANONICAL FUNCTIONS diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 8fc9b6b3..70e1125e 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -2141,6 +2141,11 @@ timestamp_hash(PG_FUNCTION_ARGS) return hashint8(fcinfo); } +Datum +timestamp_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); +} /* * Cross-type comparison functions for timestamp vs timestamptz @@ -2447,6 +2452,20 @@ interval_hash(PG_FUNCTION_ARGS) return DirectFunctionCall1(hashint8, Int64GetDatumFast(span64)); } +Datum +interval_hash_extended(PG_FUNCTION_ARGS) +{ + Interval *interval = PG_GETARG_INTERVAL_P(0); + INT128 span = interval_cmp_value(interval); + int64 span64; + + /* Same approach as interval_hash */ + span64 = int128_to_int64(span); + + return DirectFunctionCall2(hashint8extended, Int64GetDatumFast(span64), + PG_GETARG_DATUM(1)); +} + /* overlaps_timestamp() --- implements the SQL OVERLAPS operator. * * Algorithm is per SQL spec. 
This is much harder than you'd think diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 3cf4fa70..035cd44d 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -408,3 +408,11 @@ uuid_hash(PG_FUNCTION_ARGS) return hash_any(key->data, UUID_LEN); } + +Datum +uuid_hash_extended(PG_FUNCTION_ARGS) +{ + pg_uuid_t *key = PG_GETARG_UUID_P(0); + + return hash_any_extended(key->data, UUID_LEN, PG_GETARG_INT64(1)); +} diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 329c7d4b..c60b452b 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1007,6 +1007,24 @@ hashbpchar(PG_FUNCTION_ARGS) return result; } +Datum +hashbpcharextended(PG_FUNCTION_ARGS) +{ + BpChar *key = PG_GETARG_BPCHAR_PP(0); + char *keydata; + int keylen; + Datum result; + + keydata = VARDATA_ANY(key); + keylen = bcTruelen(key); + + result = hash_any_extended((unsigned char *) keydata, keylen, + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} /* * The following operators support character-by-character comparison diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 5f55c35d..d8a59308 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -499,8 +499,8 @@ get_compatible_hash_operators(Oid opno, /* * get_op_hash_functions - * Get the OID(s) of hash support function(s) compatible with the given - * operator, operating on its LHS and/or RHS datatype as required. + * Get the OID(s) of the standard hash support function(s) compatible with + * the given operator, operating on its LHS and/or RHS datatype as required. * * A function for the LHS type is sought and returned into *lhs_procno if * lhs_procno isn't NULL. Similarly, a function for the RHS type is sought @@ -551,7 +551,7 @@ get_op_hash_functions(Oid opno, *lhs_procno = get_opfamily_proc(aform->amopfamily, aform->amoplefttype, aform->amoplefttype, - HASHPROC); + HASHSTANDARD_PROC); if (!OidIsValid(*lhs_procno)) continue; /* Matching LHS found, done if caller doesn't want RHS */ @@ -573,7 +573,7 @@ get_op_hash_functions(Oid opno, *rhs_procno = get_opfamily_proc(aform->amopfamily, aform->amoprighttype, aform->amoprighttype, - HASHPROC); + HASHSTANDARD_PROC); if (!OidIsValid(*rhs_procno)) { /* Forget any LHS function from this opfamily */ diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index 60787238..c9579be4 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -90,6 +90,7 @@ static TypeCacheEntry *firstDomainTypeEntry = NULL; #define TCFLAGS_HAVE_FIELD_EQUALITY 0x1000 #define TCFLAGS_HAVE_FIELD_COMPARE 0x2000 #define TCFLAGS_CHECKED_DOMAIN_CONSTRAINTS 0x4000 +#define TCFLAGS_CHECKED_HASH_EXTENDED_PROC 0x8000 /* * Data stored about a domain type's constraints. Note that we do not create @@ -314,6 +315,8 @@ lookup_type_cache(Oid type_id, int flags) flags |= TYPECACHE_HASH_OPFAMILY; if ((flags & (TYPECACHE_HASH_PROC | TYPECACHE_HASH_PROC_FINFO | + TYPECACHE_HASH_EXTENDED_PROC | + TYPECACHE_HASH_EXTENDED_PROC_FINFO | TYPECACHE_HASH_OPFAMILY)) && !(typentry->flags & TCFLAGS_CHECKED_HASH_OPCLASS)) { @@ -336,6 +339,7 @@ lookup_type_cache(Oid type_id, int flags) * decision is still good. 
*/ typentry->flags &= ~(TCFLAGS_CHECKED_HASH_PROC); + typentry->flags &= ~(TCFLAGS_CHECKED_HASH_EXTENDED_PROC); typentry->flags |= TCFLAGS_CHECKED_HASH_OPCLASS; } @@ -379,11 +383,12 @@ lookup_type_cache(Oid type_id, int flags) typentry->eq_opr = eq_opr; /* - * Reset info about hash function whenever we pick up new info about - * equality operator. This is so we can ensure that the hash function - * matches the operator. + * Reset info about hash functions whenever we pick up new info about + * equality operator. This is so we can ensure that the hash functions + * match the operator. */ typentry->flags &= ~(TCFLAGS_CHECKED_HASH_PROC); + typentry->flags &= ~(TCFLAGS_CHECKED_HASH_EXTENDED_PROC); typentry->flags |= TCFLAGS_CHECKED_EQ_OPR; } if ((flags & TYPECACHE_LT_OPR) && @@ -474,7 +479,7 @@ lookup_type_cache(Oid type_id, int flags) hash_proc = get_opfamily_proc(typentry->hash_opf, typentry->hash_opintype, typentry->hash_opintype, - HASHPROC); + HASHSTANDARD_PROC); /* * As above, make sure hash_array will succeed. We don't currently @@ -492,6 +497,43 @@ lookup_type_cache(Oid type_id, int flags) typentry->hash_proc = hash_proc; typentry->flags |= TCFLAGS_CHECKED_HASH_PROC; } + if ((flags & (TYPECACHE_HASH_EXTENDED_PROC | + TYPECACHE_HASH_EXTENDED_PROC_FINFO)) && + !(typentry->flags & TCFLAGS_CHECKED_HASH_EXTENDED_PROC)) + { + Oid hash_extended_proc = InvalidOid; + + /* + * We insist that the eq_opr, if one has been determined, match the + * hash opclass; else report there is no hash function. + */ + if (typentry->hash_opf != InvalidOid && + (!OidIsValid(typentry->eq_opr) || + typentry->eq_opr == get_opfamily_member(typentry->hash_opf, + typentry->hash_opintype, + typentry->hash_opintype, + HTEqualStrategyNumber))) + hash_extended_proc = get_opfamily_proc(typentry->hash_opf, + typentry->hash_opintype, + typentry->hash_opintype, + HASHEXTENDED_PROC); + + /* + * As above, make sure hash_array_extended will succeed. We don't + * currently support hashing for composite types, but when we do, + * we'll need more logic here to check that case too. + */ + if (hash_extended_proc == F_HASH_ARRAY_EXTENDED && + !array_element_has_hashing(typentry)) + hash_extended_proc = InvalidOid; + + /* Force update of hash_proc_finfo only if we're changing state */ + if (typentry->hash_extended_proc != hash_extended_proc) + typentry->hash_extended_proc_finfo.fn_oid = InvalidOid; + + typentry->hash_extended_proc = hash_extended_proc; + typentry->flags |= TCFLAGS_CHECKED_HASH_EXTENDED_PROC; + } /* * Set up fmgr lookup info as requested @@ -530,6 +572,14 @@ lookup_type_cache(Oid type_id, int flags) fmgr_info_cxt(typentry->hash_proc, &typentry->hash_proc_finfo, CacheMemoryContext); } + if ((flags & TYPECACHE_HASH_EXTENDED_PROC_FINFO) && + typentry->hash_extended_proc_finfo.fn_oid == InvalidOid && + typentry->hash_extended_proc != InvalidOid) + { + fmgr_info_cxt(typentry->hash_extended_proc, + &typentry->hash_extended_proc_finfo, + CacheMemoryContext); + } /* * If it's a composite type (row type), get tupdesc if requested diff --git a/src/include/access/hash.h b/src/include/access/hash.h index a461a8fe..96cfcc34 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -38,6 +38,17 @@ typedef uint32 Bucket; #define BUCKET_TO_BLKNO(metap,B) \ ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) +/* + * Rotate the high 32 bits and the low 32 bits separately. The standard + * hash function sometimes rotates the low 32 bits by one bit when + * combining elements. 
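(Here each 32-bit half is rotated left by one within itself, so bit 31 wraps to bit 0 and bit 63 wraps to bit 32.)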
We want extended hash functions to be compatible with + * that algorithm when the seed is 0, so we can't just do a normal rotation. + * This works, though. + */ +#define ROTATE_HIGH_AND_LOW_32BITS(v) \ + ((((v) << 1) & UINT64CONST(0xfffffffefffffffe)) | \ + (((v) >> 31) & UINT64CONST(0x100000001))) + /* * Special space for hash index pages. * @@ -290,11 +301,19 @@ typedef HashMetaPageData *HashMetaPage; /* * When a new operator class is declared, we require that the user supply - * us with an amproc procudure for hashing a key of the new type. - * Since we only have one such proc in amproc, it's number 1. + * us with an amproc procudure for hashing a key of the new type, returning + * a 32-bit hash value. We call this the "standard" hash procedure. We + * also allow an optional "extended" hash procedure which accepts a salt and + * returns a 64-bit hash value. This is highly recommended but, for reasons + * of backward compatibility, optional. + * + * When the salt is 0, the low 32 bits of the value returned by the extended + * hash procedure should match the value that would have been returned by the + * standard hash procedure. */ -#define HASHPROC 1 -#define HASHNProcs 1 +#define HASHSTANDARD_PROC 1 +#define HASHEXTENDED_PROC 2 +#define HASHNProcs 2 /* public routines */ @@ -322,7 +341,10 @@ extern bytea *hashoptions(Datum reloptions, bool validate); extern bool hashvalidate(Oid opclassoid); extern Datum hash_any(register const unsigned char *k, register int keylen); +extern Datum hash_any_extended(register const unsigned char *k, + register int keylen, uint64 seed); extern Datum hash_uint32(uint32 k); +extern Datum hash_uint32_extended(uint32 k, uint64 seed); /* private routines */ diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index b6d88568..b239bbec 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -153,42 +153,77 @@ DATA(insert ( 4033 3802 3802 1 4044 )); /* hash */ DATA(insert ( 427 1042 1042 1 1080 )); +DATA(insert ( 427 1042 1042 2 972 )); DATA(insert ( 431 18 18 1 454 )); +DATA(insert ( 431 18 18 2 446 )); DATA(insert ( 435 1082 1082 1 450 )); +DATA(insert ( 435 1082 1082 2 425 )); DATA(insert ( 627 2277 2277 1 626 )); +DATA(insert ( 627 2277 2277 2 782 )); DATA(insert ( 1971 700 700 1 451 )); +DATA(insert ( 1971 700 700 2 443 )); DATA(insert ( 1971 701 701 1 452 )); +DATA(insert ( 1971 701 701 2 444 )); DATA(insert ( 1975 869 869 1 422 )); +DATA(insert ( 1975 869 869 2 779 )); DATA(insert ( 1977 21 21 1 449 )); +DATA(insert ( 1977 21 21 2 441 )); DATA(insert ( 1977 23 23 1 450 )); +DATA(insert ( 1977 23 23 2 425 )); DATA(insert ( 1977 20 20 1 949 )); +DATA(insert ( 1977 20 20 2 442 )); DATA(insert ( 1983 1186 1186 1 1697 )); +DATA(insert ( 1983 1186 1186 2 3418 )); DATA(insert ( 1985 829 829 1 399 )); +DATA(insert ( 1985 829 829 2 778 )); DATA(insert ( 1987 19 19 1 455 )); +DATA(insert ( 1987 19 19 2 447 )); DATA(insert ( 1990 26 26 1 453 )); +DATA(insert ( 1990 26 26 2 445 )); DATA(insert ( 1992 30 30 1 457 )); +DATA(insert ( 1992 30 30 2 776 )); DATA(insert ( 1995 25 25 1 400 )); +DATA(insert ( 1995 25 25 2 448)); DATA(insert ( 1997 1083 1083 1 1688 )); +DATA(insert ( 1997 1083 1083 2 3409 )); DATA(insert ( 1998 1700 1700 1 432 )); +DATA(insert ( 1998 1700 1700 2 780 )); DATA(insert ( 1999 1184 1184 1 2039 )); +DATA(insert ( 1999 1184 1184 2 3411 )); DATA(insert ( 2001 1266 1266 1 1696 )); +DATA(insert ( 2001 1266 1266 2 3410 )); DATA(insert ( 2040 1114 1114 1 2039 )); +DATA(insert ( 2040 1114 1114 2 3411 
)); DATA(insert ( 2222 16 16 1 454 )); +DATA(insert ( 2222 16 16 2 446 )); DATA(insert ( 2223 17 17 1 456 )); +DATA(insert ( 2223 17 17 2 772 )); DATA(insert ( 2225 28 28 1 450 )); +DATA(insert ( 2225 28 28 2 425)); DATA(insert ( 2226 29 29 1 450 )); +DATA(insert ( 2226 29 29 2 425 )); DATA(insert ( 2227 702 702 1 450 )); +DATA(insert ( 2227 702 702 2 425 )); DATA(insert ( 2228 703 703 1 450 )); +DATA(insert ( 2228 703 703 2 425 )); DATA(insert ( 2229 25 25 1 400 )); +DATA(insert ( 2229 25 25 2 448 )); DATA(insert ( 2231 1042 1042 1 1080 )); +DATA(insert ( 2231 1042 1042 2 972 )); DATA(insert ( 2235 1033 1033 1 329 )); +DATA(insert ( 2235 1033 1033 2 777 )); DATA(insert ( 2969 2950 2950 1 2963 )); +DATA(insert ( 2969 2950 2950 2 3412 )); DATA(insert ( 3254 3220 3220 1 3252 )); +DATA(insert ( 3254 3220 3220 2 3413 )); DATA(insert ( 3372 774 774 1 328 )); +DATA(insert ( 3372 774 774 2 781 )); DATA(insert ( 3523 3500 3500 1 3515 )); +DATA(insert ( 3523 3500 3500 2 3414 )); DATA(insert ( 3903 3831 3831 1 3902 )); +DATA(insert ( 3903 3831 3831 2 3417 )); DATA(insert ( 4034 3802 3802 1 4045 )); - +DATA(insert ( 4034 3802 3802 2 3416)); /* gist */ DATA(insert ( 1029 600 600 1 2179 )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1f40ef5e..e5bcf8ae 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -770,6 +770,38 @@ DATA(insert OID = 432 ( hash_numeric PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 328 ( hashmacaddr8 PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "774" _null_ _null_ _null_ _null_ _null_ hashmacaddr8 _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4660 ( hashint2extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "21 20" _null_ _null_ _null_ _null_ _null_ hashint2extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4661 ( hashint4extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "23 20" _null_ _null_ _null_ _null_ _null_ hashint4extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4662 ( hashint8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "20 20" _null_ _null_ _null_ _null_ _null_ hashint8extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4663 ( hashfloat4extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "700 20" _null_ _null_ _null_ _null_ _null_ hashfloat4extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4664 ( hashfloat8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "701 20" _null_ _null_ _null_ _null_ _null_ hashfloat8extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4665 ( hashoidextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "26 20" _null_ _null_ _null_ _null_ _null_ hashoidextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4666 ( hashcharextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "18 20" _null_ _null_ _null_ _null_ _null_ hashcharextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4667 ( hashnameextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "19 20" _null_ _null_ _null_ _null_ _null_ hashnameextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4668 ( hashtextextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "25 20" _null_ _null_ _null_ _null_ _null_ hashtextextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4669 ( hashvarlenaextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2281 20" _null_ _null_ _null_ _null_ _null_ hashvarlenaextended _null_ _null_ 
_null_ )); +DESCR("hash"); +DATA(insert OID = 4670 ( hashoidvectorextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "30 20" _null_ _null_ _null_ _null_ _null_ hashoidvectorextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4671 ( hash_aclitem_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1033 20" _null_ _null_ _null_ _null_ _null_ hash_aclitem_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4672 ( hashmacaddrextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "829 20" _null_ _null_ _null_ _null_ _null_ hashmacaddrextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4673 ( hashinetextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "869 20" _null_ _null_ _null_ _null_ _null_ hashinetextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4674 ( hash_numeric_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1700 20" _null_ _null_ _null_ _null_ _null_ hash_numeric_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4675 ( hashmacaddr8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "774 20" _null_ _null_ _null_ _null_ _null_ hashmacaddr8extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 438 ( num_nulls PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 1 0 23 "2276" "{2276}" "{v}" _null_ _null_ _null_ pg_num_nulls _null_ _null_ _null_ )); DESCR("count the number of NULL arguments"); @@ -819,6 +851,8 @@ DESCR("convert float8 to int8"); DATA(insert OID = 626 ( hash_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "2277" _null_ _null_ _null_ _null_ _null_ hash_array _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4686 ( hash_array_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2277 20" _null_ _null_ _null_ _null_ _null_ hash_array_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 652 ( float4 PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 700 "20" _null_ _null_ _null_ _null_ _null_ i8tof _null_ _null_ _null_ )); DESCR("convert int8 to float4"); @@ -1269,6 +1303,8 @@ DATA(insert OID = 1080 ( hashbpchar PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 1081 ( format_type PGNSP PGUID 12 1 0 0 0 f f f f f f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ format_type _null_ _null_ _null_ )); DESCR("format a type oid and atttypmod to canonical SQL"); +DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 1084 ( date_in PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 1082 "2275" _null_ _null_ _null_ _null_ _null_ date_in _null_ _null_ _null_ )); DESCR("I/O"); DATA(insert OID = 1085 ( date_out PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2275 "1082" _null_ _null_ _null_ _null_ _null_ date_out _null_ _null_ _null_ )); @@ -2412,6 +2448,12 @@ DATA(insert OID = 1696 ( timetz_hash PGNSP PGUID 12 1 0 0 0 f f f f t f DESCR("hash"); DATA(insert OID = 1697 ( interval_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "1186" _null_ _null_ _null_ _null_ _null_ interval_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4677 ( time_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1083 20" _null_ _null_ _null_ _null_ _null_ time_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4678 ( timetz_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1266 20" _null_ _null_ _null_ _null_ _null_ timetz_hash_extended _null_ _null_ 
_null_ )); +DESCR("hash"); +DATA(insert OID = 4679 ( interval_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1186 20" _null_ _null_ _null_ _null_ _null_ interval_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* OID's 1700 - 1799 NUMERIC data type */ @@ -3219,6 +3261,8 @@ DATA(insert OID = 2039 ( timestamp_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 2041 ( overlaps PGNSP PGUID 12 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1114" _null_ _null_ _null_ _null_ _null_ overlaps_timestamp _null_ _null_ _null_ )); DESCR("intervals overlap?"); +DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 2042 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1186 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, ($1 + $2)) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); DESCR("intervals overlap?"); DATA(insert OID = 2043 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, $2) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); @@ -4691,6 +4735,8 @@ DATA(insert OID = 2962 ( uuid_send PGNSP PGUID 12 1 0 0 0 f f f f t f DESCR("I/O"); DATA(insert OID = 2963 ( uuid_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "2950" _null_ _null_ _null_ _null_ _null_ uuid_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4681 ( uuid_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2950 20" _null_ _null_ _null_ _null_ _null_ uuid_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* pg_lsn */ DATA(insert OID = 3229 ( pg_lsn_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3220 "2275" _null_ _null_ _null_ _null_ _null_ pg_lsn_in _null_ _null_ _null_ )); @@ -4712,6 +4758,8 @@ DATA(insert OID = 3251 ( pg_lsn_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 DESCR("less-equal-greater"); DATA(insert OID = 3252 ( pg_lsn_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3220" _null_ _null_ _null_ _null_ _null_ pg_lsn_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4682 ( pg_lsn_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3220 20" _null_ _null_ _null_ _null_ _null_ pg_lsn_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* enum related procs */ DATA(insert OID = 3504 ( anyenum_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3500 "2275" _null_ _null_ _null_ _null_ _null_ anyenum_in _null_ _null_ _null_ )); @@ -4734,6 +4782,8 @@ DATA(insert OID = 3515 ( hashenum PGNSP PGUID 12 1 0 0 0 f f f f t f i s DESCR("hash"); DATA(insert OID = 3524 ( enum_smaller PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_smaller _null_ _null_ _null_ )); DESCR("smaller of two"); +DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 3525 ( enum_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_larger _null_ _null_ _null_ )); DESCR("larger of two"); DATA(insert OID = 3526 ( max PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 3500 "3500" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ )); @@ -5129,6 +5179,8 @@ DATA(insert OID = 4044 ( jsonb_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f 
DESCR("less-equal-greater"); DATA(insert OID = 4045 ( jsonb_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4684 ( jsonb_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3802 20" _null_ _null_ _null_ _null_ _null_ jsonb_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 4046 ( jsonb_contains PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 3802" _null_ _null_ _null_ _null_ _null_ jsonb_contains _null_ _null_ _null_ )); DATA(insert OID = 4047 ( jsonb_exists PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 25" _null_ _null_ _null_ _null_ _null_ jsonb_exists _null_ _null_ _null_ )); DATA(insert OID = 4048 ( jsonb_exists_any PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 1009" _null_ _null_ _null_ _null_ _null_ jsonb_exists_any _null_ _null_ _null_ )); @@ -5319,6 +5371,8 @@ DATA(insert OID = 3881 ( range_gist_same PGNSP PGUID 12 1 0 0 0 f f f f DESCR("GiST support"); DATA(insert OID = 3902 ( hash_range PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3831" _null_ _null_ _null_ _null_ _null_ hash_range _null_ _null_ _null_ )); DESCR("hash a range"); +DATA(insert OID = 4685 ( hash_range_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3831 20" _null_ _null_ _null_ _null_ _null_ hash_range_extended _null_ _null_ _null_ )); +DESCR("hash a range"); DATA(insert OID = 3916 ( range_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 16 "2281" _null_ _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ )); DESCR("range typanalyze"); DATA(insert OID = 3169 ( rangesel PGNSP PGUID 12 1 0 0 0 f f f f t f s s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ )); diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 5c54e2be..75f9b34b 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -325,6 +325,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); #define PG_RETURN_FLOAT4(x) return Float4GetDatum(x) #define PG_RETURN_FLOAT8(x) return Float8GetDatum(x) #define PG_RETURN_INT64(x) return Int64GetDatum(x) +#define PG_RETURN_UINT64(x) return UInt64GetDatum(x) /* RETURN macros for other pass-by-ref types will typically look like this: */ #define PG_RETURN_BYTEA_P(x) PG_RETURN_POINTER(x) #define PG_RETURN_TEXT_P(x) PG_RETURN_POINTER(x) diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h index 9b07ee9b..24f49166 100644 --- a/src/include/utils/jsonb.h +++ b/src/include/utils/jsonb.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * jsonb.h - * Declarations for jsonb data type support. + * Declarations for jsonb data type support. 
* * Copyright (c) 1996-2017, PostgreSQL Global Development Group * @@ -19,21 +19,21 @@ /* Tokens used when sequentially processing a jsonb value */ typedef enum { - WJB_DONE, - WJB_KEY, - WJB_VALUE, - WJB_ELEM, - WJB_BEGIN_ARRAY, - WJB_END_ARRAY, - WJB_BEGIN_OBJECT, - WJB_END_OBJECT + WJB_DONE, + WJB_KEY, + WJB_VALUE, + WJB_ELEM, + WJB_BEGIN_ARRAY, + WJB_END_ARRAY, + WJB_BEGIN_OBJECT, + WJB_END_OBJECT } JsonbIteratorToken; /* Strategy numbers for GIN index opclasses */ -#define JsonbContainsStrategyNumber 7 -#define JsonbExistsStrategyNumber 9 -#define JsonbExistsAnyStrategyNumber 10 -#define JsonbExistsAllStrategyNumber 11 +#define JsonbContainsStrategyNumber 7 +#define JsonbExistsStrategyNumber 9 +#define JsonbExistsAnyStrategyNumber 10 +#define JsonbExistsAllStrategyNumber 11 /* * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both @@ -56,19 +56,19 @@ typedef enum * matches against the heap tuple; currently, this costs nothing because we * must always recheck for other reasons. */ -#define JGINFLAG_KEY 0x01 /* key (or string array element) */ -#define JGINFLAG_NULL 0x02 /* null value */ -#define JGINFLAG_BOOL 0x03 /* boolean value */ -#define JGINFLAG_NUM 0x04 /* numeric value */ -#define JGINFLAG_STR 0x05 /* string value (if not an array element) */ -#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ -#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ +#define JGINFLAG_KEY 0x01 /* key (or string array element) */ +#define JGINFLAG_NULL 0x02 /* null value */ +#define JGINFLAG_BOOL 0x03 /* boolean value */ +#define JGINFLAG_NUM 0x04 /* numeric value */ +#define JGINFLAG_STR 0x05 /* string value (if not an array element) */ +#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ +#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ /* Convenience macros */ -#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d)) -#define JsonbGetDatum(p) PointerGetDatum(p) -#define PG_GETARG_JSONB(x) DatumGetJsonb(PG_GETARG_DATUM(x)) -#define PG_RETURN_JSONB(x) PG_RETURN_POINTER(x) +#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d)) +#define JsonbGetDatum(p) PointerGetDatum(p) +#define PG_GETARG_JSONB(x) DatumGetJsonb(PG_GETARG_DATUM(x)) +#define PG_RETURN_JSONB(x) PG_RETURN_POINTER(x) typedef struct JsonbPair JsonbPair; typedef struct JsonbValue JsonbValue; @@ -138,38 +138,38 @@ typedef struct JsonbValue JsonbValue; */ typedef uint32 JEntry; -#define JENTRY_OFFLENMASK 0x0FFFFFFF -#define JENTRY_TYPEMASK 0x70000000 -#define JENTRY_HAS_OFF 0x80000000 +#define JENTRY_OFFLENMASK 0x0FFFFFFF +#define JENTRY_TYPEMASK 0x70000000 +#define JENTRY_HAS_OFF 0x80000000 /* values stored in the type bits */ -#define JENTRY_ISSTRING 0x00000000 -#define JENTRY_ISNUMERIC 0x10000000 -#define JENTRY_ISBOOL_FALSE 0x20000000 -#define JENTRY_ISBOOL_TRUE 0x30000000 -#define JENTRY_ISNULL 0x40000000 -#define JENTRY_ISCONTAINER 0x50000000 /* array or object */ +#define JENTRY_ISSTRING 0x00000000 +#define JENTRY_ISNUMERIC 0x10000000 +#define JENTRY_ISBOOL_FALSE 0x20000000 +#define JENTRY_ISBOOL_TRUE 0x30000000 +#define JENTRY_ISNULL 0x40000000 +#define JENTRY_ISCONTAINER 0x50000000 /* array or object */ /* Access macros. 
Note possible multiple evaluations */ -#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) -#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) -#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) -#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) -#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) -#define JBE_ISNULL(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL) -#define JBE_ISBOOL_TRUE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE) -#define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) -#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) +#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) +#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) +#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) +#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) +#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) +#define JBE_ISNULL(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL) +#define JBE_ISBOOL_TRUE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE) +#define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) +#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) /* Macro for advancing an offset variable to the next JEntry */ #define JBE_ADVANCE_OFFSET(offset, je) \ - do { \ - JEntry je_ = (je); \ - if (JBE_HAS_OFF(je_)) \ - (offset) = JBE_OFFLENFLD(je_); \ - else \ - (offset) += JBE_OFFLENFLD(je_); \ - } while(0) + do { \ + JEntry je_ = (je); \ + if (JBE_HAS_OFF(je_)) \ + (offset) = JBE_OFFLENFLD(je_); \ + else \ + (offset) += JBE_OFFLENFLD(je_); \ + } while(0) /* * We store an offset, not a length, every JB_OFFSET_STRIDE children. @@ -178,7 +178,7 @@ typedef uint32 JEntry; * bits instead. This allows changes in the offset-placement heuristic * without breaking on-disk compatibility. */ -#define JB_OFFSET_STRIDE 32 +#define JB_OFFSET_STRIDE 32 /* * A jsonb array or object node, within a Jsonb Datum. @@ -192,96 +192,96 @@ typedef uint32 JEntry; */ typedef struct JsonbContainer { - uint32 header; /* number of elements or key/value pairs, and - * flags */ - JEntry children[FLEXIBLE_ARRAY_MEMBER]; + uint32 header; /* number of elements or key/value pairs, and + * flags */ + JEntry children[FLEXIBLE_ARRAY_MEMBER]; - /* the data for each child node follows. */ + /* the data for each child node follows. */ } JsonbContainer; /* flags for the header-field in JsonbContainer */ -#define JB_CMASK 0x0FFFFFFF /* mask for count field */ -#define JB_FSCALAR 0x10000000 /* flag bits */ -#define JB_FOBJECT 0x20000000 -#define JB_FARRAY 0x40000000 +#define JB_CMASK 0x0FFFFFFF /* mask for count field */ +#define JB_FSCALAR 0x10000000 /* flag bits */ +#define JB_FOBJECT 0x20000000 +#define JB_FARRAY 0x40000000 /* convenience macros for accessing a JsonbContainer struct */ -#define JsonContainerSize(jc) ((jc)->header & JB_CMASK) -#define JsonContainerIsScalar(jc) (((jc)->header & JB_FSCALAR) != 0) -#define JsonContainerIsObject(jc) (((jc)->header & JB_FOBJECT) != 0) -#define JsonContainerIsArray(jc) (((jc)->header & JB_FARRAY) != 0) +#define JsonContainerSize(jc) ((jc)->header & JB_CMASK) +#define JsonContainerIsScalar(jc) (((jc)->header & JB_FSCALAR) != 0) +#define JsonContainerIsObject(jc) (((jc)->header & JB_FOBJECT) != 0) +#define JsonContainerIsArray(jc) (((jc)->header & JB_FARRAY) != 0) /* The top-level on-disk format for a jsonb datum. 
*/ typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) */ - JsonbContainer root; + int32 vl_len_; /* varlena header (do not touch directly!) */ + JsonbContainer root; } Jsonb; /* convenience macros for accessing the root container in a Jsonb datum */ -#define JB_ROOT_COUNT(jbp_) (*(uint32 *) VARDATA(jbp_) & JB_CMASK) +#define JB_ROOT_COUNT(jbp_) (*(uint32 *) VARDATA(jbp_) & JB_CMASK) #define JB_ROOT_IS_SCALAR(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FSCALAR) != 0) #define JB_ROOT_IS_OBJECT(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FOBJECT) != 0) -#define JB_ROOT_IS_ARRAY(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FARRAY) != 0) +#define JB_ROOT_IS_ARRAY(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FARRAY) != 0) enum jbvType { - /* Scalar types */ - jbvNull = 0x0, - jbvString, - jbvNumeric, - jbvBool, - /* Composite types */ - jbvArray = 0x10, - jbvObject, - /* Binary (i.e. struct Jsonb) jbvArray/jbvObject */ - jbvBinary + /* Scalar types */ + jbvNull = 0x0, + jbvString, + jbvNumeric, + jbvBool, + /* Composite types */ + jbvArray = 0x10, + jbvObject, + /* Binary (i.e. struct Jsonb) jbvArray/jbvObject */ + jbvBinary }; /* - * JsonbValue: In-memory representation of Jsonb. This is a convenient + * JsonbValue: In-memory representation of Jsonb. This is a convenient * deserialized representation, that can easily support using the "val" * union across underlying types during manipulation. The Jsonb on-disk * representation has various alignment considerations. */ struct JsonbValue { - enum jbvType type; /* Influences sort order */ - - union - { - Numeric numeric; - bool boolean; - struct - { - int len; - char *val; /* Not necessarily null-terminated */ - } string; /* String primitive type */ - - struct - { - int nElems; - JsonbValue *elems; - bool rawScalar; /* Top-level "raw scalar" array? */ - } array; /* Array container type */ - - struct - { - int nPairs; /* 1 pair, 2 elements */ - JsonbPair *pairs; - } object; /* Associative container type */ - - struct - { - int len; - JsonbContainer *data; - } binary; /* Array or object, in on-disk format */ - } val; + enum jbvType type; /* Influences sort order */ + + union + { + Numeric numeric; + bool boolean; + struct + { + int len; + char *val; /* Not necessarily null-terminated */ + } string; /* String primitive type */ + + struct + { + int nElems; + JsonbValue *elems; + bool rawScalar; /* Top-level "raw scalar" array? */ + } array; /* Array container type */ + + struct + { + int nPairs; /* 1 pair, 2 elements */ + JsonbPair *pairs; + } object; /* Associative container type */ + + struct + { + int len; + JsonbContainer *data; + } binary; /* Array or object, in on-disk format */ + } val; }; -#define IsAJsonbScalar(jsonbval) ((jsonbval)->type >= jbvNull && \ - (jsonbval)->type <= jbvBool) +#define IsAJsonbScalar(jsonbval) ((jsonbval)->type >= jbvNull && \ + (jsonbval)->type <= jbvBool) /* * Key/value pair within an Object. 
@@ -295,17 +295,17 @@ struct JsonbValue */ struct JsonbPair { - JsonbValue key; /* Must be a jbvString */ - JsonbValue value; /* May be of any type */ - uint32 order; /* Pair's index in original sequence */ + JsonbValue key; /* Must be a jbvString */ + JsonbValue value; /* May be of any type */ + uint32 order; /* Pair's index in original sequence */ }; /* Conversion state used when parsing Jsonb from text, or for type coercion */ typedef struct JsonbParseState { - JsonbValue contVal; - Size size; - struct JsonbParseState *next; + JsonbValue contVal; + Size size; + struct JsonbParseState *next; } JsonbParseState; /* @@ -314,68 +314,70 @@ typedef struct JsonbParseState */ typedef enum { - JBI_ARRAY_START, - JBI_ARRAY_ELEM, - JBI_OBJECT_START, - JBI_OBJECT_KEY, - JBI_OBJECT_VALUE + JBI_ARRAY_START, + JBI_ARRAY_ELEM, + JBI_OBJECT_START, + JBI_OBJECT_KEY, + JBI_OBJECT_VALUE } JsonbIterState; typedef struct JsonbIterator { - /* Container being iterated */ - JsonbContainer *container; - uint32 nElems; /* Number of elements in children array (will - * be nPairs for objects) */ - bool isScalar; /* Pseudo-array scalar value? */ - JEntry *children; /* JEntrys for child nodes */ - /* Data proper. This points to the beginning of the variable-length data */ - char *dataProper; - - /* Current item in buffer (up to nElems) */ - int curIndex; - - /* Data offset corresponding to current item */ - uint32 curDataOffset; - - /* - * If the container is an object, we want to return keys and values - * alternately; so curDataOffset points to the current key, and - * curValueOffset points to the current value. - */ - uint32 curValueOffset; - - /* Private state */ - JsonbIterState state; - - struct JsonbIterator *parent; + /* Container being iterated */ + JsonbContainer *container; + uint32 nElems; /* Number of elements in children array (will + * be nPairs for objects) */ + bool isScalar; /* Pseudo-array scalar value? */ + JEntry *children; /* JEntrys for child nodes */ + /* Data proper. This points to the beginning of the variable-length data */ + char *dataProper; + + /* Current item in buffer (up to nElems) */ + int curIndex; + + /* Data offset corresponding to current item */ + uint32 curDataOffset; + + /* + * If the container is an object, we want to return keys and values + * alternately; so curDataOffset points to the current key, and + * curValueOffset points to the current value. 
+ */ + uint32 curValueOffset; + + /* Private state */ + JsonbIterState state; + + struct JsonbIterator *parent; } JsonbIterator; /* Support functions */ extern uint32 getJsonbOffset(const JsonbContainer *jc, int index); extern uint32 getJsonbLength(const JsonbContainer *jc, int index); -extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); +extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader, - uint32 flags, - JsonbValue *key); + uint32 flags, + JsonbValue *key); extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader, - uint32 i); + uint32 i); extern JsonbValue *pushJsonbValue(JsonbParseState **pstate, - JsonbIteratorToken seq, JsonbValue *jbVal); + JsonbIteratorToken seq, JsonbValue *jbVal); extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container); extern JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, - bool skipNested); + bool skipNested); extern Jsonb *JsonbValueToJsonb(JsonbValue *val); extern bool JsonbDeepContains(JsonbIterator **val, - JsonbIterator **mContained); + JsonbIterator **mContained); extern void JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash); +extern void JsonbHashScalarValueExtended(const JsonbValue *scalarVal, + uint64 *hash, uint64 seed); /* jsonb.c support functions */ extern char *JsonbToCString(StringInfo out, JsonbContainer *in, - int estimated_len); + int estimated_len); extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in, - int estimated_len); + int estimated_len); -#endif /* __JSONB_H__ */ +#endif /* __JSONB_H__ */ diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index abe7737d..b4f75921 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * typcache.h - * Type cache definitions. + * Type cache definitions. * * The type cache exists to speed lookup of certain information about data * types that is not directly available from a type's pg_type row. @@ -28,98 +28,102 @@ struct TypeCacheEnumData; typedef struct TypeCacheEntry { - /* typeId is the hash lookup key and MUST BE FIRST */ - Oid type_id; /* OID of the data type */ - - /* some subsidiary information copied from the pg_type row */ - int16 typlen; - bool typbyval; - char typalign; - char typstorage; - char typtype; - Oid typrelid; - - /* - * Information obtained from opfamily entries - * - * These will be InvalidOid if no match could be found, or if the - * information hasn't yet been requested. Also note that for array and - * composite types, typcache.c checks that the contained types are - * comparable or hashable before allowing eq_opr etc to become set. - */ - Oid btree_opf; /* the default btree opclass' family */ - Oid btree_opintype; /* the default btree opclass' opcintype */ - Oid hash_opf; /* the default hash opclass' family */ - Oid hash_opintype; /* the default hash opclass' opcintype */ - Oid eq_opr; /* the equality operator */ - Oid lt_opr; /* the less-than operator */ - Oid gt_opr; /* the greater-than operator */ - Oid cmp_proc; /* the btree comparison function */ - Oid hash_proc; /* the hash calculation function */ - - /* - * Pre-set-up fmgr call info for the equality operator, the btree - * comparison function, and the hash calculation function. 
These are kept - * in the type cache to avoid problems with memory leaks in repeated calls - * to functions such as array_eq, array_cmp, hash_array. There is not - * currently a need to maintain call info for the lt_opr or gt_opr. - */ - FmgrInfo eq_opr_finfo; - FmgrInfo cmp_proc_finfo; - FmgrInfo hash_proc_finfo; - - /* - * Tuple descriptor if it's a composite type (row type). NULL if not - * composite or information hasn't yet been requested. (NOTE: this is a - * reference-counted tupledesc.) - */ - TupleDesc tupDesc; - - /* - * Fields computed when TYPECACHE_RANGE_INFO is requested. Zeroes if not - * a range type or information hasn't yet been requested. Note that - * rng_cmp_proc_finfo could be different from the element type's default - * btree comparison function. - */ - struct TypeCacheEntry *rngelemtype; /* range's element type */ - Oid rng_collation; /* collation for comparisons, if any */ - FmgrInfo rng_cmp_proc_finfo; /* comparison function */ - FmgrInfo rng_canonical_finfo; /* canonicalization function, if any */ - FmgrInfo rng_subdiff_finfo; /* difference function, if any */ - - /* - * Domain constraint data if it's a domain type. NULL if not domain, or - * if domain has no constraints, or if information hasn't been requested. - */ - DomainConstraintCache *domainData; - - /* Private data, for internal use of typcache.c only */ - int flags; /* flags about what we've computed */ - - /* - * Private information about an enum type. NULL if not enum or - * information hasn't been requested. - */ - struct TypeCacheEnumData *enumData; - - /* We also maintain a list of all known domain-type cache entries */ - struct TypeCacheEntry *nextDomain; + /* typeId is the hash lookup key and MUST BE FIRST */ + Oid type_id; /* OID of the data type */ + + /* some subsidiary information copied from the pg_type row */ + int16 typlen; + bool typbyval; + char typalign; + char typstorage; + char typtype; + Oid typrelid; + + /* + * Information obtained from opfamily entries + * + * These will be InvalidOid if no match could be found, or if the + * information hasn't yet been requested. Also note that for array and + * composite types, typcache.c checks that the contained types are + * comparable or hashable before allowing eq_opr etc to become set. + */ + Oid btree_opf; /* the default btree opclass' family */ + Oid btree_opintype; /* the default btree opclass' opcintype */ + Oid hash_opf; /* the default hash opclass' family */ + Oid hash_opintype; /* the default hash opclass' opcintype */ + Oid eq_opr; /* the equality operator */ + Oid lt_opr; /* the less-than operator */ + Oid gt_opr; /* the greater-than operator */ + Oid cmp_proc; /* the btree comparison function */ + Oid hash_proc; /* the hash calculation function */ + Oid hash_extended_proc; /* the extended hash calculation function */ + + /* + * Pre-set-up fmgr call info for the equality operator, the btree + * comparison function, and the hash calculation function. These are kept + * in the type cache to avoid problems with memory leaks in repeated calls + * to functions such as array_eq, array_cmp, hash_array. There is not + * currently a need to maintain call info for the lt_opr or gt_opr. + */ + FmgrInfo eq_opr_finfo; + FmgrInfo cmp_proc_finfo; + FmgrInfo hash_proc_finfo; + FmgrInfo hash_extended_proc_finfo; + + /* + * Tuple descriptor if it's a composite type (row type). NULL if not + * composite or information hasn't yet been requested. (NOTE: this is a + * reference-counted tupledesc.) 
+ */ + TupleDesc tupDesc; + + /* + * Fields computed when TYPECACHE_RANGE_INFO is requested. Zeroes if not + * a range type or information hasn't yet been requested. Note that + * rng_cmp_proc_finfo could be different from the element type's default + * btree comparison function. + */ + struct TypeCacheEntry *rngelemtype; /* range's element type */ + Oid rng_collation; /* collation for comparisons, if any */ + FmgrInfo rng_cmp_proc_finfo; /* comparison function */ + FmgrInfo rng_canonical_finfo; /* canonicalization function, if any */ + FmgrInfo rng_subdiff_finfo; /* difference function, if any */ + + /* + * Domain constraint data if it's a domain type. NULL if not domain, or + * if domain has no constraints, or if information hasn't been requested. + */ + DomainConstraintCache *domainData; + + /* Private data, for internal use of typcache.c only */ + int flags; /* flags about what we've computed */ + + /* + * Private information about an enum type. NULL if not enum or + * information hasn't been requested. + */ + struct TypeCacheEnumData *enumData; + + /* We also maintain a list of all known domain-type cache entries */ + struct TypeCacheEntry *nextDomain; } TypeCacheEntry; /* Bit flags to indicate which fields a given caller needs to have set */ -#define TYPECACHE_EQ_OPR 0x0001 -#define TYPECACHE_LT_OPR 0x0002 -#define TYPECACHE_GT_OPR 0x0004 -#define TYPECACHE_CMP_PROC 0x0008 -#define TYPECACHE_HASH_PROC 0x0010 -#define TYPECACHE_EQ_OPR_FINFO 0x0020 -#define TYPECACHE_CMP_PROC_FINFO 0x0040 -#define TYPECACHE_HASH_PROC_FINFO 0x0080 -#define TYPECACHE_TUPDESC 0x0100 -#define TYPECACHE_BTREE_OPFAMILY 0x0200 -#define TYPECACHE_HASH_OPFAMILY 0x0400 -#define TYPECACHE_RANGE_INFO 0x0800 -#define TYPECACHE_DOMAIN_INFO 0x1000 +#define TYPECACHE_EQ_OPR 0x0001 +#define TYPECACHE_LT_OPR 0x0002 +#define TYPECACHE_GT_OPR 0x0004 +#define TYPECACHE_CMP_PROC 0x0008 +#define TYPECACHE_HASH_PROC 0x0010 +#define TYPECACHE_EQ_OPR_FINFO 0x0020 +#define TYPECACHE_CMP_PROC_FINFO 0x0040 +#define TYPECACHE_HASH_PROC_FINFO 0x0080 +#define TYPECACHE_TUPDESC 0x0100 +#define TYPECACHE_BTREE_OPFAMILY 0x0200 +#define TYPECACHE_HASH_OPFAMILY 0x0400 +#define TYPECACHE_RANGE_INFO 0x0800 +#define TYPECACHE_DOMAIN_INFO 0x1000 +#define TYPECACHE_HASH_EXTENDED_PROC 0x2000 +#define TYPECACHE_HASH_EXTENDED_PROC_FINFO 0x4000 /* * Callers wishing to maintain a long-lived reference to a domain's constraint @@ -129,21 +133,21 @@ typedef struct TypeCacheEntry */ typedef struct DomainConstraintRef { - List *constraints; /* list of DomainConstraintState nodes */ - MemoryContext refctx; /* context holding DomainConstraintRef */ - TypeCacheEntry *tcache; /* typcache entry for domain type */ - bool need_exprstate; /* does caller need check_exprstate? */ - - /* Management data --- treat these fields as private to typcache.c */ - DomainConstraintCache *dcc; /* current constraints, or NULL if none */ - MemoryContextCallback callback; /* used to release refcount when done */ + List *constraints; /* list of DomainConstraintState nodes */ + MemoryContext refctx; /* context holding DomainConstraintRef */ + TypeCacheEntry *tcache; /* typcache entry for domain type */ + bool need_exprstate; /* does caller need check_exprstate? 
*/ + + /* Management data --- treat these fields as private to typcache.c */ + DomainConstraintCache *dcc; /* current constraints, or NULL if none */ + MemoryContextCallback callback; /* used to release refcount when done */ } DomainConstraintRef; extern TypeCacheEntry *lookup_type_cache(Oid type_id, int flags); extern void InitDomainConstraintRef(Oid type_id, DomainConstraintRef *ref, - MemoryContext refctx, bool need_exprstate); + MemoryContext refctx, bool need_exprstate); extern void UpdateDomainConstraintRef(DomainConstraintRef *ref); @@ -152,12 +156,12 @@ extern bool DomainHasConstraints(Oid type_id); extern TupleDesc lookup_rowtype_tupdesc(Oid type_id, int32 typmod); extern TupleDesc lookup_rowtype_tupdesc_noerror(Oid type_id, int32 typmod, - bool noError); + bool noError); extern TupleDesc lookup_rowtype_tupdesc_copy(Oid type_id, int32 typmod); extern void assign_record_type_typmod(TupleDesc tupDesc); -extern int compare_values_of_enum(TypeCacheEntry *tcache, Oid arg1, Oid arg2); +extern int compare_values_of_enum(TypeCacheEntry *tcache, Oid arg1, Oid arg2); -#endif /* TYPCACHE_H */ +#endif /* TYPCACHE_H */ diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 61bd1cf1..2d7998ff 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -427,7 +427,7 @@ BEGIN TRANSACTION; CREATE OPERATOR FAMILY alt_opf13 USING hash; CREATE FUNCTION fn_opf13 (int4) RETURNS BIGINT AS 'SELECT NULL::BIGINT;' LANGUAGE SQL; ALTER OPERATOR FAMILY alt_opf13 USING hash ADD FUNCTION 1 fn_opf13(int4); -ERROR: hash procedures must return integer +ERROR: hash procedure 1 must return integer DROP OPERATOR FAMILY alt_opf13 USING hash; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; @@ -445,7 +445,7 @@ BEGIN TRANSACTION; CREATE OPERATOR FAMILY alt_opf15 USING hash; CREATE FUNCTION fn_opf15 (int4, int2) RETURNS BIGINT AS 'SELECT NULL::BIGINT;' LANGUAGE SQL; ALTER OPERATOR FAMILY alt_opf15 USING hash ADD FUNCTION 1 fn_opf15(int4, int2); -ERROR: hash procedures must have one argument +ERROR: hash procedure 1 must have one argument DROP OPERATOR FAMILY alt_opf15 USING hash; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; diff --git a/src/test/regress/expected/hash_func.out b/src/test/regress/expected/hash_func.out new file mode 100644 index 00000000..da0948e9 --- /dev/null +++ b/src/test/regress/expected/hash_func.out @@ -0,0 +1,300 @@ +-- +-- Test hash functions +-- +-- When the salt is 0, the extended hash function should produce a result +-- whose low 32 bits match the standard hash function. When the salt is +-- not 0, we should get a different result. 
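The test header above states the contract these queries verify from SQL: with seed 0 the extended function's low 32 bits must match the standard function, and a nonzero seed is expected to change the value. The same contract restated at the C level, as a sketch only (not part of the regression suite), using hash_uint32()/hash_uint32_extended() declared in access/hash.h by this patch:

static void
check_extended_hash_contract(uint32 k)
{
	uint32		h32 = DatumGetUInt32(hash_uint32(k));
	uint64		ext0 = DatumGetUInt64(hash_uint32_extended(k, UINT64CONST(0)));
	uint64		ext1 = DatumGetUInt64(hash_uint32_extended(k, UINT64CONST(1)));

	Assert((uint32) ext0 == h32);	/* seed 0: low 32 bits must agree */
	Assert(ext1 != ext0);			/* nonzero seed: value should differ */
}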
+-- +SELECT v as value, hashint2(v)::bit(32) as standard, + hashint2extended(v, 0)::bit(32) as extended0, + hashint2extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0::int2), (1::int2), (17::int2), (42::int2)) x(v) +WHERE hashint2(v)::bit(32) != hashint2extended(v, 0)::bit(32) + OR hashint2(v)::bit(32) = hashint2extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashint4(v)::bit(32) as standard, + hashint4extended(v, 0)::bit(32) as extended0, + hashint4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint4(v)::bit(32) != hashint4extended(v, 0)::bit(32) + OR hashint4(v)::bit(32) = hashint4extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashint8(v)::bit(32) as standard, + hashint8extended(v, 0)::bit(32) as extended0, + hashint8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint8(v)::bit(32) != hashint8extended(v, 0)::bit(32) + OR hashint8(v)::bit(32) = hashint8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashfloat4(v)::bit(32) as standard, + hashfloat4extended(v, 0)::bit(32) as extended0, + hashfloat4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat4(v)::bit(32) != hashfloat4extended(v, 0)::bit(32) + OR hashfloat4(v)::bit(32) = hashfloat4extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashfloat8(v)::bit(32) as standard, + hashfloat8extended(v, 0)::bit(32) as extended0, + hashfloat8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat8(v)::bit(32) != hashfloat8extended(v, 0)::bit(32) + OR hashfloat8(v)::bit(32) = hashfloat8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashoid(v)::bit(32) as standard, + hashoidextended(v, 0)::bit(32) as extended0, + hashoidextended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashoid(v)::bit(32) != hashoidextended(v, 0)::bit(32) + OR hashoid(v)::bit(32) = hashoidextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashchar(v)::bit(32) as standard, + hashcharextended(v, 0)::bit(32) as extended0, + hashcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::"char"), ('1'), ('x'), ('X'), ('p'), ('N')) x(v) +WHERE hashchar(v)::bit(32) != hashcharextended(v, 0)::bit(32) + OR hashchar(v)::bit(32) = hashcharextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashname(v)::bit(32) as standard, + hashnameextended(v, 0)::bit(32) as extended0, + hashnameextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashname(v)::bit(32) != hashnameextended(v, 0)::bit(32) + OR hashname(v)::bit(32) = hashnameextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 
rows) + +SELECT v as value, hashtext(v)::bit(32) as standard, + hashtextextended(v, 0)::bit(32) as extended0, + hashtextextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashtext(v)::bit(32) != hashtextextended(v, 0)::bit(32) + OR hashtext(v)::bit(32) = hashtextextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashoidvector(v)::bit(32) as standard, + hashoidvectorextended(v, 0)::bit(32) as extended0, + hashoidvectorextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::oidvector), ('0 1 2 3 4'), ('17 18 19 20'), + ('42 43 42 45'), ('550273 550273 570274'), + ('207112489 207112499 21512 2155 372325 1363252')) x(v) +WHERE hashoidvector(v)::bit(32) != hashoidvectorextended(v, 0)::bit(32) + OR hashoidvector(v)::bit(32) = hashoidvectorextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_aclitem(v)::bit(32) as standard, + hash_aclitem_extended(v, 0)::bit(32) as extended0, + hash_aclitem_extended(v, 1)::bit(32) as extended1 +FROM (SELECT DISTINCT(relacl[1]) FROM pg_class LIMIT 10) x(v) +WHERE hash_aclitem(v)::bit(32) != hash_aclitem_extended(v, 0)::bit(32) + OR hash_aclitem(v)::bit(32) = hash_aclitem_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashmacaddr(v)::bit(32) as standard, + hashmacaddrextended(v, 0)::bit(32) as extended0, + hashmacaddrextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr), ('08:00:2b:01:02:04'), ('08:00:2b:01:02:04'), + ('e2:7f:51:3e:70:49'), ('d6:a9:4a:78:1c:d5'), + ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr(v)::bit(32) != hashmacaddrextended(v, 0)::bit(32) + OR hashmacaddr(v)::bit(32) = hashmacaddrextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashinet(v)::bit(32) as standard, + hashinetextended(v, 0)::bit(32) as extended0, + hashinetextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::inet), ('192.168.100.128/25'), ('192.168.100.0/8'), + ('172.168.10.126/16'), ('172.18.103.126/24'), ('192.188.13.16/32')) x(v) +WHERE hashinet(v)::bit(32) != hashinetextended(v, 0)::bit(32) + OR hashinet(v)::bit(32) = hashinetextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_numeric(v)::bit(32) as standard, + hash_numeric_extended(v, 0)::bit(32) as extended0, + hash_numeric_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1.149484958), (17.149484958), (42.149484958), + (149484958.550273), (2071124898672)) x(v) +WHERE hash_numeric(v)::bit(32) != hash_numeric_extended(v, 0)::bit(32) + OR hash_numeric(v)::bit(32) = hash_numeric_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashmacaddr8(v)::bit(32) as standard, + hashmacaddr8extended(v, 0)::bit(32) as extended0, + hashmacaddr8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr8), ('08:00:2b:01:02:04:36:49'), + ('08:00:2b:01:02:04:f0:e8'), ('e2:7f:51:3e:70:49:16:29'), + ('d6:a9:4a:78:1c:d5:47:32'), ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr8(v)::bit(32) != hashmacaddr8extended(v, 0)::bit(32) + OR hashmacaddr8(v)::bit(32) = 
hashmacaddr8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_array(v)::bit(32) as standard, + hash_array_extended(v, 0)::bit(32) as extended0, + hash_array_extended(v, 1)::bit(32) as extended1 +FROM (VALUES ('{0}'::int4[]), ('{0,1,2,3,4}'), ('{17,18,19,20}'), + ('{42,34,65,98}'), ('{550273,590027, 870273}'), + ('{207112489, 807112489}')) x(v) +WHERE hash_array(v)::bit(32) != hash_array_extended(v, 0)::bit(32) + OR hash_array(v)::bit(32) = hash_array_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashbpchar(v)::bit(32) as standard, + hashbpcharextended(v, 0)::bit(32) as extended0, + hashbpcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashbpchar(v)::bit(32) != hashbpcharextended(v, 0)::bit(32) + OR hashbpchar(v)::bit(32) = hashbpcharextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, time_hash(v)::bit(32) as standard, + time_hash_extended(v, 0)::bit(32) as extended0, + time_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::time), ('11:09:59'), ('1:09:59'), ('11:59:59'), + ('7:9:59'), ('5:15:59')) x(v) +WHERE time_hash(v)::bit(32) != time_hash_extended(v, 0)::bit(32) + OR time_hash(v)::bit(32) = time_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, timetz_hash(v)::bit(32) as standard, + timetz_hash_extended(v, 0)::bit(32) as extended0, + timetz_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timetz), ('00:11:52.518762-07'), ('00:11:52.51762-08'), + ('00:11:52.62-01'), ('00:11:52.62+01'), ('11:59:59+04')) x(v) +WHERE timetz_hash(v)::bit(32) != timetz_hash_extended(v, 0)::bit(32) + OR timetz_hash(v)::bit(32) = timetz_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, interval_hash(v)::bit(32) as standard, + interval_hash_extended(v, 0)::bit(32) as extended0, + interval_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::interval), + ('5 month 7 day 46 minutes'), ('1 year 7 day 46 minutes'), + ('1 year 7 month 20 day 46 minutes'), ('5 month'), + ('17 year 11 month 7 day 9 hours 46 minutes 5 seconds')) x(v) +WHERE interval_hash(v)::bit(32) != interval_hash_extended(v, 0)::bit(32) + OR interval_hash(v)::bit(32) = interval_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, timestamp_hash(v)::bit(32) as standard, + timestamp_hash_extended(v, 0)::bit(32) as extended0, + timestamp_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timestamp), ('2017-08-22 00:09:59.518762'), + ('2015-08-20 00:11:52.51762-08'), + ('2017-05-22 00:11:52.62-01'), + ('2013-08-22 00:11:52.62+01'), ('2013-08-22 11:59:59+04')) x(v) +WHERE timestamp_hash(v)::bit(32) != timestamp_hash_extended(v, 0)::bit(32) + OR timestamp_hash(v)::bit(32) = timestamp_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, uuid_hash(v)::bit(32) as standard, + uuid_hash_extended(v, 0)::bit(32) as extended0, + 
uuid_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::uuid), ('a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'), + ('5a9ba4ac-8d6f-11e7-bb31-be2e44b06b34'), + ('99c6705c-d939-461c-a3c9-1690ad64ed7b'), + ('7deed3ca-8d6f-11e7-bb31-be2e44b06b34'), + ('9ad46d4f-6f2a-4edd-aadb-745993928e1e')) x(v) +WHERE uuid_hash(v)::bit(32) != uuid_hash_extended(v, 0)::bit(32) + OR uuid_hash(v)::bit(32) = uuid_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, pg_lsn_hash(v)::bit(32) as standard, + pg_lsn_hash_extended(v, 0)::bit(32) as extended0, + pg_lsn_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::pg_lsn), ('16/B374D84'), ('30/B374D84'), + ('255/B374D84'), ('25/B379D90'), ('900/F37FD90')) x(v) +WHERE pg_lsn_hash(v)::bit(32) != pg_lsn_hash_extended(v, 0)::bit(32) + OR pg_lsn_hash(v)::bit(32) = pg_lsn_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy'); +SELECT v as value, hashenum(v)::bit(32) as standard, + hashenumextended(v, 0)::bit(32) as extended0, + hashenumextended(v, 1)::bit(32) as extended1 +FROM (VALUES ('sad'::mood), ('ok'), ('happy')) x(v) +WHERE hashenum(v)::bit(32) != hashenumextended(v, 0)::bit(32) + OR hashenum(v)::bit(32) = hashenumextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +DROP TYPE mood; +SELECT v as value, jsonb_hash(v)::bit(32) as standard, + jsonb_hash_extended(v, 0)::bit(32) as extended0, + jsonb_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::jsonb), + ('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'), + ('{"foo": [true, "bar"], "tags": {"e": 1, "f": null}}'), + ('{"g": {"h": "value"}}')) x(v) +WHERE jsonb_hash(v)::bit(32) != jsonb_hash_extended(v, 0)::bit(32) + OR jsonb_hash(v)::bit(32) = jsonb_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_range(v)::bit(32) as standard, + hash_range_extended(v, 0)::bit(32) as extended0, + hash_range_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (int4range(10, 20)), (int4range(23, 43)), + (int4range(5675, 550273)), + (int4range(550274, 1550274)), (int4range(1550275, 208112489))) x(v) +WHERE hash_range(v)::bit(32) != hash_range_extended(v, 0)::bit(32) + OR hash_range(v)::bit(32) = hash_range_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 31e0b077..f9eabd53 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -70,7 +70,7 @@ test: create_index create_view # ---------- # Another group of parallel tests # ---------- -test: create_aggregate create_function_3 create_cast constraints triggers inherit create_table_like typed_table vacuum drop_if_exists updatable_views rolenames roleattributes create_am +test: create_aggregate create_function_3 create_cast constraints triggers inherit create_table_like typed_table vacuum drop_if_exists updatable_views rolenames roleattributes create_am hash_func # ---------- # sanity_check does a vacuum, affecting the sort order of SELECT * diff --git a/src/test/regress/sql/hash_func.sql b/src/test/regress/sql/hash_func.sql new file mode 100644 index 
00000000..b7ce8b21 --- /dev/null +++ b/src/test/regress/sql/hash_func.sql @@ -0,0 +1,222 @@ +-- +-- Test hash functions +-- +-- When the salt is 0, the extended hash function should produce a result +-- whose low 32 bits match the standard hash function. When the salt is +-- not 0, we should get a different result. +-- + +SELECT v as value, hashint2(v)::bit(32) as standard, + hashint2extended(v, 0)::bit(32) as extended0, + hashint2extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0::int2), (1::int2), (17::int2), (42::int2)) x(v) +WHERE hashint2(v)::bit(32) != hashint2extended(v, 0)::bit(32) + OR hashint2(v)::bit(32) = hashint2extended(v, 1)::bit(32); + +SELECT v as value, hashint4(v)::bit(32) as standard, + hashint4extended(v, 0)::bit(32) as extended0, + hashint4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint4(v)::bit(32) != hashint4extended(v, 0)::bit(32) + OR hashint4(v)::bit(32) = hashint4extended(v, 1)::bit(32); + +SELECT v as value, hashint8(v)::bit(32) as standard, + hashint8extended(v, 0)::bit(32) as extended0, + hashint8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint8(v)::bit(32) != hashint8extended(v, 0)::bit(32) + OR hashint8(v)::bit(32) = hashint8extended(v, 1)::bit(32); + +SELECT v as value, hashfloat4(v)::bit(32) as standard, + hashfloat4extended(v, 0)::bit(32) as extended0, + hashfloat4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat4(v)::bit(32) != hashfloat4extended(v, 0)::bit(32) + OR hashfloat4(v)::bit(32) = hashfloat4extended(v, 1)::bit(32); + +SELECT v as value, hashfloat8(v)::bit(32) as standard, + hashfloat8extended(v, 0)::bit(32) as extended0, + hashfloat8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat8(v)::bit(32) != hashfloat8extended(v, 0)::bit(32) + OR hashfloat8(v)::bit(32) = hashfloat8extended(v, 1)::bit(32); + +SELECT v as value, hashoid(v)::bit(32) as standard, + hashoidextended(v, 0)::bit(32) as extended0, + hashoidextended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashoid(v)::bit(32) != hashoidextended(v, 0)::bit(32) + OR hashoid(v)::bit(32) = hashoidextended(v, 1)::bit(32); + +SELECT v as value, hashchar(v)::bit(32) as standard, + hashcharextended(v, 0)::bit(32) as extended0, + hashcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::"char"), ('1'), ('x'), ('X'), ('p'), ('N')) x(v) +WHERE hashchar(v)::bit(32) != hashcharextended(v, 0)::bit(32) + OR hashchar(v)::bit(32) = hashcharextended(v, 1)::bit(32); + +SELECT v as value, hashname(v)::bit(32) as standard, + hashnameextended(v, 0)::bit(32) as extended0, + hashnameextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashname(v)::bit(32) != hashnameextended(v, 0)::bit(32) + OR hashname(v)::bit(32) = hashnameextended(v, 1)::bit(32); + +SELECT v as value, hashtext(v)::bit(32) as standard, + hashtextextended(v, 0)::bit(32) as extended0, + hashtextextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashtext(v)::bit(32) != hashtextextended(v, 0)::bit(32) + OR hashtext(v)::bit(32) = hashtextextended(v, 1)::bit(32); + +SELECT v as value, hashoidvector(v)::bit(32) as standard, + 
hashoidvectorextended(v, 0)::bit(32) as extended0, + hashoidvectorextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::oidvector), ('0 1 2 3 4'), ('17 18 19 20'), + ('42 43 42 45'), ('550273 550273 570274'), + ('207112489 207112499 21512 2155 372325 1363252')) x(v) +WHERE hashoidvector(v)::bit(32) != hashoidvectorextended(v, 0)::bit(32) + OR hashoidvector(v)::bit(32) = hashoidvectorextended(v, 1)::bit(32); + +SELECT v as value, hash_aclitem(v)::bit(32) as standard, + hash_aclitem_extended(v, 0)::bit(32) as extended0, + hash_aclitem_extended(v, 1)::bit(32) as extended1 +FROM (SELECT DISTINCT(relacl[1]) FROM pg_class LIMIT 10) x(v) +WHERE hash_aclitem(v)::bit(32) != hash_aclitem_extended(v, 0)::bit(32) + OR hash_aclitem(v)::bit(32) = hash_aclitem_extended(v, 1)::bit(32); + +SELECT v as value, hashmacaddr(v)::bit(32) as standard, + hashmacaddrextended(v, 0)::bit(32) as extended0, + hashmacaddrextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr), ('08:00:2b:01:02:04'), ('08:00:2b:01:02:04'), + ('e2:7f:51:3e:70:49'), ('d6:a9:4a:78:1c:d5'), + ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr(v)::bit(32) != hashmacaddrextended(v, 0)::bit(32) + OR hashmacaddr(v)::bit(32) = hashmacaddrextended(v, 1)::bit(32); + +SELECT v as value, hashinet(v)::bit(32) as standard, + hashinetextended(v, 0)::bit(32) as extended0, + hashinetextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::inet), ('192.168.100.128/25'), ('192.168.100.0/8'), + ('172.168.10.126/16'), ('172.18.103.126/24'), ('192.188.13.16/32')) x(v) +WHERE hashinet(v)::bit(32) != hashinetextended(v, 0)::bit(32) + OR hashinet(v)::bit(32) = hashinetextended(v, 1)::bit(32); + +SELECT v as value, hash_numeric(v)::bit(32) as standard, + hash_numeric_extended(v, 0)::bit(32) as extended0, + hash_numeric_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1.149484958), (17.149484958), (42.149484958), + (149484958.550273), (2071124898672)) x(v) +WHERE hash_numeric(v)::bit(32) != hash_numeric_extended(v, 0)::bit(32) + OR hash_numeric(v)::bit(32) = hash_numeric_extended(v, 1)::bit(32); + +SELECT v as value, hashmacaddr8(v)::bit(32) as standard, + hashmacaddr8extended(v, 0)::bit(32) as extended0, + hashmacaddr8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr8), ('08:00:2b:01:02:04:36:49'), + ('08:00:2b:01:02:04:f0:e8'), ('e2:7f:51:3e:70:49:16:29'), + ('d6:a9:4a:78:1c:d5:47:32'), ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr8(v)::bit(32) != hashmacaddr8extended(v, 0)::bit(32) + OR hashmacaddr8(v)::bit(32) = hashmacaddr8extended(v, 1)::bit(32); + +SELECT v as value, hash_array(v)::bit(32) as standard, + hash_array_extended(v, 0)::bit(32) as extended0, + hash_array_extended(v, 1)::bit(32) as extended1 +FROM (VALUES ('{0}'::int4[]), ('{0,1,2,3,4}'), ('{17,18,19,20}'), + ('{42,34,65,98}'), ('{550273,590027, 870273}'), + ('{207112489, 807112489}')) x(v) +WHERE hash_array(v)::bit(32) != hash_array_extended(v, 0)::bit(32) + OR hash_array(v)::bit(32) = hash_array_extended(v, 1)::bit(32); + +SELECT v as value, hashbpchar(v)::bit(32) as standard, + hashbpcharextended(v, 0)::bit(32) as extended0, + hashbpcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashbpchar(v)::bit(32) != hashbpcharextended(v, 0)::bit(32) + OR hashbpchar(v)::bit(32) = hashbpcharextended(v, 1)::bit(32); + +SELECT v as value, time_hash(v)::bit(32) as standard, + time_hash_extended(v, 0)::bit(32) as extended0, + time_hash_extended(v, 1)::bit(32) as extended1 
+FROM (VALUES (NULL::time), ('11:09:59'), ('1:09:59'), ('11:59:59'), + ('7:9:59'), ('5:15:59')) x(v) +WHERE time_hash(v)::bit(32) != time_hash_extended(v, 0)::bit(32) + OR time_hash(v)::bit(32) = time_hash_extended(v, 1)::bit(32); + +SELECT v as value, timetz_hash(v)::bit(32) as standard, + timetz_hash_extended(v, 0)::bit(32) as extended0, + timetz_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timetz), ('00:11:52.518762-07'), ('00:11:52.51762-08'), + ('00:11:52.62-01'), ('00:11:52.62+01'), ('11:59:59+04')) x(v) +WHERE timetz_hash(v)::bit(32) != timetz_hash_extended(v, 0)::bit(32) + OR timetz_hash(v)::bit(32) = timetz_hash_extended(v, 1)::bit(32); + +SELECT v as value, interval_hash(v)::bit(32) as standard, + interval_hash_extended(v, 0)::bit(32) as extended0, + interval_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::interval), + ('5 month 7 day 46 minutes'), ('1 year 7 day 46 minutes'), + ('1 year 7 month 20 day 46 minutes'), ('5 month'), + ('17 year 11 month 7 day 9 hours 46 minutes 5 seconds')) x(v) +WHERE interval_hash(v)::bit(32) != interval_hash_extended(v, 0)::bit(32) + OR interval_hash(v)::bit(32) = interval_hash_extended(v, 1)::bit(32); + +SELECT v as value, timestamp_hash(v)::bit(32) as standard, + timestamp_hash_extended(v, 0)::bit(32) as extended0, + timestamp_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timestamp), ('2017-08-22 00:09:59.518762'), + ('2015-08-20 00:11:52.51762-08'), + ('2017-05-22 00:11:52.62-01'), + ('2013-08-22 00:11:52.62+01'), ('2013-08-22 11:59:59+04')) x(v) +WHERE timestamp_hash(v)::bit(32) != timestamp_hash_extended(v, 0)::bit(32) + OR timestamp_hash(v)::bit(32) = timestamp_hash_extended(v, 1)::bit(32); + +SELECT v as value, uuid_hash(v)::bit(32) as standard, + uuid_hash_extended(v, 0)::bit(32) as extended0, + uuid_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::uuid), ('a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'), + ('5a9ba4ac-8d6f-11e7-bb31-be2e44b06b34'), + ('99c6705c-d939-461c-a3c9-1690ad64ed7b'), + ('7deed3ca-8d6f-11e7-bb31-be2e44b06b34'), + ('9ad46d4f-6f2a-4edd-aadb-745993928e1e')) x(v) +WHERE uuid_hash(v)::bit(32) != uuid_hash_extended(v, 0)::bit(32) + OR uuid_hash(v)::bit(32) = uuid_hash_extended(v, 1)::bit(32); + +SELECT v as value, pg_lsn_hash(v)::bit(32) as standard, + pg_lsn_hash_extended(v, 0)::bit(32) as extended0, + pg_lsn_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::pg_lsn), ('16/B374D84'), ('30/B374D84'), + ('255/B374D84'), ('25/B379D90'), ('900/F37FD90')) x(v) +WHERE pg_lsn_hash(v)::bit(32) != pg_lsn_hash_extended(v, 0)::bit(32) + OR pg_lsn_hash(v)::bit(32) = pg_lsn_hash_extended(v, 1)::bit(32); + +CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy'); +SELECT v as value, hashenum(v)::bit(32) as standard, + hashenumextended(v, 0)::bit(32) as extended0, + hashenumextended(v, 1)::bit(32) as extended1 +FROM (VALUES ('sad'::mood), ('ok'), ('happy')) x(v) +WHERE hashenum(v)::bit(32) != hashenumextended(v, 0)::bit(32) + OR hashenum(v)::bit(32) = hashenumextended(v, 1)::bit(32); +DROP TYPE mood; + +SELECT v as value, jsonb_hash(v)::bit(32) as standard, + jsonb_hash_extended(v, 0)::bit(32) as extended0, + jsonb_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::jsonb), + ('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'), + ('{"foo": [true, "bar"], "tags": {"e": 1, "f": null}}'), + ('{"g": {"h": "value"}}')) x(v) +WHERE jsonb_hash(v)::bit(32) != jsonb_hash_extended(v, 0)::bit(32) + OR jsonb_hash(v)::bit(32) = jsonb_hash_extended(v, 
1)::bit(32); + +SELECT v as value, hash_range(v)::bit(32) as standard, + hash_range_extended(v, 0)::bit(32) as extended0, + hash_range_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (int4range(10, 20)), (int4range(23, 43)), + (int4range(5675, 550273)), + (int4range(550274, 1550274)), (int4range(1550275, 208112489))) x(v) +WHERE hash_range(v)::bit(32) != hash_range_extended(v, 0)::bit(32) + OR hash_range(v)::bit(32) = hash_range_extended(v, 1)::bit(32); From 7327d44e8d14385cafdbb926cfd7dd15a9a67958 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 7 Sep 2017 21:07:47 -0400 Subject: [PATCH 189/578] Refactor get_partition_for_tuple a bit. Pending patches for both default partitioning and hash partitioning find the current coding pattern to be inconvenient. Change it so that we switch on the partitioning method first and then do whatever is needed. Amul Sul, reviewed by Jeevan Ladhe, with a few adjustments by me. Discussion: http://postgr.es/m/CAAJ_b97mTb=dG2pv6+1ougxEVZFVnZJajW+0QHj46mEE7WsoOQ@mail.gmail.com Discussion: http://postgr.es/m/CAOgcT0M37CAztEinpvjJc18EdHfm23fw0EG9-36Ya=+rEFUqaQ@mail.gmail.com --- src/backend/catalog/partition.c | 80 +++++++++++++++++---------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 3ea32102..20450d8a 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1952,10 +1952,7 @@ get_partition_for_tuple(PartitionDispatch *pd, PartitionDispatch parent; Datum values[PARTITION_MAX_KEYS]; bool isnull[PARTITION_MAX_KEYS]; - int cur_offset, - cur_index; - int i, - result; + int result; ExprContext *ecxt = GetPerTupleExprContext(estate); TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; @@ -1967,6 +1964,7 @@ get_partition_for_tuple(PartitionDispatch *pd, PartitionDesc partdesc = parent->partdesc; TupleTableSlot *myslot = parent->tupslot; TupleConversionMap *map = parent->tupmap; + int cur_index = -1; if (myslot != NULL && map != NULL) { @@ -1998,12 +1996,38 @@ get_partition_for_tuple(PartitionDispatch *pd, ecxt->ecxt_scantuple = slot; FormPartitionKeyDatum(parent, slot, estate, values, isnull); - if (key->strategy == PARTITION_STRATEGY_RANGE) + /* Route as appropriate based on partitioning strategy. */ + switch (key->strategy) { - /* - * Since we cannot route tuples with NULL partition keys through a - * range-partitioned table, simply return that no partition exists - */ + case PARTITION_STRATEGY_LIST: + + if (isnull[0]) + { + if (partition_bound_accepts_nulls(partdesc->boundinfo)) + cur_index = partdesc->boundinfo->null_index; + } + else + { + bool equal = false; + int cur_offset; + + cur_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); + if (cur_offset >= 0 && equal) + cur_index = partdesc->boundinfo->indexes[cur_offset]; + } + break; + + case PARTITION_STRATEGY_RANGE: + { + bool equal = false; + int cur_offset; + int i; + + /* No range includes NULL. */ for (i = 0; i < key->partnatts; i++) { if (isnull[i]) @@ -2014,46 +2038,26 @@ get_partition_for_tuple(PartitionDispatch *pd, goto error_exit; } } - } - /* - * A null partition key is only acceptable if null-accepting list - * partition exists. 
- */ - cur_index = -1; - if (isnull[0] && partition_bound_accepts_nulls(partdesc->boundinfo)) - cur_index = partdesc->boundinfo->null_index; - else if (!isnull[0]) - { - /* Else bsearch in partdesc->boundinfo */ - bool equal = false; - - cur_offset = partition_bound_bsearch(key, partdesc->boundinfo, - values, false, &equal); - switch (key->strategy) - { - case PARTITION_STRATEGY_LIST: - if (cur_offset >= 0 && equal) - cur_index = partdesc->boundinfo->indexes[cur_offset]; - else - cur_index = -1; - break; - - case PARTITION_STRATEGY_RANGE: + cur_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); /* - * Offset returned is such that the bound at offset is - * found to be less or equal with the tuple. So, the bound - * at offset+1 would be the upper bound. + * The offset returned is such that the bound at cur_offset + * is less than or equal to the tuple value, so the bound + * at offset+1 is the upper bound. */ cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + } break; default: elog(ERROR, "unexpected partition strategy: %d", (int) key->strategy); } - } /* * cur_index < 0 means we failed to find a partition of this parent. From d396fffdcdce9966c9108593cc38babda3f31cc8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 12:25:07 +0800 Subject: [PATCH 190/578] Allow a partitioned table to have a default partition. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/catalogs.sgml | 11 + doc/src/sgml/ref/alter_table.sgml | 31 +- doc/src/sgml/ref/create_table.sgml | 35 +- src/backend/catalog/heap.c | 41 +- src/backend/catalog/partition.c | 635 +++++++++++++++++++-- src/backend/commands/tablecmds.c | 160 +++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 1 + src/backend/parser/gram.y | 27 +- src/backend/parser/parse_utilcmd.c | 12 + src/backend/utils/adt/ruleutils.c | 8 +- src/bin/psql/describe.c | 9 +- src/bin/psql/tab-complete.c | 4 +- src/include/catalog/partition.h | 7 + src/include/catalog/pg_partitioned_table.h | 71 +-- src/include/commands/tablecmds.h | 3 + src/include/nodes/parsenodes.h | 1 + src/test/regress/expected/alter_table.out | 49 ++ src/test/regress/expected/create_table.out | 20 + src/test/regress/expected/insert.out | 147 ++++- src/test/regress/expected/insert_1.out | 147 ++++- src/test/regress/expected/plancache.out | 26 + src/test/regress/expected/sanity_check.out | 4 + src/test/regress/expected/update.out | 33 ++ src/test/regress/sql/alter_table.sql | 47 ++ src/test/regress/sql/create_table.sql | 20 + src/test/regress/sql/insert.sql | 68 ++- src/test/regress/sql/plancache.sql | 21 + src/test/regress/sql/update.sql | 24 + 31 files changed, 1510 insertions(+), 155 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index b3d23a64..fdac2074 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -4771,6 +4771,17 @@ SCRAM-SHA-256$<iteration count>:<salt>< The number of columns in partition key + + partdefid + oid + pg_class.oid + + The OID of the pg_class entry for the default partition + of this partitioned table, or zero if this partitioned table does not + have a default partition. 
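The new partdefid column makes the default partition discoverable from SQL. A minimal sketch (the parent table name "measurement" is a placeholder, not from this patch); partdefid is zero when the table has no default partition:

    -- OID of the default partition, or 0 if none exists
    SELECT partdefid
    FROM pg_partitioned_table
    WHERE partrelid = 'measurement'::regclass;

    -- Resolve it to a relation name via pg_class
    SELECT c.relname AS default_partition
    FROM pg_partitioned_table p
    JOIN pg_class c ON c.oid = p.partdefid
    WHERE p.partrelid = 'measurement'::regclass;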
+ + + partattrs int2vector diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 9e0d6b31..06c5655e 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -34,7 +34,7 @@ ALTER TABLE [ IF EXISTS ] name ALTER TABLE ALL IN TABLESPACE name [ OWNED BY role_name [, ... ] ] SET TABLESPACE new_tablespace [ NOWAIT ] ALTER TABLE [ IF EXISTS ] name - ATTACH PARTITION partition_name FOR VALUES partition_bound_spec + ATTACH PARTITION partition_name { FOR VALUES partition_bound_spec | DEFAULT } ALTER TABLE [ IF EXISTS ] name DETACH PARTITION partition_name @@ -830,11 +830,18 @@ ALTER TABLE [ IF EXISTS ] name - ATTACH PARTITION partition_name FOR VALUES partition_bound_spec + ATTACH PARTITION partition_name { FOR VALUES partition_bound_spec | DEFAULT } This form attaches an existing table (which might itself be partitioned) - as a partition of the target table using the same syntax for + as a partition of the target table. The table can be attached + as a partition for specific values using FOR VALUES + or as a default partition by using DEFAULT + . + + + + A partition using FOR VALUES uses same syntax for partition_bound_spec as . The partition bound specification must correspond to the partitioning strategy and partition key of the @@ -871,6 +878,17 @@ ALTER TABLE [ IF EXISTS ] name (See the discussion in about constraints on the foreign table.) + + + When a table has a default partition, defining a new partition changes + the partition constraint for the default partition. The default + partition can't contain any rows that would need to be moved to the new + partition, and will be scanned to verify that none are present. This + scan, like the scan of the new partition, can be avoided if an + appropriate CHECK constraint is present. Also like + the scan of the new partition, it is always skipped when the default + partition is a foreign table. + @@ -1604,6 +1622,13 @@ ALTER TABLE cities ATTACH PARTITION cities_ab FOR VALUES IN ('a', 'b'); + + Attach a default partition to a partitioned table: + +ALTER TABLE cities + ATTACH PARTITION cities_partdef DEFAULT; + + Detach a partition from partitioned table: diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index cacdad1f..e46601b7 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -55,7 +55,7 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] | table_constraint } [, ... ] -) ] FOR VALUES partition_bound_spec +) ] { FOR VALUES partition_bound_spec | DEFAULT } [ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] @@ -262,11 +262,13 @@ FROM ( { numeric_literal | - PARTITION OF parent_table FOR VALUES partition_bound_spec + PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } Creates the table as a partition of the specified - parent table. + parent table. The table can be created either as a partition for specific + values using FOR VALUES or as a default partition + using DEFAULT. @@ -354,6 +356,26 @@ FROM ( { numeric_literal | + + If DEFAULT is specified, the table will be + created as a default partition of the parent table. The parent can + either be a list or range partitioned table. 
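A short SQL sketch of the documented behavior, with placeholder table names: rows whose partition key matches no other partition are routed to the default partition, as the next paragraph describes.

    CREATE TABLE orders (region text, amount int) PARTITION BY LIST (region);
    CREATE TABLE orders_eu    PARTITION OF orders FOR VALUES IN ('de', 'fr');
    CREATE TABLE orders_other PARTITION OF orders DEFAULT;

    INSERT INTO orders VALUES ('de', 10);  -- stored in orders_eu
    INSERT INTO orders VALUES ('jp', 20);  -- no matching list value, stored in orders_other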
A partition key value + not fitting into any other partition of the given parent will be + routed to the default partition. There can be only one default + partition for a given parent table. + + + + When a table has an existing DEFAULT partition and + a new partition is added to it, the existing default partition must + be scanned to verify that it does not contain any rows which properly + belong in the new partition. If the default partition contains a + large number of rows, this may be slow. The scan will be skipped if + the default partition is a foreign table or if it has a constraint which + proves that it cannot contain rows which should be placed in the new + partition. + + A partition must have the same column names and types as the partitioned table to which it belongs. If the parent is specified WITH @@ -1827,6 +1849,13 @@ CREATE TABLE cities_ab CREATE TABLE cities_ab_10000_to_100000 PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); + + + Create a default partition: + +CREATE TABLE cities_partdef + PARTITION OF cities DEFAULT; + diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index d2058c17..0c382fe7 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2784,7 +2784,8 @@ heap_drop_with_catalog(Oid relid) {// #lizard forgives Relation rel; HeapTuple tuple; - Oid parentOid = InvalidOid; + Oid parentOid = InvalidOid, + defaultPartOid = InvalidOid; /* * To drop a partition safely, we must grab exclusive lock on its parent, @@ -2800,6 +2801,14 @@ heap_drop_with_catalog(Oid relid) { parentOid = get_partition_parent(relid); LockRelationOid(parentOid, AccessExclusiveLock); + + /* + * If this is not the default partition, dropping it will change the + * default partition's partition constraint, so we must lock it. + */ + defaultPartOid = get_default_partition_oid(parentOid); + if (OidIsValid(defaultPartOid) && relid != defaultPartOid) + LockRelationOid(defaultPartOid, AccessExclusiveLock); } ReleaseSysCache(tuple); @@ -2851,6 +2860,13 @@ heap_drop_with_catalog(Oid relid) RemovePartitionKeyByRelId(relid); /* + * If the relation being dropped is the default partition itself, + * invalidate its entry in pg_partitioned_table. + */ + if (relid == defaultPartOid) + update_default_partition_oid(parentOid, InvalidOid); + + /* * Schedule unlinking of the relation's physical files at commit. */ if (rel->rd_rel->relkind != RELKIND_VIEW && @@ -2914,6 +2930,14 @@ heap_drop_with_catalog(Oid relid) if (OidIsValid(parentOid)) { /* + * If this is not the default partition, the partition constraint of + * the default partition has changed to include the portion of the key + * space previously covered by the dropped partition. + */ + if (OidIsValid(defaultPartOid) && relid != defaultPartOid) + CacheInvalidateRelcacheByRelid(defaultPartOid); + + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. 
*/ @@ -4250,6 +4274,7 @@ StorePartitionKey(Relation rel, values[Anum_pg_partitioned_table_partrelid - 1] = ObjectIdGetDatum(RelationGetRelid(rel)); values[Anum_pg_partitioned_table_partstrat - 1] = CharGetDatum(strategy); values[Anum_pg_partitioned_table_partnatts - 1] = Int16GetDatum(partnatts); + values[Anum_pg_partitioned_table_partdefid - 1] = ObjectIdGetDatum(InvalidOid); values[Anum_pg_partitioned_table_partattrs - 1] = PointerGetDatum(partattrs_vec); values[Anum_pg_partitioned_table_partclass - 1] = PointerGetDatum(partopclass_vec); values[Anum_pg_partitioned_table_partcollation - 1] = PointerGetDatum(partcollation_vec); @@ -4427,7 +4452,8 @@ RemovePartitionKeyByRelId(Oid relid) * relispartition to true * * Also, invalidate the parent's relcache, so that the next rebuild will load - * the new partition's info into its partition descriptor. + * the new partition's info into its partition descriptor.  If there is a + * default partition, we must invalidate its relcache entry as well. */ void StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) @@ -4438,6 +4464,7 @@ StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) Datum new_val[Natts_pg_class]; bool new_null[Natts_pg_class], new_repl[Natts_pg_class]; + Oid defaultPartOid; /* Update pg_class tuple */ classRel = heap_open(RelationRelationId, RowExclusiveLock); @@ -4475,6 +4502,16 @@ StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) heap_freetuple(newtuple); heap_close(classRel, RowExclusiveLock); + /* + * The partition constraint for the default partition depends on the + * partition bounds of every other partition, so we must invalidate the + * relcache entry for that partition every time a partition is added or + * removed. 
+ */ + defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(parent)); + if (OidIsValid(defaultPartOid)) + CacheInvalidateRelcacheByRelid(defaultPartOid); + CacheInvalidateRelcache(parent); } diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 20450d8a..a4ef01e7 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -27,7 +27,9 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" +#include "catalog/pg_partitioned_table.h" #include "catalog/pg_type.h" +#include "commands/tablecmds.h" #include "executor/executor.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -35,6 +37,7 @@ #include "nodes/parsenodes.h" #include "optimizer/clauses.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/var.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" @@ -80,9 +83,12 @@ typedef struct PartitionBoundInfoData * partitioned table) */ int null_index; /* Index of the null-accepting partition; -1 * if there isn't one */ + int default_index; /* Index of the default partition; -1 if there + * isn't one */ } PartitionBoundInfoData; #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) +#define partition_bound_has_default(bi) ((bi)->default_index != -1) /* * When qsort'ing partition bounds after reading from the catalog, each bound @@ -120,8 +126,10 @@ static void get_range_key_properties(PartitionKey key, int keynum, ListCell **partexprs_item, Expr **keyCol, Const **lower_val, Const **upper_val); -static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec); -static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec); +static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec); +static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default); +static List *get_range_nulltest(PartitionKey key); static List *generate_partition_qual(Relation rel); static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, @@ -162,6 +170,7 @@ RelationBuildPartitionDesc(Relation rel) MemoryContext oldcxt; int ndatums = 0; + int default_index = -1; /* List partitioning specific */ PartitionListValue **all_values = NULL; @@ -222,6 +231,22 @@ RelationBuildPartitionDesc(Relation rel) old_portable_output = set_portable_input(false); #endif boundspec = (Node *) stringToNode(TextDatumGetCString(datum)); + + /* + * Sanity check: If the PartitionBoundSpec says this is the default + * partition, its OID should correspond to whatever's stored in + * pg_partitioned_table.partdefid; if not, the catalog is corrupt. + */ + if (castNode(PartitionBoundSpec, boundspec)->is_default) + { + Oid partdefid; + + partdefid = get_default_partition_oid(RelationGetRelid(rel)); + if (partdefid != inhrelid) + elog(ERROR, "expected partdefid %u, but got %u", + inhrelid, partdefid); + } + #ifdef __TBASE__ set_portable_input(old_portable_output); #endif @@ -258,6 +283,18 @@ RelationBuildPartitionDesc(Relation rel) if (spec->strategy != PARTITION_STRATEGY_LIST) elog(ERROR, "invalid strategy in partition bound spec"); + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the list of non-null + * datums for this partition. 
+ */ + if (spec->is_default) + { + default_index = i; + i++; + continue; + } + foreach(c, spec->listdatums) { Const *val = castNode(Const, lfirst(c)); @@ -340,6 +377,17 @@ RelationBuildPartitionDesc(Relation rel) if (spec->strategy != PARTITION_STRATEGY_RANGE) elog(ERROR, "invalid strategy in partition bound spec"); + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the allbounds array + * for this partition. + */ + if (spec->is_default) + { + default_index = i++; + continue; + } + lower = make_one_range_bound(key, i, spec->lowerdatums, true); upper = make_one_range_bound(key, i, spec->upperdatums, @@ -349,10 +397,12 @@ RelationBuildPartitionDesc(Relation rel) j += 2; i++; } - Assert(j == 2 * nparts); + + Assert(j == nparts * 2 || + (default_index != -1 && j == (nparts - 1) * 2)); /* Sort all the bounds in ascending order */ - qsort_arg(all_bounds, 2 * nparts, + qsort_arg(all_bounds, j, sizeof(PartitionRangeBound *), qsort_partition_rbound_cmp, (void *) key); @@ -453,6 +503,7 @@ RelationBuildPartitionDesc(Relation rel) boundinfo = (PartitionBoundInfoData *) palloc0(sizeof(PartitionBoundInfoData)); boundinfo->strategy = key->strategy; + boundinfo->default_index = -1; boundinfo->ndatums = ndatums; boundinfo->null_index = -1; boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); @@ -505,6 +556,21 @@ RelationBuildPartitionDesc(Relation rel) boundinfo->null_index = mapping[null_index]; } + /* Assign mapped index for the default partition. */ + if (default_index != -1) + { + /* + * The default partition accepts any value not + * specified in the lists of other partitions, hence + * it should not get mapped index while assigning + * those for non-null datums. + */ + Assert(default_index >= 0 && + mapping[default_index] == -1); + mapping[default_index] = next_index++; + boundinfo->default_index = mapping[default_index]; + } + /* All partition must now have a valid mapping */ Assert(next_index == nparts); break; @@ -559,6 +625,14 @@ RelationBuildPartitionDesc(Relation rel) boundinfo->indexes[i] = mapping[orig_index]; } } + + /* Assign mapped index for the default partition. */ + if (default_index != -1) + { + Assert(default_index >= 0 && mapping[default_index] == -1); + mapping[default_index] = next_index++; + boundinfo->default_index = mapping[default_index]; + } boundinfo->indexes[i] = -1; break; } @@ -609,6 +683,9 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->null_index != b2->null_index) return false; + if (b1->default_index != b2->default_index) + return false; + for (i = 0; i < b1->ndatums; i++) { int j; @@ -667,10 +744,24 @@ check_new_partition_bound(char *relname, Relation parent, {// #lizard forgives PartitionKey key = RelationGetPartitionKey(parent); PartitionDesc partdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = partdesc->boundinfo; ParseState *pstate = make_parsestate(NULL); int with = -1; bool overlap = false; + if (spec->is_default) + { + if (boundinfo == NULL || !partition_bound_has_default(boundinfo)) + return; + + /* Default partition already exists, error out. 
*/ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition \"%s\" conflicts with existing default partition \"%s\"", + relname, get_rel_name(partdesc->oids[boundinfo->default_index])), + parser_errposition(pstate, spec->location))); + } + switch (key->strategy) { case PARTITION_STRATEGY_LIST: @@ -679,13 +770,13 @@ check_new_partition_bound(char *relname, Relation parent, if (partdesc->nparts > 0) { - PartitionBoundInfo boundinfo = partdesc->boundinfo; ListCell *cell; Assert(boundinfo && boundinfo->strategy == PARTITION_STRATEGY_LIST && (boundinfo->ndatums > 0 || - partition_bound_accepts_nulls(boundinfo))); + partition_bound_accepts_nulls(boundinfo) || + partition_bound_has_default(boundinfo))); foreach(cell, spec->listdatums) { @@ -750,8 +841,10 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - Assert(boundinfo && boundinfo->ndatums > 0 && - boundinfo->strategy == PARTITION_STRATEGY_RANGE); + Assert(boundinfo && + boundinfo->strategy == PARTITION_STRATEGY_RANGE && + (boundinfo->ndatums > 0 || + partition_bound_has_default(boundinfo))); /* * Test whether the new lower bound (which is treated @@ -828,6 +921,139 @@ check_new_partition_bound(char *relname, Relation parent, } } +/* + * check_default_allows_bound + * + * This function checks if there exists a row in the default partition that + * would properly belong to the new partition being added. If it finds one, + * it throws an error. + */ +void +check_default_allows_bound(Relation parent, Relation default_rel, + PartitionBoundSpec *new_spec) +{ + List *new_part_constraints; + List *def_part_constraints; + List *all_parts; + ListCell *lc; + + new_part_constraints = (new_spec->strategy == PARTITION_STRATEGY_LIST) + ? get_qual_for_list(parent, new_spec) + : get_qual_for_range(parent, new_spec, false); + def_part_constraints = + get_proposed_default_constraint(new_part_constraints); + + /* + * If the existing constraints on the default partition imply that it will + * not contain any row that would belong to the new partition, we can + * avoid scanning the default partition. + */ + if (PartConstraintImpliedByRelConstraint(default_rel, def_part_constraints)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(default_rel)))); + return; + } + + /* + * Scan the default partition and its subpartitions, and check for rows + * that do not satisfy the revised partition constraints. + */ + if (default_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + all_parts = find_all_inheritors(RelationGetRelid(default_rel), + AccessExclusiveLock, NULL); + else + all_parts = list_make1_oid(RelationGetRelid(default_rel)); + + foreach(lc, all_parts) + { + Oid part_relid = lfirst_oid(lc); + Relation part_rel; + Expr *constr; + Expr *partition_constraint; + EState *estate; + HeapTuple tuple; + ExprState *partqualstate = NULL; + Snapshot snapshot; + TupleDesc tupdesc; + ExprContext *econtext; + HeapScanDesc scan; + MemoryContext oldCxt; + TupleTableSlot *tupslot; + + /* Lock already taken above. */ + if (part_relid != RelationGetRelid(default_rel)) + part_rel = heap_open(part_relid, NoLock); + else + part_rel = default_rel; + + /* + * Only RELKIND_RELATION relations (i.e. leaf partitions) need to be + * scanned. 
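A sketch of the behavior check_default_allows_bound enforces, with placeholder names: creating a new partition fails if the default partition already holds a row that belongs in it (the error text follows the errmsg added here).

    CREATE TABLE t (a int) PARTITION BY LIST (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    INSERT INTO t VALUES (5);                            -- lands in t_def
    CREATE TABLE t_5 PARTITION OF t FOR VALUES IN (5);
    -- ERROR:  updated partition constraint for default partition "t_def"
    --         would be violated by some row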
+ */ + if (part_rel->rd_rel->relkind != RELKIND_RELATION) + { + if (part_rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(WARNING, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("skipped scanning foreign table \"%s\" which is a partition of default partition \"%s\"", + RelationGetRelationName(part_rel), + RelationGetRelationName(default_rel)))); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + heap_close(part_rel, NoLock); + + continue; + } + + tupdesc = CreateTupleDescCopy(RelationGetDescr(part_rel)); + constr = linitial(def_part_constraints); + partition_constraint = (Expr *) + map_partition_varattnos((List *) constr, + 1, part_rel, parent, NULL); + estate = CreateExecutorState(); + + /* Build expression execution states for partition check quals */ + partqualstate = ExecPrepareExpr(partition_constraint, estate); + + econtext = GetPerTupleExprContext(estate); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = heap_beginscan(part_rel, snapshot, 0, NULL); + tupslot = MakeSingleTupleTableSlot(tupdesc); + + /* + * Switch to per-tuple memory context and reset it for each tuple + * produced, so we don't leak memory. + */ + oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + ExecStoreTuple(tuple, tupslot, InvalidBuffer, false); + econtext->ecxt_scantuple = tupslot; + + if (!ExecCheck(partqualstate, econtext)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("updated partition constraint for default partition \"%s\" would be violated by some row", + RelationGetRelationName(default_rel)))); + + ResetExprContext(econtext); + CHECK_FOR_INTERRUPTS(); + } + + MemoryContextSwitchTo(oldCxt); + heap_endscan(scan); + UnregisterSnapshot(snapshot); + ExecDropSingleTupleTableSlot(tupslot); + FreeExecutorState(estate); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + heap_close(part_rel, NoLock); /* keep the lock until commit */ + } +} + /* * get_partition_parent * @@ -892,12 +1118,12 @@ get_qual_from_partbound(Relation rel, Relation parent, { case PARTITION_STRATEGY_LIST: Assert(spec->strategy == PARTITION_STRATEGY_LIST); - my_qual = get_qual_for_list(key, spec); + my_qual = get_qual_for_list(parent, spec); break; case PARTITION_STRATEGY_RANGE: Assert(spec->strategy == PARTITION_STRATEGY_RANGE); - my_qual = get_qual_for_range(key, spec); + my_qual = get_qual_for_range(parent, spec, false); break; default: @@ -967,7 +1193,8 @@ RelationGetPartitionQual(Relation rel) * get_partition_qual_relid * * Returns an expression tree describing the passed-in relation's partition - * constraint. + * constraint. If there is no partition constraint returns NULL; this can + * happen if the default partition is the only partition. */ Expr * get_partition_qual_relid(Oid relid) @@ -980,7 +1207,10 @@ get_partition_qual_relid(Oid relid) if (rel->rd_rel->relispartition) { and_args = generate_partition_qual(rel); - if (list_length(and_args) > 1) + + if (and_args == NIL) + result = NULL; + else if (list_length(and_args) > 1) result = makeBoolExpr(AND_EXPR, and_args, -1); else result = linitial(and_args); @@ -1295,10 +1525,14 @@ make_partition_op_expr(PartitionKey key, int keynum, * * Returns an implicit-AND list of expressions to use as a list partition's * constraint, given the partition key and bound structures. + * + * The function returns NIL for a default partition when it's the only + * partition since in that case there is no constraint. 
*/ static List * -get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) -{// #lizard forgives +get_qual_for_list(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); List *result; Expr *keyCol; ArrayExpr *arr; @@ -1325,7 +1559,54 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) else keyCol = (Expr *) copyObject(linitial(key->partexprs)); - /* Create list of Consts for the allowed values, excluding any nulls */ + /* + * For default list partition, collect datums for all the partitions. The + * default partition constraint should check that the partition key is + * equal to none of those. + */ + if (spec->is_default) + { + int i; + int ndatums = 0; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = pdesc->boundinfo; + + if (boundinfo) + { + ndatums = boundinfo->ndatums; + + if (partition_bound_accepts_nulls(boundinfo)) + list_has_null = true; + } + + /* + * If default is the only partition, there need not be any partition + * constraint on it. + */ + if (ndatums == 0 && !list_has_null) + return NIL; + + for (i = 0; i < ndatums; i++) + { + Const *val; + + /* Construct const from datum */ + val = makeConst(key->parttypid[0], + key->parttypmod[0], + key->parttypcoll[0], + key->parttyplen[0], + *boundinfo->datums[i], + false, /* isnull */ + key->parttypbyval[0]); + + arrelems = lappend(arrelems, val); + } + } + else + { + /* + * Create list of Consts for the allowed values, excluding any nulls. + */ foreach(cell, spec->listdatums) { Const *val = castNode(Const, lfirst(cell)); @@ -1335,6 +1616,7 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) else arrelems = lappend(arrelems, copyObject(val)); } + } if (arrelems) { @@ -1397,6 +1679,18 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) result = list_make1(nulltest); } + /* + * Note that, in general, applying NOT to a constraint expression doesn't + * necessarily invert the set of rows it accepts, because NOT (NULL) is + * NULL. However, the partition constraints we construct here never + * evaluate to NULL, so applying NOT works as intended. + */ + if (spec->is_default) + { + result = list_make1(make_ands_explicit(result)); + result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); + } + return result; } @@ -1453,6 +1747,53 @@ get_range_key_properties(PartitionKey key, int keynum, *upper_val = NULL; } +/* + * get_range_nulltest + * + * A non-default range partition table does not currently allow partition + * keys to be null, so emit an IS NOT NULL expression for each key column. 
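The constraint built here can be observed through pg_get_partition_constraintdef. A sketch with placeholder names (the deparsed text may differ slightly): the default partition has no constraint while it is the only partition, and gains the negation of the other partitions' bounds once they exist.

    CREATE TABLE t (a int) PARTITION BY LIST (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    SELECT pg_get_partition_constraintdef('t_def'::regclass);  -- NULL: no constraint yet

    CREATE TABLE t_12 PARTITION OF t FOR VALUES IN (1, 2);
    SELECT pg_get_partition_constraintdef('t_def'::regclass);
    -- roughly: NOT ((a IS NOT NULL) AND (a = ANY (ARRAY[1, 2])))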
+ */ +static List * +get_range_nulltest(PartitionKey key) +{ + List *result = NIL; + NullTest *nulltest; + ListCell *partexprs_item; + int i; + + partexprs_item = list_head(key->partexprs); + for (i = 0; i < key->partnatts; i++) + { + Expr *keyCol; + + if (key->partattrs[i] != 0) + { + keyCol = (Expr *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + keyCol = copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(partexprs_item); + } + + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + result = lappend(result, nulltest); + } + + return result; +} + /* * get_qual_for_range * @@ -1491,12 +1832,16 @@ get_range_key_properties(PartitionKey key, int keynum, * In most common cases with only one partition column, say a, the following * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au * - * If we end up with an empty result list, we return a single-member list - * containing a constant TRUE, because callers expect a non-empty list. + * For default partition, it returns the negation of the constraints of all + * the other partitions. + * + * External callers should pass for_default as false; we set it to true only + * when recursing. */ static List * -get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) -{// #lizard forgives +get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default) +{ List *result = NIL; ListCell *cell1, *cell2, @@ -1506,10 +1851,10 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) j; PartitionRangeDatum *ldatum, *udatum; + PartitionKey key = RelationGetPartitionKey(parent); Expr *keyCol; Const *lower_val, *upper_val; - NullTest *nulltest; List *lower_or_arms, *upper_or_arms; int num_or_arms, @@ -1519,44 +1864,77 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) bool need_next_lower_arm, need_next_upper_arm; - lower_or_start_datum = list_head(spec->lowerdatums); - upper_or_start_datum = list_head(spec->upperdatums); - num_or_arms = key->partnatts; + if (spec->is_default) + { + List *or_expr_args = NIL; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + Oid *inhoids = pdesc->oids; + int nparts = pdesc->nparts, + i; - /* - * A range-partitioned table does not currently allow partition keys to be - * null, so emit an IS NOT NULL expression for each key column. 
- */ - partexprs_item = list_head(key->partexprs); - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < nparts; i++) { - Expr *keyCol; - - if (key->partattrs[i] != 0) + Oid inhrelid = inhoids[i]; + HeapTuple tuple; + Datum datum; + bool isnull; + PartitionBoundSpec *bspec; + + tuple = SearchSysCache1(RELOID, inhrelid); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", inhrelid); + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + + Assert(!isnull); + bspec = (PartitionBoundSpec *) + stringToNode(TextDatumGetCString(datum)); + if (!IsA(bspec, PartitionBoundSpec)) + elog(ERROR, "expected PartitionBoundSpec"); + + if (!bspec->is_default) { - keyCol = (Expr *) makeVar(1, - key->partattrs[i], - key->parttypid[i], - key->parttypmod[i], - key->parttypcoll[i], - 0); + List *part_qual; + + part_qual = get_qual_for_range(parent, bspec, true); + + /* + * AND the constraints of the partition and add to + * or_expr_args + */ + or_expr_args = lappend(or_expr_args, list_length(part_qual) > 1 + ? makeBoolExpr(AND_EXPR, part_qual, -1) + : linitial(part_qual)); } - else + ReleaseSysCache(tuple); + } + + if (or_expr_args != NIL) { - if (partexprs_item == NULL) - elog(ERROR, "wrong number of partition key expressions"); - keyCol = copyObject(lfirst(partexprs_item)); - partexprs_item = lnext(partexprs_item); + /* OR all the non-default partition constraints; then negate it */ + result = lappend(result, + list_length(or_expr_args) > 1 + ? makeBoolExpr(OR_EXPR, or_expr_args, -1) + : linitial(or_expr_args)); + result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); } - nulltest = makeNode(NullTest); - nulltest->arg = keyCol; - nulltest->nulltesttype = IS_NOT_NULL; - nulltest->argisrow = false; - nulltest->location = -1; - result = lappend(result, nulltest); + return result; } + lower_or_start_datum = list_head(spec->lowerdatums); + upper_or_start_datum = list_head(spec->upperdatums); + num_or_arms = key->partnatts; + + /* + * If it is the recursive call for default, we skip the get_range_nulltest + * to avoid accumulating the NullTest on the same keys for each partition. + */ + if (!for_default) + result = get_range_nulltest(key); + /* * Iterate over the key columns and check if the corresponding lower and * upper datums are equal using the btree equality operator for the @@ -1778,9 +2156,16 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) ? makeBoolExpr(OR_EXPR, upper_or_arms, -1) : linitial(upper_or_arms)); - /* As noted above, caller expects the list to be non-empty. */ + /* + * As noted above, for non-default, we return list with constant TRUE. If + * the result is NIL during the recursive call for default, it implies + * this is the only other partition which can hold every value of the key + * except NULL. Hence we return the NullTest result skipped earlier. + */ if (result == NIL) - result = list_make1(makeBoolConst(true, false)); + result = for_default + ? get_range_nulltest(key) + : list_make1(makeBoolConst(true, false)); return result; } @@ -1788,7 +2173,8 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) /* * generate_partition_qual * - * Generate partition predicate from rel's partition bound expression + * Generate partition predicate from rel's partition bound expression. The + * function returns a NIL list if there is no predicate. * * Result expression tree is stored CacheMemoryContext to ensure it survives * as long as the relcache entry. 
But we should be running in a less long-lived @@ -2023,14 +2409,25 @@ get_partition_for_tuple(PartitionDispatch *pd, case PARTITION_STRATEGY_RANGE: { - bool equal = false; + bool equal = false, + range_partkey_has_null = false; int cur_offset; int i; - /* No range includes NULL. */ + /* + * No range includes NULL, so this will be accepted by the + * default partition if there is one, and otherwise + * rejected. + */ for (i = 0; i < key->partnatts; i++) { - if (isnull[i]) + if (isnull[i] && + partition_bound_has_default(partdesc->boundinfo)) + { + range_partkey_has_null = true; + break; + } + else if (isnull[i]) { *failed_at = parent; *failed_slot = slot; @@ -2039,6 +2436,13 @@ get_partition_for_tuple(PartitionDispatch *pd, } } + /* + * No need to search for partition, as the null key will + * be routed to the default partition. + */ + if (range_partkey_has_null) + break; + cur_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, @@ -2046,9 +2450,9 @@ get_partition_for_tuple(PartitionDispatch *pd, &equal); /* - * The offset returned is such that the bound at cur_offset - * is less than or equal to the tuple value, so the bound - * at offset+1 is the upper bound. + * The offset returned is such that the bound at + * cur_offset is less than or equal to the tuple value, so + * the bound at offset+1 is the upper bound. */ cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; } @@ -2061,8 +2465,16 @@ get_partition_for_tuple(PartitionDispatch *pd, /* * cur_index < 0 means we failed to find a partition of this parent. - * cur_index >= 0 means we either found the leaf partition, or the - * next parent to find a partition of. + * Use the default partition, if there is one. + */ + if (cur_index < 0) + cur_index = partdesc->boundinfo->default_index; + + /* + * If cur_index is still less than 0 at this point, there's no + * partition for this tuple. Otherwise, we either found the leaf + * partition, or a child partitioned table through which we have to + * route the tuple. */ if (cur_index < 0) { @@ -2116,6 +2528,8 @@ make_one_range_bound(PartitionKey key, int index, List *datums, bool lower) ListCell *lc; int i; + Assert(datums != NIL); + bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound)); bound->index = index; bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum)); @@ -2352,3 +2766,104 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, return lo; } + +/* + * get_default_oid_from_partdesc + * + * Given a partition descriptor, return the OID of the default partition, if + * one exists; else, return InvalidOid. + */ +Oid +get_default_oid_from_partdesc(PartitionDesc partdesc) +{ + if (partdesc && partdesc->boundinfo && + partition_bound_has_default(partdesc->boundinfo)) + return partdesc->oids[partdesc->boundinfo->default_index]; + + return InvalidOid; +} + +/* + * get_default_partition_oid + * + * Given a relation OID, return the OID of the default partition, if one + * exists. Use get_default_oid_from_partdesc where possible, for + * efficiency. 
+ */ +Oid +get_default_partition_oid(Oid parentId) +{ + HeapTuple tuple; + Oid defaultPartId = InvalidOid; + + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(parentId)); + + if (HeapTupleIsValid(tuple)) + { + Form_pg_partitioned_table part_table_form; + + part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + defaultPartId = part_table_form->partdefid; + } + + ReleaseSysCache(tuple); + return defaultPartId; +} + +/* + * update_default_partition_oid + * + * Update pg_partition_table.partdefid with a new default partition OID. + */ +void +update_default_partition_oid(Oid parentId, Oid defaultPartId) +{ + HeapTuple tuple; + Relation pg_partitioned_table; + Form_pg_partitioned_table part_table_form; + + pg_partitioned_table = heap_open(PartitionedRelationId, RowExclusiveLock); + + tuple = SearchSysCacheCopy1(PARTRELID, ObjectIdGetDatum(parentId)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for partition key of relation %u", + parentId); + + part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + part_table_form->partdefid = defaultPartId; + CatalogTupleUpdate(pg_partitioned_table, &tuple->t_self, tuple); + + heap_freetuple(tuple); + heap_close(pg_partitioned_table, RowExclusiveLock); +} + +/* + * get_proposed_default_constraint + * + * This function returns the negation of new_part_constraints, which + * would be an integral part of the default partition constraints after + * addition of the partition to which the new_part_constraints belongs. + */ +List * +get_proposed_default_constraint(List *new_part_constraints) +{ + Expr *defPartConstraint; + + defPartConstraint = make_ands_explicit(new_part_constraints); + + /* + * Derive the partition constraints of default partition by negating the + * given partition constraints. The partition constraint never evaluates + * to NULL, so negating it like this is safe. 
+ */ + defPartConstraint = makeBoolExpr(NOT_EXPR, + list_make1(defPartConstraint), + -1); + defPartConstraint = + (Expr *) eval_const_expressions(NULL, + (Node *) defPartConstraint); + defPartConstraint = canonicalize_qual(defPartConstraint); + + return list_make1(defPartConstraint); +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index db41c7fe..73e65cd9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -213,6 +213,8 @@ typedef struct AlteredTableInfo bool chgPersistence; /* T if SET LOGGED/UNLOGGED is used */ char newrelpersistence; /* if above is true */ Expr *partition_constraint; /* for attach partition validation */ + /* true, if validating default due to some other attach/detach */ + bool validate_default; /* Objects to rebuild after completing ALTER TYPE operations */ List *changedConstraintOids; /* OIDs of constraints to rebuild */ List *changedConstraintDefs; /* string definitions of same */ @@ -538,11 +540,10 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); -static bool PartConstraintImpliedByRelConstraint(Relation scanrel, - List *partConstraint); static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, List *scanrel_children, - List *partConstraint); + List *partConstraint, + bool validate_default); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); @@ -1006,8 +1007,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, { PartitionBoundSpec *bound; ParseState *pstate; - Oid parentId = linitial_oid(inheritOids); - Relation parent; + Oid parentId = linitial_oid(inheritOids), + defaultPartOid; + Relation parent, + defaultRel = NULL; /* Already have strong enough lock on the parent */ parent = heap_open(parentId, NoLock); @@ -1022,6 +1025,30 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, errmsg("\"%s\" is not partitioned", RelationGetRelationName(parent)))); + /* + * The partition constraint of the default partition depends on the + * partition bounds of every other partition. It is possible that + * another backend might be about to execute a query on the default + * partition table, and that the query relies on previously cached + * default partition constraints. We must therefore take a table lock + * strong enough to prevent all queries on the default partition from + * proceeding until we commit and send out a shared-cache-inval notice + * that will make them update their index lists. + * + * Order of locking: The relation being added won't be visible to + * other backends until it is committed, hence here in + * DefineRelation() the order of locking the default partition and the + * relation being added does not matter. But at all other places we + * need to lock the default relation before we lock the relation being + * added or removed i.e. we should take the lock in same order at all + * the places such that lock parent, lock default partition and then + * lock the partition so as to avoid a deadlock. 
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(parent)); + if (OidIsValid(defaultPartOid)) + defaultRel = heap_open(defaultPartOid, AccessExclusiveLock); + /* Tranform the bound values */ pstate = make_parsestate(NULL); pstate->p_sourcetext = queryString; @@ -1030,14 +1057,31 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, /* * Check first that the new partition's bound is valid and does not - * overlap with any of existing partitions of the parent - note that - * it does not return on error. + * overlap with any of existing partitions of the parent. */ check_new_partition_bound(relname, parent, bound); + /* + * If the default partition exists, its partition constraints will + * change after the addition of this new partition such that it won't + * allow any row that qualifies for this new partition. So, check that + * the existing data in the default partition satisfies the constraint + * as it will exist after adding this partition. + */ + if (OidIsValid(defaultPartOid)) + { + check_default_allows_bound(parent, defaultRel, bound); + /* Keep the lock until commit. */ + heap_close(defaultRel, NoLock); + } + /* Update the pg_class entry. */ StorePartitionBound(rel, parent, bound); + /* Update the default partition oid */ + if (bound->is_default) + update_default_partition_oid(RelationGetRelid(parent), relationId); + heap_close(parent, NoLock); /* @@ -6188,9 +6232,16 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) } if (partqualstate && !ExecCheck(partqualstate, econtext)) + { + if (tab->validate_default) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("updated partition constraint for default partition would be violated by some row"))); + else ereport(ERROR, (errcode(ERRCODE_CHECK_VIOLATION), errmsg("partition constraint is violated by some row"))); + } /* Write the tuple out to the new relation */ if (newrel) @@ -16354,7 +16405,7 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, * Existing constraints includes its check constraints and column-level * NOT NULL constraints and partConstraint describes the partition constraint. */ -static bool +bool PartConstraintImpliedByRelConstraint(Relation scanrel, List *partConstraint) { @@ -16441,7 +16492,8 @@ PartConstraintImpliedByRelConstraint(Relation scanrel, static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, List *scanrel_children, - List *partConstraint) + List *partConstraint, + bool validate_default) { bool found_whole_row; ListCell *lc; @@ -16503,6 +16555,7 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, /* Grab a work queue entry. */ tab = ATGetQueueEntry(wqueue, part_rel); tab->partition_constraint = (Expr *) linitial(my_partconstr); + tab->validate_default = validate_default; /* keep our lock until commit */ if (part_rel != scanrel) @@ -16530,6 +16583,17 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) ObjectAddress address; const char *trigger_name; bool found_whole_row; + Oid defaultPartOid; + List *partBoundConstraint; + + /* + * We must lock the default partition, because attaching a new partition + * will change its partition constraint. 
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + LockRelationOid(defaultPartOid, AccessExclusiveLock); attachrel = heap_openrv(cmd->name, AccessExclusiveLock); @@ -16686,6 +16750,11 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) /* OK to create inheritance. Rest of the checks performed there */ CreateInheritance(attachrel, rel); + /* Update the default partition oid */ + if (cmd->bound->is_default) + update_default_partition_oid(RelationGetRelid(rel), + RelationGetRelid(attachrel)); + /* * Check that the new partition's bound is valid and does not overlap any * of existing partitions of the parent - note that it does not return on @@ -16702,10 +16771,15 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * If the parent itself is a partition, make sure to include its * constraint as well. */ - partConstraint = list_concat(get_qual_from_partbound(attachrel, rel, - cmd->bound), + partBoundConstraint = get_qual_from_partbound(attachrel, rel, cmd->bound); + partConstraint = list_concat(partBoundConstraint, RelationGetPartitionQual(rel)); - partConstraint = (List *) eval_const_expressions(NULL, + + /* Skip validation if there are no constraints to validate. */ + if (partConstraint) + { + partConstraint = + (List *) eval_const_expressions(NULL, (Node *) partConstraint); partConstraint = (List *) canonicalize_qual((Expr *) partConstraint); partConstraint = list_make1(make_ands_explicit(partConstraint)); @@ -16718,11 +16792,40 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) rel, &found_whole_row); /* There can never be a whole-row reference here */ if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); + elog(ERROR, + "unexpected whole-row reference found in partition key"); /* Validate partition constraints against the table being attached. */ ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, - partConstraint); + partConstraint, false); + } + + /* + * Check whether default partition has a row that would fit the partition + * being attached. + */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + { + Relation defaultrel; + List *defaultrel_children; + List *defPartConstraint; + + /* We already have taken a lock on default partition. */ + defaultrel = heap_open(defaultPartOid, NoLock); + defPartConstraint = + get_proposed_default_constraint(partBoundConstraint); + defaultrel_children = + find_all_inheritors(defaultPartOid, + AccessExclusiveLock, NULL); + ValidatePartitionConstraints(wqueue, defaultrel, + defaultrel_children, + defPartConstraint, true); + + /* keep our lock until commit. */ + heap_close(defaultrel, NoLock); + } ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachrel)); @@ -16749,6 +16852,7 @@ ATExecDetachPartition(Relation rel, RangeVar *name) new_null[Natts_pg_class], new_repl[Natts_pg_class]; ObjectAddress address; + Oid defaultPartOid; #ifdef _MLS_ bool schema_bound; Oid partoid; @@ -16773,6 +16877,16 @@ ATExecDetachPartition(Relation rel, RangeVar *name) elog(ERROR, "must be owner of relation %s", NameStr(rel->rd_rel->relname)); } #endif + + /* + * We must lock the default partition, because detaching this partition + * will changing its partition constrant. 
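The attach path mirrors the create path: after a new partition is attached, the default partition (and any of its children) is scanned unless existing constraints make that unnecessary. A sketch with placeholder names, using the error text added to ATRewriteTable above:

    CREATE TABLE t (a int) PARTITION BY RANGE (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    INSERT INTO t VALUES (5);                     -- held by t_def
    CREATE TABLE t_0_10 (a int);
    ALTER TABLE t ATTACH PARTITION t_0_10 FOR VALUES FROM (0) TO (10);
    -- ERROR:  updated partition constraint for default partition would be
    --         violated by some row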
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + LockRelationOid(defaultPartOid, AccessExclusiveLock); + partRel = heap_openrv(name, ShareUpdateExclusiveLock); /* All inheritance related checks are performed within the function */ @@ -16806,6 +16920,24 @@ ATExecDetachPartition(Relation rel, RangeVar *name) heap_freetuple(newtuple); heap_close(classRel, RowExclusiveLock); + if (OidIsValid(defaultPartOid)) + { + /* + * If the detach relation is the default partition itself, invalidate + * its entry in pg_partitioned_table. + */ + if (RelationGetRelid(partRel) == defaultPartOid) + update_default_partition_oid(RelationGetRelid(rel), InvalidOid); + else + { + /* + * We must invalidate default partition's relcache, for the same + * reasons explained in StorePartitionBound(). + */ + CacheInvalidateRelcacheByRelid(defaultPartOid); + } + } + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 6b103aa4..a7e1d32a 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4828,6 +4828,7 @@ _copyPartitionBoundSpec(const PartitionBoundSpec *from) PartitionBoundSpec *newnode = makeNode(PartitionBoundSpec); COPY_SCALAR_FIELD(strategy); + COPY_SCALAR_FIELD(is_default); COPY_NODE_FIELD(listdatums); COPY_NODE_FIELD(lowerdatums); COPY_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 3dbcb393..6efee4a8 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2952,6 +2952,7 @@ static bool _equalPartitionBoundSpec(const PartitionBoundSpec *a, const PartitionBoundSpec *b) { COMPARE_SCALAR_FIELD(strategy); + COMPARE_SCALAR_FIELD(is_default); COMPARE_NODE_FIELD(listdatums); COMPARE_NODE_FIELD(lowerdatums); COMPARE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f550ed23..daf0445f 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -5002,6 +5002,7 @@ _outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node) WRITE_NODE_TYPE("PARTITIONBOUNDSPEC"); WRITE_CHAR_FIELD(strategy); + WRITE_BOOL_FIELD(is_default); WRITE_NODE_FIELD(listdatums); WRITE_NODE_FIELD(lowerdatums); WRITE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index d653fbf3..32c879f7 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -4076,6 +4076,7 @@ _readPartitionBoundSpec(void) READ_LOCALS(PartitionBoundSpec); READ_CHAR_FIELD(strategy); + READ_BOOL_FIELD(is_default); READ_NODE_FIELD(listdatums); READ_NODE_FIELD(lowerdatums); READ_NODE_FIELD(upperdatums); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index ad22456b..41b045c3 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -618,7 +618,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type part_strategy %type part_elem %type part_params -%type ForValues +%type PartitionBoundSpec %type partbound_datum PartitionRangeDatum %type partbound_datum_list range_datum_list @@ -2108,7 +2108,7 @@ alter_group_cmds: partition_cmd: /* ALTER TABLE ATTACH PARTITION FOR VALUES */ - ATTACH PARTITION qualified_name ForValues + ATTACH PARTITION qualified_name PartitionBoundSpec { AlterTableCmd *n = makeNode(AlterTableCmd); PartitionCmd *cmd = 
makeNode(PartitionCmd); @@ -2833,13 +2833,14 @@ alter_identity_column_option: } ; -ForValues: +PartitionBoundSpec: /* a LIST partition */ FOR VALUES IN_P '(' partbound_datum_list ')' { PartitionBoundSpec *n = makeNode(PartitionBoundSpec); n->strategy = PARTITION_STRATEGY_LIST; + n->is_default = false; n->listdatums = $5; n->location = @3; @@ -2852,10 +2853,22 @@ ForValues: PartitionBoundSpec *n = makeNode(PartitionBoundSpec); n->strategy = PARTITION_STRATEGY_RANGE; + n->is_default = false; n->lowerdatums = $5; n->upperdatums = $9; n->location = @3; + $$ = n; + } + + /* a DEFAULT partition */ + | DEFAULT + { + PartitionBoundSpec *n = makeNode(PartitionBoundSpec); + + n->is_default = true; + n->location = @1; + $$ = n; } ; @@ -3417,7 +3430,7 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' $$ = (Node *)n; } | CREATE OptTemp TABLE qualified_name PARTITION OF qualified_name - OptTypedTableElementList ForValues OptPartitionSpec OptWith + OptTypedTableElementList PartitionBoundSpec OptPartitionSpec OptWith OnCommitOption OptTableSpace { CreateStmt *n = makeNode(CreateStmt); @@ -3436,7 +3449,7 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' $$ = (Node *)n; } | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name PARTITION OF - qualified_name OptTypedTableElementList ForValues OptPartitionSpec + qualified_name OptTypedTableElementList PartitionBoundSpec OptPartitionSpec OptWith OnCommitOption OptTableSpace { CreateStmt *n = makeNode(CreateStmt); @@ -5369,7 +5382,7 @@ CreateForeignTableStmt: $$ = (Node *) n; } | CREATE FOREIGN TABLE qualified_name - PARTITION OF qualified_name OptTypedTableElementList ForValues + PARTITION OF qualified_name OptTypedTableElementList PartitionBoundSpec SERVER name create_generic_options { CreateForeignTableStmt *n = makeNode(CreateForeignTableStmt); @@ -5390,7 +5403,7 @@ CreateForeignTableStmt: $$ = (Node *) n; } | CREATE FOREIGN TABLE IF_P NOT EXISTS qualified_name - PARTITION OF qualified_name OptTypedTableElementList ForValues + PARTITION OF qualified_name OptTypedTableElementList PartitionBoundSpec SERVER name create_generic_options { CreateForeignTableStmt *n = makeNode(CreateForeignTableStmt); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index ad70850b..3695d9dc 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -4963,6 +4963,18 @@ transformPartitionBound(ParseState *pstate, Relation parent, /* Avoid scribbling on input */ result_spec = copyObject(spec); + if (spec->is_default) + { + /* + * In case of the default partition, parser had no way to identify the + * partition strategy. Assign the parent's strategy to the default + * partition bound spec. 
+ */ + result_spec->strategy = strategy; + + return result_spec; + } + if (strategy == PARTITION_STRATEGY_LIST) { ListCell *cell; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2ce117ab..2b83875e 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1845,7 +1845,7 @@ pg_get_partition_constraintdef(PG_FUNCTION_ARGS) constr_expr = get_partition_qual_relid(relationId); - /* Quick exit if not a partition */ + /* Quick exit if no partition constraint */ if (constr_expr == NULL) PG_RETURN_NULL(); @@ -9371,6 +9371,12 @@ get_rule_expr(Node *node, deparse_context *context, ListCell *cell; char *sep; + if (spec->is_default) + { + appendStringInfoString(buf, "DEFAULT"); + break; + } + switch (spec->strategy) { case PARTITION_STRATEGY_LIST: diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 6de147c8..266c3c31 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1906,19 +1906,20 @@ describeOneTableDetails(const char *schemaname, parent_name = PQgetvalue(result, 0, 0); partdef = PQgetvalue(result, 0, 1); - if (PQnfields(result) == 3) + if (PQnfields(result) == 3 && !PQgetisnull(result, 0, 2)) partconstraintdef = PQgetvalue(result, 0, 2); printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); - if (partconstraintdef) - { + /* If there isn't any constraint, show that explicitly */ + if (partconstraintdef == NULL || partconstraintdef[0] == '\0') + printfPQExpBuffer(&tmpbuf, _("No partition constraint")); + else printfPQExpBuffer(&tmpbuf, _("Partition constraint: %s"), partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - } PQclear(result); } diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 75511955..4ce5a90e 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -2072,7 +2072,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tables, ""); /* Limited completion support for partition bound specification */ else if (TailMatches3("ATTACH", "PARTITION", MatchAny)) - COMPLETE_WITH_CONST("FOR VALUES"); + COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT"); else if (TailMatches2("FOR", "VALUES")) COMPLETE_WITH_LIST2("FROM (", "IN ("); @@ -2541,7 +2541,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, ""); /* Limited completion support for partition bound specification */ else if (TailMatches3("PARTITION", "OF", MatchAny)) - COMPLETE_WITH_CONST("FOR VALUES"); + COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT"); /* CREATE TABLESPACE */ else if (Matches3("CREATE", "TABLESPACE", MatchAny)) diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 2283c675..454a940a 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -99,4 +99,11 @@ extern int get_partition_for_tuple(PartitionDispatch *pd, EState *estate, PartitionDispatchData **failed_at, TupleTableSlot **failed_slot); +extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); +extern Oid get_default_partition_oid(Oid parentId); +extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); +extern void check_default_allows_bound(Relation parent, Relation defaultRel, + PartitionBoundSpec *new_spec); +extern List *get_proposed_default_constraint(List *new_part_constaints); + #endif /* PARTITION_H */ diff --git 
a/src/include/catalog/pg_partitioned_table.h b/src/include/catalog/pg_partitioned_table.h index bf6e7a52..525e541f 100644 --- a/src/include/catalog/pg_partitioned_table.h +++ b/src/include/catalog/pg_partitioned_table.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * pg_partitioned_table.h - * definition of the system "partitioned table" relation - * along with the relation's initial contents. + * definition of the system "partitioned table" relation + * along with the relation's initial contents. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -10,8 +10,8 @@ * src/include/catalog/pg_partitioned_table.h * * NOTES - * the genbki.sh script reads this file and generates .bki - * information from the DATA() statements. + * the genbki.sh script reads this file and generates .bki + * information from the DATA() statements. * *------------------------------------------------------------------------- */ @@ -21,54 +21,57 @@ #include "catalog/genbki.h" /* ---------------- - * pg_partitioned_table definition. cpp turns this into - * typedef struct FormData_pg_partitioned_table + * pg_partitioned_table definition. cpp turns this into + * typedef struct FormData_pg_partitioned_table * ---------------- */ #define PartitionedRelationId 3350 CATALOG(pg_partitioned_table,3350) BKI_WITHOUT_OIDS { - Oid partrelid; /* partitioned table oid */ - char partstrat; /* partitioning strategy */ - int16 partnatts; /* number of partition key columns */ + Oid partrelid; /* partitioned table oid */ + char partstrat; /* partitioning strategy */ + int16 partnatts; /* number of partition key columns */ + Oid partdefid; /* default partition oid; InvalidOid if there + * isn't one */ - /* - * variable-length fields start here, but we allow direct access to - * partattrs via the C struct. That's because the first variable-length - * field of a heap tuple can be reliably accessed using its C struct - * offset, as previous fields are all non-nullable fixed-length fields. - */ - int2vector partattrs; /* each member of the array is the attribute - * number of a partition key column, or 0 if - * the column is actually an expression */ + /* + * variable-length fields start here, but we allow direct access to + * partattrs via the C struct. That's because the first variable-length + * field of a heap tuple can be reliably accessed using its C struct + * offset, as previous fields are all non-nullable fixed-length fields. + */ + int2vector partattrs; /* each member of the array is the attribute + * number of a partition key column, or 0 if + * the column is actually an expression */ #ifdef CATALOG_VARLEN - oidvector partclass; /* operator class to compare keys */ - oidvector partcollation; /* user-specified collation for keys */ - pg_node_tree partexprs; /* list of expressions in the partition key; - * one item for each zero entry in partattrs[] */ + oidvector partclass; /* operator class to compare keys */ + oidvector partcollation; /* user-specified collation for keys */ + pg_node_tree partexprs; /* list of expressions in the partition key; + * one item for each zero entry in partattrs[] */ #endif } FormData_pg_partitioned_table; /* ---------------- - * Form_pg_partitioned_table corresponds to a pointer to a tuple with - * the format of pg_partitioned_table relation. + * Form_pg_partitioned_table corresponds to a pointer to a tuple with + * the format of pg_partitioned_table relation. 
* ---------------- */ typedef FormData_pg_partitioned_table *Form_pg_partitioned_table; /* ---------------- - * compiler constants for pg_partitioned_table + * compiler constants for pg_partitioned_table * ---------------- */ -#define Natts_pg_partitioned_table 7 -#define Anum_pg_partitioned_table_partrelid 1 -#define Anum_pg_partitioned_table_partstrat 2 -#define Anum_pg_partitioned_table_partnatts 3 -#define Anum_pg_partitioned_table_partattrs 4 -#define Anum_pg_partitioned_table_partclass 5 -#define Anum_pg_partitioned_table_partcollation 6 -#define Anum_pg_partitioned_table_partexprs 7 +#define Natts_pg_partitioned_table 8 +#define Anum_pg_partitioned_table_partrelid 1 +#define Anum_pg_partitioned_table_partstrat 2 +#define Anum_pg_partitioned_table_partnatts 3 +#define Anum_pg_partitioned_table_partdefid 4 +#define Anum_pg_partitioned_table_partattrs 5 +#define Anum_pg_partitioned_table_partclass 6 +#define Anum_pg_partitioned_table_partcollation 7 +#define Anum_pg_partitioned_table_partexprs 8 -#endif /* PG_PARTITIONED_TABLE_H */ +#endif /* PG_PARTITIONED_TABLE_H */ diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index e1a3252c..ea788476 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -18,6 +18,7 @@ #include "catalog/dependency.h" #include "catalog/objectaddress.h" #include "nodes/parsenodes.h" +#include "catalog/partition.h" #include "storage/lock.h" #include "utils/relcache.h" @@ -103,6 +104,8 @@ extern void RangeVarCallbackOwnsTable(const RangeVar *relation, extern void RangeVarCallbackOwnsRelation(const RangeVar *relation, Oid relId, Oid oldRelId, void *noCatalogs); +extern bool PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint); #ifdef _MIGRATE_ extern bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 527cb80d..c508a87d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -876,6 +876,7 @@ typedef struct PartitionBoundSpec NodeTag type; char strategy; /* see PARTITION_STRATEGY codes above */ + bool is_default; /* is it a default partition bound? 
*/ /* Partitioning info for LIST strategy: */ List *listdatums; /* List of Consts (or A_Consts in raw tree) */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index f8f1494d..09a22ad0 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3304,6 +3304,14 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3317,6 +3325,15 @@ ERROR: partition constraint is violated by some row -- should be ok after deleting the bad row DELETE FROM part_2; ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); -- adding constraints that describe the desired partition constraint -- (or more restrictive) will help skip the validation scan CREATE TABLE part_3_4 ( @@ -3332,6 +3349,10 @@ ALTER TABLE list_parted2 DETACH PARTITION part_3_4; ALTER TABLE part_3_4 ALTER a SET NOT NULL; ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); INFO: partition constraint for table "part_3_4" is implied by existing constraints +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints -- check validation when attaching range partitions CREATE TABLE range_parted ( a int, @@ -3357,6 +3378,19 @@ CREATE TABLE part2 ( ); ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); INFO: partition constraint for table "part2" is implied by existing constraints +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted 
ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE part_5 ( @@ -3411,6 +3445,7 @@ ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); INFO: partition constraint for table "part_7_a_null" is implied by existing constraints ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); INFO: partition constraint for table "part_7" is implied by existing constraints +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints -- Same example, but check this time that the constraint correctly detects -- violating rows ALTER TABLE list_parted2 DETACH PARTITION part_7; @@ -3424,7 +3459,20 @@ SELECT tableoid::regclass, a, b FROM part_7 order by a; (2 rows) ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints ERROR: partition constraint is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. +ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); -- check that the table being attached is not already a partition ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ERROR: "part_2" is already a partition @@ -3547,6 +3595,7 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index d4b9bf0e..982e28f0 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -470,6 +470,10 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); ERROR: invalid bound specification for a list partition LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... 
^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" -- specified literal can't be cast to the partition column data type CREATE TABLE bools ( a bool @@ -563,10 +567,15 @@ CREATE TABLE list_parted2 ( ) PARTITION BY LIST (a); CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); ERROR: partition "fail_part" would overlap partition "part_null_z" CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); ERROR: partition "fail_part" would overlap partition "part_ab" +-- check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row CREATE TABLE range_parted2 ( a int ) PARTITION BY RANGE (a); @@ -590,6 +599,16 @@ CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); ERROR: partition "fail_part" would overlap partition "part2" CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); ERROR: partition "fail_part" would overlap partition "part2" +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); -- now check for multi-column range partition key CREATE TABLE range_parted3 ( a int, @@ -603,6 +622,7 @@ CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10) CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); ERROR: partition "fail_part" would overlap partition "part12" +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- cannot create a partition that says column b is allowed to range -- from -infinity to +infinity, while there exist partitions that have -- more specific ranges diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 944336b7..9d5b125e 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -221,17 +221,63 @@ insert into part_null values (null, 0); create table part_ee_ff partition of list_parted for values in ('ee', 'ff') partition by range (b); create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of 
list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (aa, 2). +insert into part_default values (null, 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (null, 2). +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); -- fail insert into part_ee_ff1 values ('EE', 11); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (EE, 11). +insert into part_default_p2 values ('gg', 43); +ERROR: new row for relation "part_default_p2" violates partition constraint +DETAIL: Failing row contains (gg, 43). -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (cc, 1). +insert into part_default values ('gg', 43); +ERROR: no partition of relation "part_default" found for row +DETAIL: Partition key of the failing row contains (b) = (43). -- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; + tableoid | a | b +--------------------+----+---- + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff2 | ff | 11 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(9 rows) + -- Check tuple routing for partitioned tables -- fail insert into range_parted values ('a', 0); @@ -251,6 +297,18 @@ insert into range_parted values ('b', 10); insert into range_parted values ('a'); ERROR: no partition of relation "range_parted" found for row DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, null). +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (b, 10). 
+-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); select tableoid::regclass, * from range_parted order by 1, 2, 3; tableoid | a | b ----------+---+---- @@ -260,7 +318,12 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 -(6 rows) + part_def | c | 10 + part_def | | + part_def | a | + part_def | | 19 + part_def | b | 20 +(11 rows) -- ok insert into list_parted values (null, 1); @@ -276,17 +339,22 @@ DETAIL: Partition key of the failing row contains (b) = (0). insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); select tableoid::regclass, * from list_parted; - tableoid | a | b --------------+----+---- - part_aa_bb | aA | - part_cc_dd | cC | 1 - part_ee_ff1 | ff | 1 - part_ee_ff1 | EE | 1 - part_ee_ff2 | ff | 11 - part_ee_ff2 | EE | 10 - part_null | | 0 - part_null | | 1 -(8 rows) + tableoid | a | b +--------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff1 | EE | 1 + part_ee_ff2 | ff | 11 + part_ee_ff2 | EE | 10 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_null | | 1 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(13 rows) -- some more tests to exercise tuple-routing with multi-level partitioning create table part_gg partition of list_parted for values in ('gg') partition by range (b); @@ -318,6 +386,31 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default + Table "public.part_default" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: list_parted DEFAULT +No partition constraint + +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; + tableoid | a +--------------+---- + part_default | + part_default | 1 + part_default | -1 +(3 rows) + +-- cleanup +drop table list_parted; -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -458,6 +551,36 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +ERROR: no partition of relation 
"mlparted_def" found for row +DETAIL: Partition key of the failing row contains (a) = (70). +insert into mlparted_def1 values (52, 50); +ERROR: new row for relation "mlparted_def1" violates partition constraint +DETAIL: Failing row contains (52, 50, null). +insert into mlparted_def2 values (34, 50); +ERROR: new row for relation "mlparted_def2" violates partition constraint +DETAIL: Failing row contains (34, 50, null). +-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); +select tableoid::regclass, * from mlparted_def; + tableoid | a | b | c +---------------+----+-----+--- + mlparted_def1 | 40 | 100 | + mlparted_def1 | 42 | 100 | + mlparted_def2 | 54 | 50 | + mlparted_defd | 70 | 100 | +(4 rows) + -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 592137e9..66cffedd 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -221,17 +221,63 @@ insert into part_null values (null, 0); create table part_ee_ff partition of list_parted for values in ('ee', 'ff') partition by range (b); create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (aa, 2). +insert into part_default values (null, 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (null, 2). +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); -- fail insert into part_ee_ff1 values ('EE', 11); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (EE, 11). +insert into part_default_p2 values ('gg', 43); +ERROR: new row for relation "part_default_p2" violates partition constraint +DETAIL: Failing row contains (gg, 43). -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (cc, 1). +insert into part_default values ('gg', 43); +ERROR: no partition of relation "part_default" found for row +DETAIL: Partition key of the failing row contains (b) = (43). 
-- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; + tableoid | a | b +--------------------+----+---- + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff2 | ff | 11 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(9 rows) + -- Check tuple routing for partitioned tables -- fail insert into range_parted values ('a', 0); @@ -251,6 +297,18 @@ insert into range_parted values ('b', 10); insert into range_parted values ('a'); ERROR: no partition of relation "range_parted" found for row DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, null). +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (b, 10). +-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); select tableoid::regclass, * from range_parted order by 1, 2, 3; tableoid | a | b ----------+---+---- @@ -260,7 +318,12 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 -(6 rows) + part_def | c | 10 + part_def | | + part_def | a | + part_def | | 19 + part_def | b | 20 +(11 rows) -- ok insert into list_parted values (null, 1); @@ -276,17 +339,22 @@ DETAIL: Partition key of the failing row contains (b) = (0). 
insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); select tableoid::regclass, * from list_parted order by 1,2,3; - tableoid | a | b --------------+----+---- - part_aa_bb | aA | - part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 - part_ee_ff1 | EE | 1 - part_ee_ff1 | ff | 1 - part_ee_ff2 | EE | 10 - part_ee_ff2 | ff | 11 -(8 rows) + tableoid | a | b +--------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff1 | EE | 1 + part_ee_ff2 | ff | 11 + part_ee_ff2 | EE | 10 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_null | | 1 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(13 rows) -- some more tests to exercise tuple-routing with multi-level partitioning create table part_gg partition of list_parted for values in ('gg') partition by range (b); @@ -318,6 +386,31 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default + Table "public.part_default" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: list_parted DEFAULT +No partition constraint + +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; + tableoid | a +--------------+---- + part_default | + part_default | 1 + part_default | -1 +(3 rows) + +-- cleanup +drop table list_parted; -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -458,6 +551,36 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +ERROR: no partition of relation "mlparted_def" found for row +DETAIL: Partition key of the failing row contains (a) = (70). +insert into mlparted_def1 values (52, 50); +ERROR: new row for relation "mlparted_def1" violates partition constraint +DETAIL: Failing row contains (52, 50, null). +insert into mlparted_def2 values (34, 50); +ERROR: new row for relation "mlparted_def2" violates partition constraint +DETAIL: Failing row contains (34, 50, null). 
+-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); +select tableoid::regclass, * from mlparted_def; + tableoid | a | b | c +---------------+----+-----+--- + mlparted_def1 | 40 | 100 | + mlparted_def1 | 42 | 100 | + mlparted_def2 | 54 | 50 | + mlparted_defd | 70 | 100 | +(4 rows) + -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); diff --git a/src/test/regress/expected/plancache.out b/src/test/regress/expected/plancache.out index 6d14b3a2..086f7977 100644 --- a/src/test/regress/expected/plancache.out +++ b/src/test/regress/expected/plancache.out @@ -252,3 +252,29 @@ NOTICE: 3 (1 row) +-- Check that addition or removal of any partition is correctly dealt with by +-- default partition table when it is being used in prepared statement. +create table list_parted (a int) partition by list(a); +create table list_part_null partition of list_parted for values in (null); +create table list_part_1 partition of list_parted for values in (1); +create table list_part_def partition of list_parted default; +prepare pstmt_def_insert (int) as insert into list_part_def values($1); +-- should fail +execute pstmt_def_insert(null); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (null). +execute pstmt_def_insert(1); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (1). +create table list_part_2 partition of list_parted for values in (2); +execute pstmt_def_insert(2); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (2). +alter table list_parted detach partition list_part_null; +-- should be ok +execute pstmt_def_insert(null); +drop table list_part_1; +-- should be ok +execute pstmt_def_insert(1); +drop table list_parted, list_part_null; +deallocate pstmt_def_insert; diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index d490c40c..20bce908 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -77,6 +77,10 @@ mlparted12|f mlparted2|f mlparted3|f mlparted4|f +mlparted_def|f +mlparted_def1|f +mlparted_def2|f +mlparted_defd|f money_data|f num_data|f num_exp_add|t diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index f761e47e..2989db8f 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -218,5 +218,38 @@ ERROR: new row for relation "part_b_10_b_20" violates partition constraint DETAIL: Failing row contains (b, 9). 
-- ok update range_parted set b = b + 1 where b = 10; +-- Creating default partition for range +create table part_def partition of range_parted default; +\d+ part_def + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | | | plain | | +Partition of: range_parted DEFAULT +Partition constraint: (NOT (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20)))) + +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (a, 9). +create table list_parted ( + a text, + b int +) partition by list (a); +create table list_part1 partition of list_parted for values in ('a', 'b'); +create table list_default partition of list_parted default; +insert into list_part1 values ('a', 1); +insert into list_default values ('d', 10); +-- fail +update list_default set a = 'a' where a = 'd'; +ERROR: new row for relation "list_default" violates partition constraint +DETAIL: Failing row contains (a, 10). +-- ok +update list_default set a = 'x' where a = 'd'; -- cleanup drop table range_parted; +drop table list_parted; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 55261e2d..f996ca7a 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2112,6 +2112,13 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg -- check that the new partition won't overlap with an existing partition CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( @@ -2128,6 +2135,15 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); DELETE FROM part_2; ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); + -- adding constraints that describe the desired partition constraint -- (or more restrictive) will help skip the validation scan CREATE TABLE part_3_4 ( @@ -2145,6 +2161,9 @@ ALTER TABLE list_parted2 DETACH PARTITION part_3_4; ALTER TABLE part_3_4 ALTER a SET NOT NULL; ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition 
scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); -- check validation when attaching range partitions CREATE TABLE range_parted ( @@ -2173,6 +2192,21 @@ CREATE TABLE part2 ( ); ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; + +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); + +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); + -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE part_5 ( @@ -2235,6 +2269,18 @@ INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); SELECT tableoid::regclass, a, b FROM part_7 order by a; ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. +ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); + -- check that the table being attached is not already a partition ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); @@ -2330,6 +2376,7 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 51ad8cda..1a74fdd1 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -449,6 +449,10 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); -- trying to specify range for list partitioned table CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + -- specified literal can't be cast to the partition column data type CREATE TABLE bools ( a bool @@ -526,9 +530,13 @@ CREATE TABLE list_parted2 ( ) PARTITION BY LIST (a); CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +-- 
check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); CREATE TABLE range_parted2 ( a int @@ -548,6 +556,17 @@ CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; + +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); + -- now check for multi-column range partition key CREATE TABLE range_parted3 ( a int, @@ -561,6 +580,7 @@ CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- cannot create a partition that says column b is allowed to range -- from -infinity to +infinity, while there exist partitions that have diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 75d801b9..bbfc03c4 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -132,13 +132,39 @@ create table part_ee_ff partition of list_parted for values in ('ee', 'ff') part create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +insert into part_default values (null, 2); +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); + -- fail insert into part_ee_ff1 values ('EE', 11); +insert into part_default_p2 values ('gg', 43); -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); +insert into part_default values ('gg', 43); -- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into 
list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; -- Check tuple routing for partitioned tables @@ -154,8 +180,18 @@ insert into range_parted values ('b', 1); insert into range_parted values ('b', 10); -- fail (partition key (b+0) is null) insert into range_parted values ('a'); -select tableoid::regclass, * from range_parted order by 1, 2, 3; +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); +select tableoid::regclass, * from range_parted order by 1, 2, 3; -- ok insert into list_parted values (null, 1); insert into list_parted (a) values ('aA'); @@ -188,6 +224,18 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; +-- cleanup +drop table list_parted; + -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -269,6 +317,24 @@ create function mlparted5abrtrig_func() returns trigger as $$ begin new.c = 'b'; create trigger mlparted5abrtrig before insert on mlparted5a for each row execute procedure mlparted5abrtrig_func(); insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; + +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +insert into mlparted_def1 values (52, 50); +insert into mlparted_def2 values (34, 50); +-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); + +select tableoid::regclass, * from mlparted_def; -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations diff --git a/src/test/regress/sql/plancache.sql b/src/test/regress/sql/plancache.sql index ca6acc41..c9d9269d 100644 --- a/src/test/regress/sql/plancache.sql +++ b/src/test/regress/sql/plancache.sql @@ -156,3 +156,24 @@ end$$ language plpgsql; select cachebug(); select cachebug(); + +-- Check that addition or removal of any partition is correctly dealt with by +-- default partition table when it is being used in prepared statement. 
+create table list_parted (a int) partition by list(a); +create table list_part_null partition of list_parted for values in (null); +create table list_part_1 partition of list_parted for values in (1); +create table list_part_def partition of list_parted default; +prepare pstmt_def_insert (int) as insert into list_part_def values($1); +-- should fail +execute pstmt_def_insert(null); +execute pstmt_def_insert(1); +create table list_part_2 partition of list_parted for values in (2); +execute pstmt_def_insert(2); +alter table list_parted detach partition list_part_null; +-- should be ok +execute pstmt_def_insert(null); +drop table list_part_1; +-- should be ok +execute pstmt_def_insert(1); +drop table list_parted, list_part_null; +deallocate pstmt_def_insert; diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 0b5b3309..42c5e405 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -125,5 +125,29 @@ update range_parted set b = b - 1 where b = 10; -- ok update range_parted set b = b + 1 where b = 10; +-- Creating default partition for range +create table part_def partition of range_parted default; +\d+ part_def +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; + +create table list_parted ( + a text, + b int +) partition by list (a); +create table list_part1 partition of list_parted for values in ('a', 'b'); +create table list_default partition of list_parted default; +insert into list_part1 values ('a', 1); +insert into list_default values ('d', 10); + +-- fail +update list_default set a = 'a' where a = 'd'; +-- ok +update list_default set a = 'x' where a = 'd'; + -- cleanup drop table range_parted; +drop table list_parted; From dd83fa24abfc48cc67de41df9eb5e48d877bdd58 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 8 Sep 2017 19:04:32 -0400 Subject: [PATCH 191/578] Fix uninitialized-variable bug. map_partition_varattnos() failed to set its found_whole_row output parameter if the given expression list was NIL. This seems to be a pre-existing bug that chanced to be exposed by commit 6f6b99d13. It might be unreachable in v10, but I have little faith in that proposition, so back-patch. Per buildfarm. --- src/backend/catalog/partition.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index a4ef01e7..92054927 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1153,11 +1153,11 @@ map_partition_varattnos(List *expr, int target_varno, Relation partrel, Relation parent, bool *found_whole_row) { - AttrNumber *part_attnos; - bool my_found_whole_row; + bool my_found_whole_row = false; - if (expr == NIL) - return NIL; + if (expr != NIL) + { + AttrNumber *part_attnos; part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), RelationGetDescr(parent), @@ -1168,6 +1168,8 @@ map_partition_varattnos(List *expr, int target_varno, RelationGetDescr(parent)->natts, RelationGetForm(partrel)->reltype, &my_found_whole_row); + } + if (found_whole_row) *found_whole_row = my_found_whole_row; From 2fd6649a29039b4cd1c1eda72cd60f5c4ffcc13a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 14 Sep 2017 10:43:44 -0400 Subject: [PATCH 192/578] Set partitioned_rels appropriately when UNION ALL is used. 
In most cases, this omission won't matter, because the appropriate locks will have been acquired during parse/plan or by AcquireExecutorLocks. But it's a bug all the same. Report by Ashutosh Bapat. Patch by me, reviewed by Amit Langote. Discussion: http://postgr.es/m/CAFjFpRdHb_ZnoDTuBXqrudWXh3H1ibLkr6nHsCFT96fSK4DXtA@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 38 +++++++++++++++++++++++++-- src/backend/optimizer/plan/planner.c | 6 ++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 7a17b7d7..a3f54c14 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1307,14 +1307,35 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *l; List *partitioned_rels = NIL; RangeTblEntry *rte; + bool build_partitioned_rels = false; + /* + * A plain relation will already have a PartitionedChildRelInfo if it is + * partitioned. For a subquery RTE, no PartitionedChildRelInfo exists; we + * collect all partitioned_rels associated with any child. (This assumes + * that we don't need to look through multiple levels of subquery RTEs; if + * we ever do, we could create a PartitionedChildRelInfo with the + * accumulated list of partitioned_rels which would then be found when + * populated our parent rel with paths. For the present, that appears to + * be unnecessary.) + */ rte = planner_rt_fetch(rel->relid, root); + switch (rte->rtekind) + { + case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - partitioned_rels = get_partitioned_child_rels(root, rel->relid); - /* The root partitioned table is included as a child rel */ + partitioned_rels = + get_partitioned_child_rels(root, rel->relid); Assert(list_length(partitioned_rels) >= 1); } + break; + case RTE_SUBQUERY: + build_partitioned_rels = true; + break; + default: + elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); + } /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -1327,6 +1348,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *lcp; /* + * If we need to build partitioned_rels, accumulate the partitioned + * rels for this child. + */ + if (build_partitioned_rels) + { + List *cprels; + + cprels = get_partitioned_child_rels(root, childrel->relid); + partitioned_rels = list_concat(partitioned_rels, + list_copy(cprels)); + } + + /* * If child has an unparameterized cheapest-total path, add that to * the unparameterized Append path we are constructing for the parent. * If not, there's no workable unparameterized path. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index a3af8985..137e77ca 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7800,7 +7800,8 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, * Returns a list of the RT indexes of the partitioned child relations * with rti as the root parent RT index. * - * Note: Only call this function on RTEs known to be partitioned tables. + * Note: This function might get called even for range table entries that + * are not partitioned tables; in such a case, it will simply return NIL. 
*/ List * get_partitioned_child_rels(PlannerInfo *root, Index rti) @@ -7819,9 +7820,6 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) } } - /* The root partitioned table is included as a child rel */ - Assert(list_length(result) >= 1); - return result; } From 7b9b82dda6a304f253939b993c03c90ff0bc0806 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 14 Sep 2017 12:28:50 -0400 Subject: [PATCH 193/578] Make RelationGetPartitionDispatchInfo expand depth-first. With this change, the order of leaf partitions as returned by RelationGetPartitionDispatchInfo should now be the same as the order used by expand_inherited_rtentry. This will make it simpler for future patches to match up the partition dispatch information with the planner data structures. The new code is also, in my opinion anyway, simpler and easier to understand. Amit Langote, reviewed by Amit Khandekar. I also reviewed and made a few cosmetic revisions. Discussion: http://postgr.es/m/d98d4761-5071-1762-501e-0e15047c714b@lab.ntt.co.jp --- src/backend/catalog/partition.c | 220 +++++++++++-------------- src/backend/optimizer/prep/prepunion.c | 7 + 2 files changed, 100 insertions(+), 127 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 92054927..16920224 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -147,6 +147,8 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); /* * RelationBuildPartitionDesc @@ -1224,21 +1226,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* - * Append OIDs of rel's partitions to the list 'partoids' and for each OID, - * append pointer rel to the list 'parents'. - */ -#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \ - do\ - {\ - int i;\ - for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\ - {\ - (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\ - (parents) = lappend((parents), (rel));\ - }\ - } while(0) - /* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree @@ -1255,151 +1242,130 @@ PartitionDispatch * RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids) { + List *pdlist = NIL; PartitionDispatchData **pd; - List *all_parts = NIL, - *all_parents = NIL, - *parted_rels, - *parted_rel_parents; - ListCell *lc1, - *lc2; - int i, - k, - offset; + ListCell *lc; + int i; - /* - * We rely on the relcache to traverse the partition tree to build both - * the leaf partition OIDs list and the array of PartitionDispatch objects - * for the partitioned tables in the tree. That means every partitioned - * table in the tree must be locked, which is fine since we require the - * caller to lock all the partitions anyway. - * - * For every partitioned table in the tree, starting with the root - * partitioned table, add its relcache entry to parted_rels, while also - * queuing its partitions (in the order in which they appear in the - * partition descriptor) to be looked at later in the same loop. This is - * a bit tricky but works because the foreach() macro doesn't fetch the - * next list element until the bottom of the loop. 
- */ - *num_parted = 1; - parted_rels = list_make1(rel); - /* Root partitioned table has no parent, so NULL for parent */ - parted_rel_parents = list_make1(NULL); - APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); - forboth(lc1, all_parts, lc2, all_parents) - { - Oid partrelid = lfirst_oid(lc1); - Relation parent = lfirst(lc2); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) - { - /* - * Already locked by the caller. Note that it is the - * responsibility of the caller to close the below relcache entry, - * once done using the information being collected here (for - * example, in ExecEndModifyTable). - */ - Relation partrel = heap_open(partrelid, NoLock); + *num_parted = 0; + *leaf_part_oids = NIL; - (*num_parted)++; - parted_rels = lappend(parted_rels, partrel); - parted_rel_parents = lappend(parted_rel_parents, parent); - APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach(lc, pdlist) + { + pd[i++] = lfirst(lc); } + + return pd; } /* - * We want to create two arrays - one for leaf partitions and another for - * partitioned tables (including the root table and internal partitions). - * While we only create the latter here, leaf partition array of suitable - * objects (such as, ResultRelInfo) is created by the caller using the - * list of OIDs we return. Indexes into these arrays get assigned in a - * breadth-first manner, whereby partitions of any given level are placed - * consecutively in the respective arrays. + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we mantain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + * + * Note that the order of OIDs of leaf partitions in leaf_part_oids matches + * the order in which the planner's expand_partitioned_rtentry() processes + * them. It's not necessarily the case that the offsets match up exactly, + * because constraint exclusion might prune away some partitions on the + * planner side, whereas we'll always have the complete list; but unpruned + * partitions will appear in the same order in the plan as they are returned + * here. 
*/ - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - *leaf_part_oids = NIL; - i = k = offset = 0; - forboth(lc1, parted_rels, lc2, parted_rel_parents) - { - Relation partrel = lfirst(lc1); - Relation parent = lfirst(lc2); - PartitionKey partkey = RelationGetPartitionKey(partrel); - TupleDesc tupdesc = RelationGetDescr(partrel); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); - int j, - m; - - pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - pd[i]->reldesc = partrel; - pd[i]->key = partkey; - pd[i]->keystate = NIL; - pd[i]->partdesc = partdesc; +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + check_stack_depth(); + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; if (parent != NULL) { /* - * For every partitioned table other than root, we must store a - * tuple table slot initialized with its tuple descriptor and a - * tuple conversion map to convert a tuple from its parent's - * rowtype to its own. That is to make sure that we are looking at - * the correct row using the correct tuple descriptor when - * computing its partition key for tuple routing. + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. */ - pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), tupdesc, gettext_noop("could not convert row type")); } else { /* Not required for the root partitioned table */ - pd[i]->tupslot = NULL; - pd[i]->tupmap = NULL; + pd->tupslot = NULL; + pd->tupmap = NULL; } - pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); /* - * Indexes corresponding to the internal partitions are multiplied by - * -1 to distinguish them from those of leaf partitions. Encountering - * an index >= 0 means we found a leaf partition, which is immediately - * returned as the partition we are looking for. A negative index - * means we found a partitioned table, whose PartitionDispatch object - * is located at the above index multiplied back by -1. Using the - * PartitionDispatch object, search is continued further down the - * partition tree. - */ - m = 0; - for (j = 0; j < partdesc->nparts; j++) - { - Oid partrelid = partdesc->oids[j]; + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. 
+ * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied by -1 is returned as the index + * in the array of PartitionDispatch objects of all partitioned tables in + * the tree. This value is used to continue the search in the next level + * of the partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) { *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd[i]->indexes[j] = k++; + pd->indexes[i] = list_length(*leaf_part_oids) - 1; } else { /* - * offset denotes the number of partitioned tables of upper - * levels including those of the current level. Any partition - * of this table must belong to the next level and hence will - * be placed after the last partitioned table of this level. + * We assume all tables in the partition tree were already locked + * by the caller. */ - pd[i]->indexes[j] = -(1 + offset + m); - m++; - } - } - i++; + Relation partrel = heap_open(partrelid, NoLock); - /* - * This counts the number of partitioned tables at upper levels - * including those of the current level. - */ - offset += m; + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } } - - return pd; } /* Module-local functions */ diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ec3de76b..1d8eb868 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -1644,6 +1644,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) root->append_rel_list = list_concat(root->append_rel_list, appinfos); } +/* + * expand_partitioned_rtentry + * Recursively expand an RTE for a partitioned table. + * + * Note that RelationGetPartitionDispatchInfo will expand partitions in the + * same order as this code. + */ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, From 3878c9057d4c3b013a575a4b19e489f4d2e2d577 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 14:58:27 +0800 Subject: [PATCH 194/578] Expand partitioned table RTEs level by level, without flattening. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 28 ++-- src/backend/optimizer/plan/initsplan.c | 22 ++- src/backend/optimizer/plan/planner.c | 80 ++++++--- src/backend/optimizer/prep/prepunion.c | 224 ++++++++++++++----------- src/include/nodes/relation.h | 8 +- src/test/regress/expected/inherit.out | 22 +++ src/test/regress/expected/join_4.out | 52 ++++++ src/test/regress/sql/inherit.sql | 17 ++ src/test/regress/sql/join.sql | 23 +++ 9 files changed, 344 insertions(+), 132 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index a3f54c14..60f3dd20 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -26,6 +26,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" #include "foreign/fdwapi.h" +#include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #ifdef OPTIMIZER_DEBUG @@ -368,8 +369,8 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel, else if (rte->relkind == RELKIND_PARTITIONED_TABLE) { /* - * A partitioned table without leaf partitions is marked - * as a dummy rel. + * A partitioned table without any partitions is marked as + * a dummy rel. */ set_dummy_rel_pathlist(rel); } @@ -887,6 +888,9 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, int nattrs; ListCell *l; + /* Guard against stack overflow due to overly deep inheritance tree. */ + check_stack_depth(); + Assert(IS_SIMPLE_REL(rel)); /* @@ -1310,25 +1314,23 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, bool build_partitioned_rels = false; /* - * A plain relation will already have a PartitionedChildRelInfo if it is - * partitioned. For a subquery RTE, no PartitionedChildRelInfo exists; we - * collect all partitioned_rels associated with any child. (This assumes - * that we don't need to look through multiple levels of subquery RTEs; if - * we ever do, we could create a PartitionedChildRelInfo with the - * accumulated list of partitioned_rels which would then be found when - * populated our parent rel with paths. For the present, that appears to - * be unnecessary.) + * A root partition will already have a PartitionedChildRelInfo, and a + * non-root partitioned table doesn't need one, because its Append paths + * will get flattened into the parent anyway. For a subquery RTE, no + * PartitionedChildRelInfo exists; we collect all partitioned_rels + * associated with any child. (This assumes that we don't need to look + * through multiple levels of subquery RTEs; if we ever do, we could + * create a PartitionedChildRelInfo with the accumulated list of + * partitioned_rels which would then be found when populated our parent + * rel with paths. For the present, that appears to be unnecessary.) 
*/ rte = planner_rt_fetch(rel->relid, root); switch (rte->rtekind) { case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) - { partitioned_rels = get_partitioned_child_rels(root, rel->relid); - Assert(list_length(partitioned_rels) >= 1); - } break; case RTE_SUBQUERY: build_partitioned_rels = true; diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 7c743fd2..ab0972d5 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "catalog/pg_type.h" +#include "catalog/pg_class.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -634,11 +635,28 @@ create_lateral_join_info(PlannerInfo *root) for (rti = 1; rti < root->simple_rel_array_size; rti++) { RelOptInfo *brel = root->simple_rel_array[rti]; + RangeTblEntry *brte = root->simple_rte_array[rti]; - if (brel == NULL || brel->reloptkind != RELOPT_BASEREL) + if (brel == NULL) + continue; + + /* + * In the case of table inheritance, the parent RTE is directly linked + * to every child table via an AppendRelInfo. In the case of table + * partitioning, the inheritance hierarchy is expanded one level at a + * time rather than flattened. Therefore, an other member rel that is + * a partitioned table may have children of its own, and must + * therefore be marked with the appropriate lateral info so that those + * children eventually get marked also. + */ + Assert(IS_SIMPLE_REL(brel)); + Assert(brte); + if (brel->reloptkind == RELOPT_OTHER_MEMBER_REL && + (brte->rtekind != RTE_RELATION || + brte->relkind != RELKIND_PARTITIONED_TABLE)) continue; - if (root->simple_rte_array[rti]->inh) + if (brte->inh) { foreach(lc, root->append_rel_list) { diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 137e77ca..11454dfb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1213,7 +1213,7 @@ static void inheritance_planner(PlannerInfo *root) {// #lizard forgives Query *parse = root->parse; - int parentRTindex = parse->resultRelation; + int top_parentRTindex = parse->resultRelation; Bitmapset *subqueryRTindexes; Bitmapset *modifiableARIindexes; int nominalRelation = -1; @@ -1231,6 +1231,10 @@ inheritance_planner(PlannerInfo *root) Index rti; RangeTblEntry *parent_rte; List *partitioned_rels = NIL; + PlannerInfo *parent_root; + Query *parent_parse; + Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); + PlannerInfo **parent_roots = NULL; Assert(parse->commandType != CMD_INSERT); @@ -1294,11 +1298,31 @@ inheritance_planner(PlannerInfo *root) * (including the root parent) as child members of the inheritance set do * not appear anywhere else in the plan. The situation is exactly the * opposite in the case of non-partitioned inheritance parent as described - * below. + * below. For the same reason, collect the list of descendant partitioned + * tables to be saved in ModifyTable node, so that executor can lock those + * as well. 
*/ - parent_rte = rt_fetch(parentRTindex, root->parse->rtable); + parent_rte = rt_fetch(top_parentRTindex, root->parse->rtable); if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) - nominalRelation = parentRTindex; + { + nominalRelation = top_parentRTindex; + partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex); + /* The root partitioned table is included as a child rel */ + Assert(list_length(partitioned_rels) >= 1); + } + + /* + * The PlannerInfo for each child is obtained by translating the relevant + * members of the PlannerInfo for its immediate parent, which we find + * using the parent_relid in its AppendRelInfo. We save the PlannerInfo + * for each parent in an array indexed by relid for fast retrieval. Since + * the maximum number of parents is limited by the number of RTEs in the + * query, we use that number to allocate the array. An extra entry is + * needed since relids start from 1. + */ + parent_roots = (PlannerInfo **) palloc0((list_length(parse->rtable) + 1) * + sizeof(PlannerInfo *)); + parent_roots[top_parentRTindex] = root; /* * And now we can get on with generating a plan for each child table. @@ -1312,15 +1336,24 @@ inheritance_planner(PlannerInfo *root) Path *subpath; /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) + if (!bms_is_member(appinfo->parent_relid, parent_relids)) continue; /* + * expand_inherited_rtentry() always processes a parent before any of + * that parent's children, so the parent_root for this relation should + * already be available. + */ + parent_root = parent_roots[appinfo->parent_relid]; + Assert(parent_root != NULL); + parent_parse = parent_root->parse; + + /* * We need a working copy of the PlannerInfo so that we can control * propagation of information back to the main copy. */ subroot = makeNode(PlannerInfo); - memcpy(subroot, root, sizeof(PlannerInfo)); + memcpy(subroot, parent_root, sizeof(PlannerInfo)); /* * Generate modified query with this rel as target. We first apply @@ -1329,15 +1362,15 @@ inheritance_planner(PlannerInfo *root) * then fool around with subquery RTEs. */ subroot->parse = (Query *) - adjust_appendrel_attrs(root, - (Node *) parse, + adjust_appendrel_attrs(parent_root, + (Node *) parent_parse, appinfo); /* * If there are securityQuals attached to the parent, move them to the * child rel (they've already been transformed properly for that). */ - parent_rte = rt_fetch(parentRTindex, subroot->parse->rtable); + parent_rte = rt_fetch(appinfo->parent_relid, subroot->parse->rtable); child_rte = rt_fetch(appinfo->child_relid, subroot->parse->rtable); child_rte->securityQuals = parent_rte->securityQuals; parent_rte->securityQuals = NIL; @@ -1348,7 +1381,7 @@ inheritance_planner(PlannerInfo *root) * executor doesn't need to see the modified copies --- we can just * pass it the original rowMarks list.) 
*/ - subroot->rowMarks = copyObject(root->rowMarks); + subroot->rowMarks = copyObject(parent_root->rowMarks); /* * The append_rel_list likewise might contain references to subquery @@ -1365,7 +1398,7 @@ inheritance_planner(PlannerInfo *root) ListCell *lc2; subroot->append_rel_list = NIL; - foreach(lc2, root->append_rel_list) + foreach(lc2, parent_root->append_rel_list) { AppendRelInfo *appinfo2 = (AppendRelInfo *) lfirst(lc2); @@ -1400,7 +1433,7 @@ inheritance_planner(PlannerInfo *root) ListCell *lr; rti = 1; - foreach(lr, parse->rtable) + foreach(lr, parent_parse->rtable) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(lr); @@ -1447,6 +1480,22 @@ inheritance_planner(PlannerInfo *root) /* hack to mark target relation as an inheritance partition */ subroot->hasInheritedTarget = true; + /* + * If the child is further partitioned, remember it as a parent. Since + * a partitioned table does not have any data, we don't need to create + * a plan for it. We do, however, need to remember the PlannerInfo for + * use when processing its children. + */ + if (child_rte->inh) + { + Assert(child_rte->relkind == RELKIND_PARTITIONED_TABLE); + parent_relids = + bms_add_member(parent_relids, appinfo->child_relid); + parent_roots[appinfo->child_relid] = subroot; + + continue; + } + /* Generate Path(s) for accessing this result relation */ grouping_planner(subroot, true, 0.0 /* retrieve all tuples */ ); @@ -1577,13 +1626,6 @@ inheritance_planner(PlannerInfo *root) Assert(!parse->onConflict); } - if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) - { - partitioned_rels = get_partitioned_child_rels(root, parentRTindex); - /* The root partitioned table is included as a child rel */ - Assert(list_length(partitioned_rels) >= 1); - } - /* Result path must go into outer query's FINAL upperrel */ final_rel = fetch_upper_rel(root, UPPERREL_FINAL, NULL); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 1d8eb868..6057868c 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -104,16 +104,14 @@ static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, PartitionDesc partdesc, - LOCKMODE lockmode, - bool *has_child, List **appinfos, - List **partitioned_child_rels); + PlanRowMark *top_parentrc, LOCKMODE lockmode, + List **appinfos, List **partitioned_child_rels); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, Relation childrel, - bool *has_child, List **appinfos, - List **partitioned_child_rels); + PlanRowMark *top_parentrc, Relation childrel, + List **appinfos, RangeTblEntry **childrte_p, + Index *childRTindex_p); static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, @@ -1427,9 +1425,9 @@ expand_inherited_tables(PlannerInfo *root) ListCell *rl; /* - * expand_inherited_rtentry may add RTEs to parse->rtable; there is no - * need to scan them since they can't have inh=true. So just scan as far - * as the original end of the rtable list. + * expand_inherited_rtentry may add RTEs to parse->rtable. The function is + * expected to recursively handle any RTEs that it creates with inh=true. + * So just scan as far as the original end of the rtable list. 
*/ nrtes = list_length(root->parse->rtable); rl = list_head(root->parse->rtable); @@ -1471,11 +1469,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) Relation oldrelation; LOCKMODE lockmode; List *inhOIDs; - List *appinfos; ListCell *l; - bool has_child; - PartitionedChildRelInfo *pcinfo; - List *partitioned_child_rels = NIL; /* Does RT entry allow inheritance? */ if (!rte->inh) @@ -1546,27 +1540,44 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) oldrelation = heap_open(parentOID, NoLock); /* Scan the inheritance set and expand it */ - appinfos = NIL; - has_child = false; if (RelationGetPartitionDesc(oldrelation) != NULL) { + List *partitioned_child_rels = NIL; + + Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); + /* * If this table has partitions, recursively expand them in the order - * in which they appear in the PartitionDesc. But first, expand the - * parent itself. + * in which they appear in the PartitionDesc. */ - expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, - oldrelation, - &has_child, &appinfos, - &partitioned_child_rels); expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, - RelationGetPartitionDesc(oldrelation), - lockmode, - &has_child, &appinfos, + lockmode, &root->append_rel_list, &partitioned_child_rels); + + /* + * We keep a list of objects in root, each of which maps a root + * partitioned parent RT index to the list of RT indexes of descendant + * partitioned child tables. When creating an Append or a ModifyTable + * path for the parent, we copy the child RT index list verbatim to + * the path so that it could be carried over to the executor so that + * the latter could identify the partitioned child tables. + */ + if (rte->inh && partitioned_child_rels != NIL) + { + PartitionedChildRelInfo *pcinfo; + + pcinfo = makeNode(PartitionedChildRelInfo); + pcinfo->parent_relid = rti; + pcinfo->child_rels = partitioned_child_rels; + root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + } } else { + List *appinfos = NIL; + RangeTblEntry *childrte; + Index childRTindex; + /* * This table has no partitions. Expand any plain inheritance * children in the order the OIDs were returned by @@ -1597,51 +1608,30 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, newrelation, - &has_child, &appinfos, - &partitioned_child_rels); + &appinfos, &childrte, + &childRTindex); /* Close child relations, but keep locks */ if (childOID != parentOID) heap_close(newrelation, NoLock); } - } - - heap_close(oldrelation, NoLock); /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. + * If all the children were temp tables, pretend it's a + * non-inheritance situation; we don't need Append node in that case. + * The duplicate RTE we added for the parent table is harmless, so we + * don't bother to get rid of it; ditto for the useless PlanRowMark + * node. 
*/ - if (!has_child) - { - /* Clear flag before returning */ + if (list_length(appinfos) < 2) rte->inh = false; - return; - } - - /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. - */ - if (partitioned_child_rels != NIL) - { - pcinfo = makeNode(PartitionedChildRelInfo); + else + root->append_rel_list = list_concat(root->append_rel_list, + appinfos); - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } - /* Otherwise, OK to add to root->append_rel_list */ - root->append_rel_list = list_concat(root->append_rel_list, appinfos); + heap_close(oldrelation, NoLock); } /* @@ -1654,15 +1644,35 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, PartitionDesc partdesc, - LOCKMODE lockmode, - bool *has_child, List **appinfos, - List **partitioned_child_rels) + PlanRowMark *top_parentrc, LOCKMODE lockmode, + List **appinfos, List **partitioned_child_rels) { int i; + RangeTblEntry *childrte; + Index childRTindex; + bool has_child = false; + PartitionDesc partdesc = RelationGetPartitionDesc(parentrel); check_stack_depth(); + /* A partitioned table should always have a partition descriptor. */ + Assert(partdesc); + + Assert(parentrte->inh); + + /* First expand the partitioned table itself. */ + expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, + top_parentrc, parentrel, + appinfos, &childrte, &childRTindex); + + /* + * The partitioned table does not have data for itself but still need to + * be locked. Update given list of partitioned children with RTI of this + * partitioned relation. + */ + *partitioned_child_rels = lappend_int(*partitioned_child_rels, + childRTindex); + for (i = 0; i < partdesc->nparts; i++) { Oid childOID = partdesc->oids[i]; @@ -1678,23 +1688,30 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, continue; } + /* We have a real partition. */ + has_child = true; + expand_single_inheritance_child(root, parentrte, parentRTindex, - parentrel, parentrc, childrel, - has_child, appinfos, - partitioned_child_rels); + parentrel, top_parentrc, childrel, + appinfos, &childrte, &childRTindex); /* If this child is itself partitioned, recurse */ if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - expand_partitioned_rtentry(root, parentrte, parentRTindex, - parentrel, parentrc, - RelationGetPartitionDesc(childrel), - lockmode, - has_child, appinfos, - partitioned_child_rels); + expand_partitioned_rtentry(root, childrte, childRTindex, + childrel, top_parentrc, lockmode, + appinfos, partitioned_child_rels); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); } + + /* + * If the partitioned table has no partitions or all the partitions are + * temporary tables from other backends, treat this as non-inheritance + * case. 
+ */ + if (!has_child) + parentrte->inh = false; } /* @@ -1702,16 +1719,31 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, * Expand a single inheritance child, if needed. * * If this is a temp table of another backend, we'll return without doing - * anything at all. Otherwise, we'll set "has_child" to true, build a - * RangeTblEntry and either a PartitionedChildRelInfo or AppendRelInfo as + * anything at all. Otherwise, build a RangeTblEntry and an AppendRelInfo, if * appropriate, plus maybe a PlanRowMark. + * + * We now expand the partition hierarchy level by level, creating a + * corresponding hierarchy of AppendRelInfos and RelOptInfos, where each + * partitioned descendant acts as a parent of its immediate partitions. + * (This is a difference from what older versions of PostgreSQL did and what + * is still done in the case of table inheritance for unpartitioned tables, + * where the hierarchy is flattened during RTE expansion.) + * + * PlanRowMarks still carry the top-parent's RTI, and the top-parent's + * allMarkTypes field still accumulates values from all descendents. + * + * "parentrte" and "parentRTindex" are immediate parent's RTE and + * RTI. "top_parentrc" is top parent's PlanRowMark. + * + * The child RangeTblEntry and its RTI are returned in "childrte_p" and + * "childRTindex_p" resp. */ static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, Relation childrel, - bool *has_child, List **appinfos, - List **partitioned_child_rels) + PlanRowMark *top_parentrc, Relation childrel, + List **appinfos, RangeTblEntry **childrte_p, + Index *childRTindex_p) { Query *parse = root->parse; Oid parentOID = RelationGetRelid(parentrel); @@ -1733,24 +1765,30 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, * restriction clauses, so we don't need to do it here. */ childrte = copyObject(parentrte); + *childrte_p = childrte; childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + /* A partitioned child will need to be expanded further. */ + if (childOID != parentOID && + childrte->relkind == RELKIND_PARTITIONED_TABLE) + childrte->inh = true; + else childrte->inh = false; childrte->requiredPerms = 0; childrte->securityQuals = NIL; parse->rtable = lappend(parse->rtable, childrte); childRTindex = list_length(parse->rtable); + *childRTindex_p = childRTindex; /* - * Build an AppendRelInfo for this parent and child, unless the child is a - * partitioned table. + * We need an AppendRelInfo if paths will be built for the child RTE. If + * childrte->inh is true, then we'll always need to generate append paths + * for it. If childrte->inh is false, we must scan it if it's not a + * partitioned table; but if it is a partitioned table, then it never has + * any data of its own and need not be scanned. */ - if (childrte->relkind != RELKIND_PARTITIONED_TABLE) + if (childrte->relkind != RELKIND_PARTITIONED_TABLE || childrte->inh) { - /* Remember if we saw a real child. */ - if (childOID != parentOID) - *has_child = true; - appinfo = makeNode(AppendRelInfo); appinfo->parent_relid = parentRTindex; appinfo->child_relid = childRTindex; @@ -1780,25 +1818,23 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, appinfo->translated_vars); } } - else - *partitioned_child_rels = lappend_int(*partitioned_child_rels, - childRTindex); /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. 
*/ - if (parentrc) + if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); childrc->rti = childRTindex; - childrc->prti = parentRTindex; - childrc->rowmarkId = parentrc->rowmarkId; + childrc->prti = top_parentrc->rti; + childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ - childrc->markType = select_rowmark_type(childrte, parentrc->strength); + childrc->markType = select_rowmark_type(childrte, + top_parentrc->strength); childrc->allMarkTypes = (1 << childrc->markType); - childrc->strength = parentrc->strength; - childrc->waitPolicy = parentrc->waitPolicy; + childrc->strength = top_parentrc->strength; + childrc->waitPolicy = top_parentrc->waitPolicy; /* * We mark RowMarks for partitioned child tables as parent RowMarks so @@ -1807,8 +1843,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in parent's allMarkTypes */ - parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allMarkTypes */ + top_parentrc->allMarkTypes |= childrc->allMarkTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 99a6325c..04dd12d6 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -2080,10 +2080,10 @@ typedef struct SpecialJoinInfo * * When we expand an inheritable table or a UNION-ALL subselect into an * "append relation" (essentially, a list of child RTEs), we build an - * AppendRelInfo for each non-partitioned child RTE. The list of - * AppendRelInfos indicates which child RTEs must be included when expanding - * the parent, and each node carries information needed to translate Vars - * referencing the parent into Vars referencing that child. + * AppendRelInfo for each child RTE. The list of AppendRelInfos indicates + * which child RTEs must be included when expanding the parent, and each node + * carries information needed to translate Vars referencing the parent into + * Vars referencing that child. * * These structs are kept in the PlannerInfo node's append_rel_list. 
* Note that we just throw all the structs into one list, and scan the diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 03b14328..cabe7df5 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -862,6 +862,28 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+----- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ diff --git a/src/test/regress/expected/join_4.out b/src/test/regress/expected/join_4.out index 39d9419d..c0af4d24 100644 --- a/src/test/regress/expected/join_4.out +++ b/src/test/regress/expected/join_4.out @@ -5648,6 +5648,58 @@ SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = 3000 (1 row) +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. 
+-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +------------------------------------------------------------- + Sort + Output: t1.b, (LEAST(t1.a, t2.a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, t2.a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Hash Join + Output: t2.a, LEAST(t1.a, t2.a, t3.a) + Hash Cond: (t3.b = t2.a) + -> Seq Scan on public.ut1 t3 + Output: t3.a, t3.b, t3.c + -> Hash + Output: t2.a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(21 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 18f3a7b2..ff11dbcb 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -223,6 +223,23 @@ where parted_tab.a = ss.a; select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2; drop table parted_tab; + +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); + +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + +drop table mlparted_tab; drop table some_tab cascade; /* Test multiple inheritance of column defaults */ diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 31475056..a81f4a1b 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1816,6 +1816,29 @@ set enable_hashjoin TO false; EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced 
for a multi-level partitioned table hierarchy. +-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- From 1158170a41577db0a37073c9893493fac587fffa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 15:27:44 +0800 Subject: [PATCH 195/578] Associate partitioning information with each RelOptInfo. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/util/plancat.c | 159 +++++++++++++++++++++++++++ src/backend/optimizer/util/relnode.c | 35 +++++- src/include/nodes/relation.h | 56 +++++++++- 3 files changed, 248 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index a1248e65..23938198 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -79,6 +79,10 @@ static List *get_relation_constraints(PlannerInfo *root, static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); static List *get_relation_statistics(RelOptInfo *rel, Relation relation); +static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation); +static PartitionScheme find_partition_scheme(PlannerInfo *root, Relation rel); +static List **build_baserel_partition_key_exprs(Relation relation, Index varno); #ifdef __TBASE__ static BlockNumber GetIntervalPartitionPages(Relation rel, bool isindex, bool statistic); @@ -470,6 +474,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, /* Collect info about relation's foreign keys, if relevant */ get_relation_foreign_keys(root, rel, relation, inhparent); + /* + * Collect info about relation's partitioning scheme, if any. Only + * inheritance parents may be partitioned. + */ + if (inhparent && relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + set_relation_partition_info(root, rel, relation); + heap_close(relation, NoLock); /* @@ -1956,6 +1967,154 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) return result; } +/* + * set_relation_partition_info + * + * Set partitioning scheme and related information for a partitioned table. 
+ */ +static void +set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation) +{ + PartitionDesc partdesc; + + Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + partdesc = RelationGetPartitionDesc(relation); + rel->part_scheme = find_partition_scheme(root, relation); + Assert(partdesc != NULL && rel->part_scheme != NULL); + rel->boundinfo = partdesc->boundinfo; + rel->nparts = partdesc->nparts; + rel->partexprs = build_baserel_partition_key_exprs(relation, rel->relid); +} + +/* + * find_partition_scheme + * + * Find or create a PartitionScheme for this Relation. + */ +static PartitionScheme +find_partition_scheme(PlannerInfo *root, Relation relation) +{ + PartitionKey partkey = RelationGetPartitionKey(relation); + ListCell *lc; + int partnatts; + PartitionScheme part_scheme; + + /* A partitioned table should have a partition key. */ + Assert(partkey != NULL); + + partnatts = partkey->partnatts; + + /* Search for a matching partition scheme and return if found one. */ + foreach(lc, root->part_schemes) + { + part_scheme = lfirst(lc); + + /* Match partitioning strategy and number of keys. */ + if (partkey->strategy != part_scheme->strategy || + partnatts != part_scheme->partnatts) + continue; + + /* Match the partition key types. */ + if (memcmp(partkey->partopfamily, part_scheme->partopfamily, + sizeof(Oid) * partnatts) != 0 || + memcmp(partkey->partopcintype, part_scheme->partopcintype, + sizeof(Oid) * partnatts) != 0 || + memcmp(partkey->parttypcoll, part_scheme->parttypcoll, + sizeof(Oid) * partnatts) != 0) + continue; + + /* + * Length and byval information should match when partopcintype + * matches. + */ + Assert(memcmp(partkey->parttyplen, part_scheme->parttyplen, + sizeof(int16) * partnatts) == 0); + Assert(memcmp(partkey->parttypbyval, part_scheme->parttypbyval, + sizeof(bool) * partnatts) == 0); + + /* Found matching partition scheme. */ + return part_scheme; + } + + /* + * Did not find matching partition scheme. Create one copying relevant + * information from the relcache. Instead of copying whole arrays, copy + * the pointers in relcache. It's safe to do so since + * RelationClearRelation() wouldn't change it while planner is using it. + */ + part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); + part_scheme->strategy = partkey->strategy; + part_scheme->partnatts = partkey->partnatts; + part_scheme->partopfamily = partkey->partopfamily; + part_scheme->partopcintype = partkey->partopcintype; + part_scheme->parttypcoll = partkey->parttypcoll; + part_scheme->parttyplen = partkey->parttyplen; + part_scheme->parttypbyval = partkey->parttypbyval; + + /* Add the partitioning scheme to PlannerInfo. */ + root->part_schemes = lappend(root->part_schemes, part_scheme); + + return part_scheme; +} + +/* + * build_baserel_partition_key_exprs + * + * Collects partition key expressions for a given base relation. Any single + * column partition keys are converted to Var nodes. All Var nodes are set + * to the given varno. The partition key expressions are returned as an array + * of single element lists to be stored in RelOptInfo of the base relation. + */ +static List ** +build_baserel_partition_key_exprs(Relation relation, Index varno) +{ + PartitionKey partkey = RelationGetPartitionKey(relation); + int partnatts; + int cnt; + List **partexprs; + ListCell *lc; + + /* A partitioned table should have a partition key. 
*/ + Assert(partkey != NULL); + + partnatts = partkey->partnatts; + partexprs = (List **) palloc(sizeof(List *) * partnatts); + lc = list_head(partkey->partexprs); + + for (cnt = 0; cnt < partnatts; cnt++) + { + Expr *partexpr; + AttrNumber attno = partkey->partattrs[cnt]; + + if (attno != InvalidAttrNumber) + { + /* Single column partition key is stored as a Var node. */ + Assert(attno > 0); + + partexpr = (Expr *) makeVar(varno, attno, + partkey->parttypid[cnt], + partkey->parttypmod[cnt], + partkey->parttypcoll[cnt], 0); + } + else + { + if (lc == NULL) + elog(ERROR, "wrong number of partition key expressions"); + + /* Re-stamp the expression with given varno. */ + partexpr = (Expr *) copyObject(lfirst(lc)); + ChangeVarNodes((Node *) partexpr, 1, varno, 0); + lc = lnext(lc); + } + + partexprs[cnt] = list_make1(partexpr); + } + + return partexprs; +} + #ifdef __TBASE__ /* Get statistic/physical page num of interval partition table or its index */ static BlockNumber diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 9fba700e..0ada588b 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -155,6 +155,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->baserestrict_min_security = UINT_MAX; rel->joininfo = NIL; rel->has_eclass_joins = false; + rel->part_scheme = NULL; + rel->nparts = 0; + rel->boundinfo = NULL; + rel->part_rels = NULL; + rel->partexprs = NULL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -273,18 +278,41 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) if (rte->inh) { ListCell *l; + int nparts = rel->nparts; + int cnt_parts = 0; + + if (nparts > 0) + rel->part_rels = (RelOptInfo **) + palloc(sizeof(RelOptInfo *) * nparts); foreach(l, root->append_rel_list) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + RelOptInfo *childrel; /* append_rel_list contains all append rels; ignore others */ if (appinfo->parent_relid != relid) continue; - (void) build_simple_rel(root, appinfo->child_relid, + childrel = build_simple_rel(root, appinfo->child_relid, rel); + + /* Nothing more to do for an unpartitioned table. */ + if (!rel->part_scheme) + continue; + + /* + * The order of partition OIDs in append_rel_list is the same as + * the order in the PartitionDesc, so the order of part_rels will + * also match the PartitionDesc. See expand_partitioned_rtentry. + */ + Assert(cnt_parts < nparts); + rel->part_rels[cnt_parts] = childrel; + cnt_parts++; } + + /* We should have seen all the child partitions. */ + Assert(cnt_parts == nparts); } return rel; @@ -582,6 +610,11 @@ build_join_rel(PlannerInfo *root, joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; joinrel->top_parent_relids = NULL; + joinrel->part_scheme = NULL; + joinrel->nparts = 0; + joinrel->boundinfo = NULL; + joinrel->part_rels = NULL; + joinrel->partexprs = NULL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 04dd12d6..31bdde3e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -360,6 +360,9 @@ typedef struct PlannerInfo List *distinct_pathkeys; /* distinctClause pathkeys, if any */ List *sort_pathkeys; /* sortClause pathkeys, if any */ + List *part_schemes; /* Canonicalised partition schemes used in the + * query. 
*/ + List *initial_rels; /* RelOptInfos we are now trying to join */ /* Use fetch_upper_rel() to get any particular upper rel */ @@ -440,6 +443,34 @@ typedef struct PlannerInfo ((root)->simple_rte_array ? (root)->simple_rte_array[rti] : \ rt_fetch(rti, (root)->parse->rtable)) +/* + * If multiple relations are partitioned the same way, all such partitions + * will have a pointer to the same PartitionScheme. A list of PartitionScheme + * objects is attached to the PlannerInfo. By design, the partition scheme + * incorporates only the general properties of the partition method (LIST vs. + * RANGE, number of partitioning columns and the type information for each) + * and not the specific bounds. + * + * We store the opclass-declared input data types instead of the partition key + * datatypes since the former rather than the latter are used to compare + * partition bounds. Since partition key data types and the opclass declared + * input data types are expected to be binary compatible (per ResolveOpClass), + * both of those should have same byval and length properties. + */ +typedef struct PartitionSchemeData +{ + char strategy; /* partition strategy */ + int16 partnatts; /* number of partition attributes */ + Oid *partopfamily; /* OIDs of operator families */ + Oid *partopcintype; /* OIDs of opclass declared input data types */ + Oid *parttypcoll; /* OIDs of collations of partition keys. */ + + /* Cached information about partition key data types. */ + int16 *parttyplen; + bool *parttypbyval; +} PartitionSchemeData; + +typedef struct PartitionSchemeData *PartitionScheme; /*---------- * RelOptInfo @@ -570,7 +601,7 @@ typedef struct PlannerInfo * other rels for which we have tried and failed to prove * this one unique * - * The presence of the remaining fields depends on the restrictions + * The presence of the following fields depends on the restrictions * and joins that the relation participates in: * * baserestrictinfo - List of RestrictInfo nodes, containing info about @@ -601,6 +632,21 @@ typedef struct PlannerInfo * We store baserestrictcost in the RelOptInfo (for base relations) because * we know we will need it at least once (to price the sequential scan) * and may need it multiple times to price index scans. + * + * If the relation is partitioned, these fields will be set: + * + * part_scheme - Partitioning scheme of the relation + * boundinfo - Partition bounds + * nparts - Number of partitions + * part_rels - RelOptInfos for each partition + * partexprs - Partition key expressions + * + * Note: A base relation always has only one set of partition keys, but a join + * relation may have as many sets of partition keys as the number of relations + * being joined. partexprs is an array containing part_scheme->partnatts + * elements, each of which is a list of partition key expressions. For a base + * relation each list contains only one expression, but for a join relation + * there can be one per baserel. *---------- */ typedef enum RelOptKind @@ -706,6 +752,14 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ + + /* used for partitioned relations */ + PartitionScheme part_scheme; /* Partitioning scheme. */ + int nparts; /* number of partitions */ + struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ + struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, + * stored in the same order of bounds */ + List **partexprs; /* Partition key expressions. 
*/ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ From a32140c9839857fdfb2bb4986b460e127d12d84f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 12:19:40 -0400 Subject: [PATCH 196/578] Improve error message when skipping scan of default partition. It seems like a good idea to clearly distinguish between skipping the scan of the new partition itself and skipping the scan of the default partition. Amit Langote Discussion: http://postgr.es/m/1f08b844-0078-aa8d-452e-7af3bf77d05f@lab.ntt.co.jp --- src/backend/commands/tablecmds.c | 5 +++++ src/test/regress/expected/alter_table.out | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 73e65cd9..e6455b86 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16507,9 +16507,14 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, */ if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) { + if (!validate_default) ereport(INFO, (errmsg("partition constraint for table \"%s\" is implied by existing constraints", RelationGetRelationName(scanrel)))); + else + ereport(INFO, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(scanrel)))); return; } diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 09a22ad0..93f5db12 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3445,7 +3445,7 @@ ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); INFO: partition constraint for table "part_7_a_null" is implied by existing constraints ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); INFO: partition constraint for table "part_7" is implied by existing constraints -INFO: partition constraint for table "list_parted2_def" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints -- Same example, but check this time that the constraint correctly detects -- violating rows ALTER TABLE list_parted2 DETACH PARTITION part_7; @@ -3459,7 +3459,7 @@ SELECT tableoid::regclass, a, b FROM part_7 order by a; (2 rows) ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); -INFO: partition constraint for table "list_parted2_def" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints ERROR: partition constraint is violated by some row -- check that leaf partitions of default partition are scanned when -- attaching a partitioned table. From d68e653f593cdfc8d792b1cf62d409e0d4daf26f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 13:06:46 -0400 Subject: [PATCH 197/578] On attach, consider skipping validation of subpartitions individually. If the table attached as a partition is itself partitioned, individual partitions might have constraints strong enough to skip scanning the table even if the table actually attached does not. This is pretty cheap to check, and possibly a big win if it works out. Amit Langote, with test case changes by me. 
Discussion: http://postgr.es/m/1f08b844-0078-aa8d-452e-7af3bf77d05f@lab.ntt.co.jp --- src/backend/commands/tablecmds.c | 15 +++++++++++++++ src/test/regress/expected/alter_table.out | 14 ++++++++++++++ src/test/regress/sql/alter_table.sql | 14 ++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index e6455b86..360fd0ee 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16555,6 +16555,21 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, /* There can never be a whole-row reference here */ if (found_whole_row) elog(ERROR, "unexpected whole-row reference found in partition key"); + + /* Can we skip scanning this part_rel? */ + if (PartConstraintImpliedByRelConstraint(part_rel, my_partconstr)) + { + if (!validate_default) + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + else + ereport(INFO, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + heap_close(part_rel, NoLock); + continue; + } } /* Grab a work queue entry. */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 93f5db12..2b11b685 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3483,6 +3483,20 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- If the partitioned table being attached does not have a constraint that +-- would allow validation scan to be skipped, but an individual partition +-- does, then the partition's validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux; -- -- DETACH PARTITION -- diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index f996ca7a..8b73bdf4 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2288,6 +2288,20 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +-- If the partitioned table being attached does not have a constraint that +-- would allow validation scan to be skipped, but an individual partition +-- does, then the partition's validation scan is skipped. 
+CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux; + -- -- DETACH PARTITION -- From 953d7eb5d8f9fa71e76ee0738e94b03680626f92 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 13:21:50 -0400 Subject: [PATCH 198/578] On CREATE TABLE, consider skipping validation of subpartitions. This is just like commit 14f67a8ee282ebc0de78e773fbd597f460ab4a54, but for CREATE PARTITION rather than ATTACH PARTITION. Jeevan Ladhe, with test case changes by me. Discussion: http://postgr.es/m/CAOgcT0MWwG8WBw8frFMtRYHAgDD=tpt6U7WcsO_L2k0KYpm4Jg@mail.gmail.com --- src/backend/catalog/partition.c | 18 ++++++++++++++++++ src/test/regress/expected/alter_table.out | 12 +++++++++--- src/test/regress/sql/alter_table.sql | 11 ++++++++--- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 16920224..70a3d6d8 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -986,7 +986,25 @@ check_default_allows_bound(Relation parent, Relation default_rel, /* Lock already taken above. */ if (part_relid != RelationGetRelid(default_rel)) + { part_rel = heap_open(part_relid, NoLock); + + /* + * If the partition constraints on default partition child imply + * that it will not contain any row that would belong to the new + * partition, we can avoid scanning the child table. + */ + if (PartConstraintImpliedByRelConstraint(part_rel, + def_part_constraints)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + + heap_close(part_rel, NoLock); + continue; + } + } else part_rel = default_rel; diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 2b11b685..1748add2 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3483,9 +3483,10 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". --- If the partitioned table being attached does not have a constraint that --- would allow validation scan to be skipped, but an individual partition --- does, then the partition's validation scan is skipped. +-- If a partitioned table being created or an existing table being attached +-- as a paritition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); CREATE TABLE quuux_default1 PARTITION OF quuux_default ( @@ -3496,6 +3497,11 @@ ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! 
CREATE TABLE quuux2 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +INFO: partition constraint for table "quuux_default1" is implied by existing constraints DROP TABLE quuux; -- -- DETACH PARTITION diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 8b73bdf4..e2c0219e 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2288,9 +2288,10 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); --- If the partitioned table being attached does not have a constraint that --- would allow validation scan to be skipped, but an individual partition --- does, then the partition's validation scan is skipped. +-- If a partitioned table being created or an existing table being attached +-- as a paritition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); CREATE TABLE quuux_default1 PARTITION OF quuux_default ( @@ -2300,6 +2301,10 @@ CREATE TABLE quuux1 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! CREATE TABLE quuux2 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); DROP TABLE quuux; -- From 15ca8404ae2b66667a4e767d50e3496d97882432 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 16:29:14 +0800 Subject: [PATCH 199/578] Basic partition-wise join functionality. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- .../postgres_fdw/expected/postgres_fdw.out | 120 ++ contrib/postgres_fdw/sql/postgres_fdw.sql | 53 + doc/src/sgml/config.sgml | 20 + doc/src/sgml/fdwhandler.sgml | 20 + src/backend/optimizer/README | 26 + src/backend/optimizer/geqo/geqo_eval.c | 3 + src/backend/optimizer/path/allpaths.c | 238 ++- src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/path/joinpath.c | 102 +- src/backend/optimizer/path/joinrels.c | 316 ++- src/backend/optimizer/plan/createplan.c | 35 +- src/backend/optimizer/plan/planner.c | 23 + src/backend/optimizer/plan/setrefs.c | 59 +- src/backend/optimizer/prep/prepunion.c | 96 + src/backend/optimizer/util/pathnode.c | 364 ++++ src/backend/optimizer/util/placeholder.c | 58 + src/backend/optimizer/util/plancat.c | 32 +- src/backend/optimizer/util/relnode.c | 368 +++- src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/foreign/fdwapi.h | 6 + src/include/nodes/extensible.h | 3 + src/include/nodes/relation.h | 51 +- src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 6 + src/include/optimizer/paths.h | 5 + src/include/optimizer/placeholder.h | 14 +- src/include/optimizer/planner.h | 2 + src/include/optimizer/prep.h | 16 +- src/test/regress/expected/partition_join.out | 1789 +++++++++++++++++ src/test/regress/expected/sysviews.out | 1 + src/test/regress/expected/sysviews_1.out | 1 + src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_join.sql | 354 ++++ 35 files changed, 4075 insertions(+), 121 deletions(-) create mode 100644 src/test/regress/expected/partition_join.out create mode 100644 src/test/regress/sql/partition_join.sql diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 77a6e2ce..09aee7c5 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -7346,3 +7346,123 @@ AND ftoptions @> array['fetch_size=60000']; (1 row) ROLLBACK; +-- =================================================================== +-- test partition-wise-joins +-- =================================================================== +SET enable_partition_wise_join=on; +CREATE TABLE fprt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE fprt1_p1 (LIKE fprt1); +CREATE TABLE fprt1_p2 (LIKE fprt1); +INSERT INTO fprt1_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 2) i; +INSERT INTO fprt1_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 2) i; +CREATE FOREIGN TABLE ftprt1_p1 PARTITION OF fprt1 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt1_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt1_p2 PARTITION OF fprt1 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (TABLE_NAME 'fprt1_p2'); +ANALYZE fprt1; +ANALYZE fprt1_p1; +ANALYZE fprt1_p2; +CREATE TABLE fprt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE fprt2_p1 (LIKE fprt2); +CREATE TABLE fprt2_p2 (LIKE fprt2); +INSERT INTO fprt2_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 3) i; +INSERT INTO fprt2_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 3) i; +CREATE FOREIGN TABLE ftprt2_p1 PARTITION OF fprt2 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt2_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt2_p2 PARTITION OF 
fprt2 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (table_name 'fprt2_p2', use_remote_estimate 'true'); +ANALYZE fprt2; +ANALYZE fprt2_p1; +ANALYZE fprt2_p2; +-- inner join three tables +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t3.c + -> Append + -> Foreign Scan + Relations: ((public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2)) INNER JOIN (public.ftprt1_p1 t3) + -> Foreign Scan + Relations: ((public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2)) INNER JOIN (public.ftprt1_p2 t3) +(7 rows) + +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + a | b | c +-----+-----+------ + 0 | 0 | 0000 + 150 | 150 | 0003 + 250 | 250 | 0005 + 400 | 400 | 0008 +(4 rows) + +-- left outer join + nullable clasue +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, ftprt2_p1.b, ftprt2_p1.c + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) LEFT JOIN (public.ftprt2_p1 fprt2) +(5 rows) + +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + a | b | c +---+---+------ + 0 | 0 | 0000 + 2 | | + 4 | | + 6 | 6 | 0000 + 8 | | +(5 rows) + +-- with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: ((t1.*)::fprt1), ((t2.*)::fprt2) + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2) + -> Foreign Scan + Relations: (public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2) +(7 rows) + +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + t1 | t2 +----------------+---------------- + (0,0,0000) | (0,0,0000) + (150,150,0003) | (150,150,0003) + (250,250,0005) | (250,250,0005) + (400,400,0008) | (400,400,0008) +(4 rows) + +-- join with lateral reference +EXPLAIN (COSTS OFF) +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t1.b + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2) + -> Foreign Scan + Relations: (public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2) +(7 rows) + +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + a | b +-----+----- + 0 | 0 + 150 | 150 + 250 | 250 + 400 | 400 +(4 rows) + +RESET enable_partition_wise_join; diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 5048bff6..471bceae 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ 
b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -1764,3 +1764,56 @@ WHERE ftrelid = 'table30000'::regclass AND ftoptions @> array['fetch_size=60000']; ROLLBACK; + +-- =================================================================== +-- test partition-wise-joins +-- =================================================================== +SET enable_partition_wise_join=on; + +CREATE TABLE fprt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE fprt1_p1 (LIKE fprt1); +CREATE TABLE fprt1_p2 (LIKE fprt1); +INSERT INTO fprt1_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 2) i; +INSERT INTO fprt1_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 2) i; +CREATE FOREIGN TABLE ftprt1_p1 PARTITION OF fprt1 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt1_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt1_p2 PARTITION OF fprt1 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (TABLE_NAME 'fprt1_p2'); +ANALYZE fprt1; +ANALYZE fprt1_p1; +ANALYZE fprt1_p2; + +CREATE TABLE fprt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE fprt2_p1 (LIKE fprt2); +CREATE TABLE fprt2_p2 (LIKE fprt2); +INSERT INTO fprt2_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 3) i; +INSERT INTO fprt2_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 3) i; +CREATE FOREIGN TABLE ftprt2_p1 PARTITION OF fprt2 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt2_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt2_p2 PARTITION OF fprt2 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (table_name 'fprt2_p2', use_remote_estimate 'true'); +ANALYZE fprt2; +ANALYZE fprt2_p1; +ANALYZE fprt2_p2; + +-- inner join three tables +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + +-- left outer join + nullable clasue +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + +-- with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + +-- join with lateral reference +EXPLAIN (COSTS OFF) +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + +RESET enable_partition_wise_join; diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e3c5bab5..939ddd85 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3705,6 +3705,26 @@ ANY num_sync ( + enable_partition_wise_join (boolean) + + enable_partition_wise_join configuration parameter + + + + + Enables or disables the query planner's use of partition-wise join, + which allows a join between partitioned tables to be performed by + 
joining the matching partitions. Partition-wise join currently applies
+ only when the join conditions include all the partition keys, which
+ must be of the same data type and have exactly matching sets of child
+ partitions. Because partition-wise join planning can use significantly
+ more CPU time and memory during planning, the default is
+ off.
+
+
+
+
 enable_seqscan (boolean)
diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml
index cfa68084..cab6ade2 100644
--- a/doc/src/sgml/fdwhandler.sgml
+++ b/doc/src/sgml/fdwhandler.sgml
@@ -1289,6 +1289,26 @@ ShutdownForeignScan(ForeignScanState *node);
+
+ FDW Routines For reparameterization of paths
+
+
+List *
+ReparameterizeForeignPathByChild(PlannerInfo *root, List *fdw_private,
+ RelOptInfo *child_rel);
+
+ This function is called while converting a path parameterized by the
+ top-most parent of the given child relation child_rel to be
+ parameterized by the child relation. The function is used to reparameterize
+ any paths or translate any expression nodes saved in the given
+ fdw_private member of a ForeignPath. The
+ callback may use reparameterize_path_by_child,
+ adjust_appendrel_attrs or
+ adjust_appendrel_attrs_multilevel as required.
+
+
+
diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README
index fc0fca41..273e5914 100644
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -1076,3 +1076,29 @@ be desirable to postpone the Gather stage until as near to the top of the
plan as possible. Expanding the range of cases in which more work can be
pushed below the Gather (and costing them accurately) is likely to keep us
busy for a long time to come.
+
+Partition-wise joins
+--------------------
+A join between two similarly partitioned tables can be broken down into joins
+between their matching partitions if there exists an equi-join condition
+between the partition keys of the joining tables. The equi-join between
+partition keys implies that all join partners for a given row in one
+partitioned table must be in the corresponding partition of the other
+partitioned table. Because of this, the join between partitioned tables can be
+broken into joins between the matching partitions. The resultant join is
+partitioned in the same way as the joining relations, thus allowing an N-way
+join between similarly partitioned tables having an equi-join condition between
+their partition keys to be broken down into N-way joins between their matching
+partitions. This technique of breaking down a join between partitioned tables
+into joins between their partitions is called partition-wise join. We will use
+the term "partitioned relation" for either a partitioned table or a join between
+compatibly partitioned tables.
+
+The partitioning properties of a partitioned relation are stored in its
+RelOptInfo. The information about the data types of the partition keys is
+stored in the PartitionSchemeData structure. The planner maintains a list of
+canonical partition schemes (distinct PartitionSchemeData objects) so that the
+RelOptInfos of any two partitioned relations with the same partitioning scheme
+point to the same PartitionSchemeData object. This reduces the memory consumed
+by PartitionSchemeData objects and makes it easy to compare the partition
+schemes of joining relations.
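To make the behavior described above concrete, here is a minimal illustrative sketch. It is not part of the patch, and the table names are invented for the example; the regression tests added in src/test/regress/sql/partition_join.sql and the postgres_fdw tests above exercise this far more thoroughly.

    SET enable_partition_wise_join = on;

    CREATE TABLE prt1 (a int, b int) PARTITION BY RANGE (a);
    CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250);
    CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500);

    CREATE TABLE prt2 (a int, c text) PARTITION BY RANGE (a);
    CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250);
    CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500);

    -- The partition bounds match exactly and the join condition covers the
    -- partition key, so the join can be planned as an Append over the two
    -- child joins (prt1_p1 with prt2_p1, prt1_p2 with prt2_p2) rather than
    -- as a single join of the whole tables.
    EXPLAIN (COSTS OFF)
    SELECT * FROM prt1 t1 JOIN prt2 t2 ON t1.a = t2.a;

With enable_partition_wise_join left at its default of off, the same query is planned as one join between the two whole-table scans.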
diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index b5848268..108b866c 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -264,6 +264,9 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) /* Keep searching if join order is not valid */ if (joinrel) { + /* Create paths for partition-wise joins. */ + generate_partition_wise_join_paths(root, joinrel); + /* Create GatherPaths for any useful partial paths for rel */ generate_gather_paths(root, joinrel); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 60f3dd20..0774ff46 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -940,12 +940,79 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel = find_base_rel(root, childRTindex); Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + if (rel->part_scheme) + { + AttrNumber attno; + + /* + * We need attr_needed data for building targetlist of a join + * relation representing join between matching partitions for + * partition-wise join. A given attribute of a child will be + * needed in the same highest joinrel where the corresponding + * attribute of parent is needed. Hence it suffices to use the + * same Relids set for parent and child. + */ + for (attno = rel->min_attr; attno <= rel->max_attr; attno++) + { + int index = attno - rel->min_attr; + Relids attr_needed = rel->attr_needed[index]; + + /* System attributes do not need translation. */ + if (attno <= 0) + { + Assert(rel->min_attr == childrel->min_attr); + childrel->attr_needed[index] = attr_needed; + } + else + { + Var *var = list_nth_node(Var, + appinfo->translated_vars, + attno - 1); + int child_index; + + child_index = var->varattno - childrel->min_attr; + childrel->attr_needed[child_index] = attr_needed; + } + } + } + + /* + * Copy/Modify targetlist. Even if this child is deemed empty, we need + * its targetlist in case it falls on nullable side in a child-join + * because of partition-wise join. + * + * NB: the resulting childrel->reltarget->exprs may contain arbitrary + * expressions, which otherwise would not occur in a rel's targetlist. + * Code that might be looking at an appendrel child must cope with + * such. (Normally, a rel's targetlist would only include Vars and + * PlaceHolderVars.) XXX we do not bother to update the cost or width + * fields of childrel->reltarget; not clear if that would be useful. + */ + childrel->reltarget->exprs = (List *) + adjust_appendrel_attrs(root, + (Node *) rel->reltarget->exprs, + 1, &appinfo); + /* - * We have to copy the parent's targetlist and quals to the child, - * with appropriate substitution of variables. However, only the - * baserestrictinfo quals are needed before we can check for - * constraint exclusion; so do that first and then check to see if we - * can disregard this child. + * We have to make child entries in the EquivalenceClass data + * structures as well. This is needed either if the parent + * participates in some eclass joins (because we will want to consider + * inner-indexscan joins on the individual children) or if the parent + * has useful pathkeys (because we should try to build MergeAppend + * paths that produce those sort orderings). Even if this child is + * deemed dummy, it may fall on nullable side in a child-join, which + * in turn may participate in a MergeAppend, where we will need the + * EquivalenceClass data structures. 
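+ * (add_child_rel_equivalences() below installs child copies of the parent's
+ * EquivalenceClass members, with the parent's Vars translated to the
+ * child's, so that pathkeys and mergejoin clauses can later be matched
+ * against this child rel.)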
+ */ + if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) + add_child_rel_equivalences(root, appinfo, rel, childrel); + childrel->has_eclass_joins = rel->has_eclass_joins; + + /* + * We have to copy the parent's quals to the child, with appropriate + * substitution of variables. However, only the baserestrictinfo + * quals are needed before we can check for constraint exclusion; so + * do that first and then check to see if we can disregard this child. * * The child rel's targetlist might contain non-Var expressions, which * means that substitution into the quals could produce opportunities @@ -1072,44 +1139,11 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, continue; } - /* - * CE failed, so finish copying/modifying targetlist and join quals. - * - * NB: the resulting childrel->reltarget->exprs may contain arbitrary - * expressions, which otherwise would not occur in a rel's targetlist. - * Code that might be looking at an appendrel child must cope with - * such. (Normally, a rel's targetlist would only include Vars and - * PlaceHolderVars.) XXX we do not bother to update the cost or width - * fields of childrel->reltarget; not clear if that would be useful. - */ + /* CE failed, so finish copying/modifying join quals. */ childrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) rel->joininfo, appinfo); - childrel->reltarget->exprs = (List *) - adjust_appendrel_attrs(root, - (Node *) rel->reltarget->exprs, - appinfo); - - /* - * We have to make child entries in the EquivalenceClass data - * structures as well. This is needed either if the parent - * participates in some eclass joins (because we will want to consider - * inner-indexscan joins on the individual children) or if the parent - * has useful pathkeys (because we should try to build MergeAppend - * paths that produce those sort orderings). - */ - if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) - add_child_rel_equivalences(root, appinfo, rel, childrel); - childrel->has_eclass_joins = rel->has_eclass_joins; - - /* - * Note: we could compute appropriate attr_needed data for the child's - * variables, by transforming the parent's attr_needed through the - * translated_vars mapping. However, currently there's no need - * because attr_needed is only examined for base relations not - * otherrels. So we just leave the child's attr_needed empty. - */ /* * If parallelism is allowable for this query in general, see whether @@ -1282,14 +1316,14 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, live_childrels = lappend(live_childrels, childrel); } - /* Add paths to the "append" relation. */ + /* Add paths to the append relation. */ add_paths_to_append_rel(root, rel, live_childrels); } /* * add_paths_to_append_rel - * Generate paths for given "append" relation given the set of non-dummy + * Generate paths for the given append relation given the set of non-dummy * child rels. * * The function collects all parameterizations and orderings supported by the @@ -1313,16 +1347,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte; bool build_partitioned_rels = false; + if (IS_SIMPLE_REL(rel)) + { /* * A root partition will already have a PartitionedChildRelInfo, and a - * non-root partitioned table doesn't need one, because its Append paths - * will get flattened into the parent anyway. For a subquery RTE, no - * PartitionedChildRelInfo exists; we collect all partitioned_rels - * associated with any child. 
(This assumes that we don't need to look - * through multiple levels of subquery RTEs; if we ever do, we could - * create a PartitionedChildRelInfo with the accumulated list of - * partitioned_rels which would then be found when populated our parent - * rel with paths. For the present, that appears to be unnecessary.) + * non-root partitioned table doesn't need one, because its Append + * paths will get flattened into the parent anyway. For a subquery + * RTE, no PartitionedChildRelInfo exists; we collect all + * partitioned_rels associated with any child. (This assumes that we + * don't need to look through multiple levels of subquery RTEs; if we + * ever do, we could create a PartitionedChildRelInfo with the + * accumulated list of partitioned_rels which would then be found when + * populated our parent rel with paths. For the present, that appears + * to be unnecessary.) */ rte = planner_rt_fetch(rel->relid, root); switch (rte->rtekind) @@ -1338,6 +1375,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, default: elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); } + } + else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme) + { + /* + * Associate PartitionedChildRelInfo of the root partitioned tables + * being joined with the root partitioned join (indicated by + * RELOPT_JOINREL). + */ + partitioned_rels = get_partitioned_child_rels_for_join(root, + rel->relids); + } /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -2555,16 +2603,22 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) join_search_one_level(root, lev); /* - * Run generate_gather_paths() for each just-processed joinrel. We - * could not do this earlier because both regular and partial paths - * can get added to a particular joinrel at multiple times within - * join_search_one_level. After that, we're done creating paths for - * the joinrel, so run set_cheapest(). + * Run generate_partition_wise_join_paths() and + * generate_gather_paths() for each just-processed joinrel. We could + * not do this earlier because both regular and partial paths can get + * added to a particular joinrel at multiple times within + * join_search_one_level. + * + * After that, we're done creating paths for the joinrel, so run + * set_cheapest(). */ foreach(lc, root->join_rel_level[lev]) { rel = (RelOptInfo *) lfirst(lc); + /* Create paths for partition-wise joins. */ + generate_partition_wise_join_paths(root, rel); + /* Create GatherPaths for any useful partial paths for rel */ generate_gather_paths(root, rel); @@ -3312,6 +3366,82 @@ compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages) return parallel_workers; } +/* + * generate_partition_wise_join_paths + * Create paths representing partition-wise join for given partitioned + * join relation. + * + * This must not be called until after we are done adding paths for all + * child-joins. Otherwise, add_path might delete a path to which some path + * generated here has a reference. + */ +void +generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) +{ + List *live_children = NIL; + int cnt_parts; + int num_parts; + RelOptInfo **part_rels; + + /* Handle only join relations here. */ + if (!IS_JOIN_REL(rel)) + return; + + /* + * If we've already proven this join is empty, we needn't consider any + * more paths for it. + */ + if (IS_DUMMY_REL(rel)) + return; + + /* + * Nothing to do if the relation is not partitioned. 
An outer join + * relation which had empty inner relation in every pair will have rest of + * the partitioning properties set except the child-join RelOptInfos. See + * try_partition_wise_join() for more explanation. + */ + if (rel->nparts <= 0 || rel->part_rels == NULL) + return; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + num_parts = rel->nparts; + part_rels = rel->part_rels; + + /* Collect non-dummy child-joins. */ + for (cnt_parts = 0; cnt_parts < num_parts; cnt_parts++) + { + RelOptInfo *child_rel = part_rels[cnt_parts]; + + /* Add partition-wise join paths for partitioned child-joins. */ + generate_partition_wise_join_paths(root, child_rel); + + /* Dummy children will not be scanned, so ingore those. */ + if (IS_DUMMY_REL(child_rel)) + continue; + + set_cheapest(child_rel); + +#ifdef OPTIMIZER_DEBUG + debug_print_rel(root, rel); +#endif + + live_children = lappend(live_children, child_rel); + } + + /* If all child-joins are dummy, parent join is also dummy. */ + if (!live_children) + { + mark_dummy_rel(rel); + return; + } + + /* Build additional paths for this rel from child-join paths. */ + add_paths_to_append_rel(root, rel, live_children); + list_free(live_children); +} + /***************************************************************************** * DEBUG SUPPORT diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9d82cec9..18ca6a7d 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -141,6 +141,7 @@ bool enable_mergejoin = true; bool enable_hashjoin = true; bool enable_fast_query_shipping = true; bool enable_gathermerge = true; +bool enable_partition_wise_join = false; bool enable_nestloop_suppression = false; typedef struct diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 72a766af..de4c1137 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -29,9 +29,19 @@ /* Hook for plugins to get control in add_paths_to_joinrel() */ set_join_pathlist_hook_type set_join_pathlist_hook = NULL; -#define PATH_PARAM_BY_REL(path, rel) \ +/* + * Paths parameterized by the parent can be considered to be parameterized by + * any of its child. + */ +#define PATH_PARAM_BY_PARENT(path, rel) \ + ((path)->param_info && bms_overlap(PATH_REQ_OUTER(path), \ + (rel)->top_parent_relids)) +#define PATH_PARAM_BY_REL_SELF(path, rel) \ ((path)->param_info && bms_overlap(PATH_REQ_OUTER(path), (rel)->relids)) +#define PATH_PARAM_BY_REL(path, rel) \ + (PATH_PARAM_BY_REL_SELF(path, rel) || PATH_PARAM_BY_PARENT(path, rel)) + static void try_partial_mergejoin_path(PlannerInfo *root, RelOptInfo *joinrel, Path *outer_path, @@ -118,6 +128,19 @@ add_paths_to_joinrel(PlannerInfo *root, JoinPathExtraData extra; bool mergejoin_allowed = true; ListCell *lc; + Relids joinrelids; + + /* + * PlannerInfo doesn't contain the SpecialJoinInfos created for joins + * between child relations, even if there is a SpecialJoinInfo node for + * the join between the topmost parents. So, while calculating Relids set + * representing the restriction, consider relids of topmost parent of + * partitions. + */ + if (joinrel->reloptkind == RELOPT_OTHER_JOINREL) + joinrelids = joinrel->top_parent_relids; + else + joinrelids = joinrel->relids; extra.restrictlist = restrictlist; extra.mergeclause_list = NIL; @@ -224,16 +247,16 @@ add_paths_to_joinrel(PlannerInfo *root, * join has already been proven legal.) 
If the SJ is relevant, it * presents constraints for joining to anything not in its RHS. */ - if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) + if (bms_overlap(joinrelids, sjinfo2->min_righthand) && + !bms_overlap(joinrelids, sjinfo2->min_lefthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_righthand)); /* full joins constrain both sides symmetrically */ if (sjinfo2->jointype == JOIN_FULL && - bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) + bms_overlap(joinrelids, sjinfo2->min_lefthand) && + !bms_overlap(joinrelids, sjinfo2->min_righthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_lefthand)); @@ -360,12 +383,26 @@ try_nestloop_path(PlannerInfo *root, JoinCostWorkspace workspace; RelOptInfo *innerrel = inner_path->parent; RelOptInfo *outerrel = outer_path->parent; - Relids innerrelids = innerrel->relids; - Relids outerrelids = outerrel->relids; + Relids innerrelids; + Relids outerrelids; Relids inner_paramrels = PATH_REQ_OUTER(inner_path); Relids outer_paramrels = PATH_REQ_OUTER(outer_path); /* + * Paths are parameterized by top-level parents, so run parameterization + * tests on the parent relids. + */ + if (innerrel->top_parent_relids) + innerrelids = innerrel->top_parent_relids; + else + innerrelids = innerrel->relids; + + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + /* * Check to see if proposed path is still parameterized, and reject if the * parameterization wouldn't be sensible --- unless allow_star_schema_join * says to allow it anyway. Also, we must reject if have_dangerous_phv @@ -400,6 +437,27 @@ try_nestloop_path(PlannerInfo *root, workspace.startup_cost, workspace.total_cost, pathkeys, required_outer)) { + /* + * If the inner path is parameterized, it is parameterized by the + * topmost parent of the outer rel, not the outer rel itself. Fix + * that. + */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop + * path. + */ + if (!inner_path) + { + bms_free(required_outer); + return; + } + } + add_path(joinrel, (Path *) create_nestloop_path(root, joinrel, @@ -445,8 +503,20 @@ try_partial_nestloop_path(PlannerInfo *root, if (inner_path->param_info != NULL) { Relids inner_paramrels = inner_path->param_info->ppi_req_outer; + RelOptInfo *outerrel = outer_path->parent; + Relids outerrelids; - if (!bms_is_subset(inner_paramrels, outer_path->parent->relids)) + /* + * The inner and outer paths are parameterized, if at all, by the top + * level parents, not the child relations, so we must use those relids + * for our paramaterization tests. + */ + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + if (!bms_is_subset(inner_paramrels, outerrelids)) return; } @@ -459,6 +529,22 @@ try_partial_nestloop_path(PlannerInfo *root, if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys)) return; + /* + * If the inner path is parameterized, it is parameterized by the topmost + * parent of the outer rel, not the outer rel itself. Fix that. 
+ */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop path. + */ + if (!inner_path) + return; + } + /* Might be good enough to be worth trying, so let's try it. */ add_partial_path(joinrel, (Path *) create_nestloop_path(root, diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index eb920d05..ad902dcf 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -14,10 +14,17 @@ */ #include "postgres.h" +#include "miscadmin.h" +#include "catalog/partition.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/prep.h" +#include "optimizer/cost.h" #include "utils/memutils.h" +#include "utils/lsyscache.h" static void make_rels_by_clause_joins(PlannerInfo *root, @@ -29,12 +36,17 @@ static void make_rels_by_clauseless_joins(PlannerInfo *root, static bool has_join_restriction(PlannerInfo *root, RelOptInfo *rel); static bool has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel); static bool is_dummy_rel(RelOptInfo *rel); -static void mark_dummy_rel(RelOptInfo *rel); static bool restriction_is_constant_false(List *restrictlist, bool only_pushed_down); static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo, List *restrictlist); +static void try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist); +static int match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel, + bool strict_op); /* @@ -913,6 +925,9 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); break; } + + /* Apply partition-wise join technique, if possible. */ + try_partition_wise_join(root, rel1, rel2, joinrel, sjinfo, restrictlist); } @@ -1218,7 +1233,7 @@ is_dummy_rel(RelOptInfo *rel) * is that the best solution is to explicitly make the dummy path in the same * context the given RelOptInfo is in. */ -static void +void mark_dummy_rel(RelOptInfo *rel) { MemoryContext oldcontext; @@ -1289,3 +1304,300 @@ restriction_is_constant_false(List *restrictlist, bool only_pushed_down) } return false; } + +/* + * Assess whether join between given two partitioned relations can be broken + * down into joins between matching partitions; a technique called + * "partition-wise join" + * + * Partition-wise join is possible when a. Joining relations have same + * partitioning scheme b. There exists an equi-join between the partition keys + * of the two relations. + * + * Partition-wise join is planned as follows (details: optimizer/README.) + * + * 1. Create the RelOptInfos for joins between matching partitions i.e + * child-joins and add paths to them. + * + * 2. Construct Append or MergeAppend paths across the set of child joins. + * This second phase is implemented by generate_partition_wise_join_paths(). + * + * The RelOptInfo, SpecialJoinInfo and restrictlist for each child join are + * obtained by translating the respective parent join structures. 
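+ * For example (illustrative names only), if t1 and t2 are both
+ * range-partitioned into [0, 250) and [250, 500) on the columns being
+ * equi-joined, the loop below builds the child joins t1_p1/t2_p1 and
+ * t1_p2/t2_p2; generate_partition_wise_join_paths() later combines their
+ * paths into an Append for the parent join.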
+ */ +static void +try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, + RelOptInfo *joinrel, SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist) +{ + int nparts; + int cnt_parts; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + /* Nothing to do, if the join relation is not partitioned. */ + if (!IS_PARTITIONED_REL(joinrel)) + return; + + /* + * set_rel_pathlist() may not create paths in children of an empty + * partitioned table and so we can not add paths to child-joins. So, deem + * such a join as unpartitioned. When a partitioned relation is deemed + * empty because all its children are empty, dummy path will be set in + * each of the children. In such a case we could still consider the join + * as partitioned, but it might not help much. + */ + if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2)) + return; + + /* + * Since this join relation is partitioned, all the base relations + * participating in this join must be partitioned and so are all the + * intermediate join relations. + */ + Assert(IS_PARTITIONED_REL(rel1) && IS_PARTITIONED_REL(rel2)); + Assert(REL_HAS_ALL_PART_PROPS(rel1) && REL_HAS_ALL_PART_PROPS(rel2)); + + /* + * The partition scheme of the join relation should match that of the + * joining relations. + */ + Assert(joinrel->part_scheme == rel1->part_scheme && + joinrel->part_scheme == rel2->part_scheme); + + /* + * Since we allow partition-wise join only when the partition bounds of + * the joining relations exactly match, the partition bounds of the join + * should match those of the joining relations. + */ + Assert(partition_bounds_equal(joinrel->part_scheme->partnatts, + joinrel->part_scheme->parttyplen, + joinrel->part_scheme->parttypbyval, + joinrel->boundinfo, rel1->boundinfo)); + Assert(partition_bounds_equal(joinrel->part_scheme->partnatts, + joinrel->part_scheme->parttyplen, + joinrel->part_scheme->parttypbyval, + joinrel->boundinfo, rel2->boundinfo)); + + nparts = joinrel->nparts; + + /* Allocate space to hold child-joins RelOptInfos, if not already done. */ + if (!joinrel->part_rels) + joinrel->part_rels = + (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); + + /* + * Create child-join relations for this partitioned join, if those don't + * exist. Add paths to child-joins for a pair of child relations + * corresponding to the given pair of parent relations. + */ + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts]; + RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts]; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + RelOptInfo *child_joinrel; + Relids child_joinrelids; + AppendRelInfo **appinfos; + int nappinfos; + + /* We should never try to join two overlapping sets of rels. */ + Assert(!bms_overlap(child_rel1->relids, child_rel2->relids)); + child_joinrelids = bms_union(child_rel1->relids, child_rel2->relids); + appinfos = find_appinfos_by_relids(root, child_joinrelids, &nappinfos); + + /* + * Construct SpecialJoinInfo from parent join relations's + * SpecialJoinInfo. + */ + child_sjinfo = build_child_join_sjinfo(root, parent_sjinfo, + child_rel1->relids, + child_rel2->relids); + + /* + * Construct restrictions applicable to the child join from those + * applicable to the parent join. 
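+ * (adjust_appendrel_attrs() substitutes the child relids and Vars for the
+ * parent ones in each clause, using the AppendRelInfos gathered above.)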
+ */ + child_restrictlist = + (List *) adjust_appendrel_attrs(root, + (Node *) parent_restrictlist, + nappinfos, appinfos); + pfree(appinfos); + + child_joinrel = joinrel->part_rels[cnt_parts]; + if (!child_joinrel) + { + child_joinrel = build_child_join_rel(root, child_rel1, child_rel2, + joinrel, child_restrictlist, + child_sjinfo, + child_sjinfo->jointype); + joinrel->part_rels[cnt_parts] = child_joinrel; + } + + Assert(bms_equal(child_joinrel->relids, child_joinrelids)); + + populate_joinrel_with_paths(root, child_rel1, child_rel2, + child_joinrel, child_sjinfo, + child_restrictlist); + } +} + +/* + * Returns true if there exists an equi-join condition for each pair of + * partition keys from given relations being joined. + */ +bool +have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, + List *restrictlist) +{ + PartitionScheme part_scheme = rel1->part_scheme; + ListCell *lc; + int cnt_pks; + bool pk_has_clause[PARTITION_MAX_KEYS]; + bool strict_op; + + /* + * This function should be called when the joining relations have same + * partitioning scheme. + */ + Assert(rel1->part_scheme == rel2->part_scheme); + Assert(part_scheme); + + memset(pk_has_clause, 0, sizeof(pk_has_clause)); + foreach(lc, restrictlist) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); + OpExpr *opexpr; + Expr *expr1; + Expr *expr2; + int ipk1; + int ipk2; + + /* If processing an outer join, only use its own join clauses. */ + if (IS_OUTER_JOIN(jointype) && rinfo->is_pushed_down) + continue; + + /* Skip clauses which can not be used for a join. */ + if (!rinfo->can_join) + continue; + + /* Skip clauses which are not equality conditions. */ + if (!rinfo->mergeopfamilies) + continue; + + opexpr = (OpExpr *) rinfo->clause; + Assert(is_opclause(opexpr)); + + /* + * The equi-join between partition keys is strict if equi-join between + * at least one partition key is using a strict operator. See + * explanation about outer join reordering identity 3 in + * optimizer/README + */ + strict_op = op_strict(opexpr->opno); + + /* Match the operands to the relation. */ + if (bms_is_subset(rinfo->left_relids, rel1->relids) && + bms_is_subset(rinfo->right_relids, rel2->relids)) + { + expr1 = linitial(opexpr->args); + expr2 = lsecond(opexpr->args); + } + else if (bms_is_subset(rinfo->left_relids, rel2->relids) && + bms_is_subset(rinfo->right_relids, rel1->relids)) + { + expr1 = lsecond(opexpr->args); + expr2 = linitial(opexpr->args); + } + else + continue; + + /* + * Only clauses referencing the partition keys are useful for + * partition-wise join. + */ + ipk1 = match_expr_to_partition_keys(expr1, rel1, strict_op); + if (ipk1 < 0) + continue; + ipk2 = match_expr_to_partition_keys(expr2, rel2, strict_op); + if (ipk2 < 0) + continue; + + /* + * If the clause refers to keys at different ordinal positions, it can + * not be used for partition-wise join. + */ + if (ipk1 != ipk2) + continue; + + /* + * The clause allows partition-wise join if only it uses the same + * operator family as that specified by the partition key. + */ + if (!list_member_oid(rinfo->mergeopfamilies, + part_scheme->partopfamily[ipk1])) + continue; + + /* Mark the partition key as having an equi-join clause. */ + pk_has_clause[ipk1] = true; + } + + /* Check whether every partition key has an equi-join condition. 
*/ + for (cnt_pks = 0; cnt_pks < part_scheme->partnatts; cnt_pks++) + { + if (!pk_has_clause[cnt_pks]) + return false; + } + + return true; +} + +/* + * Find the partition key from the given relation matching the given + * expression. If found, return the index of the partition key, else return -1. + */ +static int +match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel, bool strict_op) +{ + int cnt; + + /* This function should be called only for partitioned relations. */ + Assert(rel->part_scheme); + + /* Remove any relabel decorations. */ + while (IsA(expr, RelabelType)) + expr = (Expr *) (castNode(RelabelType, expr))->arg; + + for (cnt = 0; cnt < rel->part_scheme->partnatts; cnt++) + { + ListCell *lc; + + Assert(rel->partexprs); + foreach(lc, rel->partexprs[cnt]) + { + if (equal(lfirst(lc), expr)) + return cnt; + } + + if (!strict_op) + continue; + + /* + * If it's a strict equi-join a NULL partition key on one side will + * not join a NULL partition key on the other side. So, rows with NULL + * partition key from a partition on one side can not join with those + * from a non-matching partition on the other side. So, search the + * nullable partition keys as well. + */ + Assert(rel->nullable_partexprs); + foreach(lc, rel->nullable_partexprs[cnt]) + { + if (equal(lfirst(lc), expr)) + return cnt; + } + } + + return -1; +} diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 48589d94..45880a2f 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -301,7 +301,8 @@ static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec, TargetEntry *tle, Relids relids); -static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys); +static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, + Relids relids); static Sort *make_sort_from_groupcols(List *groupcls, AttrNumber *grpColIdx, Plan *lefttree); @@ -2193,7 +2194,7 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags) subplan = create_plan_recurse(root, best_path->subpath, flags | CP_SMALL_TLIST); - plan = make_sort_from_pathkeys(subplan, best_path->path.pathkeys); + plan = make_sort_from_pathkeys(subplan, best_path->path.pathkeys, NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -4849,6 +4850,8 @@ create_mergejoin_plan(PlannerInfo *root, ListCell *lc; ListCell *lop; ListCell *lip; + Path *outer_path = best_path->jpath.outerjoinpath; + Path *inner_path = best_path->jpath.innerjoinpath; #ifdef __TBASE__ bool reset = false; @@ -4920,8 +4923,10 @@ create_mergejoin_plan(PlannerInfo *root, */ if (best_path->outersortkeys) { + Relids outer_relids = outer_path->parent->relids; Sort *sort = make_sort_from_pathkeys(outer_plan, - best_path->outersortkeys); + best_path->outersortkeys, + outer_relids); label_sort_with_costsize(root, sort, -1.0); outer_plan = (Plan *) sort; @@ -4932,8 +4937,10 @@ create_mergejoin_plan(PlannerInfo *root, if (best_path->innersortkeys) { + Relids inner_relids = inner_path->parent->relids; Sort *sort = make_sort_from_pathkeys(inner_plan, - best_path->innersortkeys); + best_path->innersortkeys, + inner_relids); label_sort_with_costsize(root, sort, -1.0); inner_plan = (Plan *) sort; @@ -7478,8 +7485,9 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, bool nulls_first, * the output parameters *p_numsortkeys etc. 
* * When looking for matches to an EquivalenceClass's members, we will only - * consider child EC members if they match 'relids'. This protects against - * possible incorrect matches to child expressions that contain no Vars. + * consider child EC members if they belong to given 'relids'. This protects + * against possible incorrect matches to child expressions that contain no + * Vars. * * If reqColIdx isn't NULL then it contains sort key column numbers that * we should match. This is used when making child plans for a MergeAppend; @@ -7634,11 +7642,11 @@ prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, continue; /* - * Ignore child members unless they match the rel being + * Ignore child members unless they belong to the rel being * sorted. */ if (em->em_is_child && - !bms_equal(em->em_relids, relids)) + !bms_is_subset(em->em_relids, relids)) continue; sortexpr = em->em_expr; @@ -7730,7 +7738,7 @@ prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, * find_ec_member_for_tle * Locate an EquivalenceClass member matching the given TLE, if any * - * Child EC members are ignored unless they match 'relids'. + * Child EC members are ignored unless they belong to given 'relids'. */ static EquivalenceMember * find_ec_member_for_tle(EquivalenceClass *ec, @@ -7758,10 +7766,10 @@ find_ec_member_for_tle(EquivalenceClass *ec, continue; /* - * Ignore child members unless they match the rel being sorted. + * Ignore child members unless they belong to the rel being sorted. */ if (em->em_is_child && - !bms_equal(em->em_relids, relids)) + !bms_is_subset(em->em_relids, relids)) continue; /* Match if same expression (after stripping relabel) */ @@ -7782,9 +7790,10 @@ find_ec_member_for_tle(EquivalenceClass *ec, * * 'lefttree' is the node which yields input tuples * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() */ static Sort * -make_sort_from_pathkeys(Plan *lefttree, List *pathkeys) +make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids) { int numsortkeys; AttrNumber *sortColIdx; @@ -7794,7 +7803,7 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys) /* Compute sort column info, and adjust lefttree as needed */ lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys, - NULL, + relids, NULL, false, &numsortkeys, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 11454dfb..498b1d5e 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7866,6 +7866,29 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) } +/* + * get_partitioned_child_rels_for_join + * Build and return a list containing the RTI of every partitioned + * relation which is a child of some rel included in the join. 
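+ *
+ * For illustration: if a hypothetical join covers two partitioned
+ * parents with RT indexes 1 and 3, whose PartitionedChildRelInfo
+ * entries list partitioned-table RTIs (1, 5, 6) and (3, 8), the result
+ * is the concatenation (1, 5, 6, 3, 8).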
+ */ +List * +get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) +{ + List *result = NIL; + ListCell *l; + + foreach(l, root->pcinfo_list) + { + PartitionedChildRelInfo *pc = lfirst(l); + + if (bms_is_member(pc->parent_relid, join_relids)) + result = list_concat(result, list_copy(pc->child_rels)); + } + + return result; +} + + static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) { diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index e5470fa8..805585b7 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -45,6 +45,9 @@ typedef struct int num_vars; /* number of plain Var tlist entries */ bool has_ph_vars; /* are there PlaceHolderVar entries? */ bool has_non_vars; /* are there other entries? */ + bool has_conv_whole_rows; /* are there ConvertRowtypeExpr + * entries encapsulating a whole-row + * Var? */ tlist_vinfo vars[FLEXIBLE_ARRAY_MEMBER]; /* has num_vars entries */ } indexed_tlist; @@ -151,6 +154,7 @@ static List *set_returning_clause_references(PlannerInfo *root, int rtoffset); static bool extract_query_dependencies_walker(Node *node, PlannerInfo *context); +static bool is_converted_whole_row_reference(Node *node); #ifdef XCP static void set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset); @@ -2010,6 +2014,7 @@ build_tlist_index(List *tlist) itlist->tlist = tlist; itlist->has_ph_vars = false; itlist->has_non_vars = false; + itlist->has_conv_whole_rows = false; /* Find the Vars and fill in the index array */ vinfo = itlist->vars; @@ -2028,6 +2033,8 @@ build_tlist_index(List *tlist) } else if (tle->expr && IsA(tle->expr, PlaceHolderVar)) itlist->has_ph_vars = true; + else if (is_converted_whole_row_reference((Node *) tle->expr)) + itlist->has_conv_whole_rows = true; else itlist->has_non_vars = true; } @@ -2043,7 +2050,10 @@ build_tlist_index(List *tlist) * This is like build_tlist_index, but we only index tlist entries that * are Vars belonging to some rel other than the one specified. We will set * has_ph_vars (allowing PlaceHolderVars to be matched), but not has_non_vars - * (so nothing other than Vars and PlaceHolderVars can be matched). + * (so nothing other than Vars and PlaceHolderVars can be matched). In case of + * DML, where this function will be used, returning lists from child relations + * will be appended similar to a simple append relation. That does not require + * fixing ConvertRowtypeExpr references. So, those are not considered here. 
*/ static indexed_tlist * build_tlist_index_other_vars(List *tlist, Index ignore_rel) @@ -2060,6 +2070,7 @@ build_tlist_index_other_vars(List *tlist, Index ignore_rel) itlist->tlist = tlist; itlist->has_ph_vars = false; itlist->has_non_vars = false; + itlist->has_conv_whole_rows = false; /* Find the desired Vars and fill in the index array */ vinfo = itlist->vars; @@ -2263,6 +2274,7 @@ static Node * fix_join_expr_mutator(Node *node, fix_join_expr_context *context) {// #lizard forgives Var *newvar; + bool converted_whole_row; if (node == NULL) return NULL; @@ -2332,8 +2344,12 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context) } if (IsA(node, Param)) return fix_param_node(context->root, (Param *) node); + /* Try matching more complex expressions too, if tlists have any */ - if (context->outer_itlist && context->outer_itlist->has_non_vars) + converted_whole_row = is_converted_whole_row_reference(node); + if (context->outer_itlist && + (context->outer_itlist->has_non_vars || + (context->outer_itlist->has_conv_whole_rows && converted_whole_row))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->outer_itlist, @@ -2341,7 +2357,9 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context) if (newvar) return (Node *) newvar; } - if (context->inner_itlist && context->inner_itlist->has_non_vars) + if (context->inner_itlist && + (context->inner_itlist->has_non_vars || + (context->inner_itlist->has_conv_whole_rows && converted_whole_row))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->inner_itlist, @@ -2461,7 +2479,9 @@ fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context) /* If no match, just fall through to process it normally */ } /* Try matching more complex expressions too, if tlist has any */ - if (context->subplan_itlist->has_non_vars) + if (context->subplan_itlist->has_non_vars || + (context->subplan_itlist->has_conv_whole_rows && + is_converted_whole_row_reference(node))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->subplan_itlist, @@ -2669,6 +2689,37 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context) (void *) context); } + +/* + * is_converted_whole_row_reference + * If the given node is a ConvertRowtypeExpr encapsulating a whole-row + * reference as implicit cast, return true. Otherwise return false. + */ +static bool +is_converted_whole_row_reference(Node *node) +{ + ConvertRowtypeExpr *convexpr; + + if (!node || !IsA(node, ConvertRowtypeExpr)) + return false; + + /* Traverse nested ConvertRowtypeExpr's. */ + convexpr = castNode(ConvertRowtypeExpr, node); + while (convexpr->convertformat == COERCE_IMPLICIT_CAST && + IsA(convexpr->arg, ConvertRowtypeExpr)) + convexpr = castNode(ConvertRowtypeExpr, convexpr->arg); + + if (IsA(convexpr->arg, Var)) + { + Var *var = castNode(Var, convexpr->arg); + + if (var->varattno == 0) + return true; + } + + return false; +} + #ifdef XCP /* * set_remotesubplan_references diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 6057868c..1fe5a341 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -2281,6 +2281,59 @@ adjust_relid_set(Relids relids, Index oldrelid, Index newrelid) return relids; } +/* + * Replace any relid present in top_parent_relids with its child in + * child_relids. Members of child_relids can be multiple levels below top + * parent in the partition hierarchy. 
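+ *
+ * For illustration, take a hypothetical hierarchy in which relid 2 is
+ * the top parent, 5 its child and 9 a grandchild, with child_relids =
+ * {9}: the relids set {2, 7} is first translated to {5, 7} by the
+ * recursive call for the intermediate parent and then to {9, 7} below.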
+ */ +Relids +adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, + Relids child_relids, Relids top_parent_relids) +{ + AppendRelInfo **appinfos; + int nappinfos; + Relids parent_relids = NULL; + Relids result; + Relids tmp_result = NULL; + int cnt; + + /* + * If the given relids set doesn't contain any of the top parent relids, + * it will remain unchanged. + */ + if (!bms_overlap(relids, top_parent_relids)) + return relids; + + appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); + + /* Construct relids set for the immediate parent of the given child. */ + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); + } + + /* Recurse if immediate parent is not the top parent. */ + if (!bms_equal(parent_relids, top_parent_relids)) + { + tmp_result = adjust_child_relids_multilevel(root, relids, + parent_relids, + top_parent_relids); + relids = tmp_result; + } + + result = adjust_child_relids(relids, nappinfos, appinfos); + + /* Free memory consumed by any intermediate result. */ + if (tmp_result) + bms_free(tmp_result); + bms_free(parent_relids); + pfree(appinfos); + + return result; +} + /* * Adjust the targetlist entries of an inherited UPDATE operation * @@ -2400,3 +2453,46 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, /* Now translate for this child */ return adjust_appendrel_attrs(root, node, appinfo); } + +/* + * Construct the SpecialJoinInfo for a child-join by translating + * SpecialJoinInfo for the join between parents. left_relids and right_relids + * are the relids of left and right side of the join respectively. + */ +SpecialJoinInfo * +build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + AppendRelInfo **left_appinfos; + int left_nappinfos; + AppendRelInfo **right_appinfos; + int right_nappinfos; + + memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); + left_appinfos = find_appinfos_by_relids(root, left_relids, + &left_nappinfos); + right_appinfos = find_appinfos_by_relids(root, right_relids, + &right_nappinfos); + + sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, + left_nappinfos, left_appinfos); + sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, + right_nappinfos, + right_appinfos); + sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, + left_nappinfos, left_appinfos); + sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, + right_nappinfos, + right_appinfos); + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + (Node *) sjinfo->semi_rhs_exprs, + right_nappinfos, + right_appinfos); + + pfree(left_appinfos); + pfree(right_appinfos); + + return sjinfo; +} + diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 49e6658f..c2d27db7 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -19,15 +19,20 @@ #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "nodes/extensible.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "optimizer/tlist.h" #include "optimizer/var.h" #include "parser/parsetree.h" +#include "foreign/fdwapi.h" #include "utils/lsyscache.h" +#include 
"utils/memutils.h" #include "utils/selfuncs.h" #ifdef XCP #include "access/heapam.h" @@ -82,6 +87,10 @@ typedef enum #define STD_FUZZ_FACTOR 1.01 static List *translate_sub_tlist(List *tlist, int relid); +static List *reparameterize_pathlist_by_child(PlannerInfo *root, + List *pathlist, + RelOptInfo *child_rel); + #ifdef XCP static void restrict_distribution(PlannerInfo *root, RestrictInfo *ri, Path *pathnode); @@ -6924,6 +6933,361 @@ reparameterize_path(PlannerInfo *root, Path *path, return NULL; } +/* + * reparameterize_path_by_child + * Given a path parameterized by the parent of the given child relation, + * translate the path to be parameterized by the given child relation. + * + * The function creates a new path of the same type as the given path, but + * parameterized by the given child relation. Most fields from the original + * path can simply be flat-copied, but any expressions must be adjusted to + * refer to the correct varnos, and any paths must be recursively + * reparameterized. Other fields that refer to specific relids also need + * adjustment. + * + * The cost, number of rows, width and parallel path properties depend upon + * path->parent, which does not change during the translation. Hence those + * members are copied as they are. + * + * If the given path can not be reparameterized, the function returns NULL. + */ +Path * +reparameterize_path_by_child(PlannerInfo *root, Path *path, + RelOptInfo *child_rel) +{ + +#define FLAT_COPY_PATH(newnode, node, nodetype) \ + ( (newnode) = makeNode(nodetype), \ + memcpy((newnode), (node), sizeof(nodetype)) ) + +#define ADJUST_CHILD_ATTRS(node) \ + ((node) = \ + (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ + child_rel->relids, \ + child_rel->top_parent_relids)) + +#define REPARAMETERIZE_CHILD_PATH(path) \ +do { \ + (path) = reparameterize_path_by_child(root, (path), child_rel); \ + if ((path) == NULL) \ + return NULL; \ +} while(0); + +#define REPARAMETERIZE_CHILD_PATH_LIST(pathlist) \ +do { \ + if ((pathlist) != NIL) \ + { \ + (pathlist) = reparameterize_pathlist_by_child(root, (pathlist), \ + child_rel); \ + if ((pathlist) == NIL) \ + return NULL; \ + } \ +} while(0); + + Path *new_path; + ParamPathInfo *new_ppi; + ParamPathInfo *old_ppi; + Relids required_outer; + + /* + * If the path is not parameterized by parent of the given relation, it + * doesn't need reparameterization. + */ + if (!path->param_info || + !bms_overlap(PATH_REQ_OUTER(path), child_rel->top_parent_relids)) + return path; + + /* Reparameterize a copy of given path. 
*/ + switch (nodeTag(path)) + { + case T_Path: + FLAT_COPY_PATH(new_path, path, Path); + break; + + case T_IndexPath: + { + IndexPath *ipath; + + FLAT_COPY_PATH(ipath, path, IndexPath); + ADJUST_CHILD_ATTRS(ipath->indexclauses); + ADJUST_CHILD_ATTRS(ipath->indexquals); + new_path = (Path *) ipath; + } + break; + + case T_BitmapHeapPath: + { + BitmapHeapPath *bhpath; + + FLAT_COPY_PATH(bhpath, path, BitmapHeapPath); + REPARAMETERIZE_CHILD_PATH(bhpath->bitmapqual); + new_path = (Path *) bhpath; + } + break; + + case T_BitmapAndPath: + { + BitmapAndPath *bapath; + + FLAT_COPY_PATH(bapath, path, BitmapAndPath); + REPARAMETERIZE_CHILD_PATH_LIST(bapath->bitmapquals); + new_path = (Path *) bapath; + } + break; + + case T_BitmapOrPath: + { + BitmapOrPath *bopath; + + FLAT_COPY_PATH(bopath, path, BitmapOrPath); + REPARAMETERIZE_CHILD_PATH_LIST(bopath->bitmapquals); + new_path = (Path *) bopath; + } + break; + + case T_TidPath: + { + TidPath *tpath; + + /* + * TidPath contains tidquals, which do not contain any + * external parameters per create_tidscan_path(). So don't + * bother to translate those. + */ + FLAT_COPY_PATH(tpath, path, TidPath); + new_path = (Path *) tpath; + } + break; + + case T_ForeignPath: + { + ForeignPath *fpath; + ReparameterizeForeignPathByChild_function rfpc_func; + + FLAT_COPY_PATH(fpath, path, ForeignPath); + if (fpath->fdw_outerpath) + REPARAMETERIZE_CHILD_PATH(fpath->fdw_outerpath); + + /* Hand over to FDW if needed. */ + rfpc_func = + path->parent->fdwroutine->ReparameterizeForeignPathByChild; + if (rfpc_func) + fpath->fdw_private = rfpc_func(root, fpath->fdw_private, + child_rel); + new_path = (Path *) fpath; + } + break; + + case T_CustomPath: + { + CustomPath *cpath; + + FLAT_COPY_PATH(cpath, path, CustomPath); + REPARAMETERIZE_CHILD_PATH_LIST(cpath->custom_paths); + if (cpath->methods && + cpath->methods->ReparameterizeCustomPathByChild) + cpath->custom_private = + cpath->methods->ReparameterizeCustomPathByChild(root, + cpath->custom_private, + child_rel); + new_path = (Path *) cpath; + } + break; + + case T_NestPath: + { + JoinPath *jpath; + + FLAT_COPY_PATH(jpath, path, NestPath); + + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + new_path = (Path *) jpath; + } + break; + + case T_MergePath: + { + JoinPath *jpath; + MergePath *mpath; + + FLAT_COPY_PATH(mpath, path, MergePath); + + jpath = (JoinPath *) mpath; + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + ADJUST_CHILD_ATTRS(mpath->path_mergeclauses); + new_path = (Path *) mpath; + } + break; + + case T_HashPath: + { + JoinPath *jpath; + HashPath *hpath; + + FLAT_COPY_PATH(hpath, path, HashPath); + + jpath = (JoinPath *) hpath; + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + ADJUST_CHILD_ATTRS(hpath->path_hashclauses); + new_path = (Path *) hpath; + } + break; + + case T_AppendPath: + { + AppendPath *apath; + + FLAT_COPY_PATH(apath, path, AppendPath); + REPARAMETERIZE_CHILD_PATH_LIST(apath->subpaths); + new_path = (Path *) apath; + } + break; + + case T_MergeAppend: + { + MergeAppendPath *mapath; + + FLAT_COPY_PATH(mapath, path, MergeAppendPath); + REPARAMETERIZE_CHILD_PATH_LIST(mapath->subpaths); + new_path = (Path *) mapath; + } + break; + + case T_MaterialPath: + { + MaterialPath *mpath; + + 
FLAT_COPY_PATH(mpath, path, MaterialPath); + REPARAMETERIZE_CHILD_PATH(mpath->subpath); + new_path = (Path *) mpath; + } + break; + + case T_UniquePath: + { + UniquePath *upath; + + FLAT_COPY_PATH(upath, path, UniquePath); + REPARAMETERIZE_CHILD_PATH(upath->subpath); + ADJUST_CHILD_ATTRS(upath->uniq_exprs); + new_path = (Path *) upath; + } + break; + + case T_GatherPath: + { + GatherPath *gpath; + + FLAT_COPY_PATH(gpath, path, GatherPath); + REPARAMETERIZE_CHILD_PATH(gpath->subpath); + new_path = (Path *) gpath; + } + break; + + case T_GatherMergePath: + { + GatherMergePath *gmpath; + + FLAT_COPY_PATH(gmpath, path, GatherMergePath); + REPARAMETERIZE_CHILD_PATH(gmpath->subpath); + new_path = (Path *) gmpath; + } + break; + + default: + + /* We don't know how to reparameterize this path. */ + return NULL; + } + + /* + * Adjust the parameterization information, which refers to the topmost + * parent. The topmost parent can be multiple levels away from the given + * child, hence use multi-level expression adjustment routines. + */ + old_ppi = new_path->param_info; + required_outer = + adjust_child_relids_multilevel(root, old_ppi->ppi_req_outer, + child_rel->relids, + child_rel->top_parent_relids); + + /* If we already have a PPI for this parameterization, just return it */ + new_ppi = find_param_path_info(new_path->parent, required_outer); + + /* + * If not, build a new one and link it to the list of PPIs. For the same + * reason as explained in mark_dummy_rel(), allocate new PPI in the same + * context the given RelOptInfo is in. + */ + if (new_ppi == NULL) + { + MemoryContext oldcontext; + RelOptInfo *rel = path->parent; + + oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); + + new_ppi = makeNode(ParamPathInfo); + new_ppi->ppi_req_outer = bms_copy(required_outer); + new_ppi->ppi_rows = old_ppi->ppi_rows; + new_ppi->ppi_clauses = old_ppi->ppi_clauses; + ADJUST_CHILD_ATTRS(new_ppi->ppi_clauses); + rel->ppilist = lappend(rel->ppilist, new_ppi); + + MemoryContextSwitchTo(oldcontext); + } + bms_free(required_outer); + + new_path->param_info = new_ppi; + + /* + * Adjust the path target if the parent of the outer relation is + * referenced in the targetlist. This can happen when only the parent of + * outer relation is laterally referenced in this relation. + */ + if (bms_overlap(path->parent->lateral_relids, + child_rel->top_parent_relids)) + { + new_path->pathtarget = copy_pathtarget(new_path->pathtarget); + ADJUST_CHILD_ATTRS(new_path->pathtarget->exprs); + } + + return new_path; +} + +/* + * reparameterize_pathlist_by_child + * Helper function to reparameterize a list of paths by given child rel. 
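+ *
+ * Returns NIL, freeing any partially built list, if any path in the
+ * given list cannot be reparameterized.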
+ */ +static List * +reparameterize_pathlist_by_child(PlannerInfo *root, + List *pathlist, + RelOptInfo *child_rel) +{ + ListCell *lc; + List *result = NIL; + + foreach(lc, pathlist) + { + Path *path = reparameterize_path_by_child(root, lfirst(lc), + child_rel); + if (path == NULL) + { + list_free(result); + return NIL; + } + + result = lappend(result, path); + } + + return result; +} + #ifdef __TBASE__ /* * Count datanode number for given path, consider replication table as 1 diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 9b29be4c..0d5351a6 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -20,6 +20,7 @@ #include "optimizer/pathnode.h" #include "optimizer/placeholder.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/var.h" #include "utils/lsyscache.h" @@ -414,6 +415,10 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, Relids relids = joinrel->relids; ListCell *lc; + /* This function is called only on the parent relations. */ + Assert(!IS_OTHER_REL(joinrel) && !IS_OTHER_REL(outer_rel) && + !IS_OTHER_REL(inner_rel)); + foreach(lc, root->placeholder_list) { PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); @@ -459,3 +464,56 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, } } } + +/* + * add_placeholders_to_child_joinrel + * Translate the PHVs in parent's targetlist and add them to the child's + * targetlist. Also adjust the cost + */ +void +add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, + RelOptInfo *parentrel) +{ + ListCell *lc; + AppendRelInfo **appinfos; + int nappinfos; + + Assert(IS_JOIN_REL(childrel) && IS_JOIN_REL(parentrel)); + Assert(IS_OTHER_REL(childrel)); + + /* Nothing to do if no PHVs. */ + if (root->placeholder_list == NIL) + return; + + appinfos = find_appinfos_by_relids(root, childrel->relids, &nappinfos); + foreach(lc, parentrel->reltarget->exprs) + { + PlaceHolderVar *phv = lfirst(lc); + + if (IsA(phv, PlaceHolderVar)) + { + /* + * In case the placeholder Var refers to any of the parent + * relations, translate it to refer to the corresponding child. + */ + if (bms_overlap(phv->phrels, parentrel->relids) && + childrel->reloptkind == RELOPT_OTHER_JOINREL) + { + phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, + (Node *) phv, + nappinfos, + appinfos); + } + + childrel->reltarget->exprs = lappend(childrel->reltarget->exprs, + phv); + } + } + + /* Adjust the cost and width of child targetlist. 
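+ *
+ * Translating the PHVs only substitutes child Vars for parent Vars, so
+ * the parent's startup cost, per-tuple cost and width are assumed to
+ * carry over to the child unchanged.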
*/ + childrel->reltarget->cost.startup = parentrel->reltarget->cost.startup; + childrel->reltarget->cost.per_tuple = parentrel->reltarget->cost.per_tuple; + childrel->reltarget->width = parentrel->reltarget->width; + + pfree(appinfos); +} diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 23938198..55ea9c8c 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -82,7 +82,8 @@ static List *get_relation_statistics(RelOptInfo *rel, Relation relation); static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation); static PartitionScheme find_partition_scheme(PlannerInfo *root, Relation rel); -static List **build_baserel_partition_key_exprs(Relation relation, Index varno); +static void set_baserel_partition_key_exprs(Relation relation, + RelOptInfo *rel); #ifdef __TBASE__ static BlockNumber GetIntervalPartitionPages(Relation rel, bool isindex, bool statistic); @@ -1985,7 +1986,7 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Assert(partdesc != NULL && rel->part_scheme != NULL); rel->boundinfo = partdesc->boundinfo; rel->nparts = partdesc->nparts; - rel->partexprs = build_baserel_partition_key_exprs(relation, rel->relid); + set_baserel_partition_key_exprs(relation, rel); } /* @@ -2060,21 +2061,24 @@ find_partition_scheme(PlannerInfo *root, Relation relation) } /* - * build_baserel_partition_key_exprs + * set_baserel_partition_key_exprs * - * Collects partition key expressions for a given base relation. Any single - * column partition keys are converted to Var nodes. All Var nodes are set - * to the given varno. The partition key expressions are returned as an array - * of single element lists to be stored in RelOptInfo of the base relation. + * Builds partition key expressions for the given base relation and sets them + * in given RelOptInfo. Any single column partition keys are converted to Var + * nodes. All Var nodes are restamped with the relid of given relation. */ -static List ** -build_baserel_partition_key_exprs(Relation relation, Index varno) +static void +set_baserel_partition_key_exprs(Relation relation, + RelOptInfo *rel) { PartitionKey partkey = RelationGetPartitionKey(relation); int partnatts; int cnt; List **partexprs; ListCell *lc; + Index varno = rel->relid; + + Assert(IS_SIMPLE_REL(rel) && rel->relid > 0); /* A partitioned table should have a partition key. */ Assert(partkey != NULL); @@ -2112,7 +2116,15 @@ build_baserel_partition_key_exprs(Relation relation, Index varno) partexprs[cnt] = list_make1(partexpr); } - return partexprs; + rel->partexprs = partexprs; + + /* + * A base relation can not have nullable partition key expressions. We + * still allocate array of empty expressions lists to keep partition key + * expression handling code simple. See build_joinrel_partition_info() and + * match_expr_to_partition_keys(). 
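+ *
+ * For illustration, for a hypothetical table partitioned by
+ * RANGE (a, (b + 0)), partexprs[0] holds a Var for column a and
+ * partexprs[1] holds the expression b + 0, each as a single-element
+ * list, while every nullable_partexprs[] entry stays NIL.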
+ */ + rel->nullable_partexprs = (List **) palloc0(sizeof(List *) * partnatts); } #ifdef __TBASE__ diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 0ada588b..0896b4c2 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -17,12 +17,14 @@ #include #include "miscadmin.h" +#include "catalog/partition.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/plancat.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/tlist.h" #include "utils/hsearch.h" @@ -61,6 +63,9 @@ static List *subbuild_joinrel_joinlist(RelOptInfo *joinrel, static void set_foreign_rel_properties(RelOptInfo *joinrel, RelOptInfo *outer_rel, RelOptInfo *inner_rel); static void add_join_rel(PlannerInfo *root, RelOptInfo *joinrel); +static void build_joinrel_partition_info(RelOptInfo *joinrel, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + List *restrictlist, JoinType jointype); /* @@ -160,6 +165,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->boundinfo = NULL; rel->part_rels = NULL; rel->partexprs = NULL; + rel->nullable_partexprs = NULL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -536,6 +542,9 @@ build_join_rel(PlannerInfo *root, RelOptInfo *joinrel; List *restrictlist; + /* This function should be used only for join between parents. */ + Assert(!IS_OTHER_REL(outer_rel) && !IS_OTHER_REL(inner_rel)); + /* * See if we already have a joinrel for this set of base rels. */ @@ -615,6 +624,7 @@ build_join_rel(PlannerInfo *root, joinrel->boundinfo = NULL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; + joinrel->nullable_partexprs = NULL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif @@ -663,6 +673,10 @@ build_join_rel(PlannerInfo *root, */ joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); + /* Store the partition information. */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, restrictlist, + sjinfo->jointype); + /* * Set estimates of the joinrel's size. */ @@ -708,6 +722,138 @@ build_join_rel(PlannerInfo *root, return joinrel; } +/* + * build_child_join_rel + * Builds RelOptInfo representing join between given two child relations. + * + * 'outer_rel' and 'inner_rel' are the RelOptInfos of child relations being + * joined + * 'parent_joinrel' is the RelOptInfo representing the join between parent + * relations. Some of the members of new RelOptInfo are produced by + * translating corresponding members of this RelOptInfo + * 'sjinfo': child-join context info + * 'restrictlist': list of RestrictInfo nodes that apply to this particular + * pair of joinable relations + * 'join_appinfos': list of AppendRelInfo nodes for base child relations + * involved in this join + */ +RelOptInfo * +build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, RelOptInfo *parent_joinrel, + List *restrictlist, SpecialJoinInfo *sjinfo, + JoinType jointype) +{ + RelOptInfo *joinrel = makeNode(RelOptInfo); + AppendRelInfo **appinfos; + int nappinfos; + + /* Only joins between "other" relations land here. 
*/ + Assert(IS_OTHER_REL(outer_rel) && IS_OTHER_REL(inner_rel)); + + joinrel->reloptkind = RELOPT_OTHER_JOINREL; + joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids); + joinrel->rows = 0; + /* cheap startup cost is interesting iff not all tuples to be retrieved */ + joinrel->consider_startup = (root->tuple_fraction > 0); + joinrel->consider_param_startup = false; + joinrel->consider_parallel = false; + joinrel->reltarget = create_empty_pathtarget(); + joinrel->pathlist = NIL; + joinrel->ppilist = NIL; + joinrel->partial_pathlist = NIL; + joinrel->cheapest_startup_path = NULL; + joinrel->cheapest_total_path = NULL; + joinrel->cheapest_unique_path = NULL; + joinrel->cheapest_parameterized_paths = NIL; + joinrel->direct_lateral_relids = NULL; + joinrel->lateral_relids = NULL; + joinrel->relid = 0; /* indicates not a baserel */ + joinrel->rtekind = RTE_JOIN; + joinrel->min_attr = 0; + joinrel->max_attr = 0; + joinrel->attr_needed = NULL; + joinrel->attr_widths = NULL; + joinrel->lateral_vars = NIL; + joinrel->lateral_referencers = NULL; + joinrel->indexlist = NIL; + joinrel->pages = 0; + joinrel->tuples = 0; + joinrel->allvisfrac = 0; + joinrel->subroot = NULL; + joinrel->subplan_params = NIL; + joinrel->serverid = InvalidOid; + joinrel->userid = InvalidOid; + joinrel->useridiscurrent = false; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; + joinrel->baserestrictinfo = NIL; + joinrel->baserestrictcost.startup = 0; + joinrel->baserestrictcost.per_tuple = 0; + joinrel->joininfo = NIL; + joinrel->has_eclass_joins = false; + joinrel->top_parent_relids = NULL; + joinrel->part_scheme = NULL; + joinrel->part_rels = NULL; + joinrel->partexprs = NULL; + joinrel->nullable_partexprs = NULL; + + joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, + inner_rel->top_parent_relids); + + /* Compute information relevant to foreign relations. */ + set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + + /* Build targetlist */ + build_joinrel_tlist(root, joinrel, outer_rel); + build_joinrel_tlist(root, joinrel, inner_rel); + /* Add placeholder variables. */ + add_placeholders_to_child_joinrel(root, joinrel, parent_joinrel); + + /* Construct joininfo list. */ + appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); + joinrel->joininfo = (List *) adjust_appendrel_attrs(root, + (Node *) parent_joinrel->joininfo, + nappinfos, + appinfos); + pfree(appinfos); + + /* + * Lateral relids referred in child join will be same as that referred in + * the parent relation. Throw any partial result computed while building + * the targetlist. + */ + bms_free(joinrel->direct_lateral_relids); + bms_free(joinrel->lateral_relids); + joinrel->direct_lateral_relids = (Relids) bms_copy(parent_joinrel->direct_lateral_relids); + joinrel->lateral_relids = (Relids) bms_copy(parent_joinrel->lateral_relids); + + /* + * If the parent joinrel has pending equivalence classes, so does the + * child. + */ + joinrel->has_eclass_joins = parent_joinrel->has_eclass_joins; + + /* Is the join between partitions itself partitioned? */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, restrictlist, + jointype); + + /* Child joinrel is parallel safe if parent is parallel safe. */ + joinrel->consider_parallel = parent_joinrel->consider_parallel; + + + /* Set estimates of the child-joinrel's size. */ + set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, + sjinfo, restrictlist); + + /* We build the join only once. 
*/ + Assert(!find_join_rel(root, joinrel->relids)); + + /* Add the relation to the PlannerInfo. */ + add_join_rel(root, joinrel); + + return joinrel; +} + /* * min_join_parameterization * @@ -763,9 +909,15 @@ static void build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *input_rel) { - Relids relids = joinrel->relids; + Relids relids; ListCell *vars; + /* attrs_needed refers to parent relids and not those of a child. */ + if (joinrel->top_parent_relids) + relids = joinrel->top_parent_relids; + else + relids = joinrel->relids; + foreach(vars, input_rel->reltarget->exprs) { Var *var = (Var *) lfirst(vars); @@ -780,24 +932,55 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, continue; /* - * Otherwise, anything in a baserel or joinrel targetlist ought to be - * a Var. (More general cases can only appear in appendrel child - * rels, which will never be seen here.) + * Otherwise, anything in a baserel or joinrel targetlist ought to be a + * Var. Children of a partitioned table may have ConvertRowtypeExpr + * translating whole-row Var of a child to that of the parent. Children + * of an inherited table or subquery child rels can not directly + * participate in a join, so other kinds of nodes here. */ - if (!IsA(var, Var)) + if (IsA(var, Var)) + { + baserel = find_base_rel(root, var->varno); + ndx = var->varattno - baserel->min_attr; + } + else if (IsA(var, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *child_expr = (ConvertRowtypeExpr *) var; + Var *childvar = (Var *) child_expr->arg; + + /* + * Child's whole-row references are converted to look like those + * of parent using ConvertRowtypeExpr. There can be as many + * ConvertRowtypeExpr decorations as the depth of partition tree. + * The argument to the deepest ConvertRowtypeExpr is expected to + * be a whole-row reference of the child. + */ + while (IsA(childvar, ConvertRowtypeExpr)) + { + child_expr = (ConvertRowtypeExpr *) childvar; + childvar = (Var *) child_expr->arg; + } + Assert(IsA(childvar, Var) && childvar->varattno == 0); + + baserel = find_base_rel(root, childvar->varno); + ndx = 0 - baserel->min_attr; + } + else elog(ERROR, "unexpected node type in rel targetlist: %d", (int) nodeTag(var)); - /* Get the Var's original base rel */ - baserel = find_base_rel(root, var->varno); - /* Is it still needed above this joinrel? */ - ndx = var->varattno - baserel->min_attr; + /* Is the target expression still needed above this joinrel? */ if (bms_nonempty_difference(baserel->attr_needed[ndx], relids)) { /* Yup, add it to the output */ joinrel->reltarget->exprs = lappend(joinrel->reltarget->exprs, var); - /* Vars have cost zero, so no need to adjust reltarget->cost */ + + /* + * Vars have cost zero, so no need to adjust reltarget->cost. Even + * if it's a ConvertRowtypeExpr, it will be computed only for the + * base relation, costing nothing for a join. + */ joinrel->reltarget->width += baserel->attr_widths[ndx]; } } @@ -900,6 +1083,9 @@ subbuild_joinrel_restrictlist(RelOptInfo *joinrel, { ListCell *l; + /* Expected to be called only for join between parent relations. */ + Assert(joinrel->reloptkind == RELOPT_JOINREL); + foreach(l, joininfo_list) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); @@ -1457,3 +1643,165 @@ find_param_path_info(RelOptInfo *rel, Relids required_outer) return NULL; } + +/* + * build_joinrel_partition_info + * If the two relations have same partitioning scheme, their join may be + * partitioned and will follow the same partitioning scheme as the joining + * relations. 
Set the partition scheme and partition key expressions in + * the join relation. + */ +static void +build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, List *restrictlist, + JoinType jointype) +{ + int partnatts; + int cnt; + PartitionScheme part_scheme; + + /* Nothing to do if partition-wise join technique is disabled. */ + if (!enable_partition_wise_join) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + /* + * We can only consider this join as an input to further partition-wise + * joins if (a) the input relations are partitioned, (b) the partition + * schemes match, and (c) we can identify an equi-join between the + * partition keys. Note that if it were possible for + * have_partkey_equi_join to return different answers for the same joinrel + * depending on which join ordering we try first, this logic would break. + * That shouldn't happen, though, because of the way the query planner + * deduces implied equalities and reorders the joins. Please see + * optimizer/README for details. + */ + if (!IS_PARTITIONED_REL(outer_rel) || !IS_PARTITIONED_REL(inner_rel) || + outer_rel->part_scheme != inner_rel->part_scheme || + !have_partkey_equi_join(outer_rel, inner_rel, jointype, restrictlist)) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + part_scheme = outer_rel->part_scheme; + + Assert(REL_HAS_ALL_PART_PROPS(outer_rel) && + REL_HAS_ALL_PART_PROPS(inner_rel)); + + /* + * For now, our partition matching algorithm can match partitions only + * when the partition bounds of the joining relations are exactly same. + * So, bail out otherwise. + */ + if (outer_rel->nparts != inner_rel->nparts || + !partition_bounds_equal(part_scheme->partnatts, + part_scheme->parttyplen, + part_scheme->parttypbyval, + outer_rel->boundinfo, inner_rel->boundinfo)) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + /* + * This function will be called only once for each joinrel, hence it + * should not have partition scheme, partition bounds, partition key + * expressions and array for storing child relations set. + */ + Assert(!joinrel->part_scheme && !joinrel->partexprs && + !joinrel->nullable_partexprs && !joinrel->part_rels && + !joinrel->boundinfo); + + /* + * Join relation is partitioned using the same partitioning scheme as the + * joining relations and has same bounds. + */ + joinrel->part_scheme = part_scheme; + joinrel->boundinfo = outer_rel->boundinfo; + joinrel->nparts = outer_rel->nparts; + partnatts = joinrel->part_scheme->partnatts; + joinrel->partexprs = (List **) palloc0(sizeof(List *) * partnatts); + joinrel->nullable_partexprs = + (List **) palloc0(sizeof(List *) *partnatts); + + /* + * Construct partition keys for the join. + * + * An INNER join between two partitioned relations can be regarded as + * partitioned by either key expression. For example, A INNER JOIN B ON A.a = + * B.b can be regarded as partitioned on A.a or on B.b; they are equivalent. + * + * For a SEMI or ANTI join, the result can only be regarded as being + * partitioned in the same manner as the outer side, since the inner columns + * are not retained. + * + * An OUTER join like (A LEFT JOIN B ON A.a = B.b) may produce rows with + * B.b NULL. These rows may not fit the partitioning conditions imposed on + * B.b. Hence, strictly speaking, the join is not partitioned by B.b and + * thus partition keys of an OUTER join should include partition key + * expressions from the OUTER side only. 
However, because all + * commonly-used comparison operators are strict, the presence of nulls on + * the outer side doesn't cause any problem; they can't match anything at + * future join levels anyway. Therefore, we track two sets of expressions: + * those that authentically partition the relation (partexprs) and those + * that partition the relation with the exception that extra nulls may be + * present (nullable_partexprs). When the comparison operator is strict, + * the latter is just as good as the former. + */ + for (cnt = 0; cnt < partnatts; cnt++) + { + List *outer_expr; + List *outer_null_expr; + List *inner_expr; + List *inner_null_expr; + List *partexpr = NIL; + List *nullable_partexpr = NIL; + + outer_expr = list_copy(outer_rel->partexprs[cnt]); + outer_null_expr = list_copy(outer_rel->nullable_partexprs[cnt]); + inner_expr = list_copy(inner_rel->partexprs[cnt]); + inner_null_expr = list_copy(inner_rel->nullable_partexprs[cnt]); + + switch (jointype) + { + case JOIN_INNER: + partexpr = list_concat(outer_expr, inner_expr); + nullable_partexpr = list_concat(outer_null_expr, + inner_null_expr); + break; + + case JOIN_SEMI: + case JOIN_ANTI: + partexpr = outer_expr; + nullable_partexpr = outer_null_expr; + break; + + case JOIN_LEFT: + partexpr = outer_expr; + nullable_partexpr = list_concat(inner_expr, + outer_null_expr); + nullable_partexpr = list_concat(nullable_partexpr, + inner_null_expr); + break; + + case JOIN_FULL: + nullable_partexpr = list_concat(outer_expr, + inner_expr); + nullable_partexpr = list_concat(nullable_partexpr, + outer_null_expr); + nullable_partexpr = list_concat(nullable_partexpr, + inner_null_expr); + break; + + default: + elog(ERROR, "unrecognized join type: %d", (int) jointype); + + } + + joinrel->partexprs[cnt] = partexpr; + joinrel->nullable_partexprs[cnt] = nullable_partexpr; + } +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8288cf36..e7ba54b0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1195,6 +1195,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_partition_wise_join", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables partition-wise join."), + NULL + }, + &enable_partition_wise_join, + false, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c03c59df..5ef4e565 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -304,6 +304,7 @@ #enable_seqscan = on #enable_sort = on #enable_tidscan = on +#enable_partition_wise_join = off # - Planner Cost Constants - diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index ef0fbe6f..e188cba1 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -158,6 +158,9 @@ typedef void (*ShutdownForeignScan_function) (ForeignScanState *node); typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); +typedef List *(*ReparameterizeForeignPathByChild_function) (PlannerInfo *root, + List *fdw_private, + RelOptInfo *child_rel); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -230,6 +233,9 @@ typedef struct FdwRoutine ReInitializeDSMForeignScan_function ReInitializeDSMForeignScan; InitializeWorkerForeignScan_function InitializeWorkerForeignScan; ShutdownForeignScan_function ShutdownForeignScan; 
+ + /* Support functions for path reparameterization. */ + ReparameterizeForeignPathByChild_function ReparameterizeForeignPathByChild; } FdwRoutine; diff --git a/src/include/nodes/extensible.h b/src/include/nodes/extensible.h index 0654e79c..c3436c7a 100644 --- a/src/include/nodes/extensible.h +++ b/src/include/nodes/extensible.h @@ -96,6 +96,9 @@ typedef struct CustomPathMethods List *tlist, List *clauses, List *custom_plans); + struct List *(*ReparameterizeCustomPathByChild) (PlannerInfo *root, + List *custom_private, + RelOptInfo *child_rel); } CustomPathMethods; /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 31bdde3e..96258106 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -505,6 +505,11 @@ typedef struct PartitionSchemeData *PartitionScheme; * handling join alias Vars. Currently this is not needed because all join * alias Vars are expanded to non-aliased form during preprocess_expression. * + * We also have relations representing joins between child relations of + * different partitioned tables. These relations are not added to + * join_rel_level lists as they are not joined directly by the dynamic + * programming algorithm. + * * There is also a RelOptKind for "upper" relations, which are RelOptInfos * that describe post-scan/join processing steps, such as aggregation. * Many of the fields in these RelOptInfos are meaningless, but their Path @@ -639,14 +644,18 @@ typedef struct PartitionSchemeData *PartitionScheme; * boundinfo - Partition bounds * nparts - Number of partitions * part_rels - RelOptInfos for each partition - * partexprs - Partition key expressions + * partexprs, nullable_partexprs - Partition key expressions * * Note: A base relation always has only one set of partition keys, but a join * relation may have as many sets of partition keys as the number of relations - * being joined. partexprs is an array containing part_scheme->partnatts - * elements, each of which is a list of partition key expressions. For a base - * relation each list contains only one expression, but for a join relation - * there can be one per baserel. + * being joined. partexprs and nullable_partexprs are arrays containing + * part_scheme->partnatts elements each. Each of these elements is a list of + * partition key expressions. For a base relation each list in partexprs + * contains only one expression and nullable_partexprs is not populated. For a + * join relation, partexprs and nullable_partexprs contain partition key + * expressions from non-nullable and nullable relations resp. Lists at any + * given position in those arrays together contain as many elements as the + * number of joining relations. *---------- */ typedef enum RelOptKind @@ -654,6 +663,7 @@ typedef enum RelOptKind RELOPT_BASEREL, RELOPT_JOINREL, RELOPT_OTHER_MEMBER_REL, + RELOPT_OTHER_JOINREL, RELOPT_UPPER_REL, RELOPT_DEADREL } RelOptKind; @@ -667,13 +677,17 @@ typedef enum RelOptKind (rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) /* Is the given relation a join relation? */ -#define IS_JOIN_REL(rel) ((rel)->reloptkind == RELOPT_JOINREL) +#define IS_JOIN_REL(rel) \ + ((rel)->reloptkind == RELOPT_JOINREL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) /* Is the given relation an upper relation? */ #define IS_UPPER_REL(rel) ((rel)->reloptkind == RELOPT_UPPER_REL) /* Is the given relation an "other" relation? 
*/ -#define IS_OTHER_REL(rel) ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) +#define IS_OTHER_REL(rel) \ + ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) typedef struct RelOptInfo { @@ -759,7 +773,8 @@ typedef struct RelOptInfo struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, * stored in the same order of bounds */ - List **partexprs; /* Partition key expressions. */ + List **partexprs; /* Non-nullable partition key expressions. */ + List **nullable_partexprs; /* Nullable partition key expressions. */ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ @@ -773,6 +788,26 @@ typedef struct RelOptInfo } RelOptInfo; +/* + * Is given relation partitioned? + * + * A join between two partitioned relations with same partitioning scheme + * without any matching partitions will not have any partition in it but will + * have partition scheme set. So a relation is deemed to be partitioned if it + * has a partitioning scheme, bounds and positive number of partitions. + */ +#define IS_PARTITIONED_REL(rel) \ + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0) + +/* + * Convenience macro to make sure that a partitioned relation has all the + * required members set. + */ +#define REL_HAS_ALL_PART_PROPS(rel) \ + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \ + (rel)->part_rels && (rel)->partexprs && (rel)->nullable_partexprs) + + /* * IndexOptInfo * Per-index information for planning/optimization diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2198c9db..7c527ec8 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -77,6 +77,7 @@ extern bool enable_mergejoin; extern bool enable_hashjoin; extern bool enable_fast_query_shipping; extern bool enable_gathermerge; +extern bool enable_partition_wise_join; extern bool enable_nestloop_suppression; extern int constraint_exclusion; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 3df87235..d6e8ffdb 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -312,6 +312,8 @@ extern LimitPath *create_limit_path(PlannerInfo *root, RelOptInfo *rel, extern Path *reparameterize_path(PlannerInfo *root, Path *path, Relids required_outer, double loop_count); +extern Path *reparameterize_path_by_child(PlannerInfo *root, Path *path, + RelOptInfo *child_rel); extern Path *create_remotesubplan_path(PlannerInfo *root, Path *subpath, Distribution *distribution); @@ -354,6 +356,10 @@ extern ParamPathInfo *get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer); extern ParamPathInfo *find_param_path_info(RelOptInfo *rel, Relids required_outer); +extern RelOptInfo *build_child_join_rel(PlannerInfo *root, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + RelOptInfo *parent_joinrel, List *restrictlist, + SpecialJoinInfo *sjinfo, JoinType jointype); #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 416d15d8..48d6f994 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -59,6 +59,8 @@ extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual); +extern void 
generate_partition_wise_join_paths(PlannerInfo *root, + RelOptInfo *rel); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); @@ -112,6 +114,9 @@ extern bool have_join_order_restriction(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_dangerous_phv(PlannerInfo *root, Relids outer_relids, Relids inner_params); +extern void mark_dummy_rel(RelOptInfo *rel); +extern bool have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, + JoinType jointype, List *restrictlist); /* * equivclass.c diff --git a/src/include/optimizer/placeholder.h b/src/include/optimizer/placeholder.h index 772fef33..a4a7b79f 100644 --- a/src/include/optimizer/placeholder.h +++ b/src/include/optimizer/placeholder.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * placeholder.h - * prototypes for optimizer/util/placeholder.c. + * prototypes for optimizer/util/placeholder.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -18,15 +18,17 @@ extern PlaceHolderVar *make_placeholder_expr(PlannerInfo *root, Expr *expr, - Relids phrels); + Relids phrels); extern PlaceHolderInfo *find_placeholder_info(PlannerInfo *root, - PlaceHolderVar *phv, bool create_new_ph); + PlaceHolderVar *phv, bool create_new_ph); extern void find_placeholders_in_jointree(PlannerInfo *root); extern void update_placeholder_eval_levels(PlannerInfo *root, - SpecialJoinInfo *new_sjinfo); + SpecialJoinInfo *new_sjinfo); extern void fix_placeholder_input_needed_levels(PlannerInfo *root); extern void add_placeholders_to_base_rels(PlannerInfo *root); extern void add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, - RelOptInfo *outer_rel, RelOptInfo *inner_rel); + RelOptInfo *outer_rel, RelOptInfo *inner_rel); +extern void add_placeholders_to_child_joinrel(PlannerInfo *root, + RelOptInfo *childrel, RelOptInfo *parentrel); -#endif /* PLACEHOLDER_H */ +#endif /* PLACEHOLDER_H */ diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index b10500a3..464efbe4 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -122,6 +122,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, + Relids join_relids); extern void preprocess_rowmarks(PlannerInfo *root); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 08b43b52..e51066ed 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * prep.h - * prototypes for files in optimizer/prep/ + * prototypes for files in optimizer/prep/ * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -41,7 +41,7 @@ extern Expr *canonicalize_qual(Expr *qual); extern List *preprocess_targetlist(PlannerInfo *root, List *tlist); extern List *preprocess_onconflict_targetlist(List *tlist, - int result_relation, List *range_table); + int result_relation, List *range_table); extern PlanRowMark *get_plan_rowmark(List *rowmarks, Index rtindex); @@ -53,9 +53,15 @@ extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, - AppendRelInfo 
*appinfo); + AppendRelInfo *appinfo); extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel); + RelOptInfo *child_rel); -#endif /* PREP_H */ +extern SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root, + SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids); +extern Relids adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, + Relids child_relids, Relids top_parent_relids); + +#endif /* PREP_H */ diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out new file mode 100644 index 00000000..234b8b53 --- /dev/null +++ b/src/test/regress/expected/partition_join.out @@ -0,0 +1,1789 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> 
Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(22 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Nested Loop Left Join + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------ + Sort + Sort Key: prt1_p1.a, prt2_p1.b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + Filter: (((50) = prt1_p1.a) OR ((75) = prt2_p1.b)) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(27 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p2 t2 + Filter: (b > 250) + -> Hash + -> Seq Scan on prt1_p2 t1 + Filter: ((a < 450) AND (b = 0)) +(10 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c 
+-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Left Join + Hash Cond: (prt1_p1.a = b) + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Hash + -> Result + One-Time Filter: false + -> Hash Right Join + Hash Cond: (prt2_p2.b = prt1_p2.a) + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(17 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------ + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = b) + Filter: ((prt1_p1.b = 0) OR (a = 0)) + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Hash + -> Result + One-Time Filter: false + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash Full Join + Hash Cond: (prt2_p3.b = a) + Filter: ((b = 0) OR (prt2_p3.a = 0)) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Result + One-Time Filter: false +(27 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Semi Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Hash Semi Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Nested Loop Semi Join + Join Filter: (t1_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(24 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 
FROM prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Append + -> Hash Anti Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t2 + -> Hash Anti Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + -> Hash Anti Join + Hash Cond: (t1_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt2_p3 t2_2 +(17 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = t1.a) + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = t2.a) + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = t1_1.a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = t2_1.a) + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = t1_2.a) + -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 + Index Cond: (b = t2_2.a) +(28 rows) + +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Hash Left Join + Hash Cond: ((t1.c)::text = (t2.c)::text) + Filter: ((t1.b + COALESCE(t2.b, 0)) = 0) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Hash Join + Hash Cond: (t2.a = t3.b) + -> Seq Scan on prt1_p1 t2 + -> Hash + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t2_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t2_1 + -> Hash + -> Seq Scan on prt2_p2 t3_1 + -> Hash Join + Hash Cond: (t2_2.a = t3_2.b) + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Seq Scan on prt2_p3 t3_2 +(26 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, 
t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + -> Seq Scan on prt2_e_p1 t2 + -> Hash + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) + -> Seq Scan on prt2_e_p2 t2_1 + -> Hash + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_2.b + t2_2.a) / 2) = ((t1_2.a + t1_2.b) / 2)) + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Seq Scan on prt1_e_p3 t1_2 + Filter: (c = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop + Join Filter: (t1.a = ((t3.a + t3.b) / 2)) + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = t2.b) + -> Nested Loop + Join Filter: (t1_1.a = ((t3_1.a + t3_1.b) / 2)) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = t2_1.b) + -> Nested Loop + Join Filter: (t1_2.a = ((t3_2.a + t3_2.b) / 2)) + -> Hash Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Index Scan using iprt1_e_p3_ab2 
on prt1_e_p3 t3_2 + Index Cond: (((a + b) / 2) = t2_2.b) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Hash Right Join + Hash Cond: (((t3.a + t3.b) / 2) = t1.a) + -> Seq Scan on prt1_e_p1 t3 + -> Hash + -> Hash Right Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (((t3_1.a + t3_1.b) / 2) = t1_1.a) + -> Seq Scan on prt1_e_p2 t3_1 + -> Hash + -> Hash Right Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (((t3_2.a + t3_2.b) / 2) = t1_2.a) + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1.a = ((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1_1.a = ((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1_2.a = ((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1_2.a = b) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) + -> Result + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = ((prt1_e_p1.a + prt1_e_p1.b) / 2)) + Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = ((prt1_e_p2.a + prt1_e_p2.b) / 2)) + Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = ((prt1_e_p3.a + prt1_e_p3.b) / 2)) + Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(43 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | phv | b | phv | ?column? 
| phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Nested Loop + Join Filter: (t1.a = t1_3.b) + -> HashAggregate + Group Key: t1_3.b + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = t1_3.b) + -> Seq Scan on prt1_e_p1 t2 + -> Hash + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t2.a + t2.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_1.a = t1_4.b) + -> HashAggregate + Group Key: t1_4.b + -> Hash Join + Hash Cond: (((t2_1.a + t2_1.b) / 2) = t1_4.b) + -> Seq Scan on prt1_e_p2 t2_1 + -> Hash + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_2.a = t1_5.b) + -> HashAggregate + Group Key: t1_5.b + -> Nested Loop + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = t1_5.b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) + Filter: (b = 0) +(41 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> HashAggregate + Group Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt2_p1 t1_3 + -> Hash + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t1_3.b) + Filter: (b = 0) + -> Nested Loop + -> HashAggregate + Group Key: t1_4.b + -> Hash Semi Join + Hash Cond: (t1_4.b = ((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt2_p2 t1_4 + -> Hash + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t1_4.b) + Filter: (b = 0) + -> Nested Loop + -> Unique + -> Sort + Sort Key: t1_5.b + -> Hash Semi Join + Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t1_5.b) + Filter: (b = 0) +(40 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN 
+---------------------------------------------------------------- + Merge Append + Sort Key: t1.a + -> Merge Semi Join + Merge Cond: (t1.a = t1_3.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Seq Scan on prt2_p1 t1_3 + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_1.a = t1_4.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) + -> Sort + Sort Key: t1_4.b + -> Seq Scan on prt2_p2 t1_4 + -> Sort + Sort Key: (((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_2.a = t1_5.b) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) + -> Sort + Sort Key: t1_5.b + -> Seq Scan on prt2_p3 t1_5 + -> Sort + Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(47 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Merge Left Join + Merge Cond: (t1.a = t2.b) + -> Sort + Sort Key: t1.a + -> Merge Left Join + Merge Cond: ((((t3.a + t3.b) / 2)) = t1.a) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + -> Sort + Sort Key: t2.b + -> Seq Scan on prt2_p1 t2 + -> Merge Left Join + Merge Cond: (t1_1.a = t2_1.b) + -> Sort + Sort Key: t1_1.a + -> Merge Left Join + Merge Cond: ((((t3_1.a + t3_1.b) / 2)) = t1_1.a) + -> Sort + Sort Key: (((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + -> Sort + Sort Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + -> Merge Left Join + Merge Cond: (t1_2.a = t2_2.b) + -> Sort + Sort Key: t1_2.a + -> Merge Left Join + Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) + -> Sort + Sort Key: (((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + -> Sort + Sort Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 +(52 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Merge Left Join + Merge Cond: (prt1_p1.a = b) + -> Sort + Sort Key: prt1_p1.a + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: b + -> Result + One-Time Filter: false + -> Merge Left Join + Merge Cond: (prt1_p2.a = prt2_p2.b) + -> Sort + Sort Key: prt1_p2.a + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: prt2_p2.b + -> Seq Scan on prt2_p2 + Filter: (b > 250) +(23 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: prt1_m_p1.a, prt2_m_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_m_p1.a = ((prt2_m_p1.b + prt2_m_p1.a) / 2)) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p2.a = ((prt2_m_p2.b + prt2_m_p2.a) / 2)) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p3.a = ((prt2_m_p3.b + prt2_m_p3.a) / 2)) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) 
+(24 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: t1.c, t3.c + -> HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Seq Scan on plt1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Seq Scan on plt2_p1 t2 + -> Hash + -> Seq Scan on plt1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Seq Scan on plt1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Seq Scan on plt2_p2 t2_1 + -> Hash + -> Seq Scan on plt1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Seq Scan on plt1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Seq Scan on plt1_e_p3 t3_2 +(33 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 
0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------- + Sort + Sort Key: a, t2.b + -> Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------- + Sort + Sort Key: a, t2.b + -> Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(14 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 
PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Append + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_3.b = t1_3.a) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(29 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Right Join + Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_3.b = t1_3.a) AND ((t2_3.c)::text = (t1_3.c)::text)) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(30 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> 
Hash Right Join + Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + -> Seq Scan on prt1_l_p1 t1 + -> Hash + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Hash + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Hash + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_3.a = t2_3.b) AND ((t1_3.c)::text = (t2_3.c)::text)) + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: prt1_l_p1.a, prt2_l_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p3_p1.a = prt2_l_p3_p1.b) AND ((prt1_l_p3_p1.c)::text = (prt2_l_p3_p1.c)::text)) + -> Append + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Append + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(33 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop 
Left Join + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3.b = t2.a) AND ((t3.c)::text = (t2.c)::text)) + -> Seq Scan on prt2_l_p1 t3 + -> Hash + -> Seq Scan on prt1_l_p1 t2 + Filter: ((t1.a = a) AND ((t1.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_1.b = t2_1.a) AND ((t3_1.c)::text = (t2_1.c)::text)) + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((t1_1.a = a) AND ((t1_1.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_2.b = t2_2.a) AND ((t3_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((t1_2.a = a) AND ((t1_2.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_3.b = t2_3.a) AND ((t3_3.c)::text = (t2_3.c)::text)) + -> Append + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((t1_3.a = a) AND ((t1_3.c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((t1_3.a = a) AND ((t1_3.c)::text = (c)::text)) +(46 rows) + +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +------------------------------------------------------------------------- + Hash Left Join + Hash Cond: ((t2.b = a) AND (t2.a = b) AND ((t2.c)::text = (c)::text)) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(11 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY 
LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +---------------------------------------------- + Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(11 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +-------------------------------------------------------- + Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash + -> Append + -> Hash Join + Hash Cond: (t1.a = t3.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t1_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.a = t3_2.b) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt2_p3 t3_2 +(23 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +--------------------------------------------------------- + Nested Loop Left Join + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a < b) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1.a < b) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1.a < b) +(12 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +---------------------------------------------- + Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = t1.a) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(11 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +---------------------------------------------- + Hash Left Join + Hash Cond: (t1.a = t2.b) + -> Append + -> Seq Scan on prt1_m_p1 
t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(11 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +---------------------------------------------- + Hash Left Join + Hash Cond: (t1.c = t2.c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(11 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +---------------------------------------------- + Hash Right Join + Hash Cond: (t2.c = (t1.c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +---------------------------------------------------------- + Hash Join + Hash Cond: (t2.c = (t1.c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Hash Join + Hash Cond: (t3.c = (t1.c)::text) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(16 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +---------------------------------------------- + Hash Full Join + Hash Cond: ((t2.c)::text = (t1.c)::text) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(10 rows) + diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b624ad7c..d098ccb4 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -113,6 +113,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster_print | off enable_nestloop | on enable_nestloop_suppression | off + enable_partition_wise_join | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | off diff --git a/src/test/regress/expected/sysviews_1.out b/src/test/regress/expected/sysviews_1.out index 708e4676..76c8fa59 100644 --- a/src/test/regress/expected/sysviews_1.out +++ b/src/test/regress/expected/sysviews_1.out @@ -95,6 +95,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster | on enable_multi_cluster_print | off enable_nestloop | on + enable_partition_wise_join | off enable_oracle_compatible | off enable_pgbouncer | off enable_plpgsql_debug_print | off diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f9eabd53..289f1ed4 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -116,7 +116,7 @@ test: publication 
subscription # ---------- # Another group of parallel tests # ---------- -test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass +test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass partition_join # ---------- # As XL uses advisory locks internally running this test separately. diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 04781232..ad4f5d4f 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -193,6 +193,7 @@ test: xml test: event_trigger test: fast_default test: stats +test: partition_join test: xc_create_function test: xc_groupby test: xc_distkey diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql new file mode 100644 index 00000000..ca525d99 --- /dev/null +++ b/src/test/regress/sql/partition_join.sql @@ -0,0 +1,354 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- + +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; + +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; + +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; + +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE 
prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; + +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE 
prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; 
+SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +RESET enable_hashjoin; +RESET enable_nestloop; + +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; + +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + +-- +-- tests for list partitioned tables. 
+-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; + +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; + +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; + +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; + +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO 
(500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; + +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; + +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; + +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR 
VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; + +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; + +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); From 756ea7fe056033cd24f546d2a03143421d08307f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 16:34:51 +0800 Subject: [PATCH 200/578] Clean up sloppy maintenance of regression test schedule files.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/parallel_schedule | 4 ++-- src/test/regress/serial_schedule | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 289f1ed4..ab868e3a 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -116,7 +116,7 @@ test: publication subscription # ---------- # Another group of parallel tests # ---------- -test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass partition_join +test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass # ---------- # As XL uses 
advisory locks internally running this test separately. @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity +test: identity partition_join # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index ad4f5d4f..28d7802d 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -93,6 +93,7 @@ test: updatable_views test: rolenames test: roleattributes test: create_am +test: hash_func test: sanity_check test: errors test: select @@ -184,16 +185,16 @@ test: conversion test: truncate test: alter_table test: sequence -test: identity test: polymorphism test: rowtypes test: returning test: with test: xml +test: identity +test: partition_join test: event_trigger test: fast_default test: stats -test: partition_join test: xc_create_function test: xc_groupby test: xc_distkey From 61ef8966abdfc14710a263668dcb662e7f97556e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sat, 28 Oct 2017 11:10:21 +0200 Subject: [PATCH 201/578] Fix misplaced ReleaseSysCache call in get_default_partition_oid. Julien Rouhaud Discussion: http://postgr.es/m/CAOBaU_Y4omLA+VbsVdA-JwBLoJWiPxfdKCkMjrZM7NMZxa1fKw@mail.gmail.com --- src/backend/catalog/partition.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 70a3d6d8..b498716e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2790,9 +2790,9 @@ get_default_partition_oid(Oid parentId) part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); defaultPartId = part_table_form->partdefid; + ReleaseSysCache(tuple); } - ReleaseSysCache(tuple); return defaultPartId; } From e6e22b6ce502c4bb04ca588d5ea32e40543453f0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 31 Oct 2017 14:41:21 +0530 Subject: [PATCH 202/578] Fix code related to partitioning schemes for dropped columns. The entry in appinfo->translated_vars can be NULL; if so, we must avoid dereferencing it. Ashutosh Bapat Discussion: http://postgr.es/m/CAFjFpReL7+1ien=-21rhjpO3bV7aAm1rQ8XgLVk2csFagSzpZQ@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 12 ++++++++++++ src/test/regress/expected/alter_table.out | 7 +++++++ src/test/regress/sql/alter_table.sql | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 0774ff46..ba5a4418 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -970,6 +970,18 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, attno - 1); int child_index; + /* + * Ignore any column dropped from the parent. + * Corresponding Var won't have any translation. It won't + * have attr_needed information, since it can not be + * referenced in the query. 
+ */ + if (var == NULL) + { + Assert(attr_needed == NULL); + continue; + } + child_index = var->varattno - childrel->min_attr; childrel->attr_needed[child_index] = attr_needed; } diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 1748add2..d112f403 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3613,6 +3613,13 @@ ALTER TABLE list_parted2 DROP COLUMN b; ERROR: cannot drop column named in partition key ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key +-- dropping non-partition key columns should be allowed on the parent table. +ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index e2c0219e..97d2d9bf 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2393,6 +2393,10 @@ ALTER TABLE part_2 INHERIT inh_test; ALTER TABLE list_parted2 DROP COLUMN b; ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +-- dropping non-partition key columns should be allowed on the parent table. +ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; From 510daa93e5a1eed5b6c0b09706aa615c6961da27 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 18:24:56 +0800 Subject: [PATCH 203/578] 1.After a MINVALUE/MAXVALUE bound, allow only more of the same. 2.Copy information from the relcache instead of pointing to it. 3.Add hash partitioning. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 28 +- doc/src/sgml/ref/alter_table.sgml | 7 + doc/src/sgml/ref/create_table.sgml | 4020 +++++++++--------- src/backend/catalog/partition.c | 684 ++- src/backend/commands/tablecmds.c | 40 +- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 2 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 2 + src/backend/optimizer/path/joinrels.c | 10 +- src/backend/optimizer/util/plancat.c | 35 +- src/backend/parser/gram.y | 76 +- src/backend/parser/parse_utilcmd.c | 75 +- src/backend/utils/adt/ruleutils.c | 15 +- src/backend/utils/cache/relcache.c | 26 +- src/bin/psql/tab-complete.c | 2 +- src/include/catalog/partition.h | 5 + src/include/catalog/pg_proc.h | 3 + src/include/nodes/parsenodes.h | 8 +- src/test/regress/expected/alter_table.out | 62 + src/test/regress/expected/alter_table_1.out | 62 + src/test/regress/expected/alter_table_2.out | 62 + src/test/regress/expected/alter_table_3.out | 62 + src/test/regress/expected/create_table.out | 88 +- src/test/regress/expected/inherit.out | 4 +- src/test/regress/expected/inherit_1.out | 4 +- src/test/regress/expected/inherit_2.out | 4 +- src/test/regress/expected/inherit_3.out | 4 +- src/test/regress/expected/insert.out | 81 +- src/test/regress/expected/insert_1.out | 35 +- src/test/regress/expected/partition_join.out | 81 + src/test/regress/expected/update.out | 29 + src/test/regress/sql/alter_table.sql | 64 + src/test/regress/sql/create_table.sql | 57 +- src/test/regress/sql/inherit.sql | 4 +- src/test/regress/sql/insert.sql | 52 +- src/test/regress/sql/partition_join.sql | 32 + src/test/regress/sql/update.sql | 28 + src/tools/pgindent/typedefs.list | 1 + 39 files changed, 3669 insertions(+), 2189 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 7449e064..a65a130f 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3145,6 +3145,19 @@ VALUES ('Albany', NULL, NULL, 'NY'); + + + Hash Partitioning + + + + The table is partitioned by specifying a modulus and a remainder for + each partition. Each partition will hold the rows for which the hash + value of the partition key divided by the specified modulus will + produce the specified remainder. + + + If your application needs to use other forms of partitioning not listed @@ -3171,9 +3184,8 @@ VALUES ('Albany', NULL, NULL, 'NY'); All rows inserted into a partitioned table will be routed to one of the partitions based on the value of the partition key. Each partition has a subset of the data defined by its - partition bounds. Currently supported - partitioning methods include range and list, where each partition is - assigned a range of keys and a list of keys, respectively. + partition bounds. The currently supported + partitioning methods are range, list, and hash. @@ -3598,11 +3610,11 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - Declarative partitioning only supports list and range partitioning, - whereas table inheritance allows data to be divided in a manner of - the user's choosing. (Note, however, that if constraint exclusion is - unable to prune partitions effectively, query performance will be very - poor.) + Declarative partitioning only supports range, list and hash + partitioning, whereas table inheritance allows data to be divided in a + manner of the user's choosing. 
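A minimal sketch of the hash-partitioning DDL this patch documents (the hash_parted table, its column, and the modulus of 4 are illustrative assumptions, not taken from the patch; the FOR VALUES WITH (MODULUS, REMAINDER) clause is the same one shown for ATTACH PARTITION below):

CREATE TABLE hash_parted (a int, b int) PARTITION BY HASH (a);
CREATE TABLE hash_parted_p0 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE hash_parted_p1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE hash_parted_p2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE hash_parted_p3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 3);
-- Each inserted row is routed to the partition whose declared remainder
-- equals the hash of the partition key value modulo 4.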
(Note, however, that if constraint + exclusion is unable to prune partitions effectively, query performance + will be very poor.) diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 06c5655e..d9ddbd01 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -1629,6 +1629,13 @@ ALTER TABLE cities ATTACH PARTITION cities_partdef DEFAULT; + + Attach a partition to hash partitioned table: + +ALTER TABLE orders + ATTACH PARTITION orders_p4 FOR VALUES WITH (MODULUS 4, REMAINDER 3); + + Detach a partition from partitioned table: diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index e46601b7..62792897 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1,2111 +1,1995 @@ - - - - - CREATE TABLE - - - - CREATE TABLE - 7 - SQL - Language Statements - - - - CREATE TABLE - define a new table - - - - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name ( [ - { column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] - | table_constraint - | LIKE source_table [ like_option ... ] } - [, ... ] -] ) -[ INHERITS ( parent_table [, ... ] ) ] -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] -[ - DISTRIBUTE BY { REPLICATION | ROUNDROBIN | { [HASH | MODULO ] ( column_name ) } } | - DISTRIBUTED { { BY ( column_name ) } | { RANDOMLY } | - DISTSTYLE { EVEN | KEY | ALL } DISTKEY ( column_name ) -] -[ TO { GROUP groupname | NODE ( nodename [, ... ] ) } ] - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name - OF type_name [ ( - { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] - | table_constraint } - [, ... ] -) ] -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name - PARTITION OF parent_table [ ( - { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] - | table_constraint } - [, ... ] -) ] { FOR VALUES partition_bound_spec | DEFAULT } -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] -[ - DISTRIBUTE BY { REPLICATION | ROUNDROBIN | { [HASH | MODULO ] ( column_name ) } } | - DISTRIBUTED { { BY ( column_name ) } | { RANDOMLY } | - DISTSTYLE { EVEN | KEY | ALL } DISTKEY ( column_name ) -] -[ TO { GROUP groupname | NODE ( nodename [, ... 
] ) } ] - -where column_constraint is: - -[ CONSTRAINT constraint_name ] -{ NOT NULL | - NULL | - CHECK ( expression ) [ NO INHERIT ] | - DEFAULT default_expr | - GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] | - UNIQUE index_parameters | - PRIMARY KEY index_parameters | - REFERENCES reftable [ ( refcolumn ) ] [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] - [ ON DELETE action ] [ ON UPDATE action ] } -[ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] - -and table_constraint is: - -[ CONSTRAINT constraint_name ] -{ CHECK ( expression ) [ NO INHERIT ] | - UNIQUE ( column_name [, ... ] ) index_parameters | - PRIMARY KEY ( column_name [, ... ] ) index_parameters | - EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] | - FOREIGN KEY ( column_name [, ... ] ) REFERENCES reftable [ ( refcolumn [, ... ] ) ] - [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] [ ON DELETE action ] [ ON UPDATE action ] } -[ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] - -and like_option is: - -{ INCLUDING | EXCLUDING } { DEFAULTS | CONSTRAINTS | IDENTITY | INDEXES | STORAGE | COMMENTS | ALL } - -and partition_bound_spec is: - -IN ( { numeric_literal | string_literal | NULL } [, ...] ) | -FROM ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) - TO ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) - -index_parameters in UNIQUE, PRIMARY KEY, and EXCLUDE constraints are: - -[ WITH ( storage_parameter [= value] [, ... ] ) ] -[ USING INDEX TABLESPACE tablespace_name ] - -exclude_element in an EXCLUDE constraint is: - -{ column_name | ( expression ) } [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] - - - - - - Description - - - CREATE TABLE will create a new, initially empty table - in the current database. The table will be owned by the user issuing the - command. - - - - If a schema name is given (for example, CREATE TABLE - myschema.mytable ...) then the table is created in the specified - schema. Otherwise it is created in the current schema. Temporary - tables exist in a special schema, so a schema name cannot be given - when creating a temporary table. The name of the table must be - distinct from the name of any other table, sequence, index, view, - or foreign table in the same schema. - - - - CREATE TABLE also automatically creates a data - type that represents the composite type corresponding - to one row of the table. Therefore, tables cannot have the same - name as any existing data type in the same schema. - - - - The optional constraint clauses specify constraints (tests) that - new or updated rows must satisfy for an insert or update operation - to succeed. A constraint is an SQL object that helps define the - set of valid values in the table in various ways. - - - - There are two ways to define constraints: table constraints and - column constraints. A column constraint is defined as part of a - column definition. A table constraint definition is not tied to a - particular column, and it can encompass more than one column. - Every column constraint can also be written as a table constraint; - a column constraint is only a notational convenience for use when the - constraint only affects one column. - - - - To be able to create a table, you must have USAGE - privilege on all column types or the type in the OF - clause, respectively. 
- - - - - Parameters - - - - - TEMPORARY or TEMP - - - If specified, the table is created as a temporary table. - Temporary tables are automatically dropped at the end of a - session, or optionally at the end of the current transaction - (see ON COMMIT below). Existing permanent - tables with the same name are not visible to the current session - while the temporary table exists, unless they are referenced - with schema-qualified names. Any indexes created on a temporary - table are automatically temporary as well. - - - - The autovacuum daemon cannot - access and therefore cannot vacuum or analyze temporary tables. - For this reason, appropriate vacuum and analyze operations should be - performed via session SQL commands. For example, if a temporary - table is going to be used in complex queries, it is wise to run - ANALYZE on the temporary table after it is populated. - - - - Optionally, GLOBAL or LOCAL - can be written before TEMPORARY or TEMP. - This presently makes no difference in PostgreSQL - and is deprecated; see - . - - - - - - UNLOGGED - - - If specified, the table is created as an unlogged table. Data written - to unlogged tables is not written to the write-ahead log (see ), which makes them considerably faster than ordinary - tables. However, they are not crash-safe: an unlogged table is - automatically truncated after a crash or unclean shutdown. The contents - of an unlogged table are also not replicated to standby servers. - Any indexes created on an unlogged table are automatically unlogged as - well. - - - - - - IF NOT EXISTS - - - Do not throw an error if a relation with the same name already exists. - A notice is issued in this case. Note that there is no guarantee that - the existing relation is anything like the one that would have been - created. - - - - - - table_name - - - The name (optionally schema-qualified) of the table to be created. - - - - - - OF type_name - - - Creates a typed table, which takes its - structure from the specified composite type (name optionally - schema-qualified). A typed table is tied to its type; for - example the table will be dropped if the type is dropped - (with DROP TYPE ... CASCADE). - - - - When a typed table is created, then the data types of the - columns are determined by the underlying composite type and are - not specified by the CREATE TABLE command. - But the CREATE TABLE command can add defaults - and constraints to the table and can specify storage parameters. - - - - - - PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } - - - Creates the table as a partition of the specified - parent table. The table can be created either as a partition for specific - values using FOR VALUES or as a default partition - using DEFAULT. - - - - The partition_bound_spec - must correspond to the partitioning method and partition key of the - parent table, and must not overlap with any existing partition of that - parent. The form with IN is used for list partitioning, - while the form with FROM and TO is used for - range partitioning. - - - - Each of the values specified in - the partition_bound_spec is - a literal, NULL, MINVALUE, or - MAXVALUE. Each literal value must be either a - numeric constant that is coercible to the corresponding partition key - column's type, or a string literal that is valid input for that type. - - - - When creating a list partition, NULL can be - specified to signify that the partition allows the partition key - column to be null. 
However, there cannot be more than one such - list partition for a given parent table. NULL - cannot be specified for range partitions. - - - - When creating a range partition, the lower bound specified with - FROM is an inclusive bound, whereas the upper - bound specified with TO is an exclusive bound. - That is, the values specified in the FROM list - are valid values of the corresponding partition key columns for this - partition, whereas those in the TO list are - not. Note that this statement must be understood according to the - rules of row-wise comparison (). - For example, given PARTITION BY RANGE (x,y), a partition - bound FROM (1, 2) TO (3, 4) - allows x=1 with any y>=2, - x=2 with any non-null y, - and x=3 with any y<4. - - - - The special values MINVALUE and MAXVALUE - may be used when creating a range partition to indicate that there - is no lower or upper bound on the column's value. For example, a - partition defined using FROM (MINVALUE) TO (10) allows - any values less than 10, and a partition defined using - FROM (10) TO (MAXVALUE) allows any values greater than - or equal to 10. - - - - When creating a range partition involving more than one column, it - can also make sense to use MAXVALUE as part of the lower - bound, and MINVALUE as part of the upper bound. For - example, a partition defined using - FROM (0, MAXVALUE) TO (10, MAXVALUE) allows any rows - where the first partition key column is greater than 0 and less than - or equal to 10. Similarly, a partition defined using - FROM ('a', MINVALUE) TO ('b', MINVALUE) allows any rows - where the first partition key column starts with "a". - - - - Note that any values after MINVALUE or - MAXVALUE in a partition bound are ignored; so the bound - (10, MINVALUE, 0) is equivalent to - (10, MINVALUE, 10) and (10, MINVALUE, MINVALUE) - and (10, MINVALUE, MAXVALUE). - - - - Also note that some element types, such as timestamp, - have a notion of "infinity", which is just another value that can - be stored. This is different from MINVALUE and - MAXVALUE, which are not real values that can be stored, - but rather they are ways of saying that the value is unbounded. - MAXVALUE can be thought of as being greater than any - other value, including "infinity" and MINVALUE as being - less than any other value, including "minus infinity". Thus the range - FROM ('infinity') TO (MAXVALUE) is not an empty range; it - allows precisely one value to be stored — "infinity". - - - - If DEFAULT is specified, the table will be - created as a default partition of the parent table. The parent can - either be a list or range partitioned table. A partition key value - not fitting into any other partition of the given parent will be - routed to the default partition. There can be only one default - partition for a given parent table. - - - - When a table has an existing DEFAULT partition and - a new partition is added to it, the existing default partition must - be scanned to verify that it does not contain any rows which properly - belong in the new partition. If the default partition contains a - large number of rows, this may be slow. The scan will be skipped if - the default partition is a foreign table or if it has a constraint which - proves that it cannot contain rows which should be placed in the new - partition. - - - - A partition must have the same column names and types as the partitioned - table to which it belongs. 
If the parent is specified WITH - OIDS then all partitions must have OIDs; the parent's OID - column will be inherited by all partitions just like any other column. - Modifications to the column names or types of a partitioned table, or - the addition or removal of an OID column, will automatically propagate - to all partitions. CHECK constraints will be inherited - automatically by every partition, but an individual partition may specify - additional CHECK constraints; additional constraints with - the same name and condition as in the parent will be merged with the - parent constraint. Defaults may be specified separately for each - partition. - - - - Rows inserted into a partitioned table will be automatically routed to - the correct partition. If no suitable partition exists, an error will - occur. Also, if updating a row in a given partition would require it - to move to another partition due to new partition key values, an error - will occur. - - - - Operations such as TRUNCATE which normally affect a table and all of its - inheritance children will cascade to all partitions, but may also be - performed on an individual partition. Note that dropping a partition - with DROP TABLE requires taking an ACCESS - EXCLUSIVE lock on the parent table. - - - - - - column_name - - - The name of a column to be created in the new table. - - - - - - data_type - - - The data type of the column. This can include array - specifiers. For more information on the data types supported by - PostgreSQL, refer to . - - - - - - COLLATE collation - - - The COLLATE clause assigns a collation to - the column (which must be of a collatable data type). - If not specified, the column data type's default collation is used. - - - - - - INHERITS ( parent_table [, ... ] ) - - - The optional INHERITS clause specifies a list of - tables from which the new table automatically inherits all - columns. Parent tables can be plain tables or foreign tables. - - - - Use of INHERITS creates a persistent relationship - between the new child table and its parent table(s). Schema - modifications to the parent(s) normally propagate to children - as well, and by default the data of the child table is included in - scans of the parent(s). - - - - If the same column name exists in more than one parent - table, an error is reported unless the data types of the columns - match in each of the parent tables. If there is no conflict, - then the duplicate columns are merged to form a single column in - the new table. If the column name list of the new table - contains a column name that is also inherited, the data type must - likewise match the inherited column(s), and the column - definitions are merged into one. If the - new table explicitly specifies a default value for the column, - this default overrides any defaults from inherited declarations - of the column. Otherwise, any parents that specify default - values for the column must all specify the same default, or an - error will be reported. - - - - CHECK constraints are merged in essentially the same way as - columns: if multiple parent tables and/or the new table definition - contain identically-named CHECK constraints, these - constraints must all have the same check expression, or an error will be - reported. Constraints having the same name and expression will - be merged into one copy. A constraint marked NO INHERIT in a - parent will not be considered. 
Notice that an unnamed CHECK - constraint in the new table will never be merged, since a unique name - will always be chosen for it. - - - - In Postgres-XL, it is currently not possible to distribute a table with more than one parent. - - - - Column STORAGE settings are also copied from parent tables. - - - - If a column in the parent table is an identity column, that property is - not inherited. A column in the child table can be declared identity - column if desired. - - - - - - PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ opclass ] [, ...] ) - - - The optional PARTITION BY clause specifies a strategy - of partitioning the table. The table thus created is called a - partitioned table. The parenthesized list of - columns or expressions forms the partition key - for the table. When using range partitioning, the partition key can - include multiple columns or expressions (up to 32, but this limit can - altered when building PostgreSQL.), but for - list partitioning, the partition key must consist of a single column or - expression. If no B-tree operator class is specified when creating a - partitioned table, the default B-tree operator class for the datatype will - be used. If there is none, an error will be reported. - - - - A partitioned table is divided into sub-tables (called partitions), - which are created using separate CREATE TABLE commands. - The partitioned table is itself empty. A data row inserted into the - table is routed to a partition based on the value of columns or - expressions in the partition key. If no existing partition matches - the values in the new row, an error will be reported. - - - - Partitioned tables do not support UNIQUE, - PRIMARY KEY, EXCLUDE, or - FOREIGN KEY constraints; however, you can define - these constraints on individual partitions. - - - - - - - LIKE source_table [ like_option ... ] - - - The LIKE clause specifies a table from which - the new table automatically copies all column names, their data types, - and their not-null constraints. - - - Unlike INHERITS, the new table and original table - are completely decoupled after creation is complete. Changes to the - original table will not be applied to the new table, and it is not - possible to include data of the new table in scans of the original - table. - - - Default expressions for the copied column definitions will be copied - only if INCLUDING DEFAULTS is specified. The - default behavior is to exclude default expressions, resulting in the - copied columns in the new table having null defaults. - Note that copying defaults that call database-modification functions, - such as nextval, may create a functional linkage between - the original and new tables. - - - Any identity specifications of copied column definitions will only be - copied if INCLUDING IDENTITY is specified. A new - sequence is created for each identity column of the new table, separate - from the sequences associated with the old table. - - - Not-null constraints are always copied to the new table. - CHECK constraints will be copied only if - INCLUDING CONSTRAINTS is specified. - No distinction is made between column constraints and table - constraints. - - - Indexes, PRIMARY KEY, UNIQUE, - and EXCLUDE constraints on the original table will be - created on the new table only if INCLUDING INDEXES - is specified. Names for the new indexes and constraints are - chosen according to the default rules, regardless of how the originals - were named. 
(This behavior avoids possible duplicate-name failures for - the new indexes.) - - - STORAGE settings for the copied column definitions will be - copied only if INCLUDING STORAGE is specified. The - default behavior is to exclude STORAGE settings, resulting - in the copied columns in the new table having type-specific default - settings. For more on STORAGE settings, see - . - - - Comments for the copied columns, constraints, and indexes - will be copied only if INCLUDING COMMENTS - is specified. The default behavior is to exclude comments, resulting in - the copied columns and constraints in the new table having no comments. - - - INCLUDING ALL is an abbreviated form of - INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING STORAGE INCLUDING COMMENTS. - - - Note that unlike INHERITS, columns and - constraints copied by LIKE are not merged with similarly - named columns and constraints. - If the same name is specified explicitly or in another - LIKE clause, an error is signaled. - - - The LIKE clause can also be used to copy column - definitions from views, foreign tables, or composite types. - Inapplicable options (e.g., INCLUDING INDEXES from - a view) are ignored. - - - - - - CONSTRAINT constraint_name - - - An optional name for a column or table constraint. If the - constraint is violated, the constraint name is present in error messages, - so constraint names like col must be positive can be used - to communicate helpful constraint information to client applications. - (Double-quotes are needed to specify constraint names that contain spaces.) - If a constraint name is not specified, the system generates a name. - - - - - - NOT NULL - - - The column is not allowed to contain null values. - - - - - - NULL - - - The column is allowed to contain null values. This is the default. - - - - This clause is only provided for compatibility with - non-standard SQL databases. Its use is discouraged in new - applications. - - - - - - CHECK ( expression ) [ NO INHERIT ] - - - The CHECK clause specifies an expression producing a - Boolean result which new or updated rows must satisfy for an - insert or update operation to succeed. Expressions evaluating - to TRUE or UNKNOWN succeed. Should any row of an insert or - update operation produce a FALSE result, an error exception is - raised and the insert or update does not alter the database. A - check constraint specified as a column constraint should - reference that column's value only, while an expression - appearing in a table constraint can reference multiple columns. - - - - Currently, CHECK expressions cannot contain - subqueries nor refer to variables other than columns of the - current row. The system column tableoid - may be referenced, but not any other system column. - - - - A constraint marked with NO INHERIT will not propagate to - child tables. - - - - When a table has multiple CHECK constraints, - they will be tested for each row in alphabetical order by name, - after checking NOT NULL constraints. - (PostgreSQL versions before 9.5 did not honor any - particular firing order for CHECK constraints.) - - - - - - DEFAULT - default_expr - - - The DEFAULT clause assigns a default data value for - the column whose column definition it appears within. The value - is any variable-free expression (subqueries and cross-references - to other columns in the current table are not allowed). The - data type of the default expression must match the data type of the - column. 
- - - - The default expression will be used in any insert operation that - does not specify a value for the column. If there is no default - for a column, then the default is null. - - - - - - GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] - - - This clause creates the column as an identity - column. It will have an implicit sequence attached to it - and the column in new rows will automatically have values from the - sequence assigned to it. - - - - The clauses ALWAYS and BY DEFAULT - determine how the sequence value is given precedence over a - user-specified value in an INSERT statement. - If ALWAYS is specified, a user-specified value is - only accepted if the INSERT statement - specifies OVERRIDING SYSTEM VALUE. If BY - DEFAULT is specified, then the user-specified value takes - precedence. See for details. (In - the COPY command, user-specified values are always - used regardless of this setting.) - - - - The optional sequence_options clause can be - used to override the options of the sequence. - See for details. - - - - - - UNIQUE (column constraint) - UNIQUE ( column_name [, ... ] ) (table constraint) - - - - The UNIQUE constraint specifies that a - group of one or more columns of a table can contain - only unique values. The behavior of the unique table constraint - is the same as that for column constraints, with the additional - capability to span multiple columns. - - - - For the purpose of a unique constraint, null values are not - considered equal. - - - - In Postgres-XL, if DISTRIBUTE BY - REPLICATION is not specified, only the distribution key is - allowed to have this constraint. - - - - Each unique table constraint must name a set of columns that is - different from the set of columns named by any other unique or - primary key constraint defined for the table. (Otherwise it - would just be the same constraint listed twice.) - - - - - - PRIMARY KEY (column constraint) - PRIMARY KEY ( column_name [, ... ] ) (table constraint) - - - The PRIMARY KEY constraint specifies that a column or - columns of a table can contain only unique (non-duplicate), nonnull - values. Only one primary key can be specified for a table, whether as a - column constraint or a table constraint. - - - - The primary key constraint should name a set of columns that is - different from the set of columns named by any unique - constraint defined for the same table. (Otherwise, the unique - constraint is redundant and will be discarded.) - - - - PRIMARY KEY enforces the same data constraints as - a combination of UNIQUE and NOT NULL, but - identifying a set of columns as the primary key also provides metadata - about the design of the schema, since a primary key implies that other - tables can rely on this set of columns as a unique identifier for rows. - - - - In Postgres-XL, if DISTRIBUTE BY REPLICATION is not specified, the - distribution key must be included in the set of primary key - columns. - - - - - - EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] - - - The EXCLUDE clause defines an exclusion - constraint, which guarantees that if - any two rows are compared on the specified column(s) or - expression(s) using the specified operator(s), not all of these - comparisons will return TRUE. If all of the - specified operators test for equality, this is equivalent to a - UNIQUE constraint, although an ordinary unique constraint - will be faster. 
However, exclusion constraints can specify - constraints that are more general than simple equality. - For example, you can specify a constraint that - no two rows in the table contain overlapping circles - (see ) by using the - && operator. - - - - Exclusion constraints are implemented using - an index, so each specified operator must be associated with an - appropriate operator class - (see ) for the index access - method index_method. - The operators are required to be commutative. - Each exclude_element - can optionally specify an operator class and/or ordering options; - these are described fully under - . - - - - The access method must support amgettuple (see ); at present this means GIN - cannot be used. Although it's allowed, there is little point in using - B-tree or hash indexes with an exclusion constraint, because this - does nothing that an ordinary unique constraint doesn't do better. - So in practice the access method will always be GiST or - SP-GiST. - - - - The predicate allows you to specify an - exclusion constraint on a subset of the table; internally this creates a - partial index. Note that parentheses are required around the predicate. - - - - In Postgres-XL, exclusion constraints are currently not - supported.The constraint is enforced when both rows map to the same datanode. - But if they go into different datanodes, the constraint is not enforced. - - - - - - REFERENCES reftable [ ( refcolumn ) ] [ MATCH matchtype ] [ ON DELETE action ] [ ON UPDATE action ] (column constraint) - - FOREIGN KEY ( column_name [, ... ] ) - REFERENCES reftable [ ( refcolumn [, ... ] ) ] - [ MATCH matchtype ] - [ ON DELETE action ] - [ ON UPDATE action ] - (table constraint) - - - - These clauses specify a foreign key constraint, which requires - that a group of one or more columns of the new table must only - contain values that match values in the referenced - column(s) of some row of the referenced table. If the refcolumn list is omitted, the - primary key of the reftable - is used. The referenced columns must be the columns of a non-deferrable - unique or primary key constraint in the referenced table. The user - must have REFERENCES permission on the referenced table - (either the whole table, or the specific referenced columns). - Note that foreign key constraints cannot be defined between temporary - tables and permanent tables. - - - - A value inserted into the referencing column(s) is matched against the - values of the referenced table and referenced columns using the - given match type. There are three match types: MATCH - FULL, MATCH PARTIAL, and MATCH - SIMPLE (which is the default). MATCH - FULL will not allow one column of a multicolumn foreign key - to be null unless all foreign key columns are null; if they are all - null, the row is not required to have a match in the referenced table. - MATCH SIMPLE allows any of the foreign key columns - to be null; if any of them are null, the row is not required to have a - match in the referenced table. - MATCH PARTIAL is not yet implemented. - (Of course, NOT NULL constraints can be applied to the - referencing column(s) to prevent these cases from arising.) - - - - In addition, when the data in the referenced columns is changed, - certain actions are performed on the data in this table's - columns. The ON DELETE clause specifies the - action to perform when a referenced row in the referenced table is - being deleted. 
Likewise, the ON UPDATE - clause specifies the action to perform when a referenced column - in the referenced table is being updated to a new value. If the - row is updated, but the referenced column is not actually - changed, no action is done. Referential actions other than the - NO ACTION check cannot be deferred, even if - the constraint is declared deferrable. There are the following possible - actions for each clause: - - - - NO ACTION - - - Produce an error indicating that the deletion or update - would create a foreign key constraint violation. - If the constraint is deferred, this - error will be produced at constraint check time if there still - exist any referencing rows. This is the default action. - - - - - - RESTRICT - - - Produce an error indicating that the deletion or update - would create a foreign key constraint violation. - This is the same as NO ACTION except that - the check is not deferrable. - - - - - - CASCADE - - - Delete any rows referencing the deleted row, or update the - values of the referencing column(s) to the new values of the - referenced columns, respectively. - - - - - - SET NULL - - - Set the referencing column(s) to null. - - - - - - SET DEFAULT - - - Set the referencing column(s) to their default values. - (There must be a row in the referenced table matching the default - values, if they are not null, or the operation will fail.) - - - - - - - - If the referenced column(s) are changed frequently, it might be wise to - add an index to the referencing column(s) so that referential actions - associated with the foreign key constraint can be performed more - efficiently. - - - - - - DEFERRABLE - NOT DEFERRABLE - - - This controls whether the constraint can be deferred. A - constraint that is not deferrable will be checked immediately - after every command. Checking of constraints that are - deferrable can be postponed until the end of the transaction - (using the command). - NOT DEFERRABLE is the default. - Currently, only UNIQUE, PRIMARY KEY, - EXCLUDE, and - REFERENCES (foreign key) constraints accept this - clause. NOT NULL and CHECK constraints are not - deferrable. Note that deferrable constraints cannot be used as - conflict arbitrators in an INSERT statement that - includes an ON CONFLICT DO UPDATE clause. - - - - - - INITIALLY IMMEDIATE - INITIALLY DEFERRED - - - If a constraint is deferrable, this clause specifies the default - time to check the constraint. If the constraint is - INITIALLY IMMEDIATE, it is checked after each - statement. This is the default. If the constraint is - INITIALLY DEFERRED, it is checked only at the - end of the transaction. The constraint check time can be - altered with the command. - - - - - - WITH ( storage_parameter [= value] [, ... ] ) - - - This clause specifies optional storage parameters for a table or index; - see for more - information. The WITH clause for a - table can also include OIDS=TRUE (or just OIDS) - to specify that rows of the new table - should have OIDs (object identifiers) assigned to them, or - OIDS=FALSE to specify that the rows should not have OIDs. - If OIDS is not specified, the default setting depends upon - the configuration parameter. - (If the new table inherits from any tables that have OIDs, then - OIDS=TRUE is forced even if the command says - OIDS=FALSE.) - - - - If OIDS=FALSE is specified or implied, the new - table does not store OIDs and no OID will be assigned for a row inserted - into it. 
This is generally considered worthwhile, since it - will reduce OID consumption and thereby postpone the wraparound - of the 32-bit OID counter. Once the counter wraps around, OIDs - can no longer be assumed to be unique, which makes them - considerably less useful. In addition, excluding OIDs from a - table reduces the space required to store the table on disk by - 4 bytes per row (on most machines), slightly improving performance. - - - - To remove OIDs from a table after it has been created, use . - - - - - - WITH OIDS - WITHOUT OIDS - - - These are obsolescent syntaxes equivalent to WITH (OIDS) - and WITH (OIDS=FALSE), respectively. If you wish to give - both an OIDS setting and storage parameters, you must use - the WITH ( ... ) syntax; see above. - - - - In Postgres-XL, OID is managed locally in each - Datanode and Coordinator. The OID value may be inconsistent for - rows stored in different Datanodes. - - - - - - ON COMMIT - - - The behavior of temporary tables at the end of a transaction - block can be controlled using ON COMMIT. - The three options are: - - - - PRESERVE ROWS - - - No special action is taken at the ends of transactions. - This is the default behavior. - - - - - - DELETE ROWS - - - All rows in the temporary table will be deleted at the end - of each transaction block. Essentially, an automatic is done - at each commit. - - - - - - DROP - - - The temporary table will be dropped at the end of the current - transaction block. - - - - - - - - - TABLESPACE tablespace_name - - - The tablespace_name is the name - of the tablespace in which the new table is to be created. - If not specified, - is consulted, or - if the table is temporary. - - - - - - DISTRIBUTE BY - - - This clause specifies how the table is distributed or replicated among Datanodes. - - - - - - REPLICATION - - - Each row of the table will be replicated to all the - Datanode of the Postgres-XL database - cluster. - - - - - - ROUNDROBIN - - - Each row of the table will be placed in one of the Datanodes - in a round-robin manner. The value of the row will not be - needed to determine what Datanode to go. - - - - - - HASH ( column_name ) - - - Each row of the table will be placed based on the hash value - of the specified column. Following type is allowed as - distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR, - OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, NUMERIC, - MONEY, ABSTIME, RELTIME, DATE, TIME,TIMESTAMP, TIMESTAMPTZ, - INTERVAL, and TIMETZ. - - - Please note that floating point is not allowed as a basis of - the distribution column. - - - - - - MODULO ( column_name ) - - - Each row of the table will be placed based on the modulo - of the specified column. Following type is allowed as - distribution column: INT8, INT2, INT4, BOOL, ABSTIME, RELTIME, - DATE. - - - Please note that floating point is not allowed as a basis of - the distribution column. - - - - - - - If DISTRIBUTE BY is not specified, columns with - UNIQUE constraint will be chosen as the distribution key. If no - such column is specified, distribution column is the first - eligible column in the definition. If no such column is found, - then the table will be distributed by ROUNDROBIN. - - - - - - - TO GROUP - TO NODE - - - This defines on the list of nodes on which table data exists. - If this is not specified table data is present on all Datanodes. - - - - - - nodename - - - Associated with TO NODE, it defines a - Postgres-XL node of catalog pgxc_node. 
- - - - - - groupname - - - Associated with TO GROUP, it defines a - Postgres-XL node group in catalog pgxc_group. - - - - - - USING INDEX TABLESPACE tablespace_name - - - This clause allows selection of the tablespace in which the index - associated with a UNIQUE, PRIMARY - KEY, or EXCLUDE constraint will be created. - If not specified, - is consulted, or - if the table is temporary. - - - - - - - - Storage Parameters - - - storage parameters - - + + + + + CREATE TABLE + + + + CREATE TABLE + 7 + SQL - Language Statements + + + + CREATE TABLE + define a new table + + + + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name ( [ + { column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] + | table_constraint + | LIKE source_table [ like_option ... ] } + [, ... ] + ] ) + [ INHERITS ( parent_table [, ... ] ) ] + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name + OF type_name [ ( + { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] + | table_constraint } + [, ... ] + ) ] + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name + PARTITION OF parent_table [ ( + { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] + | table_constraint } + [, ... ] + ) ] { FOR VALUES partition_bound_spec | DEFAULT } + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + where column_constraint is: + + [ CONSTRAINT constraint_name ] + { NOT NULL | + NULL | + CHECK ( expression ) [ NO INHERIT ] | + DEFAULT default_expr | + GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] | + UNIQUE index_parameters | + PRIMARY KEY index_parameters | + REFERENCES reftable [ ( refcolumn ) ] [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] + [ ON DELETE action ] [ ON UPDATE action ] } + [ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] + + and table_constraint is: + + [ CONSTRAINT constraint_name ] + { CHECK ( expression ) [ NO INHERIT ] | + UNIQUE ( column_name [, ... ] ) index_parameters | + PRIMARY KEY ( column_name [, ... ] ) index_parameters | + EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] | + FOREIGN KEY ( column_name [, ... ] ) REFERENCES reftable [ ( refcolumn [, ... 
] ) ] + [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] [ ON DELETE action ] [ ON UPDATE action ] } + [ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] + + and like_option is: + + { INCLUDING | EXCLUDING } { DEFAULTS | CONSTRAINTS | IDENTITY | INDEXES | STORAGE | COMMENTS | ALL } + + and partition_bound_spec is: + + IN ( { numeric_literal | string_literal | NULL } [, ...] ) | + FROM ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) + TO ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) | + WITH ( MODULUS numeric_literal, REMAINDER numeric_literal ) + + index_parameters in UNIQUE, PRIMARY KEY, and EXCLUDE constraints are: + + [ WITH ( storage_parameter [= value] [, ... ] ) ] + [ USING INDEX TABLESPACE tablespace_name ] + + exclude_element in an EXCLUDE constraint is: + + { column_name | ( expression ) } [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] + + + + + + Description + - The WITH clause can specify storage parameters - for tables, and for indexes associated with a UNIQUE, - PRIMARY KEY, or EXCLUDE constraint. - Storage parameters for - indexes are documented in . - The storage parameters currently - available for tables are listed below. For many of these parameters, as - shown, there is an additional parameter with the same name prefixed with - toast., which controls the behavior of the - table's secondary TOAST table, if any - (see for more information about TOAST). - If a table parameter value is set and the - equivalent toast. parameter is not, the TOAST table - will use the table's parameter value. - Specifying these parameters for partitioned tables is not supported, - but you may specify them for individual leaf partitions. + CREATE TABLE will create a new, initially empty table + in the current database. The table will be owned by the user issuing the + command. - - - - - fillfactor (integer) - - - The fillfactor for a table is a percentage between 10 and 100. - 100 (complete packing) is the default. When a smaller fillfactor - is specified, INSERT operations pack table pages only - to the indicated percentage; the remaining space on each page is - reserved for updating rows on that page. This gives UPDATE - a chance to place the updated copy of a row on the same page as the - original, which is more efficient than placing it on a different page. - For a table whose entries are never updated, complete packing is the - best choice, but in heavily updated tables smaller fillfactors are - appropriate. This parameter cannot be set for TOAST tables. - - - - - - parallel_workers (integer) - - - This sets the number of workers that should be used to assist a parallel - scan of this table. If not set, the system will determine a value based - on the relation size. The actual number of workers chosen by the planner - may be less, for example due to - the setting of . - - - - - - autovacuum_enabled, toast.autovacuum_enabled (boolean) - - - Enables or disables the autovacuum daemon for a particular table. - If true, the autovacuum daemon will perform automatic VACUUM - and/or ANALYZE operations on this table following the rules - discussed in . - If false, this table will not be autovacuumed, except to prevent - transaction ID wraparound. See for - more about wraparound prevention. - Note that the autovacuum daemon does not run at all (except to prevent - transaction ID wraparound) if the - parameter is false; setting individual tables' storage parameters does - not override that. 
Therefore there is seldom much point in explicitly - setting this storage parameter to true, only - to false. - - - - - - autovacuum_vacuum_threshold, toast.autovacuum_vacuum_threshold (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_scale_factor, toast.autovacuum_vacuum_scale_factor (float4) - - - Per-table value for - parameter. - - - - - - autovacuum_analyze_threshold (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_analyze_scale_factor (float4) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_cost_delay, toast.autovacuum_vacuum_cost_delay (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_cost_limit, toast.autovacuum_vacuum_cost_limit (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_freeze_min_age, toast.autovacuum_freeze_min_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_freeze_min_age parameters that are - larger than half the - system-wide setting. - - - - - - autovacuum_freeze_max_age, toast.autovacuum_freeze_max_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_freeze_max_age parameters that are - larger than the system-wide setting (it can only be set smaller). - - - - - - autovacuum_freeze_table_age, toast.autovacuum_freeze_table_age (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_multixact_freeze_min_age, toast.autovacuum_multixact_freeze_min_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_multixact_freeze_min_age parameters - that are larger than half the - system-wide - setting. - - - - - - autovacuum_multixact_freeze_max_age, toast.autovacuum_multixact_freeze_max_age (integer) - - - Per-table value - for parameter. - Note that autovacuum will ignore - per-table autovacuum_multixact_freeze_max_age parameters - that are larger than the system-wide setting (it can only be set - smaller). - - - - - - autovacuum_multixact_freeze_table_age, toast.autovacuum_multixact_freeze_table_age (integer) - - - Per-table value - for parameter. - - - - - - log_autovacuum_min_duration, toast.log_autovacuum_min_duration (integer) - - - Per-table value for - parameter. - - - - - - user_catalog_table (boolean) - - - Declare the table as an additional catalog table for purposes of - logical replication. See - for details. - This parameter cannot be set for TOAST tables. - - - - - - - - - - - Notes - - - Using OIDs in new applications is not recommended: where - possible, using an identity column or other sequence - generator as the table's primary key is preferred. However, if - your application does make use of OIDs to identify specific - rows of a table, it is recommended to create a unique constraint - on the oid column of that table, to ensure that - OIDs in the table will indeed uniquely identify rows even after - counter wraparound. Avoid assuming that OIDs are unique across - tables; if you need a database-wide unique identifier, use the - combination of tableoid and row OID for the - purpose. - - - - - The use of OIDS=FALSE is not recommended - for tables with no primary key, since without either an OID or a - unique data key, it is difficult to identify specific rows. - - - - - PostgreSQL automatically creates an - index for each unique constraint and primary key constraint to - enforce uniqueness. 
Thus, it is not necessary to create an - index explicitly for primary key columns. (See for more information.) - - - - Unique constraints and primary keys are not inherited in the - current implementation. This makes the combination of - inheritance and unique constraints rather dysfunctional. - - - - A table cannot have more than 1600 columns. (In practice, the - effective limit is usually lower because of tuple-length constraints.) - - - - - - - Examples - - - Create table films and table - distributors: - - -CREATE TABLE films ( - code char(5) CONSTRAINT firstkey PRIMARY KEY, - title varchar(40) NOT NULL, - did integer NOT NULL, - date_prod date, - kind varchar(10), - len interval hour to minute -); - -CREATE TABLE distributors ( - did integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, - name varchar(40) NOT NULL CHECK (name <> '') -); - - - - - Create a table with a 2-dimensional array: - - -CREATE TABLE array_int ( - vector int[][] -); - - - - - Define a unique table constraint for the table - films. Unique table constraints can be defined - on one or more columns of the table: - - -CREATE TABLE films ( - code char(5), - title varchar(40), - did integer, - date_prod date, - kind varchar(10), - len interval hour to minute, - CONSTRAINT production UNIQUE(date_prod) -); - - - - - Define a check column constraint: - - -CREATE TABLE distributors ( - did integer CHECK (did > 100), - name varchar(40) -); - - - - - Define a check table constraint: - - -CREATE TABLE distributors ( - did integer, - name varchar(40) - CONSTRAINT con1 CHECK (did > 100 AND name <> '') -); - - - - - Define a primary key table constraint for the table - films: - - -CREATE TABLE films ( - code char(5), - title varchar(40), - did integer, - date_prod date, - kind varchar(10), - len interval hour to minute, - CONSTRAINT code_title PRIMARY KEY(code,title) -); - - - - - Define a primary key constraint for table - distributors. 
The following two examples are - equivalent, the first using the table constraint syntax, the second - the column constraint syntax: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - PRIMARY KEY(did) -); - -CREATE TABLE distributors ( - did integer PRIMARY KEY, - name varchar(40) -); - - - - - Assign a literal constant default value for the column - name, arrange for the default value of column - did to be generated by selecting the next value - of a sequence object, and make the default value of - modtime be the time at which the row is - inserted: - - -CREATE TABLE distributors ( - name varchar(40) DEFAULT 'Luso Films', - did integer DEFAULT nextval('distributors_serial'), - modtime timestamp DEFAULT current_timestamp -); - - - - - Define two NOT NULL column constraints on the table - distributors, one of which is explicitly - given a name: - - -CREATE TABLE distributors ( - did integer CONSTRAINT no_null NOT NULL, - name varchar(40) NOT NULL -); - - - - - Define a unique constraint for the name column: - - -CREATE TABLE distributors ( - did integer, - name varchar(40) UNIQUE -); - - - The same, specified as a table constraint: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - UNIQUE(name) -); - - - - - Create the same table, specifying 70% fill factor for both the table - and its unique index: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - UNIQUE(name) WITH (fillfactor=70) -) -WITH (fillfactor=70); - - - - - Create table circles with an exclusion - constraint that prevents any two circles from overlapping: - - -CREATE TABLE circles ( - c circle, - EXCLUDE USING gist (c WITH &&) -); - - - - - Create table cinemas in tablespace diskvol1: - - -CREATE TABLE cinemas ( - id integer, - name text, - location text -) TABLESPACE diskvol1; - - - - - Create a composite type and a typed table: - -CREATE TYPE employee_type AS (name text, salary numeric); - -CREATE TABLE employees OF employee_type ( - PRIMARY KEY (name), - salary WITH OPTIONS DEFAULT 1000 -); - - - - Create a range partitioned table: - -CREATE TABLE measurement ( - logdate date not null, - peaktemp int, - unitsales int -) PARTITION BY RANGE (logdate); - - - - Create a range partitioned table with multiple columns in the partition key: - -CREATE TABLE measurement_year_month ( - logdate date not null, - peaktemp int, - unitsales int -) PARTITION BY RANGE (EXTRACT(YEAR FROM logdate), EXTRACT(MONTH FROM logdate)); - - - - Create a list partitioned table: - -CREATE TABLE cities ( - city_id bigserial not null, - name text not null, - population bigint -) PARTITION BY LIST (left(lower(name), 1)); - - - - Create partition of a range partitioned table: - -CREATE TABLE measurement_y2016m07 - PARTITION OF measurement ( - unitsales DEFAULT 0 -) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); - - - - Create a few partitions of a range partitioned table with multiple - columns in the partition key: - -CREATE TABLE measurement_ym_older - PARTITION OF measurement_year_month - FOR VALUES FROM (MINVALUE, 0) TO (2016, 11); - -CREATE TABLE measurement_ym_y2016m11 - PARTITION OF measurement_year_month - FOR VALUES FROM (2016, 11) TO (2016, 12); - -CREATE TABLE measurement_ym_y2016m12 - PARTITION OF measurement_year_month - FOR VALUES FROM (2016, 12) TO (2017, 01); - -CREATE TABLE measurement_ym_y2017m01 - PARTITION OF measurement_year_month - FOR VALUES FROM (2017, 01) TO (2017, 02); - - - - Create partition of a list partitioned table: - -CREATE TABLE cities_ab - PARTITION OF cities ( - 
CONSTRAINT city_id_nonzero CHECK (city_id != 0) -) FOR VALUES IN ('a', 'b'); - - - - Create partition of a list partitioned table that is itself further - partitioned and then add a partition to it: - -CREATE TABLE cities_ab - PARTITION OF cities ( - CONSTRAINT city_id_nonzero CHECK (city_id != 0) -) FOR VALUES IN ('a', 'b') PARTITION BY RANGE (population); - -CREATE TABLE cities_ab_10000_to_100000 - PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); - - - - Create a default partition: - -CREATE TABLE cities_partdef - PARTITION OF cities DEFAULT; - - - - - Compatibility - - - The CREATE TABLE command conforms to the - SQL standard, with exceptions listed below. - - - - Temporary Tables - + - Although the syntax of CREATE TEMPORARY TABLE - resembles that of the SQL standard, the effect is not the same. In the - standard, - temporary tables are defined just once and automatically exist (starting - with empty contents) in every session that needs them. - PostgreSQL instead - requires each session to issue its own CREATE TEMPORARY - TABLE command for each temporary table to be used. This allows - different sessions to use the same temporary table name for different - purposes, whereas the standard's approach constrains all instances of a - given temporary table name to have the same table structure. + If a schema name is given (for example, CREATE TABLE + myschema.mytable ...) then the table is created in the specified + schema. Otherwise it is created in the current schema. Temporary + tables exist in a special schema, so a schema name cannot be given + when creating a temporary table. The name of the table must be + distinct from the name of any other table, sequence, index, view, + or foreign table in the same schema. - + - The standard's definition of the behavior of temporary tables is - widely ignored. PostgreSQL's behavior - on this point is similar to that of several other SQL databases. + CREATE TABLE also automatically creates a data + type that represents the composite type corresponding + to one row of the table. Therefore, tables cannot have the same + name as any existing data type in the same schema. - + - The SQL standard also distinguishes between global and local temporary - tables, where a local temporary table has a separate set of contents for - each SQL module within each session, though its definition is still shared - across sessions. Since PostgreSQL does not - support SQL modules, this distinction is not relevant in - PostgreSQL. + The optional constraint clauses specify constraints (tests) that + new or updated rows must satisfy for an insert or update operation + to succeed. A constraint is an SQL object that helps define the + set of valid values in the table in various ways. - + - For compatibility's sake, PostgreSQL will - accept the GLOBAL and LOCAL keywords - in a temporary table declaration, but they currently have no effect. - Use of these keywords is discouraged, since future versions of - PostgreSQL might adopt a more - standard-compliant interpretation of their meaning. + There are two ways to define constraints: table constraints and + column constraints. A column constraint is defined as part of a + column definition. A table constraint definition is not tied to a + particular column, and it can encompass more than one column. + Every column constraint can also be written as a table constraint; + a column constraint is only a notational convenience for use when the + constraint only affects one column. 
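As a minimal illustration of the two constraint forms described above (the table and column names here are hypothetical, not taken from this patch):

CREATE TABLE parts (
    part_no    integer CHECK (part_no > 0),      -- column constraint: refers only to its own column
    name       text NOT NULL,                    -- column constraint
    weight_kg  numeric,
    CHECK (weight_kg IS NULL OR weight_kg >= 0)  -- table constraint: may reference any column(s)
);

Either form is enforced identically; the column form is simply a shorthand for the case where only one column is involved.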
- + - The ON COMMIT clause for temporary tables - also resembles the SQL standard, but has some differences. - If the ON COMMIT clause is omitted, SQL specifies that the - default behavior is ON COMMIT DELETE ROWS. However, the - default behavior in PostgreSQL is - ON COMMIT PRESERVE ROWS. The ON COMMIT - DROP option does not exist in SQL. + To be able to create a table, you must have USAGE + privilege on all column types or the type in the OF + clause, respectively. - - - - Non-deferred Uniqueness Constraints - + + + + Parameters + + + + + TEMPORARY or TEMP + + + If specified, the table is created as a temporary table. + Temporary tables are automatically dropped at the end of a + session, or optionally at the end of the current transaction + (see ON COMMIT below). Existing permanent + tables with the same name are not visible to the current session + while the temporary table exists, unless they are referenced + with schema-qualified names. Any indexes created on a temporary + table are automatically temporary as well. + + + + The autovacuum daemon cannot + access and therefore cannot vacuum or analyze temporary tables. + For this reason, appropriate vacuum and analyze operations should be + performed via session SQL commands. For example, if a temporary + table is going to be used in complex queries, it is wise to run + ANALYZE on the temporary table after it is populated. + + + + Optionally, GLOBAL or LOCAL + can be written before TEMPORARY or TEMP. + This presently makes no difference in PostgreSQL + and is deprecated; see + . + + + + + + UNLOGGED + + + If specified, the table is created as an unlogged table. Data written + to unlogged tables is not written to the write-ahead log (see ), which makes them considerably faster than ordinary + tables. However, they are not crash-safe: an unlogged table is + automatically truncated after a crash or unclean shutdown. The contents + of an unlogged table are also not replicated to standby servers. + Any indexes created on an unlogged table are automatically unlogged as + well. + + + + + + IF NOT EXISTS + + + Do not throw an error if a relation with the same name already exists. + A notice is issued in this case. Note that there is no guarantee that + the existing relation is anything like the one that would have been + created. + + + + + + table_name + + + The name (optionally schema-qualified) of the table to be created. + + + + + + OF type_name + + + Creates a typed table, which takes its + structure from the specified composite type (name optionally + schema-qualified). A typed table is tied to its type; for + example the table will be dropped if the type is dropped + (with DROP TYPE ... CASCADE). + + + + When a typed table is created, then the data types of the + columns are determined by the underlying composite type and are + not specified by the CREATE TABLE command. + But the CREATE TABLE command can add defaults + and constraints to the table and can specify storage parameters. + + + + + + PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } + + + Creates the table as a partition of the specified + parent table. The table can be created either as a partition for specific + values using FOR VALUES or as a default partition + using DEFAULT. This option is not available for + hash-partitioned tables. + + + + The partition_bound_spec + must correspond to the partitioning method and partition key of the + parent table, and must not overlap with any existing partition of that + parent. 
The form with IN is used for list partitioning, + the form with FROM and TO is used + for range partitioning, and the form with WITH is used + for hash partitioning. + + + + Each of the values specified in + the partition_bound_spec is + a literal, NULL, MINVALUE, or + MAXVALUE. Each literal value must be either a + numeric constant that is coercible to the corresponding partition key + column's type, or a string literal that is valid input for that type. + + + + When creating a list partition, NULL can be + specified to signify that the partition allows the partition key + column to be null. However, there cannot be more than one such + list partition for a given parent table. NULL + cannot be specified for range partitions. + + + + When creating a range partition, the lower bound specified with + FROM is an inclusive bound, whereas the upper + bound specified with TO is an exclusive bound. + That is, the values specified in the FROM list + are valid values of the corresponding partition key columns for this + partition, whereas those in the TO list are + not. Note that this statement must be understood according to the + rules of row-wise comparison (). + For example, given PARTITION BY RANGE (x,y), a partition + bound FROM (1, 2) TO (3, 4) + allows x=1 with any y>=2, + x=2 with any non-null y, + and x=3 with any y<4. + + + + The special values MINVALUE and MAXVALUE + may be used when creating a range partition to indicate that there + is no lower or upper bound on the column's value. For example, a + partition defined using FROM (MINVALUE) TO (10) allows + any values less than 10, and a partition defined using + FROM (10) TO (MAXVALUE) allows any values greater than + or equal to 10. + + + + When creating a range partition involving more than one column, it + can also make sense to use MAXVALUE as part of the lower + bound, and MINVALUE as part of the upper bound. For + example, a partition defined using + FROM (0, MAXVALUE) TO (10, MAXVALUE) allows any rows + where the first partition key column is greater than 0 and less than + or equal to 10. Similarly, a partition defined using + FROM ('a', MINVALUE) TO ('b', MINVALUE) allows any rows + where the first partition key column starts with "a". + + + + Note that if MINVALUE or MAXVALUE is used for + one column of a partitioning bound, the same value must be used for all + subsequent columns. For example, (10, MINVALUE, 0) is not + a valid bound; you should write (10, MINVALUE, MINVALUE). + + + + Also note that some element types, such as timestamp, + have a notion of "infinity", which is just another value that can + be stored. This is different from MINVALUE and + MAXVALUE, which are not real values that can be stored, + but rather they are ways of saying that the value is unbounded. + MAXVALUE can be thought of as being greater than any + other value, including "infinity" and MINVALUE as being + less than any other value, including "minus infinity". Thus the range + FROM ('infinity') TO (MAXVALUE) is not an empty range; it + allows precisely one value to be stored — "infinity". + + + + If DEFAULT is specified, the table will be + created as a default partition of the parent table. The parent can + either be a list or range partitioned table. A partition key value + not fitting into any other partition of the given parent will be + routed to the default partition. There can be only one default + partition for a given parent table. 
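A small sketch of the bound rules just described, assuming the declarative-partitioning behavior documented here (all object names are illustrative):

CREATE TABLE readings (
    reading_time  timestamp NOT NULL,
    sensor_id     integer,
    value         numeric
) PARTITION BY RANGE (reading_time);

-- FROM is an inclusive bound, TO is an exclusive bound
CREATE TABLE readings_2020
    PARTITION OF readings
    FOR VALUES FROM ('2020-01-01') TO ('2021-01-01');

-- MINVALUE leaves the lower end of the range open
CREATE TABLE readings_old
    PARTITION OF readings
    FOR VALUES FROM (MINVALUE) TO ('2020-01-01');

-- rows that match no other partition are routed to the default partition
CREATE TABLE readings_default
    PARTITION OF readings DEFAULT;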
+ + + + When a table has an existing DEFAULT partition and + a new partition is added to it, the existing default partition must + be scanned to verify that it does not contain any rows which properly + belong in the new partition. If the default partition contains a + large number of rows, this may be slow. The scan will be skipped if + the default partition is a foreign table or if it has a constraint which + proves that it cannot contain rows which should be placed in the new + partition. + + + + When creating a hash partition, a modulus and remainder must be specified. + The modulus must be a positive integer, and the remainder must be a + non-negative integer less than the modulus. Typically, when initially + setting up a hash-partitioned table, you should choose a modulus equal to + the number of partitions and assign every table the same modulus and a + different remainder (see examples, below). However, it is not required + that every partition have the same modulus, only that every modulus which + occurs among the partitions of a hash-partitioned table is a factor of the + next larger modulus. This allows the number of partitions to be increased + incrementally without needing to move all the data at once. For example, + suppose you have a hash-partitioned table with 8 partitions, each of which + has modulus 8, but find it necessary to increase the number of partitions + to 16. You can detach one of the modulus-8 partitions, create two new + modulus-16 partitions covering the same portion of the key space (one with + a remainder equal to the remainder of the detached partition, and the + other with a remainder equal to that value plus 8), and repopulate them + with data. You can then repeat this -- perhaps at a later time -- for + each modulus-8 partition until none remain. While this may still involve + a large amount of data movement at each step, it is still better than + having to create a whole new table and move all the data at once. + + + + A partition must have the same column names and types as the partitioned + table to which it belongs. If the parent is specified WITH + OIDS then all partitions must have OIDs; the parent's OID + column will be inherited by all partitions just like any other column. + Modifications to the column names or types of a partitioned table, or + the addition or removal of an OID column, will automatically propagate + to all partitions. CHECK constraints will be inherited + automatically by every partition, but an individual partition may specify + additional CHECK constraints; additional constraints with + the same name and condition as in the parent will be merged with the + parent constraint. Defaults may be specified separately for each + partition. + + + + Rows inserted into a partitioned table will be automatically routed to + the correct partition. If no suitable partition exists, an error will + occur. Also, if updating a row in a given partition would require it + to move to another partition due to new partition key values, an error + will occur. + + + + Operations such as TRUNCATE which normally affect a table and all of its + inheritance children will cascade to all partitions, but may also be + performed on an individual partition. Note that dropping a partition + with DROP TABLE requires taking an ACCESS + EXCLUSIVE lock on the parent table. + + + + + + column_name + + + The name of a column to be created in the new table. + + + + + + data_type + + + The data type of the column. This can include array + specifiers. 
For more information on the data types supported by + PostgreSQL, refer to . + + + + + + COLLATE collation + + + The COLLATE clause assigns a collation to + the column (which must be of a collatable data type). + If not specified, the column data type's default collation is used. + + + + + + INHERITS ( parent_table [, ... ] ) + + + The optional INHERITS clause specifies a list of + tables from which the new table automatically inherits all + columns. Parent tables can be plain tables or foreign tables. + + + + Use of INHERITS creates a persistent relationship + between the new child table and its parent table(s). Schema + modifications to the parent(s) normally propagate to children + as well, and by default the data of the child table is included in + scans of the parent(s). + + + + If the same column name exists in more than one parent + table, an error is reported unless the data types of the columns + match in each of the parent tables. If there is no conflict, + then the duplicate columns are merged to form a single column in + the new table. If the column name list of the new table + contains a column name that is also inherited, the data type must + likewise match the inherited column(s), and the column + definitions are merged into one. If the + new table explicitly specifies a default value for the column, + this default overrides any defaults from inherited declarations + of the column. Otherwise, any parents that specify default + values for the column must all specify the same default, or an + error will be reported. + + + + CHECK constraints are merged in essentially the same way as + columns: if multiple parent tables and/or the new table definition + contain identically-named CHECK constraints, these + constraints must all have the same check expression, or an error will be + reported. Constraints having the same name and expression will + be merged into one copy. A constraint marked NO INHERIT in a + parent will not be considered. Notice that an unnamed CHECK + constraint in the new table will never be merged, since a unique name + will always be chosen for it. + + + + Column STORAGE settings are also copied from parent tables. + + + + If a column in the parent table is an identity column, that property is + not inherited. A column in the child table can be declared identity + column if desired. + + + + + + PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ opclass ] [, ...] ) + + + The optional PARTITION BY clause specifies a strategy + of partitioning the table. The table thus created is called a + partitioned table. The parenthesized list of + columns or expressions forms the partition key + for the table. When using range or hash partitioning, the partition key + can include multiple columns or expressions (up to 32, but this limit can + be altered when building PostgreSQL), but for + list partitioning, the partition key must consist of a single column or + expression. + + + + Range and list partitioning require a btree operator class, while hash + partitioning requires a hash operator class. If no operator class is + specified explicitly, the default operator class of the appropriate + type will be used; if no default operator class exists, an error will + be raised. When hash partitioning is used, the operator class used + must implement support function 2 (see + for details). + + + + A partitioned table is divided into sub-tables (called partitions), + which are created using separate CREATE TABLE commands. 
+ The partitioned table is itself empty. A data row inserted into the + table is routed to a partition based on the value of columns or + expressions in the partition key. If no existing partition matches + the values in the new row, an error will be reported. + + + + Partitioned tables do not support UNIQUE, + PRIMARY KEY, EXCLUDE, or + FOREIGN KEY constraints; however, you can define + these constraints on individual partitions. + + + + + + + LIKE source_table [ like_option ... ] + + + The LIKE clause specifies a table from which + the new table automatically copies all column names, their data types, + and their not-null constraints. + + + Unlike INHERITS, the new table and original table + are completely decoupled after creation is complete. Changes to the + original table will not be applied to the new table, and it is not + possible to include data of the new table in scans of the original + table. + + + Default expressions for the copied column definitions will be copied + only if INCLUDING DEFAULTS is specified. The + default behavior is to exclude default expressions, resulting in the + copied columns in the new table having null defaults. + Note that copying defaults that call database-modification functions, + such as nextval, may create a functional linkage between + the original and new tables. + + + Any identity specifications of copied column definitions will only be + copied if INCLUDING IDENTITY is specified. A new + sequence is created for each identity column of the new table, separate + from the sequences associated with the old table. + + + Not-null constraints are always copied to the new table. + CHECK constraints will be copied only if + INCLUDING CONSTRAINTS is specified. + No distinction is made between column constraints and table + constraints. + + + Indexes, PRIMARY KEY, UNIQUE, + and EXCLUDE constraints on the original table will be + created on the new table only if INCLUDING INDEXES + is specified. Names for the new indexes and constraints are + chosen according to the default rules, regardless of how the originals + were named. (This behavior avoids possible duplicate-name failures for + the new indexes.) + + + STORAGE settings for the copied column definitions will be + copied only if INCLUDING STORAGE is specified. The + default behavior is to exclude STORAGE settings, resulting + in the copied columns in the new table having type-specific default + settings. For more on STORAGE settings, see + . + + + Comments for the copied columns, constraints, and indexes + will be copied only if INCLUDING COMMENTS + is specified. The default behavior is to exclude comments, resulting in + the copied columns and constraints in the new table having no comments. + + + INCLUDING ALL is an abbreviated form of + INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING STORAGE INCLUDING COMMENTS. + + + Note that unlike INHERITS, columns and + constraints copied by LIKE are not merged with similarly + named columns and constraints. + If the same name is specified explicitly or in another + LIKE clause, an error is signaled. + + + The LIKE clause can also be used to copy column + definitions from views, foreign tables, or composite types. + Inapplicable options (e.g., INCLUDING INDEXES from + a view) are ignored. + + + + + + CONSTRAINT constraint_name + + + An optional name for a column or table constraint. 
If the + constraint is violated, the constraint name is present in error messages, + so constraint names like col must be positive can be used + to communicate helpful constraint information to client applications. + (Double-quotes are needed to specify constraint names that contain spaces.) + If a constraint name is not specified, the system generates a name. + + + + + + NOT NULL + + + The column is not allowed to contain null values. + + + + + + NULL + + + The column is allowed to contain null values. This is the default. + + + + This clause is only provided for compatibility with + non-standard SQL databases. Its use is discouraged in new + applications. + + + + + + CHECK ( expression ) [ NO INHERIT ] + + + The CHECK clause specifies an expression producing a + Boolean result which new or updated rows must satisfy for an + insert or update operation to succeed. Expressions evaluating + to TRUE or UNKNOWN succeed. Should any row of an insert or + update operation produce a FALSE result, an error exception is + raised and the insert or update does not alter the database. A + check constraint specified as a column constraint should + reference that column's value only, while an expression + appearing in a table constraint can reference multiple columns. + + + + Currently, CHECK expressions cannot contain + subqueries nor refer to variables other than columns of the + current row. The system column tableoid + may be referenced, but not any other system column. + + + + A constraint marked with NO INHERIT will not propagate to + child tables. + + + + When a table has multiple CHECK constraints, + they will be tested for each row in alphabetical order by name, + after checking NOT NULL constraints. + (PostgreSQL versions before 9.5 did not honor any + particular firing order for CHECK constraints.) + + + + + + DEFAULT + default_expr + + + The DEFAULT clause assigns a default data value for + the column whose column definition it appears within. The value + is any variable-free expression (subqueries and cross-references + to other columns in the current table are not allowed). The + data type of the default expression must match the data type of the + column. + + + + The default expression will be used in any insert operation that + does not specify a value for the column. If there is no default + for a column, then the default is null. + + + + + + GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] + + + This clause creates the column as an identity + column. It will have an implicit sequence attached to it + and the column in new rows will automatically have values from the + sequence assigned to it. + + + + The clauses ALWAYS and BY DEFAULT + determine how the sequence value is given precedence over a + user-specified value in an INSERT statement. + If ALWAYS is specified, a user-specified value is + only accepted if the INSERT statement + specifies OVERRIDING SYSTEM VALUE. If BY + DEFAULT is specified, then the user-specified value takes + precedence. See for details. (In + the COPY command, user-specified values are always + used regardless of this setting.) + + + + The optional sequence_options clause can be + used to override the options of the sequence. + See for details. + + + + + + UNIQUE (column constraint) + UNIQUE ( column_name [, ... ] ) (table constraint) + + + + The UNIQUE constraint specifies that a + group of one or more columns of a table can contain + only unique values. 
The behavior of the unique table constraint + is the same as that for column constraints, with the additional + capability to span multiple columns. + + + + For the purpose of a unique constraint, null values are not + considered equal. + + + + Each unique table constraint must name a set of columns that is + different from the set of columns named by any other unique or + primary key constraint defined for the table. (Otherwise it + would just be the same constraint listed twice.) + + + + + + PRIMARY KEY (column constraint) + PRIMARY KEY ( column_name [, ... ] ) (table constraint) + + + The PRIMARY KEY constraint specifies that a column or + columns of a table can contain only unique (non-duplicate), nonnull + values. Only one primary key can be specified for a table, whether as a + column constraint or a table constraint. + + + + The primary key constraint should name a set of columns that is + different from the set of columns named by any unique + constraint defined for the same table. (Otherwise, the unique + constraint is redundant and will be discarded.) + + + + PRIMARY KEY enforces the same data constraints as + a combination of UNIQUE and NOT NULL, but + identifying a set of columns as the primary key also provides metadata + about the design of the schema, since a primary key implies that other + tables can rely on this set of columns as a unique identifier for rows. + + + + + + EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] + + + The EXCLUDE clause defines an exclusion + constraint, which guarantees that if + any two rows are compared on the specified column(s) or + expression(s) using the specified operator(s), not all of these + comparisons will return TRUE. If all of the + specified operators test for equality, this is equivalent to a + UNIQUE constraint, although an ordinary unique constraint + will be faster. However, exclusion constraints can specify + constraints that are more general than simple equality. + For example, you can specify a constraint that + no two rows in the table contain overlapping circles + (see ) by using the + && operator. + + + + Exclusion constraints are implemented using + an index, so each specified operator must be associated with an + appropriate operator class + (see ) for the index access + method index_method. + The operators are required to be commutative. + Each exclude_element + can optionally specify an operator class and/or ordering options; + these are described fully under + . + + + + The access method must support amgettuple (see ); at present this means GIN + cannot be used. Although it's allowed, there is little point in using + B-tree or hash indexes with an exclusion constraint, because this + does nothing that an ordinary unique constraint doesn't do better. + So in practice the access method will always be GiST or + SP-GiST. + + + + The predicate allows you to specify an + exclusion constraint on a subset of the table; internally this creates a + partial index. Note that parentheses are required around the predicate. + + + + + + REFERENCES reftable [ ( refcolumn ) ] [ MATCH matchtype ] [ ON DELETE action ] [ ON UPDATE action ] (column constraint) + + FOREIGN KEY ( column_name [, ... ] ) + REFERENCES reftable [ ( refcolumn [, ... 
] ) ] + [ MATCH matchtype ] + [ ON DELETE action ] + [ ON UPDATE action ] + (table constraint) + + + + These clauses specify a foreign key constraint, which requires + that a group of one or more columns of the new table must only + contain values that match values in the referenced + column(s) of some row of the referenced table. If the refcolumn list is omitted, the + primary key of the reftable + is used. The referenced columns must be the columns of a non-deferrable + unique or primary key constraint in the referenced table. The user + must have REFERENCES permission on the referenced table + (either the whole table, or the specific referenced columns). + Note that foreign key constraints cannot be defined between temporary + tables and permanent tables. + + + + A value inserted into the referencing column(s) is matched against the + values of the referenced table and referenced columns using the + given match type. There are three match types: MATCH + FULL, MATCH PARTIAL, and MATCH + SIMPLE (which is the default). MATCH + FULL will not allow one column of a multicolumn foreign key + to be null unless all foreign key columns are null; if they are all + null, the row is not required to have a match in the referenced table. + MATCH SIMPLE allows any of the foreign key columns + to be null; if any of them are null, the row is not required to have a + match in the referenced table. + MATCH PARTIAL is not yet implemented. + (Of course, NOT NULL constraints can be applied to the + referencing column(s) to prevent these cases from arising.) + + + + In addition, when the data in the referenced columns is changed, + certain actions are performed on the data in this table's + columns. The ON DELETE clause specifies the + action to perform when a referenced row in the referenced table is + being deleted. Likewise, the ON UPDATE + clause specifies the action to perform when a referenced column + in the referenced table is being updated to a new value. If the + row is updated, but the referenced column is not actually + changed, no action is done. Referential actions other than the + NO ACTION check cannot be deferred, even if + the constraint is declared deferrable. There are the following possible + actions for each clause: + + + + NO ACTION + + + Produce an error indicating that the deletion or update + would create a foreign key constraint violation. + If the constraint is deferred, this + error will be produced at constraint check time if there still + exist any referencing rows. This is the default action. + + + + + + RESTRICT + + + Produce an error indicating that the deletion or update + would create a foreign key constraint violation. + This is the same as NO ACTION except that + the check is not deferrable. + + + + + + CASCADE + + + Delete any rows referencing the deleted row, or update the + values of the referencing column(s) to the new values of the + referenced columns, respectively. + + + + + + SET NULL + + + Set the referencing column(s) to null. + + + + + + SET DEFAULT + + + Set the referencing column(s) to their default values. + (There must be a row in the referenced table matching the default + values, if they are not null, or the operation will fail.) + + + + + + + + If the referenced column(s) are changed frequently, it might be wise to + add an index to the referencing column(s) so that referential actions + associated with the foreign key constraint can be performed more + efficiently. 
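A brief sketch of a foreign key with referential actions, together with the index on the referencing column suggested above (names are illustrative only):

CREATE TABLE customers (
    customer_id  integer PRIMARY KEY,
    name         text NOT NULL
);

CREATE TABLE orders (
    order_id     integer PRIMARY KEY,
    customer_id  integer REFERENCES customers (customer_id)
                         ON DELETE CASCADE      -- deleting a customer removes its orders
                         ON UPDATE NO ACTION,
    total        numeric CHECK (total >= 0)
);

-- an index on the referencing column makes the cascaded delete/update lookups cheap
CREATE INDEX orders_customer_id_idx ON orders (customer_id);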
+ + + + + + DEFERRABLE + NOT DEFERRABLE + + + This controls whether the constraint can be deferred. A + constraint that is not deferrable will be checked immediately + after every command. Checking of constraints that are + deferrable can be postponed until the end of the transaction + (using the command). + NOT DEFERRABLE is the default. + Currently, only UNIQUE, PRIMARY KEY, + EXCLUDE, and + REFERENCES (foreign key) constraints accept this + clause. NOT NULL and CHECK constraints are not + deferrable. Note that deferrable constraints cannot be used as + conflict arbitrators in an INSERT statement that + includes an ON CONFLICT DO UPDATE clause. + + + + + + INITIALLY IMMEDIATE + INITIALLY DEFERRED + + + If a constraint is deferrable, this clause specifies the default + time to check the constraint. If the constraint is + INITIALLY IMMEDIATE, it is checked after each + statement. This is the default. If the constraint is + INITIALLY DEFERRED, it is checked only at the + end of the transaction. The constraint check time can be + altered with the command. + + + + + + WITH ( storage_parameter [= value] [, ... ] ) + + + This clause specifies optional storage parameters for a table or index; + see for more + information. The WITH clause for a + table can also include OIDS=TRUE (or just OIDS) + to specify that rows of the new table + should have OIDs (object identifiers) assigned to them, or + OIDS=FALSE to specify that the rows should not have OIDs. + If OIDS is not specified, the default setting depends upon + the configuration parameter. + (If the new table inherits from any tables that have OIDs, then + OIDS=TRUE is forced even if the command says + OIDS=FALSE.) + + + + If OIDS=FALSE is specified or implied, the new + table does not store OIDs and no OID will be assigned for a row inserted + into it. This is generally considered worthwhile, since it + will reduce OID consumption and thereby postpone the wraparound + of the 32-bit OID counter. Once the counter wraps around, OIDs + can no longer be assumed to be unique, which makes them + considerably less useful. In addition, excluding OIDs from a + table reduces the space required to store the table on disk by + 4 bytes per row (on most machines), slightly improving performance. + + + + To remove OIDs from a table after it has been created, use . + + + + + + WITH OIDS + WITHOUT OIDS + + + These are obsolescent syntaxes equivalent to WITH (OIDS) + and WITH (OIDS=FALSE), respectively. If you wish to give + both an OIDS setting and storage parameters, you must use + the WITH ( ... ) syntax; see above. + + + + + + ON COMMIT + + + The behavior of temporary tables at the end of a transaction + block can be controlled using ON COMMIT. + The three options are: + + + + PRESERVE ROWS + + + No special action is taken at the ends of transactions. + This is the default behavior. + + + + + + DELETE ROWS + + + All rows in the temporary table will be deleted at the end + of each transaction block. Essentially, an automatic is done + at each commit. + + + + + + DROP + + + The temporary table will be dropped at the end of the current + transaction block. + + + + + + + + + TABLESPACE tablespace_name + + + The tablespace_name is the name + of the tablespace in which the new table is to be created. + If not specified, + is consulted, or + if the table is temporary. 
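For illustration of the ON COMMIT and TABLESPACE clauses covered above (the tablespace name is hypothetical and must already exist):

-- a temporary scratch table whose rows are cleared at every commit
CREATE TEMPORARY TABLE session_scratch (
    key    text,
    value  text
) ON COMMIT DELETE ROWS;

-- place a table in a specific tablespace instead of the default one
CREATE TABLE archive_data (
    id       bigint,
    payload  text
) TABLESPACE archive_space;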
+ + + + + + USING INDEX TABLESPACE tablespace_name + + + This clause allows selection of the tablespace in which the index + associated with a UNIQUE, PRIMARY + KEY, or EXCLUDE constraint will be created. + If not specified, + is consulted, or + if the table is temporary. + + + + + + + + Storage Parameters + + + storage parameters + + + + The WITH clause can specify storage parameters + for tables, and for indexes associated with a UNIQUE, + PRIMARY KEY, or EXCLUDE constraint. + Storage parameters for + indexes are documented in . + The storage parameters currently + available for tables are listed below. For many of these parameters, as + shown, there is an additional parameter with the same name prefixed with + toast., which controls the behavior of the + table's secondary TOAST table, if any + (see for more information about TOAST). + If a table parameter value is set and the + equivalent toast. parameter is not, the TOAST table + will use the table's parameter value. + Specifying these parameters for partitioned tables is not supported, + but you may specify them for individual leaf partitions. + + + + + + fillfactor (integer) + + + The fillfactor for a table is a percentage between 10 and 100. + 100 (complete packing) is the default. When a smaller fillfactor + is specified, INSERT operations pack table pages only + to the indicated percentage; the remaining space on each page is + reserved for updating rows on that page. This gives UPDATE + a chance to place the updated copy of a row on the same page as the + original, which is more efficient than placing it on a different page. + For a table whose entries are never updated, complete packing is the + best choice, but in heavily updated tables smaller fillfactors are + appropriate. This parameter cannot be set for TOAST tables. + + + + + + parallel_workers (integer) + + + This sets the number of workers that should be used to assist a parallel + scan of this table. If not set, the system will determine a value based + on the relation size. The actual number of workers chosen by the planner + may be less, for example due to + the setting of . + + + + + + autovacuum_enabled, toast.autovacuum_enabled (boolean) + + + Enables or disables the autovacuum daemon for a particular table. + If true, the autovacuum daemon will perform automatic VACUUM + and/or ANALYZE operations on this table following the rules + discussed in . + If false, this table will not be autovacuumed, except to prevent + transaction ID wraparound. See for + more about wraparound prevention. + Note that the autovacuum daemon does not run at all (except to prevent + transaction ID wraparound) if the + parameter is false; setting individual tables' storage parameters does + not override that. Therefore there is seldom much point in explicitly + setting this storage parameter to true, only + to false. + + + + + + autovacuum_vacuum_threshold, toast.autovacuum_vacuum_threshold (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_scale_factor, toast.autovacuum_vacuum_scale_factor (float4) + + + Per-table value for + parameter. + + + + + + autovacuum_analyze_threshold (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_analyze_scale_factor (float4) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_cost_delay, toast.autovacuum_vacuum_cost_delay (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_cost_limit, toast.autovacuum_vacuum_cost_limit (integer) + + + Per-table value for + parameter. 
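+
+   For example, a heavily updated table might reserve free space on each
+   page and tighten its autovacuum thresholds (the values shown are
+   illustrative only):
+
+CREATE TABLE counters_demo (
+    id   integer PRIMARY KEY,
+    hits bigint
+) WITH (fillfactor = 70,
+        autovacuum_vacuum_threshold = 500,
+        autovacuum_vacuum_scale_factor = 0.05);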
+ + + + + + autovacuum_freeze_min_age, toast.autovacuum_freeze_min_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_freeze_min_age parameters that are + larger than half the + system-wide setting. + + + + + + autovacuum_freeze_max_age, toast.autovacuum_freeze_max_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_freeze_max_age parameters that are + larger than the system-wide setting (it can only be set smaller). + + + + + + autovacuum_freeze_table_age, toast.autovacuum_freeze_table_age (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_multixact_freeze_min_age, toast.autovacuum_multixact_freeze_min_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_multixact_freeze_min_age parameters + that are larger than half the + system-wide + setting. + + + + + + autovacuum_multixact_freeze_max_age, toast.autovacuum_multixact_freeze_max_age (integer) + + + Per-table value + for parameter. + Note that autovacuum will ignore + per-table autovacuum_multixact_freeze_max_age parameters + that are larger than the system-wide setting (it can only be set + smaller). + + + + + + autovacuum_multixact_freeze_table_age, toast.autovacuum_multixact_freeze_table_age (integer) + + + Per-table value + for parameter. + + + + + + log_autovacuum_min_duration, toast.log_autovacuum_min_duration (integer) + + + Per-table value for + parameter. + + + + + + user_catalog_table (boolean) + + + Declare the table as an additional catalog table for purposes of + logical replication. See + for details. + This parameter cannot be set for TOAST tables. + + + + + + + + + + + Notes + + + Using OIDs in new applications is not recommended: where + possible, using an identity column or other sequence + generator as the table's primary key is preferred. However, if + your application does make use of OIDs to identify specific + rows of a table, it is recommended to create a unique constraint + on the oid column of that table, to ensure that + OIDs in the table will indeed uniquely identify rows even after + counter wraparound. Avoid assuming that OIDs are unique across + tables; if you need a database-wide unique identifier, use the + combination of tableoid and row OID for the + purpose. + + + + + The use of OIDS=FALSE is not recommended + for tables with no primary key, since without either an OID or a + unique data key, it is difficult to identify specific rows. + + + + + PostgreSQL automatically creates an + index for each unique constraint and primary key constraint to + enforce uniqueness. Thus, it is not necessary to create an + index explicitly for primary key columns. (See for more information.) + + + + Unique constraints and primary keys are not inherited in the + current implementation. This makes the combination of + inheritance and unique constraints rather dysfunctional. + + + + A table cannot have more than 1600 columns. (In practice, the + effective limit is usually lower because of tuple-length constraints.) + + + + + + + Examples + - When a UNIQUE or PRIMARY KEY constraint is - not deferrable, PostgreSQL checks for - uniqueness immediately whenever a row is inserted or modified. - The SQL standard says that uniqueness should be enforced only at - the end of the statement; this makes a difference when, for example, - a single command updates multiple key values. 
To obtain - standard-compliant behavior, declare the constraint as - DEFERRABLE but not deferred (i.e., INITIALLY - IMMEDIATE). Be aware that this can be significantly slower than - immediate uniqueness checking. + Create table films and table + distributors: + + + CREATE TABLE films ( + code char(5) CONSTRAINT firstkey PRIMARY KEY, + title varchar(40) NOT NULL, + did integer NOT NULL, + date_prod date, + kind varchar(10), + len interval hour to minute + ); + + CREATE TABLE distributors ( + did integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + name varchar(40) NOT NULL CHECK (name <> '') + ); + - - - - Column Check Constraints - + - The SQL standard says that CHECK column constraints - can only refer to the column they apply to; only CHECK - table constraints can refer to multiple columns. - PostgreSQL does not enforce this - restriction; it treats column and table check constraints alike. + Create a table with a 2-dimensional array: + + + CREATE TABLE array_int ( + vector int[][] + ); + - - - - <literal>EXCLUDE</literal> Constraint - + - The EXCLUDE constraint type is a - PostgreSQL extension. + Define a unique table constraint for the table + films. Unique table constraints can be defined + on one or more columns of the table: + + + CREATE TABLE films ( + code char(5), + title varchar(40), + did integer, + date_prod date, + kind varchar(10), + len interval hour to minute, + CONSTRAINT production UNIQUE(date_prod) + ); + - - - - <literal>NULL</literal> <quote>Constraint</quote> - + - The NULL constraint (actually a - non-constraint) is a PostgreSQL - extension to the SQL standard that is included for compatibility with some - other database systems (and for symmetry with the NOT - NULL constraint). Since it is the default for any - column, its presence is simply noise. + Define a check column constraint: + + + CREATE TABLE distributors ( + did integer CHECK (did > 100), + name varchar(40) + ); + - - - - Inheritance - + - Multiple inheritance via the INHERITS clause is - a PostgreSQL language extension. - SQL:1999 and later define single inheritance using a - different syntax and different semantics. SQL:1999-style - inheritance is not yet supported by - PostgreSQL. + Define a check table constraint: + + + CREATE TABLE distributors ( + did integer, + name varchar(40) + CONSTRAINT con1 CHECK (did > 100 AND name <> '') + ); + - - - - Zero-column Tables - + - PostgreSQL allows a table of no columns - to be created (for example, CREATE TABLE foo();). This - is an extension from the SQL standard, which does not allow zero-column - tables. Zero-column tables are not in themselves very useful, but - disallowing them creates odd special cases for ALTER TABLE - DROP COLUMN, so it seems cleaner to ignore this spec restriction. + Define a primary key table constraint for the table + films: + + + CREATE TABLE films ( + code char(5), + title varchar(40), + did integer, + date_prod date, + kind varchar(10), + len interval hour to minute, + CONSTRAINT code_title PRIMARY KEY(code,title) + ); + - - - - Multiple Identity Columns - + - PostgreSQL allows a table to have more than one - identity column. The standard specifies that a table can have at most one - identity column. This is relaxed mainly to give more flexibility for - doing schema changes or migrations. Note that - the INSERT command supports only one override clause - that applies to the entire statement, so having multiple identity columns - with different behaviors is not well supported. 
+ Define a primary key constraint for table + distributors. The following two examples are + equivalent, the first using the table constraint syntax, the second + the column constraint syntax: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + PRIMARY KEY(did) + ); + + CREATE TABLE distributors ( + did integer PRIMARY KEY, + name varchar(40) + ); + - - - - <literal>LIKE</> Clause - + - While a LIKE clause exists in the SQL standard, many of the - options that PostgreSQL accepts for it are not - in the standard, and some of the standard's options are not implemented - by PostgreSQL. + Assign a literal constant default value for the column + name, arrange for the default value of column + did to be generated by selecting the next value + of a sequence object, and make the default value of + modtime be the time at which the row is + inserted: + + + CREATE TABLE distributors ( + name varchar(40) DEFAULT 'Luso Films', + did integer DEFAULT nextval('distributors_serial'), + modtime timestamp DEFAULT current_timestamp + ); + - - - - <literal>WITH</> Clause - + - The WITH clause is a PostgreSQL - extension; neither storage parameters nor OIDs are in the standard. + Define two NOT NULL column constraints on the table + distributors, one of which is explicitly + given a name: + + + CREATE TABLE distributors ( + did integer CONSTRAINT no_null NOT NULL, + name varchar(40) NOT NULL + ); + + + + + Define a unique constraint for the name column: + + + CREATE TABLE distributors ( + did integer, + name varchar(40) UNIQUE + ); + + + The same, specified as a table constraint: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + UNIQUE(name) + ); + - - - - Tablespaces - + - The PostgreSQL concept of tablespaces is not - part of the standard. Hence, the clauses TABLESPACE - and USING INDEX TABLESPACE are extensions. + Create the same table, specifying 70% fill factor for both the table + and its unique index: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + UNIQUE(name) WITH (fillfactor=70) + ) + WITH (fillfactor=70); + - - - - Typed Tables - + - Typed tables implement a subset of the SQL standard. According to - the standard, a typed table has columns corresponding to the - underlying composite type as well as one other column that is - the self-referencing column. PostgreSQL does not - support these self-referencing columns explicitly, but the same - effect can be had using the OID feature. + Create table circles with an exclusion + constraint that prevents any two circles from overlapping: + + + CREATE TABLE circles ( + c circle, + EXCLUDE USING gist (c WITH &&) + ); + - - - - <literal>PARTITION BY</> Clause - + - The PARTITION BY clause is a - PostgreSQL extension. + Create table cinemas in tablespace diskvol1: + + + CREATE TABLE cinemas ( + id serial, + name text, + location text + ) TABLESPACE diskvol1; + - - - - <literal>PARTITION OF</> Clause - + - The PARTITION OF clause is a - PostgreSQL extension. - - - - - <productname>Postgres-XL</> Specifics - + Create a composite type and a typed table: + + CREATE TYPE employee_type AS (name text, salary numeric); + + CREATE TABLE employees OF employee_type ( + PRIMARY KEY (name), + salary WITH OPTIONS DEFAULT 1000 + ); + + - Currently, immutable, stable, volatile functions and nextval are allowed in DEFAULT clause. - as DEFAULT values. 
- + Create a range partitioned table: + + CREATE TABLE measurement ( + logdate date not null, + peaktemp int, + unitsales int + ) PARTITION BY RANGE (logdate); + + - PRIMARY KEY and foreign key must include the - distribution column. - + Create a range partitioned table with multiple columns in the partition key: + + CREATE TABLE measurement_year_month ( + logdate date not null, + peaktemp int, + unitsales int + ) PARTITION BY RANGE (EXTRACT(YEAR FROM logdate), EXTRACT(MONTH FROM logdate)); + + - TEMP tables and exclusion constraint are not supported - yet. - + Create a list partitioned table: + + CREATE TABLE cities ( + city_id bigserial not null, + name text not null, + population bigint + ) PARTITION BY LIST (left(lower(name), 1)); + + - + Create a hash partitioned table: + + CREATE TABLE orders ( + order_id bigint not null, + cust_id bigint not null, + status text + ) PARTITION BY HASH (order_id); + + - In Postgres-XL, OID is maintained locally in each - Datanode and Coordinator. The OID value may be inconsistent for rows - stored in different Datanodes. + Create partition of a range partitioned table: + + CREATE TABLE measurement_y2016m07 + PARTITION OF measurement ( + unitsales DEFAULT 0 + ) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); + + + + Create a few partitions of a range partitioned table with multiple + columns in the partition key: + + CREATE TABLE measurement_ym_older + PARTITION OF measurement_year_month + FOR VALUES FROM (MINVALUE, MINVALUE) TO (2016, 11); + + CREATE TABLE measurement_ym_y2016m11 + PARTITION OF measurement_year_month + FOR VALUES FROM (2016, 11) TO (2016, 12); + + CREATE TABLE measurement_ym_y2016m12 + PARTITION OF measurement_year_month + FOR VALUES FROM (2016, 12) TO (2017, 01); + + CREATE TABLE measurement_ym_y2017m01 + PARTITION OF measurement_year_month + FOR VALUES FROM (2017, 01) TO (2017, 02); + + + + Create partition of a list partitioned table: + + CREATE TABLE cities_ab + PARTITION OF cities ( + CONSTRAINT city_id_nonzero CHECK (city_id != 0) + ) FOR VALUES IN ('a', 'b'); + + + + Create partition of a list partitioned table that is itself further + partitioned and then add a partition to it: + + CREATE TABLE cities_ab + PARTITION OF cities ( + CONSTRAINT city_id_nonzero CHECK (city_id != 0) + ) FOR VALUES IN ('a', 'b') PARTITION BY RANGE (population); + + CREATE TABLE cities_ab_10000_to_100000 + PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); + + + + Create partitions of a hash partitioned table: + + CREATE TABLE orders_p1 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 0); + CREATE TABLE orders_p2 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 1); + CREATE TABLE orders_p3 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 2); + CREATE TABLE orders_p4 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 3); + + + + Create a default partition: + + CREATE TABLE cities_partdef + PARTITION OF cities DEFAULT; + + + + + Compatibility + + + The CREATE TABLE command conforms to the + SQL standard, with exceptions listed below. - - - - - - - See Also - - - - - - - - - - + + + Temporary Tables + + + Although the syntax of CREATE TEMPORARY TABLE + resembles that of the SQL standard, the effect is not the same. In the + standard, + temporary tables are defined just once and automatically exist (starting + with empty contents) in every session that needs them. + PostgreSQL instead + requires each session to issue its own CREATE TEMPORARY + TABLE command for each temporary table to be used. 
This allows + different sessions to use the same temporary table name for different + purposes, whereas the standard's approach constrains all instances of a + given temporary table name to have the same table structure. + + + + The standard's definition of the behavior of temporary tables is + widely ignored. PostgreSQL's behavior + on this point is similar to that of several other SQL databases. + + + + The SQL standard also distinguishes between global and local temporary + tables, where a local temporary table has a separate set of contents for + each SQL module within each session, though its definition is still shared + across sessions. Since PostgreSQL does not + support SQL modules, this distinction is not relevant in + PostgreSQL. + + + + For compatibility's sake, PostgreSQL will + accept the GLOBAL and LOCAL keywords + in a temporary table declaration, but they currently have no effect. + Use of these keywords is discouraged, since future versions of + PostgreSQL might adopt a more + standard-compliant interpretation of their meaning. + + + + The ON COMMIT clause for temporary tables + also resembles the SQL standard, but has some differences. + If the ON COMMIT clause is omitted, SQL specifies that the + default behavior is ON COMMIT DELETE ROWS. However, the + default behavior in PostgreSQL is + ON COMMIT PRESERVE ROWS. The ON COMMIT + DROP option does not exist in SQL. + + + + + Non-deferred Uniqueness Constraints + + + When a UNIQUE or PRIMARY KEY constraint is + not deferrable, PostgreSQL checks for + uniqueness immediately whenever a row is inserted or modified. + The SQL standard says that uniqueness should be enforced only at + the end of the statement; this makes a difference when, for example, + a single command updates multiple key values. To obtain + standard-compliant behavior, declare the constraint as + DEFERRABLE but not deferred (i.e., INITIALLY + IMMEDIATE). Be aware that this can be significantly slower than + immediate uniqueness checking. + + + + + Column Check Constraints + + + The SQL standard says that CHECK column constraints + can only refer to the column they apply to; only CHECK + table constraints can refer to multiple columns. + PostgreSQL does not enforce this + restriction; it treats column and table check constraints alike. + + + + + <literal>EXCLUDE</literal> Constraint + + + The EXCLUDE constraint type is a + PostgreSQL extension. + + + + + <literal>NULL</literal> <quote>Constraint</quote> + + + The NULL constraint (actually a + non-constraint) is a PostgreSQL + extension to the SQL standard that is included for compatibility with some + other database systems (and for symmetry with the NOT + NULL constraint). Since it is the default for any + column, its presence is simply noise. + + + + + Inheritance + + + Multiple inheritance via the INHERITS clause is + a PostgreSQL language extension. + SQL:1999 and later define single inheritance using a + different syntax and different semantics. SQL:1999-style + inheritance is not yet supported by + PostgreSQL. + + + + + Zero-column Tables + + + PostgreSQL allows a table of no columns + to be created (for example, CREATE TABLE foo();). This + is an extension from the SQL standard, which does not allow zero-column + tables. Zero-column tables are not in themselves very useful, but + disallowing them creates odd special cases for ALTER TABLE + DROP COLUMN, so it seems cleaner to ignore this spec restriction. 
+ + + + + Multiple Identity Columns + + + PostgreSQL allows a table to have more than one + identity column. The standard specifies that a table can have at most one + identity column. This is relaxed mainly to give more flexibility for + doing schema changes or migrations. Note that + the INSERT command supports only one override clause + that applies to the entire statement, so having multiple identity columns + with different behaviors is not well supported. + + + + + <literal>LIKE</literal> Clause + + + While a LIKE clause exists in the SQL standard, many of the + options that PostgreSQL accepts for it are not + in the standard, and some of the standard's options are not implemented + by PostgreSQL. + + + + + <literal>WITH</literal> Clause + + + The WITH clause is a PostgreSQL + extension; neither storage parameters nor OIDs are in the standard. + + + + + Tablespaces + + + The PostgreSQL concept of tablespaces is not + part of the standard. Hence, the clauses TABLESPACE + and USING INDEX TABLESPACE are extensions. + + + + + Typed Tables + + + Typed tables implement a subset of the SQL standard. According to + the standard, a typed table has columns corresponding to the + underlying composite type as well as one other column that is + the self-referencing column. PostgreSQL does not + support these self-referencing columns explicitly, but the same + effect can be had using the OID feature. + + + + + <literal>PARTITION BY</literal> Clause + + + The PARTITION BY clause is a + PostgreSQL extension. + + + + + <literal>PARTITION OF</literal> Clause + + + The PARTITION OF clause is a + PostgreSQL extension. + + + + + + + + See Also + + + + + + + + + + diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index b498716e..9832a333 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "access/hash.h" #include "access/heapam.h" #include "access/htup_details.h" #include "access/nbtree.h" @@ -46,6 +47,7 @@ #include "utils/datum.h" #include "utils/memutils.h" #include "utils/fmgroids.h" +#include "utils/hashutils.h" #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -61,26 +63,35 @@ * In the case of range partitioning, ndatums will typically be far less than * 2 * nparts, because a partition's upper bound and the next partition's lower * bound are the same in most common cases, and we only store one of them (the - * upper bound). + * upper bound). In case of hash partitioning, ndatums will be same as the + * number of partitions. + * + * For range and list partitioned tables, datums is an array of datum-tuples + * with key->partnatts datums each. For hash partitioned tables, it is an array + * of datum-tuples with 2 datums, modulus and remainder, corresponding to a + * given partition. * * In the case of list partitioning, the indexes array stores one entry for * every datum, which is the index of the partition that accepts a given datum. * In case of range partitioning, it stores one entry per distinct range * datum, which is the index of the partition for which a given datum - * is an upper bound. + * is an upper bound. In the case of hash partitioning, the number of the + * entries in the indexes array is same as the greatest modulus amongst all + * partitions. 
For a given partition key datum-tuple, the index of the + * partition which would accept that datum-tuple would be given by the entry + * pointed by remainder produced when hash value of the datum-tuple is divided + * by the greatest modulus. */ typedef struct PartitionBoundInfoData { - char strategy; /* list or range bounds? */ + char strategy; /* hash, list or range? */ int ndatums; /* Length of the datums following array */ - Datum **datums; /* Array of datum-tuples with key->partnatts - * datums each */ + Datum **datums; PartitionRangeDatumKind **kind; /* The kind of each range bound datum; - * NULL for list partitioned tables */ - int *indexes; /* Partition indexes; one entry per member of - * the datums array (plus one if range - * partitioned table) */ + * NULL for hash and list partitioned + * tables */ + int *indexes; /* Partition indexes */ int null_index; /* Index of the null-accepting partition; -1 * if there isn't one */ int default_index; /* Index of the default partition; -1 if there @@ -95,6 +106,14 @@ typedef struct PartitionBoundInfoData * is represented with one of the following structs. */ +/* One bound of a hash partition */ +typedef struct PartitionHashBound +{ + int modulus; + int remainder; + int index; +} PartitionHashBound; + /* One value coming from some (index'th) list partition */ typedef struct PartitionListValue { @@ -111,6 +130,7 @@ typedef struct PartitionRangeBound bool lower; /* this is the lower (vs upper) bound */ } PartitionRangeBound; +static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); static int32 qsort_partition_rbound_cmp(const void *a, const void *b, @@ -126,6 +146,7 @@ static void get_range_key_properties(PartitionKey key, int keynum, ListCell **partexprs_item, Expr **keyCol, Const **lower_val, Const **upper_val); +static List *get_qual_for_hash(Relation parent, PartitionBoundSpec *spec); static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec); static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec, bool for_default); @@ -134,6 +155,8 @@ static List *generate_partition_qual(Relation rel); static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, List *datums, bool lower); +static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, + int remainder2); static int32 partition_rbound_cmp(PartitionKey key, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2); @@ -149,6 +172,12 @@ static int partition_bound_bsearch(PartitionKey key, void *probe, bool probe_is_bound, bool *is_equal); static void get_partition_dispatch_recurse(Relation rel, Relation parent, List **pds, List **leaf_part_oids); +static int get_partition_bound_num_indexes(PartitionBoundInfo b); +static int get_greatest_modulus(PartitionBoundInfo b); +static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); + +/* SQL-callable function for use in hash partition CHECK constraints */ +PG_FUNCTION_INFO_V1(satisfies_hash_partition); /* * RelationBuildPartitionDesc @@ -174,6 +203,9 @@ RelationBuildPartitionDesc(Relation rel) int ndatums = 0; int default_index = -1; + /* Hash partitioning specific */ + PartitionHashBound **hbounds = NULL; + /* List partitioning specific */ PartitionListValue **all_values = NULL; int null_index = -1; @@ -267,7 +299,35 @@ RelationBuildPartitionDesc(Relation rel) oids[i++] = lfirst_oid(cell); /* Convert from node to the internal 
representation */ - if (key->strategy == PARTITION_STRATEGY_LIST) + if (key->strategy == PARTITION_STRATEGY_HASH) + { + ndatums = nparts; + hbounds = (PartitionHashBound **) + palloc(nparts * sizeof(PartitionHashBound *)); + + i = 0; + foreach(cell, boundspecs) + { + PartitionBoundSpec *spec = castNode(PartitionBoundSpec, + lfirst(cell)); + + if (spec->strategy != PARTITION_STRATEGY_HASH) + elog(ERROR, "invalid strategy in partition bound spec"); + + hbounds[i] = (PartitionHashBound *) + palloc(sizeof(PartitionHashBound)); + + hbounds[i]->modulus = spec->modulus; + hbounds[i]->remainder = spec->remainder; + hbounds[i]->index = i; + i++; + } + + /* Sort all the bounds in ascending order */ + qsort(hbounds, nparts, sizeof(PartitionHashBound *), + qsort_partition_hbound_cmp); + } + else if (key->strategy == PARTITION_STRATEGY_LIST) { List *non_null_values = NIL; @@ -517,6 +577,42 @@ RelationBuildPartitionDesc(Relation rel) switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + /* Modulus are stored in ascending order */ + int greatest_modulus = hbounds[ndatums - 1]->modulus; + + boundinfo->indexes = (int *) palloc(greatest_modulus * + sizeof(int)); + + for (i = 0; i < greatest_modulus; i++) + boundinfo->indexes[i] = -1; + + for (i = 0; i < nparts; i++) + { + int modulus = hbounds[i]->modulus; + int remainder = hbounds[i]->remainder; + + boundinfo->datums[i] = (Datum *) palloc(2 * + sizeof(Datum)); + boundinfo->datums[i][0] = Int32GetDatum(modulus); + boundinfo->datums[i][1] = Int32GetDatum(remainder); + + while (remainder < greatest_modulus) + { + /* overlap? */ + Assert(boundinfo->indexes[remainder] == -1); + boundinfo->indexes[remainder] = i; + remainder += modulus; + } + + mapping[hbounds[i]->index] = i; + pfree(hbounds[i]); + } + pfree(hbounds); + break; + } + case PARTITION_STRATEGY_LIST: { boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); @@ -650,8 +746,7 @@ RelationBuildPartitionDesc(Relation rel) * Now assign OIDs from the original array into mapped indexes of the * result array. Order of OIDs in the former is defined by the * catalog scan that retrieved them, whereas that in the latter is - * defined by canonicalized representation of the list values or the - * range bounds. + * defined by canonicalized representation of the partition bounds. */ for (i = 0; i < nparts; i++) result->oids[mapping[i]] = oids[i]; @@ -688,6 +783,49 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->default_index != b2->default_index) return false; + if (b1->strategy == PARTITION_STRATEGY_HASH) + { + int greatest_modulus; + + /* + * If two hash partitioned tables have different greatest moduli, + * their partition schemes don't match. For hash partitioned table, + * the greatest modulus is given by the last datum and number of + * partitions is given by ndatums. + */ + if (b1->datums[b1->ndatums - 1][0] != b2->datums[b2->ndatums - 1][0]) + return false; + + /* + * We arrange the partitions in the ascending order of their modulus + * and remainders. Also every modulus is factor of next larger + * modulus. Therefore we can safely store index of a given partition + * in indexes array at remainder of that partition. Also entries at + * (remainder + N * modulus) positions in indexes array are all same + * for (modulus, remainder) specification for any partition. Thus + * datums array from both the given bounds are same, if and only if + * their indexes array will be same. So, it suffices to compare + * indexes array. 
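+		 *
+		 * For example, with three partitions bounded by (modulus 2,
+		 * remainder 0), (modulus 4, remainder 1) and (modulus 4,
+		 * remainder 3), the greatest modulus is 4 and the indexes
+		 * array is {0, 1, 0, 2}: remainders 0 and 2 map to the
+		 * modulus-2 partition, remainder 1 to the second partition
+		 * and remainder 3 to the third.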
+ */ + greatest_modulus = get_greatest_modulus(b1); + for (i = 0; i < greatest_modulus; i++) + if (b1->indexes[i] != b2->indexes[i]) + return false; + +#ifdef USE_ASSERT_CHECKING + + /* + * Nonetheless make sure that the bounds are indeed same when the + * indexes match. Hash partition bound stores modulus and remainder + * at b1->datums[i][0] and b1->datums[i][1] position respectively. + */ + for (i = 0; i < b1->ndatums; i++) + Assert((b1->datums[i][0] == b2->datums[i][0] && + b1->datums[i][1] == b2->datums[i][1])); +#endif + } + else + { for (i = 0; i < b1->ndatums; i++) { int j; @@ -701,21 +839,26 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->kind[i][j] != b2->kind[i][j]) return false; - /* Non-finite bounds are equal without further examination. */ + /* + * Non-finite bounds are equal without further + * examination. + */ if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE) continue; } /* - * Compare the actual values. Note that it would be both incorrect - * and unsafe to invoke the comparison operator derived from the - * partitioning specification here. It would be incorrect because - * we want the relcache entry to be updated for ANY change to the - * partition bounds, not just those that the partitioning operator - * thinks are significant. It would be unsafe because we might - * reach this code in the context of an aborted transaction, and - * an arbitrary partitioning operator might not be safe in that - * context. datumIsEqual() should be simple enough to be safe. + * Compare the actual values. Note that it would be both + * incorrect and unsafe to invoke the comparison operator + * derived from the partitioning specification here. It would + * be incorrect because we want the relcache entry to be + * updated for ANY change to the partition bounds, not just + * those that the partitioning operator thinks are + * significant. It would be unsafe because we might reach + * this code in the context of an aborted transaction, and an + * arbitrary partitioning operator might not be safe in that + * context. datumIsEqual() should be simple enough to be + * safe. */ if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], parttypbyval[j], parttyplen[j])) @@ -730,10 +873,100 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_RANGE && b1->indexes[i] != b2->indexes[i]) return false; - + } return true; } +/* + * Return a copy of given PartitionBoundInfo structure. The data types of bounds + * are described by given partition key specificiation. + */ +extern PartitionBoundInfo +partition_bounds_copy(PartitionBoundInfo src, + PartitionKey key) +{ + PartitionBoundInfo dest; + int i; + int ndatums; + int partnatts; + int num_indexes; + + dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + + dest->strategy = src->strategy; + ndatums = dest->ndatums = src->ndatums; + partnatts = key->partnatts; + + /* Range partitioned table has an extra index. */ + num_indexes = get_partition_bound_num_indexes(src); + + /* List partitioned tables have only a single partition key. 
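+	 * Hash and range partitioned tables may use several key columns;
+	 * for hash partitions each datums entry nevertheless holds exactly
+	 * two values, the modulus and the remainder, as handled below.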
*/ + Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1); + + dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + + if (src->kind != NULL) + { + dest->kind = (PartitionRangeDatumKind **) palloc(ndatums * + sizeof(PartitionRangeDatumKind *)); + for (i = 0; i < ndatums; i++) + { + dest->kind[i] = (PartitionRangeDatumKind *) palloc(partnatts * + sizeof(PartitionRangeDatumKind)); + + memcpy(dest->kind[i], src->kind[i], + sizeof(PartitionRangeDatumKind) * key->partnatts); + } + } + else + dest->kind = NULL; + + for (i = 0; i < ndatums; i++) + { + int j; + + /* + * For a corresponding to hash partition, datums array will have two + * elements - modulus and remainder. + */ + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 2 : partnatts; + + dest->datums[i] = (Datum *) palloc(sizeof(Datum) * natts); + + for (j = 0; j < natts; j++) + { + bool byval; + int typlen; + + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + + } + } + + dest->indexes = (int *) palloc(sizeof(int) * num_indexes); + memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes); + + dest->null_index = src->null_index; + dest->default_index = src->default_index; + + return dest; +} + /* * check_new_partition_bound * @@ -766,6 +999,89 @@ check_new_partition_bound(char *relname, Relation parent, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + Assert(spec->remainder >= 0 && spec->remainder < spec->modulus); + + if (partdesc->nparts > 0) + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + Datum **datums = boundinfo->datums; + int ndatums = boundinfo->ndatums; + int greatest_modulus; + int remainder; + int offset; + bool equal, + valid_modulus = true; + int prev_modulus, /* Previous largest modulus */ + next_modulus; /* Next largest modulus */ + + /* + * Check rule that every modulus must be a factor of the + * next larger modulus. For example, if you have a bunch + * of partitions that all have modulus 5, you can add a + * new partition with modulus 10 or a new partition with + * modulus 15, but you cannot add both a partition with + * modulus 10 and a partition with modulus 15, because 10 + * is not a factor of 15. + * + * Get greatest bound in array boundinfo->datums which is + * less than or equal to spec->modulus and + * spec->remainder. 
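+					 *
+					 * For instance, if the existing partitions all have
+					 * modulus 4 and the new partition specifies modulus 6,
+					 * the search lands on a modulus-4 bound; since
+					 * 6 % 4 != 0, the new bound is rejected with the
+					 * error below.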
+ */ + offset = partition_bound_bsearch(key, boundinfo, spec, + true, &equal); + if (offset < 0) + { + next_modulus = DatumGetInt32(datums[0][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + else + { + prev_modulus = DatumGetInt32(datums[offset][0]); + valid_modulus = (spec->modulus % prev_modulus) == 0; + + if (valid_modulus && (offset + 1) < ndatums) + { + next_modulus = DatumGetInt32(datums[offset + 1][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + } + + if (!valid_modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("every hash partition modulus must be a factor of the next larger modulus"))); + + greatest_modulus = get_greatest_modulus(boundinfo); + remainder = spec->remainder; + + /* + * Normally, the lowest remainder that could conflict with + * the new partition is equal to the remainder specified + * for the new partition, but when the new partition has a + * modulus higher than any used so far, we need to adjust. + */ + if (remainder >= greatest_modulus) + remainder = remainder % greatest_modulus; + + /* Check every potentially-conflicting remainder. */ + do + { + if (boundinfo->indexes[remainder] != -1) + { + overlap = true; + with = boundinfo->indexes[remainder]; + break; + } + remainder += spec->modulus; + } while (remainder < greatest_modulus); + } + + break; + } + case PARTITION_STRATEGY_LIST: { Assert(spec->strategy == PARTITION_STRATEGY_LIST); @@ -1136,6 +1452,11 @@ get_qual_from_partbound(Relation rel, Relation parent, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + my_qual = get_qual_for_hash(parent, spec); + break; + case PARTITION_STRATEGY_LIST: Assert(spec->strategy == PARTITION_STRATEGY_LIST); my_qual = get_qual_for_list(parent, spec); @@ -1506,6 +1827,92 @@ make_partition_op_expr(PartitionKey key, int keynum, return result; } +/* + * get_qual_for_hash + * + * Given a list of partition columns, modulus and remainder corresponding to a + * partition, this function returns CHECK constraint expression Node for that + * partition. + * + * The partition constraint for a hash partition is always a call to the + * built-in function satisfies_hash_partition(). The first two arguments are + * the modulus and remainder for the partition; the remaining arguments are the + * values to be hashed. + */ +static List * +get_qual_for_hash(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + FuncExpr *fexpr; + Node *relidConst; + Node *modulusConst; + Node *remainderConst; + List *args; + ListCell *partexprs_item; + int i; + + /* Fixed arguments. */ + relidConst = (Node *) makeConst(OIDOID, + -1, + InvalidOid, + sizeof(Oid), + ObjectIdGetDatum(RelationGetRelid(parent)), + false, + true); + + modulusConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->modulus), + false, + true); + + remainderConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->remainder), + false, + true); + + args = list_make3(relidConst, modulusConst, remainderConst); + partexprs_item = list_head(key->partexprs); + + /* Add an argument for each key column. 
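+	 * Together with the fixed arguments assembled above, the generated
+	 * qual is therefore a call of the form
+	 * satisfies_hash_partition(<parent relid>, <modulus>, <remainder>,
+	 * <key column 1>, ..., <key column N>).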
*/ + for (i = 0; i < key->partnatts; i++) + { + Node *keyCol; + + /* Left operand */ + if (key->partattrs[i] != 0) + { + keyCol = (Node *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + keyCol = (Node *) copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(partexprs_item); + } + + args = lappend(args, keyCol); + } + + fexpr = makeFuncExpr(F_SATISFIES_HASH_PARTITION, + BOOLOID, + args, + InvalidOid, + InvalidOid, + COERCE_EXPLICIT_CALL); + + return list_make1(fexpr); +} + /* * get_qual_for_list * @@ -2371,6 +2778,17 @@ get_partition_for_tuple(PartitionDispatch *pd, /* Route as appropriate based on partitioning strategy. */ switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + int greatest_modulus = get_greatest_modulus(boundinfo); + uint64 rowHash = compute_hash_value(key, values, + isnull); + + cur_index = boundinfo->indexes[rowHash % greatest_modulus]; + } + break; + case PARTITION_STRATEGY_LIST: if (isnull[0]) @@ -2483,6 +2901,38 @@ get_partition_for_tuple(PartitionDispatch *pd, return result; } +/* + * qsort_partition_hbound_cmp + * + * We sort hash bounds by modulus, then by remainder. + */ +static int32 +qsort_partition_hbound_cmp(const void *a, const void *b) +{ + PartitionHashBound *h1 = (*(PartitionHashBound *const *) a); + PartitionHashBound *h2 = (*(PartitionHashBound *const *) b); + + return partition_hbound_cmp(h1->modulus, h1->remainder, + h2->modulus, h2->remainder); +} + +/* + * partition_hbound_cmp + * + * Compares modulus first, then remainder if modulus are equal. + */ +static int32 +partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2) +{ + if (modulus1 < modulus2) + return -1; + if (modulus1 > modulus2) + return 1; + if (modulus1 == modulus2 && remainder1 != remainder2) + return (remainder1 > remainder2) ? 1 : -1; + return 0; +} + /* * qsort_partition_list_value_cmp * @@ -2669,6 +3119,15 @@ partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + PartitionBoundSpec *spec = (PartitionBoundSpec *) probe; + + cmpval = partition_hbound_cmp(DatumGetInt32(bound_datums[0]), + DatumGetInt32(bound_datums[1]), + spec->modulus, spec->remainder); + break; + } case PARTITION_STRATEGY_LIST: cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], key->partcollation[0], @@ -2853,3 +3312,182 @@ get_proposed_default_constraint(List *new_part_constraints) return list_make1(defPartConstraint); } + +/* + * get_partition_bound_num_indexes + * + * Returns the number of the entries in the partition bound indexes array. + */ +static int +get_partition_bound_num_indexes(PartitionBoundInfo bound) +{ + int num_indexes; + + Assert(bound); + + switch (bound->strategy) + { + case PARTITION_STRATEGY_HASH: + + /* + * The number of the entries in the indexes array is same as the + * greatest modulus. + */ + num_indexes = get_greatest_modulus(bound); + break; + + case PARTITION_STRATEGY_LIST: + num_indexes = bound->ndatums; + break; + + case PARTITION_STRATEGY_RANGE: + /* Range partitioned table has an extra index. */ + num_indexes = bound->ndatums + 1; + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) bound->strategy); + } + + return num_indexes; +} + +/* + * get_greatest_modulus + * + * Returns the greatest modulus of the hash partition bound. 
The greatest + * modulus will be at the end of the datums array because hash partitions are + * arranged in the ascending order of their modulus and remainders. + */ +static int +get_greatest_modulus(PartitionBoundInfo bound) +{ + Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH); + Assert(bound->datums && bound->ndatums > 0); + Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0); + + return DatumGetInt32(bound->datums[bound->ndatums - 1][0]); +} + +/* + * compute_hash_value + * + * Compute the hash value for given not null partition key values. + */ +static uint64 +compute_hash_value(PartitionKey key, Datum *values, bool *isnull) +{ + int i; + int nkeys = key->partnatts; + uint64 rowHash = 0; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + + for (i = 0; i < nkeys; i++) + { + if (!isnull[i]) + { + Datum hash; + + Assert(OidIsValid(key->partsupfunc[i].fn_oid)); + + /* + * Compute hash for each datum value by calling respective + * datatype-specific hash functions of each partition key + * attribute. + */ + hash = FunctionCall2(&key->partsupfunc[i], values[i], seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + return rowHash; +} + +/* + * satisfies_hash_partition + * + * This is a SQL-callable function for use in hash partition constraints takes + * an already computed hash values of each partition key attribute, and combine + * them into a single hash value by calling hash_combine64. + * + * Returns true if remainder produced when this computed single hash value is + * divided by the given modulus is equal to given remainder, otherwise false. + * + * See get_qual_for_hash() for usage. + */ +Datum +satisfies_hash_partition(PG_FUNCTION_ARGS) +{ + typedef struct ColumnsHashData + { + Oid relid; + int16 nkeys; + FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; + } ColumnsHashData; + Oid parentId = PG_GETARG_OID(0); + int modulus = PG_GETARG_INT32(1); + int remainder = PG_GETARG_INT32(2); + short nkeys = PG_NARGS() - 3; + int i; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + ColumnsHashData *my_extra; + uint64 rowHash = 0; + + /* + * Cache hash function information. + */ + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + if (my_extra == NULL || my_extra->nkeys != nkeys || + my_extra->relid != parentId) + { + Relation parent; + PartitionKey key; + int j; + + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo) * nkeys); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->nkeys = nkeys; + my_extra->relid = parentId; + + /* Open parent relation and fetch partition keyinfo */ + parent = heap_open(parentId, AccessShareLock); + key = RelationGetPartitionKey(parent); + + Assert(key->partnatts == nkeys); + for (j = 0; j < nkeys; ++j) + fmgr_info_copy(&my_extra->partsupfunc[j], + key->partsupfunc, + fcinfo->flinfo->fn_mcxt); + + /* Hold lock until commit */ + heap_close(parent, NoLock); + } + + for (i = 0; i < nkeys; i++) + { + /* keys start from fourth argument of function. 
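+		 * (The first three arguments, read above, are the parent's OID,
+		 * the modulus and the remainder.)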
*/ + int argno = i + 3; + + if (!PG_ARGISNULL(argno)) + { + Datum hash; + + Assert(OidIsValid(my_extra->partsupfunc[i].fn_oid)); + + hash = FunctionCall2(&my_extra->partsupfunc[i], + PG_GETARG_DATUM(argno), + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + PG_RETURN_BOOL(rowHash % modulus == remainder); +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 360fd0ee..74b82ebf 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -535,7 +535,7 @@ static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, - List **partexprs, Oid *partopclass, Oid *partcollation); + List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, @@ -1167,7 +1167,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, #endif ComputePartitionAttrs(rel, stmt->partspec->partParams, partattrs, &partexprs, partopclass, - partcollation); + partcollation, strategy); StorePartitionKey(rel, strategy, partnatts, partattrs, partexprs, partopclass, partcollation); @@ -16134,7 +16134,9 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) newspec->location = partspec->location; /* Parse partitioning strategy name */ - if (pg_strcasecmp(partspec->strategy, "list") == 0) + if (pg_strcasecmp(partspec->strategy, "hash") == 0) + *strategy = PARTITION_STRATEGY_HASH; + else if (pg_strcasecmp(partspec->strategy, "list") == 0) *strategy = PARTITION_STRATEGY_LIST; else if (pg_strcasecmp(partspec->strategy, "range") == 0) *strategy = PARTITION_STRATEGY_RANGE; @@ -16211,10 +16213,12 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) */ static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, - List **partexprs, Oid *partopclass, Oid *partcollation) -{// #lizard forgives + List **partexprs, Oid *partopclass, Oid *partcollation, + char strategy) +{ int attn; ListCell *lc; + Oid am_oid; attn = 0; foreach(lc, partParams) @@ -16374,25 +16378,41 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, partcollation[attn] = attcollation; /* - * Identify a btree opclass to use. Currently, we use only btree - * operators, which seems enough for list and range partitioning. + * Identify the appropriate operator class. For list and range + * partitioning, we use a btree operator class; hash partitioning uses + * a hash operator class. 
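+		 * The hash operator class is what later supplies the extended
+		 * hash support function used to route rows to hash partitions
+		 * (see the HASHEXTENDED_PROC lookup in RelationBuildPartitionKey).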
*/ + if (strategy == PARTITION_STRATEGY_HASH) + am_oid = HASH_AM_OID; + else + am_oid = BTREE_AM_OID; + if (!pelem->opclass) { - partopclass[attn] = GetDefaultOpClass(atttype, BTREE_AM_OID); + partopclass[attn] = GetDefaultOpClass(atttype, am_oid); if (!OidIsValid(partopclass[attn])) + { + if (strategy == PARTITION_STRATEGY_HASH) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("data type %s has no default hash operator class", + format_type_be(atttype)), + errhint("You must specify a hash operator class or define a default hash operator class for the data type."))); + else ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("data type %s has no default btree operator class", format_type_be(atttype)), errhint("You must specify a btree operator class or define a default btree operator class for the data type."))); + + } } else partopclass[attn] = ResolveOpClass(pelem->opclass, atttype, - "btree", - BTREE_AM_OID); + am_oid == HASH_AM_OID ? "hash" : "btree", + am_oid); attn++; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index a7e1d32a..1e57fedd 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4829,6 +4829,8 @@ _copyPartitionBoundSpec(const PartitionBoundSpec *from) COPY_SCALAR_FIELD(strategy); COPY_SCALAR_FIELD(is_default); + COPY_SCALAR_FIELD(modulus); + COPY_SCALAR_FIELD(remainder); COPY_NODE_FIELD(listdatums); COPY_NODE_FIELD(lowerdatums); COPY_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 6efee4a8..c05b411c 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2953,6 +2953,8 @@ _equalPartitionBoundSpec(const PartitionBoundSpec *a, const PartitionBoundSpec * { COMPARE_SCALAR_FIELD(strategy); COMPARE_SCALAR_FIELD(is_default); + COMPARE_SCALAR_FIELD(modulus); + COMPARE_SCALAR_FIELD(remainder); COMPARE_NODE_FIELD(listdatums); COMPARE_NODE_FIELD(lowerdatums); COMPARE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index daf0445f..fb711230 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -5003,6 +5003,8 @@ _outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node) WRITE_CHAR_FIELD(strategy); WRITE_BOOL_FIELD(is_default); + WRITE_INT_FIELD(modulus); + WRITE_INT_FIELD(remainder); WRITE_NODE_FIELD(listdatums); WRITE_NODE_FIELD(lowerdatums); WRITE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 32c879f7..2bdc5067 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -4077,6 +4077,8 @@ _readPartitionBoundSpec(void) READ_CHAR_FIELD(strategy); READ_BOOL_FIELD(is_default); + READ_INT_FIELD(modulus); + READ_INT_FIELD(remainder); READ_NODE_FIELD(listdatums); READ_NODE_FIELD(lowerdatums); READ_NODE_FIELD(upperdatums); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index ad902dcf..d6fad96c 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1484,7 +1484,7 @@ have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, continue; /* Skip clauses which are not equality conditions. 
*/ - if (!rinfo->mergeopfamilies) + if (!rinfo->mergeopfamilies && !OidIsValid(rinfo->hashjoinoperator)) continue; opexpr = (OpExpr *) rinfo->clause; @@ -1536,7 +1536,13 @@ have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, * The clause allows partition-wise join if only it uses the same * operator family as that specified by the partition key. */ - if (!list_member_oid(rinfo->mergeopfamilies, + if (rel1->part_scheme->strategy == PARTITION_STRATEGY_HASH) + { + if (!op_in_opfamily(rinfo->hashjoinoperator, + part_scheme->partopfamily[ipk1])) + continue; + } + else if (!list_member_oid(rinfo->mergeopfamilies, part_scheme->partopfamily[ipk1])) continue; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 55ea9c8c..fc680b63 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1978,13 +1978,15 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation) { PartitionDesc partdesc; + PartitionKey partkey; Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); partdesc = RelationGetPartitionDesc(relation); + partkey = RelationGetPartitionKey(relation); rel->part_scheme = find_partition_scheme(root, relation); Assert(partdesc != NULL && rel->part_scheme != NULL); - rel->boundinfo = partdesc->boundinfo; + rel->boundinfo = partition_bounds_copy(partdesc->boundinfo, partkey); rel->nparts = partdesc->nparts; set_baserel_partition_key_exprs(relation, rel); } @@ -2041,18 +2043,33 @@ find_partition_scheme(PlannerInfo *root, Relation relation) /* * Did not find matching partition scheme. Create one copying relevant - * information from the relcache. Instead of copying whole arrays, copy - * the pointers in relcache. It's safe to do so since - * RelationClearRelation() wouldn't change it while planner is using it. + * information from the relcache. We need to copy the contents of the array + * since the relcache entry may not survive after we have closed the + * relation. */ part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); part_scheme->strategy = partkey->strategy; part_scheme->partnatts = partkey->partnatts; - part_scheme->partopfamily = partkey->partopfamily; - part_scheme->partopcintype = partkey->partopcintype; - part_scheme->parttypcoll = partkey->parttypcoll; - part_scheme->parttyplen = partkey->parttyplen; - part_scheme->parttypbyval = partkey->parttypbyval; + + part_scheme->partopfamily = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopfamily, partkey->partopfamily, + sizeof(Oid) * partnatts); + + part_scheme->partopcintype = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopcintype, partkey->partopcintype, + sizeof(Oid) * partnatts); + + part_scheme->parttypcoll = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->parttypcoll, partkey->parttypcoll, + sizeof(Oid) * partnatts); + + part_scheme->parttyplen = (int16 *) palloc(sizeof(int16) * partnatts); + memcpy(part_scheme->parttyplen, partkey->parttyplen, + sizeof(int16) * partnatts); + + part_scheme->parttypbyval = (bool *) palloc(sizeof(bool) * partnatts); + memcpy(part_scheme->parttypbyval, partkey->parttypbyval, + sizeof(bool) * partnatts); /* Add the partitioning scheme to PlannerInfo. 
 */
 	root->part_schemes = lappend(root->part_schemes, part_scheme);
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 41b045c3..1cf77960 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -620,7 +620,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <list>		part_params
 %type <partboundspec> PartitionBoundSpec
 %type <node>		partbound_datum PartitionRangeDatum
-%type <list>		partbound_datum_list range_datum_list
+%type <list>		hash_partbound partbound_datum_list range_datum_list
+%type <defelt>		hash_partbound_elem
 %type		lock_param
@@ -2834,8 +2835,61 @@ alter_identity_column_option:
 		;
 
 PartitionBoundSpec:
+			/* a HASH partition*/
+			FOR VALUES WITH '(' hash_partbound ')'
+				{
+					ListCell   *lc;
+					PartitionBoundSpec *n = makeNode(PartitionBoundSpec);
+
+					n->strategy = PARTITION_STRATEGY_HASH;
+					n->modulus = n->remainder = -1;
+
+					foreach (lc, $5)
+					{
+						DefElem    *opt = lfirst_node(DefElem, lc);
+
+						if (strcmp(opt->defname, "modulus") == 0)
+						{
+							if (n->modulus != -1)
+								ereport(ERROR,
+										(errcode(ERRCODE_DUPLICATE_OBJECT),
+										 errmsg("modulus for hash partition provided more than once"),
+										 parser_errposition(opt->location)));
+							n->modulus = defGetInt32(opt);
+						}
+						else if (strcmp(opt->defname, "remainder") == 0)
+						{
+							if (n->remainder != -1)
+								ereport(ERROR,
+										(errcode(ERRCODE_DUPLICATE_OBJECT),
+										 errmsg("remainder for hash partition provided more than once"),
+										 parser_errposition(opt->location)));
+							n->remainder = defGetInt32(opt);
+						}
+						else
+							ereport(ERROR,
+									(errcode(ERRCODE_SYNTAX_ERROR),
+									 errmsg("unrecognized hash partition bound specification \"%s\"",
+											opt->defname),
+									 parser_errposition(opt->location)));
+					}
+
+					if (n->modulus == -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("modulus for hash partition must be specified")));
+					if (n->remainder == -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("remainder for hash partition must be specified")));
+
+					n->location = @3;
+
+					$$ = n;
+				}
+
 			/* a LIST partition */
-			FOR VALUES IN_P '(' partbound_datum_list ')'
+			| FOR VALUES IN_P '(' partbound_datum_list ')'
 				{
 					PartitionBoundSpec *n = makeNode(PartitionBoundSpec);
 
@@ -2873,6 +2927,24 @@ PartitionBoundSpec:
 				}
 		;
 
+hash_partbound_elem:
+		NonReservedWord Iconst
+			{
+				$$ = makeDefElem($1, (Node *)makeInteger($2), @1);
+			}
+		;
+
+hash_partbound:
+		hash_partbound_elem
+			{
+				$$ = list_make1($1);
+			}
+		| hash_partbound ',' hash_partbound_elem
+			{
+				$$ = lappend($1, $3);
+			}
+		;
+
 partbound_datum:
 			Sconst			{ $$ = makeStringConst($1, @1); }
 			| NumericOnly	{ $$ = makeAConst($1, @1); }
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index 3695d9dc..e75e6b5e 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -196,6 +196,7 @@ static PGXCSubCluster *makeSubCluster(List *nodelist);
 static PGXCSubCluster *makeShardSubCluster(Oid groupId, Oid coldGroupId);
 #endif
 static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd);
+static void validateInfiniteBounds(ParseState *pstate, List *blist);
 static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con,
 						 const char *colName, Oid colType, int32 colTypmod);
@@ -4965,6 +4966,11 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 
 	if (spec->is_default)
 	{
+		if (strategy == PARTITION_STRATEGY_HASH)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("a hash-partitioned table may not have a default partition")));
+
 		/*
 		 * In case of the default partition, parser had no way to identify the
 		 * partition strategy. Assign the parent's strategy to the default
@@ -4975,7 +4981,27 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 		return result_spec;
 	}
 
-	if (strategy == PARTITION_STRATEGY_LIST)
+	if (strategy == PARTITION_STRATEGY_HASH)
+	{
+		if (spec->strategy != PARTITION_STRATEGY_HASH)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("invalid bound specification for a hash partition"),
+					 parser_errposition(pstate, exprLocation((Node *) spec))));
+
+		if (spec->modulus <= 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("modulus for hash partition must be a positive integer")));
+
+		Assert(spec->remainder >= 0);
+
+		if (spec->remainder >= spec->modulus)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("remainder for hash partition must be less than modulus")));
+	}
+	else if (strategy == PARTITION_STRATEGY_LIST)
 	{
 		ListCell   *cell;
 		char	   *colname;
@@ -5053,6 +5079,13 @@
 					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
 					 errmsg("TO must specify exactly one value per partitioning column")));
 
+		/*
+		 * Once we see MINVALUE or MAXVALUE for one column, the remaining
+		 * columns must be the same.
+		 */
+		validateInfiniteBounds(pstate, spec->lowerdatums);
+		validateInfiniteBounds(pstate, spec->upperdatums);
+
 		/* Transform all the constants */
 		i = j = 0;
 		result_spec->lowerdatums = result_spec->upperdatums = NIL;
@@ -5124,6 +5157,46 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 	return result_spec;
 }
 
+/*
+ * validateInfiniteBounds
+ *
+ * Check that a MAXVALUE or MINVALUE specification in a partition bound is
+ * followed only by more of the same.
+ */
+static void
+validateInfiniteBounds(ParseState *pstate, List *blist)
+{
+	ListCell   *lc;
+	PartitionRangeDatumKind kind = PARTITION_RANGE_DATUM_VALUE;
+
+	foreach(lc, blist)
+	{
+		PartitionRangeDatum *prd = castNode(PartitionRangeDatum, lfirst(lc));
+
+		if (kind == prd->kind)
+			continue;
+
+		switch (kind)
+		{
+			case PARTITION_RANGE_DATUM_VALUE:
+				kind = prd->kind;
+				break;
+
+			case PARTITION_RANGE_DATUM_MAXVALUE:
+				ereport(ERROR,
+						(errcode(ERRCODE_DATATYPE_MISMATCH),
+						 errmsg("every bound following MAXVALUE must also be MAXVALUE"),
+						 parser_errposition(pstate, exprLocation((Node *) prd))));
+
+			case PARTITION_RANGE_DATUM_MINVALUE:
+				ereport(ERROR,
+						(errcode(ERRCODE_DATATYPE_MISMATCH),
+						 errmsg("every bound following MINVALUE must also be MINVALUE"),
+						 parser_errposition(pstate, exprLocation((Node *) prd))));
+		}
+	}
+}
+
 /*
  * Transform one constant in a partition bound spec
  */
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 2b83875e..984ace45 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -1645,7 +1645,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
  *
  * Returns the partition key specification, ie, the following:
  *
- * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...])
+ * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...])
  */
 Datum
 pg_get_partkeydef(PG_FUNCTION_ARGS)
@@ -1749,6 +1749,10 @@ pg_get_partkeydef_worker(Oid relid, int prettyFlags,
 	switch (form->partstrat)
 	{
+		case PARTITION_STRATEGY_HASH:
+			if (!attrsOnly)
+				appendStringInfo(&buf, "HASH");
+			break;
 		case PARTITION_STRATEGY_LIST:
 			if (!attrsOnly)
 				appendStringInfo(&buf, "LIST");
@@ -9379,6 +9383,15 @@ get_rule_expr(Node *node, deparse_context *context,
 		switch (spec->strategy)
 		{
+			case PARTITION_STRATEGY_HASH:
+				Assert(spec->modulus > 0 && spec->remainder >= 0);
+				Assert(spec->modulus > spec->remainder);
+
+				appendStringInfoString(buf, "FOR VALUES");
+				appendStringInfo(buf, " WITH (modulus %d, remainder %d)",
+								 spec->modulus, spec->remainder);
+				break;
+
 			case PARTITION_STRATEGY_LIST:
 				Assert(spec->listdatums != NIL);
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 66aebfe9..f6acc9f0 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -32,6 +32,7 @@
 #include <sys/file.h>
 #include <unistd.h>
 
+#include "access/hash.h"
 #include "access/htup_details.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -981,6 +982,7 @@ RelationBuildPartitionKey(Relation relation)
 	Datum		datum;
 	MemoryContext partkeycxt,
 				oldcxt;
+	int16		procnum;
 
 	tuple = SearchSysCache1(PARTRELID,
 							ObjectIdGetDatum(RelationGetRelid(relation)));
@@ -1060,6 +1062,10 @@ RelationBuildPartitionKey(Relation relation)
 	key->parttypalign = (char *) palloc0(key->partnatts * sizeof(char));
 	key->parttypcoll = (Oid *) palloc0(key->partnatts * sizeof(Oid));
 
+	/* For the hash partitioning, an extended hash function will be used. */
+	procnum = (key->strategy == PARTITION_STRATEGY_HASH) ?
+		HASHEXTENDED_PROC : BTORDER_PROC;
+
 	/* Copy partattrs and fill other per-attribute info */
 	memcpy(key->partattrs, attrs, key->partnatts * sizeof(int16));
 	partexprs_item = list_head(key->partexprs);
@@ -1080,18 +1086,20 @@ RelationBuildPartitionKey(Relation relation)
 		key->partopfamily[i] = opclassform->opcfamily;
 		key->partopcintype[i] = opclassform->opcintype;
 
-		/*
-		 * A btree support function covers the cases of list and range methods
-		 * currently supported.
-		 */
+		/* Get a support function for the specified opfamily and datatypes */
 		funcid = get_opfamily_proc(opclassform->opcfamily,
 								   opclassform->opcintype,
 								   opclassform->opcintype,
-								   BTORDER_PROC);
-		if (!OidIsValid(funcid))	/* should not happen */
-			elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
-				 BTORDER_PROC, opclassform->opcintype, opclassform->opcintype,
-				 opclassform->opcfamily);
+								   procnum);
+		if (!OidIsValid(funcid))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator class \"%s\" of access method %s is missing support function %d for data type \"%s\"",
+							NameStr(opclassform->opcname),
+							(key->strategy == PARTITION_STRATEGY_HASH) ?
+							"hash" : "btree",
+							procnum,
+							format_type_be(opclassform->opcintype))));
 
 		fmgr_info(funcid, &key->partsupfunc[i]);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 4ce5a90e..49305e4a 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2074,7 +2074,7 @@ psql_completion(const char *text, int start, int end)
 	else if (TailMatches3("ATTACH", "PARTITION", MatchAny))
 		COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT");
 	else if (TailMatches2("FOR", "VALUES"))
-		COMPLETE_WITH_LIST2("FROM (", "IN (");
+		COMPLETE_WITH_LIST3("FROM (", "IN (", "WITH (");
 
 	/*
 	 * If we have ALTER TABLE <foo> DETACH PARTITION, provide a list of
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 454a940a..8acc01a8 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -19,6 +19,9 @@
 #include "parser/parse_node.h"
 #include "utils/rel.h"
 
+/* Seed for the extended hash function */
+#define HASH_PARTITION_SEED UINT64CONST(0x7A5B22367996DCFD)
+
 /*
  * PartitionBoundInfo encapsulates a set of partition bounds.
It is usually * associated with partitioned tables as part of its partition descriptor. @@ -74,6 +77,8 @@ extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2); +extern PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, + PartitionKey key); extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index e5bcf8ae..27c9ef26 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5707,6 +5707,9 @@ DATA(insert OID = 3353 ( pg_ls_logdir PGNSP PGUID 12 10 20 0 0 DESCR("list files in the log directory"); DATA(insert OID = 3354 ( pg_ls_waldir PGNSP PGUID 12 10 20 0 0 f f f f t t v s 0 0 2249 "" "{25,20,1184}" "{o,o,o}" "{name,size,modification}" _null_ _null_ pg_ls_waldir _null_ _null_ _null_ )); DESCR("list of files in the WAL directory"); +/* hash partitioning constraint function */ +DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ _null_ _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); +DESCR("hash partition CHECK constraint"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index c508a87d..983a1ab0 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -849,7 +849,8 @@ typedef struct PartitionBy typedef struct PartitionSpec { NodeTag type; - char *strategy; /* partitioning strategy ('list' or 'range') */ + char *strategy; /* partitioning strategy ('hash', 'list' or + * 'range') */ List *partParams; /* List of PartitionElems */ #ifdef __TBASE__ PartitionBy *interval; /* used for interval partition */ @@ -858,6 +859,7 @@ typedef struct PartitionSpec } PartitionSpec; /* Internal codes for partitioning strategies */ +#define PARTITION_STRATEGY_HASH 'h' #define PARTITION_STRATEGY_LIST 'l' #define PARTITION_STRATEGY_RANGE 'r' #ifdef __TBASE__ @@ -878,6 +880,10 @@ typedef struct PartitionBoundSpec char strategy; /* see PARTITION_STRATEGY codes above */ bool is_default; /* is it a default partition bound? 
*/ + /* Partitioning info for HASH strategy: */ + int modulus; + int remainder; + /* Partitioning info for LIST strategy: */ List *listdatums; /* List of Consts (or A_Consts in raw tree) */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d112f403..455cee74 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3304,6 +3304,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check that an existing table can be attached as a default partition CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; @@ -3503,6 +3504,59 @@ CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); INFO: partition constraint for table "quuux_default1" is implied by existing constraints DROP TABLE quuux; +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. +CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION 
hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3514,12 +3568,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3623,6 +3682,9 @@ SELECT * FROM list_parted; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 9d508790..357e16da 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index c25de7bc..88f9f851 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 031cc211..345150e0 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 982e28f0..4ae86d8c 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -343,11 +343,11 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (const_func()); ERROR: cannot use constant expression as partition key DROP FUNCTION const_func(); --- only accept "list" and "range" as partitioning strategy +-- only accept valid partitioning strategy CREATE TABLE partitioned ( a int -) PARTITION BY HASH (a); -ERROR: unrecognized partitioning strategy "hash" +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" -- specified column must be present in the table CREATE TABLE partitioned ( a int @@ -470,6 +470,11 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); ERROR: invalid bound specification for a list partition LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... + ^ -- check default partition cannot be created more than once CREATE TABLE part_default PARTITION OF list_parted DEFAULT; CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; @@ -512,6 +517,11 @@ CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); ERROR: invalid bound specification for a range partition LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ -- each of start and end bounds must have same number of values as the -- length of the partition key CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); @@ -521,6 +531,37 @@ ERROR: TO must specify exactly one value per partitioning column -- cannot specify null values in range bounds CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... 
+ ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +-- previous modulus 50 is factor of 150 but this modulus is not factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',... + ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition -- check if compatible with the specified parent -- cannot create as partition of a non-partitioned table CREATE TABLE unparted ( @@ -528,6 +569,8 @@ CREATE TABLE unparted ( ); CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned DROP TABLE unparted; -- cannot create a permanent rel as partition of a temp rel CREATE TEMP TABLE temp_parted ( @@ -628,6 +671,23 @@ CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- more specific ranges CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); ERROR: partition "fail_part" would overlap partition "part10" +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be 
less than modulus -- check schema propagation from parent CREATE TABLE parted ( a text, @@ -732,9 +792,17 @@ Check constraints: "check_a" CHECK (length(a) > 0) Number of partitions: 3 (Use \d+ to list them.) +\d hash_parted + Table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) + -- check that we get the expected partition constraints CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); -CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); \d+ unbounded_range_part Table "public.unbounded_range_part" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -742,13 +810,13 @@ CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MI a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0) +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) Distribute By: HASH(a) Location Nodes: ALL DATANODES DROP TABLE unbounded_range_part; -CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0); +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); \d+ range_parted4_1 Table "public.range_parted4_1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -756,7 +824,7 @@ CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALU a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0) +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -774,7 +842,7 @@ Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS N Distribute By: HASH(a) Location Nodes: ALL DATANODES -CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0); +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); \d+ range_parted4_3 Table "public.range_parted4_3" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -782,7 +850,7 @@ CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, M a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0) +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) Distribute By: HASH(a) 
Location Nodes: ALL DATANODES @@ -790,6 +858,8 @@ Location Nodes: ALL DATANODES DROP TABLE range_parted4; -- cleanup DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE hash_parted; +DROP TABLE hash_parted2; -- comments on partitioned tables columns CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index cabe7df5..6e287dc4 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2166,12 +2166,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------------ diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index c087db53..ff38ed79 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2160,12 +2160,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------ diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 9b61d3c6..8d97e116 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2131,12 +2131,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the 
partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------------ diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index d0ff897f..402c6a51 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2147,12 +2147,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------ diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 9d5b125e..96b99abb 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -384,8 +384,54 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p part_null | | 1 | 1 (9 rows) +-- direct partition inserts should check hash partition bound constraint +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); +create table hash_parted ( + a int +) partition by hash (a custom_opclass); +create table hpart0 partition of hash_parted for values with (modulus 4, remainder 0); +create table hpart1 partition of hash_parted for values with (modulus 4, remainder 1); +create table hpart2 partition of hash_parted for values with 
(modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 4, remainder 3); +insert into hash_parted values(generate_series(1,10)); +-- direct insert of values divisible by 4 - ok; +insert into hpart0 values(12),(16); +-- fail; +insert into hpart0 values(11); +ERROR: new row for relation "hpart0" violates partition constraint +DETAIL: Failing row contains (11). +-- 11 % 4 -> 3 remainder i.e. valid data for hpart3 partition +insert into hpart3 values(11); +-- view data +select tableoid::regclass as part, a, a%4 as "remainder = a % 4" +from hash_parted order by part; + part | a | remainder = a % 4 +--------+----+------------------- + hpart0 | 4 | 0 + hpart0 | 8 | 0 + hpart0 | 12 | 0 + hpart0 | 16 | 0 + hpart1 | 1 | 1 + hpart1 | 5 | 1 + hpart1 | 9 | 1 + hpart2 | 2 | 2 + hpart2 | 6 | 2 + hpart2 | 10 | 2 + hpart3 | 3 | 3 + hpart3 | 7 | 3 + hpart3 | 11 | 3 +(13 rows) + -- cleanup drop table range_parted, list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); -- test that a default partition added as the first partition accepts any value -- including null create table list_parted (a int) partition by list (a); @@ -607,15 +653,28 @@ revoke all on key_desc from someone_else; revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...partition of mcrparted for values from (minvalue, 0, 0) to (... + ^ +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +ERROR: every bound following MAXVALUE must also be MAXVALUE +LINE 1: ...r values from (10, 6, minvalue) to (10, maxvalue, minvalue); + ^ +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...ition of mcrparted for values from (21, minvalue, 0) to (30,... 
+ ^ -- check multi-column range partitioning expression enforces the same -- constraint as what tuple-routing would determine it to be -create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue); create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10); -create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); -create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue); -create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); +create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); @@ -696,14 +755,14 @@ drop table brtrigpartcon; drop function brtrigpartcon1trigf(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); -create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', minvalue); +create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue); create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue); create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0); create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10); create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue); create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue); -create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0); +create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue); \d+ mcrparted Table "public.mcrparted" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -711,14 +770,14 @@ create table mcrparted8_ge_d partition of mcrparted for values from ('d', minval a | text | | | | extended | | b | integer | | | | plain | | Partition key: RANGE (a, b) -Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE), +Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE), mcrparted2_b FOR VALUES FROM ('b', MINVALUE) TO ('c', MINVALUE), mcrparted3_c_to_common FOR VALUES FROM ('c', MINVALUE) TO ('common', MINVALUE), mcrparted4_common_lt_0 FOR VALUES FROM ('common', MINVALUE) TO ('common', 0), mcrparted5_common_0_to_10 FOR VALUES FROM ('common', 0) TO ('common', 10), mcrparted6_common_ge_10 FOR VALUES FROM ('common', 10) TO ('common', 
MAXVALUE), mcrparted7_gt_common_lt_d FOR VALUES FROM ('common', MAXVALUE) TO ('d', MINVALUE), - mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) + mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -728,7 +787,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE) +Partition of: mcrparted FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a < 'b'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -805,7 +864,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) +Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a >= 'd'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 66cffedd..3528769d 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -607,15 +607,28 @@ revoke all on key_desc from someone_else; revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...partition of mcrparted for values from (minvalue, 0, 0) to (... + ^ +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +ERROR: every bound following MAXVALUE must also be MAXVALUE +LINE 1: ...r values from (10, 6, minvalue) to (10, maxvalue, minvalue); + ^ +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...ition of mcrparted for values from (21, minvalue, 0) to (30,... 
+ ^
 -- check multi-column range partitioning expression enforces the same
 -- constraint as what tuple-routing would determine it to be
-create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c);
-create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0);
+create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue);
 create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10);
-create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0);
+create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue);
 create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10);
-create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue);
-create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0);
+create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue);
+create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue);
 -- routed to mcrparted0
 insert into mcrparted values (0, 1, 1);
 insert into mcrparted0 values (0, 1, 1);
@@ -696,14 +709,14 @@ drop table brtrigpartcon;
 drop function brtrigpartcon1trigf();
 -- check multi-column range partitioning with minvalue/maxvalue constraints
 create table mcrparted (a text, b int) partition by range(a, b);
-create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', minvalue);
+create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue);
 create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue);
 create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue);
 create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0);
 create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10);
 create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue);
 create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue);
-create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0);
+create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue);
 \d+ mcrparted
 Table "public.mcrparted"
 Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
@@ -711,14 +724,14 @@ create table mcrparted8_ge_d partition of mcrparted for values from ('d', minval
 a | text | | | | extended | |
 b | integer | | | | plain | |
 Partition key: RANGE (a, b)
-Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE),
+Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE),
 mcrparted2_b FOR VALUES FROM ('b', MINVALUE) TO ('c', MINVALUE),
 mcrparted3_c_to_common FOR VALUES FROM ('c', MINVALUE) TO ('common', MINVALUE),
 mcrparted4_common_lt_0 FOR VALUES FROM ('common', MINVALUE) TO ('common', 0),
 mcrparted5_common_0_to_10 FOR VALUES FROM ('common', 0) TO ('common', 10),
 mcrparted6_common_ge_10 FOR VALUES FROM ('common', 10) TO ('common', 
MAXVALUE), mcrparted7_gt_common_lt_d FOR VALUES FROM ('common', MAXVALUE) TO ('d', MINVALUE), - mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) + mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -728,7 +741,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE) +Partition of: mcrparted FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a < 'b'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -805,7 +818,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) +Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a >= 'd'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 234b8b53..1c8cdb34 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -1256,6 +1256,87 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 One-Time Filter: false (14 rows) +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: t1.c, t3.c + -> HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Seq Scan on pht1_p1 t1 + 
-> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Seq Scan on pht2_p1 t2 + -> Hash + -> Seq Scan on pht1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Seq Scan on pht1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Seq Scan on pht2_p2 t2_1 + -> Hash + -> Seq Scan on pht1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Seq Scan on pht1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Seq Scan on pht2_p3 t2_2 + -> Hash + -> Seq Scan on pht1_e_p3 t3_2 +(33 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + -- -- multiple levels of partitioning -- diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 2989db8f..0aae60ac 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -250,6 +250,35 @@ ERROR: new row for relation "list_default" violates partition constraint DETAIL: Failing row contains (a, 10). -- ok update list_default set a = 'x' where a = 'd'; +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); +-- fail +update hpart1 set a = 3, b=4 where a = 1; +ERROR: new row for relation "hpart1" violates partition constraint +DETAIL: Failing row contains (3, 4). 
+update hash_parted set b = b - 1 where b = 1; +ERROR: new row for relation "hpart1" violates partition constraint +DETAIL: Failing row contains (1, 0). +-- ok +update hash_parted set b = b + 8 where b = 1; -- cleanup drop table range_parted; drop table list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 97d2d9bf..a32521f1 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2112,6 +2112,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg -- check that the new partition won't overlap with an existing partition CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +DROP TABLE fail_part; -- check that an existing table can be attached as a default partition CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; @@ -2307,6 +2308,62 @@ CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); DROP TABLE quuux; +-- check validation when attaching hash partitions + +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. +CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +DROP TABLE fail_part; + +-- check validation when attaching hash partitions + +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); + +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); + +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); + +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); + +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); + +-- check that the table being attach is with valid modulus and 
remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +DROP TABLE fail_part; + -- -- DETACH PARTITION -- @@ -2318,12 +2375,16 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; +ALTER TABLE hash_parted DETACH PARTITION hpart_4; -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ALTER TABLE list_parted2 DETACH PARTITION part_1; +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +DROP TABLE not_a_part; + -- check that, after being detached, attinhcount/coninhcount is dropped to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -2400,6 +2461,9 @@ SELECT * FROM list_parted; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 1a74fdd1..c1cf6ee1 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -352,10 +352,10 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (const_func()); DROP FUNCTION const_func(); --- only accept "list" and "range" as partitioning strategy +-- only accept valid partitioning strategy CREATE TABLE partitioned ( - a int -) PARTITION BY HASH (a); + a int +) PARTITION BY MAGIC (a); -- specified column must be present in the table CREATE TABLE partitioned ( @@ -448,6 +448,8 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN ('1'::int); CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); -- trying to specify range for list partitioned table CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); -- check default partition cannot be created more than once CREATE TABLE part_default PARTITION OF list_parted DEFAULT; @@ -483,6 +485,8 @@ CREATE TABLE range_parted ( -- trying to specify list for range partitioned table CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); -- each of start and end bounds must have same number of values as the -- length of the partition key CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); @@ -491,6 +495,28 @@ CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', -- cannot specify null values in range bounds CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); + +-- 
check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +-- previous modulus 50 is factor of 150 but this modulus is not factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; + -- check if compatible with the specified parent -- cannot create as partition of a non-partitioned table @@ -498,6 +524,7 @@ CREATE TABLE unparted ( a int ); CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); DROP TABLE unparted; -- cannot create a permanent rel as partition of a temp rel @@ -587,6 +614,21 @@ CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- more specific ranges CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); + -- check schema propagation from parent CREATE TABLE parted ( @@ -640,22 +682,25 @@ CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); -- output could vary depending on the order in which partition oids are -- returned. 
\d parted +\d hash_parted -- check that we get the expected partition constraints CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); -CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); \d+ unbounded_range_part DROP TABLE unbounded_range_part; -CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0); +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); \d+ range_parted4_1 CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); \d+ range_parted4_2 -CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0); +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); \d+ range_parted4_3 DROP TABLE range_parted4; -- cleanup DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE hash_parted; +DROP TABLE hash_parted2; -- comments on partitioned tables columns CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index ff11dbcb..58f7f523 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -734,12 +734,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index bbfc03c4..ef7abf94 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -221,8 +221,41 @@ insert into list_parted select 'gg', s.a from generate_series(1, 9) s(a); insert into list_parted (b) values (1); select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_parted group by 1, 2 order by 1; +-- direct partition inserts should check hash partition bound constraint + +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or 
replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); + +create table hash_parted ( + a int +) partition by hash (a custom_opclass); +create table hpart0 partition of hash_parted for values with (modulus 4, remainder 0); +create table hpart1 partition of hash_parted for values with (modulus 4, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 4, remainder 3); + +insert into hash_parted values(generate_series(1,10)); + +-- direct insert of values divisible by 4 - ok; +insert into hpart0 values(12),(16); +-- fail; +insert into hpart0 values(11); +-- 11 % 4 -> 3 remainder i.e. valid data for hpart3 partition +insert into hpart3 values(11); + +-- view data +select tableoid::regclass as part, a, a%4 as "remainder = a % 4" +from hash_parted order by part; + -- cleanup drop table range_parted, list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); -- test that a default partition added as the first partition accepts any value -- including null @@ -363,15 +396,20 @@ revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); + -- check multi-column range partitioning expression enforces the same -- constraint as what tuple-routing would determine it to be -create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue); create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10); -create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); -create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue); -create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); +create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); @@ -436,14 +474,14 @@ drop function brtrigpartcon1trigf(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); -create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', 
minvalue); +create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue); create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue); create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0); create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10); create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue); create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue); -create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0); +create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue); \d+ mcrparted \d+ mcrparted1_lt_b diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index ca525d99..2316bbdc 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -229,6 +229,38 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; + +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; + +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; + +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + -- -- multiple levels of partitioning -- diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 42c5e405..9d673de4 100644 --- 
a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -148,6 +148,34 @@ update list_default set a = 'a' where a = 'd'; -- ok update list_default set a = 'x' where a = 'd'; +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); + +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); + +-- fail +update hpart1 set a = 3, b=4 where a = 1; +update hash_parted set b = b - 1 where b = 1; +-- ok +update hash_parted set b = b + 8 where b = 1; + -- cleanup drop table range_parted; drop table list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index dedefbdf..ba6ce916 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1562,6 +1562,7 @@ PartitionDispatch PartitionDispatchData PartitionElem PartitionKey +PartitionHashBound PartitionListValue PartitionRangeBound PartitionRangeDatum From 4932c4931e3de93f30dd4eaeed00a8b2ddd78570 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:14:34 +0800 Subject: [PATCH 204/578] Centralize executor-related partitioning code. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 353 ++-------------- src/backend/commands/copy.c | 1 + src/backend/executor/Makefile | 2 +- src/backend/executor/execMain.c | 261 +----------- src/backend/executor/execPartition.c | 560 +++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 1 + src/include/catalog/partition.h | 48 +-- src/include/executor/execPartition.h | 65 +++ src/include/executor/executor.h | 13 +- 9 files changed, 666 insertions(+), 638 deletions(-) create mode 100644 src/backend/executor/execPartition.c create mode 100644 src/include/executor/execPartition.h diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 9832a333..ae0bbfbe 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -170,8 +170,6 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); -static void get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids); static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); @@ -1565,148 +1563,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* - * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree - * - * The number of elements in the returned array (that is, the number of - * PartitionDispatch objects for the partitioned tables in the partition tree) - * is returned in *num_parted and a list of the OIDs of all the leaf - * partitions of rel is returned in *leaf_part_oids. - * - * All the relations in the partition tree (including 'rel') must have been - * locked (using at least the AccessShareLock) by the caller. - */ -PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids) -{ - List *pdlist = NIL; - PartitionDispatchData **pd; - ListCell *lc; - int i; - - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - - *num_parted = 0; - *leaf_part_oids = NIL; - - get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); - *num_parted = list_length(pdlist); - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - i = 0; - foreach(lc, pdlist) - { - pd[i++] = lfirst(lc); - } - - return pd; - } - - /* - * get_partition_dispatch_recurse - * Recursively expand partition tree rooted at rel - * - * As the partition tree is expanded in a depth-first manner, we mantain two - * global lists: of PartitionDispatch objects corresponding to partitioned - * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. - * - * Note that the order of OIDs of leaf partitions in leaf_part_oids matches - * the order in which the planner's expand_partitioned_rtentry() processes - * them. It's not necessarily the case that the offsets match up exactly, - * because constraint exclusion might prune away some partitions on the - * planner side, whereas we'll always have the complete list; but unpruned - * partitions will appear in the same order in the plan as they are returned - * here. 
- */ -static void -get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids) -{ - TupleDesc tupdesc = RelationGetDescr(rel); - PartitionDesc partdesc = RelationGetPartitionDesc(rel); - PartitionKey partkey = RelationGetPartitionKey(rel); - PartitionDispatch pd; - int i; - - check_stack_depth(); - - /* Build a PartitionDispatch for this table and add it to *pds. */ - pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - *pds = lappend(*pds, pd); - pd->reldesc = rel; - pd->key = partkey; - pd->keystate = NIL; - pd->partdesc = partdesc; - if (parent != NULL) - { - /* - * For every partitioned table other than the root, we must store a - * tuple table slot initialized with its tuple descriptor and a tuple - * conversion map to convert a tuple from its parent's rowtype to its - * own. That is to make sure that we are looking at the correct row - * using the correct tuple descriptor when computing its partition key - * for tuple routing. - */ - pd->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd->tupslot = NULL; - pd->tupmap = NULL; - } - - /* - * Go look at each partition of this table. If it's a leaf partition, - * simply add its OID to *leaf_part_oids. If it's a partitioned table, - * recursively call get_partition_dispatch_recurse(), so that its - * partitions are processed as well and a corresponding PartitionDispatch - * object gets added to *pds. - * - * About the values in pd->indexes: for a leaf partition, it contains the - * leaf partition's position in the global list *leaf_part_oids minus 1, - * whereas for a partitioned table partition, it contains the partition's - * position in the global list *pds multiplied by -1. The latter is - * multiplied by -1 to distinguish partitioned tables from leaf partitions - * when going through the values in pd->indexes. So, for example, when - * using it during tuple-routing, encountering a value >= 0 means we found - * a leaf partition. It is immediately returned as the index in the array - * of ResultRelInfos of all the leaf partitions, using which we insert the - * tuple into that leaf partition. A negative value means we found a - * partitioned table. The value multiplied by -1 is returned as the index - * in the array of PartitionDispatch objects of all partitioned tables in - * the tree. This value is used to continue the search in the next level - * of the partition tree. - */ - pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - for (i = 0; i < partdesc->nparts; i++) - { - Oid partrelid = partdesc->oids[i]; - - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd->indexes[i] = list_length(*leaf_part_oids) - 1; - } - else - { - /* - * We assume all tables in the partition tree were already locked - * by the caller. - */ - Relation partrel = heap_open(partrelid, NoLock); - - pd->indexes[i] = -list_length(*pds); - get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); - } - } -} - /* Module-local functions */ /* @@ -2646,134 +2502,21 @@ generate_partition_qual(Relation rel) return result; } -/* ---------------- - * FormPartitionKeyDatum - * Construct values[] and isnull[] arrays for the partition key - * of a tuple. 
- * - * pd Partition dispatch object of the partitioned table - * slot Heap tuple from which to extract partition key - * estate executor state for evaluating any partition key - * expressions (must be non-NULL) - * values Array of partition key Datums (output area) - * isnull Array of is-null indicators (output area) - * - * the ecxt_scantuple slot of estate's per-tuple expr context must point to - * the heap tuple passed in. - * ---------------- - */ -void -FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull) -{// #lizard forgives - ListCell *partexpr_item; - int i; - - if (pd->key->partexprs != NIL && pd->keystate == NIL) - { - /* Check caller has set up context correctly */ - Assert(estate != NULL && - GetPerTupleExprContext(estate)->ecxt_scantuple == slot); - - /* First time through, set up expression evaluation state */ - pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); - } - - partexpr_item = list_head(pd->keystate); - for (i = 0; i < pd->key->partnatts; i++) - { - AttrNumber keycol = pd->key->partattrs[i]; - Datum datum; - bool isNull; - - if (keycol != 0) - { - /* Plain column; get the value directly from the heap tuple */ - datum = slot_getattr(slot, keycol, &isNull); - } - else - { - /* Expression; need to evaluate it */ - if (partexpr_item == NULL) - elog(ERROR, "wrong number of partition key expressions"); - datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), - GetPerTupleExprContext(estate), - &isNull); - partexpr_item = lnext(partexpr_item); - } - values[i] = datum; - isnull[i] = isNull; - } - - if (partexpr_item != NULL) - elog(ERROR, "wrong number of partition key expressions"); -} - /* * get_partition_for_tuple - * Finds a leaf partition for tuple contained in *slot + * Finds partition of relation which accepts the partition key specified + * in values and isnull * - * Returned value is the sequence number of the leaf partition thus found, - * or -1 if no leaf partition is found for the tuple. *failed_at is set - * to the OID of the partitioned table whose partition was not found in - * the latter case. + * Return value is index of the partition (>= 0 and < partdesc->nparts) if one + * found or -1 if none found. */ int -get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot) -{// #lizard forgives - PartitionDispatch parent; - Datum values[PARTITION_MAX_KEYS]; - bool isnull[PARTITION_MAX_KEYS]; - int result; - ExprContext *ecxt = GetPerTupleExprContext(estate); - TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; - - /* start with the root partitioned table */ - parent = pd[0]; - while (true) - { - PartitionKey key = parent->key; - PartitionDesc partdesc = parent->partdesc; - TupleTableSlot *myslot = parent->tupslot; - TupleConversionMap *map = parent->tupmap; - int cur_index = -1; - - if (myslot != NULL && map != NULL) +get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { - HeapTuple tuple = ExecFetchSlotTuple(slot); - - ExecClearTuple(myslot); - tuple = do_convert_tuple(tuple, map, NULL); - ExecStoreTuple(tuple, myslot, InvalidBuffer, true); - slot = myslot; - } - - /* Quick exit */ - if (partdesc->nparts == 0) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; - } - - /* - * Extract partition key from tuple. 
Expression evaluation machinery - * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to - * point to the correct tuple slot. The slot might have changed from - * what was used for the parent table if the table of the current - * partitioning level has different tuple descriptor from the parent. - * So update ecxt_scantuple accordingly. - */ - ecxt->ecxt_scantuple = slot; - FormPartitionKeyDatum(parent, slot, estate, values, isnull); + int bound_offset; + int part_index = -1; + PartitionKey key = RelationGetPartitionKey(relation); + PartitionDesc partdesc = RelationGetPartitionDesc(relation); /* Route as appropriate based on partitioning strategy. */ switch (key->strategy) @@ -2782,32 +2525,29 @@ get_partition_for_tuple(PartitionDispatch *pd, { PartitionBoundInfo boundinfo = partdesc->boundinfo; int greatest_modulus = get_greatest_modulus(boundinfo); - uint64 rowHash = compute_hash_value(key, values, - isnull); + uint64 rowHash = compute_hash_value(key, values, isnull); - cur_index = boundinfo->indexes[rowHash % greatest_modulus]; + part_index = boundinfo->indexes[rowHash % greatest_modulus]; } break; case PARTITION_STRATEGY_LIST: - if (isnull[0]) { if (partition_bound_accepts_nulls(partdesc->boundinfo)) - cur_index = partdesc->boundinfo->null_index; + part_index = partdesc->boundinfo->null_index; } else { bool equal = false; - int cur_offset; - cur_offset = partition_bound_bsearch(key, + bound_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, false, &equal); - if (cur_offset >= 0 && equal) - cur_index = partdesc->boundinfo->indexes[cur_offset]; + if (bound_offset >= 0 && equal) + part_index = partdesc->boundinfo->indexes[bound_offset]; } break; @@ -2815,7 +2555,6 @@ get_partition_for_tuple(PartitionDispatch *pd, { bool equal = false, range_partkey_has_null = false; - int cur_offset; int i; /* @@ -2829,36 +2568,26 @@ get_partition_for_tuple(PartitionDispatch *pd, partition_bound_has_default(partdesc->boundinfo)) { range_partkey_has_null = true; - break; - } - else if (isnull[i]) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; + part_index = partdesc->boundinfo->default_index; } } - /* - * No need to search for partition, as the null key will - * be routed to the default partition. - */ - if (range_partkey_has_null) - break; - - cur_offset = partition_bound_bsearch(key, + if (!range_partkey_has_null) + { + bound_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, false, &equal); /* - * The offset returned is such that the bound at - * cur_offset is less than or equal to the tuple value, so - * the bound at offset+1 is the upper bound. + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. */ - cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + part_index = partdesc->boundinfo->indexes[bound_offset + 1]; + } } break; @@ -2868,37 +2597,13 @@ get_partition_for_tuple(PartitionDispatch *pd, } /* - * cur_index < 0 means we failed to find a partition of this parent. + * part_index < 0 means we failed to find a partition of this parent. * Use the default partition, if there is one. */ - if (cur_index < 0) - cur_index = partdesc->boundinfo->default_index; + if (part_index < 0) + part_index = partdesc->boundinfo->default_index; - /* - * If cur_index is still less than 0 at this point, there's no - * partition for this tuple. 
Otherwise, we either found the leaf - * partition, or a child partitioned table through which we have to - * route the tuple. - */ - if (cur_index < 0) - { - result = -1; - *failed_at = parent; - *failed_slot = slot; - break; - } - else if (parent->indexes[cur_index] >= 0) - { - result = parent->indexes[cur_index]; - break; - } - else - parent = pd[-parent->indexes[cur_index]]; - } - -error_exit: - ecxt->ecxt_scantuple = ecxt_scantuple_old; - return result; + return part_index; } /* diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 9e5aec9f..e376f863 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -38,6 +38,7 @@ #include "commands/copy.h" #include "commands/defrem.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index fef60fb4..b7d58365 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -14,7 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \ execGrouping.o execIndexing.o execJunk.o \ - execMain.o execParallel.o execProcnode.o \ + execMain.o execParallel.o execPartition.o execProcnode.o \ execReplication.o execScan.o execSRF.o execTuples.o \ execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \ nodeBitmapAnd.o nodeBitmapOr.o \ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 776c9d41..a4978497 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -44,7 +44,6 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/partition.h" -#include "catalog/pg_inherits_fn.h" #include "catalog/pg_publication.h" #ifdef _MLS_ #include "catalog/pg_class.h" @@ -133,14 +132,8 @@ static char *ExecBuildSlotValueDescription(Oid reloid, TupleDesc tupdesc, Bitmapset *modifiedCols, int maxfieldlen); -static char *ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen); static void EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree); -static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); #ifdef _MLS_ static int ExecCheckRTERelkindextPerms(RangeTblEntry *rte); #endif @@ -2215,8 +2208,10 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. + * + * Exported in executor.h for outside use. 
*/ -static void +void ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { @@ -3657,256 +3652,6 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->origslot = NULL; } -/* - * ExecSetupPartitionTupleRouting - set up information needed during - * tuple routing for partitioned tables - * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. - * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays - * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. - */ -void -ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) -{ - TupleDesc tupDesc = RelationGetDescr(rel); - List *leaf_parts; - ListCell *cell; - int i; - ResultRelInfo *leaf_part_rri; - - /* - * Get the information about the partition tree after locking all the - * partitions. - */ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo *) palloc0(*num_partitions * - sizeof(ResultRelInfo)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); - - /* - * Initialize an empty slot that will be used to manipulate tuples of any - * given partition's rowtype. It is attached to the caller-specified node - * (such as ModifyTableState) and released when the node finishes - * processing. - */ - *partition_tuple_slot = MakeTupleTableSlot(); - - leaf_part_rri = *partitions; - i = 0; - foreach(cell, leaf_parts) - { - Relation partrel; - TupleDesc part_tupdesc; - - /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. - */ - partrel = heap_open(lfirst_oid(cell), NoLock); - part_tupdesc = RelationGetDescr(partrel); - - /* - * Verify result relation is a valid target for the current operation. - */ - CheckValidResultRel(partrel, CMD_INSERT); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. 
- */ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - 0); - - /* - * Open partition indices (remember we do not support ON CONFLICT in - * case of partitioned tables, so we do not need support information - * for speculative insertion) - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, false); - - leaf_part_rri++; - i++; - } -} - -/* - * ExecFindPartition -- Find a leaf partition in the partition tree rooted - * at parent, for the heap tuple contained in *slot - * - * estate must be non-NULL; we'll need it to compute any expressions in the - * partition key(s) - * - * If no leaf partition is found, this routine errors out with the appropriate - * error message, else it returns the leaf partition sequence number returned - * by get_partition_for_tuple() unchanged. - */ -int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, - TupleTableSlot *slot, EState *estate) -{ - int result; - PartitionDispatchData *failed_at; - TupleTableSlot *failed_slot; - - /* - * First check the root table's partition constraint, if any. No point in - * routing the tuple if it doesn't belong in the root table itself. - */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); - - result = get_partition_for_tuple(pd, slot, estate, - &failed_at, &failed_slot); - if (result < 0) - { - Relation failed_rel; - Datum key_values[PARTITION_MAX_KEYS]; - bool key_isnull[PARTITION_MAX_KEYS]; - char *val_desc; - ExprContext *ecxt = GetPerTupleExprContext(estate); - - failed_rel = failed_at->reldesc; - ecxt->ecxt_scantuple = failed_slot; - FormPartitionKeyDatum(failed_at, failed_slot, estate, - key_values, key_isnull); - val_desc = ExecBuildSlotPartitionKeyDescription(failed_rel, - key_values, - key_isnull, - 64); - Assert(OidIsValid(RelationGetRelid(failed_rel))); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("no partition of relation \"%s\" found for row", - RelationGetRelationName(failed_rel)), - val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); - } - - return result; -} - -/* - * BuildSlotPartitionKeyDescription - * - * This works very much like BuildIndexValueDescription() and is currently - * used for building error messages when ExecFindPartition() fails to find - * partition for a row. - */ -static char * -ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen) -{// #lizard forgives - StringInfoData buf; - PartitionKey key = RelationGetPartitionKey(rel); - int partnatts = get_partition_natts(key); - int i; - Oid relid = RelationGetRelid(rel); - AclResult aclresult; - - if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) - return NULL; - - /* If the user has table-level access, just go build the description. */ - aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); - if (aclresult != ACLCHECK_OK) - { - /* - * Step through the columns of the partition key and make sure the - * user has SELECT rights on all of them. - */ - for (i = 0; i < partnatts; i++) - { - AttrNumber attnum = get_partition_col_attnum(key, i); - - /* - * If this partition key column is an expression, we return no - * detail rather than try to figure out what column(s) the - * expression includes and if the user has SELECT rights on them. 
- */ - if (attnum == InvalidAttrNumber || - pg_attribute_aclcheck(relid, attnum, GetUserId(), - ACL_SELECT) != ACLCHECK_OK) - return NULL; - } - } - - initStringInfo(&buf); - appendStringInfo(&buf, "(%s) = (", - pg_get_partkeydef_columns(relid, true)); - - for (i = 0; i < partnatts; i++) - { - char *val; - int vallen; - - if (isnull[i]) - val = "null"; - else - { - Oid foutoid; - bool typisvarlena; - - getTypeOutputInfo(get_partition_col_typid(key, i), - &foutoid, &typisvarlena); - val = OidOutputFunctionCall(foutoid, values[i]); - } - - if (i > 0) - appendStringInfoString(&buf, ", "); - - /* truncate if needed */ - vallen = strlen(val); - if (vallen <= maxfieldlen) - appendStringInfoString(&buf, val); - else - { - vallen = pg_mbcliplen(val, vallen, maxfieldlen); - appendBinaryStringInfo(&buf, val, vallen); - appendStringInfoString(&buf, "..."); - } - } - - appendStringInfoChar(&buf, ')'); - - return buf.data; -} - #ifdef _MLS_ /* * cls user could access cls system tables and original system tables, and no more access limit diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 00000000..d275cefe --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,560 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_inherits_fn.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + +static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); + +/* + * ExecSetupPartitionTupleRouting - set up information needed during + * tuple routing for partitioned tables + * + * Output arguments: + * 'pd' receives an array of PartitionDispatch objects with one entry for + * every partitioned table in the partition tree + * 'partitions' receives an array of ResultRelInfo* objects with one entry for + * every leaf partition in the partition tree + * 'tup_conv_maps' receives an array of TupleConversionMap objects with one + * entry for every leaf partition (required to convert input tuple based + * on the root table's rowtype to a leaf partition's rowtype after tuple + * routing is done) + * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used + * to manipulate any given leaf partition's rowtype after that partition + * is chosen by tuple-routing. 
+ * 'num_parted' receives the number of partitioned tables in the partition + * tree (= the number of entries in the 'pd' output array) + * 'num_partitions' receives the number of leaf partitions in the partition + * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' + * output arrays + * + * Note that all the relations in the partition tree are locked using the + * RowExclusiveLock mode upon return from this function. + */ +void +ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo ***partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions) +{ + TupleDesc tupDesc = RelationGetDescr(rel); + List *leaf_parts; + ListCell *cell; + int i; + ResultRelInfo *leaf_part_rri; + + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); + *num_partitions = list_length(leaf_parts); + *partitions = (ResultRelInfo **) palloc(*num_partitions * + sizeof(ResultRelInfo *)); + *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * + sizeof(TupleConversionMap *)); + + /* + * Initialize an empty slot that will be used to manipulate tuples of any + * given partition's rowtype. It is attached to the caller-specified node + * (such as ModifyTableState) and released when the node finishes + * processing. + */ + *partition_tuple_slot = MakeTupleTableSlot(); + + leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * + sizeof(ResultRelInfo)); + i = 0; + foreach(cell, leaf_parts) + { + Relation partrel; + TupleDesc part_tupdesc; + + /* + * We locked all the partitions above including the leaf partitions. + * Note that each of the relations in *partitions are eventually + * closed by the caller. + */ + partrel = heap_open(lfirst_oid(cell), NoLock); + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for INSERT. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices (remember we do not support ON CONFLICT in + * case of partitioned tables, so we do not need support information + * for speculative insertion) + */ + if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, false); + + estate->es_leaf_result_relations = + lappend(estate->es_leaf_result_relations, leaf_part_rri); + + (*partitions)[i] = leaf_part_rri++; + i++; + } +} + +/* + * ExecFindPartition -- Find a leaf partition in the partition tree rooted + * at parent, for the heap tuple contained in *slot + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition key(s) + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message, else it returns the leaf partition sequence number + * as an index into the array of (ResultRelInfos of) all leaf partitions in + * the partition tree. 
+ */ +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, + TupleTableSlot *slot, EState *estate) +{ + int result; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch parent; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + + /* + * First check the root table's partition constraint, if any. No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate); + + /* start with the root partitioned table */ + parent = pd[0]; + while (true) + { + PartitionDesc partdesc; + TupleTableSlot *myslot = parent->tupslot; + TupleConversionMap *map = parent->tupmap; + int cur_index = -1; + + rel = parent->reldesc; + partdesc = RelationGetPartitionDesc(rel); + + /* + * Convert the tuple to this parent's layout so that we can do certain + * things we do below. + */ + if (myslot != NULL && map != NULL) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + + ExecClearTuple(myslot); + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, myslot, InvalidBuffer, true); + slot = myslot; + } + + /* Quick exit */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(parent, slot, estate, values, isnull); + cur_index = get_partition_for_tuple(rel, values, isnull); + + /* + * cur_index < 0 means we failed to find a partition of this parent. + * cur_index >= 0 means we either found the leaf partition, or the + * next parent to find a partition of. + */ + if (cur_index < 0) + { + result = -1; + break; + } + else if (parent->indexes[cur_index] >= 0) + { + result = parent->indexes[cur_index]; + break; + } + else + parent = pd[-parent->indexes[cur_index]]; + } + + /* A partition was not found. */ + if (result < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + } + + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; +} + +/* + * RelationGetPartitionDispatchInfo + * Returns information necessary to route tuples down a partition tree + * + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. 
+ */ +static PartitionDispatch * +RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids) +{ + List *pdlist = NIL; + PartitionDispatchData **pd; + ListCell *lc; + int i; + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + *num_parted = 0; + *leaf_part_oids = NIL; + + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach(lc, pdlist) + { + pd[i++] = lfirst(lc); + } + + return pd; +} + +/* + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we maintain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + * + * Note that the order of OIDs of leaf partitions in leaf_part_oids matches + * the order in which the planner's expand_partitioned_rtentry() processes + * them. It's not necessarily the case that the offsets match up exactly, + * because constraint exclusion might prune away some partitions on the + * planner side, whereas we'll always have the complete list; but unpruned + * partitions will appear in the same order in the plan as they are returned + * here. + */ +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + check_stack_depth(); + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. + */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; + } + + /* + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. + * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. 
It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied by -1 is returned as the index + * in the array of PartitionDispatch objects of all partitioned tables in + * the tree. This value is used to continue the search in the next level + * of the partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[i] = list_length(*leaf_part_oids) - 1; + } + else + { + /* + * We assume all tables in the partition tree were already locked + * by the caller. + */ + Relation partrel = heap_open(partrelid, NoLock); + + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * BuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * partition for a row. 
+ */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. */ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. + */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendStringInfoString(&buf, val); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4a03adb3..660bfd4b 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -41,6 +41,7 @@ #include "access/htup_details.h" #include "access/xact.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "executor/nodeModifyTable.h" #include "foreign/fdwapi.h" diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 8acc01a8..295e9d22 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -42,37 +42,6 @@ typedef struct PartitionDescData typedef struct PartitionDescData *PartitionDesc; -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) - *----------------------- - */ -typedef struct 
PartitionDispatchData -{ - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; - -typedef struct PartitionDispatchData *PartitionDispatch; - extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, @@ -91,19 +60,6 @@ extern List *map_partition_varattnos(List *expr, int target_varno, extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); -/* For tuple routing */ -extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids); -extern void FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull); -extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); @@ -111,4 +67,8 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); +/* For tuple routing */ +extern int get_partition_for_tuple(Relation relation, Datum *values, + bool *isnull); + #endif /* PARTITION_H */ diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h new file mode 100644 index 00000000..64e5aab4 --- /dev/null +++ b/src/include/executor/execPartition.h @@ -0,0 +1,65 @@ +/*-------------------------------------------------------------------- + * execPartition.h + * POSTGRES partitioning executor interface + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/executor/execPartition.h + *-------------------------------------------------------------------- + */ + +#ifndef EXECPARTITION_H +#define EXECPARTITION_H + +#include "catalog/partition.h" +#include "nodes/execnodes.h" +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to one of its partitions + * + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * get_partition_dispatch_recurse()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; +} 
PartitionDispatchData; + +typedef struct PartitionDispatchData *PartitionDispatch; + +extern void ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo ***partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate); + +#endif /* EXECPARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7fb94908..4ea9ef52 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -206,6 +206,8 @@ extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo); @@ -224,17 +226,6 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); -extern void ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); -extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); From 80e7f9733058c4ffaee6c9fb9c865a2d135c8b02 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:33:15 +0800 Subject: [PATCH 205/578] Set proargmodes for satisfies_hash_partition. 
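For illustration only (hypothetical example, not part of this patch): with
proargmodes set to "{i,i,i,v}", the last argument of
satisfies_hash_partition() is exposed as VARIADIC "any", so, together with
the variadic handling added in the following patch, callers can supply the
partition-key values as a single array when all key columns share a type.
A call against a hypothetical hash-partitioned parent "t" with two int4 key
columns might look like:

    SELECT satisfies_hash_partition('t'::regclass, 4, 0,
                                    variadic array[1, 2]);
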
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/include/catalog/pg_proc.h | 2 +- src/test/regress/expected/type_sanity.out | 11 +++++++++++ src/test/regress/sql/type_sanity.sql | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 27c9ef26..51adc65a 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5708,7 +5708,7 @@ DESCR("list files in the log directory"); DATA(insert OID = 3354 ( pg_ls_waldir PGNSP PGUID 12 10 20 0 0 f f f f t t v s 0 0 2249 "" "{25,20,1184}" "{o,o,o}" "{name,size,modification}" _null_ _null_ pg_ls_waldir _null_ _null_ _null_ )); DESCR("list of files in the WAL directory"); /* hash partitioning constraint function */ -DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ _null_ _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); +DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ "{i,i,i,v}" _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); DESCR("hash partition CHECK constraint"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index 7b200bae..16af46e1 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -129,6 +129,17 @@ WHERE p1.typinput = p2.oid AND NOT -----+---------+-----+--------- (0 rows) +-- Check that all and only those functions with a variadic type have +-- a variadic argument. +SELECT oid::regprocedure, proargmodes, provariadic +FROM pg_proc +WHERE (proargmodes IS NOT NULL AND 'v' = any(proargmodes)) + IS DISTINCT FROM + (provariadic != 0); + oid | proargmodes | provariadic +-----+-------------+------------- +(0 rows) + -- As of 8.0, this check finds refcursor, which is borrowing -- other types' I/O routines SELECT p1.oid, p1.typname, p2.oid, p2.proname diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index 4c658140..4e38f3e7 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -104,6 +104,14 @@ WHERE p1.typinput = p2.oid AND NOT p2.proargtypes[1] = 'oid'::regtype AND p2.proargtypes[2] = 'int4'::regtype)); +-- Check that all and only those functions with a variadic type have +-- a variadic argument. 
+SELECT oid::regprocedure, proargmodes, provariadic +FROM pg_proc +WHERE (proargmodes IS NOT NULL AND 'v' = any(proargmodes)) + IS DISTINCT FROM + (provariadic != 0); + -- As of 8.0, this check finds refcursor, which is borrowing -- other types' I/O routines SELECT p1.oid, p1.typname, p2.oid, p2.proname From f37a2b1de4b369a5eccecf26b380bf6aea98ccd7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:38:25 +0800 Subject: [PATCH 206/578] Fix multiple problems with satisfies_hash_partition.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 188 +++++++++++++++++++++--- src/test/regress/expected/hash_part.out | 113 ++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/hash_part.sql | 90 ++++++++++++ 5 files changed, 370 insertions(+), 24 deletions(-) create mode 100644 src/test/regress/expected/hash_part.out create mode 100644 src/test/regress/sql/hash_part.sql diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index ae0bbfbe..092e925e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -40,6 +40,7 @@ #include "optimizer/planmain.h" #include "optimizer/prep.h" #include "optimizer/var.h" +#include "parser/parse_coerce.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/array.h" @@ -3114,9 +3115,11 @@ compute_hash_value(PartitionKey key, Datum *values, bool *isnull) /* * satisfies_hash_partition * - * This is a SQL-callable function for use in hash partition constraints takes - * an already computed hash values of each partition key attribute, and combine - * them into a single hash value by calling hash_combine64. + * This is an SQL-callable function for use in hash partition constraints. + * The first three arguments are the parent table OID, modulus, and remainder. + * The remaining arguments are the value of the partitioning columns (or + * expressions); these are hashed and the results are combined into a single + * hash value by calling hash_combine64. * * Returns true if remainder produced when this computed single hash value is * divided by the given modulus is equal to given remainder, otherwise false. @@ -3129,59 +3132,159 @@ satisfies_hash_partition(PG_FUNCTION_ARGS) typedef struct ColumnsHashData { Oid relid; - int16 nkeys; + int nkeys; + Oid variadic_type; + int16 variadic_typlen; + bool variadic_typbyval; + char variadic_typalign; FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; } ColumnsHashData; - Oid parentId = PG_GETARG_OID(0); - int modulus = PG_GETARG_INT32(1); - int remainder = PG_GETARG_INT32(2); - short nkeys = PG_NARGS() - 3; - int i; + Oid parentId; + int modulus; + int remainder; Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); ColumnsHashData *my_extra; uint64 rowHash = 0; + /* Return null if the parent OID, modulus, or remainder is NULL. */ + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + parentId = PG_GETARG_OID(0); + modulus = PG_GETARG_INT32(1); + remainder = PG_GETARG_INT32(2); + + /* Sanity check modulus and remainder. 
*/ + if (modulus <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("modulus for hash partition must be a positive integer"))); + if (remainder < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be a non-negative integer"))); + if (remainder >= modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be less than modulus"))); + /* * Cache hash function information. */ my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; - if (my_extra == NULL || my_extra->nkeys != nkeys || - my_extra->relid != parentId) + if (my_extra == NULL || my_extra->relid != parentId) { Relation parent; PartitionKey key; int j; + /* Open parent relation and fetch partition keyinfo */ + parent = try_relation_open(parentId, AccessShareLock); + if (parent == NULL) + PG_RETURN_NULL(); + key = RelationGetPartitionKey(parent); + + /* Reject parent table that is not hash-partitioned. */ + if (parent->rd_rel->relkind != RELKIND_PARTITIONED_TABLE || + key->strategy != PARTITION_STRATEGY_HASH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is not a hash partitioned table", + get_rel_name(parentId)))); + + if (!get_fn_expr_variadic(fcinfo->flinfo)) + { + int nargs = PG_NARGS() - 3; + + /* complain if wrong number of column values */ + if (key->partnatts != nargs) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + key->partnatts, nargs))); + + /* allocate space for our cache */ fcinfo->flinfo->fn_extra = MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, offsetof(ColumnsHashData, partsupfunc) + - sizeof(FmgrInfo) * nkeys); + sizeof(FmgrInfo) * nargs); my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; - my_extra->nkeys = nkeys; my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; - /* Open parent relation and fetch partition keyinfo */ - parent = heap_open(parentId, AccessShareLock); - key = RelationGetPartitionKey(parent); + /* check argument types and save fmgr_infos */ + for (j = 0; j < key->partnatts; ++j) + { + Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, j + 3); + + if (argtype != key->parttypid[j] && !IsBinaryCoercible(argtype, key->parttypid[j])) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, format_type_be(key->parttypid[j]), format_type_be(argtype)))); - Assert(key->partnatts == nkeys); - for (j = 0; j < nkeys; ++j) fmgr_info_copy(&my_extra->partsupfunc[j], - key->partsupfunc, + &key->partsupfunc[j], + fcinfo->flinfo->fn_mcxt); + } + + } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + + /* allocate space for our cache -- just one FmgrInfo in this case */ + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo)); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; + my_extra->variadic_type = ARR_ELEMTYPE(variadic_array); + get_typlenbyvalalign(my_extra->variadic_type, + &my_extra->variadic_typlen, + &my_extra->variadic_typbyval, + &my_extra->variadic_typalign); + + /* check argument types */ + for (j = 0; j < key->partnatts; ++j) + if (key->parttypid[j] != my_extra->variadic_type) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, + format_type_be(key->parttypid[j]), + format_type_be(my_extra->variadic_type)))); + + fmgr_info_copy(&my_extra->partsupfunc[0], + &key->partsupfunc[0], fcinfo->flinfo->fn_mcxt); + } /* Hold lock until commit */ - heap_close(parent, NoLock); + relation_close(parent, NoLock); } + if (!OidIsValid(my_extra->variadic_type)) + { + int nkeys = my_extra->nkeys; + int i; + + /* + * For a non-variadic call, neither the number of arguments nor their + * types can change across calls, so avoid the expense of rechecking + * here. + */ + for (i = 0; i < nkeys; i++) { + Datum hash; + /* keys start from fourth argument of function. */ int argno = i + 3; - if (!PG_ARGISNULL(argno)) - { - Datum hash; + if (PG_ARGISNULL(argno)) + continue; Assert(OidIsValid(my_extra->partsupfunc[i].fn_oid)); @@ -3193,6 +3296,45 @@ satisfies_hash_partition(PG_FUNCTION_ARGS) rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); } } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + int i; + int nelems; + Datum *datum; + bool *isnull; + + deconstruct_array(variadic_array, + my_extra->variadic_type, + my_extra->variadic_typlen, + my_extra->variadic_typbyval, + my_extra->variadic_typalign, + &datum, &isnull, &nelems); + + /* complain if wrong number of column values */ + if (nelems != my_extra->nkeys) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + my_extra->nkeys, nelems))); + + for (i = 0; i < nelems; i++) + { + Datum hash; + + if (isnull[i]) + continue; + + Assert(OidIsValid(my_extra->partsupfunc[0].fn_oid)); + + hash = FunctionCall2(&my_extra->partsupfunc[0], + datum[i], + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } PG_RETURN_BOOL(rowHash % modulus == remainder); } diff --git a/src/test/regress/expected/hash_part.out b/src/test/regress/expected/hash_part.out new file mode 100644 index 00000000..9e9e56f6 --- /dev/null +++ b/src/test/regress/expected/hash_part.out @@ -0,0 +1,113 @@ +-- +-- Hash partitioning. 
+-- +CREATE OR REPLACE FUNCTION hashint4_noop(int4, int8) RETURNS int8 AS +$$SELECT coalesce($1,0)::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashint4_noop(int4, int8); +CREATE OR REPLACE FUNCTION hashtext_length(text, int8) RETURNS int8 AS +$$SELECT length(coalesce($1,''))::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_text_ops FOR TYPE text USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashtext_length(text, int8); +CREATE TABLE mchash (a int, b text, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_text_ops); +CREATE TABLE mchash1 + PARTITION OF mchash FOR VALUES WITH (MODULUS 4, REMAINDER 0); +-- invalid OID, no such table +SELECT satisfies_hash_partition(0, 4, 0, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- not partitioned +SELECT satisfies_hash_partition('tenk1'::regclass, 4, 0, NULL); +ERROR: "tenk1" is not a hash partitioned table +-- partition rather than the parent +SELECT satisfies_hash_partition('mchash1'::regclass, 4, 0, NULL); +ERROR: "mchash1" is not a hash partitioned table +-- invalid modulus +SELECT satisfies_hash_partition('mchash'::regclass, 0, 0, NULL); +ERROR: modulus for hash partition must be a positive integer +-- remainder too small +SELECT satisfies_hash_partition('mchash'::regclass, 1, -1, NULL); +ERROR: remainder for hash partition must be a non-negative integer +-- remainder too large +SELECT satisfies_hash_partition('mchash'::regclass, 1, 1, NULL); +ERROR: remainder for hash partition must be less than modulus +-- modulus is null +SELECT satisfies_hash_partition('mchash'::regclass, NULL, 0, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- remainder is null +SELECT satisfies_hash_partition('mchash'::regclass, 4, NULL, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- too many arguments +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, NULL::int, NULL::text, NULL::json); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (3) +-- too few arguments +SELECT satisfies_hash_partition('mchash'::regclass, 3, 1, NULL::int); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (1) +-- wrong argument type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, NULL::int, NULL::int); +ERROR: column 2 of the partition key has type "text", but supplied value is of type "integer" +-- ok, should be false +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 0, ''::text); + satisfies_hash_partition +-------------------------- + f +(1 row) + +-- ok, should be true +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 1, ''::text); + satisfies_hash_partition +-------------------------- + t +(1 row) + +-- argument via variadic syntax, should fail because not all partitioning +-- columns are of the correct type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, + variadic array[1,2]::int[]); +ERROR: column 2 of the partition key has type "text", but supplied value is of type "integer" +-- multiple partitioning columns of the same type +CREATE TABLE mcinthash (a int, b int, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_int4_ops); +-- now variadic should work, should be false +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[0, 0]); + satisfies_hash_partition +-------------------------- + f +(1 row) + +-- should be true +SELECT 
satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[1, 0]); + satisfies_hash_partition +-------------------------- + t +(1 row) + +-- wrong length +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[]::int[]); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (0) +-- wrong type +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[now(), now()]); +ERROR: column 1 of the partition key has type "integer", but supplied value is of type "timestamp with time zone" +-- cleanup +DROP TABLE mchash; +DROP TABLE mcinthash; +DROP OPERATOR CLASS test_text_ops USING hash; +DROP OPERATOR CLASS test_int4_ops USING hash; +DROP FUNCTION hashint4_noop(int4, int8); +DROP FUNCTION hashtext_length(text, int8); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index ab868e3a..7c3fa29e 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join +test: identity partition_join hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 28d7802d..890742ef 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -192,6 +192,7 @@ test: with test: xml test: identity test: partition_join +test: hash_part test: event_trigger test: fast_default test: stats diff --git a/src/test/regress/sql/hash_part.sql b/src/test/regress/sql/hash_part.sql new file mode 100644 index 00000000..94c5eaab --- /dev/null +++ b/src/test/regress/sql/hash_part.sql @@ -0,0 +1,90 @@ +-- +-- Hash partitioning. 
+-- + +CREATE OR REPLACE FUNCTION hashint4_noop(int4, int8) RETURNS int8 AS +$$SELECT coalesce($1,0)::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashint4_noop(int4, int8); + +CREATE OR REPLACE FUNCTION hashtext_length(text, int8) RETURNS int8 AS +$$SELECT length(coalesce($1,''))::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_text_ops FOR TYPE text USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashtext_length(text, int8); + +CREATE TABLE mchash (a int, b text, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_text_ops); +CREATE TABLE mchash1 + PARTITION OF mchash FOR VALUES WITH (MODULUS 4, REMAINDER 0); + +-- invalid OID, no such table +SELECT satisfies_hash_partition(0, 4, 0, NULL); + +-- not partitioned +SELECT satisfies_hash_partition('tenk1'::regclass, 4, 0, NULL); + +-- partition rather than the parent +SELECT satisfies_hash_partition('mchash1'::regclass, 4, 0, NULL); + +-- invalid modulus +SELECT satisfies_hash_partition('mchash'::regclass, 0, 0, NULL); + +-- remainder too small +SELECT satisfies_hash_partition('mchash'::regclass, 1, -1, NULL); + +-- remainder too large +SELECT satisfies_hash_partition('mchash'::regclass, 1, 1, NULL); + +-- modulus is null +SELECT satisfies_hash_partition('mchash'::regclass, NULL, 0, NULL); + +-- remainder is null +SELECT satisfies_hash_partition('mchash'::regclass, 4, NULL, NULL); + +-- too many arguments +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, NULL::int, NULL::text, NULL::json); + +-- too few arguments +SELECT satisfies_hash_partition('mchash'::regclass, 3, 1, NULL::int); + +-- wrong argument type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, NULL::int, NULL::int); + +-- ok, should be false +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 0, ''::text); + +-- ok, should be true +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 1, ''::text); + +-- argument via variadic syntax, should fail because not all partitioning +-- columns are of the correct type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, + variadic array[1,2]::int[]); + +-- multiple partitioning columns of the same type +CREATE TABLE mcinthash (a int, b int, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_int4_ops); + +-- now variadic should work, should be false +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[0, 0]); + +-- should be true +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[1, 0]); + +-- wrong length +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[]::int[]); + +-- wrong type +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[now(), now()]); + +-- cleanup +DROP TABLE mchash; +DROP TABLE mcinthash; +DROP OPERATOR CLASS test_text_ops USING hash; +DROP OPERATOR CLASS test_int4_ops USING hash; +DROP FUNCTION hashint4_noop(int4, int8); +DROP FUNCTION hashtext_length(text, int8); From 89b1b9957e6e6c18797efa9fcfacbfa19ee8d284 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:51:35 +0800 Subject: [PATCH 207/578] Show partition info from psql \d+.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/bin/psql/describe.c | 34 ++++++++++++++++++---- src/test/regress/expected/create_table.out | 11 ++++--- src/test/regress/expected/foreign_data.out | 3 ++ src/test/regress/expected/insert.out | 17 +++++++++++ src/test/regress/sql/create_table.sql | 2 +- 
src/test/regress/sql/insert.sql | 4 +++ 6 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 266c3c31..00cd59b0 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2844,7 +2844,9 @@ describeOneTableDetails(const char *schemaname, /* print child tables (with additional info if partitions) */ if (pset.sversion >= 100000) printfPQExpBuffer(&buf, - "SELECT c.oid::pg_catalog.regclass, pg_catalog.pg_get_expr(c.relpartbound, c.oid)" + "SELECT c.oid::pg_catalog.regclass," + " pg_catalog.pg_get_expr(c.relpartbound, c.oid)," + " c.relkind" " FROM pg_catalog.pg_class c, pg_catalog.pg_inherits i" " WHERE c.oid=i.inhrelid AND i.inhparent = '%s'" " ORDER BY c.oid::pg_catalog.regclass::pg_catalog.text;", oid); @@ -2867,7 +2869,18 @@ describeOneTableDetails(const char *schemaname, else tuples = PQntuples(result); - if (!verbose) + /* + * For a partitioned table with no partitions, always print the number + * of partitions as zero, even when verbose output is expected. + * Otherwise, we will not print "Partitions" section for a partitioned + * table without any partitions. + */ + if (tableinfo.relkind == RELKIND_PARTITIONED_TABLE && tuples == 0) + { + printfPQExpBuffer(&buf, _("Number of partitions: %d"), tuples); + printTableAddFooter(&cont, buf.data); + } + else if (!verbose) { /* print the number of child tables, if any */ if (tuples > 0) @@ -2899,12 +2912,21 @@ describeOneTableDetails(const char *schemaname, } else { + char *partitioned_note; + + if (*PQgetvalue(result, i, 2) == RELKIND_PARTITIONED_TABLE) + partitioned_note = ", PARTITIONED"; + else + partitioned_note = ""; + if (i == 0) - printfPQExpBuffer(&buf, "%s: %s %s", - ct, PQgetvalue(result, i, 0), PQgetvalue(result, i, 1)); + printfPQExpBuffer(&buf, "%s: %s %s%s", + ct, PQgetvalue(result, i, 0), PQgetvalue(result, i, 1), + partitioned_note); else - printfPQExpBuffer(&buf, "%*s %s %s", - ctw, "", PQgetvalue(result, i, 0), PQgetvalue(result, i, 1)); + printfPQExpBuffer(&buf, "%*s %s %s%s", + ctw, "", PQgetvalue(result, i, 0), PQgetvalue(result, i, 1), + partitioned_note); } if (i < tuples - 1) appendPQExpBufferChar(&buf, ','); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 4ae86d8c..86c347be 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -431,13 +431,15 @@ ERROR: cannot inherit from partitioned table "partitioned2" c | text | | | d | text | | | Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 -\d partitioned2 +\d+ partitioned2 Table "public.partitioned2" - Column | Type | Collation | Nullable | Default ---------+---------+-----------+----------+--------- - a | integer | | | + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | Partition key: LIST (((a + 1))) +Number of partitions: 0 DROP TABLE partitioned, partitioned2; -- @@ -877,6 +879,7 @@ SELECT obj_description('parted_col_comment'::regclass); a | integer | | | | plain | | Partition key b | text | | | | extended | | Partition key: LIST (a) +Number of partitions: 0 Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 1ba6f02c..a5326254 100644 --- a/src/test/regress/expected/foreign_data.out 
+++ b/src/test/regress/expected/foreign_data.out @@ -1518,6 +1518,7 @@ ERROR: foreign table "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1578,6 +1579,7 @@ ALTER TABLE pt2 ALTER c2 SET NOT NULL; c2 | text | | not null | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1601,6 +1603,7 @@ ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); Partition key: LIST (c1) Check constraints: "pt2chk1" CHECK (c1 > 0) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 96b99abb..d7fd4ee7 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -427,6 +427,23 @@ from hash_parted order by part; hpart3 | 11 | 3 (13 rows) +-- test \d+ output on a table which has both partitioned and unpartitioned +-- partitions +\d+ list_parted + Table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | | | plain | | +Partition key: LIST (lower(a)) +Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), + part_cc_dd FOR VALUES IN ('cc', 'dd'), + part_default DEFAULT, PARTITIONED, + part_ee_ff FOR VALUES IN ('ee', 'ff'), PARTITIONED, + part_gg FOR VALUES IN ('gg'), PARTITIONED, + part_null FOR VALUES IN (NULL), + part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED + -- cleanup drop table range_parted, list_parted; drop table hash_parted; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index c1cf6ee1..43ada6b3 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -423,7 +423,7 @@ CREATE TABLE fail () INHERITS (partitioned2); -- Partition key in describe output \d partitioned -\d partitioned2 +\d+ partitioned2 DROP TABLE partitioned, partitioned2; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index ef7abf94..491af082 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -251,6 +251,10 @@ insert into hpart3 values(11); select tableoid::regclass as part, a, a%4 as "remainder = a % 4" from hash_parted order by part; +-- test \d+ output on a table which has both partitioned and unpartitioned +-- partitions +\d+ list_parted + -- cleanup drop table range_parted, list_parted; drop table hash_parted; From f54767eb3d7b462886c9d37e6055e94c07fd2ec5 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Thu, 23 Nov 2017 05:17:47 +1100 Subject: [PATCH 208/578] Sort default partition to bottom of psql \d+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor patch to change sort order only Author: Ashutosh Bapat Reviewed-by: Álvaro Herrera, Simon Riggs --- src/bin/psql/describe.c | 3 ++- src/test/regress/expected/insert.out | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 00cd59b0..f198c238 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2849,7 +2849,8 @@ describeOneTableDetails(const char *schemaname, " c.relkind" " FROM pg_catalog.pg_class c, 
pg_catalog.pg_inherits i" " WHERE c.oid=i.inhrelid AND i.inhparent = '%s'" - " ORDER BY c.oid::pg_catalog.regclass::pg_catalog.text;", oid); + " ORDER BY pg_catalog.pg_get_expr(c.relpartbound, c.oid) = 'DEFAULT'," + " c.oid::pg_catalog.regclass::pg_catalog.text;", oid); else if (pset.sversion >= 80300) printfPQExpBuffer(&buf, "SELECT c.oid::pg_catalog.regclass" diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index d7fd4ee7..503221fc 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -438,11 +438,11 @@ from hash_parted order by part; Partition key: LIST (lower(a)) Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), part_cc_dd FOR VALUES IN ('cc', 'dd'), - part_default DEFAULT, PARTITIONED, part_ee_ff FOR VALUES IN ('ee', 'ff'), PARTITIONED, part_gg FOR VALUES IN ('gg'), PARTITIONED, part_null FOR VALUES IN (NULL), - part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED + part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED, + part_default DEFAULT, PARTITIONED -- cleanup drop table range_parted, list_parted; From ee4678a1e765d821e5561cc86b332f27f43fdc08 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:58:08 +0800 Subject: [PATCH 209/578] Fix assorted syscache lookup sloppiness in partition-related code. --- src/backend/catalog/heap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 0c382fe7..2f135c95 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2797,6 +2797,8 @@ heap_drop_with_catalog(Oid relid) * shared-cache-inval notice that will make them update their index lists. */ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); if (((Form_pg_class) GETSTRUCT(tuple))->relispartition) { parentOid = get_partition_parent(relid); @@ -4243,9 +4245,6 @@ StorePartitionKey(Relation rel, Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - tuple = SearchSysCache1(PARTRELID, - ObjectIdGetDatum(RelationGetRelid(rel))); - /* Copy the partition attribute numbers, opclass OIDs into arrays */ partattrs_vec = buildint2vector(partattrs, partnatts); partopclass_vec = buildoidvector(partopclass, partnatts); From 367cc1a194bae456316c32e29ad73173164bdb86 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:19:33 +0800 Subject: [PATCH 210/578] Add null test to partition constraint for default range partitions. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 27 ++++++++++++++++++++----- src/test/regress/expected/inherit.out | 9 +++++---- src/test/regress/expected/inherit_1.out | 9 +++++---- src/test/regress/expected/inherit_2.out | 7 ++++--- src/test/regress/expected/inherit_3.out | 9 +++++---- src/test/regress/expected/update.out | 2 +- src/test/regress/sql/inherit.sql | 9 +++++---- 7 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 092e925e..45dca13c 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2163,12 +2163,29 @@ get_qual_for_range(Relation parent, PartitionBoundSpec *spec, if (or_expr_args != NIL) { - /* OR all the non-default partition constraints; then negate it */ - result = lappend(result, + Expr *other_parts_constr; + + /* + * Combine the constraints obtained for non-default partitions + * using OR. As requested, each of the OR's args doesn't include + * the NOT NULL test for partition keys (which is to avoid its + * useless repetition). Add the same now. + */ + other_parts_constr = + makeBoolExpr(AND_EXPR, + lappend(get_range_nulltest(key), list_length(or_expr_args) > 1 - ? makeBoolExpr(OR_EXPR, or_expr_args, -1) - : linitial(or_expr_args)); - result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); + ? makeBoolExpr(OR_EXPR, or_expr_args, + -1) + : linitial(or_expr_args)), + -1); + + /* + * Finally, the default partition contains everything *NOT* + * contained in the non-default partitions. + */ + result = list_make1(makeBoolExpr(NOT_EXPR, + list_make1(other_parts_constr), -1)); } return result; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 6e287dc4..51d9903d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2166,13 +2166,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_2) @@ -2181,7 +2182,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2190,7 
+2191,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2246,7 +2247,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (4 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Subquery Scan on all (datanode_2) diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index ff38ed79..a6b99b17 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2160,13 +2160,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------ Remote Fast Query Execution @@ -2176,7 +2177,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2186,7 +2187,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2246,7 +2247,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (5 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted 
where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Fast Query Execution diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 8d97e116..ef08ec3e 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2131,13 +2131,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_2) @@ -2146,7 +2147,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2155,7 +2156,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 402c6a51..9a33a70d 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2147,13 +2147,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of 
mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------ Remote Fast Query Execution @@ -2163,7 +2164,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2173,7 +2174,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2233,7 +2234,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (5 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Fast Query Execution diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 0aae60ac..9cdaf10f 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -227,7 +227,7 @@ create table part_def partition of range_parted default; a | text | | | | extended | | b | integer | | | | plain | | Partition of: range_parted DEFAULT -Partition constraint: (NOT (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20)))) +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20))))) insert into range_parted values ('c', 9); -- ok diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 58f7f523..ea17dd86 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -734,19 +734,20 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values 
from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions explain (costs off) select * from mcrparted where a > -1; -- scans all partitions explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > 10; -- scans mcrparted4 -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def drop table mcrparted; -- check that partitioned table Appends cope with being referenced in From febad44a0674ef540a69bf37f3f9e240f1b1d3d0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 28 Nov 2017 14:11:16 -0500 Subject: [PATCH 211/578] If a range-partitioned table has no default partition, reject null keys. Commit 4e5fe9ad19e14af360de7970caa8b150436c9dec introduced this problem. Also add a test so it doesn't get broken again. Report by Rushabh Lathia. Fix by Amit Langote. Reviewed by Rushabh Lathia and Amul Sul. Tweaked by me. Discussion: http://postgr.es/m/CAGPqQf0Y1iJyk4QJBdMf=pS9i6Q0JUMM_h5-qkR3OMJ-e04PyA@mail.gmail.com --- src/backend/catalog/partition.c | 5 ++--- src/test/regress/expected/insert.out | 4 ++++ src/test/regress/sql/insert.sql | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 45dca13c..544e3365 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2582,11 +2582,10 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) */ for (i = 0; i < key->partnatts; i++) { - if (isnull[i] && - partition_bound_has_default(partdesc->boundinfo)) + if (isnull[i]) { range_partkey_has_null = true; - part_index = partdesc->boundinfo->default_index; + break; } } diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 503221fc..a3649273 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -692,6 +692,10 @@ create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); +-- null not allowed in range partition +insert into mcrparted values (null, null, null); +ERROR: no partition of relation "mcrparted" found for row +DETAIL: Partition key of the failing row contains (a, abs(b), c) = (null, null, null). 
-- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 491af082..e276954e 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -415,6 +415,9 @@ create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20 create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); +-- null not allowed in range partition +insert into mcrparted values (null, null, null); + -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); From 27f838fa4c17efe70c86406d96835ffe48c489f0 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:28:02 +0800 Subject: [PATCH 212/578] Add extensive tests for partition pruning. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/expected/partition_prune.out | 1095 +++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_prune.sql | 155 +++ 4 files changed, 1252 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/partition_prune.out create mode 100644 src/test/regress/sql/partition_prune.sql diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out new file mode 100644 index 00000000..aabb0240 --- /dev/null +++ b/src/test/regress/expected/partition_prune.out @@ -0,0 +1,1095 @@ +-- +-- Test partitioning planner code +-- +create table lp (a char) partition by list (a); +create table lp_default partition of lp default; +create table lp_ef partition of lp for values in ('e', 'f'); +create table lp_ad partition of lp for values in ('a', 'd'); +create table lp_bc partition of lp for values in ('b', 'c'); +create table lp_g partition of lp for values in ('g'); +create table lp_null partition of lp for values in (null); +explain (costs off) select * from lp; + QUERY PLAN +------------------------------ + Append + -> Seq Scan on lp_ad + -> Seq Scan on lp_bc + -> Seq Scan on lp_ef + -> Seq Scan on lp_g + -> Seq Scan on lp_null + -> Seq Scan on lp_default +(7 rows) + +explain (costs off) select * from lp where a > 'a' and a < 'd'; + QUERY PLAN +----------------------------------------------------------- + Append + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) +(5 rows) + +explain (costs off) select * from lp where a > 'a' and a <= 'd'; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) +(7 rows) + +explain (costs off) select * from lp where a = 'a'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on lp_ad + Filter: (a = 'a'::bpchar) +(3 rows) + +explain (costs off) select * from lp where 'a' = a; /* commuted */ + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ('a'::bpchar = a) +(3 rows) + +explain (costs off) select * from lp where a is not null; + QUERY PLAN 
+--------------------------------- + Append + -> Seq Scan on lp_ad + Filter: (a IS NOT NULL) + -> Seq Scan on lp_bc + Filter: (a IS NOT NULL) + -> Seq Scan on lp_ef + Filter: (a IS NOT NULL) + -> Seq Scan on lp_g + Filter: (a IS NOT NULL) + -> Seq Scan on lp_default + Filter: (a IS NOT NULL) +(11 rows) + +explain (costs off) select * from lp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on lp_null + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from lp where a = 'a' or a = 'c'; + QUERY PLAN +---------------------------------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) +(5 rows) + +explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); + QUERY PLAN +-------------------------------------------------------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) + -> Seq Scan on lp_bc + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) +(5 rows) + +explain (costs off) select * from lp where a <> 'g'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'g'::bpchar) +(9 rows) + +explain (costs off) select * from lp where a <> 'a' and a <> 'd'; + QUERY PLAN +------------------------------------------------------------- + Append + -> Seq Scan on lp_bc + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_ef + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_g + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) +(9 rows) + +explain (costs off) select * from lp where a not in ('a', 'd'); + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on lp_bc + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_ef + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_g + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_default + Filter: (a <> ALL ('{a,d}'::bpchar[])) +(9 rows) + +-- collation matches the partitioning collation, pruning works +create table coll_pruning (a text collate "C") partition by list (a); +create table coll_pruning_a partition of coll_pruning for values in ('a'); +create table coll_pruning_b partition of coll_pruning for values in ('b'); +create table coll_pruning_def partition of coll_pruning default; +explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; + QUERY PLAN +--------------------------------------------- + Append + -> Seq Scan on coll_pruning_a + Filter: (a = 'a'::text COLLATE "C") +(3 rows) + +-- collation doesn't match the partitioning collation, no pruning occurs +explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_a + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_b + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_def + Filter: ((a)::text = 'a'::text COLLATE "POSIX") +(7 rows) + +create table rlp (a int, b varchar) partition by range 
(a); +create table rlp_default partition of rlp default partition by list (a); +create table rlp_default_default partition of rlp_default default; +create table rlp_default_10 partition of rlp_default for values in (10); +create table rlp_default_30 partition of rlp_default for values in (30); +create table rlp_default_null partition of rlp_default for values in (null); +create table rlp1 partition of rlp for values from (minvalue) to (1); +create table rlp2 partition of rlp for values from (1) to (10); +create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3_default partition of rlp3 default; +create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); +create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); +create table rlp3nullxy partition of rlp3 for values in (null, 'xy'); +alter table rlp attach partition rlp3 for values from (15) to (20); +create table rlp4 partition of rlp for values from (20) to (30) partition by range (a); +create table rlp4_default partition of rlp4 default; +create table rlp4_1 partition of rlp4 for values from (20) to (25); +create table rlp4_2 partition of rlp4 for values from (25) to (29); +create table rlp5 partition of rlp for values from (31) to (maxvalue) partition by range (a); +create table rlp5_default partition of rlp5 default; +create table rlp5_1 partition of rlp5 for values from (31) to (40); +explain (costs off) select * from rlp where a < 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a < 1) +(3 rows) + +explain (costs off) select * from rlp where 1 > a; /* commuted */ + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp1 + Filter: (1 > a) +(3 rows) + +explain (costs off) select * from rlp where a <= 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 1) + -> Seq Scan on rlp2 + Filter: (a <= 1) + -> Seq Scan on rlp_default_default + Filter: (a <= 1) +(7 rows) + +explain (costs off) select * from rlp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp2 + Filter: (a = 1) +(3 rows) + +explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on rlp2 + Filter: (a = '1'::bigint) +(3 rows) + +explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3abcd + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3efgh + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3nullxy + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_10 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_30 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_null + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_default + Filter: ((a)::numeric = '1'::numeric) +(31 rows) + +explain (costs 
off) select * from rlp where a <= 10; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 10) + -> Seq Scan on rlp2 + Filter: (a <= 10) + -> Seq Scan on rlp_default_10 + Filter: (a <= 10) + -> Seq Scan on rlp_default_default + Filter: (a <= 10) +(9 rows) + +explain (costs off) select * from rlp where a > 10; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: (a > 10) + -> Seq Scan on rlp3efgh + Filter: (a > 10) + -> Seq Scan on rlp3nullxy + Filter: (a > 10) + -> Seq Scan on rlp3_default + Filter: (a > 10) + -> Seq Scan on rlp4_1 + Filter: (a > 10) + -> Seq Scan on rlp4_2 + Filter: (a > 10) + -> Seq Scan on rlp4_default + Filter: (a > 10) + -> Seq Scan on rlp5_1 + Filter: (a > 10) + -> Seq Scan on rlp5_default + Filter: (a > 10) + -> Seq Scan on rlp_default_30 + Filter: (a > 10) + -> Seq Scan on rlp_default_default + Filter: (a > 10) +(23 rows) + +explain (costs off) select * from rlp where a < 15; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a < 15) + -> Seq Scan on rlp2 + Filter: (a < 15) + -> Seq Scan on rlp_default_10 + Filter: (a < 15) + -> Seq Scan on rlp_default_default + Filter: (a < 15) +(9 rows) + +explain (costs off) select * from rlp where a <= 15; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 15) + -> Seq Scan on rlp2 + Filter: (a <= 15) + -> Seq Scan on rlp3abcd + Filter: (a <= 15) + -> Seq Scan on rlp3efgh + Filter: (a <= 15) + -> Seq Scan on rlp3nullxy + Filter: (a <= 15) + -> Seq Scan on rlp3_default + Filter: (a <= 15) + -> Seq Scan on rlp_default_10 + Filter: (a <= 15) + -> Seq Scan on rlp_default_default + Filter: (a <= 15) +(17 rows) + +explain (costs off) select * from rlp where a > 15 and b = 'ab'; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) +(17 rows) + +explain (costs off) select * from rlp where a = 16; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: (a = 16) + -> Seq Scan on rlp3efgh + Filter: (a = 16) + -> Seq Scan on rlp3nullxy + Filter: (a = 16) + -> Seq Scan on rlp3_default + Filter: (a = 16) +(9 rows) + +explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); + QUERY PLAN +---------------------------------------------------------------------------- + Append + -> Seq Scan on rlp3_default + Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b < 'ab'; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on rlp3_default + Filter: (((b)::text < 'ab'::text) AND (a = 16)) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b <= 'ab'; + QUERY PLAN +---------------------------------------------------------- + 
Append + -> Seq Scan on rlp3abcd + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) +(5 rows) + +explain (costs off) select * from rlp where a = 16 and b is null; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on rlp3nullxy + Filter: ((b IS NULL) AND (a = 16)) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b is not null; + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on rlp3abcd + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3nullxy + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND (a = 16)) +(9 rows) + +explain (costs off) select * from rlp where a is null; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on rlp_default_null + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from rlp where a is not null; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3abcd + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3efgh + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3nullxy + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_10 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_30 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_default + Filter: (a IS NOT NULL) +(29 rows) + +explain (costs off) select * from rlp where a > 30; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp5_1 + Filter: (a > 30) + -> Seq Scan on rlp5_default + Filter: (a > 30) + -> Seq Scan on rlp_default_default + Filter: (a > 30) +(7 rows) + +explain (costs off) select * from rlp where a = 30; /* only default is scanned */ + QUERY PLAN +---------------------------------- + Append + -> Seq Scan on rlp_default_30 + Filter: (a = 30) +(3 rows) + +explain (costs off) select * from rlp where a <= 31; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 31) + -> Seq Scan on rlp2 + Filter: (a <= 31) + -> Seq Scan on rlp3abcd + Filter: (a <= 31) + -> Seq Scan on rlp3efgh + Filter: (a <= 31) + -> Seq Scan on rlp3nullxy + Filter: (a <= 31) + -> Seq Scan on rlp3_default + Filter: (a <= 31) + -> Seq Scan on rlp4_1 + Filter: (a <= 31) + -> Seq Scan on rlp4_2 + Filter: (a <= 31) + -> Seq Scan on rlp4_default + Filter: (a <= 31) + -> Seq Scan on rlp5_1 + Filter: (a <= 31) + -> Seq Scan on rlp5_default + Filter: (a <= 31) + -> Seq Scan on rlp_default_10 + Filter: (a <= 31) + -> Seq Scan on rlp_default_30 + Filter: (a <= 31) + -> Seq Scan on rlp_default_default + Filter: (a <= 31) +(29 rows) + +explain (costs off) select * from rlp where a = 1 or a = 7; + QUERY PLAN +-------------------------------------- + Append + -> Seq Scan on rlp2 + Filter: ((a = 1) OR (a = 7)) +(3 rows) + +explain (costs off) select * from rlp where a = 1 or b = 'ab'; + QUERY PLAN +------------------------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: ((a = 1) OR ((b)::text = 
'ab'::text)) + -> Seq Scan on rlp2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp3abcd + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_10 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_null + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) +(25 rows) + +explain (costs off) select * from rlp where a > 20 and a < 27; + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rlp4_1 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_2 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_default + Filter: ((a > 20) AND (a < 27)) +(7 rows) + +explain (costs off) select * from rlp where a = 29; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on rlp4_default + Filter: (a = 29) +(3 rows) + +explain (costs off) select * from rlp where a >= 29; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp4_default + Filter: (a >= 29) + -> Seq Scan on rlp5_1 + Filter: (a >= 29) + -> Seq Scan on rlp5_default + Filter: (a >= 29) + -> Seq Scan on rlp_default_30 + Filter: (a >= 29) + -> Seq Scan on rlp_default_default + Filter: (a >= 29) +(11 rows) + +-- redundant clauses are eliminated +explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ + QUERY PLAN +---------------------------------------- + Append + -> Seq Scan on rlp_default_10 + Filter: ((a > 1) AND (a = 10)) +(3 rows) + +explain (costs off) select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3efgh + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3nullxy + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_2 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_default + Filter: ((a > 1) AND (a >= 15)) +(23 rows) + +explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); + QUERY PLAN +------------------------------------------------------------------- + Append + -> Seq Scan on rlp2 + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3abcd + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3efgh + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3nullxy + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> 
Seq Scan on rlp3_default + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) +(11 rows) + +-- multi-column keys +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p_default partition of mc3p default; +create table mc3p0 partition of mc3p for values from (minvalue, minvalue, minvalue) to (1, 1, 1); +create table mc3p1 partition of mc3p for values from (1, 1, 1) to (10, 5, 10); +create table mc3p2 partition of mc3p for values from (10, 5, 10) to (10, 10, 10); +create table mc3p3 partition of mc3p for values from (10, 10, 10) to (10, 10, 20); +create table mc3p4 partition of mc3p for values from (10, 10, 20) to (10, maxvalue, maxvalue); +create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); +create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); +create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); +explain (costs off) select * from mc3p where a = 1; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (a = 1) + -> Seq Scan on mc3p1 + Filter: (a = 1) + -> Seq Scan on mc3p_default + Filter: (a = 1) +(7 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) < 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) < 1)) +(5 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) = 1)) +(7 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) +(7 rows) + +explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; + QUERY PLAN +----------------------------------------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p2 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p3 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p4 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p_default + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) +(11 rows) + +explain (costs off) select * from mc3p where a > 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p5 + Filter: (a > 10) + -> Seq Scan on mc3p6 + Filter: (a > 10) + -> Seq Scan on mc3p7 + Filter: (a > 10) + -> Seq Scan on mc3p_default + Filter: (a > 10) +(9 rows) + +explain (costs off) select * from mc3p where a >= 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: (a >= 10) + -> Seq Scan on mc3p2 + Filter: (a >= 10) + -> Seq Scan on mc3p3 + Filter: (a >= 10) + -> Seq Scan on mc3p4 + Filter: (a >= 10) + -> Seq Scan on mc3p5 + Filter: (a >= 10) + -> Seq Scan on mc3p6 + Filter: (a >= 10) + -> Seq Scan on mc3p7 + Filter: (a >= 10) + -> Seq Scan on mc3p_default + 
Filter: (a >= 10) +(17 rows) + +explain (costs off) select * from mc3p where a < 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (a < 10) + -> Seq Scan on mc3p1 + Filter: (a < 10) + -> Seq Scan on mc3p_default + Filter: (a < 10) +(7 rows) + +explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p1 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p2 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p_default + Filter: ((a <= 10) AND (abs(b) < 10)) +(9 rows) + +explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; + QUERY PLAN +--------------------------------------------- + Append + -> Seq Scan on mc3p_default + Filter: ((a = 11) AND (abs(b) = 0)) +(3 rows) + +explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on mc3p6 + Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) +(3 rows) + +explain (costs off) select * from mc3p where a > 20; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p7 + Filter: (a > 20) + -> Seq Scan on mc3p_default + Filter: (a > 20) +(5 rows) + +explain (costs off) select * from mc3p where a >= 20; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p5 + Filter: (a >= 20) + -> Seq Scan on mc3p6 + Filter: (a >= 20) + -> Seq Scan on mc3p7 + Filter: (a >= 20) + -> Seq Scan on mc3p_default + Filter: (a >= 20) +(9 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) +(9 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p_default + 
Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) +(11 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) +(11 rows) + +explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; + QUERY PLAN +------------------------------------------------------ + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p2 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p4 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p5 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p6 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p7 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) +(17 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); + QUERY PLAN +------------------------------------------------------------------------------ + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p3 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p4 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) +(13 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); + QUERY PLAN +----------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) +(9 rows) + +-- a simpler multi-column keys case +create table mc2p (a int, b int) partition by range (a, b); +create table 
mc2p_default partition of mc2p default; +create table mc2p0 partition of mc2p for values from (minvalue, minvalue) to (1, minvalue); +create table mc2p1 partition of mc2p for values from (1, minvalue) to (1, 1); +create table mc2p2 partition of mc2p for values from (1, 1) to (2, minvalue); +create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); +create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); +create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); +explain (costs off) select * from mc2p where a < 2; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p0 + Filter: (a < 2) + -> Seq Scan on mc2p1 + Filter: (a < 2) + -> Seq Scan on mc2p2 + Filter: (a < 2) + -> Seq Scan on mc2p_default + Filter: (a < 2) +(9 rows) + +explain (costs off) select * from mc2p where a = 2 and b < 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on mc2p3 + Filter: ((b < 1) AND (a = 2)) +(3 rows) + +explain (costs off) select * from mc2p where a > 1; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p2 + Filter: (a > 1) + -> Seq Scan on mc2p3 + Filter: (a > 1) + -> Seq Scan on mc2p4 + Filter: (a > 1) + -> Seq Scan on mc2p5 + Filter: (a > 1) + -> Seq Scan on mc2p_default + Filter: (a > 1) +(11 rows) + +explain (costs off) select * from mc2p where a = 1 and b > 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on mc2p2 + Filter: ((b > 1) AND (a = 1)) +(3 rows) + +-- boolean partitioning +create table boolpart (a bool) partition by list (a); +create table boolpart_default partition of boolpart default; +create table boolpart_t partition of boolpart for values in ('true'); +create table boolpart_f partition of boolpart for values in ('false'); +explain (costs off) select * from boolpart where a in (true, false); + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a = ANY ('{t,f}'::boolean[])) + -> Seq Scan on boolpart_t + Filter: (a = ANY ('{t,f}'::boolean[])) +(5 rows) + +explain (costs off) select * from boolpart where a = false; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (NOT a) + -> Seq Scan on boolpart_t + Filter: (NOT a) + -> Seq Scan on boolpart_default + Filter: (NOT a) +(7 rows) + +explain (costs off) select * from boolpart where not a = false; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: a + -> Seq Scan on boolpart_t + Filter: a + -> Seq Scan on boolpart_default + Filter: a +(7 rows) + +explain (costs off) select * from boolpart where a is true or a is not true; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on boolpart_f + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_t + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_default + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) +(7 rows) + +explain (costs off) select * from boolpart where a is not true; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT TRUE) + -> Seq Scan on boolpart_t + Filter: (a IS NOT TRUE) + -> Seq Scan on boolpart_default + Filter: (a IS NOT TRUE) +(7 rows) + +explain (costs off) select * from boolpart where a is not true and a is not false; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on boolpart_f + 
Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) + -> Seq Scan on boolpart_t + Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) + -> Seq Scan on boolpart_default + Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) +(7 rows) + +explain (costs off) select * from boolpart where a is unknown; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS UNKNOWN) +(7 rows) + +explain (costs off) select * from boolpart where a is not unknown; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS NOT UNKNOWN) +(7 rows) + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 7c3fa29e..d8a925ca 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join hash_part +test: identity partition_join partition_prune hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 890742ef..f0989763 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -192,6 +192,7 @@ test: with test: xml test: identity test: partition_join +test: partition_prune test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql new file mode 100644 index 00000000..514f8e5c --- /dev/null +++ b/src/test/regress/sql/partition_prune.sql @@ -0,0 +1,155 @@ +-- +-- Test partitioning planner code +-- +create table lp (a char) partition by list (a); +create table lp_default partition of lp default; +create table lp_ef partition of lp for values in ('e', 'f'); +create table lp_ad partition of lp for values in ('a', 'd'); +create table lp_bc partition of lp for values in ('b', 'c'); +create table lp_g partition of lp for values in ('g'); +create table lp_null partition of lp for values in (null); +explain (costs off) select * from lp; +explain (costs off) select * from lp where a > 'a' and a < 'd'; +explain (costs off) select * from lp where a > 'a' and a <= 'd'; +explain (costs off) select * from lp where a = 'a'; +explain (costs off) select * from lp where 'a' = a; /* commuted */ +explain (costs off) select * from lp where a is not null; +explain (costs off) select * from lp where a is null; +explain (costs off) select * from lp where a = 'a' or a = 'c'; +explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); +explain (costs off) select * from lp where a <> 'g'; +explain (costs off) select * from lp where a <> 'a' and a <> 'd'; +explain (costs off) select * from lp where a not in ('a', 'd'); + +-- collation matches the partitioning collation, pruning works +create table coll_pruning (a text collate "C") partition by list (a); +create table coll_pruning_a partition of coll_pruning for values in ('a'); +create table coll_pruning_b partition of coll_pruning for values in ('b'); +create table coll_pruning_def partition of coll_pruning default; 
+explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; +-- collation doesn't match the partitioning collation, no pruning occurs +explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; + +create table rlp (a int, b varchar) partition by range (a); +create table rlp_default partition of rlp default partition by list (a); +create table rlp_default_default partition of rlp_default default; +create table rlp_default_10 partition of rlp_default for values in (10); +create table rlp_default_30 partition of rlp_default for values in (30); +create table rlp_default_null partition of rlp_default for values in (null); +create table rlp1 partition of rlp for values from (minvalue) to (1); +create table rlp2 partition of rlp for values from (1) to (10); + +create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3_default partition of rlp3 default; +create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); +create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); +create table rlp3nullxy partition of rlp3 for values in (null, 'xy'); +alter table rlp attach partition rlp3 for values from (15) to (20); + +create table rlp4 partition of rlp for values from (20) to (30) partition by range (a); +create table rlp4_default partition of rlp4 default; +create table rlp4_1 partition of rlp4 for values from (20) to (25); +create table rlp4_2 partition of rlp4 for values from (25) to (29); + +create table rlp5 partition of rlp for values from (31) to (maxvalue) partition by range (a); +create table rlp5_default partition of rlp5 default; +create table rlp5_1 partition of rlp5 for values from (31) to (40); + +explain (costs off) select * from rlp where a < 1; +explain (costs off) select * from rlp where 1 > a; /* commuted */ +explain (costs off) select * from rlp where a <= 1; +explain (costs off) select * from rlp where a = 1; +explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ +explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ +explain (costs off) select * from rlp where a <= 10; +explain (costs off) select * from rlp where a > 10; +explain (costs off) select * from rlp where a < 15; +explain (costs off) select * from rlp where a <= 15; +explain (costs off) select * from rlp where a > 15 and b = 'ab'; +explain (costs off) select * from rlp where a = 16; +explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); +explain (costs off) select * from rlp where a = 16 and b < 'ab'; +explain (costs off) select * from rlp where a = 16 and b <= 'ab'; +explain (costs off) select * from rlp where a = 16 and b is null; +explain (costs off) select * from rlp where a = 16 and b is not null; +explain (costs off) select * from rlp where a is null; +explain (costs off) select * from rlp where a is not null; +explain (costs off) select * from rlp where a > 30; +explain (costs off) select * from rlp where a = 30; /* only default is scanned */ +explain (costs off) select * from rlp where a <= 31; +explain (costs off) select * from rlp where a = 1 or a = 7; +explain (costs off) select * from rlp where a = 1 or b = 'ab'; + +explain (costs off) select * from rlp where a > 20 and a < 27; +explain (costs off) select * from rlp where a = 29; +explain (costs off) select * from rlp where a >= 29; + +-- redundant clauses are eliminated +explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ +explain (costs off) 
select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ +explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ +explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); + +-- multi-column keys +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p_default partition of mc3p default; +create table mc3p0 partition of mc3p for values from (minvalue, minvalue, minvalue) to (1, 1, 1); +create table mc3p1 partition of mc3p for values from (1, 1, 1) to (10, 5, 10); +create table mc3p2 partition of mc3p for values from (10, 5, 10) to (10, 10, 10); +create table mc3p3 partition of mc3p for values from (10, 10, 10) to (10, 10, 20); +create table mc3p4 partition of mc3p for values from (10, 10, 20) to (10, maxvalue, maxvalue); +create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); +create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); +create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); + +explain (costs off) select * from mc3p where a = 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; +explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; +explain (costs off) select * from mc3p where a > 10; +explain (costs off) select * from mc3p where a >= 10; +explain (costs off) select * from mc3p where a < 10; +explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; +explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; +explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; +explain (costs off) select * from mc3p where a > 20; +explain (costs off) select * from mc3p where a >= 20; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; +explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); + +-- a simpler multi-column keys case +create table mc2p (a int, b int) partition by range (a, b); +create table mc2p_default partition of mc2p default; +create table mc2p0 partition of mc2p for values from (minvalue, minvalue) to (1, minvalue); +create table mc2p1 partition of mc2p for values from (1, minvalue) to (1, 1); +create table mc2p2 partition of mc2p for values from (1, 1) to (2, minvalue); +create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); +create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); +create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); + +explain (costs off) select * from mc2p where a < 2; +explain (costs off) select * from mc2p where a = 2 and b < 1; +explain (costs off) select * from mc2p where a > 1; +explain (costs off) select * from mc2p where a = 1 and b 
> 1; + +-- boolean partitioning +create table boolpart (a bool) partition by list (a); +create table boolpart_default partition of boolpart default; +create table boolpart_t partition of boolpart for values in ('true'); +create table boolpart_f partition of boolpart for values in ('false'); + +explain (costs off) select * from boolpart where a in (true, false); +explain (costs off) select * from boolpart where a = false; +explain (costs off) select * from boolpart where not a = false; +explain (costs off) select * from boolpart where a is true or a is not true; +explain (costs off) select * from boolpart where a is not true; +explain (costs off) select * from boolpart where a is not true and a is not false; +explain (costs off) select * from boolpart where a is unknown; +explain (costs off) select * from boolpart where a is not unknown; + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; From baa79c39fd912cefd2ed9f77a4f75c380be58983 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:34:24 +0800 Subject: [PATCH 213/578] New C function: bms_add_range. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/nodes/bitmapset.c | 72 +++++++++++++++++++++++++++++++++++ src/include/nodes/bitmapset.h | 1 + 2 files changed, 73 insertions(+) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 70a53947..8ec465d2 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -943,6 +943,78 @@ bms_clean_members(Bitmapset *a) #endif +/* + * bms_add_range + * Add members in the range of 'lower' to 'upper' to the set. + * + * Note this could also be done by calling bms_add_member in a loop, however, + * using this function will be faster when the range is large as we work with + * at the bitmapword level rather than at bit level. + */ +Bitmapset * +bms_add_range(Bitmapset *a, int lower, int upper) +{ + int lwordnum, + lbitnum, + uwordnum, + ushiftbits, + wordnum; + + if (lower < 0 || upper < 0) + elog(ERROR, "negative bitmapset member not allowed"); + if (lower > upper) + elog(ERROR, "lower range must not be above upper range"); + uwordnum = WORDNUM(upper); + + if (a == NULL) + { + a = (Bitmapset *) palloc0(BITMAPSET_SIZE(uwordnum + 1)); + a->nwords = uwordnum + 1; + } + + /* ensure we have enough words to store the upper bit */ + else if (uwordnum >= a->nwords) + { + int oldnwords = a->nwords; + int i; + + a = (Bitmapset *) repalloc(a, BITMAPSET_SIZE(uwordnum + 1)); + a->nwords = uwordnum + 1; + /* zero out the enlarged portion */ + for (i = oldnwords; i < a->nwords; i++) + a->words[i] = 0; + } + + wordnum = lwordnum = WORDNUM(lower); + + lbitnum = BITNUM(lower); + ushiftbits = BITS_PER_BITMAPWORD - (BITNUM(upper) + 1); + + /* + * Special case when lwordnum is the same as uwordnum we must perform the + * upper and lower masking on the word. + */ + if (lwordnum == uwordnum) + { + a->words[lwordnum] |= ~(bitmapword) (((bitmapword) 1 << lbitnum) - 1) + & (~(bitmapword) 0) >> ushiftbits; + } + else + { + /* turn on lbitnum and all bits left of it */ + a->words[wordnum++] |= ~(bitmapword) (((bitmapword) 1 << lbitnum) - 1); + + /* turn on all bits for any intermediate words */ + while (wordnum < uwordnum) + a->words[wordnum++] = ~(bitmapword) 0; + + /* turn on upper's bit and all bits right of it. 
*/ + a->words[uwordnum] |= (~(bitmapword) 0) >> ushiftbits; + } + + return a; +} + /* * bms_int_members - like bms_intersect, but left input is recycled */ diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index 800dcb52..fc101c8f 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -165,6 +165,7 @@ extern bool bms_is_empty(const Bitmapset *a); extern Bitmapset *bms_add_member(Bitmapset *a, int x); extern Bitmapset *bms_del_member(Bitmapset *a, int x); extern Bitmapset *bms_add_members(Bitmapset *a, const Bitmapset *b); +extern Bitmapset *bms_add_range(Bitmapset *a, int lower, int upper); extern Bitmapset *bms_int_members(Bitmapset *a, const Bitmapset *b); extern Bitmapset *bms_del_members(Bitmapset *a, const Bitmapset *b); extern Bitmapset *bms_join(Bitmapset *a, Bitmapset *b); From c092b1ee10419e26a151e29269e410aee1eb809f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 10:01:50 -0500 Subject: [PATCH 214/578] Fix uninitialized memory reference. Without this, when partdesc->nparts == 0, we end up calling ExecBuildSlotPartitionKeyDescription without initializing values and isnull. Reported by Coverity via Michael Paquier. Patch by Michael Paquier, reviewed and revised by Amit Langote. Discussion: http://postgr.es/m/CAB7nPqQ3mwkdMoPY-ocgTpPnjd8TKOadMxdTtMLvEzF8480Zfg@mail.gmail.com --- src/backend/executor/execPartition.c | 18 +++++++++++------- src/test/regress/expected/insert.out | 4 ++++ src/test/regress/sql/insert.sql | 4 ++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d275cefe..537d8986 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -206,13 +206,6 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, slot = myslot; } - /* Quick exit */ - if (partdesc->nparts == 0) - { - result = -1; - break; - } - /* * Extract partition key from tuple. Expression evaluation machinery * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to @@ -223,6 +216,17 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, */ ecxt->ecxt_scantuple = slot; FormPartitionKeyDatum(parent, slot, estate, values, isnull); + + /* + * Nothing for get_partition_for_tuple() to do if there are no + * partitions to begin with. + */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + cur_index = get_partition_for_tuple(rel, values, isnull); /* diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a3649273..a671b345 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -167,6 +167,10 @@ create table range_parted ( a text, b int ) partition by range (a, (b+0)); +-- no partitions, so fail +insert into range_parted values ('a', 11); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, 11). 
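The crux of the fix above is ordering: FormPartitionKeyDatum() must run before the quick exit for a parent that has no partitions, because the error that ExecFindPartition() raises afterwards passes values/isnull to ExecBuildSlotPartitionKeyDescription(). A condensed sketch of the corrected sequence follows; it only restates the execPartition.c hunk above with explanatory comments, it is not additional code.

    /*
     * Extract the partition key first, so that values[]/isnull[] are always
     * initialized before any error message is built from them.
     */
    ecxt->ecxt_scantuple = slot;
    FormPartitionKeyDatum(parent, slot, estate, values, isnull);

    /* Only now is it safe to give up when the parent has no partitions. */
    if (partdesc->nparts == 0)
    {
        result = -1;            /* reported as "no partition found" below */
        break;
    }

    cur_index = get_partition_for_tuple(rel, values, isnull);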
create table part1 partition of range_parted for values from ('a', 1) to ('a', 10); create table part2 partition of range_parted for values from ('a', 10) to ('a', 20); create table part3 partition of range_parted for values from ('b', 1) to ('b', 10); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index e276954e..21d04de1 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -90,6 +90,10 @@ create table range_parted ( a text, b int ) partition by range (a, (b+0)); + +-- no partitions, so fail +insert into range_parted values ('a', 11); + create table part1 partition of range_parted for values from ('a', 1) to ('a', 10); create table part2 partition of range_parted for values from ('a', 10) to ('a', 20); create table part3 partition of range_parted for values from ('b', 1) to ('b', 10); From 7fb1e744f91ba38be271464b7029d8af9026176f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 10:58:08 -0500 Subject: [PATCH 215/578] Try to exclude partitioned tables in toto. Ashutosh Bapat, reviewed by Jeevan Chalke. Comment by me. Discussion: http://postgr.es/m/CAFjFpRcuRaydz88CY_aQekmuvmN2A9ax5z0k=ppT+s8KS8xMRA@mail.gmail.com --- src/backend/optimizer/util/plancat.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index fc680b63..dba8d09d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1564,8 +1564,18 @@ relation_excluded_by_constraints(PlannerInfo *root, if (predicate_refuted_by(safe_restrictions, safe_restrictions, false)) return true; - /* Only plain relations have constraints */ - if (rte->rtekind != RTE_RELATION || rte->inh) + /* + * Only plain relations have constraints. In a partitioning hierarchy, + * but not with regular table inheritance, it's OK to assume that any + * constraints that hold for the parent also hold for every child; for + * instance, table inheritance allows the parent to have constraints + * marked NO INHERIT, but table partitioning does not. We choose to check + * whether the partitioning parents can be excluded here; doing so + * consumes some cycles, but potentially saves us the work of excluding + * each child individually. + */ + if (rte->rtekind != RTE_RELATION || + (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE)) return false; /* From 10aba7a168d0338278f20948149d8d45f15ad5c8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:48:02 +0800 Subject: [PATCH 216/578] Re-allow INSERT .. 
ON CONFLICT DO NOTHING on partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 13 +++++++++---- src/backend/commands/copy.c | 3 ++- src/backend/executor/execPartition.c | 15 ++++++++++----- src/backend/executor/nodeModifyTable.c | 3 ++- src/backend/parser/analyze.c | 8 -------- src/include/executor/execPartition.h | 3 ++- src/test/regress/expected/insert_conflict.out | 13 +++++++++++++ src/test/regress/sql/insert_conflict.sql | 13 +++++++++++++ 8 files changed, 51 insertions(+), 20 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index a65a130f..168c5f54 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3558,10 +3558,15 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 Using the ON CONFLICT clause with partitioned tables - will cause an error, because unique or exclusion constraints can only be - created on individual partitions. There is no support for enforcing - uniqueness (or an exclusion constraint) across an entire partitioning - hierarchy. + will cause an error if the conflict target is specified (see + for more details on how the clause + works). Therefore, it is not possible to specify + DO UPDATE as the alternative action, because + specifying the conflict target is mandatory in that case. On the other + hand, specifying DO NOTHING as the alternative action + works fine provided the conflict target is not specified. In that case, + unique constraints (or exclusion constraints) of the individual leaf + partitions are considered. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e376f863..d0f69503 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1643,7 +1643,8 @@ BeginCopy(ParseState *pstate, int num_parted, num_partitions; - ExecSetupPartitionTupleRouting(rel, + ExecSetupPartitionTupleRouting(NULL, + rel, 1, &partition_dispatch_info, &partitions, diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 537d8986..08a27d71 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -63,7 +63,8 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * RowExclusiveLock mode upon return from this function. */ void -ExecSetupPartitionTupleRouting(Relation rel, +ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, EState *estate, PartitionDispatch **pd, @@ -133,13 +134,17 @@ ExecSetupPartitionTupleRouting(Relation rel, CheckValidResultRel(leaf_part_rri, CMD_INSERT); /* - * Open partition indices (remember we do not support ON CONFLICT in - * case of partitioned tables, so we do not need support information - * for speculative insertion) + * Open partition indices. The user may have asked to check for + * conflicts within this leaf partition and do "nothing" instead of + * throwing an error. Be prepared in that case by initializing the + * index information needed by ExecInsert() to perform speculative + * insertions. 
*/ if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, false); + ExecOpenIndices(leaf_part_rri, + mtstate != NULL && + mtstate->mt_onconflict != ONCONFLICT_NONE); estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 660bfd4b..749c320e 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2704,7 +2704,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) int num_parted, num_partitions; - ExecSetupPartitionTupleRouting(rel, + ExecSetupPartitionTupleRouting(mtstate, + rel, node->nominalRelation, &partition_dispatch_info, &partitions, diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index fb5e27f1..62db0557 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1307,16 +1307,8 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) - { - /* Bail out if target relation is partitioned table */ - if (pstate->p_target_rangetblentry->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ON CONFLICT clause is not supported with partitioned tables"))); - qry->onConflict = transformOnConflictClause(pstate, stmt->onConflictClause); - } /* * If we have a RETURNING clause, we need to add the target relation to diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 64e5aab4..703ff4f7 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -49,7 +49,8 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; -extern void ExecSetupPartitionTupleRouting(Relation rel, +extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, EState *estate, PartitionDispatch **pd, diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index d316b344..f10974de 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -814,3 +814,16 @@ select * from selfconflict; (3 rows) drop table selfconflict; +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +drop table parted_conflict_test; diff --git a/src/test/regress/sql/insert_conflict.sql b/src/test/regress/sql/insert_conflict.sql index 58518bf2..92dfdd85 100644 --- a/src/test/regress/sql/insert_conflict.sql +++ b/src/test/regress/sql/insert_conflict.sql @@ -475,3 +475,16 @@ commit; select * from selfconflict order by 1; 
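On the executor side, the change above reduces to one decision per leaf partition: open the partition's indexes with speculative-insertion support only when the statement actually carries an ON CONFLICT clause. A condensed sketch of that gate, using the mtstate and ONCONFLICT_NONE symbols from the hunks above (the local variable is introduced here purely for readability; the committed code writes the condition inline):

    /*
     * mtstate is NULL when tuple routing is set up for COPY, which never
     * uses ON CONFLICT, so COPY keeps skipping speculative-insertion
     * support for leaf-partition indexes.
     */
    bool    speculative = (mtstate != NULL &&
                           mtstate->mt_onconflict != ONCONFLICT_NONE);

    if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex &&
        leaf_part_rri->ri_IndexRelationDescs == NULL)
        ExecOpenIndices(leaf_part_rri, speculative);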
drop table selfconflict; + +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +drop table parted_conflict_test; From b659c7636b4ec937749f00fb8131427e456e035c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 13:52:59 -0500 Subject: [PATCH 217/578] Minor code beautification in partition_bounds_equal. Use get_greatest_modulus more consistently, instead of doing the same thing in an ad-hoc manner in this one place. Ashutosh Bapat Discussion: http://postgr.es/m/CAFjFpReT9L4RCiJBKOyWC2=i02kv9uG2fx=4Fv7kFY2t0SPCgw@mail.gmail.com --- src/backend/catalog/partition.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 544e3365..c5550d1e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -784,15 +784,13 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_HASH) { - int greatest_modulus; + int greatest_modulus = get_greatest_modulus(b1); /* * If two hash partitioned tables have different greatest moduli, - * their partition schemes don't match. For hash partitioned table, - * the greatest modulus is given by the last datum and number of - * partitions is given by ndatums. + * their partition schemes don't match. */ - if (b1->datums[b1->ndatums - 1][0] != b2->datums[b2->ndatums - 1][0]) + if (greatest_modulus != get_greatest_modulus(b2)) return false; /* @@ -806,7 +804,6 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, * their indexes array will be same. So, it suffices to compare * indexes array. */ - greatest_modulus = get_greatest_modulus(b1); for (i = 0; i < greatest_modulus; i++) if (b1->indexes[i] != b2->indexes[i]) return false; From c4e1c445a6c4f40ae10680c50bca603e3b26d8c8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 10:36:13 +0800 Subject: [PATCH 218/578] Prohibit identity columns on typed tables and partitions. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/parser/parse_utilcmd.c | 13 +++++++++++++ src/test/regress/expected/identity.out | 12 ++++++++++++ src/test/regress/sql/identity.sql | 16 ++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index e75e6b5e..a5c17b8b 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -137,6 +137,7 @@ typedef struct #endif bool ispartitioned; /* true if table is partitioned */ PartitionBoundSpec *partbound; /* transformed FOR VALUES */ + bool ofType; /* true if statement contains OF typename */ } CreateStmtContext; /* State shared by transformCreateSchemaStmt and its subroutines */ @@ -375,6 +376,8 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) #else cxt.ispartitioned = stmt->partspec != NULL; #endif + cxt.partbound = stmt->partbound; + cxt.ofType = (stmt->ofTypename != NULL); /* * Notice that we allow OIDs here only for plain tables, even though * foreign tables also support them. This is necessary because the @@ -1049,6 +1052,15 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) Type ctype; Oid typeOid; + if (cxt->ofType) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("identity columns are not supported on typed tables"))); + if (cxt->partbound) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("identity columns are not supported on partitions"))); + ctype = typenameType(cxt->pstate, column->typeName, NULL); typeOid = HeapTupleGetOid(ctype); ReleaseSysCache(ctype); @@ -3316,6 +3328,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #endif cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; + cxt.ofType = false; /* * The only subtypes that currently require parse transformation handling
diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out index 7844395d..5dd9909b 100644 --- a/src/test/regress/expected/identity.out +++ b/src/test/regress/expected/identity.out @@ -326,3 +326,15 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; +-- typed tables (currently not supported) +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +ERROR: identity columns are not supported on typed tables +DROP TYPE itest_type CASCADE; +-- table partitions (currently not supported) +CREATE TABLE itest_parent (f1 date NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +ERROR: identity columns are not supported on partitions +DROP TABLE itest_parent;
diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql index 4e19fde2..13d9e4c1 100644 --- a/src/test/regress/sql/identity.sql +++ b/src/test/regress/sql/identity.sql @@ -190,3 +190,19 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; + + +-- typed tables (currently not supported) + +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +DROP TYPE itest_type CASCADE; + + +-- table partitions (currently not supported) + +CREATE TABLE itest_parent (f1 date
NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +DROP TABLE itest_parent; From 7764c73d5add98dc648c949ec2573c2eb32b09ce Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 11 Dec 2017 12:48:40 -0500 Subject: [PATCH 219/578] Improve comment about PartitionBoundInfoData. Ashutosh Bapat, per discussion with Julien Rouhaund, who also reviewed this patch. Discussion: http://postgr.es/m/CAFjFpReBR3ftK9C23LLCZY_TDXhhjB_dgE-L9+mfTnA=gkvdvQ@mail.gmail.com --- src/backend/catalog/partition.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index c5550d1e..8bac934b 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -72,6 +72,13 @@ * of datum-tuples with 2 datums, modulus and remainder, corresponding to a * given partition. * + * The datums in datums array are arranged in increasing order as defined by + * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and + * qsort_partition_hbound_cmp() for range, list and hash partitioned tables + * respectively. For range and list partitions this simply means that the + * datums in the datums array are arranged in increasing order as defined by + * the partition key's operator classes and collations. + * * In the case of list partitioning, the indexes array stores one entry for * every datum, which is the index of the partition that accepts a given datum. * In case of range partitioning, it stores one entry per distinct range From 721fbc782ed010b64281a700b21da8698a9dfd9e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 12 Dec 2017 10:52:15 -0500 Subject: [PATCH 220/578] Remove bug from OPTIMIZER_DEBUG code for partition-wise join. Etsuro Fujita, reviewed by Ashutosh Bapat Discussion: http://postgr.es/m/5A2A60E6.6000008@lab.ntt.co.jp --- src/backend/optimizer/path/allpaths.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index ba5a4418..439e98ed 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3436,7 +3436,7 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) set_cheapest(child_rel); #ifdef OPTIMIZER_DEBUG - debug_print_rel(root, rel); + debug_print_rel(root, child_rel); #endif live_children = lappend(live_children, child_rel); From f7c14be86e6f9801a1638b5332b634e431ef8e11 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 15:21:42 +0800 Subject: [PATCH 221/578] Simplify and encapsulate tuple routing support code. --- src/backend/commands/copy.c | 86 ++++------------ src/backend/executor/execPartition.c | 108 +++++++++++-------- src/backend/executor/nodeModifyTable.c | 137 ++++++++++--------------- src/include/executor/execPartition.h | 47 +++++++-- src/include/nodes/execnodes.h | 9 +- 5 files changed, 182 insertions(+), 205 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index d0f69503..316356d8 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -208,12 +208,9 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? 
*/ List *range_table; - PartitionDispatch *partition_dispatch_info; - int num_dispatch; /* Number of entries in the above array */ - int num_partitions; /* Number of members in the following arrays */ - ResultRelInfo *partitions; /* Per partition result relation */ - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; + /* Tuple-routing support info */ + PartitionTupleRouting *partition_tuple_routing; + TransitionCaptureState *transition_capture; TupleConversionMap **transition_tupconv_maps; @@ -1636,27 +1633,10 @@ BeginCopy(ParseState *pstate, /* Initialize state for CopyFrom tuple routing. */ if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; - - ExecSetupPartitionTupleRouting(NULL, - rel, - 1, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - cstate->partition_dispatch_info = partition_dispatch_info; - cstate->num_dispatch = num_parted; - cstate->partitions = partitions; - cstate->num_partitions = num_partitions; - cstate->partition_tupconv_maps = partition_tupconv_maps; - cstate->partition_tuple_slot = partition_tuple_slot; + PartitionTupleRouting *proute; + + proute = cstate->partition_tuple_routing = + ExecSetupPartitionTupleRouting(NULL, cstate->rel, 1, estate); /* * If we are capturing transition tuples, they may need to be @@ -1669,12 +1649,11 @@ BeginCopy(ParseState *pstate, int i; cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * - cstate->num_partitions); - for (i = 0; i < cstate->num_partitions; ++i) + palloc0(sizeof(TupleConversionMap *) * proute->num_partitions); + for (i = 0; i < proute->num_partitions; ++i) { cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(cstate->partitions[i].ri_RelationDesc), + convert_tuples_by_name(RelationGetDescr(proute->partitions[i]->ri_RelationDesc), RelationGetDescr(rel), gettext_noop("could not convert row type")); } @@ -3184,7 +3163,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || - cstate->partition_dispatch_info != NULL || + cstate->partition_tuple_routing != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -3459,10 +3438,11 @@ CopyFrom(CopyState cstate) ExecStoreTuple(tuple, slot, InvalidBuffer, false); /* Determine the partition to heap_insert the tuple into */ - if (cstate->partition_dispatch_info) + if (cstate->partition_tuple_routing) { int leaf_part_index; TupleConversionMap *map; + PartitionTupleRouting *proute = cstate->partition_tuple_routing; /* * Away we go ... If we end up not finding a partition after all, @@ -3473,11 +3453,11 @@ CopyFrom(CopyState cstate) * partition, respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - cstate->partition_dispatch_info, + proute->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < cstate->num_partitions); + leaf_part_index < proute->num_partitions); /* * If this tuple is mapped to a partition that is not same as the @@ -3495,7 +3475,7 @@ CopyFrom(CopyState cstate) * to the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = cstate->partitions + leaf_part_index; + resultRelInfo = proute->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -3542,7 +3522,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = cstate->partition_tupconv_maps[leaf_part_index]; + map = proute->partition_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -3554,7 +3534,7 @@ CopyFrom(CopyState cstate) * point on. Use a dedicated slot from this point on until * we're finished dealing with the partition. */ - slot = cstate->partition_tuple_slot; + slot = proute->partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -4012,34 +3992,8 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); /* Close all the partitioned tables, leaf partitions, and their indices */ - if (cstate->partition_dispatch_info) - { - int i; - - /* - * Remember cstate->partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is - * the main target table of COPY that will be closed eventually by - * DoCopy(). Also, tupslot is NULL for the root partitioned table. - */ - for (i = 1; i < cstate->num_dispatch; i++) - { - PartitionDispatch pd = cstate->partition_dispatch_info[i]; - - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); - } - for (i = 0; i < cstate->num_partitions; i++) - { - ResultRelInfo *resultRelInfo = cstate->partitions + i; - - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); - } - - /* Release the standalone partition tuple descriptor */ - ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot); - } + if (cstate->partition_tuple_routing) + ExecCleanupTupleRouting(cstate->partition_tuple_routing); /* Close any trigger target relations */ ExecCleanUpTriggerState(estate); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 08a27d71..0f4c8db3 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -38,58 +38,40 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, int maxfieldlen); /* - * ExecSetupPartitionTupleRouting - set up information needed during - * tuple routing for partitioned tables - * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo* objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. 
- * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays + * ExecSetupPartitionTupleRouting - sets up information needed during + * tuple routing for partitioned tables, encapsulates it in + * PartitionTupleRouting, and returns it. * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. */ -void +PartitionTupleRouting * ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) + Relation rel, Index resultRTindex, + EState *estate) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; ResultRelInfo *leaf_part_rri; + PartitionTupleRouting *proute; /* * Get the information about the partition tree after locking all the * partitions. */ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo **) palloc(*num_partitions * - sizeof(ResultRelInfo *)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); + proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute->partition_dispatch_info = + RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, + &leaf_parts); + proute->num_partitions = list_length(leaf_parts); + proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions * + sizeof(ResultRelInfo *)); + proute->partition_tupconv_maps = + (TupleConversionMap **) palloc0(proute->num_partitions * + sizeof(TupleConversionMap *)); /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -97,9 +79,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * (such as ModifyTableState) and released when the node finishes * processing. */ - *partition_tuple_slot = MakeTupleTableSlot(); + proute->partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * + leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions * sizeof(ResultRelInfo)); i = 0; foreach(cell, leaf_parts) @@ -109,8 +91,8 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, /* * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. + * Note that each of the relations in proute->partitions are + * eventually closed by the caller. */ partrel = heap_open(lfirst_oid(cell), NoLock); part_tupdesc = RelationGetDescr(partrel); @@ -119,8 +101,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. 
*/ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); + proute->partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); InitResultRelInfo(leaf_part_rri, partrel, @@ -149,9 +132,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - (*partitions)[i] = leaf_part_rri++; + proute->partitions[i] = leaf_part_rri++; i++; } + + return proute; } /* @@ -272,6 +257,45 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple + * routing. + * + * Close all the partitioned tables, leaf partitions, and their indices. + */ +void +ExecCleanupTupleRouting(PartitionTupleRouting * proute) +{ + int i; + + /* + * Remember, proute->partition_dispatch_info[0] corresponds to the root + * partitioned table, which we must not try to close, because it is the + * main target table of the query that will be closed by callers such as + * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root + * partitioned table. + */ + for (i = 1; i < proute->num_dispatch; i++) + { + PartitionDispatch pd = proute->partition_dispatch_info[i]; + + heap_close(pd->reldesc, NoLock); + ExecDropSingleTupleTableSlot(pd->tupslot); + } + + for (i = 0; i < proute->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = proute->partitions[i]; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + + /* Release the standalone partition tuple descriptor, if any */ + if (proute->partition_tuple_slot) + ExecDropSingleTupleTableSlot(proute->partition_tuple_slot); +} + /* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 749c320e..4c873634 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -318,32 +318,33 @@ ExecInsert(ModifyTableState *mtstate, } #endif /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_dispatch_info) + if (mtstate->mt_partition_tuple_routing) { int leaf_part_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; TupleConversionMap *map; /* * Away we go ... If we end up not finding a partition after all, * ExecFindPartition() does not return and errors out instead. * Otherwise, the returned value is to be used as an index into arrays - * mt_partitions[] and mt_partition_tupconv_maps[] that will get us - * the ResultRelInfo and TupleConversionMap for the partition, + * proute->partitions[] and proute->partition_tupconv_maps[] that will + * get us the ResultRelInfo and TupleConversionMap for the partition, * respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, + proute->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < mtstate->mt_num_partitions); + leaf_part_index < proute->num_partitions); /* * Save the old ResultRelInfo and switch to the one corresponding to * the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = mtstate->mt_partitions + leaf_part_index; + resultRelInfo = proute->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -388,7 +389,7 @@ ExecInsert(ModifyTableState *mtstate, * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; + map = proute->partition_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -400,7 +401,7 @@ ExecInsert(ModifyTableState *mtstate, * on, until we're finished dealing with the partition. Use the * dedicated slot for that. */ - slot = mtstate->mt_partition_tuple_slot; + slot = proute->partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -1834,25 +1835,12 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ if (mtstate->mt_transition_capture != NULL) { - ResultRelInfo *resultRelInfos; int numResultRelInfos; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - /* Find the set of partitions so that we can find their TupleDescs. */ - if (mtstate->mt_partition_dispatch_info != NULL) - { - /* - * For INSERT via partitioned table, so we need TupleDescs based - * on the partition routing table. - */ - resultRelInfos = mtstate->mt_partitions; - numResultRelInfos = mtstate->mt_num_partitions; - } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - resultRelInfos = mtstate->resultRelInfo; - numResultRelInfos = mtstate->mt_nplans; - } + numResultRelInfos = (proute != NULL ? + proute->num_partitions : + mtstate->mt_nplans); /* * Build array of conversion maps from each child's TupleDesc to the @@ -1862,6 +1850,29 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + + /* Choose the right set of partitions */ + if (proute != NULL) + { + /* + * For tuple routing among partitions, we need TupleDescs based + * on the partition routing table. + */ + ResultRelInfo **resultRelInfos = proute->partitions; + + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_transition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), + RelationGetDescr(targetRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + } + } + else + { + /* Otherwise we need the ResultRelInfo for each subplan. 
*/ + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + for (i = 0; i < numResultRelInfos; ++i) { mtstate->mt_transition_tupconv_maps[i] = @@ -1869,6 +1880,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) RelationGetDescr(targetRelInfo->ri_RelationDesc), gettext_noop("could not convert row type")); } + } /* * Install the conversion map for the first plan for UPDATE and DELETE @@ -2470,6 +2482,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ListCell *l; int i; Relation rel; + PartitionTupleRouting *proute = NULL; + int num_partitions = 0; #ifdef __TBASE__ bool remote_dml = false; #endif @@ -2697,27 +2711,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; - + proute = mtstate->mt_partition_tuple_routing = ExecSetupPartitionTupleRouting(mtstate, - rel, - node->nominalRelation, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; - mtstate->mt_partitions = partitions; - mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; - mtstate->mt_partition_tuple_slot = partition_tuple_slot; + rel, node->nominalRelation, + estate); + num_partitions = proute->num_partitions; } /* Build state for collecting transition tuples */ @@ -2777,7 +2775,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * will suffice. This only occurs for the INSERT case; UPDATE/DELETE * cases are handled above. */ - if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0) + if (node->withCheckOptionLists != NIL && num_partitions > 0) { List *wcoList; PlanState *plan; @@ -2794,14 +2792,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->mt_nplans == 1); wcoList = linitial(node->withCheckOptionLists); plan = mtstate->mt_plans[0]; - resultRelInfo = mtstate->mt_partitions; - for (i = 0; i < mtstate->mt_num_partitions; i++) + for (i = 0; i < num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; + Relation partrel; List *mapped_wcoList; List *wcoExprs = NIL; ListCell *ll; + resultRelInfo = proute->partitions[i]; + partrel = resultRelInfo->ri_RelationDesc; + /* varno = node->nominalRelation */ mapped_wcoList = map_partition_varattnos(wcoList, node->nominalRelation, @@ -2817,7 +2817,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_WithCheckOptions = mapped_wcoList; resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - resultRelInfo++; } } @@ -2879,13 +2878,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * will suffice. This only occurs for the INSERT case; UPDATE/DELETE * are handled above. 
*/ - resultRelInfo = mtstate->mt_partitions; returningList = linitial(node->returningLists); - for (i = 0; i < mtstate->mt_num_partitions; i++) + for (i = 0; i < num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; + Relation partrel; List *rlist; + resultRelInfo = proute->partitions[i]; + partrel = resultRelInfo->ri_RelationDesc; + /* varno = node->nominalRelation */ rlist = map_partition_varattnos(returningList, node->nominalRelation, @@ -2893,7 +2894,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); - resultRelInfo++; } } else @@ -3227,32 +3227,9 @@ ExecEndModifyTable(ModifyTableState *node) } #endif - /* - * Close all the partitioned tables, leaf partitions, and their indices - * - * Remember node->mt_partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is the - * main target table of the query that will be closed by ExecEndPlan(). - * Also, tupslot is NULL for the root partitioned table. - */ - for (i = 1; i < node->mt_num_dispatch; i++) - { - PartitionDispatch pd = node->mt_partition_dispatch_info[i]; - - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); - } - for (i = 0; i < node->mt_num_partitions; i++) - { - ResultRelInfo *resultRelInfo = node->mt_partitions + i; - - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); - } - - /* Release the standalone partition tuple descriptor, if any */ - if (node->mt_partition_tuple_slot) - ExecDropSingleTupleTableSlot(node->mt_partition_tuple_slot); + /* Close all the partitioned tables, leaf partitions, and their indices */ + if (node->mt_partition_tuple_routing) + ExecCleanupTupleRouting(node->mt_partition_tuple_routing); /* * Free the exprcontext diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 703ff4f7..bea189c5 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -49,18 +49,47 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; -extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); +/*----------------------- + * PartitionTupleRouting - Encapsulates all information required to execute + * tuple-routing between partitions. + * + * partition_dispatch_info Array of PartitionDispatch objects with one + * entry for every partitioned table in the + * partition tree. + * num_dispatch number of partitioned tables in the partition + * tree (= length of partition_dispatch_info[]) + * partitions Array of ResultRelInfo* objects with one entry + * for every leaf partition in the partition tree. 
+ * num_partitions Number of leaf partitions in the partition tree + * (= 'partitions' array length) + * partition_tupconv_maps Array of TupleConversionMap objects with one + * entry for every leaf partition (required to + * convert input tuple based on the root table's + * rowtype to a leaf partition's rowtype after + * tuple routing is done) + * partition_tuple_slot TupleTableSlot to be used to manipulate any + * given leaf partition's rowtype after that + * partition is chosen for insertion by + * tuple-routing. + *----------------------- + */ +typedef struct PartitionTupleRouting +{ + PartitionDispatch *partition_dispatch_info; + int num_dispatch; + ResultRelInfo **partitions; + int num_partitions; + TupleConversionMap **partition_tupconv_maps; + TupleTableSlot *partition_tuple_slot; +} PartitionTupleRouting; + +extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, + EState *estate); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern void ExecCleanupTupleRouting(PartitionTupleRouting *proute); #endif /* EXECPARTITION_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 087b2223..3d7ece62 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1066,15 +1066,8 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */ - struct PartitionDispatchData **mt_partition_dispatch_info; + struct PartitionTupleRouting *mt_partition_tuple_routing; /* Tuple-routing support info */ - int mt_num_dispatch; /* Number of entries in the above array */ - int mt_num_partitions; /* Number of members in the following - * arrays */ - ResultRelInfo *mt_partitions; /* Per partition result relation */ - TupleConversionMap **mt_partition_tupconv_maps; - /* Per partition tuple conversion map */ - TupleTableSlot *mt_partition_tuple_slot; struct TransitionCaptureState *mt_transition_capture; /* controls transition table population */ TupleConversionMap **mt_transition_tupconv_maps; From 39aff787446819eb96a5e4f3c7a1858cc3d7ae68 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 4 Jan 2018 16:25:49 -0500 Subject: [PATCH 222/578] Minor preparatory refactoring for UPDATE row movement. Generalize is_partition_attr to has_partition_attrs and make it accessible from outside tablecmds.c. Change map_partition_varattnos to clarify that it can be used for mapping between any two relations in a partitioning hierarchy, not just parent -> child. Amit Khandekar, reviewed by Amit Langote, David Rowley, and me. Some comment changes by me. Discussion: http://postgr.es/m/CAJ3gD9fWfxgKC+PfJZF3hkgAcNOy-LpfPxVYitDEXKHjeieWQQ@mail.gmail.com --- src/backend/catalog/partition.c | 87 ++++++++++++++++++++++++++++---- src/backend/commands/tablecmds.c | 71 +++----------------------- src/include/catalog/partition.h | 6 ++- 3 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 8bac934b..08b58d74 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1480,10 +1480,13 @@ get_qual_from_partbound(Relation rel, Relation parent, /* * map_partition_varattnos - maps varattno of any Vars in expr from the - * parent attno to partition attno. 
+ * attno's of 'from_rel' to the attno's of 'to_rel' partition, each of which + * may be either a leaf partition or a partitioned table, but both of which + * must be from the same partitioning hierarchy. * - * We must allow for cases where physical attnos of a partition can be - * different from the parent's. + * Even though all of the same column names must be present in all relations + * in the hierarchy, and they must also have the same types, the attnos may + * be different. * * If found_whole_row is not NULL, *found_whole_row returns whether a * whole-row variable was found in the input expression. @@ -1493,8 +1496,8 @@ get_qual_from_partbound(Relation rel, Relation parent, * are working on Lists, so it's less messy to do the casts internally. */ List * -map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row) { bool my_found_whole_row = false; @@ -1503,14 +1506,14 @@ map_partition_varattnos(List *expr, int target_varno, { AttrNumber *part_attnos; - part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), - RelationGetDescr(parent), + part_attnos = convert_tuples_by_name_map(RelationGetDescr(to_rel), + RelationGetDescr(from_rel), gettext_noop("could not convert row type")); expr = (List *) map_variable_attnos((Node *) expr, - target_varno, 0, + fromrel_varno, 0, part_attnos, - RelationGetDescr(parent)->natts, - RelationGetForm(partrel)->reltype, + RelationGetDescr(from_rel)->natts, + RelationGetForm(to_rel)->reltype, &my_found_whole_row); } @@ -2627,6 +2630,70 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) return part_index; } +/* + * Checks if any of the 'attnums' is a partition key attribute for rel + * + * Sets *used_in_expr if any of the 'attnums' is found to be referenced in some + * partition key expression. It's possible for a column to be both used + * directly and as part of an expression; if that happens, *used_in_expr may + * end up as either true or false. That's OK for current uses of this + * function, because *used_in_expr is only used to tailor the error message + * text. 
+ */ +bool +has_partition_attrs(Relation rel, Bitmapset *attnums, + bool *used_in_expr) +{ + PartitionKey key; + int partnatts; + List *partexprs; + ListCell *partexprs_item; + int i; + + if (attnums == NULL || rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + key = RelationGetPartitionKey(rel); + partnatts = get_partition_natts(key); + partexprs = get_partition_exprs(key); + + partexprs_item = list_head(partexprs); + for (i = 0; i < partnatts; i++) + { + AttrNumber partattno = get_partition_col_attnum(key, i); + + if (partattno != 0) + { + if (bms_is_member(partattno - FirstLowInvalidHeapAttributeNumber, + attnums)) + { + if (used_in_expr) + *used_in_expr = false; + return true; + } + } + else + { + /* Arbitrary expression */ + Node *expr = (Node *) lfirst(partexprs_item); + Bitmapset *expr_attrs = NULL; + + /* Find all attributes referenced */ + pull_varattnos(expr, 1, &expr_attrs); + partexprs_item = lnext(partexprs_item); + + if (bms_overlap(attnums, expr_attrs)) + { + if (used_in_expr) + *used_in_expr = true; + return true; + } + } + } + + return false; +} + /* * qsort_partition_hbound_cmp * diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 74b82ebf..536b8661 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -532,7 +532,6 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, void *arg); static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, void *arg); -static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); @@ -8128,68 +8127,6 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing, cmd->subtype = AT_DropColumnRecurse; } -/* - * Checks if attnum is a partition attribute for rel - * - * Sets *used_in_expr if attnum is found to be referenced in some partition - * key expression. It's possible for a column to be both used directly and - * as part of an expression; if that happens, *used_in_expr may end up as - * either true or false. That's OK for current uses of this function, because - * *used_in_expr is only used to tailor the error message text. 
- */ -static bool -is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr) -{// #lizard forgives - PartitionKey key; - int partnatts; - List *partexprs; - ListCell *partexprs_item; - int i; - - if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - return false; - - key = RelationGetPartitionKey(rel); - partnatts = get_partition_natts(key); - partexprs = get_partition_exprs(key); - - partexprs_item = list_head(partexprs); - for (i = 0; i < partnatts; i++) - { - AttrNumber partattno = get_partition_col_attnum(key, i); - - if (partattno != 0) - { - if (attnum == partattno) - { - if (used_in_expr) - *used_in_expr = false; - return true; - } - } - else - { - /* Arbitrary expression */ - Node *expr = (Node *) lfirst(partexprs_item); - Bitmapset *expr_attrs = NULL; - - /* Find all attributes referenced */ - pull_varattnos(expr, 1, &expr_attrs); - partexprs_item = lnext(partexprs_item); - - if (bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, - expr_attrs)) - { - if (used_in_expr) - *used_in_expr = true; - return true; - } - } - } - - return false; -} - /* * Return value is the address of the dropped column. */ @@ -8250,7 +8187,9 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, colName))); /* Don't drop columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, @@ -10699,7 +10638,9 @@ ATPrepAlterColumnType(List **wqueue, colName))); /* Don't alter columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 295e9d22..3d8b08ba 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -54,11 +54,13 @@ extern void check_new_partition_bound(char *relname, Relation parent, extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +extern List *map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); +extern bool has_partition_attrs(Relation rel, Bitmapset *attnums, + bool *used_in_expr); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); From abe8c72d16cc3a352fa34825d0ab6d719717f373 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 16:11:19 +0800 Subject: [PATCH 223/578] Factor error generation out of ExecPartitionCheck.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 2 +- src/backend/executor/execMain.c | 50 ++++++++++++++++---------- src/backend/executor/execPartition.c | 5 +-- src/backend/executor/execReplication.c | 4 +-- src/backend/executor/nodeModifyTable.c | 4 +-- src/include/executor/executor.h | 5 ++- 6 files changed, 44 insertions(+), 26 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 316356d8..ab6834e0 100644 --- a/src/backend/commands/copy.c +++ 
b/src/backend/commands/copy.c @@ -3585,7 +3585,7 @@ CopyFrom(CopyState cstate) /* Check the constraints of the tuple */ if (cstate->rel->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index a4978497..63de1a27 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2210,16 +2210,12 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, * ExecPartitionCheck --- check that tuple meets the partition constraint. * * Exported in executor.h for outside use. + * Returns true if it meets the partition constraint, else returns false. */ -void +bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { - Relation rel = resultRelInfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(rel); - Bitmapset *modifiedCols; - Bitmapset *insertedCols; - Bitmapset *updatedCols; ExprContext *econtext; /* @@ -2247,12 +2243,29 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. */ - if (!ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext)) + return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); +} + +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) { - char *val_desc; + Relation rel = resultRelInfo->ri_RelationDesc; Relation orig_rel = rel; - - /* See the comment above. */ + TupleDesc tupdesc = RelationGetDescr(rel); + char *val_desc; + Bitmapset *modifiedCols; + Bitmapset *insertedCols; + Bitmapset *updatedCols; + /* + * Need to first convert the tuple to the root partitioned table's row + * type. For details, check similar comments in ExecConstraints(). + */ if (resultRelInfo->ri_PartitionRoot) { HeapTuple tuple = ExecFetchSlotTuple(slot); @@ -2266,7 +2279,7 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, gettext_noop("could not convert row type")); if (map != NULL) { - tuple = do_convert_tuple(tuple, map, rel); + tuple = do_convert_tuple(tuple, map); ExecSetSlotDescriptor(slot, tupdesc); ExecStoreTuple(tuple, slot, InvalidBuffer, false); } @@ -2286,13 +2299,12 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, RelationGetRelationName(orig_rel)), val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); } -} /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, as well as - * the partition constraint, if any. + * This checks the traditional NOT NULL and check constraints, and if + * requested, checks the partition constraint. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. 
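Taken together, the new entry points split what used to be a single behaviour into a question and an action: ExecPartitionCheck() now only answers whether the tuple satisfies the partition constraint, and ExecPartitionCheckEmitError() produces the traditional error when the caller wants it. The sketch below shows the resulting caller patterns using the signatures introduced in this patch; it is illustrative only, and the mention of row movement anticipates later patches in this series rather than anything committed here.

    /*
     * A caller that can react to a misrouted tuple checks the boolean
     * itself; emitting the error reproduces the pre-patch behaviour, while
     * a different branch here could instead re-route the tuple, as the
     * UPDATE row-movement work this series builds toward is expected to do.
     */
    if (resultRelInfo->ri_PartitionCheck &&
        !ExecPartitionCheck(resultRelInfo, slot, estate))
        ExecPartitionCheckEmitError(resultRelInfo, slot, estate);

    /*
     * A caller that wants the old all-or-nothing behaviour passes
     * check_partition_constraint = true and lets ExecConstraints() raise
     * the error on its own.
     */
    ExecConstraints(resultRelInfo, slot, estate, true);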
@@ -2300,8 +2312,9 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, */ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate) -{// #lizard forgives + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint) +{ Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); TupleConstr *constr = tupdesc->constr; @@ -2415,8 +2428,9 @@ ExecConstraints(ResultRelInfo *resultRelInfo, } } - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 0f4c8db3..b1dfe5a9 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -167,8 +167,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); /* start with the root partitioned table */ parent = pd[0]; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 837b903f..c0b6f4a0 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -404,7 +404,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -491,7 +491,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4c873634..9eba56d3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -628,7 +628,7 @@ ExecInsert(ModifyTableState *mtstate, /* Check the constraints of the tuple */ if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -1367,7 +1367,7 @@ lreplace:; * tuple-routing is performed here, hence the slot remains unchanged. 
*/ if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 4ea9ef52..a143cd77 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -205,8 +205,11 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); -extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, +extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); From 16b9bc37f1e1abccb9422ad3ad5572ee6792eafb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 17 Jan 2018 14:44:15 -0500 Subject: [PATCH 224/578] Remove useless lookup of root partitioned rel in ExecInitModifyTable(). node->partitioned_rels is only set in UPDATE/DELETE cases, but ExecInitModifyTable only uses its "rel" variable in INSERT cases, so the extra logic to find the root rel is just a waste of complexity and cycles. Etsuro Fujita, reviewed by Amit Langote Discussion: https://postgr.es/m/93cf9816-2f7d-0f67-8ed2-4a4e497a6ab8@lab.ntt.co.jp --- src/backend/executor/nodeModifyTable.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9eba56d3..a7ed98e7 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -47,7 +47,6 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" -#include "parser/parsetree.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "utils/builtins.h" @@ -2694,20 +2693,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; - /* The root table RT index is at the head of the partitioned_rels list */ - if (node->partitioned_rels) - { - Index root_rti; - Oid root_oid; - - root_rti = linitial_int(node->partitioned_rels); - root_oid = getrelid(root_rti, estate->es_range_table); - rel = heap_open(root_oid, NoLock); /* locked by InitPlan */ - } - else - rel = mtstate->resultRelInfo->ri_RelationDesc; - /* Build state for INSERT tuple routing */ + rel = mtstate->resultRelInfo->ri_RelationDesc; if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { @@ -2909,10 +2896,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->ps.ps_ExprContext = NULL; } - /* Close the root partitioned rel if we opened it above. */ - if (rel != mtstate->resultRelInfo->ri_RelationDesc) - heap_close(rel, NoLock); - /* * If needed, Initialize target list, projection and qual for ON CONFLICT * DO UPDATE. From d503d3212ce54a7cc3f725a0ec867cc745b08315 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 17:57:02 +0800 Subject: [PATCH 225/578] Allow UPDATE to move rows between partitions. 
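For illustration (this example is not taken from the original commit message; the table and column names are invented), the user-visible effect is:

    CREATE TABLE range_parted (a int, b int) PARTITION BY RANGE (b);
    CREATE TABLE part_b_1_10  PARTITION OF range_parted FOR VALUES FROM (1) TO (10);
    CREATE TABLE part_b_10_20 PARTITION OF range_parted FOR VALUES FROM (10) TO (20);
    INSERT INTO range_parted VALUES (1, 5);

    -- Previously this failed with a partition constraint violation; now the row
    -- is deleted from part_b_1_10 and re-inserted into part_b_10_20.
    UPDATE range_parted SET b = 15 WHERE a = 1;

    -- If no partition accepts the new key value, the UPDATE still raises an error.
    UPDATE range_parted SET b = 100 WHERE a = 1;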
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 24 +- doc/src/sgml/ref/update.sgml | 13 +- doc/src/sgml/trigger.sgml | 23 + src/backend/commands/copy.c | 40 +- src/backend/commands/trigger.c | 52 +- src/backend/executor/execPartition.c | 241 ++++++++- src/backend/executor/nodeModifyTable.c | 558 +++++++++++++++---- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 3 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/path/allpaths.c | 4 +- src/backend/optimizer/plan/createplan.c | 4 + src/backend/optimizer/plan/planner.c | 19 +- src/backend/optimizer/prep/prepunion.c | 28 +- src/backend/optimizer/util/pathnode.c | 4 + src/include/executor/execPartition.h | 34 +- src/include/nodes/execnodes.h | 4 +- src/include/nodes/plannodes.h | 1 + src/include/nodes/relation.h | 3 + src/include/optimizer/pathnode.h | 1 + src/include/optimizer/planner.h | 3 +- src/test/regress/expected/update.out | 683 ++++++++++++++++++++++-- src/test/regress/sql/update.sql | 458 +++++++++++++++- src/tools/pgindent/typedefs.list | 1 + 25 files changed, 1945 insertions(+), 260 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 168c5f54..cf41e569 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3275,6 +3275,11 @@ VALUES ('Albany', NULL, NULL, 'NY'); foreign table partitions. + + Updating the partition key of a row might cause it to be moved into a + different partition where this row satisfies its partition constraint. + + Example @@ -3572,9 +3577,22 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - An UPDATE that causes a row to move from one partition to - another fails, because the new value of the row fails to satisfy the - implicit partition constraint of the original partition. + When an UPDATE causes a row to move from one + partition to another, there is a chance that another concurrent + UPDATE or DELETE misses this row. + Suppose session 1 is performing an UPDATE on a + partition key, and meanwhile a concurrent session 2 for which this row + is visible performs an UPDATE or + DELETE operation on this row. Session 2 can silently + miss the row if the row is deleted from the partition due to session + 1's activity. In such case, session 2's + UPDATE or DELETE, being unaware of + the row movement thinks that the row has just been deleted and concludes + that there is nothing to be done for this row. In the usual case where + the table is not partitioned, or where there is no row movement, + session 2 would have identified the newly updated row and carried out + the UPDATE/DELETE on this new row + version. diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml index b84fd93a..2fd709b8 100644 --- a/doc/src/sgml/ref/update.sgml +++ b/doc/src/sgml/ref/update.sgml @@ -288,10 +288,15 @@ UPDATE count In the case of a partitioned table, updating a row might cause it to no - longer satisfy the partition constraint. Since there is no provision to - move the row to the partition appropriate to the new value of its - partitioning key, an error will occur in this case. This can also happen - when updating a partition directly. + longer satisfy the partition constraint of the containing partition. In that + case, if there is some other partition in the partition tree for which this + row satisfies its partition constraint, then the row is moved to that + partition. If there is no such partition, an error will occur. 
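To make the concurrency caveat above concrete, here is a sketch of the interleaving being described (hypothetical table and column names, default READ COMMITTED isolation):

    -- Session 1: update the partition key, which moves the row to another partition.
    BEGIN;
    UPDATE parted SET b = 15 WHERE id = 1;

    -- Session 2, concurrently: blocks waiting on session 1's lock on the old row.
    UPDATE parted SET note = 'x' WHERE id = 1;

    -- Session 1:
    COMMIT;

    -- Session 2 resumes, but the old row version has been deleted by the row
    -- movement, so it reports UPDATE 0; the moved row in the other partition is
    -- left unmodified.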
Behind the + scenes, the row movement is actually a DELETE and + INSERT operation. However, there is a possibility that a + concurrent UPDATE or DELETE on the + same row may miss this row. For details see the section + . diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml index ea29a097..44a9a3c9 100644 --- a/doc/src/sgml/trigger.sgml +++ b/doc/src/sgml/trigger.sgml @@ -163,6 +163,29 @@ triggers. + + If an UPDATE on a partitioned table causes a row to move + to another partition, it will be performed as a DELETE + from the original partition followed by an INSERT into + the new partition. In this case, all row-level BEFORE + UPDATE triggers and all row-level + BEFORE DELETE triggers are fired on + the original partition. Then all row-level BEFORE + INSERT triggers are fired on the destination partition. + The possibility of surprising outcomes should be considered when all these + triggers affect the row being moved. As far as AFTER ROW + triggers are concerned, AFTER DELETE + and AFTER INSERT triggers are + applied; but AFTER UPDATE triggers + are not applied because the UPDATE has been converted to + a DELETE and an INSERT. As far as + statement-level triggers are concerned, none of the + DELETE or INSERT triggers are fired, + even if row movement occurs; only the UPDATE triggers + defined on the target table used in the UPDATE statement + will be fired. + + Trigger functions invoked by per-statement triggers should always return NULL. Trigger functions invoked by per-row diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ab6834e0..cf770f46 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -212,7 +212,6 @@ typedef struct CopyStateData PartitionTupleRouting *partition_tuple_routing; TransitionCaptureState *transition_capture; - TupleConversionMap **transition_tupconv_maps; /* * These variables are used to reduce overhead in textual COPY FROM. @@ -1645,19 +1644,7 @@ BeginCopy(ParseState *pstate, * modifies the tuple). */ if (cstate->transition_capture != NULL) - { - int i; - - cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * proute->num_partitions); - for (i = 0; i < proute->num_partitions; ++i) - { - cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(proute->partitions[i]->ri_RelationDesc), - RelationGetDescr(rel), - gettext_noop("could not convert row type")); - } - } + ExecSetupChildParentMapForLeaf(proute); } #ifdef PGXC /* Get copy statement and execution node information */ @@ -3441,7 +3428,6 @@ CopyFrom(CopyState cstate) if (cstate->partition_tuple_routing) { int leaf_part_index; - TupleConversionMap *map; PartitionTupleRouting *proute = cstate->partition_tuple_routing; /* @@ -3505,7 +3491,8 @@ CopyFrom(CopyState cstate) */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = - cstate->transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); } else { @@ -3522,23 +3509,10 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = proute->partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map, partrel); - - /* - * We must use the partition's tuple descriptor from this - * point on. Use a dedicated slot from this point on until - * we're finished dealing with the partition. 
- */ - slot = proute->partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple, + proute->partition_tuple_slot, + &slot); tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 6412550b..7a2a05ff 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2994,8 +2994,13 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, { HeapTuple trigtuple; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) + /* + * Note: if the UPDATE is converted into a DELETE+INSERT as part of + * update-partition-key operation, then this function is also called + * separately for DELETE and INSERT to capture transition table rows. + * In such case, either old tuple or new tuple can be NULL. + */ + if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) trigtuple = GetTupleForTrigger(estate, NULL, relinfo, @@ -5559,7 +5564,12 @@ AfterTriggerPendingOnRel(Oid relid) * triggers actually need to be queued. It is also called after each row, * even if there are no triggers for that event, if there are any AFTER * STATEMENT triggers for the statement which use transition tables, so that - * the transition tuplestores can be built. + * the transition tuplestores can be built. Furthermore, if the transition + * capture is happening for UPDATEd rows being moved to another partition due + * to the partition-key being changed, then this function is called once when + * the row is deleted (to capture OLD row), and once when the row is inserted + * into another partition (to capture NEW row). This is done separately because + * DELETE and INSERT happen on different tables. * * Transition tuplestores are built now, rather than when events are pulled * off of the queue because AFTER ROW triggers are allowed to select from the @@ -5612,12 +5622,25 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, bool update_new_table = transition_capture->tcs_update_new_table; bool insert_new_table = transition_capture->tcs_insert_new_table;; - if ((event == TRIGGER_EVENT_DELETE && delete_old_table) || - (event == TRIGGER_EVENT_UPDATE && update_old_table)) + /* + * For INSERT events newtup should be non-NULL, for DELETE events + * oldtup should be non-NULL, whereas for UPDATE events normally both + * oldtup and newtup are non-NULL. But for UPDATE events fired for + * capturing transition tuples during UPDATE partition-key row + * movement, oldtup is NULL when the event is for a row being inserted, + * whereas newtup is NULL when the event is for a row being deleted. 
+ */ + Assert(!(event == TRIGGER_EVENT_DELETE && delete_old_table && + oldtup == NULL)); + Assert(!(event == TRIGGER_EVENT_INSERT && insert_new_table && + newtup == NULL)); + + if (oldtup != NULL && + ((event == TRIGGER_EVENT_DELETE && delete_old_table) || + (event == TRIGGER_EVENT_UPDATE && update_old_table))) { Tuplestorestate *old_tuplestore; - Assert(oldtup != NULL); old_tuplestore = transition_capture->tcs_old_tuplestore; if (map != NULL) @@ -5630,12 +5653,12 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, else tuplestore_puttuple(old_tuplestore, oldtup); } - if ((event == TRIGGER_EVENT_INSERT && insert_new_table) || - (event == TRIGGER_EVENT_UPDATE && update_new_table)) + if (newtup != NULL && + ((event == TRIGGER_EVENT_INSERT && insert_new_table) || + (event == TRIGGER_EVENT_UPDATE && update_new_table))) { Tuplestorestate *new_tuplestore; - Assert(newtup != NULL); if (event == TRIGGER_EVENT_INSERT) new_tuplestore = transition_capture->tcs_insert_tuplestore; else @@ -5654,11 +5677,18 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tuplestore_puttuple(new_tuplestore, newtup); } - /* If transition tables are the only reason we're here, return. */ + /* + * If transition tables are the only reason we're here, return. As + * mentioned above, we can also be here during update tuple routing in + * presence of transition tables, in which case this function is called + * separately for oldtup and newtup, so we expect exactly one of them + * to be NULL. + */ if (trigdesc == NULL || (event == TRIGGER_EVENT_DELETE && !trigdesc->trig_delete_after_row) || (event == TRIGGER_EVENT_INSERT && !trigdesc->trig_insert_after_row) || - (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row)) + (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row) || + (event == TRIGGER_EVENT_UPDATE && ((oldtup == NULL) ^ (newtup == NULL)))) return; } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index b1dfe5a9..a08f308f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -54,7 +54,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_rri; + ResultRelInfo *leaf_part_arr = NULL, + *update_rri = NULL; + int num_update_rri = 0, + update_rri_index = 0; + bool is_update = false; PartitionTupleRouting *proute; /* @@ -69,10 +73,38 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, proute->num_partitions = list_length(leaf_parts); proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions * sizeof(ResultRelInfo *)); - proute->partition_tupconv_maps = + proute->parent_child_tupconv_maps = (TupleConversionMap **) palloc0(proute->num_partitions * sizeof(TupleConversionMap *)); + /* Set up details specific to the type of tuple routing we are doing. */ + if (mtstate && mtstate->operation == CMD_UPDATE) + { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + + is_update = true; + update_rri = mtstate->resultRelInfo; + num_update_rri = list_length(node->plans); + proute->subplan_partition_offsets = + palloc(num_update_rri * sizeof(int)); + + /* + * We need an additional tuple slot for storing transient tuples that + * are converted to the root table descriptor. + */ + proute->root_tuple_slot = MakeTupleTableSlot(); + } + else + { + /* + * Since we are inserting tuples, we need to create all new result + * rels. Avoid repeated pallocs by allocating memory for all the + * result rels in bulk. 
+ */ + leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions * + sizeof(ResultRelInfo)); + } + /* * Initialize an empty slot that will be used to manipulate tuples of any * given partition's rowtype. It is attached to the caller-specified node @@ -81,38 +113,86 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, */ proute->partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions * - sizeof(ResultRelInfo)); i = 0; foreach(cell, leaf_parts) { - Relation partrel; + ResultRelInfo *leaf_part_rri; + Relation partrel = NULL; TupleDesc part_tupdesc; + Oid leaf_oid = lfirst_oid(cell); + + if (is_update) + { + /* + * If the leaf partition is already present in the per-subplan + * result rels, we re-use that rather than initialize a new result + * rel. The per-subplan resultrels and the resultrels of the leaf + * partitions are both in the same canonical order. So while going + * through the leaf partition oids, we need to keep track of the + * next per-subplan result rel to be looked for in the leaf + * partition resultrels. + */ + if (update_rri_index < num_update_rri && + RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) + { + leaf_part_rri = &update_rri[update_rri_index]; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required in order to we convert the partition's + * tuple to be compatible with the root partitioned table's + * tuple descriptor. When generating the per-subplan result + * rels, this was not set. + */ + leaf_part_rri->ri_PartitionRoot = rel; + + /* Remember the subplan offset for this ResultRelInfo */ + proute->subplan_partition_offsets[update_rri_index] = i; + + update_rri_index++; + } + else + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + } + else + { + /* For INSERTs, we already have an array of result rels allocated */ + leaf_part_rri = &leaf_part_arr[i]; + } /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in proute->partitions are - * eventually closed by the caller. + * If we didn't open the partition rel, it means we haven't + * initialized the result rel either. */ - partrel = heap_open(lfirst_oid(cell), NoLock); + if (!partrel) + { + /* + * We locked all the partitions above including the leaf + * partitions. Note that each of the newly opened relations in + * proute->partitions are eventually closed by the caller. + */ + partrel = heap_open(leaf_oid, NoLock); + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + } + part_tupdesc = RelationGetDescr(partrel); /* * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. */ - proute->partition_tupconv_maps[i] = + proute->parent_child_tupconv_maps[i] = convert_tuples_by_name(tupDesc, part_tupdesc, gettext_noop("could not convert row type")); - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - /* - * Verify result relation is a valid target for INSERT. + * Verify result relation is a valid target for an INSERT. An UPDATE + * of a partition-key becomes a DELETE+INSERT operation, so this check + * is still required when the operation is CMD_UPDATE. 
*/ CheckValidResultRel(leaf_part_rri, CMD_INSERT); @@ -132,10 +212,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - proute->partitions[i] = leaf_part_rri++; + proute->partitions[i] = leaf_part_rri; i++; } + /* + * For UPDATE, we should have found all the per-subplan resultrels in the + * leaf partitions. + */ + Assert(!is_update || update_rri_index == num_update_rri); + return proute; } @@ -258,6 +344,101 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition + * child-to-root tuple conversion map array. + * + * This map is required for capturing transition tuples when the target table + * is a partitioned table. For a tuple that is routed by an INSERT or UPDATE, + * we need to convert it from the leaf partition to the target table + * descriptor. + */ +void +ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute) +{ + Assert(proute != NULL); + + /* + * These array elements gets filled up with maps on an on-demand basis. + * Initially just set all of them to NULL. + */ + proute->child_parent_tupconv_maps = + (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * + proute->num_partitions); + + /* Same is the case for this array. All the values are set to false */ + proute->child_parent_map_not_required = + (bool *) palloc0(sizeof(bool) * proute->num_partitions); +} + +/* + * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf partition + * index. + */ +TupleConversionMap * +TupConvMapForLeaf(PartitionTupleRouting *proute, + ResultRelInfo *rootRelInfo, int leaf_index) +{ + ResultRelInfo **resultRelInfos = proute->partitions; + TupleConversionMap **map; + TupleDesc tupdesc; + + /* Don't call this if we're not supposed to be using this type of map. */ + Assert(proute->child_parent_tupconv_maps != NULL); + + /* If it's already known that we don't need a map, return NULL. */ + if (proute->child_parent_map_not_required[leaf_index]) + return NULL; + + /* If we've already got a map, return it. */ + map = &proute->child_parent_tupconv_maps[leaf_index]; + if (*map != NULL) + return *map; + + /* No map yet; try to create one. */ + tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc); + *map = + convert_tuples_by_name(tupdesc, + RelationGetDescr(rootRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + + /* If it turns out no map is needed, remember for next time. */ + proute->child_parent_map_not_required[leaf_index] = (*map == NULL); + + return *map; +} + +/* + * ConvertPartitionTupleSlot -- convenience function for tuple conversion. + * The tuple, if converted, is stored in new_slot, and *p_my_slot is + * updated to point to it. new_slot typically should be one of the + * dedicated partition tuple slots. If map is NULL, *p_my_slot is not changed. + * + * Returns the converted tuple, unless map is NULL, in which case original + * tuple is returned unmodified. + */ +HeapTuple +ConvertPartitionTupleSlot(TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_my_slot) +{ + if (!map) + return tuple; + + tuple = do_convert_tuple(tuple, map); + + /* + * Change the partition tuple slot descriptor, as per converted tuple. 
+ */ + *p_my_slot = new_slot; + Assert(new_slot != NULL); + ExecSetSlotDescriptor(new_slot, map->outdesc); + ExecStoreTuple(tuple, new_slot, InvalidBuffer, true); + + return tuple; +} + /* * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple * routing. @@ -265,9 +446,10 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * Close all the partitioned tables, leaf partitions, and their indices. */ void -ExecCleanupTupleRouting(PartitionTupleRouting * proute) +ExecCleanupTupleRouting(PartitionTupleRouting *proute) { int i; + int subplan_index = 0; /* * Remember, proute->partition_dispatch_info[0] corresponds to the root @@ -288,11 +470,30 @@ ExecCleanupTupleRouting(PartitionTupleRouting * proute) { ResultRelInfo *resultRelInfo = proute->partitions[i]; + /* + * If this result rel is one of the UPDATE subplan result rels, let + * ExecEndPlan() close it. For INSERT or COPY, + * proute->subplan_partition_offsets will always be NULL. Note that + * the subplan_partition_offsets array and the partitions array have + * the partitions in the same order. So, while we iterate over + * partitions array, we also iterate over the + * subplan_partition_offsets array in order to figure out which of the + * result rels are present in the UPDATE subplans. + */ + if (proute->subplan_partition_offsets && + proute->subplan_partition_offsets[subplan_index] == i) + { + subplan_index++; + continue; + } + ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); } - /* Release the standalone partition tuple descriptor, if any */ + /* Release the standalone partition tuple descriptors, if any */ + if (proute->root_tuple_slot) + ExecDropSingleTupleTableSlot(proute->root_tuple_slot); if (proute->partition_tuple_slot) ExecDropSingleTupleTableSlot(proute->partition_tuple_slot); } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a7ed98e7..f04ef73d 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -79,6 +79,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); +static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); +static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); +static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); +static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, + int whichplan); /* * Verify that the tuples to be produced by INSERT or UPDATE match the @@ -281,6 +286,7 @@ ExecInsert(ModifyTableState *mtstate, Oid newId; List *recheckIndexes = NIL; TupleTableSlot *result = NULL; + TransitionCaptureState *ar_insert_trig_tcs; #ifdef __TBASE__ bool has_unshippable_trigger = false; int remoterel_index = 0; @@ -321,7 +327,6 @@ ExecInsert(ModifyTableState *mtstate, { int leaf_part_index; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - TupleConversionMap *map; /* * Away we go ... If we end up not finding a partition after all, @@ -370,8 +375,10 @@ ExecInsert(ModifyTableState *mtstate, * back to tuplestore format. 
*/ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); } else { @@ -388,23 +395,10 @@ ExecInsert(ModifyTableState *mtstate, * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = proute->partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map, partrel); - - /* - * We must use the partition's tuple descriptor from this point - * on, until we're finished dealing with the partition. Use the - * dedicated slot for that. - */ - slot = proute->partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple, + proute->partition_tuple_slot, + &slot); } #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ @@ -590,6 +584,8 @@ ExecInsert(ModifyTableState *mtstate, } else { + WCOKind wco_kind; + /* * We always check the partition constraint, including when the tuple * got here via tuple-routing. However we don't need to in the latter @@ -607,14 +603,23 @@ ExecInsert(ModifyTableState *mtstate, tuple->t_tableOid = RelationGetRelid(resultRelationDesc); /* - * Check any RLS INSERT WITH CHECK policies + * Check any RLS WITH CHECK policies. * + * Normally we should check INSERT policies. But if the insert is the + * result of a partition key update that moved the tuple to a new + * partition, we should instead check UPDATE policies, because we are + * executing policies defined on the target table, and not those + * defined on the child partitions. + */ + wco_kind = (mtstate->operation == CMD_UPDATE) ? + WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK; + + /* * ExecWithCheckOptions() will skip any WCOs which are not of the kind * we are looking for at this point. */ if (resultRelInfo->ri_WithCheckOptions != NIL) - ExecWithCheckOptions(WCO_RLS_INSERT_CHECK, - resultRelInfo, slot, estate); + ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); /* * No need though if the tuple has been routed, and a BR trigger @@ -830,9 +835,32 @@ ExecInsert(ModifyTableState *mtstate, setLastTid(&(tuple->t_self)); } + /* + * If this insert is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition NEW TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_insert_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + tuple, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * INSERT trigger fired below doesn't capture it again. 
+ */ + ar_insert_trig_tcs = NULL; + } + /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, - mtstate->mt_transition_capture); + ar_insert_trig_tcs); list_free(recheckIndexes); @@ -888,6 +916,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tupleDeleted, + bool processReturning, bool canSetTag) #else static TupleTableSlot * @@ -897,6 +927,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tupleDeleted, + bool processReturning, bool canSetTag) #endif {// #lizard forgives @@ -910,6 +942,11 @@ ExecDelete(ModifyTableState *mtstate, ModifyTable *mt = (ModifyTable *)mtstate->ps.plan; #endif + TransitionCaptureState *ar_delete_trig_tcs; + + if (tupleDeleted) + *tupleDeleted = false; + /* * get information on the (current) result relation */ @@ -1122,12 +1159,40 @@ ldelete:; if (canSetTag) (estate->es_processed)++; + /* Tell caller that the delete actually happened. */ + if (tupleDeleted) + *tupleDeleted = true; + + /* + * If this delete is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition OLD TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_delete_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * DELETE trigger fired below doesn't capture it again. + */ + ar_delete_trig_tcs = NULL; + } + /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, - mtstate->mt_transition_capture); + ar_delete_trig_tcs); - /* Process RETURNING if present */ - if (resultRelInfo->ri_projectReturning) + /* Process RETURNING if present and if requested */ + if (processReturning && resultRelInfo->ri_projectReturning) { /* * We have to put the target tuple into a slot, which means first we @@ -1220,6 +1285,7 @@ ExecUpdate(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + TupleConversionMap *saved_tcs_map = NULL; #ifdef __TBASE__ int remoterel_index = 0; ModifyTable *mt = (ModifyTable *)mtstate->ps.plan; @@ -1336,6 +1402,7 @@ ExecUpdate(ModifyTableState *mtstate, else { LockTupleMode lockmode; + bool partition_constraint_failed; /* * Constraints might reference the tableoid column, so initialize @@ -1351,22 +1418,143 @@ ExecUpdate(ModifyTableState *mtstate, * (We don't need to redo triggers, however. If there are any BEFORE * triggers then trigger.c will have done heap_lock_tuple to lock the * correct tuple, so there's no need to do them again.) - * - * ExecWithCheckOptions() will skip any WCOs which are not of the kind - * we are looking for at this point. */ lreplace:; - if (resultRelInfo->ri_WithCheckOptions != NIL) + + /* + * If partition constraint fails, this row might get moved to another + * partition, in which case we should check the RLS CHECK policy just + * before inserting into the new partition, rather than doing it here. + * This is because a trigger on that partition might again change the + * row. So skip the WCO checks if the partition constraint fails. 
+ */ + partition_constraint_failed = + resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate); + + if (!partition_constraint_failed && + resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the + * kind we are looking for at this point. + */ + ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK, resultRelInfo, slot, estate); + } + + /* + * If a partition check failed, try to move the row into the right + * partition. + */ + if (partition_constraint_failed) + { + bool tuple_deleted; + TupleTableSlot *ret_slot; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + int map_index; + TupleConversionMap *tupconv_map; + + /* + * When an UPDATE is run on a leaf partition, we will not have + * partition tuple routing set up. In that case, fail with + * partition constraint violation error. + */ + if (proute == NULL) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* + * Row movement, part 1. Delete the tuple, but skip RETURNING + * processing. We want to return rows from INSERT. + */ + ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + &tuple_deleted, false, false); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented + * it, or it was already deleted by self, or it was concurrently + * deleted by another transaction), then we should skip the insert + * as well; otherwise, an UPDATE could cause an increase in the + * total number of rows across all partitions, which is clearly + * wrong. + * + * For a normal UPDATE, the case where the tuple has been the + * subject of a concurrent UPDATE or DELETE would be handled by + * the EvalPlanQual machinery, but for an UPDATE that we've + * translated into a DELETE from this partition and an INSERT into + * some other partition, that's not available, because CTID chains + * can't span relation boundaries. We mimic the semantics to a + * limited extent by skipping the INSERT if the DELETE fails to + * find a tuple. This ensures that two concurrent attempts to + * UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't + * resurrect it. + */ + if (!tuple_deleted) + return NULL; + + /* + * Updates set the transition capture map only when a new subplan + * is chosen. But for inserts, it is set for each row. So after + * INSERT, we need to revert back to the map created for UPDATE; + * otherwise the next UPDATE will incorrectly use the one created + * for INSERT. So first save the one created for UPDATE. + */ + if (mtstate->mt_transition_capture) + saved_tcs_map = mtstate->mt_transition_capture->tcs_map; + + /* + * resultRelInfo is one of the per-subplan resultRelInfos. So we + * should convert the tuple into root's tuple descriptor, since + * ExecInsert() starts the search from root. The tuple conversion + * map list is in the order of mtstate->resultRelInfo[], so to + * retrieve the one for this resultRel, we need to know the + * position of the resultRel in mtstate->resultRelInfo[]. + */ + map_index = resultRelInfo - mtstate->resultRelInfo; + Assert(map_index >= 0 && map_index < mtstate->mt_nplans); + tupconv_map = tupconv_map_for_subplan(mtstate, map_index); + tuple = ConvertPartitionTupleSlot(tupconv_map, + tuple, + proute->root_tuple_slot, + &slot); + + + /* + * For ExecInsert(), make it look like we are inserting into the + * root. 
+ */ + Assert(mtstate->rootResultRelInfo != NULL); + estate->es_result_relation_info = mtstate->rootResultRelInfo; + + ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, + ONCONFLICT_NONE, estate, canSetTag); + + /* + * Revert back the active result relation and the active + * transition capture map that we changed above. + */ + estate->es_result_relation_info = resultRelInfo; + if (mtstate->mt_transition_capture) + { + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = saved_tcs_map; + } + return ret_slot; + } /* * Check the constraints of the tuple. Note that we pass the same * slot for the orig_slot argument, because unlike ExecInsert(), no * tuple-routing is performed here, hence the slot remains unchanged. + * We've already checked the partition constraint above; however, we + * must still ensure the tuple passes all other constraints, so we + * will call ExecConstraints() and have it validate all remaining + * checks. */ - if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate, true); + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate, false); #ifdef _MLS_ if (is_mls_user()) @@ -1763,17 +1951,20 @@ fireBSTriggers(ModifyTableState *node) } /* - * Return the ResultRelInfo for which we will fire AFTER STATEMENT triggers. - * This is also the relation into whose tuple format all captured transition - * tuples must be converted. + * Return the target rel ResultRelInfo. + * + * This relation is the same as : + * - the relation for which we will fire AFTER STATEMENT triggers. + * - the relation into whose tuple format all captured transition tuples must + * be converted. + * - the root partitioned table. */ static ResultRelInfo * -getASTriggerResultRelInfo(ModifyTableState *node) +getTargetResultRelInfo(ModifyTableState *node) { /* - * If the node modifies a partitioned table, we must fire its triggers. - * Note that in that case, node->resultRelInfo points to the first leaf - * partition, not the root table. + * Note that if the node modifies a partitioned table, node->resultRelInfo + * points to the first leaf partition, not the root table. */ if (node->rootResultRelInfo != NULL) return node->rootResultRelInfo; @@ -1787,7 +1978,7 @@ getASTriggerResultRelInfo(ModifyTableState *node) static void fireASTriggers(ModifyTableState *node) { - ResultRelInfo *resultRelInfo = getASTriggerResultRelInfo(node); + ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node); switch (node->operation) { @@ -1820,8 +2011,7 @@ fireASTriggers(ModifyTableState *node) static void ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) { - ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate); - int i; + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); /* Check for transition tables on the directly targeted relation. */ mtstate->mt_transition_capture = @@ -1834,60 +2024,141 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ if (mtstate->mt_transition_capture != NULL) { - int numResultRelInfos; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - numResultRelInfos = (proute != NULL ? - proute->num_partitions : - mtstate->mt_nplans); + ExecSetupChildParentMapForTcs(mtstate); /* - * Build array of conversion maps from each child's TupleDesc to the - * one used in the tuplestore. 
The map pointers may be NULL when no - * conversion is necessary, which is hopefully a common case for - * partitions. + * Install the conversion map for the first plan for UPDATE and DELETE + * operations. It will be advanced each time we switch to the next + * plan. (INSERT operations set it every time, so we need not update + * mtstate->mt_oc_transition_capture here.) */ - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT) + mtstate->mt_transition_capture->tcs_map = + tupconv_map_for_subplan(mtstate, 0); + } +} - /* Choose the right set of partitions */ - if (proute != NULL) +/* + * Initialize the child-to-root tuple conversion map array for UPDATE subplans. + * + * This map array is required to convert the tuple from the subplan result rel + * to the target table descriptor. This requirement arises for two independent + * scenarios: + * 1. For update-tuple-routing. + * 2. For capturing tuples in transition tables. + */ +void +ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) { + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + TupleDesc outdesc; + int numResultRelInfos = mtstate->mt_nplans; + int i; + + /* + * First check if there is already a per-subplan array allocated. Even if + * there is already a per-leaf map array, we won't require a per-subplan + * one, since we will use the subplan offset array to convert the subplan + * index to per-leaf index. + */ + if (mtstate->mt_per_subplan_tupconv_maps || + (mtstate->mt_partition_tuple_routing && + mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps)) + return; + /* - * For tuple routing among partitions, we need TupleDescs based - * on the partition routing table. + * Build array of conversion maps from each child's TupleDesc to the one + * used in the target relation. The map pointers may be NULL when no + * conversion is necessary, which is hopefully a common case. */ - ResultRelInfo **resultRelInfos = proute->partitions; + + /* Get tuple descriptor of the target rel. */ + outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc); + + mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **) + palloc(sizeof(TupleConversionMap *) * numResultRelInfos); for (i = 0; i < numResultRelInfos; ++i) { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), + mtstate->mt_per_subplan_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), + outdesc, gettext_noop("could not convert row type")); } } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; - for (i = 0; i < numResultRelInfos; ++i) +/* + * Initialize the child-to-root tuple conversion map array required for + * capturing transition tuples. + * + * The map array can be indexed either by subplan index or by leaf-partition + * index. For transition tables, we need a subplan-indexed access to the map, + * and where tuple-routing is present, we also require a leaf-indexed access. 
+ */ +static void +ExecSetupChildParentMapForTcs(ModifyTableState *mtstate) + { + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If partition tuple routing is set up, we will require partition-indexed + * access. In that case, create the map array indexed by partition; we + * will still be able to access the maps using a subplan index by + * converting the subplan index to a partition index using + * subplan_partition_offsets. If tuple routing is not set up, it means we + * don't require partition-indexed access. In that case, create just a + * subplan-indexed map. + */ + if (proute) { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); + /* + * If a partition-indexed map array is to be created, the subplan map + * array has to be NULL. If the subplan map array is already created, + * we won't be able to access the map using a partition index. + */ + Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); + + ExecSetupChildParentMapForLeaf(proute); } + else + ExecSetupChildParentMapForSubplan(mtstate); } /* - * Install the conversion map for the first plan for UPDATE and DELETE - * operations. It will be advanced each time we switch to the next - * plan. (INSERT operations set it every time.) + * For a given subplan index, get the tuple conversion map. */ - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + /* + * If a partition-index tuple conversion map array is allocated, we need + * to first get the index into the partition array. Exactly *one* of the + * two arrays is allocated. This is because if there is a partition array + * required, we don't require subplan-indexed array since we can translate + * subplan index into partition index. And, we create a subplan-indexed + * array *only* if partition-indexed array is not required. + */ + if (mtstate->mt_per_subplan_tupconv_maps == NULL) + { + int leaf_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If subplan-indexed array is NULL, things should have been arranged + * to convert the subplan index to partition index. + */ + Assert(proute && proute->subplan_partition_offsets != NULL); + + leaf_index = proute->subplan_partition_offsets[whichplan]; + + return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), + leaf_index); + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } } @@ -2134,9 +2405,8 @@ ExecModifyTable(PlanState *pstate) if (node->mt_transition_capture != NULL) { /* Prepare to convert transition tuples from this child. 
*/ - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -2400,10 +2670,12 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: #ifdef __TBASE__ slot = ExecDelete(node, tupleid, oldtuple, slot, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); #else slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); #endif break; default: @@ -2478,9 +2750,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partColsUpdated; PartitionTupleRouting *proute = NULL; int num_partitions = 0; #ifdef __TBASE__ @@ -2572,6 +2847,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange + * for tuple routing. + */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row && + operation == CMD_UPDATE) + update_tuple_routing_needed = true; + /* Now init the plan for this result rel */ #ifdef __TBASE__ if (resultRelInfo->ispartparent && node->arbiterIndexes) @@ -2693,22 +2978,52 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; - /* Build state for INSERT tuple routing */ - rel = mtstate->resultRelInfo->ri_RelationDesc; - if (operation == CMD_INSERT && - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* Get the target relation */ + rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc; + + /* + * If it's not a partitioned table after all, UPDATE tuple routing should + * not be attempted. + */ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + update_tuple_routing_needed = false; + + /* + * Build state for tuple routing if it's an INSERT or if it's an UPDATE of + * partition key. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + (operation == CMD_INSERT || update_tuple_routing_needed)) { proute = mtstate->mt_partition_tuple_routing = ExecSetupPartitionTupleRouting(mtstate, rel, node->nominalRelation, estate); num_partitions = proute->num_partitions; + + /* + * Below are required as reference objects for mapping partition + * attno's in expressions such as WithCheckOptions and RETURNING. + */ + firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; } /* Build state for collecting transition tuples */ ExecSetupTransitionCaptureState(mtstate, estate); /* + * Construct mapping from each of the per-subplan partition attnos to the + * root attno. This is required when during update row movement the tuple + * descriptor of a source partition does not match the root partitioned + * table descriptor. In such a case we need to convert tuples to the root + * tuple descriptor, because the search for destination partition starts + * from the root. Skip this setup if it's not a partition key update. 
+ */ + if (update_tuple_routing_needed) + ExecSetupChildParentMapForSubplan(mtstate); + + /* * Initialize any WITH CHECK OPTION constraints if needed. */ resultRelInfo = mtstate->resultRelInfo; @@ -2759,26 +3074,29 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build WITH CHECK OPTION constraints for each leaf partition rel. Note * that we didn't build the withCheckOptionList for each partition within * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE row + * movement. DELETEs and local UPDATEs are handled above. */ if (node->withCheckOptionLists != NIL && num_partitions > 0) { - List *wcoList; - PlanState *plan; + List *first_wcoList; /* * In case of INSERT on partitioned tables, there is only one plan. * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. - */ - Assert(operation == CMD_INSERT && + * partition. Whereas for UPDATE, there are as many WCOs as there are + * plans. So in either case, use the WCO expression of the first + * resultRelInfo as a reference to calculate attno's for the WCO + * expression of each of the partitions. We make a copy of the WCO + * qual for each partition. Note that, if there are SubPlans in there, + * they all end up attached to the one parent Plan node. + */ + Assert(update_tuple_routing_needed || + (operation == CMD_INSERT && list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; + mtstate->mt_nplans == 1)); + + first_wcoList = linitial(node->withCheckOptionLists); for (i = 0; i < num_partitions; i++) { Relation partrel; @@ -2787,17 +3105,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ListCell *ll; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have + * WithCheckOptions initialized. + */ + if (resultRelInfo->ri_WithCheckOptions) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); + mapped_wcoList = map_partition_varattnos(first_wcoList, + firstVarno, + partrel, firstResultRel, + NULL); foreach(ll, mapped_wcoList) { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } @@ -2814,7 +3141,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; + List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2862,22 +3189,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build a projection for each leaf partition rel. Note that we * didn't build the returningList for each partition within the * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. + * will suffice. 
This only occurs for the INSERT case or for UPDATE + * row movement. DELETEs and local UPDATEs are handled above. */ - returningList = linitial(node->returningLists); + firstReturningList = linitial(node->returningLists); for (i = 0; i < num_partitions; i++) { Relation partrel; List *rlist; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have a returningList + * built. + */ + if (resultRelInfo->ri_projectReturning) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); + /* + * Use the returning expression of the first resultRelInfo as a + * reference to calculate attno's for the returning expression of + * each of the partitions. + */ + rlist = map_partition_varattnos(firstReturningList, + firstVarno, + partrel, firstResultRel, NULL); resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 1e57fedd..b55431d6 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -246,6 +246,7 @@ _copyModifyTable(const ModifyTable *from) COPY_SCALAR_FIELD(canSetTag); COPY_SCALAR_FIELD(nominalRelation); COPY_NODE_FIELD(partitioned_rels); + COPY_SCALAR_FIELD(partColsUpdated); COPY_NODE_FIELD(resultRelations); COPY_SCALAR_FIELD(resultRelIndex); COPY_SCALAR_FIELD(rootResultRelIndex); @@ -2536,6 +2537,7 @@ _copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) COPY_SCALAR_FIELD(parent_relid); COPY_NODE_FIELD(child_rels); + COPY_SCALAR_FIELD(part_cols_updated); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c05b411c..0e47737a 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -936,6 +936,7 @@ _equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const Partitione { COMPARE_SCALAR_FIELD(parent_relid); COMPARE_NODE_FIELD(child_rels); + COMPARE_SCALAR_FIELD(part_cols_updated); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fb711230..610c2fae 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -661,6 +661,7 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partColsUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_INT_FIELD(resultRelIndex); WRITE_INT_FIELD(rootResultRelIndex); @@ -3408,6 +3409,7 @@ _outModifyTablePath(StringInfo str, const ModifyTablePath *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partColsUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_NODE_FIELD(subpaths); WRITE_NODE_FIELD(subroots); @@ -3859,6 +3861,7 @@ _outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) WRITE_UINT_FIELD(parent_relid); WRITE_NODE_FIELD(child_rels); + WRITE_BOOL_FIELD(part_cols_updated); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 2bdc5067..e0744408 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2381,6 +2381,7 @@ _readModifyTable(void) READ_BOOL_FIELD(canSetTag); READ_UINT_FIELD(nominalRelation); 
READ_NODE_FIELD(partitioned_rels); + READ_BOOL_FIELD(partColsUpdated); READ_NODE_FIELD(resultRelations); READ_INT_FIELD(resultRelIndex); READ_INT_FIELD(rootResultRelIndex); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 439e98ed..9020a606 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1379,7 +1379,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) partitioned_rels = - get_partitioned_child_rels(root, rel->relid); + get_partitioned_child_rels(root, rel->relid, NULL); break; case RTE_SUBQUERY: build_partitioned_rels = true; @@ -1417,7 +1417,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { List *cprels; - cprels = get_partitioned_child_rels(root, childrel->relid); + cprels = get_partitioned_child_rels(root, childrel->relid, NULL); partitioned_rels = list_concat(partitioned_rels, list_copy(cprels)); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 45880a2f..bf38bafc 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -329,6 +329,7 @@ static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam); @@ -2972,6 +2973,7 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) best_path->canSetTag, best_path->nominalRelation, best_path->partitioned_rels, + best_path->partColsUpdated, best_path->resultRelations, subplans, best_path->withCheckOptionLists, @@ -8471,6 +8473,7 @@ static ModifyTable * make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam) @@ -8500,6 +8503,7 @@ make_modifytable(PlannerInfo *root, node->canSetTag = canSetTag; node->nominalRelation = nominalRelation; node->partitioned_rels = partitioned_rels; + node->partColsUpdated = partColsUpdated; node->resultRelations = resultRelations; node->resultRelIndex = -1; /* will be set correctly in setrefs.c */ node->rootResultRelIndex = -1; /* will be set correctly in setrefs.c */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 498b1d5e..b22d678d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1235,6 +1235,7 @@ inheritance_planner(PlannerInfo *root) Query *parent_parse; Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); PlannerInfo **parent_roots = NULL; + bool partColsUpdated = false; Assert(parse->commandType != CMD_INSERT); @@ -1306,7 +1307,8 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { nominalRelation = top_parentRTindex; - partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex); + partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex, + &partColsUpdated); /* The root partitioned table is included as a child rel */ Assert(list_length(partitioned_rels) >= 1); } @@ -1680,6 +1682,7 @@ 
inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, + partColsUpdated, resultRelations, subpaths, subroots, @@ -2354,6 +2357,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, parse->canSetTag, parse->resultRelation, NIL, + false, list_make1_int(parse->resultRelation), list_make1(path), list_make1(root), @@ -7840,17 +7844,24 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, /* * get_partitioned_child_rels * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. + * with rti as the root parent RT index. Also sets + * *part_cols_updated to true if any of the root rte's updated + * columns is used in the partition key either of the relation whose RTI + * is specified or of any child relation. * * Note: This function might get called even for range table entries that * are not partitioned tables; in such a case, it will simply return NIL. */ List * -get_partitioned_child_rels(PlannerInfo *root, Index rti) +get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated) { List *result = NIL; ListCell *l; + if (part_cols_updated) + *part_cols_updated = false; + foreach(l, root->pcinfo_list) { PartitionedChildRelInfo *pc = lfirst(l); @@ -7858,6 +7869,8 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) if (pc->parent_relid == rti) { result = pc->child_rels; + if (part_cols_updated) + *part_cols_updated = pc->part_cols_updated; break; } } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 1fe5a341..c40a38ee 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,7 +105,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels); + List **appinfos, List **partitioned_child_rels, + bool *part_cols_updated); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, @@ -1543,16 +1544,19 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) if (RelationGetPartitionDesc(oldrelation) != NULL) { List *partitioned_child_rels = NIL; + bool part_cols_updated = false; Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); /* * If this table has partitions, recursively expand them in the order - * in which they appear in the PartitionDesc. + * in which they appear in the PartitionDesc. While at it, also + * extract the partition key columns of all the partitioned tables. 
*/ expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, lockmode, &root->append_rel_list, - &partitioned_child_rels); + &partitioned_child_rels, + &part_cols_updated); /* * We keep a list of objects in root, each of which maps a root @@ -1569,6 +1573,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) pcinfo = makeNode(PartitionedChildRelInfo); pcinfo->parent_relid = rti; pcinfo->child_rels = partitioned_child_rels; + pcinfo->part_cols_updated = part_cols_updated; root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } } @@ -1645,7 +1650,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels) + List **appinfos, List **partitioned_child_rels, + bool *part_cols_updated) { int i; RangeTblEntry *childrte; @@ -1660,6 +1666,17 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->inh); + /* + * Note down whether any partition key cols are being updated. Though it's + * the root partitioned table's updatedCols we are interested in, we + * instead use parentrte to get the updatedCols. This is convenient because + * parentrte already has the root partrel's updatedCols translated to match + * the attribute ordering of parentrel. + */ + if (!*part_cols_updated) + *part_cols_updated = + has_partition_attrs(parentrel, parentrte->updatedCols, NULL); + /* First expand the partitioned table itself. */ expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, top_parentrc, parentrel, @@ -1699,7 +1716,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) expand_partitioned_rtentry(root, childrte, childRTindex, childrel, top_parentrc, lockmode, - appinfos, partitioned_child_rels); + appinfos, partitioned_child_rels, + part_cols_updated); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c2d27db7..0a6735d1 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6650,6 +6650,8 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, * 'partitioned_rels' is an integer list of RT indexes of non-leaf tables in * the partition tree, if this is an UPDATE/DELETE to a partitioned table. * Otherwise NIL. + * 'partColsUpdated' is true if any partitioning columns are being updated, + * either from the target relation or a descendent partitioned table. 
* 'resultRelations' is an integer list of actual RT indexes of target rel(s) * 'subpaths' is a list of Path(s) producing source data (one per rel) * 'subroots' is a list of PlannerInfo structs (one per rel) @@ -6663,6 +6665,7 @@ ModifyTablePath * create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, @@ -6730,6 +6733,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, pathnode->canSetTag = canSetTag; pathnode->nominalRelation = nominalRelation; pathnode->partitioned_rels = list_copy(partitioned_rels); + pathnode->partColsUpdated = partColsUpdated; pathnode->resultRelations = resultRelations; pathnode->subpaths = subpaths; pathnode->subroots = subroots; diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index bea189c5..45acfa92 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -62,11 +62,24 @@ typedef struct PartitionDispatchData *PartitionDispatch; * for every leaf partition in the partition tree. * num_partitions Number of leaf partitions in the partition tree * (= 'partitions' array length) - * partition_tupconv_maps Array of TupleConversionMap objects with one + * parent_child_tupconv_maps Array of TupleConversionMap objects with one * entry for every leaf partition (required to - * convert input tuple based on the root table's - * rowtype to a leaf partition's rowtype after - * tuple routing is done) + * convert tuple from the root table's rowtype to + * a leaf partition's rowtype after tuple routing + * is done) + * child_parent_tupconv_maps Array of TupleConversionMap objects with one + * entry for every leaf partition (required to + * convert an updated tuple from the leaf + * partition's rowtype to the root table's rowtype + * so that tuple routing can be done) + * child_parent_map_not_required Array of bool. True value means that a map is + * determined to be not required for the given + * partition. False means either we haven't yet + * checked if a map is required, or it was + * determined to be required. + * subplan_partition_offsets Integer array ordered by UPDATE subplans. Each + * element of this array has the index into the + * corresponding partition in partitions array. 
* partition_tuple_slot TupleTableSlot to be used to manipulate any * given leaf partition's rowtype after that * partition is chosen for insertion by @@ -79,8 +92,12 @@ typedef struct PartitionTupleRouting int num_dispatch; ResultRelInfo **partitions; int num_partitions; - TupleConversionMap **partition_tupconv_maps; + TupleConversionMap **parent_child_tupconv_maps; + TupleConversionMap **child_parent_tupconv_maps; + bool *child_parent_map_not_required; + int *subplan_partition_offsets; TupleTableSlot *partition_tuple_slot; + TupleTableSlot *root_tuple_slot; } PartitionTupleRouting; extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, @@ -90,6 +107,13 @@ extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); +extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, + ResultRelInfo *rootRelInfo, int leaf_index); +extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_my_slot); extern void ExecCleanupTupleRouting(PartitionTupleRouting *proute); #endif /* EXECPARTITION_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3d7ece62..74475d60 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1070,8 +1070,8 @@ typedef struct ModifyTableState /* Tuple-routing support info */ struct TransitionCaptureState *mt_transition_capture; /* controls transition table population */ - TupleConversionMap **mt_transition_tupconv_maps; - /* Per plan/partition tuple conversion */ + TupleConversionMap **mt_per_subplan_tupconv_maps; + /* Per plan map for tuple conversion from child to root */ #ifdef __TBASE__ /* used for interval partition */ bool haspartparent; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index ce1f6719..4b3c49d2 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -256,6 +256,7 @@ typedef struct ModifyTable Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partColsUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ int resultRelIndex; /* index of first resultRel in plan's list */ int rootResultRelIndex; /* index of the partitioned table root */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 96258106..e2af7ebc 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1815,6 +1815,7 @@ typedef struct ModifyTablePath Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partColsUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ List *subpaths; /* Path(s) producing source data */ List *subroots; /* per-target-table PlannerInfos */ @@ -2263,6 +2264,8 @@ typedef struct PartitionedChildRelInfo Index parent_relid; List *child_rels; + bool part_cols_updated; /* is the partition key of any of + * the partitioned tables updated? 
*/ } PartitionedChildRelInfo; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index d6e8ffdb..4097e568 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -299,6 +299,7 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 464efbe4..1425e543 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -121,7 +121,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated); extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids); diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 9cdaf10f..ed21a142 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -198,58 +198,668 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( +--------------------------- +-- UPDATE with row movement +--------------------------- +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. +CREATE TABLE range_parted ( a text, - b int -) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); -create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -update range_parted set b = b - 1 where b = 10; -ERROR: new row for relation "part_b_10_b_20" violates partition constraint -DETAIL: Failing row contains (b, 9). + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. 
+CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. +UPDATE part_b_10_b_20 set b = b - 6; +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. +CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | +(6 rows) + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + QUERY PLAN +------------------------------------- + Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_d_1_15 + Update on part_d_15_20 + Update on part_b_20_b_30 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_20_b_30 + Filter: (c > '97'::numeric) +(22 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +ERROR: new row for relation "part_c_100_200" violates partition constraint +DETAIL: Failing row contains (105, 85, null, b, 15). 
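-- [Illustrative sketch; not part of this patch or its regression suite.]
-- Minimal shape of the update-row-movement behaviour exercised above, using
-- hypothetical names (demo_parted, demo_p1, demo_p2):
CREATE TABLE demo_parted (k int, v text) PARTITION BY RANGE (k);
CREATE TABLE demo_p1 PARTITION OF demo_parted FOR VALUES FROM (0) TO (10);
CREATE TABLE demo_p2 PARTITION OF demo_parted FOR VALUES FROM (10) TO (20);
INSERT INTO demo_parted VALUES (5, 'x');
-- With this patch, pushing the key past demo_p1's bound is carried out as a
-- DELETE from demo_p1 plus an INSERT routed into demo_p2 through the root,
-- instead of failing with a partition constraint violation.
UPDATE demo_parted SET k = 15 WHERE k = 5;
SELECT tableoid::regclass, * FROM demo_parted;  -- the row now lives in demo_p2
DROP TABLE demo_parted;
-- As the failing UPDATE on part_c_100_200 just above shows, movement is still
-- confined to the partition subtree of the table actually named in the UPDATE.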
+-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +ERROR: new row for relation "part_c_1_100" violates partition constraint +DETAIL: Failing row contains (null, 1, 96, 12, a). +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; + c | b | a +-----+----+--- + 116 | 12 | b + 117 | 13 | b + 125 | 15 | b + 125 | 17 | b +(4 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 13 | 117 | 2 | 2 + part_d_1_15 | b | 15 | 125 | 6 | 6 + part_d_1_15 | b | 17 | 125 | 9 | 9 +(6 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +ERROR: new row for relation "part_d_1_15" violates partition constraint +DETAIL: Failing row contains (2, 117, 2, b, 7). +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + a | ?column? +---+---------- + a | 204 + b | 124 + b | 134 + b | 136 +(4 rows) + +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_a_1_a_10 | a | 4 | 200 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 +(6 rows) + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (a, 4, 120, 1, 1). +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (b, 15, 120, 1, 1). +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 15 | 199 | 1 | 1 +(6 rows) + +-- cleanup +DROP VIEW upview; +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; + range_parted | a | b | c | d | e +---------------+---+----+----+----+--- + (b,15,95,16,) | b | 15 | 95 | 16 | + (b,17,95,19,) | b | 17 | 95 | 19 | +(2 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 95 | 16 | + part_c_1_100 | b | 17 | 95 | 19 | +(6 rows) + +-- Transition tables with update row movement +:init_range_parted; +CREATE FUNCTION trans_updatetrigfunc() RETURNS trigger LANGUAGE plpgsql AS +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' ORDER BY a) FROM old_table), + (select string_agg(new_table::text, ', ' ORDER BY a) FROM new_table); + return null; + end; +$$; +CREATE TRIGGER trans_updatetrig + AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 13 | 98 | 2 | + part_d_15_20 | b | 15 | 106 | 16 | + part_d_15_20 | b | 17 | 106 | 19 | + part_d_1_15 | b | 12 | 110 | 1 | +(6 rows) + +:init_range_parted; +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +CREATE TRIGGER trans_deletetrig + AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +CREATE TRIGGER trans_inserttrig + AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 15 | 155 | 16 | + part_d_15_20 | b | 17 | 155 | 19 | + part_d_1_15 | b | 12 | 146 | 1 | + part_d_1_15 | b | 13 | 147 | 2 | +(6 rows) + +DROP TRIGGER trans_deletetrig ON range_parted; +DROP TRIGGER trans_inserttrig ON range_parted; +-- Don't drop trans_updatetrig yet. It is required below. +-- Test with transition tuple conversion happening for rows moved into the +-- new partition. This requires a trigger that references transition table +-- (we already have trans_updatetrig). For inserted rows, the conversion +-- is not usually needed, because the original tuple is already compatible with +-- the desired transition tuple format. But conversion happens when there is a +-- BR trigger because the trigger can change the inserted row. 
So install a +-- BR triggers on those child partitions where the rows will be moved. +CREATE FUNCTION func_parted_mod_b() RETURNS trigger AS $$ +BEGIN + NEW.b = NEW.b + 1; + return NEW; +END $$ language plpgsql; +CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +:init_range_parted; +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,110,1,), (b,15,98,2,), (b,17,106,16,), (b,19,106,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 15 | 98 | 2 | + part_d_15_20 | b | 17 | 106 | 16 | + part_d_15_20 | b | 19 | 106 | 19 | + part_d_1_15 | b | 15 | 110 | 1 | +(6 rows) + +:init_range_parted; +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,146,1,), (b,16,147,2,), (b,17,155,16,), (b,19,155,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 17 | 155 | 16 | + part_d_15_20 | b | 19 | 155 | 19 | + part_d_1_15 | b | 15 | 146 | 1 | + part_d_1_15 | b | 16 | 147 | 2 | +(6 rows) + +-- Case where per-partition tuple conversion map array is allocated, but the +-- map is not required for the particular tuple that is routed, thanks to +-- matching table attributes of the partition and the target table. +:init_range_parted; +UPDATE range_parted set b = 15 WHERE b = 1; +NOTICE: trigger = trans_updatetrig, old table = (a,1,1,1,), new table = (a,15,1,1,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_10_a_20 | a | 15 | 1 | 1 | + part_c_1_100 | b | 13 | 96 | 1 | + part_c_1_100 | b | 14 | 97 | 2 | + part_d_15_20 | b | 16 | 105 | 16 | + part_d_15_20 | b | 18 | 105 | 19 | +(6 rows) + +DROP TRIGGER trans_updatetrig ON range_parted; +DROP TRIGGER trig_c1_100 ON part_c_1_100; +DROP TRIGGER trig_d1_15 ON part_d_1_15; +DROP TRIGGER trig_d15_20 ON part_d_15_20; +DROP FUNCTION func_parted_mod_b(); +-- RLS policies with update-row-movement +----------------------------------------- +ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; +CREATE USER regress_range_parted_user; +GRANT ALL ON range_parted, mintab TO regress_range_parted_user; +CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); +CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error while moving row from +-- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. 
+UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy for table "range_parted" +RESET SESSION AUTHORIZATION; +-- Create a trigger on part_d_1_15 +CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ +BEGIN + NEW.c = NEW.c + 1; -- Make even numbers odd, or vice versa + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- Here, RLS checks should succeed while moving row from part_a_10_a_20 to +-- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the +-- trigger at the destination partition again makes it an even number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error. Even though the UPDATE is setting +-- 'c' to an even number, the trigger at the destination partition again makes +-- it an odd number. +UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy for table "range_parted" +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP TRIGGER trig_d_1_15 ON part_d_1_15; +DROP FUNCTION func_d_1_15(); +-- Policy expression contains SubPlan +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_subplan on range_parted + AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK ((SELECT range_parted.c <= c1 FROM mintab)); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, mintab has row with c1 = 120 +UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy "policy_range_parted_subplan" for table "range_parted" -- ok -update range_parted set b = b + 1 where b = 10; +UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; +-- RLS policy expression contains whole row. 
+RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK (range_parted = row('b', 10, 112, 1, NULL)::range_parted); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- ok, should pass the RLS check +UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, the whole row RLS check should fail +UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy "policy_range_parted_wholerow" for table "range_parted" +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP POLICY policy_range_parted ON range_parted; +DROP POLICY policy_range_parted_subplan ON range_parted; +DROP POLICY policy_range_parted_wholerow ON range_parted; +REVOKE ALL ON range_parted, mintab FROM regress_range_parted_user; +DROP USER regress_range_parted_user; +DROP TABLE mintab; +-- statement triggers with update row movement +--------------------------------------------------- +:init_range_parted; +CREATE FUNCTION trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +CREATE TRIGGER parent_delete_trig + AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_update_trig + AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_insert_trig + AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_c_1_100 +CREATE TRIGGER c1_delete_trig + AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_update_trig + AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_insert_trig + AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_1_15 +CREATE TRIGGER d1_delete_trig + AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_update_trig + AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_insert_trig + AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +CREATE TRIGGER d15_delete_trig + AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_update_trig + AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_insert_trig + AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or +-- insert statement triggers should be fired. 
+UPDATE range_parted set c = c - 50 WHERE c > 97; +NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 150 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 55 | 16 | + part_c_1_100 | b | 17 | 55 | 19 | +(6 rows) + +DROP TRIGGER parent_delete_trig ON range_parted; +DROP TRIGGER parent_update_trig ON range_parted; +DROP TRIGGER parent_insert_trig ON range_parted; +DROP TRIGGER c1_delete_trig ON part_c_1_100; +DROP TRIGGER c1_update_trig ON part_c_1_100; +DROP TRIGGER c1_insert_trig ON part_c_1_100; +DROP TRIGGER d1_delete_trig ON part_d_1_15; +DROP TRIGGER d1_update_trig ON part_d_1_15; +DROP TRIGGER d1_insert_trig ON part_d_1_15; +DROP TRIGGER d15_delete_trig ON part_d_15_20; +DROP TRIGGER d15_update_trig ON part_d_15_20; +DROP TRIGGER d15_insert_trig ON part_d_15_20; -- Creating default partition for range +:init_range_parted; create table part_def partition of range_parted default; \d+ part_def - Table "public.part_def" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+----------+--------------+------------- - a | text | | | | extended | | - b | integer | | | | plain | | + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-------------------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | bigint | | | | plain | | + c | numeric | | | | main | | + d | integer | | | | plain | | + e | character varying | | | | extended | | Partition of: range_parted DEFAULT -Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20))))) - +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) + insert into range_parted values ('c', 9); -- ok update part_def set a = 'd' where a = 'c'; -- fail update part_def set a = 'a' where a = 'd'; ERROR: new row for relation "part_def" violates partition constraint -DETAIL: Failing row contains (a, 9). -create table list_parted ( +DETAIL: Failing row contains (a, 9, null, null, null). +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +ERROR: new row for relation "part_a_10_a_20" violates partition constraint +DETAIL: Failing row contains (ad, 10, 200, 1, null). 
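-- [Illustrative sketch; not part of this patch.] The same subtree rule applies
-- to the DEFAULT partition: a row can only be moved into it by updating through
-- a table that has it in its subtree, i.e. the root. Hypothetical names
-- (demo_list, demo_list_a, demo_list_def):
CREATE TABLE demo_list (a text) PARTITION BY LIST (a);
CREATE TABLE demo_list_a PARTITION OF demo_list FOR VALUES IN ('a');
CREATE TABLE demo_list_def PARTITION OF demo_list DEFAULT;
INSERT INTO demo_list VALUES ('a');
UPDATE demo_list_a SET a = 'z';  -- fails: demo_list_def is not in demo_list_a's subtree
UPDATE demo_list SET a = 'z';    -- ok: row is routed from demo_list_a into demo_list_def
DROP TABLE demo_list;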
+-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; + partname | a | b | c | d | e +----------+----+----+-----+----+--- + part_def | ad | 1 | 1 | 1 | + part_def | ad | 10 | 200 | 1 | + part_def | bd | 12 | 96 | 1 | + part_def | bd | 13 | 97 | 2 | + part_def | bd | 15 | 105 | 16 | + part_def | bd | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Cleanup: range_parted no longer needed. +DROP TABLE range_parted; +CREATE TABLE list_parted ( a text, b int -) partition by list (a); -create table list_part1 partition of list_parted for values in ('a', 'b'); -create table list_default partition of list_parted default; -insert into list_part1 values ('a', 1); -insert into list_default values ('d', 10); +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); -- fail -update list_default set a = 'a' where a = 'd'; +UPDATE list_default set a = 'a' WHERE a = 'd'; ERROR: new row for relation "list_default" violates partition constraint DETAIL: Failing row contains (a, 10). -- ok -update list_default set a = 'x' where a = 'd'; +UPDATE list_default set a = 'x' WHERE a = 'd'; +DROP TABLE list_parted; +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); +CREATE TABLE sub_part1(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; +ERROR: new row for relation "sub_part2" violates partition constraint +DETAIL: Failing row contains (2, 10, 2). +-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. 
+SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+---+---- + list_part1 | 2 | 5 | 50 +(1 row) + +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 +(1 row) + +-- Test the case where BR UPDATE triggers change the partition key. +CREATE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + NEW.b = 2; -- This is changing partition key column. + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER parted_mod_b before update on sub_part1 + for each row execute procedure func_parted_mod_b(); +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 60 + sub_part2 | 1 | 2 | 10 +(4 rows) + +-- This should do the tuple routing even though there is no explicit +-- partition-key update, because there is a trigger on sub_part1. +UPDATE list_parted set c = 70 WHERE b = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part2 | 1 | 2 | 10 + sub_part2 | 1 | 2 | 70 +(4 rows) + +DROP TRIGGER parted_mod_b ON sub_part1; +-- If BR DELETE trigger prevented DELETE from happening, we should also skip +-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. +CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + raise notice 'Trigger: Got OLD row %, but returning NULL', OLD; + return NULL; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_skip_delete before delete on sub_part2 + for each row execute procedure func_parted_mod_b(); +UPDATE list_parted set b = 1 WHERE c = 70; +NOTICE: Trigger: Got OLD row (2,70,1), but returning NULL +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part2 | 1 | 2 | 10 + sub_part2 | 1 | 2 | 70 +(4 rows) + +-- Drop the trigger. Now the row should be moved. +DROP TRIGGER trig_skip_delete ON sub_part2; +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 70 + sub_part2 | 1 | 2 | 10 +(4 rows) + +DROP FUNCTION func_parted_mod_b(); +-- UPDATE partition-key with FROM clause. If join produces multiple output +-- rows for the same row to be modified, we should tuple-route the row only +-- once. There should not be any rows inserted. +CREATE TABLE non_parted (id int); +INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); +UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 1 | 70 + list_part1 | 2 | 2 | 10 + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 +(4 rows) + +DROP TABLE non_parted; +-- Cleanup: list_parted no longer needed. 
+DROP TABLE list_parted; -- create custom operator class and hash function, for the same reason -- explained in alter_table.sql create or replace function dummy_hashint4(a int4, seed int8) returns int8 as @@ -271,14 +881,11 @@ insert into hpart4 values (3, 4); update hpart1 set a = 3, b=4 where a = 1; ERROR: new row for relation "hpart1" violates partition constraint DETAIL: Failing row contains (3, 4). +-- ok, row movement update hash_parted set b = b - 1 where b = 1; -ERROR: new row for relation "hpart1" violates partition constraint -DETAIL: Failing row contains (1, 0). -- ok update hash_parted set b = b + 8 where b = 1; -- cleanup -drop table range_parted; -drop table list_parted; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 9d673de4..a4f2f161 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -107,25 +107,336 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( + +--------------------------- +-- UPDATE with row movement +--------------------------- + +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. + +CREATE TABLE range_parted ( a text, - b int -) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); -create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -update range_parted set b = b - 1 where b = 10; +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); + +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. 
+UPDATE part_b_10_b_20 set b = b - 6; + +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. +CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); + +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); + +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); + +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + +-- fail, row movement happens only within the partition subtree. +UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; +:show_data; + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + +:show_data; + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); + +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; + +:show_data; + +-- cleanup +DROP VIEW upview; + +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; +:show_data; + + +-- Transition tables with update row movement +:init_range_parted; + +CREATE FUNCTION trans_updatetrigfunc() RETURNS trigger LANGUAGE plpgsql AS +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' ORDER BY a) FROM old_table), + (select string_agg(new_table::text, ', ' ORDER BY a) FROM new_table); + return null; + end; +$$; + +CREATE TRIGGER trans_updatetrig + AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); + +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; + +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +CREATE TRIGGER trans_deletetrig + AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +CREATE TRIGGER trans_inserttrig + AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +DROP TRIGGER trans_deletetrig ON range_parted; +DROP TRIGGER trans_inserttrig ON range_parted; +-- Don't drop trans_updatetrig yet. It is required below. + +-- Test with transition tuple conversion happening for rows moved into the +-- new partition. This requires a trigger that references transition table +-- (we already have trans_updatetrig). For inserted rows, the conversion +-- is not usually needed, because the original tuple is already compatible with +-- the desired transition tuple format. But conversion happens when there is a +-- BR trigger because the trigger can change the inserted row. So install a +-- BR triggers on those child partitions where the rows will be moved. +CREATE FUNCTION func_parted_mod_b() RETURNS trigger AS $$ +BEGIN + NEW.b = NEW.b + 1; + return NEW; +END $$ language plpgsql; +CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +:init_range_parted; +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +:show_data; + +-- Case where per-partition tuple conversion map array is allocated, but the +-- map is not required for the particular tuple that is routed, thanks to +-- matching table attributes of the partition and the target table. 
+:init_range_parted; +UPDATE range_parted set b = 15 WHERE b = 1; +:show_data; + +DROP TRIGGER trans_updatetrig ON range_parted; +DROP TRIGGER trig_c1_100 ON part_c_1_100; +DROP TRIGGER trig_d1_15 ON part_d_1_15; +DROP TRIGGER trig_d15_20 ON part_d_15_20; +DROP FUNCTION func_parted_mod_b(); + +-- RLS policies with update-row-movement +----------------------------------------- + +ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; +CREATE USER regress_range_parted_user; +GRANT ALL ON range_parted, mintab TO regress_range_parted_user; +CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); +CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); + +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error while moving row from +-- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; + +RESET SESSION AUTHORIZATION; +-- Create a trigger on part_d_1_15 +CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ +BEGIN + NEW.c = NEW.c + 1; -- Make even numbers odd, or vice versa + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); + +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; + +-- Here, RLS checks should succeed while moving row from part_a_10_a_20 to +-- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the +-- trigger at the destination partition again makes it an even number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; + +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error. Even though the UPDATE is setting +-- 'c' to an even number, the trigger at the destination partition again makes +-- it an odd number. +UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; + +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP TRIGGER trig_d_1_15 ON part_d_1_15; +DROP FUNCTION func_d_1_15(); + +-- Policy expression contains SubPlan +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_subplan on range_parted + AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK ((SELECT range_parted.c <= c1 FROM mintab)); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, mintab has row with c1 = 120 +UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; -- ok -update range_parted set b = b + 1 where b = 10; +UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; + +-- RLS policy expression contains whole row. 
+ +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK (range_parted = row('b', 10, 112, 1, NULL)::range_parted); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- ok, should pass the RLS check +UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, the whole row RLS check should fail +UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; + +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP POLICY policy_range_parted ON range_parted; +DROP POLICY policy_range_parted_subplan ON range_parted; +DROP POLICY policy_range_parted_wholerow ON range_parted; +REVOKE ALL ON range_parted, mintab FROM regress_range_parted_user; +DROP USER regress_range_parted_user; +DROP TABLE mintab; + + +-- statement triggers with update row movement +--------------------------------------------------- + +:init_range_parted; + +CREATE FUNCTION trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +CREATE TRIGGER parent_delete_trig + AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_update_trig + AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_insert_trig + AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_c_1_100 +CREATE TRIGGER c1_delete_trig + AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_update_trig + AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_insert_trig + AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_d_1_15 +CREATE TRIGGER d1_delete_trig + AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_update_trig + AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_insert_trig + AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +CREATE TRIGGER d15_delete_trig + AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_update_trig + AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_insert_trig + AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); + +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or +-- insert statement triggers should be fired. 
+UPDATE range_parted set c = c - 50 WHERE c > 97; +:show_data; + +DROP TRIGGER parent_delete_trig ON range_parted; +DROP TRIGGER parent_update_trig ON range_parted; +DROP TRIGGER parent_insert_trig ON range_parted; +DROP TRIGGER c1_delete_trig ON part_c_1_100; +DROP TRIGGER c1_update_trig ON part_c_1_100; +DROP TRIGGER c1_insert_trig ON part_c_1_100; +DROP TRIGGER d1_delete_trig ON part_d_1_15; +DROP TRIGGER d1_update_trig ON part_d_1_15; +DROP TRIGGER d1_insert_trig ON part_d_1_15; +DROP TRIGGER d15_delete_trig ON part_d_15_20; +DROP TRIGGER d15_update_trig ON part_d_15_20; +DROP TRIGGER d15_insert_trig ON part_d_15_20; + -- Creating default partition for range +:init_range_parted; create table part_def partition of range_parted default; \d+ part_def insert into range_parted values ('c', 9); @@ -134,19 +445,119 @@ update part_def set a = 'd' where a = 'c'; -- fail update part_def set a = 'a' where a = 'd'; -create table list_parted ( +:show_data; + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + +-- Cleanup: range_parted no longer needed. +DROP TABLE range_parted; + +CREATE TABLE list_parted ( a text, b int -) partition by list (a); -create table list_part1 partition of list_parted for values in ('a', 'b'); -create table list_default partition of list_parted default; -insert into list_part1 values ('a', 1); -insert into list_default values ('d', 10); +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); -- fail -update list_default set a = 'a' where a = 'd'; +UPDATE list_default set a = 'a' WHERE a = 'd'; -- ok -update list_default set a = 'x' where a = 'd'; +UPDATE list_default set a = 'x' WHERE a = 'd'; + +DROP TABLE list_parted; + +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- + +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); + +CREATE TABLE sub_part1(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); + +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); + +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); + +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; + +-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. 
+SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + + +-- Test the case where BR UPDATE triggers change the partition key. +CREATE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + NEW.b = 2; -- This is changing partition key column. + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER parted_mod_b before update on sub_part1 + for each row execute procedure func_parted_mod_b(); + +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + +-- This should do the tuple routing even though there is no explicit +-- partition-key update, because there is a trigger on sub_part1. +UPDATE list_parted set c = 70 WHERE b = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + +DROP TRIGGER parted_mod_b ON sub_part1; + +-- If BR DELETE trigger prevented DELETE from happening, we should also skip +-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. +CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + raise notice 'Trigger: Got OLD row %, but returning NULL', OLD; + return NULL; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_skip_delete before delete on sub_part2 + for each row execute procedure func_parted_mod_b(); +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +-- Drop the trigger. Now the row should be moved. +DROP TRIGGER trig_skip_delete ON sub_part2; +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +DROP FUNCTION func_parted_mod_b(); + +-- UPDATE partition-key with FROM clause. If join produces multiple output +-- rows for the same row to be modified, we should tuple-route the row only +-- once. There should not be any rows inserted. +CREATE TABLE non_parted (id int); +INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); +UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +DROP TABLE non_parted; + +-- Cleanup: list_parted no longer needed. +DROP TABLE list_parted; -- create custom operator class and hash function, for the same reason -- explained in alter_table.sql @@ -169,13 +580,12 @@ insert into hpart4 values (3, 4); -- fail update hpart1 set a = 3, b=4 where a = 1; +-- ok, row movement update hash_parted set b = b - 1 where b = 1; -- ok update hash_parted set b = b + 8 where b = 1; -- cleanup -drop table range_parted; -drop table list_parted; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ba6ce916..a3cb20f8 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1568,6 +1568,7 @@ PartitionRangeBound PartitionRangeDatum PartitionRangeDatumKind PartitionSpec +PartitionTupleRouting PartitionedChildRelInfo PasswordType Path From 07e28f3b981287f20c2259ae1b7c1182210551ca Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 24 Jan 2018 16:34:51 -0500 Subject: [PATCH 226/578] Avoid referencing off the end of subplan_partition_offsets. Report by buildfarm member skink and Tom Lane. Analysis by me. Patch by Amit Khandekar. 
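The shape of the fix, reduced to a stand-alone sketch: keep the length of the palloc'd offsets array next to the array itself and test it before every lookup. The struct and function names below are invented for illustration and are not the executor code.

    #include <stdbool.h>
    #include <stddef.h>

    /* Sketch only: an index array paired with its length, checked on lookup. */
    typedef struct OffsetsSketch
    {
        int    *offsets;        /* stands in for subplan_partition_offsets */
        int     num_offsets;    /* its length, recorded when it is allocated */
    } OffsetsSketch;

    static bool
    offsets_match(const OffsetsSketch *s, int subplan_index, int partition_index)
    {
        /* Bounds check first, so we never read past the end of the array. */
        return s->offsets != NULL &&
               subplan_index < s->num_offsets &&
               s->offsets[subplan_index] == partition_index;
    }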
Discussion: http://postgr.es/m/CAJ3gD9fVA1iXQYhfqHP5n_TEd4U9=V8TL_cc-oKRnRmxgdvJrQ@mail.gmail.com --- src/backend/executor/execPartition.c | 2 ++ src/backend/executor/nodeModifyTable.c | 3 ++- src/include/executor/execPartition.h | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index a08f308f..e312167f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -87,6 +87,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, num_update_rri = list_length(node->plans); proute->subplan_partition_offsets = palloc(num_update_rri * sizeof(int)); + proute->num_subplan_partition_offsets = num_update_rri; /* * We need an additional tuple slot for storing transient tuples that @@ -481,6 +482,7 @@ ExecCleanupTupleRouting(PartitionTupleRouting *proute) * result rels are present in the UPDATE subplans. */ if (proute->subplan_partition_offsets && + subplan_index < proute->num_subplan_partition_offsets && proute->subplan_partition_offsets[subplan_index] == i) { subplan_index++; diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index f04ef73d..003ff4b8 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2148,7 +2148,8 @@ tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) * If subplan-indexed array is NULL, things should have been arranged * to convert the subplan index to partition index. */ - Assert(proute && proute->subplan_partition_offsets != NULL); + Assert(proute && proute->subplan_partition_offsets != NULL && + whichplan < proute->num_subplan_partition_offsets); leaf_index = proute->subplan_partition_offsets[whichplan]; diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 45acfa92..4e0bdc35 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -80,6 +80,7 @@ typedef struct PartitionDispatchData *PartitionDispatch; * subplan_partition_offsets Integer array ordered by UPDATE subplans. Each * element of this array has the index into the * corresponding partition in partitions array. + * num_subplan_partition_offsets Length of 'subplan_partition_offsets' array * partition_tuple_slot TupleTableSlot to be used to manipulate any * given leaf partition's rowtype after that * partition is chosen for insertion by @@ -96,6 +97,7 @@ typedef struct PartitionTupleRouting TupleConversionMap **child_parent_tupconv_maps; bool *child_parent_map_not_required; int *subplan_partition_offsets; + int num_subplan_partition_offsets; TupleTableSlot *partition_tuple_slot; TupleTableSlot *root_tuple_slot; } PartitionTupleRouting; From 45402b03030f45de48fbca8b4ecc6f35c748f06c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 2 Feb 2018 09:23:42 -0500 Subject: [PATCH 227/578] Refactor code for partition bound searching Remove partition_bound_cmp() and partition_bound_bsearch(), whose void * argument could be, depending on the situation, of any of three different types: PartitionBoundSpec *, PartitionRangeBound *, Datum *. Instead, introduce separate bound-searching functions for each situation: partition_list_bsearch, partition_range_bsearch, partition_range_datum_bsearch, and partition_hash_bsearch. This requires duplicating the code for binary search, but it makes the code much more type safe, involves fewer branches at runtime, and at least in my opinion, is much easier to understand. 
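All of the new functions share one search shape: find the greatest bound that is less than or equal to the probe, or report -1 if every bound is greater. A generic sketch of that shape over a plain int array (the function name and element type are invented; the real functions compare partition bounds through the key's support functions) is:

    #include <stdbool.h>

    /*
     * Sketch of the common shape: 'vals' is sorted ascending with 'nvals'
     * distinct entries.  Return the index of the greatest entry <= 'probe',
     * or -1 if every entry is greater; *is_equal says whether it was equal.
     */
    static int
    greatest_le_bsearch(const int *vals, int nvals, int probe, bool *is_equal)
    {
        int     lo = -1;
        int     hi = nvals - 1;

        *is_equal = false;
        while (lo < hi)
        {
            int     mid = (lo + hi + 1) / 2;   /* round up so 'lo' always advances */

            if (vals[mid] <= probe)
            {
                lo = mid;
                *is_equal = (vals[mid] == probe);
                if (*is_equal)
                    break;
            }
            else
                hi = mid - 1;
        }
        return lo;
    }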
Along the way, add an option to partition_range_datum_bsearch allowing the number of keys to be specified, so that we can search for partitions based on a prefix of the full list of partition keys. This is important for pending work to improve partition pruning. Amit Langote, per a suggestion from me. Discussion: http://postgr.es/m/CA+TgmoaVLDLc8=YESRwD32gPhodU_ELmXyKs77gveiYp+JE4vQ@mail.gmail.com --- src/backend/catalog/partition.c | 243 +++++++++++++++++++++----------- 1 file changed, 159 insertions(+), 84 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 08b58d74..e185a7ee 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -170,14 +170,21 @@ static int32 partition_rbound_cmp(PartitionKey key, bool lower1, PartitionRangeBound *b2); static int32 partition_rbound_datum_cmp(PartitionKey key, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums); + Datum *tuple_datums, int n_tuple_datums); -static int32 partition_bound_cmp(PartitionKey key, +static int partition_list_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - int offset, void *probe, bool probe_is_bound); -static int partition_bound_bsearch(PartitionKey key, + Datum value, bool *is_equal); +static int partition_range_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - void *probe, bool probe_is_bound, bool *is_equal); + PartitionRangeBound *probe, bool *is_equal); +static int partition_range_datum_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal); +static int partition_hash_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int modulus, int remainder); + static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); @@ -1015,8 +1022,7 @@ check_new_partition_bound(char *relname, Relation parent, int greatest_modulus; int remainder; int offset; - bool equal, - valid_modulus = true; + bool valid_modulus = true; int prev_modulus, /* Previous largest modulus */ next_modulus; /* Next largest modulus */ @@ -1029,12 +1035,13 @@ check_new_partition_bound(char *relname, Relation parent, * modulus 10 and a partition with modulus 15, because 10 * is not a factor of 15. * - * Get greatest bound in array boundinfo->datums which is - * less than or equal to spec->modulus and - * spec->remainder. + * Get the greatest (modulus, remainder) pair contained in + * boundinfo->datums that is less than or equal to the + * (spec->modulus, spec->remainder) pair. */ - offset = partition_bound_bsearch(key, boundinfo, spec, - true, &equal); + offset = partition_hash_bsearch(key, boundinfo, + spec->modulus, + spec->remainder); if (offset < 0) { next_modulus = DatumGetInt32(datums[0][0]); @@ -1108,9 +1115,9 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - offset = partition_bound_bsearch(key, boundinfo, - &val->constvalue, - true, &equal); + offset = partition_list_bsearch(key, boundinfo, + val->constvalue, + &equal); if (offset >= 0 && equal) { overlap = true; @@ -1182,8 +1189,8 @@ check_new_partition_bound(char *relname, Relation parent, * since the index array is initialised with an extra -1 * at the end. 
*/ - offset = partition_bound_bsearch(key, boundinfo, lower, - true, &equal); + offset = partition_range_bsearch(key, boundinfo, lower, + &equal); if (boundinfo->indexes[offset + 1] < 0) { @@ -1196,10 +1203,16 @@ check_new_partition_bound(char *relname, Relation parent, if (offset + 1 < boundinfo->ndatums) { int32 cmpval; + Datum *datums; + PartitionRangeDatumKind *kind; + bool is_lower; + + datums = boundinfo->datums[offset + 1]; + kind = boundinfo->kind[offset + 1]; + is_lower = (boundinfo->indexes[offset + 1] == -1); - cmpval = partition_bound_cmp(key, boundinfo, - offset + 1, upper, - true); + cmpval = partition_rbound_cmp(key, datums, kind, + is_lower, upper); if (cmpval < 0) { /* @@ -2566,11 +2579,9 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { bool equal = false; - bound_offset = partition_bound_bsearch(key, + bound_offset = partition_list_bsearch(key, partdesc->boundinfo, - values, - false, - &equal); + values[0], &equal); if (bound_offset >= 0 && equal) part_index = partdesc->boundinfo->indexes[bound_offset]; } @@ -2598,12 +2609,11 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) if (!range_partkey_has_null) { - bound_offset = partition_bound_bsearch(key, + bound_offset = partition_range_datum_bsearch(key, partdesc->boundinfo, + key->partnatts, values, - false, &equal); - /* * The bound at bound_offset is less than or equal to the * tuple value, so the bound at offset+1 is the upper @@ -2874,12 +2884,12 @@ partition_rbound_cmp(PartitionKey key, static int32 partition_rbound_datum_cmp(PartitionKey key, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums) + Datum *tuple_datums, int n_tuple_datums) { int i; int32 cmpval = -1; - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < n_tuple_datums; i++) { if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE) return -1; @@ -2898,84 +2908,104 @@ partition_rbound_datum_cmp(PartitionKey key, } /* - * partition_bound_cmp + * partition_list_bsearch + * Returns the index of the greatest bound datum that is less than equal + * to the given value or -1 if all of the bound datums are greater * - * Return whether the bound at offset in boundinfo is <, =, or > the argument - * specified in *probe. + * *is_equal is set to true if the bound datum at the returned index is equal + * to the input value. 
*/ -static int32 -partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo, - int offset, void *probe, bool probe_is_bound) +static int +partition_list_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + Datum value, bool *is_equal) { - Datum *bound_datums = boundinfo->datums[offset]; - int32 cmpval = -1; + int lo, + hi, + mid; - switch (key->strategy) - { - case PARTITION_STRATEGY_HASH: + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) { - PartitionBoundSpec *spec = (PartitionBoundSpec *) probe; + int32 cmpval; - cmpval = partition_hbound_cmp(DatumGetInt32(bound_datums[0]), - DatumGetInt32(bound_datums[1]), - spec->modulus, spec->remainder); - break; - } - case PARTITION_STRATEGY_LIST: + mid = (lo + hi + 1) / 2; cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], key->partcollation[0], - bound_datums[0], - *(Datum *) probe)); + boundinfo->datums[mid][0], + value)); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + if (*is_equal) break; + } + else + hi = mid - 1; + } - case PARTITION_STRATEGY_RANGE: - { - PartitionRangeDatumKind *kind = boundinfo->kind[offset]; + return lo; +} - if (probe_is_bound) - { /* - * We need to pass whether the existing bound is a lower - * bound, so that two equal-valued lower and upper bounds - * are not regarded equal. + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given range bound or -1 if all of the range bounds are + * greater + * + * *is_equal is set to true if the range bound at the returned index is equal + * to the input range bound */ - bool lower = boundinfo->indexes[offset] < 0; +static int +partition_range_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + mid = (lo + hi + 1) / 2; cmpval = partition_rbound_cmp(key, - bound_datums, kind, lower, - (PartitionRangeBound *) probe); - } - else - cmpval = partition_rbound_datum_cmp(key, - bound_datums, kind, - (Datum *) probe); + boundinfo->datums[mid], + boundinfo->kind[mid], + (boundinfo->indexes[mid] == -1), + probe); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + + if (*is_equal) break; } - - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) key->strategy); + else + hi = mid - 1; } - return cmpval; + return lo; } /* - * Binary search on a collection of partition bounds. Returns greatest - * bound in array boundinfo->datums which is less than or equal to *probe. - * If all bounds in the array are greater than *probe, -1 is returned. + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given tuple or -1 if all of the range bounds are greater * - * *probe could either be a partition bound or a Datum array representing - * the partition key of a tuple being routed; probe_is_bound tells which. - * We pass that down to the comparison function so that it can interpret the - * contents of *probe accordingly. - * - * *is_equal is set to whether the bound at the returned index is equal with - * *probe. + * *is_equal is set to true if the range bound at the returned index is equal + * to the input tuple. 
*/ static int -partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - void *probe, bool probe_is_bound, bool *is_equal) +partition_range_datum_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal) { int lo, hi, @@ -2988,8 +3018,11 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_bound_cmp(key, boundinfo, mid, probe, - probe_is_bound); + cmpval = partition_rbound_datum_cmp(key, + boundinfo->datums[mid], + boundinfo->kind[mid], + values, + nvalues); if (cmpval <= 0) { lo = mid; @@ -3005,6 +3038,48 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, return lo; } +/* + * partition_hash_bsearch + * Returns the index of the greatest (modulus, remainder) pair that is + * less than or equal to the given (modulus, remainder) pair or -1 if + * all of them are greater + */ +static int +partition_hash_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int modulus, int remainder) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval, + bound_modulus, + bound_remainder; + + mid = (lo + hi + 1) / 2; + bound_modulus = DatumGetInt32(boundinfo->datums[mid][0]); + bound_remainder = DatumGetInt32(boundinfo->datums[mid][1]); + cmpval = partition_hbound_cmp(bound_modulus, bound_remainder, + modulus, remainder); + if (cmpval <= 0) + { + lo = mid; + + if (cmpval == 0) + break; + } + else + hi = mid - 1; + } + + return lo; +} + /* * get_default_oid_from_partdesc * From dfe5a49c10508ff1d203c8008703b20f7f1465db Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 5 Feb 2018 10:37:30 -0500 Subject: [PATCH 228/578] Fix RelationBuildPartitionKey's processing of partition key expressions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failure to advance the list pointer while reading partition expressions from a list results in invoking an input function with inappropriate data, possibly leading to crashes or, with carefully crafted input, disclosure of arbitrary backend memory. Bug discovered independently by Álvaro Herrera and David Rowley. This patch is by Álvaro but owes something to David's proposed fix. Back-patch to v10 where the issue was introduced. 
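The underlying bug class -- walking two parallel sequences while advancing only one cursor -- is easy to show in isolation. A stand-alone sketch of the corrected loop (plain C with invented names; not the relcache code itself) is:

    #include <stdio.h>
    #include <stdlib.h>

    struct expr_cell { int value; struct expr_cell *next; };

    /*
     * Sketch: each slot that needs an expression consumes the next cell of a
     * side list and then advances the cursor.  Forgetting the advance -- the
     * bug this patch fixes -- would feed the same cell to every slot; running
     * out of cells is reported rather than read past.
     */
    static void
    fill_slots_from_exprs(int *slots, int nslots, struct expr_cell *exprs)
    {
        struct expr_cell *item = exprs;
        int         i;

        for (i = 0; i < nslots; i++)
        {
            if (item == NULL)
            {
                fprintf(stderr, "wrong number of expressions\n");
                exit(EXIT_FAILURE);
            }
            slots[i] = item->value;
            item = item->next;      /* advance in step with the slot index */
        }
    }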
Security: CVE-2018-1052 --- src/backend/utils/cache/relcache.c | 5 +++++ src/test/regress/expected/create_table.out | 23 ++++++++++++++++++---- src/test/regress/sql/create_table.sql | 9 +++++++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f6acc9f0..f9520010 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1115,9 +1115,14 @@ RelationBuildPartitionKey(Relation relation) } else { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + key->parttypid[i] = exprType(lfirst(partexprs_item)); key->parttypmod[i] = exprTypmod(lfirst(partexprs_item)); key->parttypcoll[i] = exprCollation(lfirst(partexprs_item)); + + partexprs_item = lnext(partexprs_item); } get_typlenbyvalalign(key->parttypid[i], &key->parttyplen[i], diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 86c347be..7fa55adb 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -417,8 +417,9 @@ DETAIL: table partitioned depends on function plusone(integer) HINT: Use DROP ... CASCADE to drop the dependent objects too. -- partitioned table cannot participate in regular inheritance CREATE TABLE partitioned2 ( - a int -) PARTITION BY LIST ((a+1)); + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); CREATE TABLE fail () INHERITS (partitioned2); ERROR: cannot inherit from partitioned table "partitioned2" -- Partition key in describe output @@ -436,11 +437,25 @@ Number of partitions: 0 \d+ partitioned2 Table "public.partitioned2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+---------+--------------+------------- +--------+---------+-----------+----------+---------+----------+--------------+------------- a | integer | | | | plain | | -Partition key: LIST (((a + 1))) + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). 
+CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + DROP TABLE partitioned, partitioned2; -- -- Partitions diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 43ada6b3..b125fa50 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -417,14 +417,19 @@ DROP FUNCTION plusone(int); -- partitioned table cannot participate in regular inheritance CREATE TABLE partitioned2 ( - a int -) PARTITION BY LIST ((a+1)); + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); CREATE TABLE fail () INHERITS (partitioned2); -- Partition key in describe output \d partitioned \d+ partitioned2 +INSERT INTO partitioned2 VALUES (1, 'hello'); +CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + DROP TABLE partitioned, partitioned2; -- From 98279f99acf04227f628476f31c4a045bb3fb641 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 19:53:18 +0800 Subject: [PATCH 229/578] Fix possible crash in partition-wise join. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 18 ++------ src/backend/optimizer/path/joinrels.c | 16 -------- src/backend/optimizer/util/relnode.c | 5 ++- src/include/nodes/relation.h | 14 ++++--- src/test/regress/expected/partition_join.out | 43 ++++++++++++-------- src/test/regress/sql/partition_join.sql | 2 +- 6 files changed, 43 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 9020a606..947c75f3 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3399,20 +3399,8 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) if (!IS_JOIN_REL(rel)) return; - /* - * If we've already proven this join is empty, we needn't consider any - * more paths for it. - */ - if (IS_DUMMY_REL(rel)) - return; - - /* - * Nothing to do if the relation is not partitioned. An outer join - * relation which had empty inner relation in every pair will have rest of - * the partitioning properties set except the child-join RelOptInfos. See - * try_partition_wise_join() for more explanation. - */ - if (rel->nparts <= 0 || rel->part_rels == NULL) + /* We've nothing to do if the relation is not partitioned. */ + if (!IS_PARTITIONED_REL(rel)) return; /* Guard against stack overflow due to overly deep partition hierarchy. */ @@ -3426,6 +3414,8 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) { RelOptInfo *child_rel = part_rels[cnt_parts]; + Assert(child_rel != NULL); + /* Add partition-wise join paths for partitioned child-joins. 
*/ generate_partition_wise_join_paths(root, child_rel); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d6fad96c..d8afa3ef 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1340,17 +1340,6 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, if (!IS_PARTITIONED_REL(joinrel)) return; - /* - * set_rel_pathlist() may not create paths in children of an empty - * partitioned table and so we can not add paths to child-joins. So, deem - * such a join as unpartitioned. When a partitioned relation is deemed - * empty because all its children are empty, dummy path will be set in - * each of the children. In such a case we could still consider the join - * as partitioned, but it might not help much. - */ - if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2)) - return; - /* * Since this join relation is partitioned, all the base relations * participating in this join must be partitioned and so are all the @@ -1382,11 +1371,6 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, nparts = joinrel->nparts; - /* Allocate space to hold child-joins RelOptInfos, if not already done. */ - if (!joinrel->part_rels) - joinrel->part_rels = - (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); - /* * Create child-join relations for this partitioned join, if those don't * exist. Add paths to child-joins for a pair of child relations diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 0896b4c2..70acf299 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -1721,11 +1721,14 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, */ joinrel->part_scheme = part_scheme; joinrel->boundinfo = outer_rel->boundinfo; - joinrel->nparts = outer_rel->nparts; partnatts = joinrel->part_scheme->partnatts; joinrel->partexprs = (List **) palloc0(sizeof(List *) * partnatts); joinrel->nullable_partexprs = (List **) palloc0(sizeof(List *) *partnatts); + joinrel->nparts = outer_rel->nparts; + joinrel->part_rels = + (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * joinrel->nparts); + /* * Construct partition keys for the join. diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index e2af7ebc..ee843fad 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -791,13 +791,17 @@ typedef struct RelOptInfo /* * Is given relation partitioned? * - * A join between two partitioned relations with same partitioning scheme - * without any matching partitions will not have any partition in it but will - * have partition scheme set. So a relation is deemed to be partitioned if it - * has a partitioning scheme, bounds and positive number of partitions. + * It's not enough to test whether rel->part_scheme is set, because it might + * be that the basic partitioning properties of the input relations matched + * but the partition bounds did not. + * + * We treat dummy relations as unpartitioned. We could alternatively + * treat them as partitioned, but it's not clear whether that's a useful thing + * to do. 
*/ #define IS_PARTITIONED_REL(rel) \ - ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0) + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \ + (rel)->part_rels && !(IS_DUMMY_REL(rel))) /* * Convenience macro to make sure that a partitioned relation has all the diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 1c8cdb34..4e1cfedd 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -1217,24 +1217,31 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 (2 rows) EXPLAIN (COSTS OFF) -SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------- - Sort - Sort Key: a, t2.b - -> Hash Left Join - Hash Cond: (t2.b = a) - -> Append - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) - -> Hash - -> Result - One-Time Filter: false -(14 rows) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +-------------------------------------------------- + Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Hash Join + Hash Cond: (t3.a = t2.b) + -> Seq Scan on prt1_p1 t3 + -> Hash + -> Seq Scan on prt2_p1 t2 + -> Hash Join + Hash Cond: (t3_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t3_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + -> Hash Join + Hash Cond: (t3_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false +(21 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 2316bbdc..4aa775e7 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -224,7 +224,7 @@ EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; EXPLAIN (COSTS OFF) -SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; From 340ca8d212c4c5f70e5c17bad46f4ced485dae32 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:30:42 +0800 Subject: [PATCH 230/578] Be lazier about partition tuple routing. 
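The idea of the change, sketched with invented names: setup only records the leaf partition OIDs and leaves an array of NULL pointers, and the per-partition executor state is built the first time a tuple is actually routed to that partition. The snippet below illustrates the pattern only and is not the TBase code.

    #include <stdlib.h>

    typedef struct PartStateSketch
    {
        int     partidx;
        /* ...stands in for the per-partition ResultRelInfo and maps... */
    } PartStateSketch;

    /*
     * Sketch: return the state for partition 'partidx', building it on first
     * use.  'parts' was allocated as an array of NULL pointers at setup time.
     */
    static PartStateSketch *
    get_partition_state(PartStateSketch **parts, int partidx)
    {
        if (parts[partidx] == NULL)
        {
            parts[partidx] = calloc(1, sizeof(PartStateSketch));
            if (parts[partidx] == NULL)
                abort();
            parts[partidx]->partidx = partidx;
            /* ...open the partition, build tuple-conversion maps, etc. ... */
        }
        return parts[partidx];
    }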
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 10 +- src/backend/executor/execPartition.c | 367 +++++++++++++++++++-------- src/include/executor/execPartition.h | 9 +- 3 files changed, 275 insertions(+), 111 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cf770f46..8ef1f6cd 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1635,7 +1635,7 @@ BeginCopy(ParseState *pstate, PartitionTupleRouting *proute; proute = cstate->partition_tuple_routing = - ExecSetupPartitionTupleRouting(NULL, cstate->rel, 1, estate); + ExecSetupPartitionTupleRouting(NULL, cstate->rel); /* * If we are capturing transition tuples, they may need to be @@ -3462,6 +3462,14 @@ CopyFrom(CopyState cstate) */ saved_resultRelInfo = resultRelInfo; resultRelInfo = proute->partitions[leaf_part_index]; + if (resultRelInfo == NULL) + { + resultRelInfo = ExecInitPartitionInfo(NULL, + saved_resultRelInfo, + proute, estate, + leaf_part_index); + Assert(resultRelInfo != NULL); + } /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index e312167f..ad72c3cf 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -44,21 +44,25 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. + * + * While we allocate the arrays of pointers of ResultRelInfo and + * TupleConversionMap for all partitions here, actual objects themselves are + * lazily allocated for a given partition if a tuple is actually routed to it; + * see ExecInitPartitionInfo. However, if the function is invoked for update + * tuple routing, caller would already have initialized ResultRelInfo's for + * some of the partitions, which are reused and assigned to their respective + * slot in the aforementioned array. */ PartitionTupleRouting * -ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, Index resultRTindex, - EState *estate) +ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_arr = NULL, - *update_rri = NULL; + ResultRelInfo *update_rri = NULL; int num_update_rri = 0, update_rri_index = 0; - bool is_update = false; PartitionTupleRouting *proute; /* @@ -76,13 +80,14 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, proute->parent_child_tupconv_maps = (TupleConversionMap **) palloc0(proute->num_partitions * sizeof(TupleConversionMap *)); + proute->partition_oids = (Oid *) palloc(proute->num_partitions * + sizeof(Oid)); /* Set up details specific to the type of tuple routing we are doing. */ if (mtstate && mtstate->operation == CMD_UPDATE) { ModifyTable *node = (ModifyTable *) mtstate->ps.plan; - is_update = true; update_rri = mtstate->resultRelInfo; num_update_rri = list_length(node->plans); proute->subplan_partition_offsets = @@ -95,16 +100,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, */ proute->root_tuple_slot = MakeTupleTableSlot(); } - else - { - /* - * Since we are inserting tuples, we need to create all new result - * rels. Avoid repeated pallocs by allocating memory for all the - * result rels in bulk. 
- */ - leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions * - sizeof(ResultRelInfo)); - } /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -117,101 +112,58 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, i = 0; foreach(cell, leaf_parts) { - ResultRelInfo *leaf_part_rri; - Relation partrel = NULL; - TupleDesc part_tupdesc; + ResultRelInfo *leaf_part_rri = NULL; Oid leaf_oid = lfirst_oid(cell); - if (is_update) - { - /* - * If the leaf partition is already present in the per-subplan - * result rels, we re-use that rather than initialize a new result - * rel. The per-subplan resultrels and the resultrels of the leaf - * partitions are both in the same canonical order. So while going - * through the leaf partition oids, we need to keep track of the - * next per-subplan result rel to be looked for in the leaf - * partition resultrels. - */ - if (update_rri_index < num_update_rri && - RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) - { - leaf_part_rri = &update_rri[update_rri_index]; - partrel = leaf_part_rri->ri_RelationDesc; - - /* - * This is required in order to we convert the partition's - * tuple to be compatible with the root partitioned table's - * tuple descriptor. When generating the per-subplan result - * rels, this was not set. - */ - leaf_part_rri->ri_PartitionRoot = rel; - - /* Remember the subplan offset for this ResultRelInfo */ - proute->subplan_partition_offsets[update_rri_index] = i; - - update_rri_index++; - } - else - leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); - } - else - { - /* For INSERTs, we already have an array of result rels allocated */ - leaf_part_rri = &leaf_part_arr[i]; - } + proute->partition_oids[i] = leaf_oid; /* - * If we didn't open the partition rel, it means we haven't - * initialized the result rel either. + * If the leaf partition is already present in the per-subplan result + * rels, we re-use that rather than initialize a new result rel. The + * per-subplan resultrels and the resultrels of the leaf partitions + * are both in the same canonical order. So while going through the + * leaf partition oids, we need to keep track of the next per-subplan + * result rel to be looked for in the leaf partition resultrels. */ - if (!partrel) + if (update_rri_index < num_update_rri && + RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) { - /* - * We locked all the partitions above including the leaf - * partitions. Note that each of the newly opened relations in - * proute->partitions are eventually closed by the caller. - */ - partrel = heap_open(leaf_oid, NoLock); - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - } - - part_tupdesc = RelationGetDescr(partrel); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. - */ - proute->parent_child_tupconv_maps[i] = - convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - /* - * Verify result relation is a valid target for an INSERT. An UPDATE - * of a partition-key becomes a DELETE+INSERT operation, so this check - * is still required when the operation is CMD_UPDATE. - */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Open partition indices. The user may have asked to check for - * conflicts within this leaf partition and do "nothing" instead of - * throwing an error. 
Be prepared in that case by initializing the - * index information needed by ExecInsert() to perform speculative - * insertions. - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, - mtstate != NULL && - mtstate->mt_onconflict != ONCONFLICT_NONE); - - estate->es_leaf_result_relations = - lappend(estate->es_leaf_result_relations, leaf_part_rri); + Relation partrel; + TupleDesc part_tupdesc; + + leaf_part_rri = &update_rri[update_rri_index]; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required in order to convert the partition's tuple to + * be compatible with the root partitioned table's tuple + * descriptor. When generating the per-subplan result rels, this + * was not set. + */ + leaf_part_rri->ri_PartitionRoot = rel; + + /* Remember the subplan offset for this ResultRelInfo */ + proute->subplan_partition_offsets[update_rri_index] = i; + + update_rri_index++; + + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + proute->parent_child_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + /* + * Verify result relation is a valid target for an INSERT. An + * UPDATE of a partition-key becomes a DELETE+INSERT operation, so + * this check is required even when the operation is CMD_UPDATE. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + } proute->partitions[i] = leaf_part_rri; i++; @@ -219,9 +171,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, /* * For UPDATE, we should have found all the per-subplan resultrels in the - * leaf partitions. + * leaf partitions. (If this is an INSERT, both values will be zero.) */ - Assert(!is_update || update_rri_index == num_update_rri); + Assert(update_rri_index == num_update_rri); return proute; } @@ -345,6 +297,201 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecInitPartitionInfo + * Initialize ResultRelInfo and other information for a partition if not + * already done + * + * Returns the ResultRelInfo + */ +ResultRelInfo * +ExecInitPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, + EState *estate, int partidx) +{ + Relation rootrel = resultRelInfo->ri_RelationDesc, + partrel; + ResultRelInfo *leaf_part_rri; + ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; + MemoryContext oldContext; + + /* + * We locked all the partitions in ExecSetupPartitionTupleRouting + * including the leaf partitions. + */ + partrel = heap_open(proute->partition_oids[partidx], NoLock); + + /* + * Keep ResultRelInfo and other information for this partition in the + * per-query memory context so they'll survive throughout the query. + */ + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + InitResultRelInfo(leaf_part_rri, + partrel, + node ? node->nominalRelation : 1, + rootrel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for an INSERT. An UPDATE of a + * partition-key becomes a DELETE+INSERT operation, so this check is still + * required when the operation is CMD_UPDATE. 
+ */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Since we've just initialized this ResultRelInfo, it's not in any list + * attached to the estate as yet. Add it, so that it can be found later. + * + * Note that the entries in this list appear in no predetermined order, + * because partition result rels are initialized as and when they're + * needed. + */ + estate->es_tuple_routing_result_relations = + lappend(estate->es_tuple_routing_result_relations, + leaf_part_rri); + + /* + * Open partition indices. The user may have asked to check for conflicts + * within this leaf partition and do "nothing" instead of throwing an + * error. Be prepared in that case by initializing the index information + * needed by ExecInsert() to perform speculative insertions. + */ + if (partrel->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, + (mtstate != NULL && + mtstate->mt_onconflict != ONCONFLICT_NONE)); + + /* + * Build WITH CHECK OPTION constraints for the partition. Note that we + * didn't build the withCheckOptionList for partitions within the planner, + * but simple translation of varattnos will suffice. This only occurs for + * the INSERT case or in the case of UPDATE tuple routing where we didn't + * find a result rel to reuse in ExecSetupPartitionTupleRouting(). + */ + if (node && node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + ListCell *ll; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + + /* + * In the case of INSERT on a partitioned table, there is only one + * plan. Likewise, there is only one WCO list, not one per partition. + * For UPDATE, there are as many WCO lists as there are plans. + */ + Assert((node->operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + list_length(node->plans) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->withCheckOptionLists) == + list_length(node->plans))); + + /* + * Use the WCO list of the first plan as a reference to calculate + * attno's for the WCO list of this partition. In the INSERT case, + * that refers to the root partitioned table, whereas in the UPDATE + * tuple routing case, that refers to the first partition in the + * mtstate->resultRelInfo array. In any case, both that relation and + * this partition should have the same columns, so we should be able + * to map attributes successfully. + */ + wcoList = linitial(node->withCheckOptionLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + wcoList = map_partition_varattnos(wcoList, firstVarno, + partrel, firstResultRel, NULL); + foreach(ll, wcoList) + { + WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + mtstate->mt_plans[0]); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + leaf_part_rri->ri_WithCheckOptions = wcoList; + leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs; + } + + /* + * Build the RETURNING projection for the partition. Note that we didn't + * build the returningList for partitions within the planner, but simple + * translation of varattnos will suffice. This only occurs for the INSERT + * case or in the case of UPDATE tuple routing where we didn't find a + * result rel to reuse in ExecSetupPartitionTupleRouting(). 
+ */ + if (node && node->returningLists != NIL) + { + TupleTableSlot *slot; + ExprContext *econtext; + List *returningList; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + + /* See the comment above for WCO lists. */ + Assert((node->operation == CMD_INSERT && + list_length(node->returningLists) == 1 && + list_length(node->plans) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->returningLists) == + list_length(node->plans))); + + /* + * Use the RETURNING list of the first plan as a reference to + * calculate attno's for the RETURNING list of this partition. See + * the comment above for WCO lists for more details on why this is + * okay. + */ + returningList = linitial(node->returningLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + returningList = map_partition_varattnos(returningList, firstVarno, + partrel, firstResultRel, + NULL); + + /* + * Initialize the projection itself. + * + * Use the slot and the expression context that would have been set up + * in ExecInitModifyTable() for projection's output. + */ + Assert(mtstate->ps.ps_ResultTupleSlot != NULL); + slot = mtstate->ps.ps_ResultTupleSlot; + Assert(mtstate->ps.ps_ExprContext != NULL); + econtext = mtstate->ps.ps_ExprContext; + leaf_part_rri->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, slot, + &mtstate->ps, RelationGetDescr(partrel)); + } + + Assert(proute->partitions[partidx] == NULL); + proute->partitions[partidx] = leaf_part_rri; + + /* + * Save a tuple conversion map to convert a tuple routed to this partition + * from the parent's type to the partition's. + */ + proute->parent_child_tupconv_maps[partidx] = + convert_tuples_by_name(RelationGetDescr(rootrel), + RelationGetDescr(partrel), + gettext_noop("could not convert row type")); + + MemoryContextSwitchTo(oldContext); + + return leaf_part_rri; +} + /* * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition * child-to-root tuple conversion map array. @@ -471,6 +618,10 @@ ExecCleanupTupleRouting(PartitionTupleRouting *proute) { ResultRelInfo *resultRelInfo = proute->partitions[i]; + /* skip further processsing for uninitialized partitions */ + if (resultRelInfo == NULL) + continue; + /* * If this result rel is one of the UPDATE subplan result rels, let * ExecEndPlan() close it. For INSERT or COPY, diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 4e0bdc35..40a67ea3 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -58,6 +58,7 @@ typedef struct PartitionDispatchData *PartitionDispatch; * partition tree. * num_dispatch number of partitioned tables in the partition * tree (= length of partition_dispatch_info[]) + * partition_oids Array of leaf partitions OIDs * partitions Array of ResultRelInfo* objects with one entry * for every leaf partition in the partition tree. 
* num_partitions Number of leaf partitions in the partition tree @@ -91,6 +92,7 @@ typedef struct PartitionTupleRouting { PartitionDispatch *partition_dispatch_info; int num_dispatch; + Oid *partition_oids; ResultRelInfo **partitions; int num_partitions; TupleConversionMap **parent_child_tupconv_maps; @@ -103,12 +105,15 @@ typedef struct PartitionTupleRouting } PartitionTupleRouting; extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, Index resultRTindex, - EState *estate); + Relation rel); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, + EState *estate, int partidx); extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, ResultRelInfo *rootRelInfo, int leaf_index); From 77056eb0dd53aed29fa1a63459432a77c7bc4841 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:31:38 +0800 Subject: [PATCH 231/578] Be lazier about partition tuple routing. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/nodeModifyTable.c | 134 +++---------------------- 1 file changed, 12 insertions(+), 122 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 003ff4b8..8d8b816d 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -345,10 +345,18 @@ ExecInsert(ModifyTableState *mtstate, /* * Save the old ResultRelInfo and switch to the one corresponding to - * the selected partition. + * the selected partition. (We might need to initialize it first.) */ saved_resultRelInfo = resultRelInfo; resultRelInfo = proute->partitions[leaf_part_index]; + if (resultRelInfo == NULL) + { + resultRelInfo = ExecInitPartitionInfo(mtstate, + saved_resultRelInfo, + proute, estate, + leaf_part_index); + Assert(resultRelInfo != NULL); + } /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -2751,14 +2759,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; - int firstVarno = 0; - Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; bool update_tuple_routing_needed = node->partColsUpdated; - PartitionTupleRouting *proute = NULL; - int num_partitions = 0; + #ifdef __TBASE__ bool remote_dml = false; #endif @@ -2995,20 +3000,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) */ if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && (operation == CMD_INSERT || update_tuple_routing_needed)) - { - proute = mtstate->mt_partition_tuple_routing = - ExecSetupPartitionTupleRouting(mtstate, - rel, node->nominalRelation, - estate); - num_partitions = proute->num_partitions; - - /* - * Below are required as reference objects for mapping partition - * attno's in expressions such as WithCheckOptions and RETURNING. 
- */ - firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; - firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; - } + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(mtstate, rel); /* Build state for collecting transition tuples */ ExecSetupTransitionCaptureState(mtstate, estate); @@ -3071,70 +3064,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) } #endif - /* - * Build WITH CHECK OPTION constraints for each leaf partition rel. Note - * that we didn't build the withCheckOptionList for each partition within - * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case or for UPDATE row - * movement. DELETEs and local UPDATEs are handled above. - */ - if (node->withCheckOptionLists != NIL && num_partitions > 0) - { - List *first_wcoList; - - /* - * In case of INSERT on partitioned tables, there is only one plan. - * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. Whereas for UPDATE, there are as many WCOs as there are - * plans. So in either case, use the WCO expression of the first - * resultRelInfo as a reference to calculate attno's for the WCO - * expression of each of the partitions. We make a copy of the WCO - * qual for each partition. Note that, if there are SubPlans in there, - * they all end up attached to the one parent Plan node. - */ - Assert(update_tuple_routing_needed || - (operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1)); - - first_wcoList = linitial(node->withCheckOptionLists); - for (i = 0; i < num_partitions; i++) - { - Relation partrel; - List *mapped_wcoList; - List *wcoExprs = NIL; - ListCell *ll; - - resultRelInfo = proute->partitions[i]; - - /* - * If we are referring to a resultRelInfo from one of the update - * result rels, that result rel would already have - * WithCheckOptions initialized. - */ - if (resultRelInfo->ri_WithCheckOptions) - continue; - - partrel = resultRelInfo->ri_RelationDesc; - - mapped_wcoList = map_partition_varattnos(first_wcoList, - firstVarno, - partrel, firstResultRel, - NULL); - foreach(ll, mapped_wcoList) - { - WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); - ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - &mtstate->ps); - - wcoExprs = lappend(wcoExprs, wcoExpr); - } - - resultRelInfo->ri_WithCheckOptions = mapped_wcoList; - resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - } - } - /* * Initialize RETURNING projections if needed. */ @@ -3142,7 +3071,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -3185,44 +3113,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo++; } - - /* - * Build a projection for each leaf partition rel. Note that we - * didn't build the returningList for each partition within the - * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case or for UPDATE - * row movement. DELETEs and local UPDATEs are handled above. 
- */ - firstReturningList = linitial(node->returningLists); - for (i = 0; i < num_partitions; i++) - { - Relation partrel; - List *rlist; - - resultRelInfo = proute->partitions[i]; - - /* - * If we are referring to a resultRelInfo from one of the update - * result rels, that result rel would already have a returningList - * built. - */ - if (resultRelInfo->ri_projectReturning) - continue; - - partrel = resultRelInfo->ri_RelationDesc; - - /* - * Use the returning expression of the first resultRelInfo as a - * reference to calculate attno's for the returning expression of - * each of the partitions. - */ - rlist = map_partition_varattnos(firstReturningList, - firstVarno, - partrel, firstResultRel, NULL); - resultRelInfo->ri_projectReturning = - ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, - resultRelInfo->ri_RelationDesc->rd_att); - } } else { From 4485fe66fd7dcfd7f4b2ccaedd92bc279d090fdc Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:37:39 +0800 Subject: [PATCH 232/578] Revise API for partition_rbound_cmp/partition_rbound_datum_cmp. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 54 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index e185a7ee..01715488 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -165,10 +165,12 @@ static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, List *datums, bool lower); static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2); -static int32 partition_rbound_cmp(PartitionKey key, - Datum *datums1, PartitionRangeDatumKind *kind1, - bool lower1, PartitionRangeBound *b2); -static int32 partition_rbound_datum_cmp(PartitionKey key, +static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, Datum *datums1, + PartitionRangeDatumKind *kind1, bool lower1, + PartitionRangeBound *b2); +static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, + Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums); @@ -1150,8 +1152,9 @@ check_new_partition_bound(char *relname, Relation parent, * First check if the resulting range would be empty with * specified lower and upper bounds */ - if (partition_rbound_cmp(key, lower->datums, lower->kind, true, - upper) >= 0) + if (partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, lower->datums, + lower->kind, true, upper) >= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -1211,7 +1214,10 @@ check_new_partition_bound(char *relname, Relation parent, kind = boundinfo->kind[offset + 1]; is_lower = (boundinfo->indexes[offset + 1] == -1); - cmpval = partition_rbound_cmp(key, datums, kind, + cmpval = partition_rbound_cmp(key->partnatts, + key->partsupfunc, + key->partcollation, + datums, kind, is_lower, upper); if (cmpval < 0) { @@ -2614,6 +2620,7 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) key->partnatts, values, &equal); + /* * The bound at bound_offset is less than or equal to the * tuple value, so the bound at offset+1 is the upper @@ -2807,7 +2814,9 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b); PartitionKey key = (PartitionKey) arg; - return partition_rbound_cmp(key, 
b1->datums, b1->kind, b1->lower, b2); + return partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, b1->datums, b1->kind, + b1->lower, b2); } /* @@ -2816,6 +2825,10 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) * Return for two range bounds whether the 1st one (specified in datum1, * kind1, and lower1) is <, =, or > the bound specified in *b2. * + * partnatts, partsupfunc and partcollation give the number of attributes in the + * bounds to be compared, comparison function to be used and the collations of + * attributes, respectively. + * * Note that if the values of the two range bounds compare equal, then we take * into account whether they are upper or lower bounds, and an upper bound is * considered to be smaller than a lower bound. This is important to the way @@ -2824,7 +2837,7 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) * two contiguous partitions. */ static int32 -partition_rbound_cmp(PartitionKey key, +partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2) {// #lizard forgives @@ -2834,7 +2847,7 @@ partition_rbound_cmp(PartitionKey key, PartitionRangeDatumKind *kind2 = b2->kind; bool lower2 = b2->lower; - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < partnatts; i++) { /* * First, handle cases where the column is unbounded, which should not @@ -2855,8 +2868,8 @@ partition_rbound_cmp(PartitionKey key, */ break; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], - key->partcollation[i], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], datums1[i], datums2[i])); if (cmpval != 0) @@ -2880,9 +2893,14 @@ partition_rbound_cmp(PartitionKey key, * * Return whether range bound (specified in rb_datums, rb_kind, and rb_lower) * is <, =, or > partition key of tuple (tuple_datums) + * + * n_tuple_datums, partsupfunc and partcollation give number of attributes in + * the bounds to be compared, comparison function to be used and the collations + * of attributes resp. 
+ * */ static int32 -partition_rbound_datum_cmp(PartitionKey key, +partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums) { @@ -2896,8 +2914,8 @@ partition_rbound_datum_cmp(PartitionKey key, else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE) return 1; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], - key->partcollation[i], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], rb_datums[i], tuple_datums[i])); if (cmpval != 0) @@ -2974,7 +2992,8 @@ partition_range_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_cmp(key, + cmpval = partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, boundinfo->datums[mid], boundinfo->kind[mid], (boundinfo->indexes[mid] == -1), @@ -3018,7 +3037,8 @@ partition_range_datum_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_datum_cmp(key, + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, boundinfo->datums[mid], boundinfo->kind[mid], values, From ba7473f1c0f0e6e4aa5f460c9268caa3857e362a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:43:47 +0800 Subject: [PATCH 233/578] Revise API for partition bound search functions.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 66 +++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 01715488..bf765697 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -174,22 +174,24 @@ static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums); -static int partition_list_bsearch(PartitionKey key, +static int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal); -static int partition_range_bsearch(PartitionKey key, +static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, PartitionRangeBound *probe, bool *is_equal); -static int partition_range_datum_bsearch(PartitionKey key, +static int partition_range_datum_bsearch(FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal); -static int partition_hash_bsearch(PartitionKey key, - PartitionBoundInfo boundinfo, +static int partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder); static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); -static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); +static uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull); /* SQL-callable function for use in hash partition CHECK constraints */ PG_FUNCTION_INFO_V1(satisfies_hash_partition); @@ -1041,7 +1043,7 @@ check_new_partition_bound(char *relname, Relation parent, * boundinfo->datums that is less than or equal to the * (spec->modulus, spec->remainder) pair. 
*/ - offset = partition_hash_bsearch(key, boundinfo, + offset = partition_hash_bsearch(boundinfo, spec->modulus, spec->remainder); if (offset < 0) @@ -1117,7 +1119,9 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - offset = partition_list_bsearch(key, boundinfo, + offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, val->constvalue, &equal); if (offset >= 0 && equal) @@ -1192,7 +1196,10 @@ check_new_partition_bound(char *relname, Relation parent, * since the index array is initialised with an extra -1 * at the end. */ - offset = partition_range_bsearch(key, boundinfo, lower, + offset = partition_range_bsearch(key->partnatts, + key->partsupfunc, + key->partcollation, + boundinfo, lower, &equal); if (boundinfo->indexes[offset + 1] < 0) @@ -2569,7 +2576,9 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { PartitionBoundInfo boundinfo = partdesc->boundinfo; int greatest_modulus = get_greatest_modulus(boundinfo); - uint64 rowHash = compute_hash_value(key, values, isnull); + uint64 rowHash = compute_hash_value(key->partnatts, + key->partsupfunc, + values, isnull); part_index = boundinfo->indexes[rowHash % greatest_modulus]; } @@ -2585,7 +2594,8 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { bool equal = false; - bound_offset = partition_list_bsearch(key, + bound_offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, partdesc->boundinfo, values[0], &equal); if (bound_offset >= 0 && equal) @@ -2615,7 +2625,8 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) if (!range_partkey_has_null) { - bound_offset = partition_range_datum_bsearch(key, + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, partdesc->boundinfo, key->partnatts, values, @@ -2934,7 +2945,7 @@ partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, * to the input value. */ static int -partition_list_bsearch(PartitionKey key, +partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal) { @@ -2949,8 +2960,8 @@ partition_list_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], - key->partcollation[0], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0], + partcollation[0], boundinfo->datums[mid][0], value)); if (cmpval <= 0) @@ -2977,7 +2988,8 @@ partition_list_bsearch(PartitionKey key, * to the input range bound */ static int -partition_range_bsearch(PartitionKey key, +partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, PartitionRangeBound *probe, bool *is_equal) { @@ -2992,8 +3004,7 @@ partition_range_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_cmp(key->partnatts, key->partsupfunc, - key->partcollation, + cmpval = partition_rbound_cmp(partnatts, partsupfunc, partcollation, boundinfo->datums[mid], boundinfo->kind[mid], (boundinfo->indexes[mid] == -1), @@ -3022,7 +3033,7 @@ partition_range_bsearch(PartitionKey key, * to the input tuple. 
*/ static int -partition_range_datum_bsearch(PartitionKey key, +partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal) { @@ -3037,8 +3048,8 @@ partition_range_datum_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_datum_cmp(key->partsupfunc, - key->partcollation, + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, boundinfo->datums[mid], boundinfo->kind[mid], values, @@ -3065,8 +3076,7 @@ partition_range_datum_bsearch(PartitionKey key, * all of them are greater */ static int -partition_hash_bsearch(PartitionKey key, - PartitionBoundInfo boundinfo, +partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder) { int lo, @@ -3264,27 +3274,27 @@ get_greatest_modulus(PartitionBoundInfo bound) * Compute the hash value for given not null partition key values. */ static uint64 -compute_hash_value(PartitionKey key, Datum *values, bool *isnull) +compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull) { int i; - int nkeys = key->partnatts; uint64 rowHash = 0; Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); - for (i = 0; i < nkeys; i++) + for (i = 0; i < partnatts; i++) { if (!isnull[i]) { Datum hash; - Assert(OidIsValid(key->partsupfunc[i].fn_oid)); + Assert(OidIsValid(partsupfunc[i].fn_oid)); /* * Compute hash for each datum value by calling respective * datatype-specific hash functions of each partition key * attribute. */ - hash = FunctionCall2(&key->partsupfunc[i], values[i], seed); + hash = FunctionCall2(&partsupfunc[i], values[i], seed); /* Form a single 64-bit hash value */ rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); From 6b542dcaf20bbda6f501310a50f1864e01b5e7e9 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 26 Feb 2018 17:05:46 -0300 Subject: [PATCH 234/578] Update PartitionTupleRouting struct comment Small review on edd44738bc88. Discussion: https://postgr.es/m/20180222165315.k27qfn4goskhoswj@alvherre.pgsql Reviewed-by: Robert Haas, Amit Langote --- src/include/executor/execPartition.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 40a67ea3..6996258a 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -58,11 +58,15 @@ typedef struct PartitionDispatchData *PartitionDispatch; * partition tree. * num_dispatch number of partitioned tables in the partition * tree (= length of partition_dispatch_info[]) - * partition_oids Array of leaf partitions OIDs + * partition_oids Array of leaf partitions OIDs with one entry + * for every leaf partition in the partition tree, + * initialized in full by + * ExecSetupPartitionTupleRouting. * partitions Array of ResultRelInfo* objects with one entry - * for every leaf partition in the partition tree. + * for every leaf partition in the partition tree, + * initialized lazily by ExecInitPartitionInfo. 
* num_partitions Number of leaf partitions in the partition tree - * (= 'partitions' array length) + * (= 'partitions_oid'/'partitions' array length) * parent_child_tupconv_maps Array of TupleConversionMap objects with one * entry for every leaf partition (required to * convert tuple from the root table's rowtype to From 59dc8ae73967f58329719c0b6f807610b38f8b67 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 28 Feb 2018 12:16:09 -0500 Subject: [PATCH 235/578] For partitionwise join, match on partcollation, not parttypcoll. The previous code considered two tables to have the partition scheme if the underlying columns had the same collation, but what we actually need to compare is not the collations associated with the column but the collation used for partitioning. Fix that. Robert Haas and Amit Langote Discussion: http://postgr.es/m/0f95f924-0efa-4cf5-eb5f-9a3d1bc3c33d@lab.ntt.co.jp --- src/backend/optimizer/util/plancat.c | 6 +++--- src/include/nodes/relation.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index dba8d09d..e310e85b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -2034,7 +2034,7 @@ find_partition_scheme(PlannerInfo *root, Relation relation) sizeof(Oid) * partnatts) != 0 || memcmp(partkey->partopcintype, part_scheme->partopcintype, sizeof(Oid) * partnatts) != 0 || - memcmp(partkey->parttypcoll, part_scheme->parttypcoll, + memcmp(partkey->partcollation, part_scheme->partcollation, sizeof(Oid) * partnatts) != 0) continue; @@ -2069,8 +2069,8 @@ find_partition_scheme(PlannerInfo *root, Relation relation) memcpy(part_scheme->partopcintype, partkey->partopcintype, sizeof(Oid) * partnatts); - part_scheme->parttypcoll = (Oid *) palloc(sizeof(Oid) * partnatts); - memcpy(part_scheme->parttypcoll, partkey->parttypcoll, + part_scheme->partcollation = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partcollation, partkey->partcollation, sizeof(Oid) * partnatts); part_scheme->parttyplen = (int16 *) palloc(sizeof(int16) * partnatts); diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ee843fad..6172b31e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -463,7 +463,7 @@ typedef struct PartitionSchemeData int16 partnatts; /* number of partition attributes */ Oid *partopfamily; /* OIDs of operator families */ Oid *partopcintype; /* OIDs of opclass declared input data types */ - Oid *parttypcoll; /* OIDs of collations of partition keys. */ + Oid *partcollation; /* OIDs of partitioning collations */ /* Cached information about partition key data types. */ int16 *parttyplen; From e1047e742f714523a6bb715645987dc9f3db0512 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 5 Mar 2018 17:49:59 -0800 Subject: [PATCH 236/578] Fix parent node of WCO expressions in partitioned tables. Since edd44738bc8814 WCO expressions of partitioned tables are initialized with the first subplan as parent. That's not correct, as the correct context is the ModifyTableState node. That's also what is used for RETURNING processing, initialized nearby. This appears not to cause any visible problems for in core code, but is problematic for in development patch. 
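For reference, a minimal sketch of the intended initialization (not part of the patch; the names mapped_wcoList, wcoExprs, ll, rlist, econtext, slot, partrel and resultRelInfo are taken from the surrounding ExecInitPartitionInfo code and are shown here only for illustration): the WCO quals of a routed partition are initialized with the ModifyTableState's PlanState (&mtstate->ps) as parent, the same parent used for the nearby RETURNING projection, so that any SubPlans inside those expressions are attached to the ModifyTable node rather than to the first subplan.

    /* Initialize WCO quals of the routed partition with the ModifyTable
     * node as parent, mirroring how the RETURNING projection is built. */
    foreach(ll, mapped_wcoList)
    {
        WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
        ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
                                           &mtstate->ps);

        wcoExprs = lappend(wcoExprs, wcoExpr);
    }

    /* The RETURNING projection uses the same parent PlanState. */
    resultRelInfo->ri_projectReturning =
        ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
                                RelationGetDescr(partrel));

Using one consistent parent matters because the parent PlanState owns the per-node state (including any initialized SubPlans) for expressions built under it; mixing parents, as the pre-fix code did, only works by accident for in-core callers.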
Discussion: https://postgr.es/m/20180303043818.tnvlo243bgy7una3@alap3.anarazel.de --- src/backend/executor/execPartition.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index ad72c3cf..50bc3754 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -413,7 +413,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - mtstate->mt_plans[0]); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } From 2e2ee0bfff790eebde820e65734e61df02bf35cc Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 14:49:58 +0800 Subject: [PATCH 237/578] Fix state reversal after partition tuple routing --- src/backend/commands/copy.c | 13 +- src/backend/executor/nodeModifyTable.c | 216 +++++++++++++------------ src/include/nodes/execnodes.h | 9 +- src/test/regress/expected/insert.out | 26 +++ src/test/regress/sql/insert.sql | 23 +++ 5 files changed, 177 insertions(+), 110 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 8ef1f6cd..321b44a1 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3489,13 +3489,12 @@ CopyFrom(CopyState cstate) if (cstate->transition_capture != NULL) { if (resultRelInfo->ri_TrigDesc && - (resultRelInfo->ri_TrigDesc->trig_insert_before_row || - resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) + resultRelInfo->ri_TrigDesc->trig_insert_before_row) { /* - * If there are any BEFORE or INSTEAD triggers on the - * partition, we'll have to be ready to convert their - * result back to tuplestore format. + * If there are any BEFORE triggers on the partition, + * we'll have to be ready to convert their result back to + * tuplestore format. */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = @@ -3772,18 +3771,18 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. 
*/ processed++; + } + /* Restore the saved ResultRelInfo */ if (saved_resultRelInfo) { resultRelInfo = saved_resultRelInfo; estate->es_result_relation_info = resultRelInfo; } - } #ifdef PGXC } #endif } - /* Flush any remaining buffered tuples */ #ifdef __TBASE__ if(IS_PGXC_DATANODE && npart > 0) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 8d8b816d..a8cb0df0 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -79,6 +79,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); +static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot); static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); @@ -281,7 +286,6 @@ ExecInsert(ModifyTableState *mtstate, {// #lizard forgives HeapTuple tuple; ResultRelInfo *resultRelInfo; - ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -322,92 +326,7 @@ ExecInsert(ModifyTableState *mtstate, tuple = ExecMaterializeSlot_shard(slot, hasshard, diskey, secdiskey, RelationGetRelid(resultRelationDesc)); } #endif - /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_tuple_routing) - { - int leaf_part_index; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - /* - * Away we go ... If we end up not finding a partition after all, - * ExecFindPartition() does not return and errors out instead. - * Otherwise, the returned value is to be used as an index into arrays - * proute->partitions[] and proute->partition_tupconv_maps[] that will - * get us the ResultRelInfo and TupleConversionMap for the partition, - * respectively. - */ - leaf_part_index = ExecFindPartition(resultRelInfo, - proute->partition_dispatch_info, - slot, - estate); - Assert(leaf_part_index >= 0 && - leaf_part_index < proute->num_partitions); - - /* - * Save the old ResultRelInfo and switch to the one corresponding to - * the selected partition. (We might need to initialize it first.) - */ - saved_resultRelInfo = resultRelInfo; - resultRelInfo = proute->partitions[leaf_part_index]; - if (resultRelInfo == NULL) - { - resultRelInfo = ExecInitPartitionInfo(mtstate, - saved_resultRelInfo, - proute, estate, - leaf_part_index); - Assert(resultRelInfo != NULL); - } - /* We do not yet have a way to insert into a foreign partition */ - if (resultRelInfo->ri_FdwRoutine) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot route inserted tuples to a foreign table"))); - - /* For ExecInsertIndexTuples() to work on the partition's indexes */ - estate->es_result_relation_info = resultRelInfo; - - /* - * If we're capturing transition tuples, we might need to convert from - * the partition rowtype to parent rowtype. - */ - if (mtstate->mt_transition_capture != NULL) - { - if (resultRelInfo->ri_TrigDesc && - (resultRelInfo->ri_TrigDesc->trig_insert_before_row || - resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) - { - /* - * If there are any BEFORE or INSTEAD triggers on the - * partition, we'll have to be ready to convert their result - * back to tuplestore format. 
- */ - mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; - - mtstate->mt_transition_capture->tcs_map = - TupConvMapForLeaf(proute, saved_resultRelInfo, - leaf_part_index); - } - else - { - /* - * Otherwise, just remember the original unconverted tuple, to - * avoid a needless round trip conversion. - */ - mtstate->mt_transition_capture->tcs_original_insert_tuple = tuple; - mtstate->mt_transition_capture->tcs_map = NULL; - } - } - - /* - * We might need to convert from the parent rowtype to the partition - * rowtype. - */ - tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], - tuple, - proute->partition_tuple_slot, - &slot); - } #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ else if (resultRelInfo->ispartparent) @@ -633,7 +552,7 @@ ExecInsert(ModifyTableState *mtstate, * No need though if the tuple has been routed, and a BR trigger * doesn't exist. */ - if (saved_resultRelInfo != NULL && + if (resultRelInfo->ri_PartitionRoot != NULL && !(resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_insert_before_row)) check_partition_constr = false; @@ -891,9 +810,6 @@ ExecInsert(ModifyTableState *mtstate, if (resultRelInfo->ri_projectReturning) result = ExecProcessReturning(resultRelInfo, slot, planSlot); - if (saved_resultRelInfo) - estate->es_result_relation_info = saved_resultRelInfo; - return result; } @@ -1528,27 +1444,22 @@ lreplace:; proute->root_tuple_slot, &slot); - - /* - * For ExecInsert(), make it look like we are inserting into the - * root. - */ + /* Prepare for tuple routing */ Assert(mtstate->rootResultRelInfo != NULL); - estate->es_result_relation_info = mtstate->rootResultRelInfo; + slot = ExecPrepareTupleRouting(mtstate, estate, proute, + mtstate->rootResultRelInfo, slot); ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, ONCONFLICT_NONE, estate, canSetTag); - /* - * Revert back the active result relation and the active - * transition capture map that we changed above. - */ + /* Revert ExecPrepareTupleRouting's node change. */ estate->es_result_relation_info = resultRelInfo; if (mtstate->mt_transition_capture) { mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; mtstate->mt_transition_capture->tcs_map = saved_tcs_map; } + return ret_slot; } @@ -2046,6 +1957,103 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) } } +/* + * ExecPrepareTupleRouting --- prepare for routing one tuple + * + * Determine the partition in which the tuple in slot is to be inserted, + * and modify mtstate and estate to prepare for it. + * + * Caller must revert the estate changes after executing the insertion! + * In mtstate, transition capture changes may also need to be reverted. + * + * Returns a slot holding the tuple of the partition rowtype. + */ +static TupleTableSlot * +ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot) +{ + int partidx; + ResultRelInfo *partrel; + HeapTuple tuple; + + /* + * Determine the target partition. If ExecFindPartition does not find + * a partition after all, it doesn't return here; otherwise, the returned + * value is to be used as an index into the arrays for the ResultRelInfo + * and TupleConversionMap for the partition. 
+ */ + partidx = ExecFindPartition(targetRelInfo, + proute->partition_dispatch_info, + slot, + estate); + Assert(partidx >= 0 && partidx < proute->num_partitions); + + /* + * Get the ResultRelInfo corresponding to the selected partition; if not + * yet there, initialize it. + */ + partrel = proute->partitions[partidx]; + if (partrel == NULL) + partrel = ExecInitPartitionInfo(mtstate, targetRelInfo, + proute, estate, + partidx); + + /* We do not yet have a way to insert into a foreign partition */ + if (partrel->ri_FdwRoutine) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot route inserted tuples to a foreign table"))); + + /* + * Make it look like we are inserting into the partition. + */ + estate->es_result_relation_info = partrel; + + /* Get the heap tuple out of the given slot. */ + tuple = ExecMaterializeSlot(slot); + + /* + * If we're capturing transition tuples, we might need to convert from the + * partition rowtype to parent rowtype. + */ + if (mtstate->mt_transition_capture != NULL) + { + if (partrel->ri_TrigDesc && + partrel->ri_TrigDesc->trig_insert_before_row) + { + /* + * If there are any BEFORE triggers on the partition, we'll have + * to be ready to convert their result back to tuplestore format. + */ + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = + TupConvMapForLeaf(proute, targetRelInfo, partidx); + } + else + { + /* + * Otherwise, just remember the original unconverted tuple, to + * avoid a needless round trip conversion. + */ + mtstate->mt_transition_capture->tcs_original_insert_tuple = tuple; + mtstate->mt_transition_capture->tcs_map = NULL; + } + } + + /* + * Convert the tuple, if necessary. + */ + ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx], + tuple, + proute->partition_tuple_slot, + &slot); + + return slot; +} + /* * Initialize the child-to-root tuple conversion map array for UPDATE subplans. * @@ -2182,6 +2190,7 @@ static TupleTableSlot * ExecModifyTable(PlanState *pstate) {// #lizard forgives ModifyTableState *node = castNode(ModifyTableState, pstate); + PartitionTupleRouting *proute = node->mt_partition_tuple_routing; EState *estate = node->ps.state; CmdType operation = node->operation; ResultRelInfo *saved_resultRelInfo; @@ -2658,9 +2667,16 @@ ExecModifyTable(PlanState *pstate) oldtag = mls_command_tag_switch_to(CLS_CMD_ROW); } #endif + /* Prepare for tuple routing if needed. */ + if (proute) + slot = ExecPrepareTupleRouting(node, estate, proute, + resultRelInfo, slot); slot = ExecInsert(node, slot, planSlot, node->mt_arbiterindexes, node->mt_onconflict, estate, node->canSetTag); + /* Revert ExecPrepareTupleRouting's state change. */ + if (proute) + estate->es_result_relation_info = resultRelInfo; #ifdef _MLS_ if (IsA(subplanstate, ResultState) || IsA(subplanstate, RemoteSubplanState) ) { diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 74475d60..bed56a23 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1066,12 +1066,15 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... 
projection target */ - struct PartitionTupleRouting *mt_partition_tuple_routing; + /* Tuple-routing support info */ + struct PartitionTupleRouting *mt_partition_tuple_routing; + + /* controls transition table population for specified operation */ struct TransitionCaptureState *mt_transition_capture; - /* controls transition table population */ - TupleConversionMap **mt_per_subplan_tupconv_maps; + /* Per plan map for tuple conversion from child to root */ + TupleConversionMap **mt_per_subplan_tupconv_maps; #ifdef __TBASE__ /* used for interval partition */ bool haspartparent; diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a671b345..e1a74c4a 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -778,6 +778,32 @@ drop role regress_coldesc_role; drop table inserttest3; drop table brtrigpartcon; drop function brtrigpartcon1trigf(); +-- check that "do nothing" BR triggers work with tuple-routing (this checks +-- that estate->es_result_relation_info is appropriately set/reset for each +-- routed tuple) +create table donothingbrtrig_test (a int, b text) partition by list (a); +create table donothingbrtrig_test1 (b text, a int); +create table donothingbrtrig_test2 (c text, b text, a int); +alter table donothingbrtrig_test2 drop column c; +create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; +create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +create trigger donothingbrtrig2 before insert on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); +NOTICE: b: foo +NOTICE: b: bar +copy donothingbrtrig_test from stdout; +NOTICE: b: baz +NOTICE: b: qux +select tableoid::regclass, * from donothingbrtrig_test; + tableoid | a | b +----------+---+--- +(0 rows) + +-- cleanup +drop table donothingbrtrig_test; +drop function donothingbrtrig_func(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 21d04de1..9a561519 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -483,6 +483,29 @@ drop table inserttest3; drop table brtrigpartcon; drop function brtrigpartcon1trigf(); +-- check that "do nothing" BR triggers work with tuple-routing (this checks +-- that estate->es_result_relation_info is appropriately set/reset for each +-- routed tuple) +create table donothingbrtrig_test (a int, b text) partition by list (a); +create table donothingbrtrig_test1 (b text, a int); +create table donothingbrtrig_test2 (c text, b text, a int); +alter table donothingbrtrig_test2 drop column c; +create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; +create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +create trigger donothingbrtrig2 before insert 
on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); +copy donothingbrtrig_test from stdout; +1 baz +2 qux +\. +select tableoid::regclass, * from donothingbrtrig_test; + +-- cleanup +drop table donothingbrtrig_test; +drop function donothingbrtrig_func(); + -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); From 5827d1e618561143ec6a746817a8accf7410a018 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 20:53:20 +0800 Subject: [PATCH 238/578] Factor some code out of create_grouping_paths.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 6146 +++++++++++++------------- 1 file changed, 3112 insertions(+), 3034 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b22d678d..640dcc4d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -203,6 +203,26 @@ static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path); static bool can_push_down_window(PlannerInfo *root, Path *path); static void adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, List *targets, List *targets_contain_srfs); +static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *target, + PathTarget *partial_grouping_target, + const AggClauseCosts *agg_costs, + const AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, bool can_sort, bool can_hash, + double dNumGroups, List *havingQual); +static void add_partial_paths_to_grouping_rel(PlannerInfo *root, + RelOptInfo *input_rel, + RelOptInfo *grouped_rel, + PathTarget *target, + PathTarget *partial_grouping_target, + AggClauseCosts *agg_partial_costs, + AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, + bool can_sort, + bool can_hash, + List *havingQual); +static bool can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs); #ifdef __TBASE__ static Path *adjust_modifytable_subpath(PlannerInfo *root, Query *parse, Path *path); #endif @@ -3899,15 +3919,12 @@ create_grouping_paths(PlannerInfo *root, PathTarget *partial_grouping_target = NULL; AggClauseCosts agg_partial_costs; /* parallel only */ AggClauseCosts agg_final_costs; /* parallel only */ - Size hashaggtablesize; double dNumGroups; - double dNumPartialGroups = 0; bool can_hash; bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; - ListCell *lc; /* For now, do all work in the (GROUP_AGG, NULL) upperrel */ grouped_rel = fetch_upper_rel(root, UPPERREL_GROUP_AGG, NULL); @@ -4041,44 +4058,11 @@ create_grouping_paths(PlannerInfo *root, (gd ? gd->any_hashable : grouping_is_hashable(parse->groupClause))); /* - * If grouped_rel->consider_parallel is true, then paths that we generate - * for this grouping relation could be run inside of a worker, but that - * doesn't mean we can actually use the PartialAggregate/FinalizeAggregate - * execution strategy. Figure that out. 
- */ - if (!grouped_rel->consider_parallel) - { - /* Not even parallel-safe. */ - try_parallel_aggregation = false; - } - else if (input_rel->partial_pathlist == NIL) - { - /* Nothing to use as input for partial aggregate. */ - try_parallel_aggregation = false; - } - else if (!parse->hasAggs && parse->groupClause == NIL) - { - /* - * We don't know how to do parallel aggregation unless we have either - * some aggregates or a grouping clause. + * Figure out whether a PartialAggregate/Finalize Aggregate execution + * strategy is viable. */ - try_parallel_aggregation = false; - } - else if (parse->groupingSets) - { - /* We don't know how to do grouping sets in parallel. */ - try_parallel_aggregation = false; - } - else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) - { - /* Insufficient support for partial mode. */ - try_parallel_aggregation = false; - } - else - { - /* Everything looks good. */ - try_parallel_aggregation = true; - } + try_parallel_aggregation = can_parallel_agg(root, input_rel, grouped_rel, + agg_costs); /* * The distributed aggregation however works even if there are no partial @@ -4133,8 +4117,6 @@ create_grouping_paths(PlannerInfo *root, */ if (try_parallel_aggregation) { - Path *cheapest_partial_path = linitial(input_rel->partial_pathlist); - /* * Build target list for partial aggregate paths. These paths cannot * just emit the same tlist as regular aggregate paths, because (1) we @@ -4144,11 +4126,6 @@ create_grouping_paths(PlannerInfo *root, */ partial_grouping_target = make_partial_grouping_target(root, target); - /* Estimate number of partial groups. */ - dNumPartialGroups = get_number_of_groups(root, - cheapest_partial_path->rows, - gd); - /* * Collect statistics about aggregates for estimating costs of * performing aggregation in parallel. @@ -4171,97 +4148,11 @@ create_grouping_paths(PlannerInfo *root, &agg_final_costs); } - if (can_sort) - { - /* This was checked before setting try_parallel_aggregation */ - Assert(parse->hasAggs || parse->groupClause); - - /* - * Use any available suitably-sorted path as input, and also - * consider sorting the cheapest partial path. - */ - foreach(lc, input_rel->partial_pathlist) - { - Path *path = (Path *) lfirst(lc); - bool is_sorted; - - is_sorted = pathkeys_contained_in(root->group_pathkeys, - path->pathkeys); - if (path == cheapest_partial_path || is_sorted) - { - /* Sort the cheapest partial path, if it isn't already */ - if (!is_sorted) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - - if (parse->hasAggs) - add_partial_path(grouped_rel, (Path *) - create_agg_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups)); - else - add_partial_path(grouped_rel, (Path *) - create_group_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause, - NIL, - dNumPartialGroups)); - } - } - } - - if (can_hash) - { - /* Checked above */ - Assert(parse->hasAggs || parse->groupClause); - - hashaggtablesize = - estimate_hashagg_tablesize(cheapest_partial_path, - &agg_partial_costs, - dNumPartialGroups); - - /* - * Tentatively produce a partial HashAgg Path, depending on if it - * looks as if the hash table will fit in work_mem. 
- */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) -#else - if (hashaggtablesize < work_mem * 1024L) -#endif - { - AggPath *aggpath = (AggPath *) - create_agg_path(root, - grouped_rel, - cheapest_partial_path, + add_partial_paths_to_grouping_rel(root, input_rel, grouped_rel, target, partial_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L) - { - aggpath->hybrid = true; - } -#endif - add_partial_path(grouped_rel, (Path *)aggpath); - } - } + &agg_partial_costs, &agg_final_costs, + gd, can_sort, can_hash, + (List *) parse->havingQual); } #ifdef __TBASE__ else @@ -4300,6 +4191,67 @@ create_grouping_paths(PlannerInfo *root, */ /* Build final grouping paths */ + add_paths_to_grouping_rel(root, input_rel, grouped_rel, target, + partial_grouping_target, agg_costs, + &agg_final_costs, gd, can_sort, can_hash, + dNumGroups, (List *) parse->havingQual); + + /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ + + /* + * If there were no partial paths, we did not initialize any of the + * partial paths above. If that's the case, initialize here. + * + * XXX The reason why the initialization block at the beginning is not + * simply performed unconditionally is that we may skip it if we've been + * successful in fully pushing down any of the aggregates, and entirely + * skip generating the XL paths. + * + * XXX Can we simply use the same estimates as regular partial aggregates, + * or do we need to invent something else? It might be a better idea to + * use estimates for the whole result here (e.g. total number of groups) + * instead of the partial ones. Underestimates often have more severe + * consequences (e.g. OOM with HashAggregate) than overestimates, so this + * seems like a more defensive approach. + * + * XXX After thinking a bit more about the estimation, it may depend on + * pushdown - if the aggregate is fully pushed down (as above, we can + * probably use dNumGroups/numberOfNodes as a cardinality estimate, as + * we know the per-node groupings won't overlap. But here we need to be + * more careful. + */ + if (try_distributed_aggregation) + { + partial_grouping_target = make_partial_grouping_target(root, target); + + /* Estimate number of partial groups. */ + dNumPartialGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); + + /* + * Collect statistics about aggregates for estimating costs of + * performing aggregation in parallel. + */ + MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts)); + MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts)); + if (parse->hasAggs) + { + /* partial phase */ + get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs, + AGGSPLIT_INITIAL_SERIAL, + &agg_partial_costs); + + /* final phase */ + get_agg_clause_costs(root, (Node *) target->exprs, + AGGSPLIT_FINAL_DESERIAL, + &agg_final_costs); + get_agg_clause_costs(root, parse->havingQual, + AGGSPLIT_FINAL_DESERIAL, + &agg_final_costs); + } + + /* Build final XL grouping paths */ if (can_sort) { /* @@ -4324,115 +4276,119 @@ create_grouping_paths(PlannerInfo *root, */ if (path == cheapest_path || is_sorted) { -#ifdef __TBASE__ - bool try_redistribute_grouping = false; - double dNumLocalGroups; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - - grouped_rel->reltarget = local_grouping_target; - - /* Estimate number of partial groups. 
*/ - dNumLocalGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); -#endif - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - if (!is_sorted && !agg_costs->hasOnlyDistinct) + /* + * We can't really beat paths that we managed to fully push + * down above, so we can skip them entirely. + * + * XXX Not constructing any paths, so we can do this before + * adding the Sort path. + */ + if (can_push_down_grouping(root, parse, path)) + continue; + + /* Sort the cheapest-total path if it isn't already sorted */ + if (!is_sorted) path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); - } - else - { + + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ + + path = (Path *) create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumPartialGroups); + +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); #endif - /* Sort the cheapest-total path if it isn't already sorted */ - if (!is_sorted) - path = (Path *) create_sort_path(root, + +#ifdef __TBASE__ + if (parse->groupClause && olap_optimizer && !has_cold_hot_table && + (!is_sorted || root->group_pathkeys)) + { + path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); -#ifdef __TBASE__ - } + } #endif - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - /* some special aggs cannot be parallel executed, such as count(distinct) */ - if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || - parse->groupingSets || path->pathtype == T_Agg || - path->pathtype == T_Group || !olap_optimizer || has_cold_hot_table) - { - if (agg_costs->hasOnlyDistinct && olap_optimizer && !parse->groupingSets - && !has_cold_hot_table) - path = create_redistribute_grouping_path(root, parse, path); - else - path = create_remotesubplan_path(root, path, NULL); - - if (agg_costs->hasOnlyDistinct && olap_optimizer && - !has_cold_hot_table) - { - if (root->group_pathkeys) - { - path = (Path *) create_sort_path(root, + /* + * We generate two paths, differing in the second phase + * implementation (sort and hash). + */ + { + Path *agg_path = (Path *) + create_agg_path(root, grouped_rel, path, - root->group_pathkeys, - -1.0); - } - } - } - else - { - /* - * If the grouping can not be fully pushed down, we adopt another - * strategy instead. - * 1. do grouping on each datanode locally - * 2. re-distribute grouping results among datanodes, then do the - * final grouping - */ + target, + parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + (List *) parse->havingQual, + &agg_final_costs, + dNumGroups); - try_redistribute_grouping = true; + //agg_path->parallel_safe = true; - /* step 1 */ - if (parse->groupingSets) - { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ + add_path(grouped_rel, agg_path); } - else if (parse->hasAggs) - { - /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. - */ - path = (Path *) create_agg_path(root, + if (can_hash) + { + Path *agg_path = (Path *) + create_agg_path(root, grouped_rel, path, - local_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, parse->groupClause, - NIL, - &agg_partial_costs, - dNumLocalGroups); - } - else if (parse->groupClause) - { + (List *) parse->havingQual, + &agg_final_costs, + dNumGroups); + + //agg_path->parallel_safe = true; + + add_path(grouped_rel, agg_path); + } + } + else if (parse->groupClause) + { /* * We have GROUP BY without aggregation or grouping sets. * Make a GroupPath. @@ -4440,85 +4396,126 @@ create_grouping_paths(PlannerInfo *root, path = (Path *) create_group_path(root, grouped_rel, path, - local_grouping_target, + partial_grouping_target, parse->groupClause, NIL, - dNumLocalGroups); - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } + dNumPartialGroups); - /* step 2*/ +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ path = create_redistribute_grouping_path(root, parse, path); } - } + else + { + path = create_remotesubplan_path(root, path, NULL); + } #else - path = create_remotesubplan_path(root, path, NULL); + path = create_remotesubplan_path(root, path, NULL); #endif - else - try_distributed_aggregation = false; - #ifdef __TBASE__ - if(try_redistribute_grouping) - { - /* - * do final grouping at each datanode - */ - - /* Now decide what to stick atop it */ - if (parse->groupingSets) - { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ - } - else if (parse->hasAggs) - { - /* - * We generate two paths, differing in the second phase - * implementation (sort and hash). - */ - Path *remote_path = path; - - if (parse->groupClause) + if (olap_optimizer && !has_cold_hot_table && (!is_sorted || root->group_pathkeys)) { - if (!is_sorted || root->group_pathkeys) - { path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); - } } - - path = (Path *)create_agg_path(root, +#endif + { + Path *group_path = (Path *) + create_group_path(root, grouped_rel, path, target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, parse->groupClause, (List *) parse->havingQual, - &agg_final_costs, dNumGroups); - //path->parallel_safe = true; + //group_path->parallel_safe = true; - add_path(grouped_rel, path); + add_path(grouped_rel, group_path); + } - if (can_hash) - { - path = (Path *) + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } + } + } + } + + if (can_hash) + { + hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + agg_costs, + dNumGroups); + + /* + * Provided that the estimated size of the hashtable does not exceed + * work_mem, we'll generate a HashAgg Path, although if we were unable + * to sort above, then we'd better generate a Path, so that we at + * least have one. + */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || + grouped_rel->pathlist == NIL) +#else + if (hashaggtablesize < work_mem * 1024L || + grouped_rel->pathlist == NIL) +#endif + { + /* If the whole aggregate was pushed down, we're done. */ + if (! can_push_down_grouping(root, parse, cheapest_path)) + { + Path *path, *agg_path; + + path = (Path *) create_agg_path(root, + grouped_rel, + cheapest_path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumPartialGroups); + + /* keep partially aggregated path for the can_sort branch */ + agg_path = path; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; + + aggpath->hybrid = true; + } +#endif + +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + /* Generate paths with both hash and sort second phase. */ + { + Path *agg_path = (Path *) create_agg_path(root, grouped_rel, - remote_path, + path, target, AGG_HASHED, AGGSPLIT_FINAL_DESERIAL, @@ -4526,3381 +4523,3462 @@ create_grouping_paths(PlannerInfo *root, (List *) parse->havingQual, &agg_final_costs, dNumGroups); - //path->parallel_safe = true; - if (g_hybrid_hash_agg) + + //agg_path->parallel_safe = true; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) { - AggPath *agg = (AggPath *)path; - agg->hybrid = true; - } + AggPath *aggpath = (AggPath *)agg_path; - add_path(grouped_rel, path); + aggpath->hybrid = true; } +#endif + add_path(grouped_rel, agg_path); } - else if (parse->groupClause) + + if (can_sort) { - if (!is_sorted || root->group_pathkeys) - { +#ifdef __TBASE__ + if (!olap_optimizer || has_cold_hot_table) +#endif path = (Path *) create_sort_path(root, grouped_rel, - path, + agg_path, root->group_pathkeys, -1.0); - } - - path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - //path->parallel_safe = true; - - /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. 
- */ - add_path(grouped_rel, path); - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } - } - else - { +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, agg_path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); #endif - /* Now decide what to stick atop it */ - if (parse->groupingSets) - { - consider_groupingsets_paths(root, grouped_rel, - path, true, can_hash, target, - gd, agg_costs, dNumGroups); - } - else if (parse->hasAggs) - { #ifdef __TBASE__ - bool parallel_aware = false; - bool parallel_safe = false; - Path *agg_path = NULL; - - if (root->group_pathkeys && olap_optimizer && - !has_cold_hot_table && agg_costs->hasOnlyDistinct) - { - if (!pathkeys_contained_in(root->group_pathkeys, - path->pathkeys)) + if (olap_optimizer && !has_cold_hot_table) { + /* + * AGG_HASHED aggregate paths are always unsorted, so add + * a Sorted node for the final AGG_SORTED step. + */ path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); } - } - - if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) - { - SortPath *pathnode = (SortPath *)path; - - if (pathnode->subpath->pathtype == T_Gather || agg_costs->hasOnlyDistinct) - { - path->parallel_aware = true; - parallel_aware = true; - parallel_safe = true; - } - } +#endif - agg_path = (Path *) + path = (Path *) create_agg_path(root, grouped_rel, path, target, parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_SIMPLE, + AGGSPLIT_FINAL_DESERIAL, parse->groupClause, (List *) parse->havingQual, - agg_costs, + &agg_final_costs, dNumGroups); - agg_path->parallel_aware = parallel_aware; - agg_path->parallel_safe = parallel_safe; - add_path(grouped_rel, agg_path); -#else - /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. - */ - add_path(grouped_rel, (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups)); -#endif - } - else if (parse->groupClause) - { -#ifdef __TBASE__ - bool parallel_aware = false; - bool parallel_safe = false; - Path *group_path = NULL; + //path->parallel_safe = true; - if (root->group_pathkeys && olap_optimizer && - !has_cold_hot_table && agg_costs->hasOnlyDistinct) - { - if (!pathkeys_contained_in(root->group_pathkeys, - path->pathkeys)) - { - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - } + add_path(grouped_rel, path); } + } + } + } + } - if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) - { - SortPath *pathnode = (SortPath *)path; - - if (pathnode->subpath->pathtype == T_Gather) - { - path->parallel_aware = true; - parallel_aware = true; - parallel_safe = true; - } - } - - group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - group_path->parallel_aware = parallel_aware; - group_path->parallel_safe = parallel_safe; - add_path(grouped_rel, group_path); -#else + /* Give a helpful error if we failed to find any implementation */ + if (grouped_rel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement GROUP BY"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. */ - add_path(grouped_rel, (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups)); -#endif - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } -#ifdef __TBASE__ - } -#endif - } - } - + if (grouped_rel->fdwroutine && + grouped_rel->fdwroutine->GetForeignUpperPaths) + grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG, + input_rel, grouped_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, + input_rel, grouped_rel); + + /* Now choose the best path(s) */ + set_cheapest(grouped_rel); /* - * Now generate a complete GroupAgg Path atop of the cheapest partial - * path. We can do this using either Gather or Gather Merge. + * We've been using the partial pathlist for the grouped relation to hold + * partially aggregated paths, but that's actually a little bit bogus + * because it's unsafe for later planning stages -- like ordered_rel --- + * to get the idea that they can use these partial paths as if they didn't + * need a FinalizeAggregate step. Zap the partial pathlist at this stage + * so we don't get confused. 
*/ - if (grouped_rel->partial_pathlist) - { -#ifdef __TBASE__ - bool redistribute_group PG_USED_FOR_ASSERTS_ONLY = false; -#endif - Path *path = (Path *) linitial(grouped_rel->partial_pathlist); -#ifdef __TBASE__ - double total_groups = 0; + grouped_rel->partial_pathlist = NIL; - if (olap_optimizer && !has_cold_hot_table) - { - total_groups = path->rows; - } - else - total_groups = path->rows * path->parallel_workers; -#else - double total_groups = path->rows * path->parallel_workers; -#endif - path = (Path *) create_gather_path(root, - grouped_rel, - path, - partial_grouping_target, - NULL, - &total_groups); - /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. - */ -#ifdef __TBASE__ - if (!olap_optimizer || has_cold_hot_table) - { -#endif - if (root->group_pathkeys) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); -#ifdef __TBASE__ - } -#endif - /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. - * - * If if can be pushed down, disable construction of complex - * distributed paths. - * - * XXX Keep this after the Sort node, to make the path sorted. - */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - redistribute_group = true; - } - else - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + return grouped_rel; +} - else - try_distributed_aggregation = false; -#ifdef __TBASE__ +/* + * For a given input path, consider the possible ways of doing grouping sets on + * it, by combinations of hashing and sorting. This can be called multiple + * times, so it's important that it not scribble on input. No result is + * returned, but any generated paths are added to grouped_rel. + */ +static void +consider_groupingsets_paths(PlannerInfo *root, + RelOptInfo *grouped_rel, + Path *path, + bool is_sorted, + bool can_hash, + PathTarget *target, + grouping_sets_data *gd, + const AggClauseCosts *agg_costs, + double dNumGroups) +{ + Query *parse = root->parse; + /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. + * If we're not being offered sorted input, then only consider plans that + * can be done entirely by hashing. + * + * We can hash everything if it looks like it'll fit in work_mem. But if + * the input is actually sorted despite not being advertised as such, we + * prefer to make use of that in order to use less memory. + * + * If none of the grouping sets are sortable, then ignore the work_mem + * limit and generate a path anyway, since otherwise we'll just fail. */ - if (olap_optimizer && !has_cold_hot_table) - { - if (root->group_pathkeys) - { - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - path->parallel_aware = true; - } - } -#endif - if (parse->hasAggs) - { - Path *agg_path = NULL; - - agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + if (!is_sorted) { - agg_path->parallel_safe = true; - agg_path->parallel_aware = true; - } - - add_path(grouped_rel, agg_path); - } - else - { - Path *group_path = NULL; - - group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + List *new_rollups = NIL; + RollupData *unhashed_rollup = NULL; + List *sets_data; + List *empty_sets_data = NIL; + List *empty_sets = NIL; + ListCell *lc; + ListCell *l_start = list_head(gd->rollups); + AggStrategy strat = AGG_HASHED; + Size hashsize; + double exclude_groups = 0.0; + + Assert(can_hash); + + if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys)) { - group_path->parallel_safe = true; - group_path->parallel_aware = true; + unhashed_rollup = lfirst(l_start); + exclude_groups = unhashed_rollup->numGroups; + l_start = lnext(l_start); } - add_path(grouped_rel, group_path); - } + hashsize = estimate_hashagg_tablesize(path, + agg_costs, + dNumGroups - exclude_groups); /* - * The point of using Gather Merge rather than Gather is that it - * can preserve the ordering of the input path, so there's no - * reason to try it unless (1) it's possible to produce more than - * one output row and (2) we want the output path to be ordered. + * gd->rollups is empty if we have only unsortable columns to work + * with. Override work_mem in that case; otherwise, we'll rely on the + * sorted-input case to generate usable mixed paths. */ - if (parse->groupClause != NIL && root->group_pathkeys != NIL) - { - foreach(lc, grouped_rel->partial_pathlist) - { - Path *subpath = (Path *) lfirst(lc); - Path *gmpath; - double total_groups; + if (hashsize > work_mem * 1024L && gd->rollups) + return; /* nope, won't fit */ /* - * It's useful to consider paths that are already properly - * ordered for Gather Merge, because those don't need a - * sort. It's also useful to consider the cheapest path, - * because sorting it in parallel and then doing Gather - * Merge may be better than doing an unordered Gather - * followed by a sort. But there's no point in - * considering non-cheapest paths that aren't already - * sorted correctly. + * We need to burst the existing rollups list into individual grouping + * sets and recompute a groupClause for each set. */ - if (path != subpath && - !pathkeys_contained_in(root->group_pathkeys, - subpath->pathkeys)) - continue; + sets_data = list_copy(gd->unsortable_sets); -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - total_groups = subpath->rows; - else -#endif - total_groups = subpath->rows * subpath->parallel_workers; - - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) + for_each_cell(lc, l_start) { - gmpath = (Path *) create_gather_path(root, - grouped_rel, - subpath, - partial_grouping_target, - NULL, - &total_groups); - } - else -#endif - gmpath = (Path *) - create_gather_merge_path(root, - grouped_rel, - subpath, - partial_grouping_target, - root->group_pathkeys, - NULL, - &total_groups); + RollupData *rollup = lfirst(lc); /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. 
+ * If we find an unhashable rollup that's not been skipped by the + * "actually sorted" check above, we can't cope; we'd need sorted + * input (with a different sort order) but we can't get that here. + * So bail out; we'll get a valid path from the is_sorted case + * instead. + * + * The mere presence of empty grouping sets doesn't make a rollup + * unhashable (see preprocess_grouping_sets), we handle those + * specially below. */ - redistribute_group = false; - - if (! can_push_down_grouping(root, parse, gmpath)) -#ifdef __TBASE__ + if (!rollup->hashable) + return; + else + sets_data = list_concat(sets_data, list_copy(rollup->gsets_data)); + } + foreach(lc, sets_data) { - if (olap_optimizer && !has_cold_hot_table) + GroupingSetData *gs = lfirst(lc); + List *gset = gs->set; + RollupData *rollup; + + if (gset == NIL) { - /* redistribute local grouping results among datanodes */ - gmpath = create_redistribute_grouping_path(root, parse, gmpath); - redistribute_group = true; + /* Empty grouping sets can't be hashed. */ + empty_sets_data = lappend(empty_sets_data, gs); + empty_sets = lappend(empty_sets, NIL); } else { - gmpath = create_remotesubplan_path(root, gmpath, NULL); + rollup = makeNode(RollupData); + + rollup->groupClause = preprocess_groupclause(root, gset); + rollup->gsets_data = list_make1(gs); + rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, + rollup->gsets_data, + gd->tleref_to_colnum_map); + rollup->numGroups = gs->numGroups; + rollup->hashable = true; + rollup->is_hashed = true; + new_rollups = lappend(new_rollups, rollup); } } -#else - gmpath = create_remotesubplan_path(root, gmpath, NULL); -#endif -#ifdef __TBASE__ /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. + * If we didn't find anything nonempty to hash, then bail. We'll + * generate a path from the is_sorted case. */ - if (olap_optimizer && !has_cold_hot_table) + if (new_rollups == NIL) + return; + + /* + * If there were empty grouping sets they should have been in the + * first rollup. + */ + Assert(!unhashed_rollup || !empty_sets); + + if (unhashed_rollup) { - if (root->group_pathkeys) + new_rollups = lappend(new_rollups, unhashed_rollup); + strat = AGG_MIXED; + } + else if (empty_sets) { - gmpath = (Path *) create_sort_path(root, - grouped_rel, - gmpath, - root->group_pathkeys, - -1.0); - - gmpath->parallel_aware = true; - } + RollupData *rollup = makeNode(RollupData); + + rollup->groupClause = NIL; + rollup->gsets_data = empty_sets_data; + rollup->gsets = empty_sets; + rollup->numGroups = list_length(empty_sets); + rollup->hashable = false; + rollup->is_hashed = false; + new_rollups = lappend(new_rollups, rollup); + strat = AGG_MIXED; } -#endif - if (parse->hasAggs) - { - Path *agg_path = NULL; - agg_path = (Path *) - create_agg_path(root, + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, grouped_rel, - gmpath, + path, target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_safe = true; - agg_path->parallel_aware = true; + strat, + new_rollups, + agg_costs, + dNumGroups)); + return; } - add_path(grouped_rel, agg_path); - } - else + /* + * If we have sorted input but nothing we can do with it, bail. + */ + if (list_length(gd->rollups) == 0) + return; + + /* + * Given sorted input, we try and make two paths: one sorted and one mixed + * sort/hash. (We need to try both because hashagg might be disabled, or + * some columns might not be sortable.) + * + * can_hash is passed in as false if some obstacle elsewhere (such as + * ordered aggs) means that we shouldn't consider hashing at all. + */ + if (can_hash && gd->any_hashable) { - Path *group_path = NULL; - - group_path = (Path *) - create_group_path(root, - grouped_rel, - gmpath, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + List *rollups = NIL; + List *hash_sets = list_copy(gd->unsortable_sets); + double availspace = (work_mem * 1024.0); + ListCell *lc; + + /* + * Account first for space needed for groups we can't sort at all. + */ + availspace -= (double) estimate_hashagg_tablesize(path, + agg_costs, + gd->dNumHashGroups); + + if (availspace > 0 && list_length(gd->rollups) > 1) { - group_path->parallel_safe = true; - group_path->parallel_aware = true; - } + double scale; + int num_rollups = list_length(gd->rollups); + int k_capacity; + int *k_weights = palloc(num_rollups * sizeof(int)); + Bitmapset *hash_items = NULL; + int i; - add_path(grouped_rel, group_path); - } - } - } - } - } + /* + * We treat this as a knapsack problem: the knapsack capacity + * represents work_mem, the item weights are the estimated memory + * usage of the hashtables needed to implement a single rollup, + * and we really ought to use the cost saving as the item value; + * however, currently the costs assigned to sort nodes don't + * reflect the comparison costs well, and so we treat all items as + * of equal value (each rollup we hash instead saves us one sort). + * + * To use the discrete knapsack, we need to scale the values to a + * reasonably small bounded range. We choose to allow a 5% error + * margin; we have no more than 4096 rollups in the worst possible + * case, which with a 5% error margin will require a bit over 42MB + * of workspace. (Anyone wanting to plan queries that complex had + * better have the memory for it. In more reasonable cases, with + * no more than a couple of dozen rollups, the memory usage will + * be negligible.) + * + * k_capacity is naturally bounded, but we clamp the values for + * scale and weight (below) to avoid overflows or underflows (or + * uselessly trying to use a scale factor less than 1 byte). + */ + scale = Max(availspace / (20.0 * num_rollups), 1.0); + k_capacity = (int) floor(availspace / scale); - if (can_hash) - { - if (parse->groupingSets) - { /* - * Try for a hash-only groupingsets path over unsorted input. + * We leave the first rollup out of consideration since it's the + * one that matches the input sort order. We assign indexes "i" + * to only those entries considered for hashing; the second loop, + * below, must use the same condition. 
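
The comment above frames the choice of which rollups to hash as a discrete knapsack problem: the capacity is work_mem, each item's weight is the estimated hashtable size of one rollup, every item has equal value (each hashed rollup saves one sort), and the sizes are scaled down so the dynamic-programming table stays small. Below is a minimal standalone sketch of that idea, assuming invented per-rollup size estimates, an invented work_mem, and a 4096-unit capacity; the planner itself delegates to estimate_hashagg_tablesize() and DiscreteKnapsack() rather than the toy DP shown here.

    /*
     * Standalone illustration (not PostgreSQL code): pick which rollups to
     * hash so that their combined hashtable size fits within "work_mem",
     * maximizing the number of sorts saved (one per hashed rollup).
     * All sizes and the work_mem value are invented for the example.
     */
    #include <stdio.h>
    #include <string.h>

    #define MAX_CAP 4096        /* capacity after scaling, kept small */

    int
    main(void)
    {
        double  work_mem_bytes = 4.0 * 1024 * 1024;              /* pretend 4MB */
        double  est_hash_bytes[] = { 1.5e6, 2.8e6, 0.7e6, 3.9e6 };  /* per rollup */
        int     n = 4;
        double  scale;
        int     capacity;
        int     weights[4];
        int     i, c;
        static int best[MAX_CAP + 1];   /* best[c] = max rollups hashed within c */

        /* Scale sizes so the DP table stays small (mirrors the 5% error idea). */
        scale = work_mem_bytes / MAX_CAP;
        if (scale < 1.0)
            scale = 1.0;
        capacity = (int) (work_mem_bytes / scale);

        for (i = 0; i < n; i++)
            weights[i] = (int) (est_hash_bytes[i] / scale) + 1;

        /* 0/1 knapsack, every item worth 1 (each hashed rollup saves one sort). */
        memset(best, 0, sizeof(best));
        for (i = 0; i < n; i++)
            for (c = capacity; c >= weights[i]; c--)
                if (best[c - weights[i]] + 1 > best[c])
                    best[c] = best[c - weights[i]] + 1;

        printf("capacity=%d units, best=%d rollups hashed\n",
               capacity, best[capacity]);
        return 0;
    }

With these made-up numbers only two of the four rollups fit, so two sorts are saved and the remaining rollups fall back to the sorted path.
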
*/ - consider_groupingsets_paths(root, grouped_rel, - cheapest_path, false, true, target, - gd, agg_costs, dNumGroups); - } - else - { - hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + i = 0; + for_each_cell(lc, lnext(list_head(gd->rollups))) + { + RollupData *rollup = lfirst(lc); + + if (rollup->hashable) + { + double sz = estimate_hashagg_tablesize(path, agg_costs, - dNumGroups); + rollup->numGroups); - /* - * Provided that the estimated size of the hashtable does not - * exceed work_mem, we'll generate a HashAgg Path, although if we - * were unable to sort above, then we'd better generate a Path, so - * that we at least have one. - */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || - grouped_rel->pathlist == NIL) -#else - if (hashaggtablesize < work_mem * 1024L || - grouped_rel->pathlist == NIL) -#endif - { - /* Don't mess with the cheapest path directly. */ - Path *path = cheapest_path; -#ifdef __TBASE__ - bool try_redistribute_grouping = false; -#endif + /* + * If sz is enormous, but work_mem (and hence scale) is + * small, avoid integer overflow here. + */ + k_weights[i] = (int) Min(floor(sz / scale), + k_capacity + 1.0); + ++i; + } + } /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. - * - * If if can be pushed down, disable construction of complex - * distributed paths. + * Apply knapsack algorithm; compute the set of items which + * maximizes the value stored (in this case the number of sorts + * saved) while keeping the total size (approximately) within + * capacity. */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef XCP + if (i > 0) + hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL); + + if (!bms_is_empty(hash_items)) + { + rollups = list_make1(linitial(gd->rollups)); + + i = 0; + for_each_cell(lc, lnext(list_head(gd->rollups))) { - /* some special aggs cannot be parallel executed, such as count(distinct) */ - if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || - path->pathtype == T_Agg || path->pathtype == T_Group || - !olap_optimizer || has_cold_hot_table) - { - if (agg_costs->hasOnlyDistinct && olap_optimizer && !has_cold_hot_table) - path = create_redistribute_grouping_path(root, parse, path); + RollupData *rollup = lfirst(lc); + + if (rollup->hashable) + { + if (bms_is_member(i, hash_items)) + hash_sets = list_concat(hash_sets, + list_copy(rollup->gsets_data)); else - path = create_remotesubplan_path(root, path, NULL); + rollups = lappend(rollups, rollup); + ++i; } else - { - /* - * If the grouping can not be fully pushed down, we adopt another - * strategy instead. - * 1. do grouping on each datanode locally - * 2. re-distribute grouping results among datanodes, then do the - * final grouping - */ - AggClauseCosts hashagg_partial_costs; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + rollups = lappend(rollups, rollup); + } + } + } - /* Estimate number of partial groups. 
*/ - double dNumLocalGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); - try_redistribute_grouping = true; + if (!rollups && hash_sets) + rollups = list_copy(gd->rollups); - MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); + foreach(lc, hash_sets) + { + GroupingSetData *gs = lfirst(lc); + RollupData *rollup = makeNode(RollupData); - get_agg_clause_costs(root, (Node *) local_grouping_target->exprs, - AGGSPLIT_INITIAL_SERIAL, - &hashagg_partial_costs); + Assert(gs->set != NIL); - /* step 1 */ - path = (Path *) create_agg_path(root, - grouped_rel, - cheapest_path, - local_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &hashagg_partial_costs, - dNumLocalGroups); + rollup->groupClause = preprocess_groupclause(root, gs->set); + rollup->gsets_data = list_make1(gs); + rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, + rollup->gsets_data, + gd->tleref_to_colnum_map); + rollup->numGroups = gs->numGroups; + rollup->hashable = true; + rollup->is_hashed = true; + rollups = lcons(rollup, rollups); + } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + if (rollups) { - AggPath *aggpath = (AggPath *)path; - - aggpath->hybrid = true; - } -#endif - - /* step 2 */ - path = create_redistribute_grouping_path(root, parse, path); + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, + grouped_rel, + path, + target, + (List *) parse->havingQual, + AGG_MIXED, + rollups, + agg_costs, + dNumGroups)); } } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - else - try_distributed_aggregation = false; /* - * We just need an Agg over the cheapest-total input path, - * since input order won't matter. + * Now try the simple sorted case. */ -#ifdef __TBASE__ - if(try_redistribute_grouping) + if (!gd->unsortable_sets) { - AggClauseCosts hashagg_final_costs; - Path *agg_path; - - MemSet(&hashagg_final_costs, 0, sizeof(AggClauseCosts)); - - get_agg_clause_costs(root, (Node *) target->exprs, - AGGSPLIT_FINAL_DESERIAL, - &hashagg_final_costs); - get_agg_clause_costs(root, parse->havingQual, - AGGSPLIT_FINAL_DESERIAL, - &hashagg_final_costs); + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! 
can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); - agg_path = (Path *) - create_agg_path(root, + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, grouped_rel, path, target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, (List *) parse->havingQual, - &hashagg_final_costs, - dNumGroups); -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - - aggpath->hybrid = true; + AGG_SORTED, + gd->rollups, + agg_costs, + dNumGroups)); } -#endif - //agg_path->parallel_safe = true; - - add_path(grouped_rel, agg_path); } - else - { - bool parallel_aware = false; - bool parallel_safe = false; - Path *agg_path = NULL; - if ((path->pathtype == T_Gather || agg_costs->hasOnlyDistinct) && olap_optimizer - && !has_cold_hot_table) +/* + * create_window_paths + * + * Build a new upperrel containing Paths for window-function evaluation. + * + * input_rel: contains the source-data Paths + * input_target: result of make_window_input_target + * output_target: what the topmost WindowAggPath should return + * tlist: query's target list (needed to look up pathkeys) + * wflists: result of find_window_functions + * activeWindows: result of select_active_windows + * + * Note: all Paths in input_rel are expected to return input_target. + */ +static RelOptInfo * +create_window_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *input_target, + PathTarget *output_target, + List *tlist, + WindowFuncLists *wflists, + List *activeWindows) { - parallel_safe = true; - parallel_aware = true; - } + RelOptInfo *window_rel; + ListCell *lc; - agg_path = (Path *) - create_agg_path(root, grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups); - agg_path->parallel_aware = parallel_aware; - agg_path->parallel_safe = parallel_safe; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + /* For now, do all work in the (WINDOW, NULL) upperrel */ + window_rel = fetch_upper_rel(root, UPPERREL_WINDOW, NULL); + + /* + * If the input relation is not parallel-safe, then the window relation + * can't be parallel-safe, either. Otherwise, we need to examine the + * target list and active windows for non-parallel-safe constructs. + */ + if (input_rel->consider_parallel && + is_parallel_safe(root, (Node *) output_target->exprs) && + is_parallel_safe(root, (Node *) activeWindows)) + window_rel->consider_parallel = true; + + /* + * If the input rel belongs to a single FDW, so does the window rel. + */ + window_rel->serverid = input_rel->serverid; + window_rel->userid = input_rel->userid; + window_rel->useridiscurrent = input_rel->useridiscurrent; + window_rel->fdwroutine = input_rel->fdwroutine; + + /* + * Consider computing window functions starting from the existing + * cheapest-total path (which will likely require a sort) as well as any + * existing paths that satisfy root->window_pathkeys (which won't). 
+ */ + foreach(lc, input_rel->pathlist) { - AggPath *aggpath = (AggPath *)agg_path; + Path *path = (Path *) lfirst(lc); - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); - } -#else - add_path(grouped_rel, (Path *) - create_agg_path(root, grouped_rel, + if (path == input_rel->cheapest_total_path || + pathkeys_contained_in(root->window_pathkeys, path->pathkeys)) + create_one_window_path(root, + window_rel, path, - target, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups)); -#endif + input_target, + output_target, + tlist, + wflists, + activeWindows); } + + /* + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. + */ + if (window_rel->fdwroutine && + window_rel->fdwroutine->GetForeignUpperPaths) + window_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_WINDOW, + input_rel, window_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_WINDOW, + input_rel, window_rel); + + /* Now choose the best path(s) */ + set_cheapest(window_rel); + + return window_rel; } /* - * Generate a HashAgg Path atop of the cheapest partial path. Once - * again, we'll only do this if it looks as though the hash table - * won't exceed work_mem. + * Stack window-function implementation steps atop the given Path, and + * add the result to window_rel. + * + * window_rel: upperrel to contain result + * path: input Path to use (must return input_target) + * input_target: result of make_window_input_target + * output_target: what the topmost WindowAggPath should return + * tlist: query's target list (needed to look up pathkeys) + * wflists: result of find_window_functions + * activeWindows: result of select_active_windows */ - if (grouped_rel->partial_pathlist) +static void +create_one_window_path(PlannerInfo *root, + RelOptInfo *window_rel, + Path *path, + PathTarget *input_target, + PathTarget *output_target, + List *tlist, + WindowFuncLists *wflists, + List *activeWindows) { - bool redistribute_group = false; - Path *path = (Path *) linitial(grouped_rel->partial_pathlist); + PathTarget *window_target; + ListCell *l; - hashaggtablesize = estimate_hashagg_tablesize(path, - &agg_final_costs, - dNumGroups); + /* + * Since each window clause could require a different sort order, we stack + * up a WindowAgg node for each clause, with sort steps between them as + * needed. (We assume that select_active_windows chose a good order for + * executing the clauses in.) + * + * input_target should contain all Vars and Aggs needed for the result. + * (In some cases we wouldn't need to propagate all of these all the way + * to the top, since they might only be needed as inputs to WindowFuncs. + * It's probably not worth trying to optimize that though.) It must also + * contain all window partitioning and sorting expressions, to ensure + * they're computed only once at the bottom of the stack (that's critical + * for volatile functions). As we climb up the stack, we'll add outputs + * for the WindowFuncs computed at each level. 
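
The stacking described above only inserts a Sort when the next window clause's required ordering is not already satisfied by the input ordering (the pathkeys_contained_in test ahead of create_sort_path in the loop below). A minimal standalone sketch of that decision follows, with sort keys reduced to invented integer ids and the prefix test standing in for the real pathkey machinery.

    /*
     * Standalone illustration (not PostgreSQL code) of the "sort only when
     * needed" decision made for each window clause: if the keys required by
     * the next clause already form a prefix of the input ordering, no Sort
     * node has to be stacked underneath the next WindowAgg.
     */
    #include <stdbool.h>
    #include <stdio.h>

    /* true if "required" (length nreq) is a prefix of "current" (length ncur) */
    static bool
    ordering_satisfied(const int *required, int nreq, const int *current, int ncur)
    {
        int i;

        if (nreq > ncur)
            return false;
        for (i = 0; i < nreq; i++)
            if (required[i] != current[i])
                return false;
        return true;
    }

    int
    main(void)
    {
        /* invented sort-key ids: input is sorted by (1, 2) */
        int current[] = {1, 2};
        /* two invented window clauses: one over (1), one over (3, 1) */
        int wc1[] = {1};
        int wc2[] = {3, 1};

        printf("clause 1 needs sort: %s\n",
               ordering_satisfied(wc1, 1, current, 2) ? "no" : "yes");
        printf("clause 2 needs sort: %s\n",
               ordering_satisfied(wc2, 2, current, 2) ? "no" : "yes");
        return 0;
    }

Ordering the active windows well (select_active_windows, per the comment above) is what keeps the number of such Sort steps down.
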
+ */ + window_target = input_target; -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) -#else - if (hashaggtablesize < work_mem * 1024L) -#endif + foreach(l, activeWindows) { -#ifdef __TBASE__ - double total_groups = 0; + WindowClause *wc = (WindowClause *) lfirst(l); + List *window_pathkeys; + + window_pathkeys = make_pathkeys_for_window(root, + wc, + tlist); - if (olap_optimizer && !has_cold_hot_table) + /* Sort if necessary */ + if (!pathkeys_contained_in(window_pathkeys, path->pathkeys)) { - total_groups = path->rows; - } - else - total_groups = path->rows * path->parallel_workers; -#else - double total_groups = path->rows * path->parallel_workers; -#endif - path = (Path *) create_gather_path(root, - grouped_rel, + path = (Path *) create_sort_path(root, window_rel, path, - partial_grouping_target, - NULL, - &total_groups); + window_pathkeys, + -1.0); + } + + if (lnext(l)) + { /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. + * Add the current WindowFuncs to the output target for this + * intermediate WindowAggPath. We must copy window_target to + * avoid changing the previous path's target. * - * If if can be pushed down, disable construction of complex - * distributed paths. + * Note: a WindowFunc adds nothing to the target's eval costs; but + * we do need to account for the increase in tlist width. */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - redistribute_group = true; - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - else - try_distributed_aggregation = false; + ListCell *lc2; -#ifdef __TBASE__ - if (!redistribute_group) - { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_aware = true; - agg_path->parallel_safe = true; - } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + window_target = copy_pathtarget(window_target); + foreach(lc2, wflists->windowFuncs[wc->winref]) { - AggPath *aggpath = (AggPath *)agg_path; + WindowFunc *wfunc = lfirst_node(WindowFunc, lc2); - aggpath->hybrid = true; + add_column_to_pathtarget(window_target, (Expr *) wfunc, 0); + window_target->width += get_typavgwidth(wfunc->wintype, -1); } -#endif - add_path(grouped_rel, agg_path); } else { -#endif - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_aware = true; - agg_path->parallel_safe = true; + /* Install the goal target in the topmost WindowAgg */ + window_target = output_target; } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); -#ifdef __TBASE__ - } -#endif - } - } + /* We can't really push down window functions for now. 
*/ + if (!can_push_down_window(root, path)) + path = create_remotesubplan_path(root, path, NULL); + + path = (Path *) + create_windowagg_path(root, window_rel, path, window_target, + wflists->windowFuncs[wc->winref], + wc, + window_pathkeys); } - /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ + add_path(window_rel, path); +} /* - * If there were no partial paths, we did not initialize any of the - * partial paths above. If that's the case, initialize here. + * create_distinct_paths * - * XXX The reason why the initialization block at the beginning is not - * simply performed unconditionally is that we may skip it if we've been - * successful in fully pushing down any of the aggregates, and entirely - * skip generating the XL paths. + * Build a new upperrel containing Paths for SELECT DISTINCT evaluation. * - * XXX Can we simply use the same estimates as regular partial aggregates, - * or do we need to invent something else? It might be a better idea to - * use estimates for the whole result here (e.g. total number of groups) - * instead of the partial ones. Underestimates often have more severe - * consequences (e.g. OOM with HashAggregate) than overestimates, so this - * seems like a more defensive approach. + * input_rel: contains the source-data Paths * - * XXX After thinking a bit more about the estimation, it may depend on - * pushdown - if the aggregate is fully pushed down (as above, we can - * probably use dNumGroups/numberOfNodes as a cardinality estimate, as - * we know the per-node groupings won't overlap. But here we need to be - * more careful. + * Note: input paths should already compute the desired pathtarget, since + * Sort/Unique won't project anything. */ - if (try_distributed_aggregation) +static RelOptInfo * +create_distinct_paths(PlannerInfo *root, + RelOptInfo *input_rel) { - partial_grouping_target = make_partial_grouping_target(root, target); + Query *parse = root->parse; + Path *cheapest_input_path = input_rel->cheapest_total_path; + RelOptInfo *distinct_rel; + double numDistinctRows; + bool allow_hash; + Path *path; + ListCell *lc; -#ifdef __TBASE__ - grouped_rel->reltarget = partial_grouping_target; -#endif - /* Estimate number of partial groups. */ - dNumPartialGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); + /* For now, do all work in the (DISTINCT, NULL) upperrel */ + distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); /* - * Collect statistics about aggregates for estimating costs of - * performing aggregation in parallel. + * We don't compute anything at this level, so distinct_rel will be + * parallel-safe if the input rel is parallel-safe. In particular, if + * there is a DISTINCT ON (...) clause, any path for the input_rel will + * output those expressions, and will not be parallel-safe unless those + * expressions are parallel-safe. 
*/ - MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts)); - MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts)); - if (parse->hasAggs) - { - /* partial phase */ - get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs, - AGGSPLIT_INITIAL_SERIAL, - &agg_partial_costs); - - /* final phase */ - get_agg_clause_costs(root, (Node *) target->exprs, - AGGSPLIT_FINAL_DESERIAL, - &agg_final_costs); - get_agg_clause_costs(root, parse->havingQual, - AGGSPLIT_FINAL_DESERIAL, - &agg_final_costs); - } + distinct_rel->consider_parallel = input_rel->consider_parallel; - /* Build final XL grouping paths */ - if (can_sort) + /* + * If the input rel belongs to a single FDW, so does the distinct_rel. + */ + distinct_rel->serverid = input_rel->serverid; + distinct_rel->userid = input_rel->userid; + distinct_rel->useridiscurrent = input_rel->useridiscurrent; + distinct_rel->fdwroutine = input_rel->fdwroutine; + + /* Estimate number of distinct rows there will be */ + if (parse->groupClause || parse->groupingSets || parse->hasAggs || + root->hasHavingQual) { /* - * Use any available suitably-sorted path as input, and also consider - * sorting the cheapest-total path. + * If there was grouping or aggregation, use the number of input rows + * as the estimated number of DISTINCT rows (ie, assume the input is + * already mostly unique). */ - foreach(lc, input_rel->pathlist) + numDistinctRows = cheapest_input_path->rows; + } + else { - Path *path = (Path *) lfirst(lc); - bool is_sorted; + /* + * Otherwise, the UNIQUE filter has effects comparable to GROUP BY. + */ + List *distinctExprs; - is_sorted = pathkeys_contained_in(root->group_pathkeys, - path->pathkeys); + distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, + parse->targetList); + numDistinctRows = estimate_num_groups(root, distinctExprs, + cheapest_input_path->rows, + NULL); + } /* - * XL: Can it happen that the cheapest path can't be pushed down, - * while some other path could be? Perhaps we should move the check - * if a path can be pushed down up, and add another OR condition - * to consider all paths that can be pushed down? - * - * if (path == cheapest_path || is_sorted || can_push_down) + * Consider sort-based implementations of DISTINCT, if possible. */ - if (path == cheapest_path || is_sorted) + if (grouping_is_sortable(parse->distinctClause)) { /* - * We can't really beat paths that we managed to fully push - * down above, so we can skip them entirely. + * First, if we have any adequately-presorted paths, just stick a + * Unique node on those. Then consider doing an explicit sort of the + * cheapest input path and Unique'ing that. * - * XXX Not constructing any paths, so we can do this before - * adding the Sort path. + * When we have DISTINCT ON, we must sort by the more rigorous of + * DISTINCT and ORDER BY, else it won't have the desired behavior. + * Also, if we do have to do an explicit sort, we might as well use + * the more rigorous ordering to avoid a second sort later. (Note + * that the parser will have ensured that one clause is a prefix of + * the other.) 
*/ - if (can_push_down_grouping(root, parse, path)) - continue; + List *needed_pathkeys; - /* Sort the cheapest-total path if it isn't already sorted */ - if (!is_sorted) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); + if (parse->hasDistinctOn && + list_length(root->distinct_pathkeys) < + list_length(root->sort_pathkeys)) + needed_pathkeys = root->sort_pathkeys; + else + needed_pathkeys = root->distinct_pathkeys; - /* Now decide what to stick atop it */ - if (parse->groupingSets) + foreach(lc, input_rel->pathlist) { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ - } - else if (parse->hasAggs) + Path *path = (Path *) lfirst(lc); + + if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) { /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. + * Make sure the distribution matches the distinct clause, + * needed by the UNIQUE path. + * + * FIXME This could probably benefit from pushing a UNIQUE + * to the remote side, and only doing a merge locally. */ + if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + path = create_remotesubplan_path(root, path, NULL); - path = (Path *) create_agg_path(root, - grouped_rel, + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, path, - partial_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); + list_length(root->distinct_pathkeys), + numDistinctRows)); + } + } -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) + /* For explicit-sort case, always use the more rigorous clause */ + if (list_length(root->distinct_pathkeys) < + list_length(root->sort_pathkeys)) { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); + needed_pathkeys = root->sort_pathkeys; + /* Assert checks that parser didn't mess up... */ + Assert(pathkeys_contained_in(root->distinct_pathkeys, + needed_pathkeys)); } else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + needed_pathkeys = root->distinct_pathkeys; -#ifdef __TBASE__ - if (parse->groupClause && olap_optimizer && !has_cold_hot_table && - (!is_sorted || root->group_pathkeys)) - { - path = (Path *) create_sort_path(root, - grouped_rel, + path = cheapest_input_path; + if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) + path = (Path *) create_sort_path(root, distinct_rel, path, - root->group_pathkeys, + needed_pathkeys, -1.0); + + /* In case of grouping / distribution mismatch, inject remote scan. */ + if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); } -#endif + /* - * We generate two paths, differing in the second phase - * implementation (sort and hash). + * Consider hash-based implementations of DISTINCT, if possible. + * + * If we were not able to make any other types of path, we *must* hash or + * die trying. 
If we do have other choices, there are several things that + * should prevent selection of hashing: if the query uses DISTINCT ON + * (because it won't really have the expected behavior if we hash), or if + * enable_hashagg is off, or if it looks like the hashtable will exceed + * work_mem. + * + * Note: grouping_is_hashable() is much more expensive to check than the + * other gating conditions, so we want to do it last. */ + if (distinct_rel->pathlist == NIL) + allow_hash = true; /* we have no alternatives */ + else if (parse->hasDistinctOn || !enable_hashagg) + allow_hash = false; /* policy-based decision not to hash */ + else { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + Size hashentrysize; - //agg_path->parallel_safe = true; + /* Estimate per-hash-entry space at tuple width... */ + hashentrysize = MAXALIGN(cheapest_input_path->pathtarget->width) + + MAXALIGN(SizeofMinimalTupleHeader); + /* plus the per-hash-entry overhead */ + hashentrysize += hash_agg_entry_size(0); - add_path(grouped_rel, agg_path); + /* Allow hashing only if hashtable is predicted to fit in work_mem */ + allow_hash = (hashentrysize * numDistinctRows <= work_mem * 1024L); } - if (can_hash) + if (allow_hash && grouping_is_hashable(parse->distinctClause)) { - Path *agg_path = (Path *) + Path *input_path = cheapest_input_path; + + /* If needed, inject RemoteSubplan redistributing the data. */ + if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) + input_path = create_remotesubplan_path(root, input_path, NULL); + + /* XXX Maybe we can make this a 2-phase aggregate too? */ + + /* Generate hashed aggregate path --- no sort needed */ + add_path(distinct_rel, (Path *) create_agg_path(root, - grouped_rel, - path, - target, + distinct_rel, + input_path, + input_path->pathtarget, AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + AGGSPLIT_SIMPLE, + parse->distinctClause, + NIL, + NULL, + numDistinctRows)); + } - //agg_path->parallel_safe = true; + /* Give a helpful error if we failed to find any implementation */ + if (distinct_rel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement DISTINCT"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - add_path(grouped_rel, agg_path); - } + /* + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. + */ + if (distinct_rel->fdwroutine && + distinct_rel->fdwroutine->GetForeignUpperPaths) + distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, + input_rel, distinct_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, + input_rel, distinct_rel); + + /* Now choose the best path(s) */ + set_cheapest(distinct_rel); + + return distinct_rel; } - else if (parse->groupClause) + +/* + * create_ordered_paths + * + * Build a new upperrel containing Paths for ORDER BY evaluation. + * + * All paths in the result must satisfy the ORDER BY ordering. + * The only new path we need consider is an explicit sort on the + * cheapest-total existing path. 
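
The allow_hash test earlier in this hunk gates hashed DISTINCT on whether the estimated hashtable fits in work_mem: hashentrysize is the MAXALIGN'd tuple width plus the MAXALIGN'd minimal tuple header plus the per-entry overhead from hash_agg_entry_size(0), and hashing is allowed only if that size times the estimated number of distinct rows stays within work_mem. A standalone sketch of that arithmetic follows; the widths, overheads, row count, and work_mem setting are all invented, and MAXALIGN is approximated by rounding up to 8 bytes.

    /*
     * Standalone illustration (not PostgreSQL code) of the work_mem gate for
     * hashed DISTINCT: estimate one hash entry's footprint and allow hashing
     * only if all estimated distinct rows fit in work_mem.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* round up to an 8-byte boundary, standing in for MAXALIGN */
    static size_t
    align8(size_t sz)
    {
        return (sz + 7) & ~(size_t) 7;
    }

    int
    main(void)
    {
        size_t  tuple_width = 37;          /* pretend average tuple width */
        size_t  tuple_header = 23;         /* pretend minimal tuple header */
        size_t  per_entry_overhead = 56;   /* pretend hash entry overhead */
        double  num_distinct_rows = 250000.0;
        long    work_mem_kb = 4096;        /* pretend work_mem = 4MB */
        size_t  hashentrysize;
        bool    allow_hash;

        hashentrysize = align8(tuple_width) + align8(tuple_header) +
                        per_entry_overhead;
        allow_hash = hashentrysize * num_distinct_rows <= work_mem_kb * 1024.0;

        printf("entry=%zu bytes, total=%.0f bytes, allow_hash=%s\n",
               hashentrysize, hashentrysize * num_distinct_rows,
               allow_hash ? "yes" : "no");
        return 0;
    }

With these invented figures the table would need roughly 30MB against a 4MB work_mem, so the hashed path would not be allowed unless it is the only alternative.
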
+ * + * input_rel: contains the source-data Paths + * target: the output tlist the result Paths must emit + * limit_tuples: estimated bound on the number of output tuples, + * or -1 if no LIMIT or couldn't estimate + */ +static RelOptInfo * +create_ordered_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, + double limit_tuples) { + Path *cheapest_input_path = input_rel->cheapest_total_path; + RelOptInfo *ordered_rel; + ListCell *lc; + + /* For now, do all work in the (ORDERED, NULL) upperrel */ + ordered_rel = fetch_upper_rel(root, UPPERREL_ORDERED, NULL); + /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. + * If the input relation is not parallel-safe, then the ordered relation + * can't be parallel-safe, either. Otherwise, it's parallel-safe if the + * target list is parallel-safe. */ - path = (Path *) create_group_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause, - NIL, - dNumPartialGroups); + if (input_rel->consider_parallel && + is_parallel_safe(root, (Node *) target->exprs)) + ordered_rel->consider_parallel = true; -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - } - else + /* + * If the input rel belongs to a single FDW, so does the ordered_rel. + */ + ordered_rel->serverid = input_rel->serverid; + ordered_rel->userid = input_rel->userid; + ordered_rel->useridiscurrent = input_rel->useridiscurrent; + ordered_rel->fdwroutine = input_rel->fdwroutine; + + foreach(lc, input_rel->pathlist) { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + Path *path = (Path *) lfirst(lc); + bool is_sorted; -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table && (!is_sorted || root->group_pathkeys)) + is_sorted = pathkeys_contained_in(root->sort_pathkeys, + path->pathkeys); + if (path == cheapest_input_path || is_sorted) + { + if (!is_sorted) { + /* An explicit sort here can take advantage of LIMIT */ path = (Path *) create_sort_path(root, - grouped_rel, + ordered_rel, path, - root->group_pathkeys, - -1.0); + root->sort_pathkeys, + limit_tuples); } -#endif - { - Path *group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - //group_path->parallel_safe = true; + /* Add projection step if needed */ + if (path->pathtarget != target) + path = apply_projection_to_path(root, ordered_rel, + path, target); - add_path(grouped_rel, group_path); + add_path(ordered_rel, path); } - } - else + + /* + * generate_gather_paths() will have already generated a simple Gather + * path for the best parallel path, if any, and the loop above will have + * considered sorting it. Similarly, generate_gather_paths() will also + * have generated order-preserving Gather Merge plans which can be used + * without sorting if they happen to match the sort_pathkeys, and the loop + * above will have handled those as well. However, there's one more + * possibility: it may make sense to sort the cheapest partial path + * according to the required output order and then use Gather Merge. 
+ */ + if (ordered_rel->consider_parallel && root->sort_pathkeys != NIL && + input_rel->partial_pathlist != NIL) { - /* Other cases should have been handled above */ - Assert(false); - } - } - } - } - - if (can_hash) - { - hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, - agg_costs, - dNumGroups); - - /* - * Provided that the estimated size of the hashtable does not exceed - * work_mem, we'll generate a HashAgg Path, although if we were unable - * to sort above, then we'd better generate a Path, so that we at - * least have one. - */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || - grouped_rel->pathlist == NIL) -#else - if (hashaggtablesize < work_mem * 1024L || - grouped_rel->pathlist == NIL) -#endif - { - /* If the whole aggregate was pushed down, we're done. */ - if (! can_push_down_grouping(root, parse, cheapest_path)) - { - Path *path, *agg_path; - - path = (Path *) create_agg_path(root, - grouped_rel, - cheapest_path, - partial_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); - - /* keep partially aggregated path for the can_sort branch */ - agg_path = path; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - - aggpath->hybrid = true; - } -#endif - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - /* Generate paths with both hash and sort second phase. */ - { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - //agg_path->parallel_safe = true; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; + Path *cheapest_partial_path; - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); - } - - if (can_sort) - { -#ifdef __TBASE__ - if (!olap_optimizer || has_cold_hot_table) -#endif - path = (Path *) create_sort_path(root, - grouped_rel, - agg_path, - root->group_pathkeys, - -1.0); - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, agg_path); - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + cheapest_partial_path = linitial(input_rel->partial_pathlist); -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { /* - * AGG_HASHED aggregate paths are always unsorted, so add - * a Sorted node for the final AGG_SORTED step. + * If cheapest partial path doesn't need a sort, this is redundant + * with what's already been tried. 
*/ + if (!pathkeys_contained_in(root->sort_pathkeys, + cheapest_partial_path->pathkeys)) + { + Path *path; + double total_groups; + path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, + ordered_rel, + cheapest_partial_path, + root->sort_pathkeys, -1.0); - } -#endif + total_groups = cheapest_partial_path->rows * + cheapest_partial_path->parallel_workers; path = (Path *) - create_agg_path(root, - grouped_rel, + create_gather_merge_path(root, ordered_rel, path, - target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + target, root->sort_pathkeys, NULL, + &total_groups); - //path->parallel_safe = true; + /* Add projection step if needed */ + if (path->pathtarget != target) + path = apply_projection_to_path(root, ordered_rel, + path, target); - add_path(grouped_rel, path); - } - } - } + add_path(ordered_rel, path); } } - /* Give a helpful error if we failed to find any implementation */ - if (grouped_rel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement GROUP BY"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - /* * If there is an FDW that's responsible for all baserels of the query, * let it consider adding ForeignPaths. */ - if (grouped_rel->fdwroutine && - grouped_rel->fdwroutine->GetForeignUpperPaths) - grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG, - input_rel, grouped_rel); + if (ordered_rel->fdwroutine && + ordered_rel->fdwroutine->GetForeignUpperPaths) + ordered_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_ORDERED, + input_rel, ordered_rel); /* Let extensions possibly add some more paths */ if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, - input_rel, grouped_rel); + (*create_upper_paths_hook) (root, UPPERREL_ORDERED, + input_rel, ordered_rel); - /* Now choose the best path(s) */ - set_cheapest(grouped_rel); /* - * We've been using the partial pathlist for the grouped relation to hold - * partially aggregated paths, but that's actually a little bit bogus - * because it's unsafe for later planning stages -- like ordered_rel --- - * to get the idea that they can use these partial paths as if they didn't - * need a FinalizeAggregate step. Zap the partial pathlist at this stage - * so we don't get confused. + * No need to bother with set_cheapest here; grouping_planner does not + * need us to do it. */ - grouped_rel->partial_pathlist = NIL; + Assert(ordered_rel->pathlist != NIL); - return grouped_rel; + return ordered_rel; } /* - * For a given input path, consider the possible ways of doing grouping sets on - * it, by combinations of hashing and sorting. This can be called multiple - * times, so it's important that it not scribble on input. No result is - * returned, but any generated paths are added to grouped_rel. + * make_group_input_target + * Generate appropriate PathTarget for initial input to grouping nodes. + * + * If there is grouping or aggregation, the scan/join subplan cannot emit + * the query's final targetlist; for example, it certainly can't emit any + * aggregate function calls. This routine generates the correct target + * for the scan/join subplan. 
+ * + * The query target list passed from the parser already contains entries + * for all ORDER BY and GROUP BY expressions, but it will not have entries + * for variables used only in HAVING clauses; so we need to add those + * variables to the subplan target list. Also, we flatten all expressions + * except GROUP BY items into their component variables; other expressions + * will be computed by the upper plan nodes rather than by the subplan. + * For example, given a query like + * SELECT a+b,SUM(c+d) FROM table GROUP BY a+b; + * we want to pass this targetlist to the subplan: + * a+b,c,d + * where the a+b target will be used by the Sort/Group steps, and the + * other targets will be used for computing the final results. + * + * 'final_target' is the query's final target list (in PathTarget form) + * + * The result is the PathTarget to be computed by the Paths returned from + * query_planner(). */ -static void -consider_groupingsets_paths(PlannerInfo *root, - RelOptInfo *grouped_rel, - Path *path, - bool is_sorted, - bool can_hash, - PathTarget *target, - grouping_sets_data *gd, - const AggClauseCosts *agg_costs, - double dNumGroups) -{// #lizard forgives +static PathTarget * +make_group_input_target(PlannerInfo *root, PathTarget *final_target) +{ Query *parse = root->parse; + PathTarget *input_target; + List *non_group_cols; + List *non_group_vars; + int i; + ListCell *lc; /* - * If we're not being offered sorted input, then only consider plans that - * can be done entirely by hashing. - * - * We can hash everything if it looks like it'll fit in work_mem. But if - * the input is actually sorted despite not being advertised as such, we - * prefer to make use of that in order to use less memory. - * - * If none of the grouping sets are sortable, then ignore the work_mem - * limit and generate a path anyway, since otherwise we'll just fail. - */ - if (!is_sorted) - { - List *new_rollups = NIL; - RollupData *unhashed_rollup = NULL; - List *sets_data; - List *empty_sets_data = NIL; - List *empty_sets = NIL; - ListCell *lc; - ListCell *l_start = list_head(gd->rollups); - AggStrategy strat = AGG_HASHED; - Size hashsize; - double exclude_groups = 0.0; + * We must build a target containing all grouping columns, plus any other + * Vars mentioned in the query's targetlist and HAVING qual. + */ + input_target = create_empty_pathtarget(); + non_group_cols = NIL; - Assert(can_hash); + i = 0; + foreach(lc, final_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(final_target, i); - if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys)) + if (sgref && parse->groupClause && + get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) { - unhashed_rollup = lfirst(l_start); - exclude_groups = unhashed_rollup->numGroups; - l_start = lnext(l_start); + /* + * It's a grouping column, so add it to the input target as-is. + */ + add_column_to_pathtarget(input_target, expr, sgref); + } + else + { + /* + * Non-grouping column, so just remember the expression for later + * call to pull_var_clause. + */ + non_group_cols = lappend(non_group_cols, expr); } - hashsize = estimate_hashagg_tablesize(path, - agg_costs, - dNumGroups - exclude_groups); + i++; + } /* - * gd->rollups is empty if we have only unsortable columns to work - * with. Override work_mem in that case; otherwise, we'll rely on the - * sorted-input case to generate usable mixed paths. + * If there's a HAVING clause, we'll need the Vars it uses, too. 
*/ - if (hashsize > work_mem * 1024L && gd->rollups) - return; /* nope, won't fit */ + if (parse->havingQual) + non_group_cols = lappend(non_group_cols, parse->havingQual); /* - * We need to burst the existing rollups list into individual grouping - * sets and recompute a groupClause for each set. + * Pull out all the Vars mentioned in non-group cols (plus HAVING), and + * add them to the input target if not already present. (A Var used + * directly as a GROUP BY item will be present already.) Note this + * includes Vars used in resjunk items, so we are covering the needs of + * ORDER BY and window specifications. Vars used within Aggrefs and + * WindowFuncs will be pulled out here, too. */ - sets_data = list_copy(gd->unsortable_sets); - - for_each_cell(lc, l_start) - { - RollupData *rollup = lfirst(lc); + non_group_vars = pull_var_clause((Node *) non_group_cols, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, non_group_vars); + + /* clean up cruft */ + list_free(non_group_vars); + list_free(non_group_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); +} /* - * If we find an unhashable rollup that's not been skipped by the - * "actually sorted" check above, we can't cope; we'd need sorted - * input (with a different sort order) but we can't get that here. - * So bail out; we'll get a valid path from the is_sorted case - * instead. + * make_partial_grouping_target + * Generate appropriate PathTarget for output of partial aggregate + * (or partial grouping, if there are no aggregates) nodes. * - * The mere presence of empty grouping sets doesn't make a rollup - * unhashable (see preprocess_grouping_sets), we handle those - * specially below. + * A partial aggregation node needs to emit all the same aggregates that + * a regular aggregation node would, plus any aggregates used in HAVING; + * except that the Aggref nodes should be marked as partial aggregates. + * + * In addition, we'd better emit any Vars and PlaceholderVars that are + * used outside of Aggrefs in the aggregation tlist and HAVING. (Presumably, + * these would be Vars that are grouped by or used in grouping expressions.) + * + * grouping_target is the tlist to be emitted by the topmost aggregation step. + * We get the HAVING clause out of *root. */ - if (!rollup->hashable) - return; - else - sets_data = list_concat(sets_data, list_copy(rollup->gsets_data)); - } - foreach(lc, sets_data) +static PathTarget * +make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) { - GroupingSetData *gs = lfirst(lc); - List *gset = gs->set; - RollupData *rollup; - - if (gset == NIL) + Query *parse = root->parse; + PathTarget *partial_target; + List *non_group_cols; + List *non_group_exprs; + int i; + ListCell *lc; + + partial_target = create_empty_pathtarget(); + non_group_cols = NIL; + + i = 0; + foreach(lc, grouping_target->exprs) { - /* Empty grouping sets can't be hashed. */ - empty_sets_data = lappend(empty_sets_data, gs); - empty_sets = lappend(empty_sets, NIL); + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(grouping_target, i); + + if (sgref && parse->groupClause && + get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) + { + /* + * It's a grouping column, so add it to the partial_target as-is. + * (This allows the upper agg step to repeat the grouping calcs.) 
+ */ + add_column_to_pathtarget(partial_target, expr, sgref); } else { - rollup = makeNode(RollupData); - - rollup->groupClause = preprocess_groupclause(root, gset); - rollup->gsets_data = list_make1(gs); - rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, - rollup->gsets_data, - gd->tleref_to_colnum_map); - rollup->numGroups = gs->numGroups; - rollup->hashable = true; - rollup->is_hashed = true; - new_rollups = lappend(new_rollups, rollup); + /* + * Non-grouping column, so just remember the expression for later + * call to pull_var_clause. + */ + non_group_cols = lappend(non_group_cols, expr); } + + i++; } /* - * If we didn't find anything nonempty to hash, then bail. We'll - * generate a path from the is_sorted case. + * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. */ - if (new_rollups == NIL) - return; + if (parse->havingQual) + non_group_cols = lappend(non_group_cols, parse->havingQual); /* - * If there were empty grouping sets they should have been in the - * first rollup. + * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in + * non-group cols (plus HAVING), and add them to the partial_target if not + * already present. (An expression used directly as a GROUP BY item will + * be present already.) Note this includes Vars used in resjunk items, so + * we are covering the needs of ORDER BY and window specifications. */ - Assert(!unhashed_rollup || !empty_sets); + non_group_exprs = pull_var_clause((Node *) non_group_cols, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); - if (unhashed_rollup) - { - new_rollups = lappend(new_rollups, unhashed_rollup); - strat = AGG_MIXED; - } - else if (empty_sets) + add_new_columns_to_pathtarget(partial_target, non_group_exprs); + + /* + * Adjust Aggrefs to put them in partial mode. At this point all Aggrefs + * are at the top level of the target list, so we can just scan the list + * rather than recursing through the expression trees. + */ + foreach(lc, partial_target->exprs) { - RollupData *rollup = makeNode(RollupData); + Aggref *aggref = (Aggref *) lfirst(lc); - rollup->groupClause = NIL; - rollup->gsets_data = empty_sets_data; - rollup->gsets = empty_sets; - rollup->numGroups = list_length(empty_sets); - rollup->hashable = false; - rollup->is_hashed = false; - new_rollups = lappend(new_rollups, rollup); - strat = AGG_MIXED; - } + if (IsA(aggref, Aggref)) + { + Aggref *newaggref; /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. + * We shouldn't need to copy the substructure of the Aggref node, + * but flat-copy the node itself to avoid damaging other trees. */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - strat, - new_rollups, - agg_costs, - dNumGroups)); - return; + newaggref = makeNode(Aggref); + memcpy(newaggref, aggref, sizeof(Aggref)); + + /* For now, assume serialization is required */ + mark_partial_aggref(newaggref, AGGSPLIT_INITIAL_SERIAL); + + lfirst(lc) = newaggref; + } } - /* - * If we have sorted input but nothing we can do with it, bail. - */ - if (list_length(gd->rollups) == 0) - return; + /* clean up cruft */ + list_free(non_group_exprs); + list_free(non_group_cols); + + /* XXX this causes some redundant cost calculation ... 
*/ + return set_pathtarget_cost_width(root, partial_target); +} /* - * Given sorted input, we try and make two paths: one sorted and one mixed - * sort/hash. (We need to try both because hashagg might be disabled, or - * some columns might not be sortable.) + * mark_partial_aggref + * Adjust an Aggref to make it represent a partial-aggregation step. * - * can_hash is passed in as false if some obstacle elsewhere (such as - * ordered aggs) means that we shouldn't consider hashing at all. + * The Aggref node is modified in-place; caller must do any copying required. */ - if (can_hash && gd->any_hashable) +void +mark_partial_aggref(Aggref *agg, AggSplit aggsplit) { - List *rollups = NIL; - List *hash_sets = list_copy(gd->unsortable_sets); - double availspace = (work_mem * 1024.0); - ListCell *lc; - - /* - * Account first for space needed for groups we can't sort at all. - */ - availspace -= (double) estimate_hashagg_tablesize(path, - agg_costs, - gd->dNumHashGroups); + /* aggtranstype should be computed by this point */ + Assert(OidIsValid(agg->aggtranstype)); + /* ... but aggsplit should still be as the parser left it */ + Assert(agg->aggsplit == AGGSPLIT_SIMPLE); - if (availspace > 0 && list_length(gd->rollups) > 1) - { - double scale; - int num_rollups = list_length(gd->rollups); - int k_capacity; - int *k_weights = palloc(num_rollups * sizeof(int)); - Bitmapset *hash_items = NULL; - int i; + /* Mark the Aggref with the intended partial-aggregation mode */ + agg->aggsplit = aggsplit; /* - * We treat this as a knapsack problem: the knapsack capacity - * represents work_mem, the item weights are the estimated memory - * usage of the hashtables needed to implement a single rollup, - * and we really ought to use the cost saving as the item value; - * however, currently the costs assigned to sort nodes don't - * reflect the comparison costs well, and so we treat all items as - * of equal value (each rollup we hash instead saves us one sort). - * - * To use the discrete knapsack, we need to scale the values to a - * reasonably small bounded range. We choose to allow a 5% error - * margin; we have no more than 4096 rollups in the worst possible - * case, which with a 5% error margin will require a bit over 42MB - * of workspace. (Anyone wanting to plan queries that complex had - * better have the memory for it. In more reasonable cases, with - * no more than a couple of dozen rollups, the memory usage will - * be negligible.) - * - * k_capacity is naturally bounded, but we clamp the values for - * scale and weight (below) to avoid overflows or underflows (or - * uselessly trying to use a scale factor less than 1 byte). + * Adjust result type if needed. Normally, a partial aggregate returns + * the aggregate's transition type; but if that's INTERNAL and we're + * serializing, it returns BYTEA instead. */ - scale = Max(availspace / (20.0 * num_rollups), 1.0); - k_capacity = (int) floor(availspace / scale); + if (DO_AGGSPLIT_SKIPFINAL(aggsplit)) + { + if (agg->aggtranstype == INTERNALOID && DO_AGGSPLIT_SERIALIZE(aggsplit)) + agg->aggtype = BYTEAOID; + else + agg->aggtype = agg->aggtranstype; + } +} /* - * We leave the first rollup out of consideration since it's the - * one that matches the input sort order. We assign indexes "i" - * to only those entries considered for hashing; the second loop, - * below, must use the same condition. + * postprocess_setop_tlist + * Fix up targetlist returned by plan_set_operations(). + * + * We need to transpose sort key info from the orig_tlist into new_tlist. 
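[Illustration, not part of the patch: mark_partial_aggref above switches an Aggref into partial mode so the lower aggregation step emits the transition state (serialized to BYTEA when it is INTERNAL) instead of a finalized value. The standalone sketch below shows why that matters, using two-phase AVG: each partial step returns (sum, count) and only the final step divides. AvgState, partial_avg and combine_avg are invented names for this sketch.]

/*
 * Illustrative sketch: two-phase (partial + final) AVG.  The partial step
 * must return the transition state (sum, count), not sum/count, or the
 * combining step could not produce a correct overall average.
 */
#include <stdio.h>

typedef struct AvgState
{
    double      sum;
    long        count;
} AvgState;

/* partial aggregation over one chunk of the data (one "datanode") */
static AvgState
partial_avg(const double *vals, int n)
{
    AvgState    st = {0.0, 0};
    int         i;

    for (i = 0; i < n; i++)
    {
        st.sum += vals[i];
        st.count++;
    }
    return st;
}

/* combine two transition states, as the finalizing step would */
static AvgState
combine_avg(AvgState a, AvgState b)
{
    AvgState    st;

    st.sum = a.sum + b.sum;
    st.count = a.count + b.count;
    return st;
}

int
main(void)
{
    double      node1[] = {1.0, 2.0, 3.0};
    double      node2[] = {10.0, 20.0};
    AvgState    s1 = partial_avg(node1, 3);
    AvgState    s2 = partial_avg(node2, 2);
    AvgState    total = combine_avg(s1, s2);

    /* prints 7.200000: (1+2+3+10+20) / 5 */
    printf("%f\n", total.sum / total.count);
    return 0;
}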
+ * NOTE: this would not be good enough if we supported resjunk sort keys + * for results of set operations --- then, we'd need to project a whole + * new tlist to evaluate the resjunk columns. For now, just ereport if we + * find any resjunk columns in orig_tlist. */ - i = 0; - for_each_cell(lc, lnext(list_head(gd->rollups))) +static List * +postprocess_setop_tlist(List *new_tlist, List *orig_tlist) { - RollupData *rollup = lfirst(lc); + ListCell *l; + ListCell *orig_tlist_item = list_head(orig_tlist); - if (rollup->hashable) + foreach(l, new_tlist) { - double sz = estimate_hashagg_tablesize(path, - agg_costs, - rollup->numGroups); + TargetEntry *new_tle = (TargetEntry *) lfirst(l); + TargetEntry *orig_tle; - /* - * If sz is enormous, but work_mem (and hence scale) is - * small, avoid integer overflow here. - */ - k_weights[i] = (int) Min(floor(sz / scale), - k_capacity + 1.0); - ++i; + /* ignore resjunk columns in setop result */ + if (new_tle->resjunk) + continue; + + Assert(orig_tlist_item != NULL); + orig_tle = (TargetEntry *) lfirst(orig_tlist_item); + orig_tlist_item = lnext(orig_tlist_item); + if (orig_tle->resjunk) /* should not happen */ + elog(ERROR, "resjunk output columns are not implemented"); + Assert(new_tle->resno == orig_tle->resno); + new_tle->ressortgroupref = orig_tle->ressortgroupref; } + if (orig_tlist_item != NULL) + elog(ERROR, "resjunk output columns are not implemented"); + return new_tlist; } /* - * Apply knapsack algorithm; compute the set of items which - * maximizes the value stored (in this case the number of sorts - * saved) while keeping the total size (approximately) within - * capacity. + * select_active_windows + * Create a list of the "active" window clauses (ie, those referenced + * by non-deleted WindowFuncs) in the order they are to be executed. */ - if (i > 0) - hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL); - - if (!bms_is_empty(hash_items)) +static List * +select_active_windows(PlannerInfo *root, WindowFuncLists *wflists) { - rollups = list_make1(linitial(gd->rollups)); + List *result; + List *actives; + ListCell *lc; - i = 0; - for_each_cell(lc, lnext(list_head(gd->rollups))) + /* First, make a list of the active windows */ + actives = NIL; + foreach(lc, root->parse->windowClause) { - RollupData *rollup = lfirst(lc); + WindowClause *wc = (WindowClause *) lfirst(lc); - if (rollup->hashable) - { - if (bms_is_member(i, hash_items)) - hash_sets = list_concat(hash_sets, - list_copy(rollup->gsets_data)); - else - rollups = lappend(rollups, rollup); - ++i; - } - else - rollups = lappend(rollups, rollup); - } - } + /* It's only active if wflists shows some related WindowFuncs */ + Assert(wc->winref <= wflists->maxWinRef); + if (wflists->windowFuncs[wc->winref] != NIL) + actives = lappend(actives, wc); } - if (!rollups && hash_sets) - rollups = list_copy(gd->rollups); - - foreach(lc, hash_sets) + /* + * Now, ensure that windows with identical partitioning/ordering clauses + * are adjacent in the list. This is required by the SQL standard, which + * says that only one sort is to be used for such windows, even if they + * are otherwise distinct (eg, different names or framing clauses). + * + * There is room to be much smarter here, for example detecting whether + * one window's sort keys are a prefix of another's (so that sorting for + * the latter would do for the former), or putting windows first that + * match a sort order available for the underlying query. For the moment + * we are content with meeting the spec. 
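[Illustration, not part of the patch: the select_active_windows comment above requires that windows with identical PARTITION BY/ORDER BY clauses end up adjacent, so a single sort can serve all of them. The standalone sketch below reproduces that reordering with plain strings standing in for the clause lists; ToyWindow and the window names are invented.]

/*
 * Illustrative sketch: reorder window clauses so entries with the same
 * (partition, order) key become adjacent, preserving the order of first
 * appearance -- the same effect as the loop in select_active_windows.
 */
#include <stdio.h>
#include <string.h>

typedef struct ToyWindow
{
    const char *name;
    const char *sortkey;        /* stands in for PARTITION BY + ORDER BY */
} ToyWindow;

int
main(void)
{
    ToyWindow   actives[] = {
        {"w1", "p=a,o=b"},
        {"w2", "p=x,o=y"},
        {"w3", "p=a,o=b"},      /* same keys as w1: should follow it */
        {"w4", "p=x,o=y"},      /* same keys as w2 */
    };
    int         n = 4;
    int         done[4] = {0, 0, 0, 0};
    int         i, j;

    for (i = 0; i < n; i++)
    {
        if (done[i])
            continue;
        printf("%s ", actives[i].name);
        done[i] = 1;
        /* pull forward every later window with matching keys */
        for (j = i + 1; j < n; j++)
        {
            if (!done[j] &&
                strcmp(actives[i].sortkey, actives[j].sortkey) == 0)
            {
                printf("%s ", actives[j].name);
                done[j] = 1;
            }
        }
    }
    /* prints: w1 w3 w2 w4 */
    printf("\n");
    return 0;
}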
+ */ + result = NIL; + while (actives != NIL) { - GroupingSetData *gs = lfirst(lc); - RollupData *rollup = makeNode(RollupData); + WindowClause *wc = (WindowClause *) linitial(actives); + ListCell *prev; + ListCell *next; - Assert(gs->set != NIL); - - rollup->groupClause = preprocess_groupclause(root, gs->set); - rollup->gsets_data = list_make1(gs); - rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, - rollup->gsets_data, - gd->tleref_to_colnum_map); - rollup->numGroups = gs->numGroups; - rollup->hashable = true; - rollup->is_hashed = true; - rollups = lcons(rollup, rollups); - } + /* Move wc from actives to result */ + actives = list_delete_first(actives); + result = lappend(result, wc); - if (rollups) + /* Now move any matching windows from actives to result */ + prev = NULL; + for (lc = list_head(actives); lc; lc = next) { - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - AGG_MIXED, - rollups, - agg_costs, - dNumGroups)); + WindowClause *wc2 = (WindowClause *) lfirst(lc); + + next = lnext(lc); + /* framing options are NOT to be compared here! */ + if (equal(wc->partitionClause, wc2->partitionClause) && + equal(wc->orderClause, wc2->orderClause)) + { + actives = list_delete_cell(actives, lc, prev); + result = lappend(result, wc2); } + else + prev = lc; } - - /* - * Now try the simple sorted case. - */ - if (!gd->unsortable_sets) - { - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - AGG_SORTED, - gd->rollups, - agg_costs, - dNumGroups)); } + + return result; } /* - * create_window_paths - * - * Build a new upperrel containing Paths for window-function evaluation. + * make_window_input_target + * Generate appropriate PathTarget for initial input to WindowAgg nodes. * - * input_rel: contains the source-data Paths - * input_target: result of make_window_input_target - * output_target: what the topmost WindowAggPath should return - * tlist: query's target list (needed to look up pathkeys) - * wflists: result of find_window_functions - * activeWindows: result of select_active_windows + * When the query has window functions, this function computes the desired + * target to be computed by the node just below the first WindowAgg. + * This tlist must contain all values needed to evaluate the window functions, + * compute the final target list, and perform any required final sort step. + * If multiple WindowAggs are needed, each intermediate one adds its window + * function results onto this base tlist; only the topmost WindowAgg computes + * the actual desired target list. * - * Note: all Paths in input_rel are expected to return input_target. 
- */ -static RelOptInfo * -create_window_paths(PlannerInfo *root, - RelOptInfo *input_rel, - PathTarget *input_target, - PathTarget *output_target, - List *tlist, - WindowFuncLists *wflists, + * This function is much like make_group_input_target, though not quite enough + * like it to share code. As in that function, we flatten most expressions + * into their component variables. But we do not want to flatten window + * PARTITION BY/ORDER BY clauses, since that might result in multiple + * evaluations of them, which would be bad (possibly even resulting in + * inconsistent answers, if they contain volatile functions). + * Also, we must not flatten GROUP BY clauses that were left unflattened by + * make_group_input_target, because we may no longer have access to the + * individual Vars in them. + * + * Another key difference from make_group_input_target is that we don't + * flatten Aggref expressions, since those are to be computed below the + * window functions and just referenced like Vars above that. + * + * 'final_target' is the query's final target list (in PathTarget form) + * 'activeWindows' is the list of active windows previously identified by + * select_active_windows. + * + * The result is the PathTarget to be computed by the plan node immediately + * below the first WindowAgg node. + */ +static PathTarget * +make_window_input_target(PlannerInfo *root, + PathTarget *final_target, List *activeWindows) -{// #lizard forgives - RelOptInfo *window_rel; +{ + Query *parse = root->parse; + PathTarget *input_target; + Bitmapset *sgrefs; + List *flattenable_cols; + List *flattenable_vars; + int i; ListCell *lc; - /* For now, do all work in the (WINDOW, NULL) upperrel */ - window_rel = fetch_upper_rel(root, UPPERREL_WINDOW, NULL); - - /* - * If the input relation is not parallel-safe, then the window relation - * can't be parallel-safe, either. Otherwise, we need to examine the - * target list and active windows for non-parallel-safe constructs. - */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) output_target->exprs) && - is_parallel_safe(root, (Node *) activeWindows)) - window_rel->consider_parallel = true; + Assert(parse->hasWindowFuncs); /* - * If the input rel belongs to a single FDW, so does the window rel. + * Collect the sortgroupref numbers of window PARTITION/ORDER BY clauses + * into a bitmapset for convenient reference below. */ - window_rel->serverid = input_rel->serverid; - window_rel->userid = input_rel->userid; - window_rel->useridiscurrent = input_rel->useridiscurrent; - window_rel->fdwroutine = input_rel->fdwroutine; + sgrefs = NULL; + foreach(lc, activeWindows) + { + WindowClause *wc = (WindowClause *) lfirst(lc); + ListCell *lc2; - /* - * Consider computing window functions starting from the existing - * cheapest-total path (which will likely require a sort) as well as any - * existing paths that satisfy root->window_pathkeys (which won't). 
- */ - foreach(lc, input_rel->pathlist) + foreach(lc2, wc->partitionClause) { - Path *path = (Path *) lfirst(lc); + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); - if (path == input_rel->cheapest_total_path || - pathkeys_contained_in(root->window_pathkeys, path->pathkeys)) - create_one_window_path(root, - window_rel, - path, - input_target, - output_target, - tlist, - wflists, - activeWindows); + sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); } + foreach(lc2, wc->orderClause) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); - /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. - */ - if (window_rel->fdwroutine && - window_rel->fdwroutine->GetForeignUpperPaths) - window_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_WINDOW, - input_rel, window_rel); - - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_WINDOW, - input_rel, window_rel); + sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); + } + } - /* Now choose the best path(s) */ - set_cheapest(window_rel); + /* Add in sortgroupref numbers of GROUP BY clauses, too */ + foreach(lc, parse->groupClause) + { + SortGroupClause *grpcl = (SortGroupClause *) lfirst(lc); - return window_rel; + sgrefs = bms_add_member(sgrefs, grpcl->tleSortGroupRef); } /* - * Stack window-function implementation steps atop the given Path, and - * add the result to window_rel. - * - * window_rel: upperrel to contain result - * path: input Path to use (must return input_target) - * input_target: result of make_window_input_target - * output_target: what the topmost WindowAggPath should return - * tlist: query's target list (needed to look up pathkeys) - * wflists: result of find_window_functions - * activeWindows: result of select_active_windows + * Construct a target containing all the non-flattenable targetlist items, + * and save aside the others for a moment. */ -static void -create_one_window_path(PlannerInfo *root, - RelOptInfo *window_rel, - Path *path, - PathTarget *input_target, - PathTarget *output_target, - List *tlist, - WindowFuncLists *wflists, - List *activeWindows) + input_target = create_empty_pathtarget(); + flattenable_cols = NIL; + + i = 0; + foreach(lc, final_target->exprs) { - PathTarget *window_target; - ListCell *l; + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(final_target, i); /* - * Since each window clause could require a different sort order, we stack - * up a WindowAgg node for each clause, with sort steps between them as - * needed. (We assume that select_active_windows chose a good order for - * executing the clauses in.) - * - * input_target should contain all Vars and Aggs needed for the result. - * (In some cases we wouldn't need to propagate all of these all the way - * to the top, since they might only be needed as inputs to WindowFuncs. - * It's probably not worth trying to optimize that though.) It must also - * contain all window partitioning and sorting expressions, to ensure - * they're computed only once at the bottom of the stack (that's critical - * for volatile functions). As we climb up the stack, we'll add outputs - * for the WindowFuncs computed at each level. + * Don't want to deconstruct window clauses or GROUP BY items. (Note + * that such items can't contain window functions, so it's okay to + * compute them below the WindowAgg nodes.) 
*/ - window_target = input_target; - - foreach(l, activeWindows) - { - WindowClause *wc = (WindowClause *) lfirst(l); - List *window_pathkeys; - - window_pathkeys = make_pathkeys_for_window(root, - wc, - tlist); - - /* Sort if necessary */ - if (!pathkeys_contained_in(window_pathkeys, path->pathkeys)) - { - path = (Path *) create_sort_path(root, window_rel, - path, - window_pathkeys, - -1.0); - } - - if (lnext(l)) + if (sgref != 0 && bms_is_member(sgref, sgrefs)) { /* - * Add the current WindowFuncs to the output target for this - * intermediate WindowAggPath. We must copy window_target to - * avoid changing the previous path's target. - * - * Note: a WindowFunc adds nothing to the target's eval costs; but - * we do need to account for the increase in tlist width. + * Don't want to deconstruct this value, so add it to the input + * target as-is. */ - ListCell *lc2; - - window_target = copy_pathtarget(window_target); - foreach(lc2, wflists->windowFuncs[wc->winref]) - { - WindowFunc *wfunc = lfirst_node(WindowFunc, lc2); - - add_column_to_pathtarget(window_target, (Expr *) wfunc, 0); - window_target->width += get_typavgwidth(wfunc->wintype, -1); - } + add_column_to_pathtarget(input_target, expr, sgref); } else { - /* Install the goal target in the topmost WindowAgg */ - window_target = output_target; + /* + * Column is to be flattened, so just remember the expression for + * later call to pull_var_clause. + */ + flattenable_cols = lappend(flattenable_cols, expr); } - /* We can't really push down window functions for now. */ - if (!can_push_down_window(root, path)) - path = create_remotesubplan_path(root, path, NULL); - - path = (Path *) - create_windowagg_path(root, window_rel, path, window_target, - wflists->windowFuncs[wc->winref], - wc, - window_pathkeys); + i++; } - add_path(window_rel, path); + /* + * Pull out all the Vars and Aggrefs mentioned in flattenable columns, and + * add them to the input target if not already present. (Some might be + * there already because they're used directly as window/group clauses.) + * + * Note: it's essential to use PVC_INCLUDE_AGGREGATES here, so that any + * Aggrefs are placed in the Agg node's tlist and not left to be computed + * at higher levels. On the other hand, we should recurse into + * WindowFuncs to make sure their input expressions are available. + */ + flattenable_vars = pull_var_clause((Node *) flattenable_cols, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, flattenable_vars); + + /* clean up cruft */ + list_free(flattenable_vars); + list_free(flattenable_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); } /* - * create_distinct_paths - * - * Build a new upperrel containing Paths for SELECT DISTINCT evaluation. + * make_pathkeys_for_window + * Create a pathkeys list describing the required input ordering + * for the given WindowClause. * - * input_rel: contains the source-data Paths + * The required ordering is first the PARTITION keys, then the ORDER keys. + * In the future we might try to implement windowing using hashing, in which + * case the ordering could be relaxed, but for now we always sort. * - * Note: input paths should already compute the desired pathtarget, since - * Sort/Unique won't project anything. + * Caution: if you change this, see createplan.c's get_column_info_for_window! 
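[Illustration, not part of the patch: as the make_pathkeys_for_window comment above states, the required input ordering is the PARTITION BY keys followed by the ORDER BY keys. The standalone sketch below shows what that combined ordering means for the rows feeding a WindowAgg, as a plain qsort comparator; Row and the field names are invented for the example.]

/*
 * Illustrative sketch: sorting rows by (partition key, then order key),
 * the ordering make_pathkeys_for_window asks for.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Row
{
    int         part;           /* PARTITION BY column */
    int         ord;            /* ORDER BY column */
} Row;

static int
cmp_window_input(const void *a, const void *b)
{
    const Row  *ra = (const Row *) a;
    const Row  *rb = (const Row *) b;

    /* partition keys first ... */
    if (ra->part != rb->part)
        return (ra->part > rb->part) - (ra->part < rb->part);
    /* ... then ordering keys within each partition */
    return (ra->ord > rb->ord) - (ra->ord < rb->ord);
}

int
main(void)
{
    Row         rows[] = {{2, 1}, {1, 9}, {1, 3}, {2, 0}};
    int         i;

    qsort(rows, 4, sizeof(Row), cmp_window_input);

    /* prints: (1,3) (1,9) (2,0) (2,1) */
    for (i = 0; i < 4; i++)
        printf("(%d,%d) ", rows[i].part, rows[i].ord);
    printf("\n");
    return 0;
}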
*/ -static RelOptInfo * -create_distinct_paths(PlannerInfo *root, - RelOptInfo *input_rel) -{// #lizard forgives - Query *parse = root->parse; - Path *cheapest_input_path = input_rel->cheapest_total_path; - RelOptInfo *distinct_rel; - double numDistinctRows; - bool allow_hash; - Path *path; - ListCell *lc; - - /* For now, do all work in the (DISTINCT, NULL) upperrel */ - distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); +static List * +make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, + List *tlist) +{ + List *window_pathkeys; + List *window_sortclauses; + + /* Throw error if can't sort */ + if (!grouping_is_sortable(wc->partitionClause)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement window PARTITION BY"), + errdetail("Window partitioning columns must be of sortable datatypes."))); + if (!grouping_is_sortable(wc->orderClause)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement window ORDER BY"), + errdetail("Window ordering columns must be of sortable datatypes."))); + + /* Okay, make the combined pathkeys */ + window_sortclauses = list_concat(list_copy(wc->partitionClause), + list_copy(wc->orderClause)); + window_pathkeys = make_pathkeys_for_sortclauses(root, + window_sortclauses, + tlist); + list_free(window_sortclauses); + return window_pathkeys; +} /* - * We don't compute anything at this level, so distinct_rel will be - * parallel-safe if the input rel is parallel-safe. In particular, if - * there is a DISTINCT ON (...) clause, any path for the input_rel will - * output those expressions, and will not be parallel-safe unless those - * expressions are parallel-safe. + * make_sort_input_target + * Generate appropriate PathTarget for initial input to Sort step. + * + * If the query has ORDER BY, this function chooses the target to be computed + * by the node just below the Sort (and DISTINCT, if any, since Unique can't + * project) steps. This might or might not be identical to the query's final + * output target. + * + * The main argument for keeping the sort-input tlist the same as the final + * is that we avoid a separate projection node (which will be needed if + * they're different, because Sort can't project). However, there are also + * advantages to postponing tlist evaluation till after the Sort: it ensures + * a consistent order of evaluation for any volatile functions in the tlist, + * and if there's also a LIMIT, we can stop the query without ever computing + * tlist functions for later rows, which is beneficial for both volatile and + * expensive functions. + * + * Our current policy is to postpone volatile expressions till after the sort + * unconditionally (assuming that that's possible, ie they are in plain tlist + * columns and not ORDER BY/GROUP BY/DISTINCT columns). We also prefer to + * postpone set-returning expressions, because running them beforehand would + * bloat the sort dataset, and because it might cause unexpected output order + * if the sort isn't stable. However there's a constraint on that: all SRFs + * in the tlist should be evaluated at the same plan step, so that they can + * run in sync in nodeProjectSet. So if any SRFs are in sort columns, we + * mustn't postpone any SRFs. (Note that in principle that policy should + * probably get applied to the group/window input targetlists too, but we + * have not done that historically.) 
Lastly, expensive expressions are + * postponed if there is a LIMIT, or if root->tuple_fraction shows that + * partial evaluation of the query is possible (if neither is true, we expect + * to have to evaluate the expressions for every row anyway), or if there are + * any volatile or set-returning expressions (since once we've put in a + * projection at all, it won't cost any more to postpone more stuff). + * + * Another issue that could potentially be considered here is that + * evaluating tlist expressions could result in data that's either wider + * or narrower than the input Vars, thus changing the volume of data that + * has to go through the Sort. However, we usually have only a very bad + * idea of the output width of any expression more complex than a Var, + * so for now it seems too risky to try to optimize on that basis. + * + * Note that if we do produce a modified sort-input target, and then the + * query ends up not using an explicit Sort, no particular harm is done: + * we'll initially use the modified target for the preceding path nodes, + * but then change them to the final target with apply_projection_to_path. + * Moreover, in such a case the guarantees about evaluation order of + * volatile functions still hold, since the rows are sorted already. + * + * This function has some things in common with make_group_input_target and + * make_window_input_target, though the detailed rules for what to do are + * different. We never flatten/postpone any grouping or ordering columns; + * those are needed before the sort. If we do flatten a particular + * expression, we leave Aggref and WindowFunc nodes alone, since those were + * computed earlier. + * + * 'final_target' is the query's final target list (in PathTarget form) + * 'have_postponed_srfs' is an output argument, see below + * + * The result is the PathTarget to be computed by the plan node immediately + * below the Sort step (and the Distinct step, if any). This will be + * exactly final_target if we decide a projection step wouldn't be helpful. + * + * In addition, *have_postponed_srfs is set to TRUE if we choose to postpone + * any set-returning functions to after the Sort. */ - distinct_rel->consider_parallel = input_rel->consider_parallel; +static PathTarget * +make_sort_input_target(PlannerInfo *root, + PathTarget *final_target, + bool *have_postponed_srfs) +{ + Query *parse = root->parse; + PathTarget *input_target; + int ncols; + bool *col_is_srf; + bool *postpone_col; + bool have_srf; + bool have_volatile; + bool have_expensive; + bool have_srf_sortcols; + bool postpone_srfs; + List *postponable_cols; + List *postponable_vars; + int i; + ListCell *lc; + + /* Shouldn't get here unless query has ORDER BY */ + Assert(parse->sortClause); + + *have_postponed_srfs = false; /* default result */ + + /* Inspect tlist and collect per-column information */ + ncols = list_length(final_target->exprs); + col_is_srf = (bool *) palloc0(ncols * sizeof(bool)); + postpone_col = (bool *) palloc0(ncols * sizeof(bool)); + have_srf = have_volatile = have_expensive = have_srf_sortcols = false; + + i = 0; + foreach(lc, final_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); /* - * If the input rel belongs to a single FDW, so does the distinct_rel. + * If the column has a sortgroupref, assume it has to be evaluated + * before sorting. Generally such columns would be ORDER BY, GROUP + * BY, etc targets. One exception is columns that were removed from + * GROUP BY by remove_useless_groupby_columns() ... 
but those would + * only be Vars anyway. There don't seem to be any cases where it + * would be worth the trouble to double-check. */ - distinct_rel->serverid = input_rel->serverid; - distinct_rel->userid = input_rel->userid; - distinct_rel->useridiscurrent = input_rel->useridiscurrent; - distinct_rel->fdwroutine = input_rel->fdwroutine; - - /* Estimate number of distinct rows there will be */ - if (parse->groupClause || parse->groupingSets || parse->hasAggs || - root->hasHavingQual) + if (get_pathtarget_sortgroupref(final_target, i) == 0) { /* - * If there was grouping or aggregation, use the number of input rows - * as the estimated number of DISTINCT rows (ie, assume the input is - * already mostly unique). + * Check for SRF or volatile functions. Check the SRF case first + * because we must know whether we have any postponed SRFs. */ - numDistinctRows = cheapest_input_path->rows; + if (parse->hasTargetSRFs && + expression_returns_set((Node *) expr)) + { + /* We'll decide below whether these are postponable */ + col_is_srf[i] = true; + have_srf = true; + } + else if (contain_volatile_functions((Node *) expr)) + { + /* Unconditionally postpone */ + postpone_col[i] = true; + have_volatile = true; } else { /* - * Otherwise, the UNIQUE filter has effects comparable to GROUP BY. + * Else check the cost. XXX it's annoying to have to do this + * when set_pathtarget_cost_width() just did it. Refactor to + * allow sharing the work? */ - List *distinctExprs; + QualCost cost; - distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, - parse->targetList); - numDistinctRows = estimate_num_groups(root, distinctExprs, - cheapest_input_path->rows, - NULL); - } + cost_qual_eval_node(&cost, (Node *) expr, root); /* - * Consider sort-based implementations of DISTINCT, if possible. + * We arbitrarily define "expensive" as "more than 10X + * cpu_operator_cost". Note this will take in any PL function + * with default cost. */ - if (grouping_is_sortable(parse->distinctClause)) + if (cost.per_tuple > 10 * cpu_operator_cost) + { + postpone_col[i] = true; + have_expensive = true; + } + } + } + else { + /* For sortgroupref cols, just check if any contain SRFs */ + if (!have_srf_sortcols && + parse->hasTargetSRFs && + expression_returns_set((Node *) expr)) + have_srf_sortcols = true; + } + + i++; + } + /* - * First, if we have any adequately-presorted paths, just stick a - * Unique node on those. Then consider doing an explicit sort of the - * cheapest input path and Unique'ing that. - * - * When we have DISTINCT ON, we must sort by the more rigorous of - * DISTINCT and ORDER BY, else it won't have the desired behavior. - * Also, if we do have to do an explicit sort, we might as well use - * the more rigorous ordering to avoid a second sort later. (Note - * that the parser will have ensured that one clause is a prefix of - * the other.) + * We can postpone SRFs if we have some but none are in sortgroupref cols. */ - List *needed_pathkeys; + postpone_srfs = (have_srf && !have_srf_sortcols); - if (parse->hasDistinctOn && - list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) - needed_pathkeys = root->sort_pathkeys; - else - needed_pathkeys = root->distinct_pathkeys; + /* + * If we don't need a post-sort projection, just return final_target. 
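[Illustration, not part of the patch: the classification loop above tags each tlist column as an SRF, volatile, or expensive candidate for postponement past the Sort, unless it carries a sortgroupref. The standalone sketch below is a simplified restatement of that decision rule on toy data; the Col struct, the column names, and the "postpone expensive columns only when a LIMIT is expected" shortcut are assumptions of the sketch, not a copy of the real control flow.]

/*
 * Illustrative sketch of the postponement rule described above: volatile
 * columns are always postponed past the Sort, SRF columns are postponed
 * unless some sort/group column itself contains an SRF, and expensive
 * columns only justify a projection when a partial fetch (LIMIT) is
 * expected.  Toy data only.
 */
#include <stdio.h>

typedef struct Col
{
    const char *name;
    int         sortgroupref;   /* used by ORDER BY/GROUP BY? */
    int         is_srf;
    int         is_volatile;
    int         is_expensive;   /* > 10 * cpu_operator_cost */
} Col;

int
main(void)
{
    Col         cols[] = {
        {"x", 1, 0, 0, 0},      /* ORDER BY column */
        {"random()", 0, 0, 1, 0},       /* volatile */
        {"generate_series(1,2)", 0, 1, 0, 0},   /* SRF */
        {"slow_func(x)", 0, 0, 0, 1},   /* expensive */
    };
    int         ncols = 4;
    int         have_limit = 1; /* pretend the query has a LIMIT */
    int         have_srf_sortcols = 0;
    int         i;

    /* an SRF in a sort/group column forbids postponing any SRFs */
    for (i = 0; i < ncols; i++)
        if (cols[i].sortgroupref && cols[i].is_srf)
            have_srf_sortcols = 1;

    for (i = 0; i < ncols; i++)
    {
        int         postpone = 0;

        if (!cols[i].sortgroupref)
        {
            if (cols[i].is_volatile)
                postpone = 1;
            else if (cols[i].is_srf && !have_srf_sortcols)
                postpone = 1;
            else if (cols[i].is_expensive && have_limit)
                postpone = 1;
        }
        printf("%-24s %s\n", cols[i].name,
               postpone ? "evaluate after Sort" : "evaluate before Sort");
    }
    return 0;
}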
+ */ + if (!(postpone_srfs || have_volatile || + (have_expensive && + (parse->limitCount || root->tuple_fraction > 0)))) + return final_target; - foreach(lc, input_rel->pathlist) - { - Path *path = (Path *) lfirst(lc); + /* + * Report whether the post-sort projection will contain set-returning + * functions. This is important because it affects whether the Sort can + * rely on the query's LIMIT (if any) to bound the number of rows it needs + * to return. + */ + *have_postponed_srfs = postpone_srfs; - if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - { /* - * Make sure the distribution matches the distinct clause, - * needed by the UNIQUE path. - * - * FIXME This could probably benefit from pushing a UNIQUE - * to the remote side, and only doing a merge locally. + * Construct the sort-input target, taking all non-postponable columns and + * then adding Vars, PlaceHolderVars, Aggrefs, and WindowFuncs found in + * the postponable ones. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); - } - } + input_target = create_empty_pathtarget(); + postponable_cols = NIL; - /* For explicit-sort case, always use the more rigorous clause */ - if (list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) + i = 0; + foreach(lc, final_target->exprs) { - needed_pathkeys = root->sort_pathkeys; - /* Assert checks that parser didn't mess up... */ - Assert(pathkeys_contained_in(root->distinct_pathkeys, - needed_pathkeys)); - } + Expr *expr = (Expr *) lfirst(lc); + + if (postpone_col[i] || (postpone_srfs && col_is_srf[i])) + postponable_cols = lappend(postponable_cols, expr); else - needed_pathkeys = root->distinct_pathkeys; + add_column_to_pathtarget(input_target, expr, + get_pathtarget_sortgroupref(final_target, i)); - path = cheapest_input_path; - if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - path = (Path *) create_sort_path(root, distinct_rel, - path, - needed_pathkeys, - -1.0); - - /* In case of grouping / distribution mismatch, inject remote scan. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + i++; + } + + /* + * Pull out all the Vars, Aggrefs, and WindowFuncs mentioned in + * postponable columns, and add them to the sort-input target if not + * already present. (Some might be there already.) We mustn't + * deconstruct Aggrefs or WindowFuncs here, since the projection node + * would be unable to recompute them. + */ + postponable_vars = pull_var_clause((Node *) postponable_cols, + PVC_INCLUDE_AGGREGATES | + PVC_INCLUDE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, postponable_vars); + + /* clean up cruft */ + list_free(postponable_vars); + list_free(postponable_cols); + + /* XXX this represents even more redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); } /* - * Consider hash-based implementations of DISTINCT, if possible. + * get_cheapest_fractional_path + * Find the cheapest path for retrieving a specified fraction of all + * the tuples expected to be returned by the given relation. 
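[Illustration, not part of the patch: choosing a path for only a fraction of the output amounts to comparing costs interpolated between startup and total cost. The sketch below assumes the linear interpolation cost = startup + fraction * (total - startup); the real comparison lives in compare_fractional_path_costs elsewhere in the planner, so treat that formula and the ToyPath type as assumptions of the sketch.]

/*
 * Illustrative sketch: comparing two paths on the cost of fetching only a
 * fraction of their output, assuming linear interpolation between startup
 * and total cost.
 */
#include <stdio.h>

typedef struct ToyPath
{
    const char *name;
    double      startup_cost;
    double      total_cost;
} ToyPath;

static double
fractional_cost(const ToyPath *p, double fraction)
{
    return p->startup_cost + fraction * (p->total_cost - p->startup_cost);
}

int
main(void)
{
    /* a fast-start path (e.g. index scan) vs. a cheaper-total path */
    ToyPath     idx = {"fast-start", 0.5, 200.0};
    ToyPath     seq = {"cheap-total", 50.0, 120.0};
    double      fractions[] = {0.05, 1.0};
    int         i;

    for (i = 0; i < 2; i++)
    {
        double      ci = fractional_cost(&idx, fractions[i]);
        double      cs = fractional_cost(&seq, fractions[i]);

        printf("fraction %.2f: %s wins (%.1f vs %.1f)\n",
               fractions[i],
               ci < cs ? idx.name : seq.name,
               ci < cs ? ci : cs,
               ci < cs ? cs : ci);
    }
    /*
     * With 5 percent of the rows the fast-start path wins; when fetching
     * everything, the cheaper-total path wins.
     */
    return 0;
}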
* - * If we were not able to make any other types of path, we *must* hash or - * die trying. If we do have other choices, there are several things that - * should prevent selection of hashing: if the query uses DISTINCT ON - * (because it won't really have the expected behavior if we hash), or if - * enable_hashagg is off, or if it looks like the hashtable will exceed - * work_mem. + * We interpret tuple_fraction the same way as grouping_planner. * - * Note: grouping_is_hashable() is much more expensive to check than the - * other gating conditions, so we want to do it last. + * We assume set_cheapest() has been run on the given rel. */ - if (distinct_rel->pathlist == NIL) - allow_hash = true; /* we have no alternatives */ - else if (parse->hasDistinctOn || !enable_hashagg) - allow_hash = false; /* policy-based decision not to hash */ - else +Path * +get_cheapest_fractional_path(RelOptInfo *rel, double tuple_fraction) { - Size hashentrysize; + Path *best_path = rel->cheapest_total_path; + ListCell *l; - /* Estimate per-hash-entry space at tuple width... */ - hashentrysize = MAXALIGN(cheapest_input_path->pathtarget->width) + - MAXALIGN(SizeofMinimalTupleHeader); - /* plus the per-hash-entry overhead */ - hashentrysize += hash_agg_entry_size(0); + /* If all tuples will be retrieved, just return the cheapest-total path */ + if (tuple_fraction <= 0.0) + return best_path; - /* Allow hashing only if hashtable is predicted to fit in work_mem */ - allow_hash = (hashentrysize * numDistinctRows <= work_mem * 1024L); - } + /* Convert absolute # of tuples to a fraction; no need to clamp to 0..1 */ + if (tuple_fraction >= 1.0 && best_path->rows > 0) + tuple_fraction /= best_path->rows; - if (allow_hash && grouping_is_hashable(parse->distinctClause)) + foreach(l, rel->pathlist) { - Path *input_path = cheapest_input_path; - - /* If needed, inject RemoteSubplan redistributing the data. */ - if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) - input_path = create_remotesubplan_path(root, input_path, NULL); - - /* XXX Maybe we can make this a 2-phase aggregate too? */ - - /* Generate hashed aggregate path --- no sort needed */ - add_path(distinct_rel, (Path *) - create_agg_path(root, - distinct_rel, - input_path, - input_path->pathtarget, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->distinctClause, - NIL, - NULL, - numDistinctRows)); - } - - /* Give a helpful error if we failed to find any implementation */ - if (distinct_rel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement DISTINCT"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - - /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. 
- */ - if (distinct_rel->fdwroutine && - distinct_rel->fdwroutine->GetForeignUpperPaths) - distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, - input_rel, distinct_rel); + Path *path = (Path *) lfirst(l); - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, - input_rel, distinct_rel); + if (path == rel->cheapest_total_path || + compare_fractional_path_costs(best_path, path, tuple_fraction) <= 0) + continue; - /* Now choose the best path(s) */ - set_cheapest(distinct_rel); + best_path = path; + } - return distinct_rel; + return best_path; } /* - * create_ordered_paths - * - * Build a new upperrel containing Paths for ORDER BY evaluation. + * adjust_paths_for_srfs + * Fix up the Paths of the given upperrel to handle tSRFs properly. * - * All paths in the result must satisfy the ORDER BY ordering. - * The only new path we need consider is an explicit sort on the - * cheapest-total existing path. + * The executor can only handle set-returning functions that appear at the + * top level of the targetlist of a ProjectSet plan node. If we have any SRFs + * that are not at top level, we need to split up the evaluation into multiple + * plan levels in which each level satisfies this constraint. This function + * modifies each Path of an upperrel that (might) compute any SRFs in its + * output tlist to insert appropriate projection steps. * - * input_rel: contains the source-data Paths - * target: the output tlist the result Paths must emit - * limit_tuples: estimated bound on the number of output tuples, - * or -1 if no LIMIT or couldn't estimate + * The given targets and targets_contain_srfs lists are from + * split_pathtarget_at_srfs(). We assume the existing Paths emit the first + * target in targets. */ -static RelOptInfo * -create_ordered_paths(PlannerInfo *root, - RelOptInfo *input_rel, - PathTarget *target, - double limit_tuples) -{// #lizard forgives - Path *cheapest_input_path = input_rel->cheapest_total_path; - RelOptInfo *ordered_rel; +static void +adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, + List *targets, List *targets_contain_srfs) +{ ListCell *lc; - /* For now, do all work in the (ORDERED, NULL) upperrel */ - ordered_rel = fetch_upper_rel(root, UPPERREL_ORDERED, NULL); + Assert(list_length(targets) == list_length(targets_contain_srfs)); + Assert(!linitial_int(targets_contain_srfs)); - /* - * If the input relation is not parallel-safe, then the ordered relation - * can't be parallel-safe, either. Otherwise, it's parallel-safe if the - * target list is parallel-safe. - */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs)) - ordered_rel->consider_parallel = true; + /* If no SRFs appear at this plan level, nothing to do */ + if (list_length(targets) == 1) + return; /* - * If the input rel belongs to a single FDW, so does the ordered_rel. + * Stack SRF-evaluation nodes atop each path for the rel. + * + * In principle we should re-run set_cheapest() here to identify the + * cheapest path, but it seems unlikely that adding the same tlist eval + * costs to all the paths would change that, so we don't bother. Instead, + * just assume that the cheapest-startup and cheapest-total paths remain + * so. (There should be no parameterized paths anymore, so we needn't + * worry about updating cheapest_parameterized_paths.) 
*/ - ordered_rel->serverid = input_rel->serverid; - ordered_rel->userid = input_rel->userid; - ordered_rel->useridiscurrent = input_rel->useridiscurrent; - ordered_rel->fdwroutine = input_rel->fdwroutine; - - foreach(lc, input_rel->pathlist) + foreach(lc, rel->pathlist) { - Path *path = (Path *) lfirst(lc); - bool is_sorted; + Path *subpath = (Path *) lfirst(lc); + Path *newpath = subpath; + ListCell *lc1, + *lc2; - is_sorted = pathkeys_contained_in(root->sort_pathkeys, - path->pathkeys); - if (path == cheapest_input_path || is_sorted) - { - if (!is_sorted) + Assert(subpath->param_info == NULL); + forboth(lc1, targets, lc2, targets_contain_srfs) { - /* An explicit sort here can take advantage of LIMIT */ - path = (Path *) create_sort_path(root, - ordered_rel, - path, - root->sort_pathkeys, - limit_tuples); - } - - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); - - add_path(ordered_rel, path); + PathTarget *thistarget = (PathTarget *) lfirst(lc1); + bool contains_srfs = (bool) lfirst_int(lc2); + + /* If this level doesn't contain SRFs, do regular projection */ + if (contains_srfs) + newpath = (Path *) create_set_projection_path(root, + rel, + newpath, + thistarget); + else + newpath = (Path *) apply_projection_to_path(root, + rel, + newpath, + thistarget); } + lfirst(lc) = newpath; + if (subpath == rel->cheapest_startup_path) + rel->cheapest_startup_path = newpath; + if (subpath == rel->cheapest_total_path) + rel->cheapest_total_path = newpath; } - /* - * generate_gather_paths() will have already generated a simple Gather - * path for the best parallel path, if any, and the loop above will have - * considered sorting it. Similarly, generate_gather_paths() will also - * have generated order-preserving Gather Merge plans which can be used - * without sorting if they happen to match the sort_pathkeys, and the loop - * above will have handled those as well. However, there's one more - * possibility: it may make sense to sort the cheapest partial path - * according to the required output order and then use Gather Merge. - */ - if (ordered_rel->consider_parallel && root->sort_pathkeys != NIL && - input_rel->partial_pathlist != NIL) + /* Likewise for partial paths, if any */ + foreach(lc, rel->partial_pathlist) { - Path *cheapest_partial_path; + Path *subpath = (Path *) lfirst(lc); + Path *newpath = subpath; + ListCell *lc1, + *lc2; - cheapest_partial_path = linitial(input_rel->partial_pathlist); - - /* - * If cheapest partial path doesn't need a sort, this is redundant - * with what's already been tried. 
- */ - if (!pathkeys_contained_in(root->sort_pathkeys, - cheapest_partial_path->pathkeys)) + Assert(subpath->param_info == NULL); + forboth(lc1, targets, lc2, targets_contain_srfs) { - Path *path; - double total_groups; - - path = (Path *) create_sort_path(root, - ordered_rel, - cheapest_partial_path, - root->sort_pathkeys, - -1.0); - - total_groups = cheapest_partial_path->rows * - cheapest_partial_path->parallel_workers; - path = (Path *) - create_gather_merge_path(root, ordered_rel, - path, - target, root->sort_pathkeys, NULL, - &total_groups); - - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); - - add_path(ordered_rel, path); + PathTarget *thistarget = (PathTarget *) lfirst(lc1); + bool contains_srfs = (bool) lfirst_int(lc2); + + /* If this level doesn't contain SRFs, do regular projection */ + if (contains_srfs) + newpath = (Path *) create_set_projection_path(root, + rel, + newpath, + thistarget); + else + { + /* avoid apply_projection_to_path, in case of multiple refs */ + newpath = (Path *) create_projection_path(root, + rel, + newpath, + thistarget); + } + } + lfirst(lc) = newpath; } } /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. + * expression_planner + * Perform planner's transformations on a standalone expression. + * + * Various utility commands need to evaluate expressions that are not part + * of a plannable query. They can do so using the executor's regular + * expression-execution machinery, but first the expression has to be fed + * through here to transform it from parser output to something executable. + * + * Currently, we disallow sublinks in standalone expressions, so there's no + * real "planning" involved here. (That might not always be true though.) + * What we must do is run eval_const_expressions to ensure that any function + * calls are converted to positional notation and function default arguments + * get inserted. The fact that constant subexpressions get simplified is a + * side-effect that is useful when the expression will get evaluated more than + * once. Also, we must fix operator function IDs. + * + * Note: this must not make any damaging changes to the passed-in expression + * tree. (It would actually be okay to apply fix_opfuncids to it, but since + * we first do an expression_tree_mutator-based walk, what is returned will + * be a new node tree.) */ - if (ordered_rel->fdwroutine && - ordered_rel->fdwroutine->GetForeignUpperPaths) - ordered_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_ORDERED, - input_rel, ordered_rel); - - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_ORDERED, - input_rel, ordered_rel); +Expr * +expression_planner(Expr *expr) +{ + Node *result; /* - * No need to bother with set_cheapest here; grouping_planner does not - * need us to do it. + * Convert named-argument function calls, insert default arguments and + * simplify constant subexprs */ - Assert(ordered_rel->pathlist != NIL); + result = eval_const_expressions(NULL, (Node *) expr); - return ordered_rel; + /* Fill in opfuncid values if missing */ + fix_opfuncids(result); + + return (Expr *) result; } /* - * make_group_input_target - * Generate appropriate PathTarget for initial input to grouping nodes. 
- * - * If there is grouping or aggregation, the scan/join subplan cannot emit - * the query's final targetlist; for example, it certainly can't emit any - * aggregate function calls. This routine generates the correct target - * for the scan/join subplan. - * - * The query target list passed from the parser already contains entries - * for all ORDER BY and GROUP BY expressions, but it will not have entries - * for variables used only in HAVING clauses; so we need to add those - * variables to the subplan target list. Also, we flatten all expressions - * except GROUP BY items into their component variables; other expressions - * will be computed by the upper plan nodes rather than by the subplan. - * For example, given a query like - * SELECT a+b,SUM(c+d) FROM table GROUP BY a+b; - * we want to pass this targetlist to the subplan: - * a+b,c,d - * where the a+b target will be used by the Sort/Group steps, and the - * other targets will be used for computing the final results. + * plan_cluster_use_sort + * Use the planner to decide how CLUSTER should implement sorting * - * 'final_target' is the query's final target list (in PathTarget form) + * tableOid is the OID of a table to be clustered on its index indexOid + * (which is already known to be a btree index). Decide whether it's + * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER. + * Return TRUE to use sorting, FALSE to use an indexscan. * - * The result is the PathTarget to be computed by the Paths returned from - * query_planner(). + * Note: caller had better already hold some type of lock on the table. */ -static PathTarget * -make_group_input_target(PlannerInfo *root, PathTarget *final_target) +bool +plan_cluster_use_sort(Oid tableOid, Oid indexOid) { - Query *parse = root->parse; - PathTarget *input_target; - List *non_group_cols; - List *non_group_vars; - int i; + PlannerInfo *root; + Query *query; + PlannerGlobal *glob; + RangeTblEntry *rte; + RelOptInfo *rel; + IndexOptInfo *indexInfo; + QualCost indexExprCost; + Cost comparisonCost; + Path *seqScanPath; + Path seqScanAndSortPath; + IndexPath *indexScanPath; ListCell *lc; - /* - * We must build a target containing all grouping columns, plus any other - * Vars mentioned in the query's targetlist and HAVING qual. - */ - input_target = create_empty_pathtarget(); - non_group_cols = NIL; - - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(final_target, i); - - if (sgref && parse->groupClause && - get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) + /* We can short-circuit the cost comparison if indexscans are disabled */ + if (!enable_indexscan) + return true; /* use sort */ + + /* Set up mostly-dummy planner state */ + query = makeNode(Query); + query->commandType = CMD_SELECT; + + glob = makeNode(PlannerGlobal); + + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->planner_cxt = CurrentMemoryContext; + root->wt_param_id = -1; + root->recursiveOk = true; + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = tableOid; + rte->relkind = RELKIND_RELATION; /* Don't be too picky. 
*/ + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + query->rtable = list_make1(rte); + + /* Set up RTE/RelOptInfo arrays */ + setup_simple_rel_arrays(root); + + /* Build RelOptInfo */ + rel = build_simple_rel(root, 1, NULL); + + /* Locate IndexOptInfo for the target index */ + indexInfo = NULL; + foreach(lc, rel->indexlist) { - /* - * It's a grouping column, so add it to the input target as-is. - */ - add_column_to_pathtarget(input_target, expr, sgref); + indexInfo = (IndexOptInfo *) lfirst(lc); + if (indexInfo->indexoid == indexOid) + break; } - else - { + /* - * Non-grouping column, so just remember the expression for later - * call to pull_var_clause. + * It's possible that get_relation_info did not generate an IndexOptInfo + * for the desired index; this could happen if it's not yet reached its + * indcheckxmin usability horizon, or if it's a system index and we're + * ignoring system indexes. In such cases we should tell CLUSTER to not + * trust the index contents but use seqscan-and-sort. */ - non_group_cols = lappend(non_group_cols, expr); - } - - i++; - } + if (lc == NULL) /* not in the list? */ + return true; /* use sort */ /* - * If there's a HAVING clause, we'll need the Vars it uses, too. + * Rather than doing all the pushups that would be needed to use + * set_baserel_size_estimates, just do a quick hack for rows and width. */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); + rel->rows = rel->tuples; + rel->reltarget->width = get_relation_data_width(tableOid, NULL); + + root->total_table_pages = rel->pages; /* - * Pull out all the Vars mentioned in non-group cols (plus HAVING), and - * add them to the input target if not already present. (A Var used - * directly as a GROUP BY item will be present already.) Note this - * includes Vars used in resjunk items, so we are covering the needs of - * ORDER BY and window specifications. Vars used within Aggrefs and - * WindowFuncs will be pulled out here, too. + * Determine eval cost of the index expressions, if any. We need to + * charge twice that amount for each tuple comparison that happens during + * the sort, since tuplesort.c will have to re-evaluate the index + * expressions each time. (XXX that's pretty inefficient...) */ - non_group_vars = pull_var_clause((Node *) non_group_cols, - PVC_RECURSE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, non_group_vars); - - /* clean up cruft */ - list_free(non_group_vars); - list_free(non_group_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, input_target); + cost_qual_eval(&indexExprCost, indexInfo->indexprs, root); + comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple); + + /* Estimate the cost of seq scan + sort */ + seqScanPath = create_seqscan_path(root, rel, NULL, 0); + cost_sort(&seqScanAndSortPath, root, NIL, + seqScanPath->total_cost, rel->tuples, rel->reltarget->width, + comparisonCost, maintenance_work_mem, -1.0); + + /* Estimate the cost of index scan */ + indexScanPath = create_index_path(root, indexInfo, + NIL, NIL, NIL, NIL, NIL, + ForwardScanDirection, false, + NULL, 1.0, false); + + return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); } + /* - * make_partial_grouping_target - * Generate appropriate PathTarget for output of partial aggregate - * (or partial grouping, if there are no aggregates) nodes. 
+ * grouping_distribution_match + * Check if the path distribution matches grouping distribution. * - * A partial aggregation node needs to emit all the same aggregates that - * a regular aggregation node would, plus any aggregates used in HAVING; - * except that the Aggref nodes should be marked as partial aggregates. + * Grouping preserves distribution if the distribution key is on of the + * grouping keys (arbitrary one). In that case it's guaranteed that groups + * on different nodes do not overlap, and we can push the aggregation to + * remote nodes as a whole. * - * In addition, we'd better emit any Vars and PlaceholderVars that are - * used outside of Aggrefs in the aggregation tlist and HAVING. (Presumably, - * these would be Vars that are grouped by or used in grouping expressions.) + * Otherwise we need to either fetch all the data to the coordinator and + * perform the aggregation there, or use two-phase aggregation, with the + * first phase (partial aggregation) pushed down, and the second phase + * (combining and finalizing the results) executed on the coordinator. * - * grouping_target is the tlist to be emitted by the topmost aggregation step. - * We get the HAVING clause out of *root. + * XXX This is used not only for plain aggregation, but also for various + * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE). */ -static PathTarget * -make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) +static bool +grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, + List *clauses) { - Query *parse = root->parse; - PathTarget *partial_target; - List *non_group_cols; - List *non_group_exprs; int i; - ListCell *lc; + bool matches_key = false; + Distribution *distribution = path->distribution; - partial_target = create_empty_pathtarget(); - non_group_cols = NIL; - - i = 0; - foreach(lc, grouping_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(grouping_target, i); + int numGroupCols = list_length(clauses); + AttrNumber *groupColIdx = extract_grouping_cols(clauses, + parse->targetList); - if (sgref && parse->groupClause && - get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) +#ifdef __COLD_HOT__ + if (has_cold_hot_table) { - /* - * It's a grouping column, so add it to the partial_target as-is. - * (This allows the upper agg step to repeat the grouping calcs.) - */ - add_column_to_pathtarget(partial_target, expr, sgref); - } - else + if (! path->distribution) { - /* - * Non-grouping column, so just remember the expression for later - * call to pull_var_clause. - */ - non_group_cols = lappend(non_group_cols, expr); + return true; } - i++; + return false; } +#endif /* - * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. - */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); - - /* - * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in - * non-group cols (plus HAVING), and add them to the partial_target if not - * already present. (An expression used directly as a GROUP BY item will - * be present already.) Note this includes Vars used in resjunk items, so - * we are covering the needs of ORDER BY and window specifications. + * With no explicit data distribution or replicated tables, we can simply + * push down the whole aggregation to the remote node, without any sort + * of redistribution. So consider this to be a match. 
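[Illustration, not part of the patch: the rule described above, that the whole aggregation can be pushed down when the table's distribution key is one of the grouping keys and must otherwise be redistributed or done in two phases, reduces to a membership test. The standalone sketch below uses strings as stand-ins for expression trees; the function and column names are invented.]

/*
 * Illustrative sketch: the aggregation can be fully pushed down when the
 * distribution expression matches one of the grouping keys; otherwise a
 * group may span nodes and a redistribution (or two-phase aggregate) is
 * needed.
 */
#include <stdio.h>
#include <string.h>

static int
distribution_matches_grouping(const char *distkey,
                              const char *const *groupkeys, int nkeys)
{
    int         i;

    for (i = 0; i < nkeys; i++)
        if (strcmp(distkey, groupkeys[i]) == 0)
            return 1;
    return 0;
}

int
main(void)
{
    const char *groupkeys[] = {"dept_id", "year"};

    /* distributed by dept_id: every group lives on a single node */
    printf("%d\n", distribution_matches_grouping("dept_id", groupkeys, 2));
    /* distributed by emp_id: groups can span nodes, no full push-down */
    printf("%d\n", distribution_matches_grouping("emp_id", groupkeys, 2));
    return 0;
}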
*/ - non_group_exprs = pull_var_clause((Node *) non_group_cols, - PVC_INCLUDE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); + if ((distribution == NULL) || + IsLocatorReplicated(distribution->distributionType)) + return true; - add_new_columns_to_pathtarget(partial_target, non_group_exprs); + /* But no distribution expression means 'no match'. */ + if (distribution->distributionExpr == NULL) + return false; /* - * Adjust Aggrefs to put them in partial mode. At this point all Aggrefs - * are at the top level of the target list, so we can just scan the list - * rather than recursing through the expression trees. + * With distributed data and table distributed using an expression, we + * need to check if the distribution expression matches one of the + * grouping keys (arbitrary one). */ - foreach(lc, partial_target->exprs) + for (i = 0; i < numGroupCols; i++) { - Aggref *aggref = (Aggref *) lfirst(lc); + TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, + groupColIdx[i]-1); - if (IsA(aggref, Aggref)) + if (equal(te->expr, distribution->distributionExpr)) { - Aggref *newaggref; - - /* - * We shouldn't need to copy the substructure of the Aggref node, - * but flat-copy the node itself to avoid damaging other trees. - */ - newaggref = makeNode(Aggref); - memcpy(newaggref, aggref, sizeof(Aggref)); - - /* For now, assume serialization is required */ - mark_partial_aggref(newaggref, AGGSPLIT_INITIAL_SERIAL); - - lfirst(lc) = newaggref; + matches_key = true; + break; } } - /* clean up cruft */ - list_free(non_group_exprs); - list_free(non_group_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, partial_target); + return matches_key; } /* - * mark_partial_aggref - * Adjust an Aggref to make it represent a partial-aggregation step. + * get_partitioned_child_rels + * Returns a list of the RT indexes of the partitioned child relations + * with rti as the root parent RT index. Also sets + * *part_cols_updated to true if any of the root rte's updated + * columns is used in the partition key either of the relation whose RTI + * is specified or of any child relation. * - * The Aggref node is modified in-place; caller must do any copying required. + * Note: This function might get called even for range table entries that + * are not partitioned tables; in such a case, it will simply return NIL. */ -void -mark_partial_aggref(Aggref *agg, AggSplit aggsplit) +List * +get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated) { - /* aggtranstype should be computed by this point */ - Assert(OidIsValid(agg->aggtranstype)); - /* ... but aggsplit should still be as the parser left it */ - Assert(agg->aggsplit == AGGSPLIT_SIMPLE); + List *result = NIL; + ListCell *l; - /* Mark the Aggref with the intended partial-aggregation mode */ - agg->aggsplit = aggsplit; + if (part_cols_updated) + *part_cols_updated = false; - /* - * Adjust result type if needed. Normally, a partial aggregate returns - * the aggregate's transition type; but if that's INTERNAL and we're - * serializing, it returns BYTEA instead. 
- */ - if (DO_AGGSPLIT_SKIPFINAL(aggsplit)) + foreach(l, root->pcinfo_list) { - if (agg->aggtranstype == INTERNALOID && DO_AGGSPLIT_SERIALIZE(aggsplit)) - agg->aggtype = BYTEAOID; - else - agg->aggtype = agg->aggtranstype; + PartitionedChildRelInfo *pc = lfirst(l); + + if (pc->parent_relid == rti) + { + result = pc->child_rels; + if (part_cols_updated) + *part_cols_updated = pc->part_cols_updated; + break; } } + return result; +} + + /* - * postprocess_setop_tlist - * Fix up targetlist returned by plan_set_operations(). - * - * We need to transpose sort key info from the orig_tlist into new_tlist. - * NOTE: this would not be good enough if we supported resjunk sort keys - * for results of set operations --- then, we'd need to project a whole - * new tlist to evaluate the resjunk columns. For now, just ereport if we - * find any resjunk columns in orig_tlist. + * get_partitioned_child_rels_for_join + * Build and return a list containing the RTI of every partitioned + * relation which is a child of some rel included in the join. */ -static List * -postprocess_setop_tlist(List *new_tlist, List *orig_tlist) +List * +get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) { + List *result = NIL; ListCell *l; - ListCell *orig_tlist_item = list_head(orig_tlist); - foreach(l, new_tlist) + foreach(l, root->pcinfo_list) { - TargetEntry *new_tle = (TargetEntry *) lfirst(l); - TargetEntry *orig_tle; - - /* ignore resjunk columns in setop result */ - if (new_tle->resjunk) - continue; + PartitionedChildRelInfo *pc = lfirst(l); - Assert(orig_tlist_item != NULL); - orig_tle = (TargetEntry *) lfirst(orig_tlist_item); - orig_tlist_item = lnext(orig_tlist_item); - if (orig_tle->resjunk) /* should not happen */ - elog(ERROR, "resjunk output columns are not implemented"); - Assert(new_tle->resno == orig_tle->resno); - new_tle->ressortgroupref = orig_tle->ressortgroupref; + if (bms_is_member(pc->parent_relid, join_relids)) + result = list_concat(result, list_copy(pc->child_rels)); } - if (orig_tlist_item != NULL) - elog(ERROR, "resjunk output columns are not implemented"); - return new_tlist; + + return result; } /* - * select_active_windows - * Create a list of the "active" window clauses (ie, those referenced - * by non-deleted WindowFuncs) in the order they are to be executed. + * add_paths_to_grouping_rel + * + * Add non-partial paths to grouping relation. */ -static List * -select_active_windows(PlannerInfo *root, WindowFuncLists *wflists) +static void +add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *target, + PathTarget *partial_grouping_target, + const AggClauseCosts *agg_costs, + const AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, bool can_sort, bool can_hash, + double dNumGroups, List *havingQual) { - List *result; - List *actives; + Query *parse = root->parse; + Path *cheapest_path = input_rel->cheapest_total_path; ListCell *lc; - /* First, make a list of the active windows */ - actives = NIL; - foreach(lc, root->parse->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(lc); - - /* It's only active if wflists shows some related WindowFuncs */ - Assert(wc->winref <= wflists->maxWinRef); - if (wflists->windowFuncs[wc->winref] != NIL) - actives = lappend(actives, wc); - } - - /* - * Now, ensure that windows with identical partitioning/ordering clauses - * are adjacent in the list. 
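A self-contained analogue of the push-down test in grouping_distribution_match() above, with plain column numbers standing in for the expression trees the real code compares via equal(); names are illustrative only. The point is that the aggregation can run entirely on the datanodes when the data is replicated or the distribution key is one of the grouping keys, because no group can then span nodes. (The real function additionally returns false when a distribution exists but has no distribution expression.)

#include <stdbool.h>

/* dist_attno < 0 stands for "replicated or no explicit distribution". */
static bool
grouping_matches_distribution(int dist_attno, const int *group_attnos, int ngroups)
{
    int i;

    if (dist_attno < 0)
        return true;            /* whole aggregate can be pushed down */

    for (i = 0; i < ngroups; i++)
    {
        if (group_attnos[i] == dist_attno)
            return true;        /* groups on different nodes cannot overlap */
    }

    return false;               /* needs redistribution or two-phase aggregation */
}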
This is required by the SQL standard, which - * says that only one sort is to be used for such windows, even if they - * are otherwise distinct (eg, different names or framing clauses). - * - * There is room to be much smarter here, for example detecting whether - * one window's sort keys are a prefix of another's (so that sorting for - * the latter would do for the former), or putting windows first that - * match a sort order available for the underlying query. For the moment - * we are content with meeting the spec. - */ - result = NIL; - while (actives != NIL) + if (can_sort) { - WindowClause *wc = (WindowClause *) linitial(actives); - ListCell *prev; - ListCell *next; - - /* Move wc from actives to result */ - actives = list_delete_first(actives); - result = lappend(result, wc); - - /* Now move any matching windows from actives to result */ - prev = NULL; - for (lc = list_head(actives); lc; lc = next) + /* + * Use any available suitably-sorted path as input, and also consider + * sorting the cheapest-total path. + */ + foreach(lc, input_rel->pathlist) { - WindowClause *wc2 = (WindowClause *) lfirst(lc); - - next = lnext(lc); - /* framing options are NOT to be compared here! */ - if (equal(wc->partitionClause, wc2->partitionClause) && - equal(wc->orderClause, wc2->orderClause)) + Path *path = (Path *) lfirst(lc); + bool is_sorted; + + is_sorted = pathkeys_contained_in(root->group_pathkeys, + path->pathkeys); + + /* + * XL: Can it happen that the cheapest path can't be pushed down, + * while some other path could be? Perhaps we should move the check + * if a path can be pushed down up, and add another OR condition + * to consider all paths that can be pushed down? + * + * if (path == cheapest_path || is_sorted || can_push_down) + */ + if (path == cheapest_path || is_sorted) { - actives = list_delete_cell(actives, lc, prev); - result = lappend(result, wc2); - } - else - prev = lc; - } - } +#ifdef __TBASE__ + bool try_redistribute_grouping = false; + PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - return result; -} + /* Estimate number of partial groups. */ + double dNumLocalGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); +#endif +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + if (!is_sorted && !agg_costs->hasOnlyDistinct) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + else + { +#endif + /* Sort the cheapest-total path if it isn't already sorted */ + if (!is_sorted) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); +#ifdef __TBASE__ + } +#endif /* - * make_window_input_target - * Generate appropriate PathTarget for initial input to WindowAgg nodes. - * - * When the query has window functions, this function computes the desired - * target to be computed by the node just below the first WindowAgg. - * This tlist must contain all values needed to evaluate the window functions, - * compute the final target list, and perform any required final sort step. - * If multiple WindowAggs are needed, each intermediate one adds its window - * function results onto this base tlist; only the topmost WindowAgg computes - * the actual desired target list. - * - * This function is much like make_group_input_target, though not quite enough - * like it to share code. As in that function, we flatten most expressions - * into their component variables. 
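For readers outside the planner: pathkeys_contained_in(), used in the hunk above, is a prefix test, which is why only two kinds of input paths are worth considering there: paths already sorted by the grouping keys, and the cheapest-total path (the best candidate to sort explicitly). A rough, self-contained analogue of the containment test, using integers in place of PathKey nodes:

#include <stdbool.h>

/* required[] must appear, in order, as a prefix of provided[]. */
static bool
keys_contained_in(const int *required, int nrequired,
                  const int *provided, int nprovided)
{
    int i;

    if (nrequired > nprovided)
        return false;

    for (i = 0; i < nrequired; i++)
    {
        if (required[i] != provided[i])
            return false;
    }

    return true;
}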
But we do not want to flatten window - * PARTITION BY/ORDER BY clauses, since that might result in multiple - * evaluations of them, which would be bad (possibly even resulting in - * inconsistent answers, if they contain volatile functions). - * Also, we must not flatten GROUP BY clauses that were left unflattened by - * make_group_input_target, because we may no longer have access to the - * individual Vars in them. - * - * Another key difference from make_group_input_target is that we don't - * flatten Aggref expressions, since those are to be computed below the - * window functions and just referenced like Vars above that. - * - * 'final_target' is the query's final target list (in PathTarget form) - * 'activeWindows' is the list of active windows previously identified by - * select_active_windows. - * - * The result is the PathTarget to be computed by the plan node immediately - * below the first WindowAgg node. + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. */ -static PathTarget * -make_window_input_target(PlannerInfo *root, - PathTarget *final_target, - List *activeWindows) -{ - Query *parse = root->parse; - PathTarget *input_target; - Bitmapset *sgrefs; - List *flattenable_cols; - List *flattenable_vars; - int i; - ListCell *lc; + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ + { + /* some special aggs cannot be parallel executed, such as count(distinct) */ + if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || + parse->groupingSets || path->pathtype == T_Agg || + path->pathtype == T_Group || !olap_optimizer || has_cold_hot_table) + { + if (agg_costs->hasOnlyDistinct && olap_optimizer && !parse->groupingSets + && !has_cold_hot_table) + path = create_redistribute_grouping_path(root, parse, path); + else + path = create_remotesubplan_path(root, path, NULL); - Assert(parse->hasWindowFuncs); + if (agg_costs->hasOnlyDistinct && olap_optimizer && + !has_cold_hot_table) + { + if (root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } + } + else + { + /* + * If the grouping can not be fully pushed down, we adopt another + * strategy instead. + * 1. do grouping on each datanode locally + * 2. re-distribute grouping results among datanodes, then do the + * final grouping + */ + + try_redistribute_grouping = true; + + /* step 1 */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ - /* - * Collect the sortgroupref numbers of window PARTITION/ORDER BY clauses - * into a bitmapset for convenient reference below. - */ - sgrefs = NULL; - foreach(lc, activeWindows) - { - WindowClause *wc = (WindowClause *) lfirst(lc); - ListCell *lc2; + path = (Path *) create_agg_path(root, + grouped_rel, + path, + local_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumLocalGroups); + } + else if (parse->groupClause) + { + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + path = (Path *) create_group_path(root, + grouped_rel, + path, + local_grouping_target, + parse->groupClause, + NIL, + dNumLocalGroups); + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } - foreach(lc2, wc->partitionClause) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); + /* step 2*/ + path = create_redistribute_grouping_path(root, parse, path); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif - sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); - } - foreach(lc2, wc->orderClause) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); + else + try_distributed_aggregation = false; - sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); - } - } +#ifdef __TBASE__ + if(try_redistribute_grouping) + { + /* + * do final grouping at each datanode + */ - /* Add in sortgroupref numbers of GROUP BY clauses, too */ - foreach(lc, parse->groupClause) - { - SortGroupClause *grpcl = (SortGroupClause *) lfirst(lc); + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We generate two paths, differing in the second phase + * implementation (sort and hash). + */ + Path *remote_path = path; - sgrefs = bms_add_member(sgrefs, grpcl->tleSortGroupRef); - } + if (parse->groupClause) + { + if (!is_sorted || root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } - /* - * Construct a target containing all the non-flattenable targetlist items, - * and save aside the others for a moment. - */ - input_target = create_empty_pathtarget(); - flattenable_cols = NIL; + path = (Path *)create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(final_target, i); + //path->parallel_safe = true; - /* - * Don't want to deconstruct window clauses or GROUP BY items. (Note - * that such items can't contain window functions, so it's okay to - * compute them below the WindowAgg nodes.) - */ - if (sgref != 0 && bms_is_member(sgref, sgrefs)) - { - /* - * Don't want to deconstruct this value, so add it to the input - * target as-is. - */ - add_column_to_pathtarget(input_target, expr, sgref); - } - else - { - /* - * Column is to be flattened, so just remember the expression for - * later call to pull_var_clause. - */ - flattenable_cols = lappend(flattenable_cols, expr); - } + add_path(grouped_rel, path); - i++; - } + if (can_hash) + { + path = (Path *) + create_agg_path(root, + grouped_rel, + remote_path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + //path->parallel_safe = true; + if (g_hybrid_hash_agg) + { + AggPath *agg = (AggPath *)path; + agg->hybrid = true; + } - /* - * Pull out all the Vars and Aggrefs mentioned in flattenable columns, and - * add them to the input target if not already present. (Some might be - * there already because they're used directly as window/group clauses.) 
- * - * Note: it's essential to use PVC_INCLUDE_AGGREGATES here, so that any - * Aggrefs are placed in the Agg node's tlist and not left to be computed - * at higher levels. On the other hand, we should recurse into - * WindowFuncs to make sure their input expressions are available. - */ - flattenable_vars = pull_var_clause((Node *) flattenable_cols, - PVC_INCLUDE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, flattenable_vars); - - /* clean up cruft */ - list_free(flattenable_vars); - list_free(flattenable_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, input_target); -} + add_path(grouped_rel, path); + } + } + else if (parse->groupClause) + { + if (!is_sorted || root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } -/* - * make_pathkeys_for_window - * Create a pathkeys list describing the required input ordering - * for the given WindowClause. - * - * The required ordering is first the PARTITION keys, then the ORDER keys. - * In the future we might try to implement windowing using hashing, in which - * case the ordering could be relaxed, but for now we always sort. - * - * Caution: if you change this, see createplan.c's get_column_info_for_window! - */ -static List * -make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, - List *tlist) -{ - List *window_pathkeys; - List *window_sortclauses; + path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); - /* Throw error if can't sort */ - if (!grouping_is_sortable(wc->partitionClause)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement window PARTITION BY"), - errdetail("Window partitioning columns must be of sortable datatypes."))); - if (!grouping_is_sortable(wc->orderClause)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement window ORDER BY"), - errdetail("Window ordering columns must be of sortable datatypes."))); - - /* Okay, make the combined pathkeys */ - window_sortclauses = list_concat(list_copy(wc->partitionClause), - list_copy(wc->orderClause)); - window_pathkeys = make_pathkeys_for_sortclauses(root, - window_sortclauses, - tlist); - list_free(window_sortclauses); - return window_pathkeys; -} + //path->parallel_safe = true; -/* - * make_sort_input_target - * Generate appropriate PathTarget for initial input to Sort step. - * - * If the query has ORDER BY, this function chooses the target to be computed - * by the node just below the Sort (and DISTINCT, if any, since Unique can't - * project) steps. This might or might not be identical to the query's final - * output target. - * - * The main argument for keeping the sort-input tlist the same as the final - * is that we avoid a separate projection node (which will be needed if - * they're different, because Sort can't project). However, there are also - * advantages to postponing tlist evaluation till after the Sort: it ensures - * a consistent order of evaluation for any volatile functions in the tlist, - * and if there's also a LIMIT, we can stop the query without ever computing - * tlist functions for later rows, which is beneficial for both volatile and - * expensive functions. 
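Condensed sketch of the two-phase plan shape wired up above when the grouping cannot be pushed down whole. It reuses the variable names of add_paths_to_grouping_rel() and is illustrative rather than literal patch code; the real code also inserts Sort nodes where AGG_SORTED needs sorted input and falls back to a GroupPath when there is GROUP BY without aggregates.

/* phase 1: partial aggregation, executed locally on every datanode */
path = (Path *) create_agg_path(root, grouped_rel, path,
                                local_grouping_target,
                                parse->groupClause ? AGG_SORTED : AGG_PLAIN,
                                AGGSPLIT_INITIAL_SERIAL,
                                parse->groupClause, NIL,
                                &agg_partial_costs, dNumLocalGroups);

/* redistribute the partial results by the grouping key */
path = create_redistribute_grouping_path(root, parse, path);

/* phase 2: combine and finalize the partial results */
path = (Path *) create_agg_path(root, grouped_rel, path, target,
                                parse->groupClause ? AGG_SORTED : AGG_PLAIN,
                                AGGSPLIT_FINAL_DESERIAL,
                                parse->groupClause, havingQual,
                                agg_final_costs, dNumGroups);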
- * - * Our current policy is to postpone volatile expressions till after the sort - * unconditionally (assuming that that's possible, ie they are in plain tlist - * columns and not ORDER BY/GROUP BY/DISTINCT columns). We also prefer to - * postpone set-returning expressions, because running them beforehand would - * bloat the sort dataset, and because it might cause unexpected output order - * if the sort isn't stable. However there's a constraint on that: all SRFs - * in the tlist should be evaluated at the same plan step, so that they can - * run in sync in nodeProjectSet. So if any SRFs are in sort columns, we - * mustn't postpone any SRFs. (Note that in principle that policy should - * probably get applied to the group/window input targetlists too, but we - * have not done that historically.) Lastly, expensive expressions are - * postponed if there is a LIMIT, or if root->tuple_fraction shows that - * partial evaluation of the query is possible (if neither is true, we expect - * to have to evaluate the expressions for every row anyway), or if there are - * any volatile or set-returning expressions (since once we've put in a - * projection at all, it won't cost any more to postpone more stuff). - * - * Another issue that could potentially be considered here is that - * evaluating tlist expressions could result in data that's either wider - * or narrower than the input Vars, thus changing the volume of data that - * has to go through the Sort. However, we usually have only a very bad - * idea of the output width of any expression more complex than a Var, - * so for now it seems too risky to try to optimize on that basis. - * - * Note that if we do produce a modified sort-input target, and then the - * query ends up not using an explicit Sort, no particular harm is done: - * we'll initially use the modified target for the preceding path nodes, - * but then change them to the final target with apply_projection_to_path. - * Moreover, in such a case the guarantees about evaluation order of - * volatile functions still hold, since the rows are sorted already. - * - * This function has some things in common with make_group_input_target and - * make_window_input_target, though the detailed rules for what to do are - * different. We never flatten/postpone any grouping or ordering columns; - * those are needed before the sort. If we do flatten a particular - * expression, we leave Aggref and WindowFunc nodes alone, since those were - * computed earlier. - * - * 'final_target' is the query's final target list (in PathTarget form) - * 'have_postponed_srfs' is an output argument, see below - * - * The result is the PathTarget to be computed by the plan node immediately - * below the Sort step (and the Distinct step, if any). This will be - * exactly final_target if we decide a projection step wouldn't be helpful. - * - * In addition, *have_postponed_srfs is set to TRUE if we choose to postpone - * any set-returning functions to after the Sort. - */ -static PathTarget * -make_sort_input_target(PlannerInfo *root, - PathTarget *final_target, - bool *have_postponed_srfs) -{// #lizard forgives - Query *parse = root->parse; - PathTarget *input_target; - int ncols; - bool *col_is_srf; - bool *postpone_col; - bool have_srf; - bool have_volatile; - bool have_expensive; - bool have_srf_sortcols; - bool postpone_srfs; - List *postponable_cols; - List *postponable_vars; - int i; - ListCell *lc; + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + add_path(grouped_rel, path); - /* Shouldn't get here unless query has ORDER BY */ - Assert(parse->sortClause); + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } + } + else + { +#endif - *have_postponed_srfs = false; /* default result */ + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + consider_groupingsets_paths(root, grouped_rel, + path, true, can_hash, target, + gd, agg_costs, dNumGroups); + } + else if (parse->hasAggs) + { +#ifdef __TBASE__ + bool parallel_aware = false; + bool parallel_safe = false; + Path *agg_path = NULL; - /* Inspect tlist and collect per-column information */ - ncols = list_length(final_target->exprs); - col_is_srf = (bool *) palloc0(ncols * sizeof(bool)); - postpone_col = (bool *) palloc0(ncols * sizeof(bool)); - have_srf = have_volatile = have_expensive = have_srf_sortcols = false; + if (root->group_pathkeys && olap_optimizer && + !has_cold_hot_table && agg_costs->hasOnlyDistinct) + { + if (!pathkeys_contained_in(root->group_pathkeys, + path->pathkeys)) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); + if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) + { + SortPath *pathnode = (SortPath *)path; + + if (pathnode->subpath->pathtype == T_Gather || agg_costs->hasOnlyDistinct) + { + path->parallel_aware = true; + parallel_aware = true; + parallel_safe = true; + } + } + + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups); + agg_path->parallel_aware = parallel_aware; + agg_path->parallel_safe = parallel_safe; + + add_path(grouped_rel, agg_path); +#else + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ + add_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); +#endif + } + else if (parse->groupClause) + { +#ifdef __TBASE__ + bool parallel_aware = false; + bool parallel_safe = false; + Path *group_path = NULL; + + if (root->group_pathkeys && olap_optimizer && + !has_cold_hot_table && agg_costs->hasOnlyDistinct) + { + if (!pathkeys_contained_in(root->group_pathkeys, + path->pathkeys)) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } + + if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) + { + SortPath *pathnode = (SortPath *)path; + + if (pathnode->subpath->pathtype == T_Gather) + { + path->parallel_aware = true; + parallel_aware = true; + parallel_safe = true; + } + } + + group_path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); + group_path->parallel_aware = parallel_aware; + group_path->parallel_safe = parallel_safe; + add_path(grouped_rel, group_path); +#else + + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + add_path(grouped_rel, (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups)); +#endif + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } +#ifdef __TBASE__ + } +#endif + } + } /* - * If the column has a sortgroupref, assume it has to be evaluated - * before sorting. Generally such columns would be ORDER BY, GROUP - * BY, etc targets. One exception is columns that were removed from - * GROUP BY by remove_useless_groupby_columns() ... but those would - * only be Vars anyway. There don't seem to be any cases where it - * would be worth the trouble to double-check. + * Now generate a complete GroupAgg Path atop of the cheapest partial + * path. We can do this using either Gather or Gather Merge. + */ + if (grouped_rel->partial_pathlist) + { +#ifdef __TBASE__ + bool redistribute_group PG_USED_FOR_ASSERTS_ONLY = false; +#endif + Path *path = (Path *) linitial(grouped_rel->partial_pathlist); +#ifdef __TBASE__ + double total_groups = 0; + + if (olap_optimizer && !has_cold_hot_table) + { + total_groups = path->rows; + } + else + total_groups = path->rows * path->parallel_workers; +#else + double total_groups = path->rows * path->parallel_workers; +#endif + path = (Path *) create_gather_path(root, + grouped_rel, + path, + partial_grouping_target, + NULL, + &total_groups); + /* + * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. */ - if (get_pathtarget_sortgroupref(final_target, i) == 0) - { +#ifdef __TBASE__ + if (!olap_optimizer || has_cold_hot_table) + { +#endif + if (root->group_pathkeys) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); +#ifdef __TBASE__ + } +#endif /* - * Check for SRF or volatile functions. Check the SRF case first - * because we must know whether we have any postponed SRFs. + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + * + * XXX Keep this after the Sort node, to make the path sorted. */ - if (parse->hasTargetSRFs && - expression_returns_set((Node *) expr)) - { - /* We'll decide below whether these are postponable */ - col_is_srf[i] = true; - have_srf = true; - } - else if (contain_volatile_functions((Node *) expr)) - { - /* Unconditionally postpone */ - postpone_col[i] = true; - have_volatile = true; - } - else + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ { - /* - * Else check the cost. XXX it's annoying to have to do this - * when set_pathtarget_cost_width() just did it. Refactor to - * allow sharing the work? - */ - QualCost cost; + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + redistribute_group = true; + } + else + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif - cost_qual_eval_node(&cost, (Node *) expr, root); + else + try_distributed_aggregation = false; +#ifdef __TBASE__ /* - * We arbitrarily define "expensive" as "more than 10X - * cpu_operator_cost". Note this will take in any PL function - * with default cost. 
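Illustrative arithmetic behind the total_groups estimate passed to create_gather_path() above; the helper name is hypothetical, the real code computes this inline.

static double
gather_total_groups(double partial_path_rows, int parallel_workers,
                    bool olap_optimizer, bool has_cold_hot_table)
{
    /*
     * Under the TBase OLAP optimizer (and without cold/hot tables) the
     * partial path's row estimate is taken as-is; otherwise it is scaled
     * by the number of parallel workers feeding the Gather.
     */
    if (olap_optimizer && !has_cold_hot_table)
        return partial_path_rows;

    return partial_path_rows * parallel_workers;
}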
+ * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. */ - if (cost.per_tuple > 10 * cpu_operator_cost) - { - postpone_col[i] = true; - have_expensive = true; + if (olap_optimizer && !has_cold_hot_table) + { + if (root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + path->parallel_aware = true; } } - } - else - { - /* For sortgroupref cols, just check if any contain SRFs */ - if (!have_srf_sortcols && - parse->hasTargetSRFs && - expression_returns_set((Node *) expr)) - have_srf_sortcols = true; - } - - i++; - } - - /* - * We can postpone SRFs if we have some but none are in sortgroupref cols. - */ - postpone_srfs = (have_srf && !have_srf_sortcols); - - /* - * If we don't need a post-sort projection, just return final_target. - */ - if (!(postpone_srfs || have_volatile || - (have_expensive && - (parse->limitCount || root->tuple_fraction > 0)))) - return final_target; - - /* - * Report whether the post-sort projection will contain set-returning - * functions. This is important because it affects whether the Sort can - * rely on the query's LIMIT (if any) to bound the number of rows it needs - * to return. - */ - *have_postponed_srfs = postpone_srfs; - - /* - * Construct the sort-input target, taking all non-postponable columns and - * then adding Vars, PlaceHolderVars, Aggrefs, and WindowFuncs found in - * the postponable ones. - */ - input_target = create_empty_pathtarget(); - postponable_cols = NIL; - - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - - if (postpone_col[i] || (postpone_srfs && col_is_srf[i])) - postponable_cols = lappend(postponable_cols, expr); - else - add_column_to_pathtarget(input_target, expr, - get_pathtarget_sortgroupref(final_target, i)); +#endif + if (parse->hasAggs) + { + Path *agg_path = NULL; + + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_safe = true; + agg_path->parallel_aware = true; + } - i++; - } + add_path(grouped_rel, agg_path); + } + else + { + Path *group_path = NULL; + + group_path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + group_path->parallel_safe = true; + group_path->parallel_aware = true; + } - /* - * Pull out all the Vars, Aggrefs, and WindowFuncs mentioned in - * postponable columns, and add them to the sort-input target if not - * already present. (Some might be there already.) We mustn't - * deconstruct Aggrefs or WindowFuncs here, since the projection node - * would be unable to recompute them. - */ - postponable_vars = pull_var_clause((Node *) postponable_cols, - PVC_INCLUDE_AGGREGATES | - PVC_INCLUDE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, postponable_vars); - - /* clean up cruft */ - list_free(postponable_vars); - list_free(postponable_cols); - - /* XXX this represents even more redundant cost calculation ... 
*/ - return set_pathtarget_cost_width(root, input_target); -} + add_path(grouped_rel, group_path); + } -/* - * get_cheapest_fractional_path - * Find the cheapest path for retrieving a specified fraction of all - * the tuples expected to be returned by the given relation. - * - * We interpret tuple_fraction the same way as grouping_planner. - * - * We assume set_cheapest() has been run on the given rel. - */ -Path * -get_cheapest_fractional_path(RelOptInfo *rel, double tuple_fraction) -{ - Path *best_path = rel->cheapest_total_path; - ListCell *l; + /* + * The point of using Gather Merge rather than Gather is that it + * can preserve the ordering of the input path, so there's no + * reason to try it unless (1) it's possible to produce more than + * one output row and (2) we want the output path to be ordered. + */ + if (parse->groupClause != NIL && root->group_pathkeys != NIL) + { + foreach(lc, grouped_rel->partial_pathlist) + { + Path *subpath = (Path *) lfirst(lc); + Path *gmpath; + double total_groups; - /* If all tuples will be retrieved, just return the cheapest-total path */ - if (tuple_fraction <= 0.0) - return best_path; + /* + * It's useful to consider paths that are already properly + * ordered for Gather Merge, because those don't need a + * sort. It's also useful to consider the cheapest path, + * because sorting it in parallel and then doing Gather + * Merge may be better than doing an unordered Gather + * followed by a sort. But there's no point in + * considering non-cheapest paths that aren't already + * sorted correctly. + */ + if (path != subpath && + !pathkeys_contained_in(root->group_pathkeys, + subpath->pathkeys)) + continue; - /* Convert absolute # of tuples to a fraction; no need to clamp to 0..1 */ - if (tuple_fraction >= 1.0 && best_path->rows > 0) - tuple_fraction /= best_path->rows; +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + total_groups = subpath->rows; + else +#endif + total_groups = subpath->rows * subpath->parallel_workers; - foreach(l, rel->pathlist) - { - Path *path = (Path *) lfirst(l); - if (path == rel->cheapest_total_path || - compare_fractional_path_costs(best_path, path, tuple_fraction) <= 0) - continue; +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + gmpath = (Path *) create_gather_path(root, + grouped_rel, + subpath, + partial_grouping_target, + NULL, + &total_groups); + } + else +#endif + gmpath = (Path *) + create_gather_merge_path(root, + grouped_rel, + subpath, + partial_grouping_target, + root->group_pathkeys, + NULL, + &total_groups); - best_path = path; - } + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + */ + redistribute_group = false; - return best_path; -} + if (! can_push_down_grouping(root, parse, gmpath)) +#ifdef __TBASE__ + { + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + gmpath = create_redistribute_grouping_path(root, parse, gmpath); + redistribute_group = true; + } + else + { + gmpath = create_remotesubplan_path(root, gmpath, NULL); + } + } +#else + gmpath = create_remotesubplan_path(root, gmpath, NULL); +#endif -/* - * adjust_paths_for_srfs - * Fix up the Paths of the given upperrel to handle tSRFs properly. - * - * The executor can only handle set-returning functions that appear at the - * top level of the targetlist of a ProjectSet plan node. 
If we have any SRFs - * that are not at top level, we need to split up the evaluation into multiple - * plan levels in which each level satisfies this constraint. This function - * modifies each Path of an upperrel that (might) compute any SRFs in its - * output tlist to insert appropriate projection steps. - * - * The given targets and targets_contain_srfs lists are from - * split_pathtarget_at_srfs(). We assume the existing Paths emit the first - * target in targets. - */ -static void -adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, - List *targets, List *targets_contain_srfs) -{ - ListCell *lc; +#ifdef __TBASE__ + /* + * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. + */ + if (olap_optimizer && !has_cold_hot_table) + { + if (root->group_pathkeys) + { + gmpath = (Path *) create_sort_path(root, + grouped_rel, + gmpath, + root->group_pathkeys, + -1.0); - Assert(list_length(targets) == list_length(targets_contain_srfs)); - Assert(!linitial_int(targets_contain_srfs)); + gmpath->parallel_aware = true; + } + } +#endif + if (parse->hasAggs) + { + Path *agg_path = NULL; - /* If no SRFs appear at this plan level, nothing to do */ - if (list_length(targets) == 1) - return; + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + gmpath, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_safe = true; + agg_path->parallel_aware = true; + } - /* - * Stack SRF-evaluation nodes atop each path for the rel. - * - * In principle we should re-run set_cheapest() here to identify the - * cheapest path, but it seems unlikely that adding the same tlist eval - * costs to all the paths would change that, so we don't bother. Instead, - * just assume that the cheapest-startup and cheapest-total paths remain - * so. (There should be no parameterized paths anymore, so we needn't - * worry about updating cheapest_parameterized_paths.) 
- */ - foreach(lc, rel->pathlist) - { - Path *subpath = (Path *) lfirst(lc); - Path *newpath = subpath; - ListCell *lc1, - *lc2; + add_path(grouped_rel, agg_path); + } + else + { + Path *group_path = NULL; + + group_path = (Path *) + create_group_path(root, + grouped_rel, + gmpath, + target, + parse->groupClause, + havingQual, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + group_path->parallel_safe = true; + group_path->parallel_aware = true; + } - Assert(subpath->param_info == NULL); - forboth(lc1, targets, lc2, targets_contain_srfs) - { - PathTarget *thistarget = (PathTarget *) lfirst(lc1); - bool contains_srfs = (bool) lfirst_int(lc2); - - /* If this level doesn't contain SRFs, do regular projection */ - if (contains_srfs) - newpath = (Path *) create_set_projection_path(root, - rel, - newpath, - thistarget); - else - newpath = (Path *) apply_projection_to_path(root, - rel, - newpath, - thistarget); + add_path(grouped_rel, group_path); + } + } + } } - lfirst(lc) = newpath; - if (subpath == rel->cheapest_startup_path) - rel->cheapest_startup_path = newpath; - if (subpath == rel->cheapest_total_path) - rel->cheapest_total_path = newpath; } - /* Likewise for partial paths, if any */ - foreach(lc, rel->partial_pathlist) + if (can_hash) { - Path *subpath = (Path *) lfirst(lc); - Path *newpath = subpath; - ListCell *lc1, - *lc2; + Size hashaggtablesize; - Assert(subpath->param_info == NULL); - forboth(lc1, targets, lc2, targets_contain_srfs) + if (parse->groupingSets) { - PathTarget *thistarget = (PathTarget *) lfirst(lc1); - bool contains_srfs = (bool) lfirst_int(lc2); - - /* If this level doesn't contain SRFs, do regular projection */ - if (contains_srfs) - newpath = (Path *) create_set_projection_path(root, - rel, - newpath, - thistarget); - else + /* + * Try for a hash-only groupingsets path over unsorted input. + */ + consider_groupingsets_paths(root, grouped_rel, + cheapest_path, false, true, target, + gd, agg_costs, dNumGroups); + } + else + { + hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + agg_costs, + dNumGroups); + + /* + * Provided that the estimated size of the hashtable does not + * exceed work_mem, we'll generate a HashAgg Path, although if we + * were unable to sort above, then we'd better generate a Path, so + * that we at least have one. + */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || + grouped_rel->pathlist == NIL) +#else + if (hashaggtablesize < work_mem * 1024L || + grouped_rel->pathlist == NIL) +#endif { - /* avoid apply_projection_to_path, in case of multiple refs */ - newpath = (Path *) create_projection_path(root, - rel, - newpath, - thistarget); - } - } - lfirst(lc) = newpath; - } -} + /* Don't mess with the cheapest path directly. */ + Path *path = cheapest_path; +#ifdef __TBASE__ + bool try_redistribute_grouping = false; +#endif -/* - * expression_planner - * Perform planner's transformations on a standalone expression. - * - * Various utility commands need to evaluate expressions that are not part - * of a plannable query. They can do so using the executor's regular - * expression-execution machinery, but first the expression has to be fed - * through here to transform it from parser output to something executable. - * - * Currently, we disallow sublinks in standalone expressions, so there's no - * real "planning" involved here. (That might not always be true though.) 
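Self-contained restatement of the hash-aggregation gate applied above: work_mem is expressed in kilobytes while the hash-table estimate is in bytes, hence the multiplication by 1024. In this TBase build an oversized estimate is still let through when hybrid (spillable) hash aggregation is enabled, or when no other grouped path exists at all. The helper name is hypothetical.

#include <stdbool.h>

static bool
hashagg_path_allowed(long hashtable_size_bytes, int work_mem_kb,
                     bool hybrid_hash_agg, bool have_other_grouped_path)
{
    return hashtable_size_bytes < (long) work_mem_kb * 1024L ||
           hybrid_hash_agg ||
           !have_other_grouped_path;
}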
- * What we must do is run eval_const_expressions to ensure that any function - * calls are converted to positional notation and function default arguments - * get inserted. The fact that constant subexpressions get simplified is a - * side-effect that is useful when the expression will get evaluated more than - * once. Also, we must fix operator function IDs. - * - * Note: this must not make any damaging changes to the passed-in expression - * tree. (It would actually be okay to apply fix_opfuncids to it, but since - * we first do an expression_tree_mutator-based walk, what is returned will - * be a new node tree.) - */ -Expr * -expression_planner(Expr *expr) -{ - Node *result; + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) +#ifdef XCP + { + /* some special aggs cannot be parallel executed, such as count(distinct) */ + if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || + path->pathtype == T_Agg || path->pathtype == T_Group || + !olap_optimizer || has_cold_hot_table) + { + if (agg_costs->hasOnlyDistinct && olap_optimizer && !has_cold_hot_table) + path = create_redistribute_grouping_path(root, parse, path); + else + path = create_remotesubplan_path(root, path, NULL); + } + else + { + /* + * If the grouping can not be fully pushed down, we adopt another + * strategy instead. + * 1. do grouping on each datanode locally + * 2. re-distribute grouping results among datanodes, then do the + * final grouping + */ + AggClauseCosts hashagg_partial_costs; + PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - /* - * Convert named-argument function calls, insert default arguments and - * simplify constant subexprs - */ - result = eval_const_expressions(NULL, (Node *) expr); + /* Estimate number of partial groups. */ + double dNumLocalGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); + try_redistribute_grouping = true; - /* Fill in opfuncid values if missing */ - fix_opfuncids(result); + MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); - return (Expr *) result; -} + get_agg_clause_costs(root, (Node *) local_grouping_target->exprs, + AGGSPLIT_INITIAL_SERIAL, + &hashagg_partial_costs); + + /* step 1 */ + path = (Path *) create_agg_path(root, + grouped_rel, + cheapest_path, + local_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &hashagg_partial_costs, + dNumLocalGroups); + +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)path; + aggpath->hybrid = true; + } +#endif + + /* step 2 */ + path = create_redistribute_grouping_path(root, parse, path); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + else + try_distributed_aggregation = false; /* - * plan_cluster_use_sort - * Use the planner to decide how CLUSTER should implement sorting - * - * tableOid is the OID of a table to be clustered on its index indexOid - * (which is already known to be a btree index). Decide whether it's - * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER. - * Return TRUE to use sorting, FALSE to use an indexscan. - * - * Note: caller had better already hold some type of lock on the table. 
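Condensed sketch of the hashed two-phase variant built above, reusing the patch's variable names for illustration only: the partial phase hashes locally on each datanode (and is marked hybrid when the size estimate exceeds work_mem, so it may spill), the partial results are redistributed by the grouping key, and a second hashed phase finalizes them using costs gathered under AGGSPLIT_FINAL_DESERIAL.

/* phase 1: local hashed partial aggregation on every datanode */
path = (Path *) create_agg_path(root, grouped_rel, cheapest_path,
                                local_grouping_target,
                                AGG_HASHED, AGGSPLIT_INITIAL_SERIAL,
                                parse->groupClause, NIL,
                                &hashagg_partial_costs, dNumLocalGroups);

/* mark as hybrid when the estimated hash table exceeds work_mem */
if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg)
    ((AggPath *) path)->hybrid = true;

/* redistribute partial results by the grouping key, then finalize */
path = create_redistribute_grouping_path(root, parse, path);
path = (Path *) create_agg_path(root, grouped_rel, path, target,
                                AGG_HASHED, AGGSPLIT_FINAL_DESERIAL,
                                parse->groupClause, havingQual,
                                &hashagg_final_costs, dNumGroups);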
+ * We just need an Agg over the cheapest-total input path, + * since input order won't matter. */ -bool -plan_cluster_use_sort(Oid tableOid, Oid indexOid) -{ - PlannerInfo *root; - Query *query; - PlannerGlobal *glob; - RangeTblEntry *rte; - RelOptInfo *rel; - IndexOptInfo *indexInfo; - QualCost indexExprCost; - Cost comparisonCost; - Path *seqScanPath; - Path seqScanAndSortPath; - IndexPath *indexScanPath; - ListCell *lc; +#ifdef __TBASE__ + if(try_redistribute_grouping) + { + AggClauseCosts hashagg_final_costs; + Path *agg_path; - /* We can short-circuit the cost comparison if indexscans are disabled */ - if (!enable_indexscan) - return true; /* use sort */ + MemSet(&hashagg_final_costs, 0, sizeof(AggClauseCosts)); - /* Set up mostly-dummy planner state */ - query = makeNode(Query); - query->commandType = CMD_SELECT; + get_agg_clause_costs(root, (Node *) target->exprs, + AGGSPLIT_FINAL_DESERIAL, + &hashagg_final_costs); + get_agg_clause_costs(root, parse->havingQual, + AGGSPLIT_FINAL_DESERIAL, + &hashagg_final_costs); - glob = makeNode(PlannerGlobal); + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + &hashagg_final_costs, + dNumGroups); +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; - root = makeNode(PlannerInfo); - root->parse = query; - root->glob = glob; - root->query_level = 1; - root->planner_cxt = CurrentMemoryContext; - root->wt_param_id = -1; - root->recursiveOk = true; + aggpath->hybrid = true; + } +#endif + //agg_path->parallel_safe = true; - /* Build a minimal RTE for the rel */ - rte = makeNode(RangeTblEntry); - rte->rtekind = RTE_RELATION; - rte->relid = tableOid; - rte->relkind = RELKIND_RELATION; /* Don't be too picky. */ - rte->lateral = false; - rte->inh = false; - rte->inFromCl = true; - query->rtable = list_make1(rte); - - /* Set up RTE/RelOptInfo arrays */ - setup_simple_rel_arrays(root); - - /* Build RelOptInfo */ - rel = build_simple_rel(root, 1, NULL); - - /* Locate IndexOptInfo for the target index */ - indexInfo = NULL; - foreach(lc, rel->indexlist) - { - indexInfo = (IndexOptInfo *) lfirst(lc); - if (indexInfo->indexoid == indexOid) - break; - } + add_path(grouped_rel, agg_path); + } + else + { + bool parallel_aware = false; + bool parallel_safe = false; + Path *agg_path = NULL; - /* - * It's possible that get_relation_info did not generate an IndexOptInfo - * for the desired index; this could happen if it's not yet reached its - * indcheckxmin usability horizon, or if it's a system index and we're - * ignoring system indexes. In such cases we should tell CLUSTER to not - * trust the index contents but use seqscan-and-sort. - */ - if (lc == NULL) /* not in the list? */ - return true; /* use sort */ + if ((path->pathtype == T_Gather || agg_costs->hasOnlyDistinct) && olap_optimizer + && !has_cold_hot_table) + { + parallel_safe = true; + parallel_aware = true; + } - /* - * Rather than doing all the pushups that would be needed to use - * set_baserel_size_estimates, just do a quick hack for rows and width. 
- */ - rel->rows = rel->tuples; - rel->reltarget->width = get_relation_data_width(tableOid, NULL); + agg_path = (Path *) + create_agg_path(root, grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups); + agg_path->parallel_aware = parallel_aware; + agg_path->parallel_safe = parallel_safe; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; + + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); + } +#else + add_path(grouped_rel, (Path *) + create_agg_path(root, grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); +#endif + } + } - root->total_table_pages = rel->pages; + /* + * Generate a HashAgg Path atop of the cheapest partial path. Once + * again, we'll only do this if it looks as though the hash table + * won't exceed work_mem. + */ + if (grouped_rel->partial_pathlist) + { + bool redistribute_group = false; + Path *path = (Path *) linitial(grouped_rel->partial_pathlist); - /* - * Determine eval cost of the index expressions, if any. We need to - * charge twice that amount for each tuple comparison that happens during - * the sort, since tuplesort.c will have to re-evaluate the index - * expressions each time. (XXX that's pretty inefficient...) - */ - cost_qual_eval(&indexExprCost, indexInfo->indexprs, root); - comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple); - - /* Estimate the cost of seq scan + sort */ - seqScanPath = create_seqscan_path(root, rel, NULL, 0); - cost_sort(&seqScanAndSortPath, root, NIL, - seqScanPath->total_cost, rel->tuples, rel->reltarget->width, - comparisonCost, maintenance_work_mem, -1.0); - - /* Estimate the cost of index scan */ - indexScanPath = create_index_path(root, indexInfo, - NIL, NIL, NIL, NIL, NIL, - ForwardScanDirection, false, - NULL, 1.0, false); - - return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); -} + hashaggtablesize = estimate_hashagg_tablesize(path, + agg_final_costs, + dNumGroups); +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) +#else + if (hashaggtablesize < work_mem * 1024L) +#endif + { +#ifdef __TBASE__ + double total_groups = 0; -/* - * grouping_distribution_match - * Check if the path distribution matches grouping distribution. - * - * Grouping preserves distribution if the distribution key is on of the - * grouping keys (arbitrary one). In that case it's guaranteed that groups - * on different nodes do not overlap, and we can push the aggregation to - * remote nodes as a whole. - * - * Otherwise we need to either fetch all the data to the coordinator and - * perform the aggregation there, or use two-phase aggregation, with the - * first phase (partial aggregation) pushed down, and the second phase - * (combining and finalizing the results) executed on the coordinator. - * - * XXX This is used not only for plain aggregation, but also for various - * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE). 
- */ -static bool -grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, - List *clauses) -{// #lizard forgives - int i; - bool matches_key = false; - Distribution *distribution = path->distribution; + if (olap_optimizer && !has_cold_hot_table) + { + total_groups = path->rows; + } + else + total_groups = path->rows * path->parallel_workers; +#else + double total_groups = path->rows * path->parallel_workers; +#endif + path = (Path *) create_gather_path(root, + grouped_rel, + path, + partial_grouping_target, + NULL, + &total_groups); + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ + { + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + redistribute_group = true; + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + else + try_distributed_aggregation = false; - int numGroupCols = list_length(clauses); - AttrNumber *groupColIdx = extract_grouping_cols(clauses, - parse->targetList); +#ifdef __TBASE__ + if (!redistribute_group) + { + Path *agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_aware = true; + agg_path->parallel_safe = true; + } +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; -#ifdef __COLD_HOT__ - if (has_cold_hot_table) - { - if (! path->distribution) - { - return true; - } + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); + } + else + { +#endif + Path *agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_aware = true; + agg_path->parallel_safe = true; + } +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; - return false; - } + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); +#ifdef __TBASE__ + } #endif + } + } + } +} /* - * With no explicit data distribution or replicated tables, we can simply - * push down the whole aggregation to the remote node, without any sort - * of redistribution. So consider this to be a match. + * add_partial_paths_to_grouping_rel + * + * Add partial paths to grouping relation. These paths are not fully + * aggregated; a FinalizeAggregate step is still required. 
*/ - if ((distribution == NULL) || - IsLocatorReplicated(distribution->distributionType)) - return true; +static void +add_partial_paths_to_grouping_rel(PlannerInfo *root, + RelOptInfo *input_rel, + RelOptInfo *grouped_rel, + PathTarget *target, + PathTarget *partial_grouping_target, + AggClauseCosts *agg_partial_costs, + AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, + bool can_sort, + bool can_hash, + List *havingQual) +{ + Query *parse = root->parse; + Path *cheapest_partial_path = linitial(input_rel->partial_pathlist); + Size hashaggtablesize; + double dNumPartialGroups = 0; + ListCell *lc; - /* But no distribution expression means 'no match'. */ - if (distribution->distributionExpr == NULL) - return false; + /* Estimate number of partial groups. */ + dNumPartialGroups = get_number_of_groups(root, + cheapest_partial_path->rows, + gd); + + if (can_sort) + { + /* This should have been checked previously */ + Assert(parse->hasAggs || parse->groupClause); /* - * With distributed data and table distributed using an expression, we - * need to check if the distribution expression matches one of the - * grouping keys (arbitrary one). + * Use any available suitably-sorted path as input, and also consider + * sorting the cheapest partial path. */ - for (i = 0; i < numGroupCols; i++) + foreach(lc, input_rel->partial_pathlist) { - TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, - groupColIdx[i]-1); + Path *path = (Path *) lfirst(lc); + bool is_sorted; - if (equal(te->expr, distribution->distributionExpr)) + is_sorted = pathkeys_contained_in(root->group_pathkeys, + path->pathkeys); + if (path == cheapest_partial_path || is_sorted) { - matches_key = true; - break; + /* Sort the cheapest partial path, if it isn't already */ + if (!is_sorted) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + + if (parse->hasAggs) + add_partial_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + agg_partial_costs, + dNumPartialGroups)); + else + add_partial_path(grouped_rel, (Path *) + create_group_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause, + NIL, + dNumPartialGroups)); } } - - return matches_key; } -/* - * get_partitioned_child_rels - * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. Also sets - * *part_cols_updated to true if any of the root rte's updated - * columns is used in the partition key either of the relation whose RTI - * is specified or of any child relation. - * - * Note: This function might get called even for range table entries that - * are not partitioned tables; in such a case, it will simply return NIL. - */ -List * -get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated) + if (can_hash) { - List *result = NIL; - ListCell *l; + /* Checked above */ + Assert(parse->hasAggs || parse->groupClause); - if (part_cols_updated) - *part_cols_updated = false; + hashaggtablesize = + estimate_hashagg_tablesize(cheapest_partial_path, + agg_partial_costs, + dNumPartialGroups); - foreach(l, root->pcinfo_list) + /* + * Tentatively produce a partial HashAgg Path, depending on if it + * looks as if the hash table will fit in work_mem. 
+ */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) +#else + if (hashaggtablesize < work_mem * 1024L) +#endif { - PartitionedChildRelInfo *pc = lfirst(l); - - if (pc->parent_relid == rti) + AggPath *aggpath = (AggPath *) + create_agg_path(root, + grouped_rel, + cheapest_partial_path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + agg_partial_costs, + dNumPartialGroups); +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L) { - result = pc->child_rels; - if (part_cols_updated) - *part_cols_updated = pc->part_cols_updated; - break; + aggpath->hybrid = true; + } +#endif + add_partial_path(grouped_rel, (Path *)aggpath); } } - - return result; } - /* - * get_partitioned_child_rels_for_join - * Build and return a list containing the RTI of every partitioned - * relation which is a child of some rel included in the join. + * can_parallel_agg + * + * Determines whether or not parallel grouping and/or aggregation is possible. + * Returns true when possible, false otherwise. */ -List * -get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) +static bool +can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs) { - List *result = NIL; - ListCell *l; + Query *parse = root->parse; - foreach(l, root->pcinfo_list) + if (!grouped_rel->consider_parallel) { - PartitionedChildRelInfo *pc = lfirst(l); - - if (bms_is_member(pc->parent_relid, join_relids)) - result = list_concat(result, list_copy(pc->child_rels)); + /* Not even parallel-safe. */ + return false; } - - return result; + else if (input_rel->partial_pathlist == NIL) + { + /* Nothing to use as input for partial aggregate. */ + return false; + } + else if (!parse->hasAggs && parse->groupClause == NIL) + { + /* + * We don't know how to do parallel aggregation unless we have either + * some aggregates or a grouping clause. + */ + return false; + } + else if (parse->groupingSets) + { + /* We don't know how to do grouping sets in parallel. */ + return false; + } + else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) + { + /* Insufficient support for partial mode. */ + return false; } + /* Everything looks good. */ + return true; +} static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) From 1500a993bd9318d4e09353db29d6512fcbe0c32c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 21:53:18 +0800 Subject: [PATCH 239/578] Correctly assess parallel-safety of tlists when SRFs are used. 
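As a purely illustrative sketch (not taken from this patch or its regression tests): the code paths touched here decide whether a scan/join or grouping target list may be evaluated under a Gather. The parallel-safety of each target is now computed once, right where that target is built, and carried along as a flag, instead of being re-derived later from target lists that may already have been rewritten for set-returning-function evaluation. A query of roughly this shape exercises that decision; tenk1 and its columns are the regression-test table used elsewhere in this series, and generate_series stands in for any SRF in the target list:

    EXPLAIN (COSTS OFF)
    SELECT two, count(*), generate_series(1, 3)
    FROM tenk1
    GROUP BY two;

Whether a Gather actually appears in the resulting plan depends on the parallel cost settings in effect.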
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 52 +++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 640dcc4d..e2cf7b37 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -147,6 +147,7 @@ static Size estimate_hashagg_tablesize(Path *path, static RelOptInfo *create_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, @@ -162,6 +163,7 @@ static RelOptInfo *create_window_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *input_target, PathTarget *output_target, + bool output_target_parallel_safe, List *tlist, WindowFuncLists *wflists, List *activeWindows); @@ -178,6 +180,7 @@ static RelOptInfo *create_distinct_paths(PlannerInfo *root, static RelOptInfo *create_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, double limit_tuples); static PathTarget *make_group_input_target(PlannerInfo *root, PathTarget *final_target); @@ -1754,6 +1757,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, PathTarget *final_target; List *final_targets; List *final_targets_contain_srfs; + bool final_target_parallel_safe; RelOptInfo *current_rel; RelOptInfo *final_rel; ListCell *lc; @@ -1816,6 +1820,10 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, /* Also extract the PathTarget form of the setop result tlist */ final_target = current_rel->cheapest_total_path->pathtarget; + /* And check whether it's parallel safe */ + final_target_parallel_safe = + is_parallel_safe(root, (Node *) final_target->exprs); + /* The setop result tlist couldn't contain any SRFs */ Assert(!parse->hasTargetSRFs); final_targets = final_targets_contain_srfs = NIL; @@ -1847,12 +1855,15 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, PathTarget *sort_input_target; List *sort_input_targets; List *sort_input_targets_contain_srfs; + bool sort_input_target_parallel_safe; PathTarget *grouping_target; List *grouping_targets; List *grouping_targets_contain_srfs; + bool grouping_target_parallel_safe; PathTarget *scanjoin_target; List *scanjoin_targets; List *scanjoin_targets_contain_srfs; + bool scanjoin_target_parallel_safe; bool have_grouping; AggClauseCosts agg_costs; WindowFuncLists *wflists = NULL; @@ -1982,6 +1993,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * that were obtained within query_planner(). */ final_target = create_pathtarget(root, tlist); + final_target_parallel_safe = + is_parallel_safe(root, (Node *) final_target->exprs); /* * If ORDER BY was given, consider whether we should use a post-sort @@ -1989,11 +2002,18 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * so. */ if (parse->sortClause) + { sort_input_target = make_sort_input_target(root, final_target, &have_postponed_srfs); + sort_input_target_parallel_safe = + is_parallel_safe(root, (Node *) sort_input_target->exprs); + } else + { sort_input_target = final_target; + sort_input_target_parallel_safe = final_target_parallel_safe; + } /* * If we have window functions to deal with, the output from any @@ -2001,11 +2021,18 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * otherwise, it should be sort_input_target. 
*/ if (activeWindows) + { grouping_target = make_window_input_target(root, final_target, activeWindows); + grouping_target_parallel_safe = + is_parallel_safe(root, (Node *) grouping_target->exprs); + } else + { grouping_target = sort_input_target; + grouping_target_parallel_safe = sort_input_target_parallel_safe; + } /* * If we have grouping or aggregation to do, the topmost scan/join @@ -2015,9 +2042,16 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, have_grouping = (parse->groupClause || parse->groupingSets || parse->hasAggs || root->hasHavingQual); if (have_grouping) + { scanjoin_target = make_group_input_target(root, final_target); + scanjoin_target_parallel_safe = + is_parallel_safe(root, (Node *) grouping_target->exprs); + } else + { scanjoin_target = grouping_target; + scanjoin_target_parallel_safe = grouping_target_parallel_safe; + } /* * If there are any SRFs in the targetlist, we must separate each of @@ -2099,8 +2133,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * for partial paths. But only parallel-safe expressions can be * computed by partial paths. */ - if (current_rel->partial_pathlist && - is_parallel_safe(root, (Node *) scanjoin_target->exprs)) + if (current_rel->partial_pathlist && scanjoin_target_parallel_safe) { /* Apply the scan/join target to each partial path */ foreach(lc, current_rel->partial_pathlist) @@ -2161,6 +2194,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel = create_grouping_paths(root, current_rel, grouping_target, + grouping_target_parallel_safe, &agg_costs, gset_data); /* Fix things up if grouping_target contains SRFs */ @@ -2180,6 +2214,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel, grouping_target, sort_input_target, + sort_input_target_parallel_safe, tlist, wflists, activeWindows); @@ -2213,6 +2248,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel = create_ordered_paths(root, current_rel, final_target, + final_target_parallel_safe, have_postponed_srfs ? -1.0 : limit_tuples); /* Fix things up if final_target contains SRFs */ @@ -3910,6 +3946,7 @@ static RelOptInfo * create_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd) {// #lizard forgives @@ -3934,8 +3971,7 @@ create_grouping_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, it's parallel-safe if the * target list and HAVING quals are parallel-safe. */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs) && + if (input_rel->consider_parallel && target_parallel_safe && is_parallel_safe(root, (Node *) parse->havingQual)) grouped_rel->consider_parallel = true; @@ -5016,6 +5052,7 @@ create_window_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *input_target, PathTarget *output_target, + bool output_target_parallel_safe, List *tlist, WindowFuncLists *wflists, List *activeWindows) @@ -5031,8 +5068,7 @@ create_window_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, we need to examine the * target list and active windows for non-parallel-safe constructs. 
*/ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) output_target->exprs) && + if (input_rel->consider_parallel && output_target_parallel_safe && is_parallel_safe(root, (Node *) activeWindows)) window_rel->consider_parallel = true; @@ -5433,6 +5469,7 @@ static RelOptInfo * create_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, double limit_tuples) { Path *cheapest_input_path = input_rel->cheapest_total_path; @@ -5447,8 +5484,7 @@ create_ordered_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, it's parallel-safe if the * target list is parallel-safe. */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs)) + if (input_rel->consider_parallel && target_parallel_safe) ordered_rel->consider_parallel = true; /* From 03bb7b4fdf67123a740a34947cd59a2c9623925f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 10:30:39 +0800 Subject: [PATCH 240/578] Let Parallel Append over simple UNION ALL have partial subpaths. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 22 +++++++ src/backend/optimizer/plan/planner.c | 16 +++++ src/backend/optimizer/plan/subselect.c | 17 ++++- src/test/regress/expected/select_parallel.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_1.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_2.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_3.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_4.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_5.out | 65 +++++++++++++++++++ src/test/regress/sql/select_parallel.sql | 25 +++++++ 10 files changed, 468 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 947c75f3..f5516316 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2106,6 +2106,28 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, pathkeys, required_outer, distribution)); } + + /* If consider_parallel is false, there should be no partial paths. */ + Assert(sub_final_rel->consider_parallel || + sub_final_rel->partial_pathlist == NIL); + + /* Same for partial paths. */ + foreach(lc, sub_final_rel->partial_pathlist) + { + Path *subpath = (Path *) lfirst(lc); + List *pathkeys; + + /* Convert subpath's pathkeys to outer representation */ + pathkeys = convert_subquery_pathkeys(root, + rel, + subpath->pathkeys, + make_tlist_from_pathtarget(subpath->pathtarget)); + + /* Generate outer path using this subpath */ + add_partial_path(rel, (Path *) + create_subqueryscan_path(root, rel, subpath, + pathkeys, required_outer)); + } } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e2cf7b37..aa85504d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2432,6 +2432,22 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * Generate partial paths for final_rel, too, if outer query levels might + * be able to make use of them. 
+ */ + if (final_rel->consider_parallel && root->query_level > 1 && + !limit_needed(parse)) + { + Assert(!parse->rowMarks && parse->commandType == CMD_SELECT); + foreach(lc, current_rel->partial_pathlist) + { + Path *partial_path = (Path *) lfirst(lc); + + add_partial_path(final_rel, partial_path); + } + } + + /* * If there is an FDW that's responsible for all baserels of the query, * let it consider adding ForeignPaths. */ diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 98ed5c26..bcdbe3da 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -4934,6 +4934,13 @@ SS_charge_for_initplans(PlannerInfo *root, RelOptInfo *final_rel) path->parallel_safe = false; } + /* + * Forget about any partial paths and clear consider_parallel, too; + * they're not usable if we attached an initPlan. + */ + final_rel->partial_pathlist = NIL; + final_rel->consider_parallel = false; + /* We needn't do set_cheapest() here, caller will do it */ } @@ -5134,10 +5141,16 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; + Bitmapset *subquery_params; - /* We must run SS_finalize_plan on the subquery */ + /* We must run finalize_plan on the subquery */ rel = find_base_rel(root, sscan->scan.scanrelid); - SS_finalize_plan(rel->subroot, sscan->subplan); + subquery_params = rel->subroot->outer_params; + if (gather_param >= 0) + subquery_params = bms_add_member(bms_copy(subquery_params), + gather_param); + finalize_plan(rel->subroot, sscan->subplan, gather_param, + subquery_params, NULL); /* Now we can add its extParams to the parent's params */ context.paramids = bms_add_members(context.paramids, diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index ef57a7a7..69b17579 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -338,4 +338,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" CONTEXT: parallel worker +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 
tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_1.out b/src/test/regress/expected/select_parallel_1.out index 2f089381..0bc4ec2a 100644 --- a/src/test/regress/expected/select_parallel_1.out +++ b/src/test/regress/expected/select_parallel_1.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_2.out b/src/test/regress/expected/select_parallel_2.out index 36bff2d7..0c81d6f4 100644 --- a/src/test/regress/expected/select_parallel_2.out +++ b/src/test/regress/expected/select_parallel_2.out @@ -347,4 +347,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where 
unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_3.out b/src/test/regress/expected/select_parallel_3.out index a4717f62..8566355f 100644 --- a/src/test/regress/expected/select_parallel_3.out +++ b/src/test/regress/expected/select_parallel_3.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + 
QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 684d4989..93228c2e 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -380,4 +380,69 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_5.out b/src/test/regress/expected/select_parallel_5.out index 94ab46f1..6b20689d 100644 --- 
a/src/test/regress/expected/select_parallel_5.out +++ b/src/test/regress/expected/select_parallel_5.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index d2cca20f..25ee90a1 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -145,4 +145,29 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; +-- test interaction with set-returning functions +SAVEPOINT settings; + +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; +ROLLBACK TO SAVEPOINT settings; + +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + 
rollback; From 9fb4ae6d1c7a16bdd422c5fd79529c780a7a4d7f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 10:45:01 +0800 Subject: [PATCH 241/578] Pass additional arguments to a couple of grouping-related functions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 40 +++++++++++++++++----------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index aa85504d..d7357766 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -140,7 +140,8 @@ static List *reorder_grouping_sets(List *groupingSets, List *sortclause); static void standard_qp_callback(PlannerInfo *root, void *extra); static double get_number_of_groups(PlannerInfo *root, double path_rows, - grouping_sets_data *gd); + grouping_sets_data *gd, + List *target_list); static Size estimate_hashagg_tablesize(Path *path, const AggClauseCosts *agg_costs, double dNumGroups); @@ -185,7 +186,8 @@ static RelOptInfo *create_ordered_paths(PlannerInfo *root, static PathTarget *make_group_input_target(PlannerInfo *root, PathTarget *final_target); static PathTarget *make_partial_grouping_target(PlannerInfo *root, - PathTarget *grouping_target); + PathTarget *grouping_target, + Node *havingQual); static List *postprocess_setop_tlist(List *new_tlist, List *orig_tlist); static List *select_active_windows(PlannerInfo *root, WindowFuncLists *wflists); static PathTarget *make_window_input_target(PlannerInfo *root, @@ -3768,7 +3770,8 @@ standard_qp_callback(PlannerInfo *root, void *extra) * Estimate number of groups produced by grouping clauses (1 if not grouping) * * path_rows: number of output rows from scan/join step - * gsets: grouping set data, or NULL if not doing grouping sets + * gd: grouping sets data including list of grouping sets and their clauses + * target_list: target list containing group clause references * * If doing grouping sets, we also annotate the gsets data with the estimates * for each set and each individual rollup list, with a view to later @@ -3777,7 +3780,8 @@ standard_qp_callback(PlannerInfo *root, void *extra) static double get_number_of_groups(PlannerInfo *root, double path_rows, - grouping_sets_data *gd) + grouping_sets_data *gd, + List *target_list) { Query *parse = root->parse; double dNumGroups; @@ -3802,7 +3806,7 @@ get_number_of_groups(PlannerInfo *root, ListCell *lc; groupExprs = get_sortgrouplist_exprs(rollup->groupClause, - parse->targetList); + target_list); rollup->numGroups = 0.0; @@ -3829,7 +3833,7 @@ get_number_of_groups(PlannerInfo *root, gd->dNumHashGroups = 0; groupExprs = get_sortgrouplist_exprs(parse->groupClause, - parse->targetList); + target_list); forboth(lc, gd->hash_sets_idx, lc2, gd->unsortable_sets) { @@ -3851,7 +3855,7 @@ get_number_of_groups(PlannerInfo *root, { /* Plain GROUP BY */ groupExprs = get_sortgrouplist_exprs(parse->groupClause, - parse->targetList); + target_list); dNumGroups = estimate_num_groups(root, groupExprs, path_rows, NULL); @@ -4070,7 +4074,8 @@ create_grouping_paths(PlannerInfo *root, */ dNumGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); /* * Determine whether it's possible to perform sort-based implementations @@ -4176,7 +4181,8 @@ create_grouping_paths(PlannerInfo *root, * appear in the result tlist, and (2) the Aggrefs must be set in * partial mode. 
*/ - partial_grouping_target = make_partial_grouping_target(root, target); + partial_grouping_target = make_partial_grouping_target(root, target, + (Node *) parse->havingQual); /* * Collect statistics about aggregates for estimating costs of @@ -4274,7 +4280,8 @@ create_grouping_paths(PlannerInfo *root, */ if (try_distributed_aggregation) { - partial_grouping_target = make_partial_grouping_target(root, target); + partial_grouping_target = make_partial_grouping_target(root, target, + (Node *) parse->havingQual); /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, @@ -5725,10 +5732,12 @@ make_group_input_target(PlannerInfo *root, PathTarget *final_target) * these would be Vars that are grouped by or used in grouping expressions.) * * grouping_target is the tlist to be emitted by the topmost aggregation step. - * We get the HAVING clause out of *root. + * havingQual represents the HAVING clause. */ static PathTarget * -make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) +make_partial_grouping_target(PlannerInfo *root, + PathTarget *grouping_target, + Node *havingQual) { Query *parse = root->parse; PathTarget *partial_target; @@ -5770,8 +5779,8 @@ make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) /* * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); + if (havingQual) + non_group_cols = lappend(non_group_cols, havingQual); /* * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in @@ -7892,7 +7901,8 @@ add_partial_paths_to_grouping_rel(PlannerInfo *root, /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_partial_path->rows, - gd); + gd, + parse->targetList); if (can_sort) { From 7e471cc3bf984e351786a8171a9809fa4535ebf3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 11:22:23 +0800 Subject: [PATCH 242/578] Split create_grouping_paths into degenerate and non-degenerate cases. --- src/backend/optimizer/plan/planner.c | 161 ++++++++++++++++++--------- 1 file changed, 109 insertions(+), 52 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d7357766..8723fe38 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -151,6 +151,16 @@ static RelOptInfo *create_grouping_paths(PlannerInfo *root, bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd); +static bool is_degenerate_grouping(PlannerInfo *root); +static void create_degenerate_grouping_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel); +static void create_ordinary_grouping_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel, + RelOptInfo *partially_grouped_rel, + const AggClauseCosts *agg_costs, + grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, RelOptInfo *grouped_rel, Path *path, @@ -3956,11 +3966,6 @@ estimate_hashagg_entrysize(Path *path, const AggClauseCosts *agg_costs, * * Note: all Paths in input_rel are expected to return the target computed * by make_group_input_target. 
- * - * We need to consider sorted and hashed aggregation in the same function, - * because otherwise (1) it would be harder to throw an appropriate error - * message if neither way works, and (2) we should not allow hashtable size - * considerations to dissuade us from using hashing if sorting is not possible. */ static RelOptInfo * create_grouping_paths(PlannerInfo *root, @@ -3971,15 +3976,8 @@ create_grouping_paths(PlannerInfo *root, grouping_sets_data *gd) {// #lizard forgives Query *parse = root->parse; - Path *cheapest_path = input_rel->cheapest_total_path; RelOptInfo *grouped_rel; PathTarget *partial_grouping_target = NULL; - AggClauseCosts agg_partial_costs; /* parallel only */ - AggClauseCosts agg_final_costs; /* parallel only */ - double dNumGroups; - bool can_hash; - bool can_sort; - bool try_parallel_aggregation; bool try_distributed_aggregation; @@ -4004,35 +4002,85 @@ create_grouping_paths(PlannerInfo *root, grouped_rel->fdwroutine = input_rel->fdwroutine; /* - * Check for degenerate grouping. + * Create either paths for a degenerate grouping or paths for ordinary + * grouping, as appropriate. */ - if ((root->hasHavingQual || parse->groupingSets) && - !parse->hasAggs && parse->groupClause == NIL) + if (is_degenerate_grouping(root)) + { + create_degenerate_grouping_paths(root, input_rel, target, grouped_rel); + + /* No need to consider any other alternatives. */ + set_cheapest(grouped_rel); + return grouped_rel; + } + else { + create_ordinary_grouping_paths(root, input_rel, target, grouped_rel, + partially_grouped_rel, agg_costs, gd); + + /* Now choose the best path(s) */ + set_cheapest(grouped_rel); + + /* + * We've been using the partial pathlist for the grouped relation to hold + * partially aggregated paths, but that's actually a little bit bogus + * because it's unsafe for later planning stages -- like ordered_rel --- + * to get the idea that they can use these partial paths as if they didn't + * need a FinalizeAggregate step. Zap the partial pathlist at this stage + * so we don't get confused. + */ + grouped_rel->partial_pathlist = NIL; + + return grouped_rel; + + } +} + +/* + * is_degenerate_grouping + * + * A degenerate grouping is one in which the query has a HAVING qual and/or + * grouping sets, but no aggregates and no GROUP BY (which implies that the + * grouping sets are all empty). + */ +static bool +is_degenerate_grouping(PlannerInfo *root) +{ + Query *parse = root->parse; + + return (root->hasHavingQual || parse->groupingSets) && + !parse->hasAggs && parse->groupClause == NIL; +} + /* - * We have a HAVING qual and/or grouping sets, but no aggregates and - * no GROUP BY (which implies that the grouping sets are all empty). + * create_degenerate_grouping_paths * - * This is a degenerate case in which we are supposed to emit either - * zero or one row for each grouping set depending on whether HAVING - * succeeds. Furthermore, there cannot be any variables in either - * HAVING or the targetlist, so we actually do not need the FROM table - * at all! We can just throw away the plan-so-far and generate a - * Result node. This is a sufficiently unusual corner case that it's - * not worth contorting the structure of this module to avoid having - * to generate the earlier paths in the first place. + * When the grouping is degenerate (see is_degenerate_grouping), we are + * supposed to emit either zero or one row for each grouping set depending on + * whether HAVING succeeds. 
Furthermore, there cannot be any variables in + * either HAVING or the targetlist, so we actually do not need the FROM table + * at all! We can just throw away the plan-so-far and generate a Result node. + * This is a sufficiently unusual corner case that it's not worth contorting + * the structure of this module to avoid having to generate the earlier paths + * in the first place. */ - int nrows = list_length(parse->groupingSets); +static void +create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel) +{ + Query *parse = root->parse; + int nrows; Path *path; + nrows = list_length(parse->groupingSets); if (nrows > 1) { /* - * Doesn't seem worthwhile writing code to cons up a - * generate_series or a values scan to emit multiple rows. Instead - * just make N clones and append them. (With a volatile HAVING - * clause, this means you might get between 0 and N output rows. - * Offhand I think that's desired.) + * Doesn't seem worthwhile writing code to cons up a generate_series + * or a values scan to emit multiple rows. Instead just make N clones + * and append them. (With a volatile HAVING clause, this means you + * might get between 0 and N output rows. Offhand I think that's + * desired.) */ List *paths = NIL; @@ -4047,9 +4095,12 @@ create_grouping_paths(PlannerInfo *root, path = (Path *) create_append_path(grouped_rel, paths, + NIL, NULL, 0, - NIL); + false, + NIL, + -1); path->pathtarget = target; } else @@ -4060,16 +4111,36 @@ create_grouping_paths(PlannerInfo *root, target, (List *) parse->havingQual); } - add_path(grouped_rel, path); - - /* No need to consider any other alternatives. */ - set_cheapest(grouped_rel); - - return grouped_rel; } /* + * create_ordinary_grouping_paths + * + * Create grouping paths for the ordinary (that is, non-degenerate) case. + * + * We need to consider sorted and hashed aggregation in the same function, + * because otherwise (1) it would be harder to throw an appropriate error + * message if neither way works, and (2) we should not allow hashtable size + * considerations to dissuade us from using hashing if sorting is not possible. + */ +static void +create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel, + RelOptInfo *partially_grouped_rel, + const AggClauseCosts *agg_costs, + grouping_sets_data *gd) +{ + Query *parse = root->parse; + Path *cheapest_path = input_rel->cheapest_total_path; + AggClauseCosts agg_partial_costs; /* parallel only */ + AggClauseCosts agg_final_costs; /* parallel only */ + double dNumGroups; + bool can_hash; + bool can_sort; + bool try_parallel_aggregation; + + /* * Estimate number of groups. */ dNumGroups = get_number_of_groups(root, @@ -4676,20 +4747,6 @@ create_grouping_paths(PlannerInfo *root, if (create_upper_paths_hook) (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, input_rel, grouped_rel); - - /* Now choose the best path(s) */ - set_cheapest(grouped_rel); - /* - * We've been using the partial pathlist for the grouped relation to hold - * partially aggregated paths, but that's actually a little bit bogus - * because it's unsafe for later planning stages -- like ordered_rel --- - * to get the idea that they can use these partial paths as if they didn't - * need a FinalizeAggregate step. Zap the partial pathlist at this stage - * so we don't get confused. 
- */ - grouped_rel->partial_pathlist = NIL; - - return grouped_rel; } From d9302fbfd46acc360a28df0f725a7f35f4bc6276 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 12:56:01 +0800 Subject: [PATCH 243/578] Avoid creating a TOAST table for a partitioned table. 2. fix planner.c bug. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8723fe38..7538b6be 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -158,7 +158,6 @@ static void create_degenerate_grouping_paths(PlannerInfo *root, static void create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, RelOptInfo *grouped_rel, - RelOptInfo *partially_grouped_rel, const AggClauseCosts *agg_costs, grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, @@ -3977,8 +3976,6 @@ create_grouping_paths(PlannerInfo *root, {// #lizard forgives Query *parse = root->parse; RelOptInfo *grouped_rel; - PathTarget *partial_grouping_target = NULL; - bool try_distributed_aggregation; /* For now, do all work in the (GROUP_AGG, NULL) upperrel */ @@ -4016,7 +4013,7 @@ create_grouping_paths(PlannerInfo *root, else { create_ordinary_grouping_paths(root, input_rel, target, grouped_rel, - partially_grouped_rel, agg_costs, gd); + agg_costs, gd); /* Now choose the best path(s) */ set_cheapest(grouped_rel); @@ -4127,7 +4124,6 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, static void create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, RelOptInfo *grouped_rel, - RelOptInfo *partially_grouped_rel, const AggClauseCosts *agg_costs, grouping_sets_data *gd) { @@ -4139,6 +4135,8 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, bool can_hash; bool can_sort; bool try_parallel_aggregation; + bool try_distributed_aggregation; + PathTarget *partial_grouping_target = NULL; /* * Estimate number of groups. From d72d845d02dd43ad0341e0c362fa94271ae86da3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 14:38:59 +0800 Subject: [PATCH 244/578] Enforce child constraints during COPY TO a partitioned table. 
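A minimal sketch of the behaviour this change is about, using invented table names (it is not part of the patch or its tests): when COPY loads rows through a partitioned parent, the constraints that get checked must be those of the partition each row is routed to, since the parent's own tuple descriptor may carry no constraints at all.

    CREATE TABLE measurement (city int, reading int) PARTITION BY LIST (city);
    CREATE TABLE measurement_p1 PARTITION OF measurement FOR VALUES IN (1);
    ALTER TABLE measurement_p1 ADD CHECK (reading >= 0);

    COPY measurement FROM STDIN;
    1	-5
    \.

With the child's constraint enforced, the negative reading should be rejected even though the COPY targets the parent.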
--- src/backend/commands/copy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 321b44a1..e3e67988 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3565,7 +3565,8 @@ CopyFrom(CopyState cstate) check_partition_constr = false; /* Check the constraints of the tuple */ - if (cstate->rel->rd_att->constr || check_partition_constr) + if (resultRelInfo->ri_RelationDesc->rd_att->constr || + check_partition_constr) ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ From 54e24d887537475c3d8f1053f53c838cfdfddf8a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 15:50:24 +0800 Subject: [PATCH 245/578] Faster partition pruning.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/Makefile | 2 +- src/backend/catalog/partition.c | 133 +- src/backend/nodes/copyfuncs.c | 56 +- src/backend/nodes/equalfuncs.c | 13 - src/backend/nodes/nodeFuncs.c | 25 + src/backend/nodes/outfuncs.c | 44 +- src/backend/nodes/readfuncs.c | 31 + src/backend/optimizer/path/allpaths.c | 112 +- src/backend/optimizer/path/indxpath.c | 4 +- src/backend/optimizer/plan/planner.c | 100 +- src/backend/optimizer/prep/prepunion.c | 47 +- src/backend/optimizer/util/plancat.c | 48 +- src/backend/optimizer/util/relnode.c | 8 + src/backend/partitioning/Makefile | 17 + src/backend/partitioning/partprune.c | 2782 +++++++++++++++++ src/include/catalog/partition.h | 3 +- src/include/catalog/pg_opfamily.h | 3 + src/include/nodes/nodes.h | 4 +- src/include/nodes/primnodes.h | 75 + src/include/nodes/relation.h | 38 +- src/include/optimizer/planner.h | 5 - src/include/partitioning/partbounds.h | 124 + src/include/partitioning/partprune.h | 49 + src/test/regress/expected/inherit.out | 4 +- src/test/regress/expected/inherit_1.out | 4 +- src/test/regress/expected/inherit_2.out | 4 +- src/test/regress/expected/inherit_3.out | 4 +- src/test/regress/expected/partition_prune.out | 493 ++- src/test/regress/sql/partition_prune.sql | 123 +- 29 files changed, 3971 insertions(+), 384 deletions(-) create mode 100644 src/backend/partitioning/Makefile create mode 100644 src/backend/partitioning/partprune.c create mode 100644 src/include/partitioning/partbounds.h create mode 100644 src/include/partitioning/partprune.h diff --git a/src/backend/Makefile b/src/backend/Makefile index 4c35043b..75b093d8 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -22,7 +22,7 @@ override CFLAGS += $(PTHREAD_CFLAGS) endif SUBDIRS = access audit bootstrap catalog contrib parser commands executor foreign lib libpq \ - pgxc main nodes optimizer oracle port postmaster regex replication rewrite \ + pgxc main nodes optimizer partitioning oracle port postmaster regex replication rewrite \ statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq include $(srcdir)/common.mk diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index bf765697..f74a88f0 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -41,6 +41,7 @@ #include "optimizer/prep.h" #include "optimizer/var.h" #include "parser/parse_coerce.h" +#include "partitioning/partbounds.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/array.h" @@ -55,88 +56,6 @@ #include "utils/ruleutils.h" #include "utils/syscache.h" -/* - * Information about bounds of a partitioned relation - * - * A list partition datum 
that is known to be NULL is never put into the - * datums array. Instead, it is tracked using the null_index field. - * - * In the case of range partitioning, ndatums will typically be far less than - * 2 * nparts, because a partition's upper bound and the next partition's lower - * bound are the same in most common cases, and we only store one of them (the - * upper bound). In case of hash partitioning, ndatums will be same as the - * number of partitions. - * - * For range and list partitioned tables, datums is an array of datum-tuples - * with key->partnatts datums each. For hash partitioned tables, it is an array - * of datum-tuples with 2 datums, modulus and remainder, corresponding to a - * given partition. - * - * The datums in datums array are arranged in increasing order as defined by - * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and - * qsort_partition_hbound_cmp() for range, list and hash partitioned tables - * respectively. For range and list partitions this simply means that the - * datums in the datums array are arranged in increasing order as defined by - * the partition key's operator classes and collations. - * - * In the case of list partitioning, the indexes array stores one entry for - * every datum, which is the index of the partition that accepts a given datum. - * In case of range partitioning, it stores one entry per distinct range - * datum, which is the index of the partition for which a given datum - * is an upper bound. In the case of hash partitioning, the number of the - * entries in the indexes array is same as the greatest modulus amongst all - * partitions. For a given partition key datum-tuple, the index of the - * partition which would accept that datum-tuple would be given by the entry - * pointed by remainder produced when hash value of the datum-tuple is divided - * by the greatest modulus. - */ - -typedef struct PartitionBoundInfoData -{ - char strategy; /* hash, list or range? */ - int ndatums; /* Length of the datums following array */ - Datum **datums; - PartitionRangeDatumKind **kind; /* The kind of each range bound datum; - * NULL for hash and list partitioned - * tables */ - int *indexes; /* Partition indexes */ - int null_index; /* Index of the null-accepting partition; -1 - * if there isn't one */ - int default_index; /* Index of the default partition; -1 if there - * isn't one */ -} PartitionBoundInfoData; - -#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) -#define partition_bound_has_default(bi) ((bi)->default_index != -1) - -/* - * When qsort'ing partition bounds after reading from the catalog, each bound - * is represented with one of the following structs. 
- */ - -/* One bound of a hash partition */ -typedef struct PartitionHashBound -{ - int modulus; - int remainder; - int index; -} PartitionHashBound; - -/* One value coming from some (index'th) list partition */ -typedef struct PartitionListValue -{ - int index; - Datum value; -} PartitionListValue; - -/* One bound of a range partition */ -typedef struct PartitionRangeBound -{ - int index; - Datum *datums; /* range bound datums */ - PartitionRangeDatumKind *kind; /* the kind of each datum */ - bool lower; /* this is the lower (vs upper) bound */ -} PartitionRangeBound; static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, @@ -169,29 +88,8 @@ static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2); -static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, - Oid *partcollation, - Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums, int n_tuple_datums); - -static int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, - PartitionBoundInfo boundinfo, - Datum value, bool *is_equal); -static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, - Oid *partcollation, - PartitionBoundInfo boundinfo, - PartitionRangeBound *probe, bool *is_equal); -static int partition_range_datum_bsearch(FmgrInfo *partsupfunc, - Oid *partcollation, - PartitionBoundInfo boundinfo, - int nvalues, Datum *values, bool *is_equal); -static int partition_hash_bsearch(PartitionBoundInfo boundinfo, - int modulus, int remainder); static int get_partition_bound_num_indexes(PartitionBoundInfo b); -static int get_greatest_modulus(PartitionBoundInfo b); -static uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, - Datum *values, bool *isnull); /* SQL-callable function for use in hash partition CHECK constraints */ PG_FUNCTION_INFO_V1(satisfies_hash_partition); @@ -802,13 +700,13 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_HASH) { - int greatest_modulus = get_greatest_modulus(b1); + int greatest_modulus = get_hash_partition_greatest_modulus(b1); /* * If two hash partitioned tables have different greatest moduli, * their partition schemes don't match. 
*/ - if (greatest_modulus != get_greatest_modulus(b2)) + if (greatest_modulus != get_hash_partition_greatest_modulus(b2)) return false; /* @@ -1068,7 +966,7 @@ check_new_partition_bound(char *relname, Relation parent, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("every hash partition modulus must be a factor of the next larger modulus"))); - greatest_modulus = get_greatest_modulus(boundinfo); + greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); remainder = spec->remainder; /* @@ -1597,7 +1495,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* Module-local functions */ /* * get_partition_operator @@ -2575,7 +2472,7 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) case PARTITION_STRATEGY_HASH: { PartitionBoundInfo boundinfo = partdesc->boundinfo; - int greatest_modulus = get_greatest_modulus(boundinfo); + int greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); uint64 rowHash = compute_hash_value(key->partnatts, key->partsupfunc, values, isnull); @@ -2910,7 +2807,7 @@ partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, * of attributes resp. * */ -static int32 +int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums) @@ -2944,7 +2841,7 @@ partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, * *is_equal is set to true if the bound datum at the returned index is equal * to the input value. */ -static int +int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal) @@ -2987,7 +2884,7 @@ partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, * *is_equal is set to true if the range bound at the returned index is equal * to the input range bound */ -static int +int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, @@ -3032,7 +2929,7 @@ partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, * *is_equal is set to true if the range bound at the returned index is equal * to the input tuple. */ -static int +int partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal) @@ -3075,7 +2972,7 @@ partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, * less than or equal to the given (modulus, remainder) pair or -1 if * all of them are greater */ -static int +int partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder) { @@ -3231,7 +3128,7 @@ get_partition_bound_num_indexes(PartitionBoundInfo bound) * The number of the entries in the indexes array is same as the * greatest modulus. */ - num_indexes = get_greatest_modulus(bound); + num_indexes = get_hash_partition_greatest_modulus(bound); break; case PARTITION_STRATEGY_LIST: @@ -3252,14 +3149,14 @@ get_partition_bound_num_indexes(PartitionBoundInfo bound) } /* - * get_greatest_modulus + * get_hash_partition_greatest_modulus * * Returns the greatest modulus of the hash partition bound. The greatest * modulus will be at the end of the datums array because hash partitions are * arranged in the ascending order of their modulus and remainders. 
*/ -static int -get_greatest_modulus(PartitionBoundInfo bound) +int +get_hash_partition_greatest_modulus(PartitionBoundInfo bound) { Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH); Assert(bound->datums && bound->ndatums > 0); @@ -3273,7 +3170,7 @@ get_greatest_modulus(PartitionBoundInfo bound) * * Compute the hash value for given not null partition key values. */ -static uint64 +uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, Datum *values, bool *isnull) { diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index b55431d6..e87c8463 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2402,6 +2402,38 @@ _copyOnConflictExpr(const OnConflictExpr *from) return newnode; } +/* + * _copyPartitionPruneStepOp + */ +static PartitionPruneStepOp * +_copyPartitionPruneStepOp(const PartitionPruneStepOp *from) +{ + PartitionPruneStepOp *newnode = makeNode(PartitionPruneStepOp); + + COPY_SCALAR_FIELD(step.step_id); + COPY_SCALAR_FIELD(opstrategy); + COPY_NODE_FIELD(exprs); + COPY_NODE_FIELD(cmpfns); + COPY_BITMAPSET_FIELD(nullkeys); + + return newnode; +} + +/* + * _copyPartitionPruneStepCombine + */ +static PartitionPruneStepCombine * +_copyPartitionPruneStepCombine(const PartitionPruneStepCombine *from) +{ + PartitionPruneStepCombine *newnode = makeNode(PartitionPruneStepCombine); + + COPY_SCALAR_FIELD(step.step_id); + COPY_SCALAR_FIELD(combineOp); + COPY_NODE_FIELD(source_stepids); + + return newnode; +} + /* **************************************************************** * relation.h copy functions * @@ -2527,21 +2559,6 @@ _copyAppendRelInfo(const AppendRelInfo *from) return newnode; } -/* - * _copyPartitionedChildRelInfo - */ -static PartitionedChildRelInfo * -_copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) -{ - PartitionedChildRelInfo *newnode = makeNode(PartitionedChildRelInfo); - - COPY_SCALAR_FIELD(parent_relid); - COPY_NODE_FIELD(child_rels); - COPY_SCALAR_FIELD(part_cols_updated); - - return newnode; -} - /* * _copyPlaceHolderInfo */ @@ -5649,6 +5666,12 @@ copyObjectImpl(const void *from) case T_OnConflictExpr: retval = _copyOnConflictExpr(from); break; + case T_PartitionPruneStepOp: + retval = _copyPartitionPruneStepOp(from); + break; + case T_PartitionPruneStepCombine: + retval = _copyPartitionPruneStepCombine(from); + break; /* * RELATION NODES @@ -5668,9 +5691,6 @@ copyObjectImpl(const void *from) case T_AppendRelInfo: retval = _copyAppendRelInfo(from); break; - case T_PartitionedChildRelInfo: - retval = _copyPartitionedChildRelInfo(from); - break; case T_PlaceHolderInfo: retval = _copyPlaceHolderInfo(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 0e47737a..8abab4bb 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -931,16 +931,6 @@ _equalAppendRelInfo(const AppendRelInfo *a, const AppendRelInfo *b) return true; } -static bool -_equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const PartitionedChildRelInfo *b) -{ - COMPARE_SCALAR_FIELD(parent_relid); - COMPARE_NODE_FIELD(child_rels); - COMPARE_SCALAR_FIELD(part_cols_updated); - - return true; -} - static bool _equalPlaceHolderInfo(const PlaceHolderInfo *a, const PlaceHolderInfo *b) { @@ -3397,9 +3387,6 @@ equal(const void *a, const void *b) case T_AppendRelInfo: retval = _equalAppendRelInfo(a, b); break; - case T_PartitionedChildRelInfo: - retval = _equalPartitionedChildRelInfo(a, b); - break; case T_PlaceHolderInfo: retval = 
_equalPlaceHolderInfo(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index a7ab020e..8a10e344 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2147,6 +2147,17 @@ expression_tree_walker(Node *node, return true; } break; + case T_PartitionPruneStepOp: + { + PartitionPruneStepOp *opstep = (PartitionPruneStepOp *) node; + + if (walker((Node *) opstep->exprs, context)) + return true; + } + break; + case T_PartitionPruneStepCombine: + /* no expression subnodes */ + break; case T_JoinExpr: { bool left_arg_ret = false; @@ -2977,6 +2988,20 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_PartitionPruneStepOp: + { + PartitionPruneStepOp *opstep = (PartitionPruneStepOp *) node; + PartitionPruneStepOp *newnode; + + FLATCOPY(newnode, opstep, PartitionPruneStepOp); + MUTATE(newnode->exprs, opstep->exprs, List *); + + return (Node *) newnode; + } + break; + case T_PartitionPruneStepCombine: + /* no expression sub-nodes */ + return (Node *) copyObject(node); case T_JoinExpr: { JoinExpr *join = (JoinExpr *) node; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 610c2fae..24ca2109 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2978,6 +2978,28 @@ _outFromExpr(StringInfo str, const FromExpr *node) WRITE_NODE_FIELD(quals); } +static void +_outPartitionPruneStepOp(StringInfo str, const PartitionPruneStepOp *node) +{ + WRITE_NODE_TYPE("PARTITIONPRUNESTEPOP"); + + WRITE_INT_FIELD(step.step_id); + WRITE_INT_FIELD(opstrategy); + WRITE_NODE_FIELD(exprs); + WRITE_NODE_FIELD(cmpfns); + WRITE_BITMAPSET_FIELD(nullkeys); +} + +static void +_outPartitionPruneStepCombine(StringInfo str, const PartitionPruneStepCombine *node) +{ + WRITE_NODE_TYPE("PARTITIONPRUNESTEPCOMBINE"); + + WRITE_INT_FIELD(step.step_id); + WRITE_ENUM_FIELD(combineOp, PartitionPruneCombineOp); + WRITE_NODE_FIELD(source_stepids); +} + static void _outOnConflictExpr(StringInfo str, const OnConflictExpr *node) { @@ -3527,7 +3549,6 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(full_join_clauses); WRITE_NODE_FIELD(join_info_list); WRITE_NODE_FIELD(append_rel_list); - WRITE_NODE_FIELD(pcinfo_list); WRITE_NODE_FIELD(rowMarks); WRITE_NODE_FIELD(placeholder_list); WRITE_NODE_FIELD(fkey_list); @@ -3552,6 +3573,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_INT_FIELD(wt_param_id); WRITE_BITMAPSET_FIELD(curOuterRels); WRITE_NODE_FIELD(curOuterParams); + WRITE_BOOL_FIELD(partColsUpdated); #ifdef __TBASE__ WRITE_BOOL_FIELD(haspart_tobe_modify); WRITE_UINT_FIELD(partrelindex); @@ -3606,6 +3628,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_NODE_FIELD(joininfo); WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); + WRITE_NODE_FIELD(partitioned_child_rels); #ifdef __TBASE__ WRITE_BOOL_FIELD(intervalparent); WRITE_BOOL_FIELD(isdefault); @@ -3854,16 +3877,6 @@ _outAppendRelInfo(StringInfo str, const AppendRelInfo *node) WRITE_OID_FIELD(parent_reloid); } -static void -_outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) -{ - WRITE_NODE_TYPE("PARTITIONEDCHILDRELINFO"); - - WRITE_UINT_FIELD(parent_relid); - WRITE_NODE_FIELD(child_rels); - WRITE_BOOL_FIELD(part_cols_updated); -} - static void _outPlaceHolderInfo(StringInfo str, const PlaceHolderInfo *node) { @@ -5423,6 +5436,12 @@ outNode(StringInfo str, const void *obj) case T_OnConflictExpr: _outOnConflictExpr(str, obj); break; + 
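The new pruning-step nodes get the usual _copy/_out/_read and tree-walker support so they behave like any other Node; the only invariant worth calling out is that the field lists written by the _out functions and consumed by the _read functions must mirror each other exactly. The toy roundtrip below illustrates that pairing with a made-up two-field node; it is a standalone sketch, not the real outfuncs/readfuncs machinery.

#include <stdio.h>

/* Toy stand-in with the same two leading fields the new
 * PartitionPruneStepOp serializes: step_id and opstrategy. */
typedef struct { int step_id; int opstrategy; } ToyStep;

static void
out_step(char *buf, size_t len, const ToyStep *s)
{
    snprintf(buf, len, "{TOYSTEP :step_id %d :opstrategy %d}",
             s->step_id, s->opstrategy);
}

static ToyStep
read_step(const char *buf)
{
    ToyStep s = {0, 0};

    /* The read side must consume fields in exactly the order the
     * write side emitted them. */
    sscanf(buf, "{TOYSTEP :step_id %d :opstrategy %d}",
           &s.step_id, &s.opstrategy);
    return s;
}

int
main(void)
{
    ToyStep in = {3, 1}, out;
    char buf[64];

    out_step(buf, sizeof(buf), &in);
    out = read_step(buf);
    printf("%s -> step_id=%d opstrategy=%d\n", buf, out.step_id, out.opstrategy);
    return 0;
}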
case T_PartitionPruneStepOp: + _outPartitionPruneStepOp(str, obj); + break; + case T_PartitionPruneStepCombine: + _outPartitionPruneStepCombine(str, obj); + break; case T_Path: _outPath(str, obj); break; @@ -5564,9 +5583,6 @@ outNode(StringInfo str, const void *obj) case T_AppendRelInfo: _outAppendRelInfo(str, obj); break; - case T_PartitionedChildRelInfo: - _outPartitionedChildRelInfo(str, obj); - break; case T_PlaceHolderInfo: _outPlaceHolderInfo(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index e0744408..72e9a6fa 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2074,6 +2074,33 @@ _readOnConflictExpr(void) READ_DONE(); } +static PartitionPruneStepOp * +_readPartitionPruneStepOp(void) +{ + READ_LOCALS(PartitionPruneStepOp); + + READ_INT_FIELD(step.step_id); + READ_INT_FIELD(opstrategy); + READ_NODE_FIELD(exprs); + READ_NODE_FIELD(cmpfns); + READ_BITMAPSET_FIELD(nullkeys); + + READ_DONE(); +} + +static PartitionPruneStepCombine * +_readPartitionPruneStepCombine(void) +{ + READ_LOCALS(PartitionPruneStepCombine); + + READ_INT_FIELD(step.step_id); + READ_ENUM_FIELD(combineOp, PartitionPruneCombineOp); + READ_NODE_FIELD(source_stepids); + + READ_DONE(); +} + + /* * Stuff from parsenodes.h. */ @@ -4349,6 +4376,10 @@ parseNodeString(void) return_value = _readFromExpr(); else if (MATCH("ONCONFLICTEXPR", 14)) return_value = _readOnConflictExpr(); + else if (MATCH("PARTITIONPRUNESTEPOP", 20)) + return_value = _readPartitionPruneStepOp(); + else if (MATCH("PARTITIONPRUNESTEPCOMBINE", 25)) + return_value = _readPartitionPruneStepCombine(); else if (MATCH("RTE", 3)) return_value = _readRangeTblEntry(); else if (MATCH("RANGETBLFUNCTION", 16)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index f5516316..59663d81 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -45,6 +45,7 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "partitioning/partprune.h" #include "pgxc/nodemgr.h" #ifdef PGXC #include "nodes/makefuncs.h" @@ -887,6 +888,8 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, double *parent_attrsizes; int nattrs; ListCell *l; + Relids live_children = NULL; + bool did_pruning = false; /* Guard against stack overflow due to overly deep inheritance tree. */ check_stack_depth(); @@ -894,6 +897,31 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(IS_SIMPLE_REL(rel)); /* + * Initialize partitioned_child_rels to contain this RT index. + * + * Note that during the set_append_rel_pathlist() phase, we will bubble up + * the indexes of partitioned relations that appear down in the tree, so + * that when we've created Paths for all the children, the root + * partitioned table's list will contain all such indexes. + */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE) + rel->partitioned_child_rels = list_make1_int(rti); + + /* + * If the partitioned relation has any baserestrictinfo quals then we + * attempt to use these quals to prune away partitions that cannot + * possibly contain any tuples matching these quals. In this case we'll + * store the relids of all partitions which could possibly contain a + * matching tuple, and skip anything else in the loop below. 
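Once the live set has been computed, the per-child work in set_append_rel_size reduces to a membership test: a child outside the set is marked dummy exactly like a child excluded by constraints, and its size estimation is skipped. A rough standalone sketch of that filtering pattern follows; the bitmask and child count are hypothetical and this is not the planner's real RelOptInfo handling.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define NCHILDREN 6

/* Hypothetical "live children" set produced by the pruning pass,
 * represented here as a plain bitmask over child indexes. */
static uint32_t live_children = (1u << 1) | (1u << 4);
static bool did_pruning = true;

int
main(void)
{
    int child;

    for (child = 0; child < NCHILDREN; child++)
    {
        if (did_pruning && !(live_children & (1u << child)))
        {
            /* Pruned: the patch gives such a child an empty ("dummy")
             * path list via set_dummy_rel_pathlist() and moves on. */
            printf("child %d: pruned, marked dummy\n", child);
            continue;
        }
        printf("child %d: kept, planned normally\n", child);
    }
    return 0;
}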
+ */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE && + rel->baserestrictinfo != NIL) + { + live_children = prune_append_rel_partitions(rel); + did_pruning = true; + } + + /* * Initialize to compute size estimates for whole append relation. * * We handle width estimates by weighting the widths of different child @@ -1141,6 +1169,13 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, continue; } + if (did_pruning && !bms_is_member(appinfo->child_relid, live_children)) + { + /* This partition was pruned; skip it. */ + set_dummy_rel_pathlist(childrel); + continue; + } + if (relation_excluded_by_constraints(root, childrel, childRTE)) { /* @@ -1322,6 +1357,12 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, if (IS_DUMMY_REL(childrel)) continue; + /* Bubble up childrel's partitioned children. */ + if (rel->part_scheme) + rel->partitioned_child_rels = + list_concat(rel->partitioned_child_rels, + list_copy(childrel->partitioned_child_rels)); + /* * Child is live, so add it to the live_childrels list for use below. */ @@ -1356,48 +1397,54 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *all_child_outers = NIL; ListCell *l; List *partitioned_rels = NIL; - RangeTblEntry *rte; bool build_partitioned_rels = false; + /* + * AppendPath generated for partitioned tables must record the RT indexes + * of partitioned tables that are direct or indirect children of this + * Append rel. + * + * AppendPath may be for a sub-query RTE (UNION ALL), in which case, 'rel' + * itself does not represent a partitioned relation, but the child sub- + * queries may contain references to partitioned relations. The loop + * below will look for such children and collect them in a list to be + * passed to the path creation function. (This assumes that we don't need + * to look through multiple levels of subquery RTEs; if we ever do, we + * could consider stuffing the list we generate here into sub-query RTE's + * RelOptInfo, just like we do for partitioned rels, which would be used + * when populating our parent rel with paths. For the present, that + * appears to be unnecessary.) + */ + if (rel->part_scheme != NULL) + { if (IS_SIMPLE_REL(rel)) + partitioned_rels = rel->partitioned_child_rels; + else if (IS_JOIN_REL(rel)) { + int relid = -1; + /* - * A root partition will already have a PartitionedChildRelInfo, and a - * non-root partitioned table doesn't need one, because its Append - * paths will get flattened into the parent anyway. For a subquery - * RTE, no PartitionedChildRelInfo exists; we collect all - * partitioned_rels associated with any child. (This assumes that we - * don't need to look through multiple levels of subquery RTEs; if we - * ever do, we could create a PartitionedChildRelInfo with the - * accumulated list of partitioned_rels which would then be found when - * populated our parent rel with paths. For the present, that appears - * to be unnecessary.) + * For a partitioned joinrel, concatenate the component rels' + * partitioned_child_rels lists. 
*/ - rte = planner_rt_fetch(rel->relid, root); - switch (rte->rtekind) + while ((relid = bms_next_member(rel->relids, relid)) >= 0) { - case RTE_RELATION: - if (rte->relkind == RELKIND_PARTITIONED_TABLE) + RelOptInfo *component; + + Assert(relid >= 1 && relid < root->simple_rel_array_size); + component = root->simple_rel_array[relid]; + Assert(component->part_scheme != NULL); + Assert(list_length(component->partitioned_child_rels) >= 1); partitioned_rels = - get_partitioned_child_rels(root, rel->relid, NULL); - break; - case RTE_SUBQUERY: - build_partitioned_rels = true; - break; - default: - elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); + list_concat(partitioned_rels, + list_copy(component->partitioned_child_rels)); } } - else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme) - { - /* - * Associate PartitionedChildRelInfo of the root partitioned tables - * being joined with the root partitioned join (indicated by - * RELOPT_JOINREL). - */ - partitioned_rels = get_partitioned_child_rels_for_join(root, - rel->relids); + + Assert(list_length(partitioned_rels) >= 1); } + else if (rel->rtekind == RTE_SUBQUERY) + build_partitioned_rels = true; /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -1415,9 +1462,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, */ if (build_partitioned_rels) { - List *cprels; + List *cprels = childrel->partitioned_child_rels; - cprels = get_partitioned_child_rels(root, childrel->relid, NULL); partitioned_rels = list_concat(partitioned_rels, list_copy(cprels)); } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 4d3b3cce..40d0757d 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -40,9 +40,7 @@ #include "utils/selfuncs.h" -#define IsBooleanOpfamily(opfamily) \ - ((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID) - +/* XXX see PartCollMatchesExprColl */ #define IndexCollMatchesExprColl(idxcollation, exprcollation) \ ((idxcollation) == InvalidOid || (idxcollation) == (exprcollation)) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 7538b6be..3e3065cd 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -628,7 +628,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, root->multiexpr_params = NIL; root->eq_classes = NIL; root->append_rel_list = NIL; - root->pcinfo_list = NIL; root->rowMarks = NIL; memset(root->upper_rels, 0, sizeof(root->upper_rels)); memset(root->upper_targets, 0, sizeof(root->upper_targets)); @@ -648,6 +647,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, else root->wt_param_id = -1; root->non_recursive_path = NULL; + root->partColsUpdated = false; /* * If there is a WITH list, process each WITH query and either convert it @@ -1264,12 +1264,12 @@ inheritance_planner(PlannerInfo *root) ListCell *lc; Index rti; RangeTblEntry *parent_rte; + Relids partitioned_relids = NULL; List *partitioned_rels = NIL; PlannerInfo *parent_root; Query *parent_parse; Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); PlannerInfo **parent_roots = NULL; - bool partColsUpdated = false; Assert(parse->commandType != CMD_INSERT); @@ -1341,10 +1341,12 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { nominalRelation = top_parentRTindex; - partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex, - &partColsUpdated); - /* The root 
partitioned table is included as a child rel */ - Assert(list_length(partitioned_rels) >= 1); + + /* + * Root parent's RT index is always present in the partitioned_rels of + * the ModifyTable node, if one is needed at all. + */ + partitioned_relids = bms_make_singleton(top_parentRTindex); } /* @@ -1575,6 +1577,15 @@ inheritance_planner(PlannerInfo *root) if (IS_DUMMY_PATH(subpath)) continue; + /* + * Add the current parent's RT index to the partitione_rels set if + * we're going to create the ModifyTable path for a partitioned root + * table. + */ + if (partitioned_relids) + partitioned_relids = bms_add_member(partitioned_relids, + appinfo->parent_relid); + #ifdef XCP /* * All subplans should have the same distribution, except may be @@ -1709,6 +1720,21 @@ inheritance_planner(PlannerInfo *root) else rowMarks = root->rowMarks; + if (partitioned_relids) + { + int i; + + i = -1; + while ((i = bms_next_member(partitioned_relids, i)) >= 0) + partitioned_rels = lappend_int(partitioned_rels, i); + + /* + * If we're going to create ModifyTable at all, the list should + * contain at least one member, that is, the root parent's index. + */ + Assert(list_length(partitioned_rels) >= 1); + } + /* Create Path representing a ModifyTable to do the UPDATE/DELETE work */ add_path(final_rel, (Path *) create_modifytable_path(root, final_rel, @@ -1716,7 +1742,7 @@ inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, - partColsUpdated, + root->partColsUpdated, resultRelations, subpaths, subroots, @@ -6802,66 +6828,6 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, return matches_key; } -/* - * get_partitioned_child_rels - * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. Also sets - * *part_cols_updated to true if any of the root rte's updated - * columns is used in the partition key either of the relation whose RTI - * is specified or of any child relation. - * - * Note: This function might get called even for range table entries that - * are not partitioned tables; in such a case, it will simply return NIL. - */ -List * -get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated) -{ - List *result = NIL; - ListCell *l; - - if (part_cols_updated) - *part_cols_updated = false; - - foreach(l, root->pcinfo_list) - { - PartitionedChildRelInfo *pc = lfirst(l); - - if (pc->parent_relid == rti) - { - result = pc->child_rels; - if (part_cols_updated) - *part_cols_updated = pc->part_cols_updated; - break; - } -} - - return result; -} - - -/* - * get_partitioned_child_rels_for_join - * Build and return a list containing the RTI of every partitioned - * relation which is a child of some rel included in the join. 
- */ -List * -get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) -{ - List *result = NIL; - ListCell *l; - - foreach(l, root->pcinfo_list) - { - PartitionedChildRelInfo *pc = lfirst(l); - - if (bms_is_member(pc->parent_relid, join_relids)) - result = list_concat(result, list_copy(pc->child_rels)); - } - - return result; -} - /* * add_paths_to_grouping_rel * diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index c40a38ee..d2e6c3c6 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,8 +105,7 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels, - bool *part_cols_updated); + List **appinfos); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, @@ -1543,9 +1542,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ if (RelationGetPartitionDesc(oldrelation) != NULL) { - List *partitioned_child_rels = NIL; - bool part_cols_updated = false; - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); /* @@ -1554,28 +1550,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * extract the partition key columns of all the partitioned tables. */ expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, - lockmode, &root->append_rel_list, - &partitioned_child_rels, - &part_cols_updated); - - /* - * We keep a list of objects in root, each of which maps a root - * partitioned parent RT index to the list of RT indexes of descendant - * partitioned child tables. When creating an Append or a ModifyTable - * path for the parent, we copy the child RT index list verbatim to - * the path so that it could be carried over to the executor so that - * the latter could identify the partitioned child tables. - */ - if (rte->inh && partitioned_child_rels != NIL) - { - PartitionedChildRelInfo *pcinfo; - - pcinfo = makeNode(PartitionedChildRelInfo); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - pcinfo->part_cols_updated = part_cols_updated; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); - } + lockmode, &root->append_rel_list); } else { @@ -1650,8 +1625,7 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels, - bool *part_cols_updated) + List **appinfos) { int i; RangeTblEntry *childrte; @@ -1673,8 +1647,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, * parentrte already has the root partrel's updatedCols translated to match * the attribute ordering of parentrel. */ - if (!*part_cols_updated) - *part_cols_updated = + if (!root->partColsUpdated) + root->partColsUpdated = has_partition_attrs(parentrel, parentrte->updatedCols, NULL); /* First expand the partitioned table itself. */ @@ -1682,14 +1656,6 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, top_parentrc, parentrel, appinfos, &childrte, &childRTindex); - /* - * The partitioned table does not have data for itself but still need to - * be locked. Update given list of partitioned children with RTI of this - * partitioned relation. 
- */ - *partitioned_child_rels = lappend_int(*partitioned_child_rels, - childRTindex); - for (i = 0; i < partdesc->nparts; i++) { Oid childOID = partdesc->oids[i]; @@ -1716,8 +1682,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) expand_partitioned_rtentry(root, childrte, childRTindex, childrel, top_parentrc, lockmode, - appinfos, partitioned_child_rels, - part_cols_updated); + appinfos); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index e310e85b..6abe45f5 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1297,7 +1297,6 @@ get_relation_constraints(PlannerInfo *root, Index varno = rel->relid; Relation relation; TupleConstr *constr; - List *pcqual; /* * We assume the relation has already been safely locked. @@ -1383,16 +1382,27 @@ get_relation_constraints(PlannerInfo *root, } } - /* Append partition predicates, if any */ - pcqual = RelationGetPartitionQual(relation); + /* + * Append partition predicates, if any. + * + * For selects, partition pruning uses the parent table's partition bound + * descriptor, instead of constraint exclusion which is driven by the + * individual partition's partition constraint. + */ + if (root->parse->commandType != CMD_SELECT) + { + List *pcqual = RelationGetPartitionQual(relation); + if (pcqual) { /* - * Run each expression through const-simplification and - * canonicalization similar to check constraints. + * Run the partition quals through const-simplification similar to + * check constraints. We skip canonicalize_qual, though, because + * partition quals should be in canonical form already; also, + * since the qual is in implicit-AND format, we'd have to + * explicitly convert it to explicit-AND format and back again. */ pcqual = (List *) eval_const_expressions(root, (Node *) pcqual); - pcqual = (List *) canonicalize_qual((Expr *) pcqual); /* Fix Vars to have the desired varno */ if (varno != 1) @@ -1400,6 +1410,7 @@ get_relation_constraints(PlannerInfo *root, result = list_concat(result, pcqual); } + } heap_close(relation, NoLock); @@ -1999,6 +2010,7 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, rel->boundinfo = partition_bounds_copy(partdesc->boundinfo, partkey); rel->nparts = partdesc->nparts; set_baserel_partition_key_exprs(relation, rel); + rel->partition_qual = RelationGetPartitionQual(relation); } /* @@ -2011,7 +2023,8 @@ find_partition_scheme(PlannerInfo *root, Relation relation) { PartitionKey partkey = RelationGetPartitionKey(relation); ListCell *lc; - int partnatts; + int partnatts, + i; PartitionScheme part_scheme; /* A partitioned table should have a partition key. */ @@ -2029,7 +2042,7 @@ find_partition_scheme(PlannerInfo *root, Relation relation) partnatts != part_scheme->partnatts) continue; - /* Match the partition key types. */ + /* Match partition key type properties. */ if (memcmp(partkey->partopfamily, part_scheme->partopfamily, sizeof(Oid) * partnatts) != 0 || memcmp(partkey->partopcintype, part_scheme->partopcintype, @@ -2047,6 +2060,19 @@ find_partition_scheme(PlannerInfo *root, Relation relation) Assert(memcmp(partkey->parttypbyval, part_scheme->parttypbyval, sizeof(bool) * partnatts) == 0); + /* + * If partopfamily and partopcintype matched, must have the same + * partition comparison functions. 
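find_partition_scheme canonicalizes the partitioning metadata so that relations partitioned the same way end up sharing one PartitionScheme pointer (and, with this patch, a copied set of partition support functions), which lets later code test "same partitioning" by pointer equality. The following bare-bones sketch shows only the lookup-or-create pattern, with plain arrays and memcmp standing in for the catalog data; the values are invented and bounds checks are omitted.

#include <stdio.h>
#include <string.h>

#define MAX_SCHEMES 8
#define PARTNATTS   2

typedef struct
{
    char strategy;
    unsigned int opfamily[PARTNATTS];    /* stand-ins for Oid arrays */
    unsigned int opcintype[PARTNATTS];
} Scheme;

static Scheme schemes[MAX_SCHEMES];
static int    nschemes = 0;

/* Return the canonical scheme matching the given key info, creating it
 * on first use -- the same lookup-or-create idea as the real function. */
static Scheme *
find_scheme(char strategy, const unsigned int *opfamily,
            const unsigned int *opcintype)
{
    int i;

    for (i = 0; i < nschemes; i++)
    {
        if (schemes[i].strategy == strategy &&
            memcmp(schemes[i].opfamily, opfamily,
                   sizeof(schemes[i].opfamily)) == 0 &&
            memcmp(schemes[i].opcintype, opcintype,
                   sizeof(schemes[i].opcintype)) == 0)
            return &schemes[i];
    }

    schemes[nschemes].strategy = strategy;
    memcpy(schemes[nschemes].opfamily, opfamily, sizeof(schemes[0].opfamily));
    memcpy(schemes[nschemes].opcintype, opcintype, sizeof(schemes[0].opcintype));
    return &schemes[nschemes++];
}

int
main(void)
{
    unsigned int fam[PARTNATTS] = {1976, 1976};   /* made-up values */
    unsigned int typ[PARTNATTS] = {23, 23};
    Scheme *a = find_scheme('r', fam, typ);
    Scheme *b = find_scheme('r', fam, typ);

    printf("same partition scheme: %s\n", a == b ? "yes" : "no");
    return 0;
}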
Note that we cannot reliably + * Assert the equality of function structs themselves for they might + * be different across PartitionKey's, so just Assert for the function + * OIDs. + */ +#ifdef USE_ASSERT_CHECKING + for (i = 0; i < partkey->partnatts; i++) + Assert(partkey->partsupfunc[i].fn_oid == + part_scheme->partsupfunc[i].fn_oid); +#endif + /* Found matching partition scheme. */ return part_scheme; } @@ -2081,6 +2107,12 @@ find_partition_scheme(PlannerInfo *root, Relation relation) memcpy(part_scheme->parttypbyval, partkey->parttypbyval, sizeof(bool) * partnatts); + part_scheme->partsupfunc = (FmgrInfo *) + palloc(sizeof(FmgrInfo) * partnatts); + for (i = 0; i < partnatts; i++) + fmgr_info_copy(&part_scheme->partsupfunc[i], &partkey->partsupfunc[i], + CurrentMemoryContext); + /* Add the partitioning scheme to PlannerInfo. */ root->part_schemes = lappend(root->part_schemes, part_scheme); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 70acf299..1f6fb286 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -163,9 +163,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->part_scheme = NULL; rel->nparts = 0; rel->boundinfo = NULL; + rel->partition_qual = NIL; rel->part_rels = NULL; rel->partexprs = NULL; rel->nullable_partexprs = NULL; + rel->partitioned_child_rels = NIL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -622,9 +624,11 @@ build_join_rel(PlannerInfo *root, joinrel->part_scheme = NULL; joinrel->nparts = 0; joinrel->boundinfo = NULL; + joinrel->partition_qual = NIL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; + joinrel->partitioned_child_rels = NIL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif @@ -793,9 +797,13 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->has_eclass_joins = false; joinrel->top_parent_relids = NULL; joinrel->part_scheme = NULL; + joinrel->nparts = 0; + joinrel->boundinfo = NULL; + joinrel->partition_qual = NIL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; + joinrel->partitioned_child_rels = NIL; joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, inner_rel->top_parent_relids); diff --git a/src/backend/partitioning/Makefile b/src/backend/partitioning/Makefile new file mode 100644 index 00000000..429207c4 --- /dev/null +++ b/src/backend/partitioning/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for backend/partitioning +# +# IDENTIFICATION +# src/backend/partitioning/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/partitioning +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = partprune.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c new file mode 100644 index 00000000..959ee164 --- /dev/null +++ b/src/backend/partitioning/partprune.c @@ -0,0 +1,2782 @@ +/*------------------------------------------------------------------------- + * + * partprune.c + * Parses clauses attempting to match them up to partition keys of a + * given relation and generates a set of "pruning steps", which can be + * later "executed" either from the planner or the executor to determine + * the minimum set of partitions which match the given clauses. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/partitioning/partprune.c + * + *------------------------------------------------------------------------- +*/ +#include "postgres.h" + +#include "access/hash.h" +#include "access/nbtree.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/planner.h" +#include "optimizer/predtest.h" +#include "optimizer/prep.h" +#include "partitioning/partprune.h" +#include "partitioning/partbounds.h" +#include "rewrite/rewriteManip.h" +#include "utils/lsyscache.h" + + +/* + * Information about a clause matched with a partition key. + */ +typedef struct PartClauseInfo +{ + int keyno; /* Partition key number (0 to partnatts - 1) */ + Oid opno; /* operator used to compare partkey to 'expr' */ + bool op_is_ne; /* is clause's original operator <> ? */ + Expr *expr; /* expr the partition key is compared to */ + Oid cmpfn; /* Oid of function to compare 'expr' to the + * partition key */ + int op_strategy; /* cached info. */ +} PartClauseInfo; + +/* + * PartClauseMatchStatus + * Describes the result match_clause_to_partition_key produces for a + * given clause and the partition key to match with that are passed to it + */ +typedef enum PartClauseMatchStatus +{ + PARTCLAUSE_NOMATCH, + PARTCLAUSE_MATCH_CLAUSE, + PARTCLAUSE_MATCH_NULLNESS, + PARTCLAUSE_MATCH_STEPS, + PARTCLAUSE_MATCH_CONTRADICT, + PARTCLAUSE_UNSUPPORTED +} PartClauseMatchStatus; + +/* + * GeneratePruningStepsContext + * Information about the current state of generation of "pruning steps" + * for a given set of clauses + * + * gen_partprune_steps() initializes an instance of this struct, which is used + * throughout the step generation process. + */ +typedef struct GeneratePruningStepsContext +{ + int next_step_id; + List *steps; +} GeneratePruningStepsContext; + +/* The result of performing one PartitionPruneStep */ +typedef struct PruneStepResult +{ + /* + * The offsets of bounds (in a table's boundinfo) whose partition is + * selected by the pruning step. + */ + Bitmapset *bound_offsets; + + bool scan_default; /* Scan the default partition? */ + bool scan_null; /* Scan the partition for NULL values? 
*/ +} PruneStepResult; + + +static List *gen_partprune_steps_internal(GeneratePruningStepsContext *context, + RelOptInfo *rel, List *clauses, + bool *contradictory); +static PartitionPruneStep *gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, Bitmapset *nullkeys); +static PartitionPruneStep *gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp); +static PartitionPruneStep *gen_prune_steps_from_opexps(PartitionScheme part_scheme, + GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys); +static PartClauseMatchStatus match_clause_to_partition_key(RelOptInfo *rel, + GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, + PartClauseInfo **pc, List **clause_steps); +static List *get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix); +static List *get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + ListCell *start, + List *step_exprs, + List *step_cmpfns); +static PruneStepResult *get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep); +static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results); +static bool match_boolean_partition_clause(Oid partopfamily, Expr *clause, + Expr *partkey, Expr **outconst); +static bool partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, Datum *value); + + +/* + * gen_partprune_steps + * Process 'clauses' (a rel's baserestrictinfo list of clauses) and return + * a list of "partition pruning steps" + * + * If the clauses in the input list are contradictory or there is a + * pseudo-constant "false", *contradictory is set to true upon return. + */ +List * +gen_partprune_steps(RelOptInfo *rel, List *clauses, bool *contradictory) +{ + GeneratePruningStepsContext context; + + context.next_step_id = 0; + context.steps = NIL; + + /* The clauses list may be modified below, so better make a copy. */ + clauses = list_copy(clauses); + + /* + * For sub-partitioned tables there's a corner case where if the + * sub-partitioned table shares any partition keys with its parent, then + * it's possible that the partitioning hierarchy allows the parent + * partition to only contain a narrower range of values than the + * sub-partitioned table does. In this case it is possible that we'd + * include partitions that could not possibly have any tuples matching + * 'clauses'. 
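The reason the parent's own partition constraint is folded into the clause list is that a default sub-partition can only ever receive values the parent accepts; ANDing that constraint with the query's clauses may therefore prove the default partition empty for this query. A toy standalone illustration of that reasoning, with made-up ranges and helper names (this is not PostgreSQL code): the parent accepts key in [0, 100), the query asks for key = 150, so even the default sub-partition cannot contain a match.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical sub-partitioned parent whose own partition constraint is
 * key >= 0 AND key < 100; the query clause is key = 150. */
static bool
parent_qual(int key)  { return key >= 0 && key < 100; }

static bool
query_clause(int key) { return key == 150; }

int
main(void)
{
    /* Combining the two as the step generation does: a value that could
     * land in the default sub-partition must satisfy both, and here no
     * value does, so the default partition can be pruned. */
    bool possible = false;
    int key;

    for (key = -10; key < 200; key++)
        if (parent_qual(key) && query_clause(key))
            possible = true;

    printf("default partition can contain a match: %s\n",
           possible ? "yes" : "no");
    return 0;
}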
The possibility of such a partition arrangement is perhaps + * unlikely for non-default partitions, but it may be more likely in the + * case of default partitions, so we'll add the parent partition table's + * partition qual to the clause list in this case only. This may result + * in the default partition being eliminated. + */ + if (partition_bound_has_default(rel->boundinfo) && + rel->partition_qual != NIL) + { + List *partqual = rel->partition_qual; + + partqual = (List *) expression_planner((Expr *) partqual); + + /* Fix Vars to have the desired varno */ + if (rel->relid != 1) + ChangeVarNodes((Node *) partqual, 1, rel->relid, 0); + + clauses = list_concat(clauses, partqual); + } + + /* Down into the rabbit-hole. */ + gen_partprune_steps_internal(&context, rel, clauses, contradictory); + + return context.steps; +} + +/* + * prune_append_rel_partitions + * Returns RT indexes of the minimum set of child partitions which must + * be scanned to satisfy rel's baserestrictinfo quals. + * + * Callers must ensure that 'rel' is a partitioned table. + */ +Relids +prune_append_rel_partitions(RelOptInfo *rel) +{ + Relids result; + List *clauses = rel->baserestrictinfo; + List *pruning_steps; + bool contradictory; + PartitionPruneContext context; + Bitmapset *partindexes; + int i; + + Assert(clauses != NIL); + Assert(rel->part_scheme != NULL); + + /* If there are no partitions, return the empty set */ + if (rel->nparts == 0) + return NULL; + + /* + * Process clauses. If the clauses are found to be contradictory, we can + * return the empty set. + */ + pruning_steps = gen_partprune_steps(rel, clauses, &contradictory); + if (contradictory) + return NULL; + + context.strategy = rel->part_scheme->strategy; + context.partnatts = rel->part_scheme->partnatts; + context.partopfamily = rel->part_scheme->partopfamily; + context.partopcintype = rel->part_scheme->partopcintype; + context.partcollation = rel->part_scheme->partcollation; + context.partsupfunc = rel->part_scheme->partsupfunc; + context.nparts = rel->nparts; + context.boundinfo = rel->boundinfo; + + /* Actual pruning happens here. */ + partindexes = get_matching_partitions(&context, pruning_steps); + + /* Add selected partitions' RT indexes to result. */ + i = -1; + result = NULL; + while ((i = bms_next_member(partindexes, i)) >= 0) + result = bms_add_member(result, rel->part_rels[i]->relid); + + return result; +} + +/* + * get_matching_partitions + * Determine partitions that survive partition pruning + * + * Returns a Bitmapset of indexes of surviving partitions. + */ +Bitmapset * +get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) +{ + Bitmapset *result; + int num_steps = list_length(pruning_steps), + i; + PruneStepResult **results, + *final_result; + ListCell *lc; + + /* If there are no pruning steps then all partitions match. */ + if (num_steps == 0) + return bms_add_range(NULL, 0, context->nparts - 1); + + /* + * Allocate space for individual pruning steps to store its result. Each + * slot will hold a PruneStepResult after performing a given pruning step. + * Later steps may use the result of one or more earlier steps. The + * result of applying all pruning steps is the value contained in the slot + * of the last pruning step. 
+ */ + results = (PruneStepResult **) + palloc0(num_steps * sizeof(PruneStepResult *)); + foreach(lc, pruning_steps) + { + PartitionPruneStep *step = lfirst(lc); + + switch (nodeTag(step)) + { + case T_PartitionPruneStepOp: + results[step->step_id] = + perform_pruning_base_step(context, + (PartitionPruneStepOp *) step); + break; + + case T_PartitionPruneStepCombine: + results[step->step_id] = + perform_pruning_combine_step(context, + (PartitionPruneStepCombine *) step, + results); + break; + + default: + elog(ERROR, "invalid pruning step type: %d", + (int) nodeTag(step)); + } + } + + /* + * At this point we know the offsets of all the datums whose corresponding + * partitions need to be in the result, including special null-accepting + * and default partitions. Collect the actual partition indexes now. + */ + final_result = results[num_steps - 1]; + Assert(final_result != NULL); + i = -1; + result = NULL; + while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0) + { + int partindex = context->boundinfo->indexes[i]; + + /* + * In range and hash partitioning cases, some slots may contain -1, + * indicating that no partition has been defined to accept a given + * range of data or for a given remainder, respectively. The default + * partition, if any, in case of range partitioning, will be added to + * the result, because the specified range still satisfies the query's + * conditions. + */ + if (partindex >= 0) + result = bms_add_member(result, partindex); + } + + /* Add the null and/or default partition if needed and if present. */ + if (final_result->scan_null) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(partition_bound_accepts_nulls(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->null_index); + } + if (final_result->scan_default) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST || + context->strategy == PARTITION_STRATEGY_RANGE); + Assert(partition_bound_has_default(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->default_index); + } + + return result; +} + +/* + * gen_partprune_steps_internal + * Processes 'clauses' to generate partition pruning steps. + * + * From OpExpr clauses that are mutually AND'd, we find combinations of those + * that match to the partition key columns and for every such combination, + * we emit a PartitionPruneStepOp containing a vector of expressions whose + * values are used as a look up key to search partitions by comparing the + * values with partition bounds. Relevant details of the operator and a + * vector of (possibly cross-type) comparison functions is also included with + * each step. + * + * For BoolExpr clauses, we recursively generate steps for each argument, and + * return a PartitionPruneStepCombine of their results. + * + * The generated steps are added to the context's steps list. Each step is + * assigned a step identifier, unique even across recursive calls. + * + * If we find clauses that are mutually contradictory, or a pseudoconstant + * clause that contains false, we set *contradictory to true and return NIL + * (that is, no pruning steps). Caller should consider all partitions as + * pruned in that case. Otherwise, *contradictory is set to false. + * + * Note: the 'clauses' List may be modified inside this function. Callers may + * like to make a copy of it before passing them to this function. 
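Step execution is a small interpreter: each op step yields a set of bound offsets, each combine step folds the results of earlier steps with union or intersection, and the slot of the last step holds the final answer. The standalone model below mirrors that control flow with bitmasks standing in for the partition sets; the plan, the struct, and the names are all made up, and note how a union over zero sources selects every partition, which is how the dummy combine step emitted for an unprunable OR arm behaves.

#include <stdio.h>
#include <stdint.h>

#define ALL_PARTS 0xFFu        /* pretend there are 8 partitions */

typedef enum { STEP_OP, STEP_COMBINE_UNION, STEP_COMBINE_INTERSECT } StepKind;

typedef struct
{
    StepKind kind;
    uint32_t op_result;        /* STEP_OP only: offsets selected by the step */
    int      nsources;
    int      sources[4];       /* combine steps: ids of earlier steps */
} Step;

int
main(void)
{
    /*
     * Made-up plan for "a = 1 OR b > 10":
     *   step 0: op step for a = 1           -> {partition 2}
     *   step 1: union of no sources         -> all partitions (the arm on b
     *           matched no partition key, so it cannot prune anything)
     *   step 2: union of steps 0 and 1      -> final result
     */
    Step steps[] = {
        { STEP_OP,            (1u << 2), 0, {0} },
        { STEP_COMBINE_UNION, 0,         0, {0} },
        { STEP_COMBINE_UNION, 0,         2, {0, 1} },
    };
    uint32_t results[3];
    int nsteps = 3, i, j;

    for (i = 0; i < nsteps; i++)
    {
        switch (steps[i].kind)
        {
            case STEP_OP:
                results[i] = steps[i].op_result;
                break;
            case STEP_COMBINE_UNION:
                /* A union over no sources selects every partition. */
                results[i] = steps[i].nsources ? 0 : ALL_PARTS;
                for (j = 0; j < steps[i].nsources; j++)
                    results[i] |= results[steps[i].sources[j]];
                break;
            case STEP_COMBINE_INTERSECT:
                results[i] = ALL_PARTS;
                for (j = 0; j < steps[i].nsources; j++)
                    results[i] &= results[steps[i].sources[j]];
                break;
        }
    }

    /* The answer is whatever the last step produced. */
    printf("surviving partitions bitmask: 0x%02x\n", results[nsteps - 1]);
    return 0;
}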
+ */ +static List * +gen_partprune_steps_internal(GeneratePruningStepsContext *context, + RelOptInfo *rel, List *clauses, + bool *contradictory) +{ + PartitionScheme part_scheme = rel->part_scheme; + List *keyclauses[PARTITION_MAX_KEYS]; + Bitmapset *nullkeys = NULL, + *notnullkeys = NULL; + bool generate_opsteps = false; + List *result = NIL; + ListCell *lc; + + *contradictory = false; + + memset(keyclauses, 0, sizeof(keyclauses)); + foreach(lc, clauses) + { + Expr *clause = (Expr *) lfirst(lc); + int i; + + if (IsA(clause, RestrictInfo)) + { + RestrictInfo *rinfo = (RestrictInfo *) clause; + + clause = rinfo->clause; + if (rinfo->pseudoconstant && + IsA(rinfo->clause, Const) && + !DatumGetBool(((Const *) clause)->constvalue)) + { + *contradictory = true; + return NIL; + } + } + + /* Get the BoolExpr's out of the way. */ + if (IsA(clause, BoolExpr)) + { + /* + * Generate steps for arguments. + * + * While steps generated for the arguments themselves will be + * added to context->steps during recursion and will be evaluated + * independently, collect their step IDs to be stored in the + * combine step we'll be creating. + */ + if (or_clause((Node *) clause)) + { + List *arg_stepids = NIL; + bool all_args_contradictory = true; + ListCell *lc1; + + /* + * Get pruning step for each arg. If we get contradictory for + * all args, it means the OR expression is false as a whole. + */ + foreach(lc1, ((BoolExpr *) clause)->args) + { + Expr *arg = lfirst(lc1); + bool arg_contradictory; + List *argsteps; + + argsteps = + gen_partprune_steps_internal(context, rel, + list_make1(arg), + &arg_contradictory); + if (!arg_contradictory) + all_args_contradictory = false; + + if (argsteps != NIL) + { + PartitionPruneStep *step; + + Assert(list_length(argsteps) == 1); + step = (PartitionPruneStep *) linitial(argsteps); + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + else + { + /* + * No steps either means that arg_contradictory is + * true or the arg didn't contain a clause matching + * this partition key. + * + * In case of the latter, we cannot prune using such + * an arg. To indicate that to the pruning code, we + * must construct a dummy PartitionPruneStepCombine + * whose source_stepids is set to an empty List. + * However, if we can prove using constraint exclusion + * that the clause refutes the table's partition + * constraint (if it's sub-partitioned), we need not + * bother with that. That is, we effectively ignore + * this OR arm. + */ + List *partconstr = rel->partition_qual; + PartitionPruneStep *orstep; + + /* Just ignore this argument. 
*/ + if (arg_contradictory) + continue; + + if (partconstr) + { + partconstr = (List *) + expression_planner((Expr *) partconstr); + if (rel->relid != 1) + ChangeVarNodes((Node *) partconstr, 1, + rel->relid, 0); + if (predicate_refuted_by(partconstr, + list_make1(arg), + false)) + continue; + } + + orstep = gen_prune_step_combine(context, NIL, + PARTPRUNE_COMBINE_UNION); + arg_stepids = lappend_int(arg_stepids, orstep->step_id); + } + } + + *contradictory = all_args_contradictory; + + /* Check if any contradicting clauses were found */ + if (*contradictory) + return NIL; + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_UNION); + result = lappend(result, step); + } + continue; + } + else if (and_clause((Node *) clause)) + { + List *args = ((BoolExpr *) clause)->args; + List *argsteps, + *arg_stepids = NIL; + ListCell *lc1; + + /* + * args may itself contain clauses of arbitrary type, so just + * recurse and later combine the component partitions sets + * using a combine step. + */ + argsteps = gen_partprune_steps_internal(context, rel, args, + contradictory); + if (*contradictory) + return NIL; + + foreach(lc1, argsteps) + { + PartitionPruneStep *step = lfirst(lc1); + + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + continue; + } + + /* + * Fall-through for a NOT clause, which if it's a Boolean clause, + * will be handled in match_clause_to_partition_key(). We + * currently don't perform any pruning for more complex NOT + * clauses. + */ + } + + /* + * Must be a clause for which we can check if one of its args matches + * the partition key. + */ + for (i = 0; i < part_scheme->partnatts; i++) + { + Expr *partkey = linitial(rel->partexprs[i]); + bool clause_is_not_null = false; + PartClauseInfo *pc = NULL; + List *clause_steps = NIL; + + switch (match_clause_to_partition_key(rel, context, + clause, partkey, i, + &clause_is_not_null, + &pc, &clause_steps)) + { + case PARTCLAUSE_MATCH_CLAUSE: + Assert(pc != NULL); + + /* + * Since we only allow strict operators, check for any + * contradicting IS NULL. + */ + if (bms_is_member(i, nullkeys)) + { + *contradictory = true; + return NIL; + } + generate_opsteps = true; + keyclauses[i] = lappend(keyclauses[i], pc); + break; + + case PARTCLAUSE_MATCH_NULLNESS: + if (!clause_is_not_null) + { + /* check for conflicting IS NOT NULL */ + if (bms_is_member(i, notnullkeys)) + { + *contradictory = true; + return NIL; + } + nullkeys = bms_add_member(nullkeys, i); + } + else + { + /* check for conflicting IS NULL */ + if (bms_is_member(i, nullkeys)) + { + *contradictory = true; + return NIL; + } + notnullkeys = bms_add_member(notnullkeys, i); + } + break; + + case PARTCLAUSE_MATCH_STEPS: + Assert(clause_steps != NIL); + result = list_concat(result, clause_steps); + break; + + case PARTCLAUSE_MATCH_CONTRADICT: + /* We've nothing more to do if a contradiction was found. */ + *contradictory = true; + return NIL; + + case PARTCLAUSE_NOMATCH: + + /* + * Clause didn't match this key, but it might match the + * next one. + */ + continue; + + case PARTCLAUSE_UNSUPPORTED: + /* This clause cannot be used for pruning. */ + break; + + default: + Assert(false); + break; + } + + /* done; go check the next clause. 
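Because only strict operators are accepted, a key constrained by IS NULL cannot also satisfy a comparison or an IS NOT NULL clause, so the loop keeps two sets and reports a contradiction, which callers treat as "no partition can match". A small standalone model of that bookkeeping, with bitmasks standing in for the Bitmapsets and a made-up clause list:

#include <stdio.h>
#include <stdint.h>

typedef enum { CL_OPEXPR, CL_IS_NULL, CL_IS_NOT_NULL } ClauseKind;

typedef struct { int keyno; ClauseKind kind; } KeyClause;

int
main(void)
{
    /* Hypothetical matched clauses: key0 IS NULL AND key0 = 5. */
    KeyClause clauses[] = { {0, CL_IS_NULL}, {0, CL_OPEXPR} };
    uint32_t nullkeys = 0, notnullkeys = 0;
    int contradictory = 0, i;

    for (i = 0; i < 2 && !contradictory; i++)
    {
        KeyClause *c = &clauses[i];

        switch (c->kind)
        {
            case CL_OPEXPR:
                /* A strict comparison on a key already known to be NULL
                 * can never be true. */
                if (nullkeys & (1u << c->keyno))
                    contradictory = 1;
                break;
            case CL_IS_NULL:
                if (notnullkeys & (1u << c->keyno))
                    contradictory = 1;
                else
                    nullkeys |= (1u << c->keyno);
                break;
            case CL_IS_NOT_NULL:
                if (nullkeys & (1u << c->keyno))
                    contradictory = 1;
                else
                    notnullkeys |= (1u << c->keyno);
                break;
        }
    }

    printf("contradictory: %s\n", contradictory ? "yes" : "no");
    return 0;
}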
*/ + break; + } + } + + /* + * If generate_opsteps is set to false it means no OpExprs were directly + * present in the input list. + */ + if (!generate_opsteps) + { + /* + * Generate one prune step for the information derived from IS NULL, + * if any. To prune hash partitions, we must have found IS NULL + * clauses for all partition keys. + */ + if (!bms_is_empty(nullkeys) && + (part_scheme->strategy != PARTITION_STRATEGY_HASH || + bms_num_members(nullkeys) == part_scheme->partnatts)) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, nullkeys); + result = lappend(result, step); + } + + /* + * Note that for IS NOT NULL clauses, simply having step suffices; + * there is no need to propagate the exact details of which keys are + * required to be NOT NULL. Hash partitioning expects to see actual + * values to perform any pruning. + */ + if (!bms_is_empty(notnullkeys) && + part_scheme->strategy != PARTITION_STRATEGY_HASH) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, NULL); + result = lappend(result, step); + } + } + else + { + PartitionPruneStep *step; + + /* Generate pruning steps from OpExpr clauses in keyclauses. */ + step = gen_prune_steps_from_opexps(part_scheme, context, + keyclauses, nullkeys); + if (step != NULL) + result = lappend(result, step); + } + + /* + * Finally, results from all entries appearing in result should be + * combined using an INTERSECT combine step, if more than one. + */ + if (list_length(result) > 1) + { + List *step_ids = NIL; + + foreach(lc, result) + { + PartitionPruneStep *step = lfirst(lc); + + step_ids = lappend_int(step_ids, step->step_id); + } + + if (step_ids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, step_ids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + } + + return result; +} + +/* + * gen_prune_step_op + * Generate a pruning step for a specific operator + * + * The step is assigned a unique step identifier and added to context's 'steps' + * list. + */ +static PartitionPruneStep * +gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, + Bitmapset *nullkeys) +{ + PartitionPruneStepOp *opstep = makeNode(PartitionPruneStepOp); + + opstep->step.step_id = context->next_step_id++; + + /* + * For clauses that contain an <> operator, set opstrategy to + * InvalidStrategy to signal get_matching_list_bounds to do the right + * thing. + */ + if (op_is_ne) + { + Assert(opstrategy == BTEqualStrategyNumber); + opstep->opstrategy = InvalidStrategy; + } + else + opstep->opstrategy = opstrategy; + Assert(list_length(exprs) == list_length(cmpfns)); + opstep->exprs = exprs; + opstep->cmpfns = cmpfns; + opstep->nullkeys = nullkeys; + + context->steps = lappend(context->steps, opstep); + + return (PartitionPruneStep *) opstep; +} + +/* + * gen_prune_step_combine + * Generate a pruning step for a combination of several other steps + * + * The step is assigned a unique step identifier and added to context's + * 'steps' list. 
+ */ +static PartitionPruneStep * +gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp) +{ + PartitionPruneStepCombine *cstep = makeNode(PartitionPruneStepCombine); + + cstep->step.step_id = context->next_step_id++; + cstep->combineOp = combineOp; + cstep->source_stepids = source_stepids; + + context->steps = lappend(context->steps, cstep); + + return (PartitionPruneStep *) cstep; +} + +/* + * gen_prune_steps_from_opexps + * Generate pruning steps based on clauses for partition keys + * + * 'keyclauses' contains one list of clauses per partition key. We check here + * if we have found clauses for a valid subset of the partition key. In some + * cases, (depending on the type of partitioning being used) if we didn't + * find clauses for a given key, we discard clauses that may have been + * found for any subsequent keys; see specific notes below. + */ +static PartitionPruneStep * +gen_prune_steps_from_opexps(PartitionScheme part_scheme, + GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys) +{ + ListCell *lc; + List *opsteps = NIL; + List *btree_clauses[BTMaxStrategyNumber + 1], + *hash_clauses[HTMaxStrategyNumber + 1]; + bool need_next_less, + need_next_eq, + need_next_greater; + int i; + + memset(btree_clauses, 0, sizeof(btree_clauses)); + memset(hash_clauses, 0, sizeof(hash_clauses)); + for (i = 0; i < part_scheme->partnatts; i++) + { + List *clauselist = keyclauses[i]; + bool consider_next_key = true; + + /* + * To be useful for pruning, we must have clauses for a prefix of + * partition keys in the case of range partitioning. So, ignore + * clauses for keys after this one. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_RANGE && + clauselist == NIL) + break; + + /* + * For hash partitioning, if a column doesn't have the necessary + * equality clause, there should be an IS NULL clause, otherwise + * pruning is not possible. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_HASH && + clauselist == NIL && !bms_is_member(i, nullkeys)) + return NULL; + + need_next_eq = need_next_less = need_next_greater = true; + foreach(lc, clauselist) + { + PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc); + Oid lefttype, + righttype; + + /* Look up the operator's btree/hash strategy number. */ + if (pc->op_strategy == InvalidStrategy) + get_op_opfamily_properties(pc->opno, + part_scheme->partopfamily[i], + false, + &pc->op_strategy, + &lefttype, + &righttype); + + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + { + PartClauseInfo *last = NULL; + bool inclusive = false; + + /* + * Add this clause to the list of clauses to be used + * for pruning if this is the first such key for this + * operator strategy or if it is consecutively next to + * the last column for which a clause with this + * operator strategy was matched. + */ + if (btree_clauses[pc->op_strategy] != NIL) + last = llast(btree_clauses[pc->op_strategy]); + + if (last == NULL || + i == last->keyno || i == last->keyno + 1) + btree_clauses[pc->op_strategy] = + lappend(btree_clauses[pc->op_strategy], pc); + + /* + * We may not need the next clause if they're of + * certain strategy. + */ + switch (pc->op_strategy) + { + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + if (!inclusive) + need_next_eq = need_next_less = false; + break; + case BTEqualStrategyNumber: + /* always accept clauses for the next key. 
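The reason clauses after a missing range key are ignored is that range bounds are ordered lexicographically: a restriction on a later key alone says nothing about where matching rows fall in bound order, whereas an equality or inequality on the leading key does. The short standalone example below makes that concrete with made-up two-key partitions whose lower bounds differ only on the leading key; it is an illustration, not the real bound-search code.

#include <stdio.h>
#include <stdbool.h>

/*
 * Hypothetical range partitions on keys (a, b) with lower bounds
 * (0,0), (10,0), (20,0); partition i spans [lower[i], lower[i+1]).
 */
static const int lower_a[] = {0, 10, 20};
#define NPARTS 3

int
main(void)
{
    int i;

    /* "a = 10": only the partition whose a-range covers 10 survives. */
    for (i = 0; i < NPARTS; i++)
    {
        int lo = lower_a[i];
        int hi = (i + 1 < NPARTS) ? lower_a[i + 1] : 1000000;
        bool can_match = (lo <= 10 && 10 < hi);

        printf("a = 10      : partition %d %s\n", i,
               can_match ? "kept" : "pruned");
    }

    /* "b < 60" alone: rows with any value of a can have b < 60, so no
     * partition can be pruned -- which is why clauses for keys after a
     * missing leading range key are discarded. */
    for (i = 0; i < NPARTS; i++)
        printf("b < 60 only : partition %d kept\n", i);

    return 0;
}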
*/ + break; + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + if (!inclusive) + need_next_eq = need_next_greater = false; + break; + } + + /* We may want to change our mind. */ + if (consider_next_key) + consider_next_key = (need_next_eq || + need_next_less || + need_next_greater); + break; + } + + case PARTITION_STRATEGY_HASH: + if (pc->op_strategy != HTEqualStrategyNumber) + elog(ERROR, "invalid clause for hash partitioning"); + hash_clauses[pc->op_strategy] = + lappend(hash_clauses[pc->op_strategy], pc); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + } + + /* + * If we've decided that clauses for subsequent partition keys + * wouldn't be useful for pruning, don't search any further. + */ + if (!consider_next_key) + break; + } + + /* + * Now, we have divided clauses according to their operator strategies. + * Check for each strategy if we can generate pruning step(s) by + * collecting a list of expressions whose values will constitute a vector + * that can be used as a lookup key by a partition bound searching + * function. + */ + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + { + List *eq_clauses = btree_clauses[BTEqualStrategyNumber]; + List *le_clauses = btree_clauses[BTLessEqualStrategyNumber]; + List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber]; + int strat; + + /* + * For each clause under consideration for a given strategy, + * we collect expressions from clauses for earlier keys, whose + * operator strategy is inclusive, into a list called + * 'prefix'. By appending the clause's own expression to the + * 'prefix', we'll generate one step using the so generated + * vector and assign the current strategy to it. Actually, + * 'prefix' might contain multiple clauses for the same key, + * in which case, we must generate steps for various + * combinations of expressions of different keys, which + * get_steps_using_prefix takes care of for us. + */ + for (strat = 1; strat <= BTMaxStrategyNumber; strat++) + { + foreach(lc, btree_clauses[strat]) + { + PartClauseInfo *pc = lfirst(lc); + ListCell *lc1; + List *prefix = NIL; + List *pc_steps; + + /* + * Expressions from = clauses can always be in the + * prefix, provided they're from an earlier key. + */ + foreach(lc1, eq_clauses) + { + PartClauseInfo *eqpc = lfirst(lc1); + + if (eqpc->keyno == pc->keyno) + break; + if (eqpc->keyno < pc->keyno) + prefix = lappend(prefix, eqpc); + } + + /* + * If we're generating steps for keyno == pc->keyno) + break; + if (lepc->keyno < pc->keyno) + prefix = lappend(prefix, lepc); + } + } + + /* + * If we're generating steps for >/>= strategy, we can + * add other >= clauses to the prefix, provided + * they're from an earlier key. + */ + if (strat == BTGreaterStrategyNumber || + strat == BTGreaterEqualStrategyNumber) + { + foreach(lc1, ge_clauses) + { + PartClauseInfo *gepc = lfirst(lc1); + + if (gepc->keyno == pc->keyno) + break; + if (gepc->keyno < pc->keyno) + prefix = lappend(prefix, gepc); + } + } + + /* + * As mentioned above, if 'prefix' contains multiple + * expressions for the same key, the following will + * generate multiple steps, one for each combination + * of the expressions for different keys. + * + * Note that we pass NULL for step_nullkeys, because + * we don't search list/range partition bounds where + * some keys are NULL. 
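When the prefix holds more than one usable expression for the same earlier key, one pruning step is emitted per combination of choices across the keys, each producing its own lookup vector. A tiny standalone sketch of that cross-product expansion follows; the expression strings are purely illustrative stand-ins for the clause expressions.

#include <stdio.h>

int
main(void)
{
    /* Hypothetical prefix: two candidate expressions each for key 0 and
     * key 1, plus the single expression of the clause being processed
     * for key 2.  One step is emitted per combination. */
    const char *key0_exprs[] = {"1", "x"};
    const char *key1_exprs[] = {"y", "z + 1"};
    const char *key2_expr    = "10";
    int i, j, step_id = 0;

    for (i = 0; i < 2; i++)
        for (j = 0; j < 2; j++)
            printf("step %d: lookup key = (%s, %s, %s)\n",
                   step_id++, key0_exprs[i], key1_exprs[j], key2_expr);
    return 0;
}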
+ */ + Assert(pc->op_strategy == strat); + pc_steps = get_steps_using_prefix(context, strat, + pc->op_is_ne, + pc->expr, + pc->cmpfn, + pc->keyno, + NULL, + prefix); + opsteps = list_concat(opsteps, list_copy(pc_steps)); + } + } + break; + } + + case PARTITION_STRATEGY_HASH: + { + List *eq_clauses = hash_clauses[HTEqualStrategyNumber]; + + /* For hash partitioning, we have just the = strategy. */ + if (eq_clauses != NIL) + { + PartClauseInfo *pc; + List *pc_steps; + List *prefix = NIL; + int last_keyno; + ListCell *lc1; + + /* + * Locate the clause for the greatest column. This may + * not belong to the last partition key, but it is the + * clause belonging to the last partition key we found a + * clause for above. + */ + pc = llast(eq_clauses); + + /* + * There might be multiple clauses which matched to that + * partition key; find the first such clause. While at + * it, add all the clauses before that one to 'prefix'. + */ + last_keyno = pc->keyno; + foreach(lc, eq_clauses) + { + pc = lfirst(lc); + if (pc->keyno == last_keyno) + break; + prefix = lappend(prefix, pc); + } + + /* + * For each clause for the "last" column, after appending + * the clause's own expression to the 'prefix', we'll + * generate one step using the so generated vector and and + * assign = as its strategy. Actually, 'prefix' might + * contain multiple clauses for the same key, in which + * case, we must generate steps for various combinations + * of expressions of different keys, which + * get_steps_using_prefix will take care of for us. + */ + for_each_cell(lc1, lc) + { + pc = lfirst(lc1); + + /* + * Note that we pass nullkeys for step_nullkeys, + * because we need to tell hash partition bound search + * function which of the keys we found IS NULL clauses + * for. + */ + Assert(pc->op_strategy == HTEqualStrategyNumber); + pc_steps = + get_steps_using_prefix(context, + HTEqualStrategyNumber, + false, + pc->expr, + pc->cmpfn, + pc->keyno, + nullkeys, + prefix); + opsteps = list_concat(opsteps, list_copy(pc_steps)); + } + } + break; + } + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + + /* Lastly, add a combine step to mutually AND these op steps, if needed */ + if (list_length(opsteps) > 1) + { + List *opstep_ids = NIL; + + foreach(lc, opsteps) + { + PartitionPruneStep *step = lfirst(lc); + + opstep_ids = lappend_int(opstep_ids, step->step_id); + } + + if (opstep_ids != NIL) + return gen_prune_step_combine(context, opstep_ids, + PARTPRUNE_COMBINE_INTERSECT); + return NULL; + } + else if (opsteps != NIL) + return linitial(opsteps); + + return NULL; +} + +/* + * If the partition key has a collation, then the clause must have the same + * input collation. If the partition key is non-collatable, we assume the + * collation doesn't matter, because while collation wasn't considered when + * performing partitioning, the clause still may have a collation assigned + * due to the other input being of a collatable type. + * + * See also IndexCollMatchesExprColl. + */ +#define PartCollMatchesExprColl(partcoll, exprcoll) \ + ((partcoll) == InvalidOid || (partcoll) == (exprcoll)) + +/* + * match_clause_to_partition_key + * Attempt to match the given 'clause' with the specified partition key. + * + * Return value is: + * * PARTCLAUSE_NOMATCH if the clause doesn't match this partition key (but + * caller should keep trying, because it might match a subsequent key). + * Output arguments: none set. + * + * * PARTCLAUSE_MATCH_CLAUSE if there is a match. 
+ * Output arguments: *pc is set to a PartClauseInfo constructed for the + * matched clause. + * + * * PARTCLAUSE_MATCH_NULLNESS if there is a match, and the matched clause was + * either a "a IS NULL" or "a IS NOT NULL" clause. + * Output arguments: *clause_is_not_null is set to false in the former case + * true otherwise. + * + * * PARTCLAUSE_MATCH_STEPS if there is a match. + * Output arguments: *clause_steps is set to a list of PartitionPruneStep + * generated for the clause. + * + * * PARTCLAUSE_MATCH_CONTRADICT if the clause is self-contradictory. This can + * only happen if it's a BoolExpr whose arguments are self-contradictory. + * Output arguments: none set. + * + * * PARTCLAUSE_UNSUPPORTED if the clause cannot be used for pruning at all + * due to one of its properties, such as argument volatility, even if it may + * have been matched with a key. + * Output arguments: none set. + */ +static PartClauseMatchStatus +match_clause_to_partition_key(RelOptInfo *rel, + GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, PartClauseInfo **pc, + List **clause_steps) +{ + PartitionScheme part_scheme = rel->part_scheme; + Expr *expr; + Oid partopfamily = part_scheme->partopfamily[partkeyidx], + partcoll = part_scheme->partcollation[partkeyidx]; + + /* + * Recognize specially shaped clauses that match with the Boolean + * partition key. + */ + if (match_boolean_partition_clause(partopfamily, clause, partkey, &expr)) + { + PartClauseInfo *partclause; + + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + /* Do pruning with the Boolean equality operator. */ + partclause->opno = BooleanEqualOperator; + partclause->op_is_ne = false; + partclause->expr = expr; + /* We know that expr is of Boolean type. */ + partclause->cmpfn = rel->part_scheme->partsupfunc[partkeyidx].fn_oid; + partclause->op_strategy = InvalidStrategy; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, OpExpr) && + list_length(((OpExpr *) clause)->args) == 2) + { + OpExpr *opclause = (OpExpr *) clause; + Expr *leftop, + *rightop; + Oid commutator = InvalidOid, + negator = InvalidOid; + Oid cmpfn; + Oid exprtype; + bool is_opne_listp = false; + PartClauseInfo *partclause; + + leftop = (Expr *) get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + rightop = (Expr *) get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + /* check if the clause matches this partition key */ + if (equal(leftop, partkey)) + expr = rightop; + else if (equal(rightop, partkey)) + { + expr = leftop; + commutator = get_commutator(opclause->opno); + + /* nothing we can do unless we can swap the operands */ + if (!OidIsValid(commutator)) + return PARTCLAUSE_UNSUPPORTED; + } + else + /* clause does not match this partition key, but perhaps next. */ + return PARTCLAUSE_NOMATCH; + + /* + * Partition key also consists of a collation that's specified for it, + * so try to match it too. There may be multiple keys with the same + * expression but different collations. + */ + if (!PartCollMatchesExprColl(partcoll, opclause->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * Matched with this key. Now check various properties of the clause + * to see if it's sane to use it for pruning. If any of the + * properties makes it unsuitable for pruning, then the clause is + * useless no matter which key it's matched to. 
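+		 *
+		 * (Editor's illustration, not part of the original patch: a clause
+		 * such as "partkey = random()" is rejected below because the
+		 * comparison value is volatile and has no stable value at plan time;
+		 * non-strict operators are likewise rejected because they could
+		 * return true for NULL inputs, which would make it unsafe to skip
+		 * the null-accepting partition.)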
+ */ + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. + */ + if (!op_strict(opclause->opno)) + return PARTCLAUSE_UNSUPPORTED; + + /* We can't use any volatile expressions to prune partitions. */ + if (contain_volatile_functions((Node *) expr)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * Normally we only bother with operators that are listed as being + * part of the partitioning operator family. But we make an exception + * in one case -- operators named '<>' are not listed in any operator + * family whatsoever, in which case, we try to perform partition + * pruning with it only if list partitioning is in use. + */ + if (!op_in_opfamily(opclause->opno, partopfamily)) + { + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_UNSUPPORTED; + + /* + * To confirm if the operator is really '<>', check if its negator + * is a btree equality operator. + */ + negator = get_negator(opclause->opno); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + Oid lefttype; + Oid righttype; + int strategy; + + get_op_opfamily_properties(negator, partopfamily, false, + &strategy, &lefttype, &righttype); + + if (strategy == BTEqualStrategyNumber) + is_opne_listp = true; + } + + /* Operator isn't really what we were hoping it'd be. */ + if (!is_opne_listp) + return PARTCLAUSE_UNSUPPORTED; + } + + /* Check if we're going to need a cross-type comparison function. */ + exprtype = exprType((Node *) expr); + if (exprtype != part_scheme->partopcintype[partkeyidx]) + { + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + part_scheme->partopcintype[partkeyidx], + exprtype, BTORDER_PROC); + break; + + case PARTITION_STRATEGY_HASH: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + exprtype, exprtype, HASHEXTENDED_PROC); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + + /* If we couldn't find one, we cannot use this expression. */ + if (!OidIsValid(cmpfn)) + return PARTCLAUSE_UNSUPPORTED; + } + else + cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid; + + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + + /* For <> operator clauses, pass on the negator. */ + partclause->op_is_ne = false; + partclause->op_strategy = InvalidStrategy; + + if (is_opne_listp) + { + Assert(OidIsValid(negator)); + partclause->opno = negator; + partclause->op_is_ne = true; + + /* + * We already know the strategy in this case, so may as well set + * it rather than having to look it up later. 
+ */ + partclause->op_strategy = BTEqualStrategyNumber; + } + /* And if commuted before matching, pass on the commutator */ + else if (OidIsValid(commutator)) + partclause->opno = commutator; + else + partclause->opno = opclause->opno; + + partclause->expr = expr; + partclause->cmpfn = cmpfn; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + Oid saop_op = saop->opno; + Oid saop_coll = saop->inputcollid; + Expr *leftop = (Expr *) linitial(saop->args), + *rightop = (Expr *) lsecond(saop->args); + List *elem_exprs, + *elem_clauses; + ListCell *lc1; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* Check it matches this partition key */ + if (!equal(leftop, partkey) || + !PartCollMatchesExprColl(partcoll, saop->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * Matched with this key. Check various properties of the clause to + * see if it can sanely be used for partition pruning. + */ + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. + */ + if (!op_strict(saop->opno)) + return PARTCLAUSE_UNSUPPORTED; + + /* Useless if the array has any volatile functions. */ + if (contain_volatile_functions((Node *) rightop)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * In case of NOT IN (..), we get a '<>', which we handle if list + * partitioning is in use and we're able to confirm that it's negator + * is a btree equality operator belonging to the partitioning operator + * family. + */ + if (!op_in_opfamily(saop_op, partopfamily)) + { + Oid negator; + + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_UNSUPPORTED; + + negator = get_negator(saop_op); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + int strategy; + Oid lefttype, + righttype; + + get_op_opfamily_properties(negator, partopfamily, + false, &strategy, + &lefttype, &righttype); + if (strategy != BTEqualStrategyNumber) + return PARTCLAUSE_UNSUPPORTED; + } + } + + /* + * First generate a list of Const nodes, one for each array element + * (excepting nulls). + */ + elem_exprs = NIL; + if (IsA(rightop, Const)) + { + Const *arr = castNode(Const, rightop); + ArrayType *arrval = DatumGetArrayTypeP(arr->constvalue); + int16 elemlen; + bool elembyval; + char elemalign; + Datum *elem_values; + bool *elem_nulls; + int num_elems, + i; + + get_typlenbyvalalign(ARR_ELEMTYPE(arrval), + &elemlen, &elembyval, &elemalign); + deconstruct_array(arrval, + ARR_ELEMTYPE(arrval), + elemlen, elembyval, elemalign, + &elem_values, &elem_nulls, + &num_elems); + for (i = 0; i < num_elems; i++) + { + Const *elem_expr; + + /* Only consider non-null values. */ + if (elem_nulls[i]) + continue; + + elem_expr = makeConst(ARR_ELEMTYPE(arrval), -1, + arr->constcollid, elemlen, + elem_values[i], false, elembyval); + elem_exprs = lappend(elem_exprs, elem_expr); + } + } + else + { + ArrayExpr *arrexpr = castNode(ArrayExpr, rightop); + + /* + * For a nested ArrayExpr, we don't know how to get the actual + * scalar values out into a flat list, so we give up doing + * anything with this ScalarArrayOpExpr. 
+ */ + if (arrexpr->multidims) + return PARTCLAUSE_UNSUPPORTED; + + elem_exprs = arrexpr->elements; + } + + /* + * Now generate a list of clauses, one for each array element, of the + * form saop_leftop saop_op elem_expr + */ + elem_clauses = NIL; + foreach(lc1, elem_exprs) + { + Expr *rightop = (Expr *) lfirst(lc1), + *elem_clause; + + elem_clause = make_opclause(saop_op, BOOLOID, false, + leftop, rightop, + InvalidOid, saop_coll); + elem_clauses = lappend(elem_clauses, elem_clause); + } + + /* + * Build a combine step as if for an OR clause or add the clauses to + * the end of the list that's being processed currently. + */ + if (saop->useOr && list_length(elem_clauses) > 1) + { + Expr *orexpr; + bool contradictory; + + orexpr = makeBoolExpr(OR_EXPR, elem_clauses, -1); + *clause_steps = + gen_partprune_steps_internal(context, rel, list_make1(orexpr), + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + + Assert(list_length(*clause_steps) == 1); + return PARTCLAUSE_MATCH_STEPS; + } + else + { + bool contradictory; + + *clause_steps = + gen_partprune_steps_internal(context, rel, elem_clauses, + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + Assert(list_length(*clause_steps) >= 1); + return PARTCLAUSE_MATCH_STEPS; + } + } + else if (IsA(clause, NullTest)) + { + NullTest *nulltest = (NullTest *) clause; + Expr *arg = nulltest->arg; + + if (IsA(arg, RelabelType)) + arg = ((RelabelType *) arg)->arg; + + /* Does arg match with this partition key column? */ + if (!equal(arg, partkey)) + return PARTCLAUSE_NOMATCH; + + *clause_is_not_null = nulltest->nulltesttype == IS_NOT_NULL; + + return PARTCLAUSE_MATCH_NULLNESS; + } + + return PARTCLAUSE_UNSUPPORTED; +} + +/* + * get_steps_using_prefix + * Generate list of PartitionPruneStepOp steps each consisting of given + * opstrategy + * + * To generate steps, step_lastexpr and step_lastcmpfn are appended to + * expressions and cmpfns, respectively, extracted from the clauses in + * 'prefix'. Actually, since 'prefix' may contain multiple clauses for the + * same partition key column, we must generate steps for various combinations + * of the clauses of different keys. + */ +static List * +get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix) +{ + /* Quick exit if there are no values to prefix with. */ + if (list_length(prefix) == 0) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, + step_opstrategy, + step_op_is_ne, + list_make1(step_lastexpr), + list_make1_oid(step_lastcmpfn), + step_nullkeys); + return list_make1(step); + } + + /* Recurse to generate steps for various combinations. */ + return get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + list_head(prefix), + NIL, NIL); +} + +/* + * get_steps_using_prefix_recurse + * Recursively generate combinations of clauses for different partition + * keys and start generating steps upon reaching clauses for the greatest + * column that is less than the one for which we're currently generating + * steps (that is, step_lastkeyno) + * + * 'start' is where we should start iterating for the current invocation. + * 'step_exprs' and 'step_cmpfns' each contains the expressions and cmpfns + * we've generated so far from the clauses for the previous part keys. 
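+ *
+ * (Editor's illustration, not part of the original patch: with clauses a = 1
+ * and a = 2 on the first partition key, b = 3 on the second, and a step being
+ * generated for c < 4 on the third, the recursion below emits two steps, one
+ * for the lookup vector (1, 3, expr-of-c) and one for (2, 3, expr-of-c); each
+ * combination of per-key expressions becomes its own step.)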
+ */ +static List * +get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + ListCell *start, + List *step_exprs, + List *step_cmpfns) +{ + List *result = NIL; + ListCell *lc; + int cur_keyno; + + /* Actually, recursion would be limited by PARTITION_MAX_KEYS. */ + check_stack_depth(); + + /* Check if we need to recurse. */ + Assert(start != NULL); + cur_keyno = ((PartClauseInfo *) lfirst(start))->keyno; + if (cur_keyno < step_lastkeyno - 1) + { + PartClauseInfo *pc; + ListCell *next_start; + + /* + * For each clause with cur_keyno, adds its expr and cmpfn to + * step_exprs and step_cmpfns, respectively, and recurse after setting + * next_start to the ListCell of the first clause for the next + * partition key. + */ + for_each_cell(lc, start) + { + pc = lfirst(lc); + + if (pc->keyno > cur_keyno) + break; + } + next_start = lc; + + for_each_cell(lc, start) + { + List *moresteps; + + pc = lfirst(lc); + if (pc->keyno == cur_keyno) + { + /* clean up before starting a new recursion cycle. */ + if (cur_keyno == 0) + { + list_free(step_exprs); + list_free(step_cmpfns); + step_exprs = list_make1(pc->expr); + step_cmpfns = list_make1_oid(pc->cmpfn); + } + else + { + step_exprs = lappend(step_exprs, pc->expr); + step_cmpfns = lappend_oid(step_cmpfns, pc->cmpfn); + } + } + else + { + Assert(pc->keyno > cur_keyno); + break; + } + + moresteps = get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + next_start, + step_exprs, + step_cmpfns); + result = list_concat(result, moresteps); + } + } + else + { + /* + * End the current recursion cycle and start generating steps, one for + * each clause with cur_keyno, which is all clauses from here onward + * till the end of the list. + */ + Assert(list_length(step_exprs) == cur_keyno); + for_each_cell(lc, start) + { + PartClauseInfo *pc = lfirst(lc); + PartitionPruneStep *step; + List *step_exprs1, + *step_cmpfns1; + + Assert(pc->keyno == cur_keyno); + + /* Leave the original step_exprs unmodified. */ + step_exprs1 = list_copy(step_exprs); + step_exprs1 = lappend(step_exprs1, pc->expr); + step_exprs1 = lappend(step_exprs1, step_lastexpr); + + /* Leave the original step_cmpfns unmodified. */ + step_cmpfns1 = list_copy(step_cmpfns); + step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn); + step_cmpfns1 = lappend_oid(step_cmpfns1, step_lastcmpfn); + + step = gen_prune_step_op(context, + step_opstrategy, step_op_is_ne, + step_exprs1, step_cmpfns1, + step_nullkeys); + result = lappend(result, step); + } + } + + return result; +} + +/* + * get_matching_hash_bounds + * Determine offset of the hash bound matching the specified values, + * considering that all the non-null values come from clauses containing + * a compatible hash equality operator and any keys that are null come + * from an IS NULL clause. + * + * Generally this function will return a single matching bound offset, + * although if a partition has not been setup for a given modulus then we may + * return no matches. If the number of clauses found don't cover the entire + * partition key, then we'll need to return all offsets. + * + * 'opstrategy' if non-zero must be HTEqualStrategyNumber. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', the number of Datums in the 'values' array. 
+ * + * 'partsupfunc' contains partition hashing functions that can produce correct + * hash for the type of the values contained in 'values'. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int *partindices = boundinfo->indexes; + int partnatts = context->partnatts; + bool isnull[PARTITION_MAX_KEYS]; + int i; + uint64 rowHash; + int greatest_modulus; + + Assert(context->strategy == PARTITION_STRATEGY_HASH); + + /* + * For hash partitioning we can only perform pruning based on equality + * clauses to the partition key or IS NULL clauses. We also can only + * prune if we got values for all keys. + */ + if (nvalues + bms_num_members(nullkeys) == partnatts) + { + /* + * If there are any values, they must have come from clauses + * containing an equality operator compatible with hash partitioning. + */ + Assert(opstrategy == HTEqualStrategyNumber || nvalues == 0); + + for (i = 0; i < partnatts; i++) + isnull[i] = bms_is_member(i, nullkeys); + + greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); + rowHash = compute_hash_value(partnatts, partsupfunc, values, isnull); + + if (partindices[rowHash % greatest_modulus] >= 0) + result->bound_offsets = + bms_make_singleton(rowHash % greatest_modulus); + } + else + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + + /* + * There is neither a special hash null partition or the default hash + * partition. + */ + result->scan_null = result->scan_default = false; + + return result; +} + +/* + * get_matching_list_bounds + * Determine the offsets of list bounds matching the specified value, + * according to the semantics of the given operator strategy + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'value' contains the value to use for pruning. + * + * 'nvalues', if non-zero, should be exactly 1, because of list partitioning. + * + * 'partsupfunc' contains the list partitioning comparison function to be used + * to perform partition_list_bsearch + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int off, + minoff, + maxoff; + bool is_equal; + bool inclusive = false; + Oid *partcollation = context->partcollation; + + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(context->partnatts == 1); + + result->scan_null = result->scan_default = false; + + if (!bms_is_empty(nullkeys)) + { + /* + * Nulls may exist in only one partition - the partition whose + * accepted set of values includes null or the default partition if + * the former doesn't exist. + */ + if (partition_bound_accepts_nulls(boundinfo)) + result->scan_null = true; + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* + * If there are no datums to compare keys with, but there are partitions, + * just return the default partition if one exists. 
+ */ + if (boundinfo->ndatums == 0) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums - 1; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default if any. + */ + if (nvalues == 0) + { + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* Special case handling of values coming from a <> operator clause. */ + if (opstrategy == InvalidStrategy) + { + /* + * First match to all bounds. We'll remove any matching datums below. + */ + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + + off = partition_list_bsearch(partsupfunc, partcollation, boundinfo, + value, &is_equal); + if (off >= 0 && is_equal) + { + + /* We have a match. Remove from the result. */ + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_del_member(result->bound_offsets, + off); + } + + /* Always include the default partition if any. */ + result->scan_default = partition_bound_has_default(boundinfo); + + return result; + } + + /* + * With range queries, always include the default list partition, because + * list partitions divide the key space in a discontinuous manner, not all + * values in the given range will have a partition assigned. This may not + * technically be true for some data types (e.g. integer types), however, + * we currently lack any sort of infrastructure to provide us with proofs + * that would allow us to do anything smarter here. + */ + if (opstrategy != BTEqualStrategyNumber) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal) + { + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_make_singleton(off); + } + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0) + { + /* We don't want the matched datum to be in the result. */ + if (!is_equal || !inclusive) + off++; + } + else + { + /* + * This case means all partition bounds are greater, which in + * turn means that all partitions satisfy this key. + */ + off = 0; + } + + /* + * off is greater than the numbers of datums we have partitions + * for. The only possible partition that could contain a match is + * the default partition, but we must've set context->scan_default + * above anyway if one exists. + */ + if (off > boundinfo->ndatums - 1) + return result; + + minoff = off; + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal && !inclusive) + off--; + + /* + * off is smaller than the datums of all non-default partitions. + * The only possible partition that could contain a match is the + * default partition, but we must've set context->scan_default + * above anyway if one exists. 
+ */ + if (off < 0) + return result; + + maxoff = off; + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + return result; +} + + +/* + * get_matching_range_datums + * Determine the offsets of range bounds matching the specified values, + * according to the semantics of the given operator strategy + * + * Each datum whose offset is in result is to be treated as the upper bound of + * the partition that will contain the desired values. + * + * If default partition needs to be scanned for given values, set scan_default + * in result if present. + * + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', number of Datums in 'values' array. Must be <= context->partnatts. + * + * 'partsupfunc' contains the range partitioning comparison functions to be + * used to perform partition_range_datum_bsearch or partition_rbound_datum_cmp + * using. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + Oid *partcollation = context->partcollation; + int partnatts = context->partnatts; + int *partindices = boundinfo->indexes; + int off, + minoff, + maxoff, + i; + bool is_equal; + bool inclusive = false; + + Assert(context->strategy == PARTITION_STRATEGY_RANGE); + Assert(nvalues <= partnatts); + + result->scan_null = result->scan_default = false; + + /* + * If there are no datums to compare keys with, or if we got an IS NULL + * clause just return the default partition, if it exists. + */ + if (boundinfo->ndatums == 0 || !bms_is_empty(nullkeys)) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default partition if one + * exists. + */ + if (nvalues == 0) + { + if (partindices[minoff] < 0) + minoff++; + if (partindices[maxoff] < 0) + maxoff--; + + result->scan_default = partition_bound_has_default(boundinfo); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; + } + + /* + * If the query does not constrain all key columns, we'll need to scan the + * the default partition, if any. + */ + if (nvalues < partnatts) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + /* Look for the smallest bound that is = lookup value. */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + + if (off >= 0 && is_equal) + { + if (nvalues == partnatts) + { + /* There can only be zero or one matching partition. 
*/ + if (partindices[off + 1] >= 0) + result->bound_offsets = bms_make_singleton(off + 1); + else + result->scan_default = + partition_bound_has_default(boundinfo); + return result; + } + else + { + int saved_off = off; + + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + */ + + /* + * First find greatest bound that's smaller than the + * lookup value. + */ + while (off >= 1) + { + int32 cmpval; + + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off - 1], + boundinfo->kind[off - 1], + values, nvalues); + if (cmpval != 0) + break; + off--; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * We can treat 'off' as the offset of the smallest bound + * to be included in the result, if we know it is the + * upper bound of the partition in which the lookup value + * could possibly exist. One case it couldn't is if the + * bound, or precisely the matched portion of its prefix, + * is not inclusive. + */ + if (boundinfo->kind[off][nvalues] == + PARTITION_RANGE_DATUM_MINVALUE) + off++; + + minoff = off; + + /* + * Now find smallest bound that's greater than the lookup + * value. + */ + off = saved_off; + while (off < boundinfo->ndatums - 1) + { + int32 cmpval; + + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off + 1], + boundinfo->kind[off + 1], + values, nvalues); + if (cmpval != 0) + break; + off++; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * off + 1, then would be the offset of the greatest bound + * to be included in the result. + */ + maxoff = off + 1; + } + + /* + * Skip if minoff/maxoff are actually the upper bound of a + * un-assigned portion of values. + */ + if (partindices[minoff] < 0 && minoff < boundinfo->ndatums) + minoff++; + if (partindices[maxoff] < 0 && maxoff >= 1) + maxoff--; + + /* + * There may exist a range of values unassigned to any + * non-default partition between the datums at minoff and + * maxoff. Add the default partition in that case. + */ + if (partition_bound_has_default(boundinfo)) + { + for (i = minoff; i <= maxoff; i++) + { + if (partindices[i] < 0) + { + result->scan_default = true; + break; + } + } + } + + Assert(minoff >= 0 && maxoff >= 0); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + } + else if (off >= 0) /* !is_equal */ + { + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the only + * partition that may contain the lookup value. + */ + if (partindices[off + 1] >= 0) + result->bound_offsets = bms_make_singleton(off + 1); + else + result->scan_default = + partition_bound_has_default(boundinfo); + } + else + { + /* + * off < 0: the lookup value is smaller than all bounds, so + * only the default partition qualifies, if there is one. 
+ */ + result->scan_default = partition_bound_has_default(boundinfo); + } + + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + + /* + * Look for the smallest bound that is > or >= lookup value and + * set minoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off < 0) + { + /* + * All bounds are greater than the lookup value, so include + * all of them in the result. + */ + minoff = 0; + } + else + { + if (is_equal && nvalues < partnatts) + { + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + * + * Based on whether the lookup values are inclusive or + * not, we must either include the indexes of all such + * bounds in the result (that is, set minoff to the index + * of smallest such bound) or find the smallest one that's + * greater than the lookup values and set minoff to that. + */ + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off - 1 : off + 1; + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + minoff = inclusive ? off : off + 1; + } + + /* + * lookup value falls in the range between some bounds in + * boundinfo. off would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the smallest + * partition that may contain the lookup value. + */ + else + minoff = off + 1; + } + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + + /* + * Look for the greatest bound that is < or <= lookup value and + * set minoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off < 0) + { + /* + * All bounds are greater than the key, so we could only + * expect to find the lookup key in the default partition. + */ + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + else + { + /* + * See the comment above. + */ + if (is_equal && nvalues < partnatts) + { + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off + 1 : off - 1; + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + maxoff = inclusive ? off + 1 : off; + } + + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the greatest + * partition that may contain lookup value. 
If the lookup + * value had exactly matched the bound, but it isn't + * inclusive, no need add the adjacent partition. + */ + else if (!is_equal || inclusive) + maxoff = off + 1; + else + maxoff = off; + } + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + /* + * Skip a gap and when doing so, check if the bound contains a finite + * value to decide if we need to add the default partition. If it's an + * infinite bound, we need not add the default partition, as having an + * infinite bound means the partition in question catches any values that + * would otherwise be in the default partition. + */ + if (partindices[minoff] < 0) + { + int lastkey = nvalues - 1; + + if (minoff >= 0 && + minoff < boundinfo->ndatums && + boundinfo->kind[minoff][lastkey] == + PARTITION_RANGE_DATUM_VALUE) + result->scan_default = partition_bound_has_default(boundinfo); + + minoff++; + } + + /* + * Skip a gap. See the above comment about how we decide whether or or + * not to scan the default partition based whether the datum that will + * become the maximum datum is finite or not. + */ + if (maxoff >= 1 && partindices[maxoff] < 0) + { + int lastkey = nvalues - 1; + + if (maxoff >= 0 && + maxoff <= boundinfo->ndatums && + boundinfo->kind[maxoff - 1][lastkey] == + PARTITION_RANGE_DATUM_VALUE) + result->scan_default = partition_bound_has_default(boundinfo); + + maxoff--; + } + + if (partition_bound_has_default(boundinfo)) + { + /* + * There may exist a range of values unassigned to any non-default + * partition between the datums at minoff and maxoff. Add the default + * partition in that case. + */ + for (i = minoff; i <= maxoff; i++) + { + if (partindices[i] < 0) + { + result->scan_default = true; + break; + } + } + } + + Assert(minoff >= 0 && maxoff >= 0); + if (minoff <= maxoff) + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; +} + +/* + * perform_pruning_base_step + * Determines the indexes of datums that satisfy conditions specified in + * 'opstep'. + * + * Result also contains whether special null-accepting and/or default + * partition need to be scanned. + */ +static PruneStepResult * +perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep) +{ + ListCell *lc1, + *lc2; + int keyno, + nvalues; + Datum values[PARTITION_MAX_KEYS]; + FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; + + /* + * There better be the same number of expressions and compare functions. + */ + Assert(list_length(opstep->exprs) == list_length(opstep->cmpfns)); + + nvalues = 0; + lc1 = list_head(opstep->exprs); + lc2 = list_head(opstep->cmpfns); + + /* + * Generate the partition lookup key that will be used by one of the + * get_matching_*_bounds functions called below. + */ + for (keyno = 0; keyno < context->partnatts; keyno++) + { + /* + * For hash partitioning, it is possible that values of some keys are + * not provided in operator clauses, but instead the planner found + * that they appeared in a IS NULL clause. + */ + if (bms_is_member(keyno, opstep->nullkeys)) + continue; + + /* + * For range partitioning, we must only perform pruning with values + * for either all partition keys or a prefix thereof. 
+ */ + if (keyno > nvalues && context->strategy == PARTITION_STRATEGY_RANGE) + break; + + if (lc1 != NULL) + { + Expr *expr; + Datum datum; + + expr = lfirst(lc1); + if (partkey_datum_from_expr(context, expr, &datum)) + { + Oid cmpfn; + + /* + * If we're going to need a different comparison function than + * the one cached in the PartitionKey, we'll need to look up + * the FmgrInfo. + */ + cmpfn = lfirst_oid(lc2); + Assert(OidIsValid(cmpfn)); + if (cmpfn != context->partsupfunc[keyno].fn_oid) + fmgr_info(cmpfn, &partsupfunc[keyno]); + else + fmgr_info_copy(&partsupfunc[keyno], + &context->partsupfunc[keyno], + CurrentMemoryContext); + + values[keyno] = datum; + nvalues++; + } + + lc1 = lnext(lc1); + lc2 = lnext(lc2); + } + } + + switch (context->strategy) + { + case PARTITION_STRATEGY_HASH: + return get_matching_hash_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + case PARTITION_STRATEGY_LIST: + return get_matching_list_bounds(context, + opstep->opstrategy, + values[0], nvalues, + &partsupfunc[0], + opstep->nullkeys); + + case PARTITION_STRATEGY_RANGE: + return get_matching_range_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) context->strategy); + break; + } + + return NULL; +} + +/* + * perform_pruning_combine_step + * Determines the indexes of datums obtained by combining those given + * by the steps identified by cstep->source_stepids using the specified + * combination method + * + * Since cstep may refer to the result of earlier steps, we also receive + * step_results here. + */ +static PruneStepResult * +perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results) +{ + ListCell *lc1; + PruneStepResult *result = NULL; + bool firststep; + + /* + * A combine step without any source steps is an indication to not perform + * any partition pruning, we just return all partitions. + */ + result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + if (list_length(cstep->source_stepids) == 0) + { + PartitionBoundInfo boundinfo = context->boundinfo; + + result->bound_offsets = bms_add_range(NULL, 0, boundinfo->ndatums - 1); + result->scan_default = partition_bound_has_default(boundinfo); + result->scan_null = partition_bound_accepts_nulls(boundinfo); + return result; + } + + switch (cstep->combineOp) + { + case PARTPRUNE_COMBINE_UNION: + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + /* + * step_results[step_id] must contain a valid result, which is + * confirmed by the fact that cstep's step_id is greater than + * step_id and the fact that results of the individual steps + * are evaluated in sequence of their step_ids. + */ + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + /* Record any additional datum indexes from this step */ + result->bound_offsets = bms_add_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. 
*/ + if (!result->scan_null) + result->scan_null = step_result->scan_null; + if (!result->scan_default) + result->scan_default = step_result->scan_default; + } + break; + + case PARTPRUNE_COMBINE_INTERSECT: + firststep = true; + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + if (firststep) + { + /* Copy step's result the first time. */ + result->bound_offsets = step_result->bound_offsets; + result->scan_null = step_result->scan_null; + result->scan_default = step_result->scan_default; + firststep = false; + } + else + { + /* Record datum indexes common to both steps */ + result->bound_offsets = + bms_int_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. */ + if (result->scan_null) + result->scan_null = step_result->scan_null; + if (result->scan_default) + result->scan_default = step_result->scan_default; + } + } + break; + + default: + elog(ERROR, "invalid pruning combine op: %d", + (int) cstep->combineOp); + } + + return result; +} + +/* + * match_boolean_partition_clause + * + * Sets *outconst to a Const containing true or false value and returns true if + * we're able to match the clause to the partition key as specially-shaped + * Boolean clause. Returns false otherwise with *outconst set to NULL. + */ +static bool +match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, + Expr **outconst) +{ + Expr *leftop; + + *outconst = NULL; + + if (!IsBooleanOpfamily(partopfamily)) + return false; + + if (IsA(clause, BooleanTest)) + { + BooleanTest *btest = (BooleanTest *) clause; + + /* Only IS [NOT] TRUE/FALSE are any good to us */ + if (btest->booltesttype == IS_UNKNOWN || + btest->booltesttype == IS_NOT_UNKNOWN) + return false; + + leftop = btest->arg; + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + if (equal(leftop, partkey)) + *outconst = (btest->booltesttype == IS_TRUE || + btest->booltesttype == IS_NOT_FALSE) + ? (Expr *) makeBoolConst(true, false) + : (Expr *) makeBoolConst(false, false); + + if (*outconst) + return true; + } + else + { + bool is_not_clause = not_clause((Node *) clause); + + leftop = is_not_clause ? get_notclausearg(clause) : clause; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* Compare to the partition key, and make up a clause ... */ + if (equal(leftop, partkey)) + *outconst = is_not_clause ? + (Expr *) makeBoolConst(false, false) : + (Expr *) makeBoolConst(true, false); + else if (equal(negate_clause((Node *) leftop), partkey)) + *outconst = (Expr *) makeBoolConst(false, false); + + if (*outconst) + return true; + } + + return false; +} + +/* + * partkey_datum_from_expr + * Evaluate 'expr', set *value to the resulting Datum. Return true if + * evaluation was possible, otherwise false. 
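+ *
+ * (Editor's note, not part of the original patch: only Const expressions are
+ * evaluated here, so a clause like "a = 5" produces a usable Datum, while
+ * "a = $1" or a comparison against a stable function's result does not and
+ * therefore cannot contribute to plan-time pruning in this version.)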
+ */ +static bool +partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, Datum *value) +{ + switch (nodeTag(expr)) + { + case T_Const: + *value = ((Const *) expr)->constvalue; + return true; + + default: + break; + } + + return false; +} diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 3d8b08ba..6cade9aa 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -26,7 +26,7 @@ * PartitionBoundInfo encapsulates a set of partition bounds. It is usually * associated with partitioned tables as part of its partition descriptor. * - * The internal structure is opaque outside partition.c. + * The internal structure appears in partbounds.h. */ typedef struct PartitionBoundInfoData *PartitionBoundInfo; @@ -69,7 +69,6 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); -/* For tuple routing */ extern int get_partition_for_tuple(Relation relation, Datum *values, bool *isnull); diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h index 044e4076..599fdd41 100644 --- a/src/include/catalog/pg_opfamily.h +++ b/src/include/catalog/pg_opfamily.h @@ -53,6 +53,9 @@ typedef FormData_pg_opfamily *Form_pg_opfamily; #define Anum_pg_opfamily_opfnamespace 3 #define Anum_pg_opfamily_opfowner 4 +#define IsBooleanOpfamily(opfamily) \ + ((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID) + /* ---------------- * initial contents of pg_opfamily * ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 43f90ba9..2f585807 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -246,6 +246,9 @@ typedef enum NodeTag T_FromExpr, T_OnConflictExpr, T_IntoClause, + T_PartitionPruneStep, + T_PartitionPruneStepOp, + T_PartitionPruneStepCombine, #ifdef PGXC T_DistributeBy, T_PGXCSubCluster, @@ -328,7 +331,6 @@ typedef enum NodeTag T_PlaceHolderVar, T_SpecialJoinInfo, T_AppendRelInfo, - T_PartitionedChildRelInfo, T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index faee37ff..a8df3382 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -20,6 +20,7 @@ #define PRIMNODES_H #include "access/attnum.h" +#include "access/stratnum.h" #include "nodes/bitmapset.h" #include "nodes/pg_list.h" @@ -1597,4 +1598,78 @@ typedef struct OnConflictExpr List *exclRelTlist; /* tlist of the EXCLUDED pseudo relation */ } OnConflictExpr; + +/* + * Node types to represent a partition pruning step. + */ + +/* + * The base Node type. step_id is the global identifier of a given step + * within a given pruning context. + */ +typedef struct PartitionPruneStep +{ + NodeTag type; + int step_id; +} PartitionPruneStep; + +/*---------- + * PartitionPruneStepOp - Information to prune using a set of mutually AND'd + * OpExpr clauses + * + * This contains information extracted from up to partnatts OpExpr clauses, + * where partnatts is the number of partition key columns. 'opstrategy' is the + * strategy of the operator in the clause matched to the last partition key. + * 'exprs' contains expressions which comprise the lookup key to be passed to + * the partition bound search function. 'cmpfns' contains the OIDs of + * comparison function used to compare aforementioned expressions with + * partition bounds. 
Both 'exprs' and 'cmpfns' contain the same number of + * items up to partnatts items. + * + * Once we find the offset of a partition bound using the lookup key, we + * determine which partitions to include in the result based on the value of + * 'opstrategy'. For example, if it were equality, we'd return just the + * partition that would contain that key or a set of partitions if the key + * didn't consist of all partitioning columns. For non-equality strategies, + * we'd need to include other partitions as appropriate. + * + * 'nullkeys' is the set containing the offset of the partition keys (0 to + * partnatts - 1) that were matched to an IS NULL clause. This is only + * considered for hash partitioning as we need to pass which keys are null + * to the hash partition bound search function. It is never possible to + * have an expression be present in 'exprs' for a given partition key and + * the corresponding bit set in 'nullkeys'. + *---------- + */ +typedef struct PartitionPruneStepOp +{ + PartitionPruneStep step; + + StrategyNumber opstrategy; + List *exprs; + List *cmpfns; + Bitmapset *nullkeys; +} PartitionPruneStepOp; + +/*---------- + * PartitionPruneStepCombine - Information to prune using a BoolExpr clause + * + * For BoolExpr clauses, we combine the set of partitions determined for each + * of its argument clauses. + *---------- + */ +typedef enum PartitionPruneCombineOp +{ + PARTPRUNE_COMBINE_UNION, + PARTPRUNE_COMBINE_INTERSECT +} PartitionPruneCombineOp; + +typedef struct PartitionPruneStepCombine +{ + PartitionPruneStep step; + + PartitionPruneCombineOp combineOp; + List *source_stepids; +} PartitionPruneStepCombine; + #endif /* PRIMNODES_H */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 6172b31e..e49bc1a0 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -76,6 +76,7 @@ #define RELATION_H #include "access/sdir.h" +#include "fmgr.h" #include "lib/stringinfo.h" #include "nodes/params.h" #include "nodes/parsenodes.h" @@ -345,8 +346,6 @@ typedef struct PlannerInfo List *append_rel_list; /* list of AppendRelInfos */ - List *pcinfo_list; /* list of PartitionedChildRelInfos */ - List *rowMarks; /* list of PlanRowMarks */ List *placeholder_list; /* list of PlaceHolderInfos */ @@ -417,6 +416,9 @@ typedef struct PlannerInfo /* optional private data for join_search_hook, e.g., GEQO */ void *join_search_private; + + /* Does this query modify any partition key columns? */ + bool partColsUpdated; #ifdef XCP /* * This is NULL for a SELECT query (NULL distribution means "Coordinator" @@ -468,6 +470,9 @@ typedef struct PartitionSchemeData /* Cached information about partition key data types. */ int16 *parttyplen; bool *parttypbyval; + + /* Cached information about partition comparison functions. 
*/ + FmgrInfo *partsupfunc; } PartitionSchemeData; typedef struct PartitionSchemeData *PartitionScheme; @@ -641,10 +646,14 @@ typedef struct PartitionSchemeData *PartitionScheme; * If the relation is partitioned, these fields will be set: * * part_scheme - Partitioning scheme of the relation - * boundinfo - Partition bounds * nparts - Number of partitions + * boundinfo - Partition bounds + * partition_qual - Partition constraint if not the root * part_rels - RelOptInfos for each partition * partexprs, nullable_partexprs - Partition key expressions + * partitioned_child_rels - RT indexes of unpruned partitions of + * relation that are partitioned tables + * themselves * * Note: A base relation always has only one set of partition keys, but a join * relation may have as many sets of partition keys as the number of relations @@ -771,10 +780,12 @@ typedef struct RelOptInfo PartitionScheme part_scheme; /* Partitioning scheme. */ int nparts; /* number of partitions */ struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ + List *partition_qual; /* partition constraint */ struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, * stored in the same order of bounds */ List **partexprs; /* Non-nullable partition key expressions. */ List **nullable_partexprs; /* Nullable partition key expressions. */ + List *partitioned_child_rels; /* List of RT indexes. */ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ @@ -2251,27 +2262,6 @@ typedef struct AppendRelInfo Oid parent_reloid; /* OID of parent relation */ } AppendRelInfo; -/* - * For a partitioned table, this maps its RT index to the list of RT indexes - * of the partitioned child tables in the partition tree. We need to - * separately store this information, because we do not create AppendRelInfos - * for the partitioned child tables of a parent table, since AppendRelInfos - * contain information that is unnecessary for the partitioned child tables. - * The child_rels list must contain at least one element, because the parent - * partitioned table is itself counted as a child. - * - * These structs are kept in the PlannerInfo node's pcinfo_list. - */ -typedef struct PartitionedChildRelInfo -{ - NodeTag type; - - Index parent_relid; - List *child_rels; - bool part_cols_updated; /* is the partition key of any of - * the partitioned tables updated? */ -} PartitionedChildRelInfo; - /* * For each distinct placeholder expression generated during planning, we * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list. 
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 1425e543..2e47c1e3 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -121,11 +121,6 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated); -extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, - Relids join_relids); - extern void preprocess_rowmarks(PlannerInfo *root); #ifdef __TBASE__ diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h new file mode 100644 index 00000000..c76014d4 --- /dev/null +++ b/src/include/partitioning/partbounds.h @@ -0,0 +1,124 @@ +/*------------------------------------------------------------------------- + * + * partbounds.h + * + * Copyright (c) 2007-2018, PostgreSQL Global Development Group + * + * src/include/partitioning/partbounds.h + * + *------------------------------------------------------------------------- + */ +#ifndef PARTBOUNDS_H +#define PARTBOUNDS_H + +#include "catalog/partition.h" + + +/* + * PartitionBoundInfoData encapsulates a set of partition bounds. It is + * usually associated with partitioned tables as part of its partition + * descriptor, but may also be used to represent a virtual partitioned + * table such as a partitioned joinrel within the planner. + * + * A list partition datum that is known to be NULL is never put into the + * datums array. Instead, it is tracked using the null_index field. + * + * In the case of range partitioning, ndatums will typically be far less than + * 2 * nparts, because a partition's upper bound and the next partition's lower + * bound are the same in most common cases, and we only store one of them (the + * upper bound). In case of hash partitioning, ndatums will be same as the + * number of partitions. + * + * For range and list partitioned tables, datums is an array of datum-tuples + * with key->partnatts datums each. For hash partitioned tables, it is an array + * of datum-tuples with 2 datums, modulus and remainder, corresponding to a + * given partition. + * + * The datums in datums array are arranged in increasing order as defined by + * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and + * qsort_partition_hbound_cmp() for range, list and hash partitioned tables + * respectively. For range and list partitions this simply means that the + * datums in the datums array are arranged in increasing order as defined by + * the partition key's operator classes and collations. + * + * In the case of list partitioning, the indexes array stores one entry for + * every datum, which is the index of the partition that accepts a given datum. + * In case of range partitioning, it stores one entry per distinct range + * datum, which is the index of the partition for which a given datum + * is an upper bound. In the case of hash partitioning, the number of the + * entries in the indexes array is same as the greatest modulus amongst all + * partitions. For a given partition key datum-tuple, the index of the + * partition which would accept that datum-tuple would be given by the entry + * pointed by remainder produced when hash value of the datum-tuple is divided + * by the greatest modulus. + */ + +typedef struct PartitionBoundInfoData +{ + char strategy; /* hash, list or range? 
*/ + int ndatums; /* Length of the datums following array */ + Datum **datums; + PartitionRangeDatumKind **kind; /* The kind of each range bound datum; + * NULL for hash and list partitioned + * tables */ + int *indexes; /* Partition indexes */ + int null_index; /* Index of the null-accepting partition; -1 + * if there isn't one */ + int default_index; /* Index of the default partition; -1 if there + * isn't one */ +} PartitionBoundInfoData; + +#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) +#define partition_bound_has_default(bi) ((bi)->default_index != -1) + +/* + * When qsort'ing partition bounds after reading from the catalog, each bound + * is represented with one of the following structs. + */ + +/* One bound of a hash partition */ +typedef struct PartitionHashBound +{ + int modulus; + int remainder; + int index; +} PartitionHashBound; + +/* One value coming from some (index'th) list partition */ +typedef struct PartitionListValue +{ + int index; + Datum value; +} PartitionListValue; + +/* One bound of a range partition */ +typedef struct PartitionRangeBound +{ + int index; + Datum *datums; /* range bound datums */ + PartitionRangeDatumKind *kind; /* the kind of each datum */ + bool lower; /* this is the lower (vs upper) bound */ +} PartitionRangeBound; + +extern int get_hash_partition_greatest_modulus(PartitionBoundInfo b); +extern int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, + PartitionBoundInfo boundinfo, + Datum value, bool *is_equal); +extern int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal); +extern int partition_range_datum_bsearch(FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal); +extern int partition_hash_bsearch(PartitionBoundInfo boundinfo, + int modulus, int remainder); +extern uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull); +extern int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, + Oid *partcollation, + Datum *rb_datums, PartitionRangeDatumKind *rb_kind, + Datum *tuple_datums, int n_tuple_datums); + +#endif /* PARTBOUNDS_H */ diff --git a/src/include/partitioning/partprune.h b/src/include/partitioning/partprune.h new file mode 100644 index 00000000..52fadc7c --- /dev/null +++ b/src/include/partitioning/partprune.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * partprune.h + * prototypes for partprune.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/partitioning/partprune.h + * + *------------------------------------------------------------------------- + */ +#ifndef PARTPRUNE_H +#define PARTPRUNE_H + +#include "catalog/partition.h" +#include "nodes/relation.h" + +/* + * PartitionPruneContext + * + * Information about a partitioned table needed to perform partition pruning. 
+ */ +typedef struct PartitionPruneContext +{ + /* Partition key information */ + char strategy; + int partnatts; + Oid *partopfamily; + Oid *partopcintype; + Oid *partcollation; + FmgrInfo *partsupfunc; + + /* Number of partitions */ + int nparts; + + /* Partition boundary info */ + PartitionBoundInfo boundinfo; +} PartitionPruneContext; + + +extern Relids prune_append_rel_partitions(RelOptInfo *rel); +extern Bitmapset *get_matching_partitions(PartitionPruneContext *context, + List *pruning_steps); +extern List *gen_partprune_steps(RelOptInfo *rel, List *clauses, + bool *contradictory); + +#endif /* PARTPRUNE_H */ diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 51d9903d..be0a774d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2215,9 +2215,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(12 rows) +(14 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index a6b99b17..d16ab5d6 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2213,9 +2213,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(13 rows) +(15 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index ef08ec3e..0502f335 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2180,9 +2180,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(12 rows) +(14 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 9a33a70d..955a1170 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2200,9 +2200,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(13 rows) +(15 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index aabb0240..fe195e31 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -208,16 +208,14 @@ explain (costs off) select * from rlp where 1 > a; /* commuted */ (3 rows) explain (costs off) select * from rlp where a <= 1; - QUERY PLAN ---------------------------------------- + QUERY PLAN +-------------------------- Append -> Seq Scan on rlp1 Filter: 
(a <= 1) -> Seq Scan on rlp2 Filter: (a <= 1) - -> Seq Scan on rlp_default_default - Filter: (a <= 1) -(7 rows) +(5 rows) explain (costs off) select * from rlp where a = 1; QUERY PLAN @@ -577,7 +575,9 @@ explain (costs off) select * from rlp where a > 20 and a < 27; Filter: ((a > 20) AND (a < 27)) -> Seq Scan on rlp4_default Filter: ((a > 20) AND (a < 27)) -(7 rows) + -> Seq Scan on rlp_default_default + Filter: ((a > 20) AND (a < 27)) +(9 rows) explain (costs off) select * from rlp where a = 29; QUERY PLAN @@ -716,9 +716,7 @@ explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -> Seq Scan on mc3p1 Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -(7 rows) +(5 rows) explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; QUERY PLAN @@ -894,6 +892,8 @@ explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p2 Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p3 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p4 Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p5 @@ -904,7 +904,7 @@ explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p_default Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -(17 rows) +(19 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); QUERY PLAN @@ -1040,33 +1040,22 @@ explain (costs off) select * from boolpart where a is true or a is not true; Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -> Seq Scan on boolpart_t Filter: ((a IS TRUE) OR (a IS NOT TRUE)) - -> Seq Scan on boolpart_default - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -(7 rows) +(5 rows) explain (costs off) select * from boolpart where a is not true; - QUERY PLAN ------------------------------------- + QUERY PLAN +--------------------------------- Append -> Seq Scan on boolpart_f Filter: (a IS NOT TRUE) - -> Seq Scan on boolpart_t - Filter: (a IS NOT TRUE) - -> Seq Scan on boolpart_default - Filter: (a IS NOT TRUE) -(7 rows) +(3 rows) explain (costs off) select * from boolpart where a is not true and a is not false; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) - -> Seq Scan on boolpart_t - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) - -> Seq Scan on boolpart_default - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) -(7 rows) + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) explain (costs off) select * from boolpart where a is unknown; QUERY PLAN @@ -1092,4 +1081,446 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; +-- +-- some more cases +-- +-- +-- pruning for partitioned table appearing inside a sub-query +-- +-- pruning won't work for mc3p, because some keys are Params +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> 
Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 t2_2 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p3 t2_3 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p4 t2_4 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p5 t2_5 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p6 t2_6 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p7 t2_7 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_8 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) +(28 rows) + +-- pruning should work fine, because values for a prefix of keys (a, b) are +-- available +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_2 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) +(16 rows) + +-- also here, because values for all keys are provided +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + QUERY PLAN +-------------------------------------------------------------------- + Nested Loop + -> Aggregate + -> Append + -> Seq Scan on mc3p1 t2 + Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) +(12 rows) + +-- +-- pruning with clauses containing <> operator +-- +-- doesn't prune range partitions +create table rp (a int) partition by range (a); +create table rp0 partition of rp for values from (minvalue) to (1); +create table rp1 partition of rp for values from (1) to (2); +create table rp2 partition of rp for values from (2) to (maxvalue); +explain (costs off) select * from rp where a <> 1; + QUERY PLAN +-------------------------- + Append + -> Seq Scan on rp0 + Filter: (a <> 1) + -> Seq Scan on rp1 + Filter: (a <> 1) + -> Seq Scan on rp2 + Filter: (a <> 1) +(7 rows) + +explain (costs off) select * from rp where a <> 1 and a <> 2; + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rp0 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp1 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp2 + Filter: ((a <> 1) AND (a <> 2)) +(7 rows) + +-- null partition should be eliminated due to strict <> clause. 
+explain (costs off) select * from lp where a <> 'a'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_g + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'a'::bpchar) +(11 rows) + +-- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. +explain (costs off) select * from lp where a <> 'a' and a is null; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; + QUERY PLAN +------------------------------------------------------------------------------ + Append + -> Seq Scan on lp_bc + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_ef + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_g + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_null + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_default + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) +(11 rows) + +-- check that it also works for a partitioned table that's not root, +-- which in this case are partitions of rlp that are themselves +-- list-partitioned on b +explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------ + Append + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) +(5 rows) + +-- +-- different collations for different keys with same expression +-- +create table coll_pruning_multi (a text) partition by range (substr(a, 1) collate "POSIX", substr(a, 1) collate "C"); +create table coll_pruning_multi1 partition of coll_pruning_multi for values from ('a', 'a') to ('a', 'e'); +create table coll_pruning_multi2 partition of coll_pruning_multi for values from ('a', 'e') to ('a', 'z'); +create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); +-- no pruning, because no value for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi3 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") +(7 rows) + +-- pruning, with a value provided for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") +(5 rows) + +-- pruning, with values provided for both keys 
+explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_multi2 + Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) +(3 rows) + +-- +-- LIKE operators don't prune +-- +create table like_op_noprune (a text) partition by list (a); +create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); +create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); +explain (costs off) select * from like_op_noprune where a like '%BC'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on like_op_noprune1 + Filter: (a ~~ '%BC'::text) + -> Seq Scan on like_op_noprune2 + Filter: (a ~~ '%BC'::text) +(5 rows) + +-- +-- tests wherein clause value requires a cross-type comparison function +-- +create table lparted_by_int2 (a smallint) partition by list (a); +create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); +create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); +explain (costs off) select * from lparted_by_int2 where a = 100000000000000; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +create table rparted_by_int2 (a smallint) partition by range (a); +create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); +create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); +-- all partitions pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); +-- all partitions but rparted_by_int2_maxvalue pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on rparted_by_int2_maxvalue + Filter: (a > '100000000000000'::bigint) +(3 rows) + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +-- hash partitioning +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 1 | xxx + hp3 | 10 | yyy + hp1 | | xxx + hp2 | 10 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) 
select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 
'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(7 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 514f8e5c..974e62c3 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -152,4 +152,125 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain (costs off) select * from boolpart where a is not unknown; -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; +-- +-- some more cases +-- + +-- +-- pruning for partitioned table appearing inside a sub-query +-- +-- pruning won't work for mc3p, because some keys are Params +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + +-- pruning should work fine, because values for a prefix of keys (a, b) are +-- available +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; + +-- also here, because values for all keys are provided +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + +-- +-- pruning with clauses containing <> operator +-- + +-- doesn't prune range partitions +create table rp (a int) partition by range (a); +create table rp0 partition of rp for values from (minvalue) to (1); +create table rp1 partition of rp for values from (1) to (2); +create table rp2 partition of rp for values from (2) to (maxvalue); + +explain (costs off) select * from rp where a <> 1; +explain (costs off) select * from rp where a <> 1 and a <> 2; + +-- null partition should be eliminated due to strict <> clause. +explain (costs off) select * from lp where a <> 'a'; + +-- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. 
+explain (costs off) select * from lp where a <> 'a' and a is null; +explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; + +-- check that it also works for a partitioned table that's not root, +-- which in this case are partitions of rlp that are themselves +-- list-partitioned on b +explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; + +-- +-- different collations for different keys with same expression +-- +create table coll_pruning_multi (a text) partition by range (substr(a, 1) collate "POSIX", substr(a, 1) collate "C"); +create table coll_pruning_multi1 partition of coll_pruning_multi for values from ('a', 'a') to ('a', 'e'); +create table coll_pruning_multi2 partition of coll_pruning_multi for values from ('a', 'e') to ('a', 'z'); +create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); + +-- no pruning, because no value for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; + +-- pruning, with a value provided for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; + +-- pruning, with values provided for both keys +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; + +-- +-- LIKE operators don't prune +-- +create table like_op_noprune (a text) partition by list (a); +create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); +create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); +explain (costs off) select * from like_op_noprune where a like '%BC'; + +-- +-- tests wherein clause value requires a cross-type comparison function +-- +create table lparted_by_int2 (a smallint) partition by list (a); +create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); +create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); +explain (costs off) select * from lparted_by_int2 where a = 100000000000000; + +create table rparted_by_int2 (a smallint) partition by range (a); +create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); +create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); +-- all partitions pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; +create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); +-- all partitions but rparted_by_int2_maxvalue pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; + +-- hash partitioning +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); + +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + 
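The hash-partition cases below only collapse to a single partition once non-null equality clauses cover every key column. As a rough sketch of why, assuming the PartitionBoundInfoData layout and the prototypes introduced in partbounds.h and partprune.h earlier in this patch: with a value for each key, the row's hash maps directly to one entry of indexes[]. The function name hash_partition_for_values is invented; the real pruning path goes through gen_partprune_steps() and get_matching_partitions().

#include "postgres.h"
#include "partitioning/partbounds.h"
#include "partitioning/partprune.h"

/*
 * Rough sketch, not the actual pruning code: once a non-null Datum is known
 * for every hash key column, the matching partition can be resolved from
 * PartitionBoundInfoData.  indexes[] has one entry per remainder up to the
 * greatest modulus; -1 means no partition accepts that remainder.
 */
static int
hash_partition_for_values(PartitionPruneContext *context,
						  Datum *values, bool *isnull)
{
	PartitionBoundInfo boundinfo = context->boundinfo;
	int			greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
	uint64		rowhash = compute_hash_value(context->partnatts,
											 context->partsupfunc,
											 values, isnull);

	return boundinfo->indexes[rowhash % greatest_modulus];
}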
+-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; +explain (costs off) select * from hp where b = 'xxx'; +explain (costs off) select * from hp where a is null; +explain (costs off) select * from hp where b is null; +explain (costs off) select * from hp where a < 1 and b = 'xxx'; +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; +explain (costs off) select * from hp where a = 1 and b is null; +explain (costs off) select * from hp where a = 1 and b = 'xxx'; +explain (costs off) select * from hp where a is null and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'yyy'; +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + +drop table hp; From 7973f781bbd20ee24129f28116e608f4c11ea4c7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 15:55:26 +0800 Subject: [PATCH 246/578] Attempt to fix endianess issues in new hash partition test. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/expected/partition_prune.out | 185 ----------------- .../regress/expected/partition_prune_hash.out | 189 ++++++++++++++++++ .../expected/partition_prune_hash_1.out | 187 +++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_prune.sql | 37 ---- src/test/regress/sql/partition_prune_hash.sql | 41 ++++ 7 files changed, 419 insertions(+), 223 deletions(-) create mode 100644 src/test/regress/expected/partition_prune_hash.out create mode 100644 src/test/regress/expected/partition_prune_hash_1.out create mode 100644 src/test/regress/sql/partition_prune_hash.sql diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index fe195e31..3e0a196e 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1339,188 +1339,3 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; (3 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; --- hash partitioning -create table hp (a int, b text) partition by hash (a, b); -create table hp0 partition of hp for values with (modulus 4, remainder 0); -create table hp3 partition of hp for values with (modulus 4, remainder 3); -create table hp1 partition of hp for values with (modulus 4, remainder 1); -create table hp2 partition of hp for values with (modulus 4, remainder 2); -insert into hp values (null, null); -insert into hp values (1, null); -insert into hp values (1, 'xxx'); -insert into hp values (null, 'xxx'); -insert into hp values (10, 'xxx'); -insert into hp values (10, 'yyy'); -select tableoid::regclass, * from hp order by 1; - tableoid | a | b -----------+----+----- - hp0 | | - hp0 | 1 | - hp0 | 1 | xxx - hp3 | 10 | yyy - hp1 | | xxx - hp2 | 10 | xxx -(6 rows) - --- partial keys won't prune, nor would non-equality conditions -explain (costs off) select * from hp where a = 1; - QUERY PLAN -------------------------- - Append 
- -> Seq Scan on hp0 - Filter: (a = 1) - -> Seq Scan on hp1 - Filter: (a = 1) - -> Seq Scan on hp2 - Filter: (a = 1) - -> Seq Scan on hp3 - Filter: (a = 1) -(9 rows) - -explain (costs off) select * from hp where b = 'xxx'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp1 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp2 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp3 - Filter: (b = 'xxx'::text) -(9 rows) - -explain (costs off) select * from hp where a is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (a IS NULL) - -> Seq Scan on hp1 - Filter: (a IS NULL) - -> Seq Scan on hp2 - Filter: (a IS NULL) - -> Seq Scan on hp3 - Filter: (a IS NULL) -(9 rows) - -explain (costs off) select * from hp where b is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b IS NULL) - -> Seq Scan on hp1 - Filter: (b IS NULL) - -> Seq Scan on hp2 - Filter: (b IS NULL) - -> Seq Scan on hp3 - Filter: (b IS NULL) -(9 rows) - -explain (costs off) select * from hp where a < 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a < 1) AND (b = 'xxx'::text)) -(9 rows) - -explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b = 'yyy'::text)) -(9 rows) - --- pruning should work if non-null values are provided for all the keys -explain (costs off) select * from hp where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on hp0 - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a = 1) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a is null and b = 'xxx'; - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on hp1 - Filter: ((a IS NULL) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a = 10 and b = 'xxx'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp2 - Filter: ((a = 10) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a = 10 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp3 - Filter: ((a = 10) AND (b = 'yyy'::text)) -(3 rows) - -explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: (((a = 10) AND 
(b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp2 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp3 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) -(7 rows) - --- hash partitiong pruning doesn't occur with <> operator clauses -explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - QUERY PLAN ---------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) -(9 rows) - -drop table hp; diff --git a/src/test/regress/expected/partition_prune_hash.out b/src/test/regress/expected/partition_prune_hash.out new file mode 100644 index 00000000..fbba3f1f --- /dev/null +++ b/src/test/regress/expected/partition_prune_hash.out @@ -0,0 +1,189 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. +-- +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 1 | xxx + hp3 | 10 | yyy + hp1 | | xxx + hp2 | 10 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 
'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(7 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/expected/partition_prune_hash_1.out b/src/test/regress/expected/partition_prune_hash_1.out new file mode 100644 index 00000000..4a26a0e2 --- /dev/null +++ b/src/test/regress/expected/partition_prune_hash_1.out @@ -0,0 +1,187 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. 
+-- +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 10 | xxx + hp3 | | xxx + hp3 | 10 | yyy + hp2 | 1 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * 
from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(5 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index d8a925ca..905cb00a 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune hash_part +test: identity partition_join partition_prune partition_prune_hash hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f0989763..1f00bfbc 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -193,6 +193,7 @@ test: xml test: identity test: partition_join test: partition_prune +test: partition_prune_hash test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 974e62c3..ca313897 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -237,40 +237,3 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr explain (costs off) select * from rparted_by_int2 where a > 100000000000000; drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; - --- hash partitioning -create table hp (a int, b text) partition by hash (a, b); -create table hp0 partition of hp for values with (modulus 4, remainder 0); -create table hp3 partition of hp for values with (modulus 4, remainder 3); -create table hp1 partition of hp for values with (modulus 4, remainder 1); -create table hp2 partition of hp for values with (modulus 4, remainder 2); - -insert into 
hp values (null, null); -insert into hp values (1, null); -insert into hp values (1, 'xxx'); -insert into hp values (null, 'xxx'); -insert into hp values (10, 'xxx'); -insert into hp values (10, 'yyy'); -select tableoid::regclass, * from hp order by 1; - --- partial keys won't prune, nor would non-equality conditions -explain (costs off) select * from hp where a = 1; -explain (costs off) select * from hp where b = 'xxx'; -explain (costs off) select * from hp where a is null; -explain (costs off) select * from hp where b is null; -explain (costs off) select * from hp where a < 1 and b = 'xxx'; -explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - --- pruning should work if non-null values are provided for all the keys -explain (costs off) select * from hp where a is null and b is null; -explain (costs off) select * from hp where a = 1 and b is null; -explain (costs off) select * from hp where a = 1 and b = 'xxx'; -explain (costs off) select * from hp where a is null and b = 'xxx'; -explain (costs off) select * from hp where a = 10 and b = 'xxx'; -explain (costs off) select * from hp where a = 10 and b = 'yyy'; -explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - --- hash partitiong pruning doesn't occur with <> operator clauses -explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - -drop table hp; diff --git a/src/test/regress/sql/partition_prune_hash.sql b/src/test/regress/sql/partition_prune_hash.sql new file mode 100644 index 00000000..fd1783bf --- /dev/null +++ b/src/test/regress/sql/partition_prune_hash.sql @@ -0,0 +1,41 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. 
+-- + +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); + +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; +explain (costs off) select * from hp where b = 'xxx'; +explain (costs off) select * from hp where a is null; +explain (costs off) select * from hp where b is null; +explain (costs off) select * from hp where a < 1 and b = 'xxx'; +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; +explain (costs off) select * from hp where a = 1 and b is null; +explain (costs off) select * from hp where a = 1 and b = 'xxx'; +explain (costs off) select * from hp where a is null and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'yyy'; +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + +drop table hp; From 9c6403bca6d1ef8073b18996b368d63d58d3d549 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 6 Apr 2018 20:54:22 -0700 Subject: [PATCH 247/578] Blindly attempt to fix sepgsql tests broken due to 9fdb675fc5. The failure appears to solely be caused by the changed partition pruning logic. 
Author: Andres Freund Discussion: https://postgr.es/m/20180406210330.wmqw42wqgiicktli@alap3.anarazel.de Signed-off-by: JennyJennyChen --- contrib/sepgsql/expected/misc.out | 3 --- 1 file changed, 3 deletions(-) diff --git a/contrib/sepgsql/expected/misc.out b/contrib/sepgsql/expected/misc.out index 98f8005a..128f6bd0 100644 --- a/contrib/sepgsql/expected/misc.out +++ b/contrib/sepgsql/expected/misc.out @@ -32,9 +32,6 @@ LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_re (6 rows) SELECT * FROM t1p WHERE o > 50 AND p like '%64%'; -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="public.t1p" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table t1p column o" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table t1p column p" From deb429fef69b810afcfd9b22cd8c59d76acbfaa5 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 17:37:11 +0800 Subject: [PATCH 248/578] Fix ALTER TABLE .. ATTACH PARTITION ... DEFAULT .http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 153 +++++++++----------- src/test/regress/expected/alter_table.out | 16 ++ src/test/regress/expected/alter_table_1.out | 16 ++ src/test/regress/expected/alter_table_2.out | 16 ++ src/test/regress/expected/alter_table_3.out | 16 ++ src/test/regress/sql/alter_table.sql | 18 +++ 6 files changed, 147 insertions(+), 88 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 536b8661..dbdd156e 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -539,8 +539,7 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); -static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, - List *scanrel_children, +static void QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); @@ -16442,29 +16441,23 @@ PartConstraintImpliedByRelConstraint(Relation scanrel, } /* - * ValidatePartitionConstraints + * QueuePartitionConstraintValidation * - * Check whether all rows in the given table obey the given partition - * constraint; if so, it can be attached as a partition.  
We do this by - * scanning the table (or all of its leaf partitions) row by row, except when - * the existing constraints are sufficient to prove that the new partitioning - * constraint must already hold. + * Add an entry to wqueue to have the given partition constraint validated by + * Phase 3, for the given relation, and all its children. + * + * We first verify whether the given constraint is implied by pre-existing + * relation constraints; if it is, there's no need to scan the table to + * validate, so don't queue in that case. */ static void -ValidatePartitionConstraints(List **wqueue, Relation scanrel, - List *scanrel_children, +QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default) { - bool found_whole_row; - ListCell *lc; - - if (partConstraint == NIL) - return; - /* - * Based on the table's existing constraints, determine if we can skip - * scanning the table to validate the partition constraint. + * Based on the table's existing constraints, determine whether or not we + * may skip scanning the table. */ if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) { @@ -16479,69 +16472,54 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, return; } - /* Constraints proved insufficient, so we need to scan the table. */ - foreach(lc, scanrel_children) + /* + * Constraints proved insufficient. For plain relations, queue a validation + * item now; for partitioned tables, recurse to process each partition. + */ + if (scanrel->rd_rel->relkind == RELKIND_RELATION) { AlteredTableInfo *tab; - Oid part_relid = lfirst_oid(lc); - Relation part_rel; - List *my_partconstr = partConstraint; - /* Lock already taken */ - if (part_relid != RelationGetRelid(scanrel)) - part_rel = heap_open(part_relid, NoLock); - else - part_rel = scanrel; + /* Grab a work queue entry. */ + tab = ATGetQueueEntry(wqueue, scanrel); + Assert(tab->partition_constraint == NULL); + tab->partition_constraint = (Expr *) linitial(partConstraint); + tab->validate_default = validate_default; + } + else if (scanrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionDesc partdesc = RelationGetPartitionDesc(scanrel); + int i; + + for (i = 0; i < partdesc->nparts; i++) + { + Relation part_rel; + bool found_whole_row; + List *thisPartConstraint; /* - * Skip if the partition is itself a partitioned table. We can only - * ever scan RELKIND_RELATION relations. + * This is the minimum lock we need to prevent concurrent data + * additions. */ - if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - if (part_rel != scanrel) - heap_close(part_rel, NoLock); - continue; - } + part_rel = heap_open(partdesc->oids[i], ShareLock); - if (part_rel != scanrel) - { /* * Adjust the constraint for scanrel so that it matches this * partition's attribute numbers. */ - my_partconstr = map_partition_varattnos(my_partconstr, 1, - part_rel, scanrel, - &found_whole_row); + thisPartConstraint = + map_partition_varattnos(partConstraint, 1, + part_rel, scanrel, &found_whole_row); /* There can never be a whole-row reference here */ if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); + elog(ERROR, "unexpected whole-row reference found in partition constraint"); - /* Can we skip scanning this part_rel? 
*/ - if (PartConstraintImpliedByRelConstraint(part_rel, my_partconstr)) - { - if (!validate_default) - ereport(INFO, - (errmsg("partition constraint for table \"%s\" is implied by existing constraints", - RelationGetRelationName(part_rel)))); - else - ereport(INFO, - (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", - RelationGetRelationName(part_rel)))); - heap_close(part_rel, NoLock); - continue; + QueuePartitionConstraintValidation(wqueue, part_rel, + thisPartConstraint, + validate_default); + heap_close(part_rel, NoLock); /* keep lock till commit */ } } - - /* Grab a work queue entry. */ - tab = ATGetQueueEntry(wqueue, part_rel); - tab->partition_constraint = (Expr *) linitial(my_partconstr); - tab->validate_default = validate_default; - - /* keep our lock until commit */ - if (part_rel != scanrel) - heap_close(part_rel, NoLock); - } } /* @@ -16568,8 +16546,8 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) List *partBoundConstraint; /* - * We must lock the default partition, because attaching a new partition - * will change its partition constraint. + * We must lock the default partition if one exists, because attaching a + * new partition will change its partition constraint. */ defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); @@ -16634,17 +16612,18 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * * We do that by checking if rel is a member of the list of attachRel's * partitions provided the latter is partitioned at all. We want to avoid - * having to construct this list again, so we request the strongest lock - * on all partitions. We need the strongest lock, because we may decide - * to scan them if we find out that the table being attached (or its leaf - * partitions) may contain rows that violate the partition constraint. If - * the table has a constraint that would prevent such rows, which by - * definition is present in all the partitions, we need not scan the - * table, nor its partitions. But we cannot risk a deadlock by taking a + * having to construct this list again, so we request a lock on all + * partitions. We need ShareLock, preventing data changes, because we + * may decide to scan them if we find out that the table being attached (or + * its leaf partitions) may contain rows that violate the partition + * constraint. If the table has a constraint that would prevent such rows, + * which by definition is present in all the partitions, we need not scan + * the table, nor its partitions. But we cannot risk a deadlock by taking + * a weaker lock now and the stronger one only when needed. * weaker lock now and the stronger one only when needed. */ attachrel_children = find_all_inheritors(RelationGetRelid(attachrel), - AccessExclusiveLock, NULL); + ShareLock, NULL); if (list_member_oid(attachrel_children, RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_TABLE), @@ -16777,31 +16756,29 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) "unexpected whole-row reference found in partition key"); /* Validate partition constraints against the table being attached. */ - ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, - partConstraint, false); + QueuePartitionConstraintValidation(wqueue, attachrel, partConstraint, + false); } /* - * Check whether default partition has a row that would fit the partition - * being attached. 
+ * If we're attaching a partition other than the default partition and a + * default one exists, then that partition's partition constraint changes, + * so add an entry to the work queue to validate it, too. (We must not + * do this when the partition being attached is the default one; we + * already did it above!) */ - defaultPartOid = - get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); if (OidIsValid(defaultPartOid)) { Relation defaultrel; - List *defaultrel_children; List *defPartConstraint; - /* We already have taken a lock on default partition. */ + Assert(!cmd->bound->is_default); + + /* we already hold a lock on the default partition */ defaultrel = heap_open(defaultPartOid, NoLock); defPartConstraint = get_proposed_default_constraint(partBoundConstraint); - defaultrel_children = - find_all_inheritors(defaultPartOid, - AccessExclusiveLock, NULL); - ValidatePartitionConstraints(wqueue, defaultrel, - defaultrel_children, + QueuePartitionConstraintValidation(wqueue, defaultrel, defPartConstraint, true); /* keep our lock until commit. */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 455cee74..088474cf 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3729,3 +3729,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 357e16da..8e1053bc 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the 
default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index 88f9f851..19a9d000 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 345150e0..5cdf3e7a 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint 
for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index a32521f1..e1c6772c 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2498,3 +2498,21 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; + +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); + +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); + +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; + +drop table defpart_attach_test; \ No newline at end of file From 84d1ef9a84dff1b7bfcadfb042922920853ba8bd Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 12 Apr 2018 16:51:55 -0300 Subject: [PATCH 249/578] Add comment about default partition in check_new_partition_bound The intention of the test is not immediately obvious, so we need this much. --- src/backend/catalog/partition.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index f74a88f0..5c85918f 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -898,6 +898,12 @@ check_new_partition_bound(char *relname, Relation parent, if (spec->is_default) { + /* + * The default partition bound never conflicts with any other + * partition's; if that's what we're attaching, the only possible + * problem is that one already exists, so check for that and we're + * done. + */ if (boundinfo == NULL || !partition_bound_has_default(boundinfo)) return; From b58f50ddbdd49c6d0c0e89b25963a68c48dd886b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 17:46:24 +0800 Subject: [PATCH 250/578] Revert lowering of lock level for ATTACH PARTITION.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index dbdd156e..afa19507 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16498,10 +16498,9 @@ QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *thisPartConstraint; /* - * This is the minimum lock we need to prevent concurrent data - * additions. + * This is the minimum lock we need to prevent deadlocks. 
*/ - part_rel = heap_open(partdesc->oids[i], ShareLock); + part_rel = heap_open(partdesc->oids[i], AccessExclusiveLock); /* * Adjust the constraint for scanrel so that it matches this @@ -16612,18 +16611,17 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * * We do that by checking if rel is a member of the list of attachRel's * partitions provided the latter is partitioned at all. We want to avoid - * having to construct this list again, so we request a lock on all - * partitions. We need ShareLock, preventing data changes, because we - * may decide to scan them if we find out that the table being attached (or - * its leaf partitions) may contain rows that violate the partition - * constraint. If the table has a constraint that would prevent such rows, - * which by definition is present in all the partitions, we need not scan - * the table, nor its partitions. But we cannot risk a deadlock by taking - * a weaker lock now and the stronger one only when needed. + * having to construct this list again, so we request the strongest lock + * on all partitions. We need the strongest lock, because we may decide + * to scan them if we find out that the table being attached (or its leaf + * partitions) may contain rows that violate the partition constraint. If + * the table has a constraint that would prevent such rows, which by + * definition is present in all the partitions, we need not scan the + * table, nor its partitions. But we cannot risk a deadlock by taking a * weaker lock now and the stronger one only when needed. */ attachrel_children = find_all_inheritors(RelationGetRelid(attachrel), - ShareLock, NULL); + AccessExclusiveLock, NULL); if (list_member_oid(attachrel_children, RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_TABLE), From 33216c2612714f287ff0403f0ede13d48facb22d Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 19:47:39 +0800 Subject: [PATCH 251/578] Fix handling of partition bounds for boolean partitioning columns. 
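The grammar change below adds TRUE_P and FALSE_P to the partbound_datum rule, so unquoted boolean keywords are accepted as partition bound values; the rule turns them into the string constants 'true' and 'false', which are later coerced to the partition key's type. A minimal sketch of the difference, using a list-partitioned boolean column as in the new boolspart regression test (previously the unquoted keyword was rejected by the grammar and the bound had to be spelled as a quoted string):

-- parent table, as in the test added below
create table boolspart (a bool) partition by list (a);
-- this quoted spelling already worked before the change
create table boolspart_t partition of boolspart for values in ('true');
-- with TRUE_P/FALSE_P accepted in partbound_datum, the natural spelling works too
create table boolspart_f partition of boolspart for values in (false);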
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/parser/gram.y | 2 ++ src/test/regress/expected/create_table.out | 14 ++++++++++++++ src/test/regress/sql/create_table.sql | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 1cf77960..a3eb7514 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2948,6 +2948,8 @@ hash_partbound: partbound_datum: Sconst { $$ = makeStringConst($1, @1); } | NumericOnly { $$ = makeAConst($1, @1); } + | TRUE_P { $$ = makeStringConst(pstrdup("true"), @1); } + | FALSE_P { $$ = makeStringConst(pstrdup("false"), @1); } | NULL_P { $$ = makeNullAConst(@1); } ; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 7fa55adb..55e9e44d 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -899,3 +899,17 @@ Distribute By: HASH(a) Location Nodes: ALL DATANODES DROP TABLE parted_col_comment; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index b125fa50..68482d79 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -714,3 +714,10 @@ COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; SELECT obj_description('parted_col_comment'::regclass); \d+ parted_col_comment DROP TABLE parted_col_comment; + +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart +drop table boolspart; From 1ff5704b1e0393d1f53502027d65150d7f974059 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 19:59:52 +0800 Subject: [PATCH 252/578] Fix assorted partition pruning bugs. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 58 ++++++++-------- src/test/regress/expected/partition_prune.out | 66 +++++++++++++++++++ src/test/regress/sql/partition_prune.sql | 14 ++++ 3 files changed, 106 insertions(+), 32 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 959ee164..03bacd1f 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -364,8 +364,9 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) * For BoolExpr clauses, we recursively generate steps for each argument, and * return a PartitionPruneStepCombine of their results. * - * The generated steps are added to the context's steps list. Each step is - * assigned a step identifier, unique even across recursive calls. + * The return value is a list of the steps generated, which are also added to + * the context's steps list. 
Each step is assigned a step identifier, unique + * even across recursive calls. * * If we find clauses that are mutually contradictory, or a pseudoconstant * clause that contains false, we set *contradictory to true and return NIL @@ -1386,6 +1387,7 @@ match_clause_to_partition_key(RelOptInfo *rel, List *elem_exprs, *elem_clauses; ListCell *lc1; + bool contradictory; if (IsA(leftop, RelabelType)) leftop = ((RelabelType *) leftop)->arg; @@ -1404,7 +1406,7 @@ match_clause_to_partition_key(RelOptInfo *rel, * Only allow strict operators. This will guarantee nulls are * filtered. */ - if (!op_strict(saop->opno)) + if (!op_strict(saop_op)) return PARTCLAUSE_UNSUPPORTED; /* Useless if the array has any volatile functions. */ @@ -1437,6 +1439,8 @@ match_clause_to_partition_key(RelOptInfo *rel, if (strategy != BTEqualStrategyNumber) return PARTCLAUSE_UNSUPPORTED; } + else + return PARTCLAUSE_UNSUPPORTED; /* no useful negator */ } /* @@ -1477,7 +1481,7 @@ match_clause_to_partition_key(RelOptInfo *rel, elem_exprs = lappend(elem_exprs, elem_expr); } } - else + else if (IsA(rightop, ArrayExpr)) { ArrayExpr *arrexpr = castNode(ArrayExpr, rightop); @@ -1491,6 +1495,11 @@ match_clause_to_partition_key(RelOptInfo *rel, elem_exprs = arrexpr->elements; } + else + { + /* Give up on any other clause types. */ + return PARTCLAUSE_UNSUPPORTED; + } /* * Now generate a list of clauses, one for each array element, of the @@ -1509,36 +1518,21 @@ match_clause_to_partition_key(RelOptInfo *rel, } /* - * Build a combine step as if for an OR clause or add the clauses to - * the end of the list that's being processed currently. + * If we have an ANY clause and multiple elements, first turn the list + * of clauses into an OR expression. */ if (saop->useOr && list_length(elem_clauses) > 1) - { - Expr *orexpr; - bool contradictory; - - orexpr = makeBoolExpr(OR_EXPR, elem_clauses, -1); - *clause_steps = - gen_partprune_steps_internal(context, rel, list_make1(orexpr), - &contradictory); - if (contradictory) - return PARTCLAUSE_MATCH_CONTRADICT; - - Assert(list_length(*clause_steps) == 1); - return PARTCLAUSE_MATCH_STEPS; - } - else - { - bool contradictory; - - *clause_steps = - gen_partprune_steps_internal(context, rel, elem_clauses, - &contradictory); - if (contradictory) - return PARTCLAUSE_MATCH_CONTRADICT; - Assert(list_length(*clause_steps) >= 1); - return PARTCLAUSE_MATCH_STEPS; - } + elem_clauses = list_make1(makeBoolExpr(OR_EXPR, elem_clauses, -1)); + + /* Finally, generate steps */ + *clause_steps = + gen_partprune_steps_internal(context, rel, elem_clauses, + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + else if (*clause_steps == NIL) + return PARTCLAUSE_UNSUPPORTED; /* step generation failed */ + return PARTCLAUSE_MATCH_STEPS; } else if (IsA(clause, NullTest)) { diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 3e0a196e..b91cac4b 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1081,6 +1081,72 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) +-- test scalar-to-array operators +create table coercepart (a varchar) partition by list (a); +create table coercepart_ab partition of coercepart for values in ('ab'); +create table coercepart_bc partition of coercepart for values in ('bc'); +create table coercepart_cd partition of coercepart for values in ('cd'); +explain (costs off) select * from coercepart 
where a in ('ab', to_char(125, '999')); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a ~ any ('{ab}'); + QUERY PLAN +---------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a !~ all ('{ab}'); + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); + QUERY PLAN +------------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) +(7 rows) + +drop table coercepart; -- -- some more cases -- diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index ca313897..164b74ee 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -152,6 +152,20 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain (costs off) select * from boolpart where a is not unknown; +-- test scalar-to-array operators +create table coercepart (a varchar) partition by list (a); +create table coercepart_ab partition of coercepart for values in ('ab'); +create table coercepart_bc partition of coercepart for values in ('bc'); +create table coercepart_cd partition of coercepart for values in ('cd'); + +explain (costs off) select * from coercepart where a in ('ab', to_char(125, '999')); +explain (costs off) select * from coercepart where a ~ any ('{ab}'); +explain (costs off) select * from coercepart where a !~ all ('{ab}'); +explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); +explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); + +drop table coercepart; + -- -- some more cases -- From a74e161eec95f99a0f9800d40c627100783fe873 Mon Sep 17 00:00:00 
2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:02:58 +0800 Subject: [PATCH 253/578] pgstatindex, pageinspect: handle partitioned indexes --- contrib/pageinspect/expected/page.out | 6 +- contrib/pageinspect/rawpage.c | 391 +++++----- contrib/pageinspect/sql/page.sql | 5 +- contrib/pgstattuple/expected/pgstattuple.out | 3 + contrib/pgstattuple/pgstattuple.c | 779 ++++++++++--------- contrib/pgstattuple/sql/pgstattuple.sql | 2 + 6 files changed, 603 insertions(+), 583 deletions(-) diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 8e15947a..5cbe2203 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -83,10 +83,14 @@ SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); (1 row) DROP TABLE test1; --- check that using any of these functions with a partitioned table would fail +-- check that using any of these functions with a partitioned table or index +-- would fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned (a); select get_raw_page('test_partitioned', 0); -- error about partitioned table ERROR: cannot get raw page from partitioned table "test_partitioned" +select get_raw_page('test_partitioned_index', 0); -- error about partitioned index +ERROR: cannot get raw page from partitioned index "test_partitioned_index" -- a regular table which is a member of a partition set should work though create table test_part1 partition of test_partitioned for values from ( 1 ) to (100); select get_raw_page('test_part1', 0); -- get farther and error about empty table diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 9682498d..999c4b45 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * * rawpage.c - * Functions to extract a raw page as bytea and inspect it + * Functions to extract a raw page as bytea and inspect it * * Access-method specific inspection functions are in separate files. * * Copyright (c) 2007-2017, PostgreSQL Global Development Group * * IDENTIFICATION - * contrib/pageinspect/rawpage.c + * contrib/pageinspect/rawpage.c * *------------------------------------------------------------------------- */ @@ -33,7 +33,7 @@ PG_MODULE_MAGIC; static bytea *get_raw_page_internal(text *relname, ForkNumber forknum, - BlockNumber blkno); + BlockNumber blkno); /* @@ -46,23 +46,23 @@ PG_FUNCTION_INFO_V1(get_raw_page); Datum get_raw_page(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - uint32 blkno = PG_GETARG_UINT32(1); - bytea *raw_page; - - /* - * We don't normally bother to check the number of arguments to a C - * function, but here it's needed for safety because early 8.4 beta - * releases mistakenly redefined get_raw_page() as taking three arguments. - */ - if (PG_NARGS() != 2) - ereport(ERROR, - (errmsg("wrong number of arguments to get_raw_page()"), - errhint("Run the updated pageinspect.sql script."))); - - raw_page = get_raw_page_internal(relname, MAIN_FORKNUM, blkno); - - PG_RETURN_BYTEA_P(raw_page); + text *relname = PG_GETARG_TEXT_PP(0); + uint32 blkno = PG_GETARG_UINT32(1); + bytea *raw_page; + + /* + * We don't normally bother to check the number of arguments to a C + * function, but here it's needed for safety because early 8.4 beta + * releases mistakenly redefined get_raw_page() as taking three arguments. 
+ */ + if (PG_NARGS() != 2) + ereport(ERROR, + (errmsg("wrong number of arguments to get_raw_page()"), + errhint("Run the updated pageinspect.sql script."))); + + raw_page = get_raw_page_internal(relname, MAIN_FORKNUM, blkno); + + PG_RETURN_BYTEA_P(raw_page); } /* @@ -75,17 +75,17 @@ PG_FUNCTION_INFO_V1(get_raw_page_fork); Datum get_raw_page_fork(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - text *forkname = PG_GETARG_TEXT_PP(1); - uint32 blkno = PG_GETARG_UINT32(2); - bytea *raw_page; - ForkNumber forknum; + text *relname = PG_GETARG_TEXT_PP(0); + text *forkname = PG_GETARG_TEXT_PP(1); + uint32 blkno = PG_GETARG_UINT32(2); + bytea *raw_page; + ForkNumber forknum; - forknum = forkname_to_number(text_to_cstring(forkname)); + forknum = forkname_to_number(text_to_cstring(forkname)); - raw_page = get_raw_page_internal(relname, forknum, blkno); + raw_page = get_raw_page_internal(relname, forknum, blkno); - PG_RETURN_BYTEA_P(raw_page); + PG_RETURN_BYTEA_P(raw_page); } /* @@ -94,76 +94,81 @@ get_raw_page_fork(PG_FUNCTION_ARGS) static bytea * get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) { - bytea *raw_page; - RangeVar *relrv; - Relation rel; - char *raw_page_data; - Buffer buf; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw functions")))); - - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); - - /* Check that this relation has storage */ - if (rel->rd_rel->relkind == RELKIND_VIEW) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from view \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from composite type \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from foreign table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned table \"%s\"", - RelationGetRelationName(rel)))); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. 
- */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - if (blkno >= RelationGetNumberOfBlocksInFork(rel, forknum)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("block number %u is out of range for relation \"%s\"", - blkno, RelationGetRelationName(rel)))); - - /* Initialize buffer to copy to */ - raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - /* Take a verbatim copy of the page */ - - buf = ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL); - LockBuffer(buf, BUFFER_LOCK_SHARE); - - memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); - - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); - - relation_close(rel, AccessShareLock); - - return raw_page; + bytea *raw_page; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + Buffer buf; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw functions")))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + if (blkno >= RelationGetNumberOfBlocksInFork(rel, forknum)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("block number %u is out of range for relation \"%s\"", + blkno, RelationGetRelationName(rel)))); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + /* Take a verbatim copy of the page */ + + buf = ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + relation_close(rel, AccessShareLock); + + return raw_page; } @@ -182,23 +187,23 @@ get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) Page get_page_from_raw(bytea *raw_page) { - Page page; - int raw_page_size; + Page page; + int raw_page_size; - raw_page_size = VARSIZE_ANY_EXHDR(raw_page); + raw_page_size = VARSIZE_ANY_EXHDR(raw_page); - if (raw_page_size != BLCKSZ) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid page size"), - errdetail("Expected %d bytes, got %d.", - BLCKSZ, raw_page_size))); + if (raw_page_size != BLCKSZ) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid page size"), + errdetail("Expected %d bytes, got %d.", + BLCKSZ, raw_page_size))); - page = palloc(raw_page_size); + page = palloc(raw_page_size); - memcpy(page, VARDATA_ANY(raw_page), raw_page_size); + memcpy(page, VARDATA_ANY(raw_page), raw_page_size); - return page; + return page; } @@ -213,76 +218,76 @@ PG_FUNCTION_INFO_V1(page_header); Datum page_header(PG_FUNCTION_ARGS) { - bytea *raw_page = PG_GETARG_BYTEA_P(0); - int raw_page_size; - - TupleDesc tupdesc; - - Datum result; - HeapTuple tuple; - Datum values[10]; - bool nulls[10]; - - PageHeader page; - XLogRecPtr lsn; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw page functions")))); - - raw_page_size = VARSIZE(raw_page) - VARHDRSZ; - - /* - * Check that enough data was supplied, so that we don't try to access - * fields outside the supplied buffer. - */ - if (raw_page_size < SizeOfPageHeaderData) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("input page too small (%d bytes)", raw_page_size))); - - page = (PageHeader) VARDATA(raw_page); - - /* Build a tuple descriptor for our result type */ - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - /* Extract information from the page header */ - - lsn = PageGetLSN(page); - - /* pageinspect >= 1.2 uses pg_lsn instead of text for the LSN field. 
*/ - if (tupdesc->attrs[0]->atttypid == TEXTOID) - { - char lsnchar[64]; - - snprintf(lsnchar, sizeof(lsnchar), "%X/%X", - (uint32) (lsn >> 32), (uint32) lsn); - values[0] = CStringGetTextDatum(lsnchar); - } - else - values[0] = LSNGetDatum(lsn); - values[1] = UInt16GetDatum(page->pd_checksum); - values[2] = UInt16GetDatum(page->pd_flags); -#ifdef _SHARDING_ - values[3] = UInt16GetDatum(page->pd_shard); + bytea *raw_page = PG_GETARG_BYTEA_P(0); + int raw_page_size; + + TupleDesc tupdesc; + + Datum result; + HeapTuple tuple; + Datum values[10]; + bool nulls[10]; + + PageHeader page; + XLogRecPtr lsn; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + + /* + * Check that enough data was supplied, so that we don't try to access + * fields outside the supplied buffer. + */ + if (raw_page_size < SizeOfPageHeaderData) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("input page too small (%d bytes)", raw_page_size))); + + page = (PageHeader) VARDATA(raw_page); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Extract information from the page header */ + + lsn = PageGetLSN(page); + + /* pageinspect >= 1.2 uses pg_lsn instead of text for the LSN field. */ + if (tupdesc->attrs[0]->atttypid == TEXTOID) + { + char lsnchar[64]; + + snprintf(lsnchar, sizeof(lsnchar), "%X/%X", + (uint32) (lsn >> 32), (uint32) lsn); + values[0] = CStringGetTextDatum(lsnchar); + } + else + values[0] = LSNGetDatum(lsn); + values[1] = UInt16GetDatum(page->pd_checksum); + values[2] = UInt16GetDatum(page->pd_flags); +#ifdef _SHARDING_ + values[3] = UInt16GetDatum(page->pd_shard); #endif - values[4] = UInt16GetDatum(page->pd_lower); - values[5] = UInt16GetDatum(page->pd_upper); - values[6] = UInt16GetDatum(page->pd_special); - values[7] = UInt16GetDatum(PageGetPageSize(page)); - values[8] = UInt16GetDatum(PageGetPageLayoutVersion(page)); - values[9] = TransactionIdGetDatum(page->pd_prune_xid); + values[4] = UInt16GetDatum(page->pd_lower); + values[5] = UInt16GetDatum(page->pd_upper); + values[6] = UInt16GetDatum(page->pd_special); + values[7] = UInt16GetDatum(PageGetPageSize(page)); + values[8] = UInt16GetDatum(PageGetPageLayoutVersion(page)); + values[9] = TransactionIdGetDatum(page->pd_prune_xid); - /* Build and return the tuple. */ + /* Build and return the tuple. 
*/ - memset(nulls, 0, sizeof(nulls)); + memset(nulls, 0, sizeof(nulls)); - tuple = heap_form_tuple(tupdesc, values, nulls); - result = HeapTupleGetDatum(tuple); + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); - PG_RETURN_DATUM(result); + PG_RETURN_DATUM(result); } /* @@ -296,27 +301,27 @@ PG_FUNCTION_INFO_V1(page_checksum); Datum page_checksum(PG_FUNCTION_ARGS) { - bytea *raw_page = PG_GETARG_BYTEA_P(0); - uint32 blkno = PG_GETARG_INT32(1); - int raw_page_size; - PageHeader page; + bytea *raw_page = PG_GETARG_BYTEA_P(0); + uint32 blkno = PG_GETARG_INT32(1); + int raw_page_size; + PageHeader page; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw page functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); - raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; - /* - * Check that the supplied page is of the right size. - */ - if (raw_page_size != BLCKSZ) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("incorrect size of input page (%d bytes)", raw_page_size))); + /* + * Check that the supplied page is of the right size. + */ + if (raw_page_size != BLCKSZ) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("incorrect size of input page (%d bytes)", raw_page_size))); - page = (PageHeader) VARDATA(raw_page); + page = (PageHeader) VARDATA(raw_page); - PG_RETURN_INT16(pg_checksum_page((char *) page, blkno)); + PG_RETURN_INT16(pg_checksum_page((char *) page, blkno)); } diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql index 493ca9b2..518d5fba 100644 --- a/contrib/pageinspect/sql/page.sql +++ b/contrib/pageinspect/sql/page.sql @@ -33,9 +33,12 @@ SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); DROP TABLE test1; --- check that using any of these functions with a partitioned table would fail +-- check that using any of these functions with a partitioned table or index +-- would fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned (a); select get_raw_page('test_partitioned', 0); -- error about partitioned table +select get_raw_page('test_partitioned_index', 0); -- error about partitioned index -- a regular table which is a member of a partition set should work though create table test_part1 partition of test_partitioned for values from ( 1 ) to (100); diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 129b29c6..770e73ed 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -152,9 +152,12 @@ select pgstatginindex('test_hashidx'); ERROR: relation "test_hashidx" is not a GIN index -- check that using any of these functions with unsupported relations will fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned(a); -- these should all fail select pgstattuple('test_partitioned'); ERROR: "test_partitioned" (partitioned table) is not supported +select pgstattuple('test_partitioned_index'); +ERROR: "test_partitioned_index" (partitioned index) is not supported select pgstattuple_approx('test_partitioned'); ERROR: "test_partitioned" is not a table or materialized view select pg_relpages('test_partitioned'); diff --git 
a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index ae26d6fe..da993bae 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -1,7 +1,7 @@ /* * contrib/pgstattuple/pgstattuple.c * - * Copyright (c) 2001,2002 Tatsuo Ishii + * Copyright (c) 2001,2002 Tatsuo Ishii * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose, without fee, and without a @@ -53,34 +53,34 @@ PG_FUNCTION_INFO_V1(pgstattuplebyid_v1_5); */ typedef struct pgstattuple_type { - uint64 table_len; - uint64 tuple_count; - uint64 tuple_len; - uint64 dead_tuple_count; - uint64 dead_tuple_len; - uint64 free_space; /* free/reusable space in bytes */ + uint64 table_len; + uint64 tuple_count; + uint64 tuple_len; + uint64 dead_tuple_count; + uint64 dead_tuple_len; + uint64 free_space; /* free/reusable space in bytes */ } pgstattuple_type; typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber, - BufferAccessStrategy); + BufferAccessStrategy); static Datum build_pgstattuple_type(pgstattuple_type *stat, - FunctionCallInfo fcinfo); + FunctionCallInfo fcinfo); static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo); static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo); static void pgstat_btree_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_hash_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_gist_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static Datum pgstat_index(Relation rel, BlockNumber start, - pgstat_page pagefn, FunctionCallInfo fcinfo); + pgstat_page pagefn, FunctionCallInfo fcinfo); static void pgstat_index_page(pgstattuple_type *stat, Page page, - OffsetNumber minoff, OffsetNumber maxoff); + OffsetNumber minoff, OffsetNumber maxoff); /* * build_pgstattuple_type -- build a pgstattuple_type tuple @@ -88,65 +88,65 @@ static void pgstat_index_page(pgstattuple_type *stat, Page page, static Datum build_pgstattuple_type(pgstattuple_type *stat, FunctionCallInfo fcinfo) { -#define NCOLUMNS 9 -#define NCHARS 32 - - HeapTuple tuple; - char *values[NCOLUMNS]; - char values_buf[NCOLUMNS][NCHARS]; - int i; - double tuple_percent; - double dead_tuple_percent; - double free_percent; /* free/reusable space in % */ - TupleDesc tupdesc; - AttInMetadata *attinmeta; - - /* Build a tuple descriptor for our result type */ - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - /* - * Generate attribute metadata needed later to produce tuples from raw C - * strings - */ - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - if (stat->table_len == 0) - { - tuple_percent = 0.0; - dead_tuple_percent = 0.0; - free_percent = 0.0; - } - else - { - tuple_percent = 100.0 * stat->tuple_len / stat->table_len; - dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; - free_percent = 100.0 * stat->free_space / stat->table_len; - } - - /* - * Prepare a values array for constructing the tuple. This should be an - * array of C strings which will be processed later by the appropriate - * "in" functions. 
- */ - for (i = 0; i < NCOLUMNS; i++) - values[i] = values_buf[i]; - i = 0; - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->table_len); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_count); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_len); - snprintf(values[i++], NCHARS, "%.2f", tuple_percent); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_count); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_len); - snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->free_space); - snprintf(values[i++], NCHARS, "%.2f", free_percent); - - /* build a tuple */ - tuple = BuildTupleFromCStrings(attinmeta, values); - - /* make the tuple into a datum */ - return HeapTupleGetDatum(tuple); +#define NCOLUMNS 9 +#define NCHARS 32 + + HeapTuple tuple; + char *values[NCOLUMNS]; + char values_buf[NCOLUMNS][NCHARS]; + int i; + double tuple_percent; + double dead_tuple_percent; + double free_percent; /* free/reusable space in % */ + TupleDesc tupdesc; + AttInMetadata *attinmeta; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* + * Generate attribute metadata needed later to produce tuples from raw C + * strings + */ + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + if (stat->table_len == 0) + { + tuple_percent = 0.0; + dead_tuple_percent = 0.0; + free_percent = 0.0; + } + else + { + tuple_percent = 100.0 * stat->tuple_len / stat->table_len; + dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; + free_percent = 100.0 * stat->free_space / stat->table_len; + } + + /* + * Prepare a values array for constructing the tuple. This should be an + * array of C strings which will be processed later by the appropriate + * "in" functions. 
+ */ + for (i = 0; i < NCOLUMNS; i++) + values[i] = values_buf[i]; + i = 0; + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->table_len); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_count); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_len); + snprintf(values[i++], NCHARS, "%.2f", tuple_percent); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_count); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_len); + snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->free_space); + snprintf(values[i++], NCHARS, "%.2f", free_percent); + + /* build a tuple */ + tuple = BuildTupleFromCStrings(attinmeta, values); + + /* make the tuple into a datum */ + return HeapTupleGetDatum(tuple); } /* ---------- @@ -165,20 +165,20 @@ build_pgstattuple_type(pgstattuple_type *stat, FunctionCallInfo fcinfo) Datum pgstattuple(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - RangeVar *relrv; - Relation rel; + text *relname = PG_GETARG_TEXT_PP(0); + RangeVar *relrv; + Relation rel; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - /* open relation */ - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); + /* open relation */ + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* @@ -191,46 +191,46 @@ pgstattuple(PG_FUNCTION_ARGS) Datum pgstattuple_v1_5(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - RangeVar *relrv; - Relation rel; + text *relname = PG_GETARG_TEXT_PP(0); + RangeVar *relrv; + Relation rel; - /* open relation */ - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); + /* open relation */ + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* Must keep superuser() check, see above. 
*/ Datum pgstattuplebyid(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - Relation rel; + Oid relid = PG_GETARG_OID(0); + Relation rel; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - /* open relation */ - rel = relation_open(relid, AccessShareLock); + /* open relation */ + rel = relation_open(relid, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* Remove superuser() check for 1.5 version, see above */ Datum pgstattuplebyid_v1_5(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - Relation rel; + Oid relid = PG_GETARG_OID(0); + Relation rel; - /* open relation */ - rel = relation_open(relid, AccessShareLock); + /* open relation */ + rel = relation_open(relid, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* @@ -239,73 +239,76 @@ pgstattuplebyid_v1_5(PG_FUNCTION_ARGS) static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo) { - const char *err; - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - switch (rel->rd_rel->relkind) - { - case RELKIND_RELATION: - case RELKIND_MATVIEW: - case RELKIND_TOASTVALUE: - case RELKIND_SEQUENCE: - return pgstat_heap(rel, fcinfo); - case RELKIND_INDEX: - switch (rel->rd_rel->relam) - { - case BTREE_AM_OID: - return pgstat_index(rel, BTREE_METAPAGE + 1, - pgstat_btree_page, fcinfo); - case HASH_AM_OID: - return pgstat_index(rel, HASH_METAPAGE + 1, - pgstat_hash_page, fcinfo); - case GIST_AM_OID: - return pgstat_index(rel, GIST_ROOT_BLKNO + 1, - pgstat_gist_page, fcinfo); - case GIN_AM_OID: - err = "gin index"; - break; - case SPGIST_AM_OID: - err = "spgist index"; - break; - case BRIN_AM_OID: - err = "brin index"; - break; - default: - err = "unknown index"; - break; - } - break; - case RELKIND_VIEW: - err = "view"; - break; - case RELKIND_COMPOSITE_TYPE: - err = "composite type"; - break; - case RELKIND_FOREIGN_TABLE: - err = "foreign table"; - break; - case RELKIND_PARTITIONED_TABLE: - err = "partitioned table"; - break; - default: - err = "unknown"; - break; - } - - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"%s\" (%s) is not supported", - RelationGetRelationName(rel), err))); - return 0; /* should not happen */ + const char *err; + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + switch (rel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_MATVIEW: + case RELKIND_TOASTVALUE: + case RELKIND_SEQUENCE: + return pgstat_heap(rel, fcinfo); + case RELKIND_INDEX: + switch (rel->rd_rel->relam) + { + case BTREE_AM_OID: + return pgstat_index(rel, BTREE_METAPAGE + 1, + pgstat_btree_page, fcinfo); + case HASH_AM_OID: + return pgstat_index(rel, HASH_METAPAGE + 1, + pgstat_hash_page, fcinfo); + case GIST_AM_OID: + return pgstat_index(rel, GIST_ROOT_BLKNO + 1, + pgstat_gist_page, fcinfo); + case GIN_AM_OID: + err = "gin index"; + break; + case SPGIST_AM_OID: + err = "spgist index"; + break; + case BRIN_AM_OID: + err = "brin index"; + break; + default: + err = "unknown index"; + break; + } + break; + case RELKIND_VIEW: + err = "view"; + break; + case RELKIND_COMPOSITE_TYPE: + err = "composite type"; + break; + case RELKIND_FOREIGN_TABLE: + err = "foreign table"; + break; + case RELKIND_PARTITIONED_TABLE: + err = "partitioned table"; + break; + case RELKIND_PARTITIONED_INDEX: + err = "partitioned index"; + break; + default: + err = "unknown"; + break; + } + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"%s\" (%s) is not supported", + RelationGetRelationName(rel), err))); + return 0; /* should not happen */ } /* @@ -314,81 +317,81 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo) static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo) { - HeapScanDesc scan; - HeapTuple tuple; - BlockNumber nblocks; - BlockNumber block = 0; /* next block to count free space in */ - BlockNumber tupblock; - Buffer buffer; - pgstattuple_type stat = {0}; - SnapshotData SnapshotDirty; - - /* Disable syncscan because we assume we scan from block zero upwards */ - scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); - InitDirtySnapshot(SnapshotDirty); - - nblocks = scan->rs_nblocks; /* # blocks to be scanned */ - - /* scan the relation */ - while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */ - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - if (HeapTupleSatisfiesVisibility(tuple, &SnapshotDirty, scan->rs_cbuf)) - { - stat.tuple_len += tuple->t_len; - stat.tuple_count++; - } - else - { - stat.dead_tuple_len += tuple->t_len; - stat.dead_tuple_count++; - } - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - /* - * To avoid physically reading the table twice, try to do the - * free-space scan in parallel with the heap scan. However, - * heap_getnext may find no tuples on a given page, so we cannot - * simply examine the pages returned by the heap scan. 
- */ - tupblock = ItemPointerGetBlockNumber(&tuple->t_self); - - while (block <= tupblock) - { - CHECK_FOR_INTERRUPTS(); - - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, - RBM_NORMAL, scan->rs_strategy); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); - UnlockReleaseBuffer(buffer); - block++; - } - } - - while (block < nblocks) - { - CHECK_FOR_INTERRUPTS(); - - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, - RBM_NORMAL, scan->rs_strategy); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); - UnlockReleaseBuffer(buffer); - block++; - } - - heap_endscan(scan); - relation_close(rel, AccessShareLock); - - stat.table_len = (uint64) nblocks * BLCKSZ; - - return build_pgstattuple_type(&stat, fcinfo); + HeapScanDesc scan; + HeapTuple tuple; + BlockNumber nblocks; + BlockNumber block = 0; /* next block to count free space in */ + BlockNumber tupblock; + Buffer buffer; + pgstattuple_type stat = {0}; + SnapshotData SnapshotDirty; + + /* Disable syncscan because we assume we scan from block zero upwards */ + scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + InitDirtySnapshot(SnapshotDirty); + + nblocks = scan->rs_nblocks; /* # blocks to be scanned */ + + /* scan the relation */ + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + CHECK_FOR_INTERRUPTS(); + + /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */ + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + if (HeapTupleSatisfiesVisibility(tuple, &SnapshotDirty, scan->rs_cbuf)) + { + stat.tuple_len += tuple->t_len; + stat.tuple_count++; + } + else + { + stat.dead_tuple_len += tuple->t_len; + stat.dead_tuple_count++; + } + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + /* + * To avoid physically reading the table twice, try to do the + * free-space scan in parallel with the heap scan. However, + * heap_getnext may find no tuples on a given page, so we cannot + * simply examine the pages returned by the heap scan. 
+ */ + tupblock = ItemPointerGetBlockNumber(&tuple->t_self); + + while (block <= tupblock) + { + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, + RBM_NORMAL, scan->rs_strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + block++; + } + } + + while (block < nblocks) + { + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, + RBM_NORMAL, scan->rs_strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + block++; + } + + heap_endscan(scan); + relation_close(rel, AccessShareLock); + + stat.table_len = (uint64) nblocks * BLCKSZ; + + return build_pgstattuple_type(&stat, fcinfo); } /* @@ -396,43 +399,43 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) */ static void pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); - LockBuffer(buf, BT_READ); - page = BufferGetPage(buf); - - /* Page is valid, see what to do with it */ - if (PageIsNew(page)) - { - /* fully empty page */ - stat->free_space += BLCKSZ; - } - else - { - BTPageOpaque opaque; - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD)) - { - /* recyclable page */ - stat->free_space += BLCKSZ; - } - else if (P_ISLEAF(opaque)) - { - pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque), - PageGetMaxOffsetNumber(page)); - } - else - { - /* root or node */ - } - } - - _bt_relbuf(rel, buf); + Buffer buf; + Page page; + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buf, BT_READ); + page = BufferGetPage(buf); + + /* Page is valid, see what to do with it */ + if (PageIsNew(page)) + { + /* fully empty page */ + stat->free_space += BLCKSZ; + } + else + { + BTPageOpaque opaque; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD)) + { + /* recyclable page */ + stat->free_space += BLCKSZ; + } + else if (P_ISLEAF(opaque)) + { + pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque), + PageGetMaxOffsetNumber(page)); + } + else + { + /* root or node */ + } + } + + _bt_relbuf(rel, buf); } /* @@ -440,41 +443,41 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static void pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); - page = BufferGetPage(buf); - - if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) - { - HashPageOpaque opaque; - - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - switch (opaque->hasho_flag & LH_PAGE_TYPE) - { - case LH_UNUSED_PAGE: - stat->free_space += BLCKSZ; - break; - case LH_BUCKET_PAGE: - case LH_OVERFLOW_PAGE: - pgstat_index_page(stat, page, FirstOffsetNumber, - PageGetMaxOffsetNumber(page)); - break; - case LH_BITMAP_PAGE: - case LH_META_PAGE: - default: - break; - } - } - else - { - /* maybe corrupted */ - } - - _hash_relbuf(rel, buf); + Buffer buf; + Page page; + + buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); + page = 
BufferGetPage(buf); + + if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) + { + HashPageOpaque opaque; + + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + switch (opaque->hasho_flag & LH_PAGE_TYPE) + { + case LH_UNUSED_PAGE: + stat->free_space += BLCKSZ; + break; + case LH_BUCKET_PAGE: + case LH_OVERFLOW_PAGE: + pgstat_index_page(stat, page, FirstOffsetNumber, + PageGetMaxOffsetNumber(page)); + break; + case LH_BITMAP_PAGE: + case LH_META_PAGE: + default: + break; + } + } + else + { + /* maybe corrupted */ + } + + _hash_relbuf(rel, buf); } /* @@ -482,27 +485,27 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static void pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); - LockBuffer(buf, GIST_SHARE); - gistcheckpage(rel, buf); - page = BufferGetPage(buf); - - if (GistPageIsLeaf(page)) - { - pgstat_index_page(stat, page, FirstOffsetNumber, - PageGetMaxOffsetNumber(page)); - } - else - { - /* root or node */ - } - - UnlockReleaseBuffer(buf); + Buffer buf; + Page page; + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buf, GIST_SHARE); + gistcheckpage(rel, buf); + page = BufferGetPage(buf); + + if (GistPageIsLeaf(page)) + { + pgstat_index_page(stat, page, FirstOffsetNumber, + PageGetMaxOffsetNumber(page)); + } + else + { + /* root or node */ + } + + UnlockReleaseBuffer(buf); } /* @@ -510,43 +513,43 @@ pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static Datum pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, - FunctionCallInfo fcinfo) + FunctionCallInfo fcinfo) { - BlockNumber nblocks; - BlockNumber blkno; - BufferAccessStrategy bstrategy; - pgstattuple_type stat = {0}; + BlockNumber nblocks; + BlockNumber blkno; + BufferAccessStrategy bstrategy; + pgstattuple_type stat = {0}; - /* prepare access strategy for this index */ - bstrategy = GetAccessStrategy(BAS_BULKREAD); + /* prepare access strategy for this index */ + bstrategy = GetAccessStrategy(BAS_BULKREAD); - blkno = start; - for (;;) - { - /* Get the current relation length */ - LockRelationForExtension(rel, ExclusiveLock); - nblocks = RelationGetNumberOfBlocks(rel); - UnlockRelationForExtension(rel, ExclusiveLock); + blkno = start; + for (;;) + { + /* Get the current relation length */ + LockRelationForExtension(rel, ExclusiveLock); + nblocks = RelationGetNumberOfBlocks(rel); + UnlockRelationForExtension(rel, ExclusiveLock); - /* Quit if we've scanned the whole relation */ - if (blkno >= nblocks) - { - stat.table_len = (uint64) nblocks * BLCKSZ; + /* Quit if we've scanned the whole relation */ + if (blkno >= nblocks) + { + stat.table_len = (uint64) nblocks * BLCKSZ; - break; - } + break; + } - for (; blkno < nblocks; blkno++) - { - CHECK_FOR_INTERRUPTS(); + for (; blkno < nblocks; blkno++) + { + CHECK_FOR_INTERRUPTS(); - pagefn(&stat, rel, blkno, bstrategy); - } - } + pagefn(&stat, rel, blkno, bstrategy); + } + } - relation_close(rel, AccessShareLock); + relation_close(rel, AccessShareLock); - return build_pgstattuple_type(&stat, fcinfo); + return build_pgstattuple_type(&stat, fcinfo); } /* @@ -554,25 +557,25 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, */ static void pgstat_index_page(pgstattuple_type *stat, Page page, - OffsetNumber minoff, OffsetNumber maxoff) + 
OffsetNumber minoff, OffsetNumber maxoff) { - OffsetNumber i; - - stat->free_space += PageGetFreeSpace(page); - - for (i = minoff; i <= maxoff; i = OffsetNumberNext(i)) - { - ItemId itemid = PageGetItemId(page, i); - - if (ItemIdIsDead(itemid)) - { - stat->dead_tuple_count++; - stat->dead_tuple_len += ItemIdGetLength(itemid); - } - else - { - stat->tuple_count++; - stat->tuple_len += ItemIdGetLength(itemid); - } - } + OffsetNumber i; + + stat->free_space += PageGetFreeSpace(page); + + for (i = minoff; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId itemid = PageGetItemId(page, i); + + if (ItemIdIsDead(itemid)) + { + stat->dead_tuple_count++; + stat->dead_tuple_len += ItemIdGetLength(itemid); + } + else + { + stat->tuple_count++; + stat->tuple_len += ItemIdGetLength(itemid); + } + } } diff --git a/contrib/pgstattuple/sql/pgstattuple.sql b/contrib/pgstattuple/sql/pgstattuple.sql index 8eb5fd2c..8b7beb0c 100644 --- a/contrib/pgstattuple/sql/pgstattuple.sql +++ b/contrib/pgstattuple/sql/pgstattuple.sql @@ -64,8 +64,10 @@ select pgstatginindex('test_hashidx'); -- check that using any of these functions with unsupported relations will fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned(a); -- these should all fail select pgstattuple('test_partitioned'); +select pgstattuple('test_partitioned_index'); select pgstattuple_approx('test_partitioned'); select pg_relpages('test_partitioned'); select pgstatindex('test_partitioned'); From 7a29480302c53e7c94fd85f0ec06656f26fb2e45 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:04:50 +0800 Subject: [PATCH 254/578] psql: have \d show FKs on partitioned tables --- src/bin/psql/describe.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index f198c238..1c671aca 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2240,8 +2240,13 @@ describeOneTableDetails(const char *schemaname, PQclear(result); } - /* print foreign-key constraints (there are none if no triggers) */ - if (tableinfo.hastriggers) + /* + * Print foreign-key constraints (there are none if no triggers, + * except if the table is partitioned, in which case the triggers + * appear in the partitions) + */ + if (tableinfo.hastriggers || + tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { printfPQExpBuffer(&buf, "SELECT conname,\n" From c44038c0a8e33fdd5c282cd4f1fe80ff17647186 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:41:58 +0800 Subject: [PATCH 255/578] Don't needlessly check the partition contraint twice.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 31 ++++++++++------------ src/backend/executor/execMain.c | 31 ++++++++++++---------- src/backend/executor/execPartition.c | 5 ++-- src/backend/executor/execReplication.c | 8 ++++-- src/backend/executor/nodeModifyTable.c | 36 +++++++++++--------------- src/include/executor/executor.h | 5 ++-- 6 files changed, 55 insertions(+), 61 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e3e67988..5b7eb4b9 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3548,26 +3548,21 @@ CopyFrom(CopyState cstate) } else { + /* Check the constraints of the tuple */ + if (resultRelInfo->ri_RelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + /* - * We always check the partition constraint, 
including when - * the tuple got here via tuple-routing. However we don't - * need to in the latter case if no BR trigger is defined on - * the partition. Note that a BR trigger might modify the - * tuple such that the partition constraint is no longer - * satisfied, so we need to check in that case. + * Also check the tuple against the partition constraint, if + * there is one; except that if we got here via tuple-routing, + * we don't need to if there's no BR trigger defined on the + * partition. */ - bool check_partition_constr = - (resultRelInfo->ri_PartitionCheck != NIL); - - if (saved_resultRelInfo != NULL && - !(resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_insert_before_row)) - check_partition_constr = false; - - /* Check the constraints of the tuple */ - if (resultRelInfo->ri_RelationDesc->rd_att->constr || - check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate, true); + if (resultRelInfo->ri_PartitionCheck && + (saved_resultRelInfo == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 63de1a27..5a082133 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2209,14 +2209,16 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. * - * Exported in executor.h for outside use. - * Returns true if it meets the partition constraint, else returns false. + * Returns true if it meets the partition constraint. If the constraint + * fails and we're asked to emit to error, do so and don't return; otherwise + * return false. */ bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, - EState *estate) + EState *estate, bool emitError) { ExprContext *econtext; + bool success; /* * If first time through, build expression state tree for the partition @@ -2243,7 +2245,13 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. */ - return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + success = ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + + /* if asked to emit error, don't actually return on failure */ + if (!success && emitError) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + return success; } /* @@ -2303,17 +2311,17 @@ ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, and if - * requested, checks the partition constraint. + * This checks the traditional NOT NULL and check constraints. + * + * The partition constraint is *NOT* checked. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. - * 'resultRelInfo' is the original result relation, before tuple routing. + * 'resultRelInfo' is the final result relation, after tuple routing. 
*/ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate, - bool check_partition_constraint) + TupleTableSlot *slot, EState *estate) { Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); @@ -2427,13 +2435,8 @@ ExecConstraints(ResultRelInfo *resultRelInfo, errtableconstraint(orig_rel, failed))); } } - - if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate)) - ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } - /* * ExecWithCheckOptions -- check that tuple satisfies any WITH CHECK OPTIONs * of the specified kind. diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 50bc3754..1835d52a 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -206,9 +206,8 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. */ - if (resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate)) - ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); /* start with the root partitioned table */ parent = pd[0]; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index c0b6f4a0..1cf551a0 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -404,7 +404,9 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, true); + ExecConstraints(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -491,7 +493,9 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, true); + ExecConstraints(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a8cb0df0..34a53370 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -513,16 +513,6 @@ ExecInsert(ModifyTableState *mtstate, { WCOKind wco_kind; - /* - * We always check the partition constraint, including when the tuple - * got here via tuple-routing. However we don't need to in the latter - * case if no BR trigger is defined on the partition. Note that a BR - * trigger might modify the tuple such that the partition constraint - * is no longer satisfied, so we need to check in that case. - */ - bool check_partition_constr = - (resultRelInfo->ri_PartitionCheck != NIL); - /* * Constraints might reference the tableoid column, so initialize * t_tableOid before evaluating them. @@ -549,17 +539,21 @@ ExecInsert(ModifyTableState *mtstate, ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); /* - * No need though if the tuple has been routed, and a BR trigger - * doesn't exist. + * Check the constraints of the tuple. 
*/ - if (resultRelInfo->ri_PartitionRoot != NULL && - !(resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_insert_before_row)) - check_partition_constr = false; + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); - /* Check the constraints of the tuple */ - if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate, true); + /* + * Also check the tuple against the partition constraint, if there is + * one; except that if we got here via tuple-routing, we don't need to + * if there's no BR trigger defined on the partition. + */ + if (resultRelInfo->ri_PartitionCheck && + (resultRelInfo->ri_PartitionRoot == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -1354,7 +1348,7 @@ lreplace:; */ partition_constraint_failed = resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate); + !ExecPartitionCheck(resultRelInfo, slot, estate, false); if (!partition_constraint_failed && resultRelInfo->ri_WithCheckOptions != NIL) @@ -1473,7 +1467,7 @@ lreplace:; * checks. */ if (resultRelationDesc->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, false); + ExecConstraints(resultRelInfo, slot, estate); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index a143cd77..72f62666 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -205,10 +205,9 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate, - bool check_partition_constraint); -extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate, bool emitError); extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, From 19fcfde92868cd1a246fd418e08f0d5bbfed2c7e Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 21:08:48 +0800 Subject: [PATCH 256/578] Fix FK checks of TRUNCATE involving partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/heap.c | 7 ++- src/test/regress/expected/truncate.out | 75 ++++++++++++++++++++++++++ src/test/regress/sql/truncate.sql | 47 ++++++++++++++++ 3 files changed, 127 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 2f135c95..57e486f1 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -4057,13 +4057,16 @@ heap_truncate_check_FKs(List *relations, bool tempTables) * Build a list of OIDs of the interesting relations. * * If a relation has no triggers, then it can neither have FKs nor be - * referenced by a FK from another table, so we can ignore it. + * referenced by a FK from another table, so we can ignore it. For + * partitioned tables, FKs have no triggers, so we must include them + * anyway. 
*/ foreach(cell, relations) { Relation rel = lfirst(cell); - if (rel->rd_rel->relhastriggers) + if (rel->rd_rel->relhastriggers || + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) oids = lappend_oid(oids, RelationGetRelid(rel)); } diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out index d47b1080..168fc0cc 100644 --- a/src/test/regress/expected/truncate.out +++ b/src/test/regress/expected/truncate.out @@ -481,3 +481,78 @@ ERROR: cannot truncate only a partitioned table HINT: Do not specify the ONLY keyword, or use truncate only on the partitions directly. TRUNCATE truncparted; DROP TABLE truncparted; +-- foreign key on partitioned table: partition key is referencing column. +-- Make sure truncate did execute on all tables +CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ + BEGIN + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); + END +$$; +CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) + RETURNS SETOF record LANGUAGE plpgsql AS $$ + BEGIN + RETURN QUERY SELECT + pk.tableoid::regclass, pk.a, fk.tableoid::regclass, fk.a + FROM truncprim pk FULL JOIN truncpart fk USING (a) + ORDER BY 2, 4; + END +$$; +CREATE TABLE truncprim (a int PRIMARY KEY); +CREATE TABLE truncpart (a int REFERENCES truncprim) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); +CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); +CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; +TRUNCATE TABLE truncprim; -- should fail +ERROR: cannot truncate a table referenced in a foreign key constraint +DETAIL: Table "truncpart" references "truncprim". +HINT: Truncate table "truncpart" at the same time, or use TRUNCATE ... CASCADE. +select tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate everything +TRUNCATE TABLE truncprim, truncpart; +select * from tp_chk_data(); + pktb | pkval | fktb | fkval +------+-------+------+------- +(0 rows) + +select tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate everything +SET client_min_messages TO WARNING; -- suppress cascading notices +TRUNCATE TABLE truncprim CASCADE; +RESET client_min_messages; +SELECT * FROM tp_chk_data(); + pktb | pkval | fktb | fkval +------+-------+------+------- +(0 rows) + +SELECT tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate all partitions +TRUNCATE TABLE truncpart; +SELECT * FROM tp_chk_data(); + pktb | pkval | fktb | fkval +-----------+-------+------+------- + truncprim | 1 | | + truncprim | 100 | | + truncprim | 150 | | +(3 rows) + +DROP TABLE truncprim, truncpart; +DROP FUNCTION tp_ins_data(), tp_chk_data(); diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql index 7d25bc0b..afde2f66 100644 --- a/src/test/regress/sql/truncate.sql +++ b/src/test/regress/sql/truncate.sql @@ -244,3 +244,50 @@ INSERT INTO truncparted VALUES (1, 'a'); TRUNCATE ONLY truncparted; TRUNCATE truncparted; DROP TABLE truncparted; + +-- foreign key on partitioned table: partition key is referencing column. 
+-- Make sure truncate did execute on all tables +CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ + BEGIN + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); + END +$$; +CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) + RETURNS SETOF record LANGUAGE plpgsql AS $$ + BEGIN + RETURN QUERY SELECT + pk.tableoid::regclass, pk.a, fk.tableoid::regclass, fk.a + FROM truncprim pk FULL JOIN truncpart fk USING (a) + ORDER BY 2, 4; + END +$$; +CREATE TABLE truncprim (a int PRIMARY KEY); +CREATE TABLE truncpart (a int REFERENCES truncprim) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); +CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); +CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; + +TRUNCATE TABLE truncprim; -- should fail + +select tp_ins_data(); +-- should truncate everything +TRUNCATE TABLE truncprim, truncpart; +select * from tp_chk_data(); + +select tp_ins_data(); +-- should truncate everything +SET client_min_messages TO WARNING; -- suppress cascading notices +TRUNCATE TABLE truncprim CASCADE; +RESET client_min_messages; +SELECT * FROM tp_chk_data(); + +SELECT tp_ins_data(); +-- should truncate all partitions +TRUNCATE TABLE truncpart; +SELECT * FROM tp_chk_data(); +DROP TABLE truncprim, truncpart; +DROP FUNCTION tp_ins_data(), tp_chk_data(); From 0c76ec2f0e70cf39b142c418b1577141e5deebb8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 21:34:41 +0800 Subject: [PATCH 257/578] Improve performance of tuple conversion map generation --- src/backend/access/common/tupconvert.c | 53 ++++++++++++++++++-------- src/backend/optimizer/prep/prepunion.c | 41 ++++++++++---------- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index 0092ce37..6812689b 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -290,33 +290,55 @@ convert_tuples_by_name_map(TupleDesc indesc, const char *msg) {// #lizard forgives AttrNumber *attrMap; - int n; + int outnatts; + int innatts; int i; + int nextindesc = -1; - n = outdesc->natts; - attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber)); - for (i = 0; i < n; i++) + outnatts = outdesc->natts; + innatts = indesc->natts; + + attrMap = (AttrNumber *) palloc0(outnatts * sizeof(AttrNumber)); + for (i = 0; i < outnatts; i++) { - Form_pg_attribute att = outdesc->attrs[i]; + Form_pg_attribute outatt = TupleDescAttr(outdesc, i); char *attname; Oid atttypid; int32 atttypmod; int j; - if (att->attisdropped) + if (outatt->attisdropped) continue; /* attrMap[i] is already 0 */ - attname = NameStr(att->attname); - atttypid = att->atttypid; - atttypmod = att->atttypmod; - for (j = 0; j < indesc->natts; j++) + attname = NameStr(outatt->attname); + atttypid = outatt->atttypid; + atttypmod = outatt->atttypmod; + + /* + * Now search for an attribute with the same name in the indesc. It + * seems likely that a partitioned table will have the attributes in + * the same order as the partition, so the search below is optimized + * for that case. It is possible that columns are dropped in one of + * the relations, but not the other, so we use the 'nextindesc' + * counter to track the starting point of the search. 
If the inner + * loop encounters dropped columns then it will have to skip over + * them, but it should leave 'nextindesc' at the correct position for + * the next outer loop. + */ + for (j = 0; j < innatts; j++) { - att = indesc->attrs[j]; - if (att->attisdropped) + Form_pg_attribute inatt; + + nextindesc++; + if (nextindesc >= innatts) + nextindesc = 0; + + inatt = TupleDescAttr(indesc, nextindesc); + if (inatt->attisdropped) continue; - if (strcmp(attname, NameStr(att->attname)) == 0) + if (strcmp(attname, NameStr(inatt->attname)) == 0) { /* Found it, check type */ - if (atttypid != att->atttypid || atttypmod != att->atttypmod) + if (atttypid != inatt->atttypid || atttypmod != inatt->atttypmod) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg_internal("%s", _(msg)), @@ -324,7 +346,7 @@ convert_tuples_by_name_map(TupleDesc indesc, attname, format_type_be(outdesc->tdtypeid), format_type_be(indesc->tdtypeid)))); - attrMap[i] = (AttrNumber) (j + 1); + attrMap[i] = inatt->attnum; break; } } @@ -337,7 +359,6 @@ convert_tuples_by_name_map(TupleDesc indesc, format_type_be(outdesc->tdtypeid), format_type_be(indesc->tdtypeid)))); } - return attrMap; } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index d2e6c3c6..a9c117f1 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -52,6 +52,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/syscache.h" typedef struct @@ -1848,9 +1849,11 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, List *vars = NIL; TupleDesc old_tupdesc = RelationGetDescr(oldrelation); TupleDesc new_tupdesc = RelationGetDescr(newrelation); + Oid new_relid = RelationGetRelid(newrelation); int oldnatts = old_tupdesc->natts; int newnatts = new_tupdesc->natts; int old_attno; + int new_attno = 0; for (old_attno = 0; old_attno < oldnatts; old_attno++) { @@ -1859,7 +1862,6 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, Oid atttypid; int32 atttypmod; Oid attcollation; - int new_attno; att = old_tupdesc->attrs[old_attno]; if (att->attisdropped) @@ -1892,29 +1894,25 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, * Otherwise we have to search for the matching column by name. * There's no guarantee it'll have the same column position, because * of cases like ALTER TABLE ADD COLUMN and multiple inheritance. - * However, in simple cases it will be the same column number, so try - * that before we go groveling through all the columns. - * - * Note: the test for (att = ...) != NULL cannot fail, it's just a - * notational device to include the assignment into the if-clause. - */ - if (old_attno < newnatts && - (att = new_tupdesc->attrs[old_attno]) != NULL && - !att->attisdropped && att->attinhcount != 0 && - strcmp(attname, NameStr(att->attname)) == 0) - new_attno = old_attno; - else - { - for (new_attno = 0; new_attno < newnatts; new_attno++) + * However, in simple cases, the relative order of columns is mostly + * the same in both relations, so try the column of newrelation that + * follows immediately after the one that we just found, and if that + * fails, let syscache handle it. 
+ */ + if (new_attno >= newnatts || + (att = TupleDescAttr(new_tupdesc, new_attno))->attisdropped || + strcmp(attname, NameStr(att->attname)) != 0) { - att = new_tupdesc->attrs[new_attno]; - if (!att->attisdropped && att->attinhcount != 0 && - strcmp(attname, NameStr(att->attname)) == 0) - break; - } - if (new_attno >= newnatts) + HeapTuple newtup; + + newtup = SearchSysCacheAttName(new_relid, attname); + if (!newtup) elog(ERROR, "could not find inherited attribute \"%s\" of relation \"%s\"", attname, RelationGetRelationName(newrelation)); + new_attno = ((Form_pg_attribute) GETSTRUCT(newtup))->attnum - 1; + ReleaseSysCache(newtup); + + att = TupleDescAttr(new_tupdesc, new_attno); } /* Found it, check type and collation match */ @@ -1931,6 +1929,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, atttypmod, attcollation, 0)); + new_attno++; } *translated_vars = vars; From dbd5825b1fce442c2535ebb5fa2eded27efdef79 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 16 Jul 2018 18:38:09 -0400 Subject: [PATCH 258/578] Fix partition pruning with IS [NOT] NULL clauses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code was unable to prune partitions that could not possibly contain NULL values, when the query specified less than all columns in a multicolumn partition key. Reorder the if-tests so that it is, and add more commentary and regression tests. Reported-by: Ashutosh Bapat Co-authored-by: Dilip Kumar Co-authored-by: Amit Langote Co-authored-by: Álvaro Herrera Reviewed-by: Ashutosh Bapat Reviewed-by: amul sul Discussion: https://postgr.es/m/CAFjFpRc7qjLUfXLVBBC_HAnx644sjTYM=qVoT3TJ840HPbsTXw@mail.gmail.com --- src/backend/partitioning/partprune.c | 78 ++++++++++--------- src/test/regress/expected/partition_prune.out | 41 ++++++++++ src/test/regress/sql/partition_prune.sql | 7 ++ 3 files changed, 90 insertions(+), 36 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 03bacd1f..242267f2 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -642,54 +642,60 @@ gen_partprune_steps_internal(GeneratePruningStepsContext *context, } } - /* - * If generate_opsteps is set to false it means no OpExprs were directly - * present in the input list. + /*----------- + * Now generate some (more) pruning steps. We have three strategies: + * + * 1) Generate pruning steps based on IS NULL clauses: + * a) For list partitioning, null partition keys can only be found in + * the designated null-accepting partition, so if there are IS NULL + * clauses containing partition keys we should generate a pruning + * step that gets rid of all partitions but that one. We can + * disregard any OpExpr we may have found. + * b) For range partitioning, only the default partition can contain + * NULL values, so the same rationale applies. + * c) For hash partitioning, we only apply this strategy if we have + * IS NULL clauses for all the keys. Strategy 2 below will take + * care of the case where some keys have OpExprs and others have + * IS NULL clauses. + * + * 2) If not, generate steps based on OpExprs we have (if any). + * + * 3) If this doesn't work either, we may be able to generate steps to + * prune just the null-accepting partition (if one exists), if we have + * IS NOT NULL clauses for all partition keys. 
*/ - if (!generate_opsteps) + if (!bms_is_empty(nullkeys) && + (part_scheme->strategy == PARTITION_STRATEGY_LIST || + part_scheme->strategy == PARTITION_STRATEGY_RANGE || + (part_scheme->strategy == PARTITION_STRATEGY_HASH && + bms_num_members(nullkeys) == part_scheme->partnatts))) { - /* - * Generate one prune step for the information derived from IS NULL, - * if any. To prune hash partitions, we must have found IS NULL - * clauses for all partition keys. - */ - if (!bms_is_empty(nullkeys) && - (part_scheme->strategy != PARTITION_STRATEGY_HASH || - bms_num_members(nullkeys) == part_scheme->partnatts)) - { - PartitionPruneStep *step; - - step = gen_prune_step_op(context, InvalidStrategy, - false, NIL, NIL, nullkeys); - result = lappend(result, step); - } - - /* - * Note that for IS NOT NULL clauses, simply having step suffices; - * there is no need to propagate the exact details of which keys are - * required to be NOT NULL. Hash partitioning expects to see actual - * values to perform any pruning. - */ - if (!bms_is_empty(notnullkeys) && - part_scheme->strategy != PARTITION_STRATEGY_HASH) - { - PartitionPruneStep *step; + PartitionPruneStep *step; - step = gen_prune_step_op(context, InvalidStrategy, - false, NIL, NIL, NULL); - result = lappend(result, step); - } + /* Strategy 1 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, nullkeys); + result = lappend(result, step); } - else + else if (generate_opsteps) { PartitionPruneStep *step; - /* Generate pruning steps from OpExpr clauses in keyclauses. */ + /* Strategy 2 */ step = gen_prune_steps_from_opexps(part_scheme, context, keyclauses, nullkeys); if (step != NULL) result = lappend(result, step); } + else if (bms_num_members(notnullkeys) == part_scheme->partnatts) + { + PartitionPruneStep *step; + + /* Strategy 3 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, NULL); + result = lappend(result, step); + } /* * Finally, results from all entries appearing in result should be diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index b91cac4b..5db871b4 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -993,6 +993,47 @@ explain (costs off) select * from mc2p where a = 1 and b > 1; Filter: ((b > 1) AND (a = 1)) (3 rows) +-- all partitions but the default one should be pruned +explain (costs off) select * from mc2p where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from mc2p where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from mc2p where a is null and b = 1; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b = 1)) +(3 rows) + +explain (costs off) select * from mc2p where a is null; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from mc2p where b is null; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: (b IS NULL) +(3 rows) + -- boolean partitioning create table boolpart (a bool) partition by list (a); create table boolpart_default partition of 
boolpart default; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 164b74ee..0a812c9c 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -137,6 +137,13 @@ explain (costs off) select * from mc2p where a = 2 and b < 1; explain (costs off) select * from mc2p where a > 1; explain (costs off) select * from mc2p where a = 1 and b > 1; +-- all partitions but the default one should be pruned +explain (costs off) select * from mc2p where a = 1 and b is null; +explain (costs off) select * from mc2p where a is null and b is null; +explain (costs off) select * from mc2p where a is null and b = 1; +explain (costs off) select * from mc2p where a is null; +explain (costs off) select * from mc2p where b is null; + -- boolean partitioning create table boolpart (a bool) partition by list (a); create table boolpart_default partition of boolpart default; From 0b2016b79f6bfb0d23d7d1a01fd3ddee25c1b881 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 19 Jul 2018 09:01:57 +0900 Subject: [PATCH 259/578] Fix re-parameterize of MergeAppendPath Instead of MergeAppendPath, MergeAppend nodes were considered. This code is not covered by any tests now, which should be addressed at some point. This is an oversight from f49842d, which introduced partition-wise joins in v11, so back-patch down to that. Author: Michael Paquier Reviewed-by: Ashutosh Bapat Discussion: https://postgr.es/m/20180718062202.GC8565@paquier.xyz --- src/backend/optimizer/util/pathnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 0a6735d1..4d2a1f32 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7153,7 +7153,7 @@ do { \ } break; - case T_MergeAppend: + case T_MergeAppendPath: { MergeAppendPath *mapath; From 4e4a47bdd8fb342f1a50575648b00c3c2facc751 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 30 Jul 2018 17:18:42 -0400 Subject: [PATCH 260/578] Change bms_add_range to be a no-op for empty ranges In commit 84940644de93, bms_add_range was added with an API to fail with an error if an empty range was specified. This seems arbitrary and unhelpful, so turn that case into a no-op instead. Callers that require further verification on the arguments or result can apply them by themselves. This fixes the bug that partition pruning throws an API error for a case involving the default partition of a default partition, as in the included test case. 
Reported-by: Rajkumar Raghuwanshi Diagnosed-by: Tom Lane Discussion: https://postgr.es/m/16590.1532622503@sss.pgh.pa.us --- src/backend/nodes/bitmapset.c | 7 +++++-- src/test/regress/expected/partition_prune.out | 15 +++++++++++++++ src/test/regress/sql/partition_prune.sql | 7 +++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 8ec465d2..f4b56e9f 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -960,6 +960,10 @@ bms_add_range(Bitmapset *a, int lower, int upper) ushiftbits, wordnum; + /* do nothing if nothing is called for, without further checking */ + if (upper < lower) + return a; + if (lower < 0 || upper < 0) elog(ERROR, "negative bitmapset member not allowed"); if (lower > upper) @@ -971,13 +975,12 @@ bms_add_range(Bitmapset *a, int lower, int upper) a = (Bitmapset *) palloc0(BITMAPSET_SIZE(uwordnum + 1)); a->nwords = uwordnum + 1; } - - /* ensure we have enough words to store the upper bit */ else if (uwordnum >= a->nwords) { int oldnwords = a->nwords; int i; + /* ensure we have enough words to store the upper bit */ a = (Bitmapset *) repalloc(a, BITMAPSET_SIZE(uwordnum + 1)); a->nwords = uwordnum + 1; /* zero out the enlarged portion */ diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 5db871b4..94bceb8d 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1188,6 +1188,21 @@ explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); (7 rows) drop table coercepart; +CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF part FOR VALUES IN (-2,-1,0,1,2); +CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); +CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; +INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); +EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b + -> Append + -> Seq Scan on part_p2_p1 + Filter: (a IS NULL) +(5 rows) + -- -- some more cases -- diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 0a812c9c..4862cdfd 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -173,6 +173,13 @@ explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); drop table coercepart; +CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF part FOR VALUES IN (-2,-1,0,1,2); +CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); +CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; +INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); +EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; + -- -- some more cases -- From 8054542bdeca32046c9dbb62f5612f77994224b9 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:30:25 +0800 Subject: [PATCH 261/578] Error position support for partition specifications --- src/backend/commands/tablecmds.c | 16 +++++++++++----- src/test/regress/expected/create_table.out | 6 ++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index afa19507..d585d5e9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -533,7 +533,7 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, void *arg); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); -static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, +static void ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); @@ -1124,6 +1124,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, */ if (stmt->partspec) { + ParseState *pstate; char strategy; int partnatts; AttrNumber partattrs[PARTITION_MAX_KEYS]; @@ -1131,6 +1132,9 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Oid partcollation[PARTITION_MAX_KEYS]; List *partexprs = NIL; + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + partnatts = list_length(stmt->partspec->partParams); /* Protect fixed-size arrays here and in executor */ @@ -1163,7 +1167,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, else { #endif - ComputePartitionAttrs(rel, stmt->partspec->partParams, + ComputePartitionAttrs(pstate, rel, stmt->partspec->partParams, partattrs, &partexprs, partopclass, partcollation, strategy); @@ -16152,7 +16156,7 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) * Expressions in the PartitionElems must be parse-analyzed already. 
*/ static void -ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, +ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy) { @@ -16179,14 +16183,16 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" named in partition key does not exist", - pelem->name))); + pelem->name), + parser_errposition(pstate, pelem->location))); attform = (Form_pg_attribute) GETSTRUCT(atttuple); if (attform->attnum <= 0) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("cannot use system column \"%s\" in partition key", - pelem->name))); + pelem->name), + parser_errposition(pstate, pelem->location))); partattrs[attn] = attform->attnum; atttype = attform->atttypid; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 55e9e44d..e9bf8784 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -353,11 +353,15 @@ CREATE TABLE partitioned ( a int ) PARTITION BY RANGE (b); ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ -- cannot use system columns in partition key CREATE TABLE partitioned ( a int ) PARTITION BY RANGE (xmin); ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ -- functions in key must be immutable CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; CREATE TABLE partitioned ( @@ -746,6 +750,8 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::reg -- specify PARTITION BY for a partition CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); -- create a level-2 partition CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); From b62a02dd517ac39bc0adb576989e97dc20aae877 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:38:05 +0800 Subject: [PATCH 262/578] Minor fixes for psql tab completion. --- src/bin/psql/tab-complete.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 49305e4a..29eaf18b 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -558,6 +558,7 @@ static const SchemaQuery Query_for_list_of_tmf = { "pg_catalog.pg_class c", /* selcondition */ "c.relkind IN (" CppAsString2(RELKIND_RELATION) ", " + CppAsString2(RELKIND_PARTITIONED_TABLE) ", " CppAsString2(RELKIND_MATVIEW) ", " CppAsString2(RELKIND_FOREIGN_TABLE) ")", /* viscondition */ @@ -2034,6 +2035,7 @@ psql_completion(const char *text, int start, int end) "fillfactor", "parallel_workers", "log_autovacuum_min_duration", + "toast_tuple_target", "toast.autovacuum_enabled", "toast.autovacuum_freeze_max_age", "toast.autovacuum_freeze_min_age", @@ -2535,7 +2537,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_LIST2("TABLE", "MATERIALIZED VIEW"); /* Complete PARTITION BY with RANGE ( or LIST ( or ... 
*/ else if (TailMatches2("PARTITION", "BY")) - COMPLETE_WITH_LIST2("RANGE (", "LIST ("); + COMPLETE_WITH_LIST3("RANGE (", "LIST (", "HASH ("); /* If we have xxx PARTITION OF, provide a list of partitioned tables */ else if (TailMatches2("PARTITION", "OF")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, ""); From d554203e6baefb1092b6d240946dddb875248f1c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:50:39 +0800 Subject: [PATCH 263/578] Fix ALTER/TYPE on columns referenced by FKs in partitioned tables --- src/backend/commands/tablecmds.c | 59 +++++++-------------- src/test/regress/expected/foreign_key.out | 12 +++++ src/test/regress/expected/foreign_key_1.out | 12 +++++ src/test/regress/expected/foreign_key_2.out | 12 +++++ src/test/regress/sql/foreign_key.sql | 11 ++++ 5 files changed, 65 insertions(+), 41 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index d585d5e9..b4f3ddb8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -11080,26 +11080,6 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, { char *defstring = pg_get_constraintdef_command(foundObject.objectId); - /* - * Put NORMAL dependencies at the front of the list and - * AUTO dependencies at the back. This makes sure that - * foreign-key constraints depending on this column will - * be dropped before unique or primary-key constraints of - * the column; which we must have because the FK - * constraints depend on the indexes belonging to the - * unique constraints. - */ - if (foundDep->deptype == DEPENDENCY_NORMAL) - { - tab->changedConstraintOids = - lcons_oid(foundObject.objectId, - tab->changedConstraintOids); - tab->changedConstraintDefs = - lcons(defstring, - tab->changedConstraintDefs); - } - else - { tab->changedConstraintOids = lappend_oid(tab->changedConstraintOids, foundObject.objectId); @@ -11107,7 +11087,6 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, lappend(tab->changedConstraintDefs, defstring); } - } break; case OCLASS_REWRITE: @@ -11462,10 +11441,18 @@ static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) { ObjectAddress obj; + ObjectAddresses *objects; ListCell *def_item; ListCell *oid_item; /* + * Collect all the constraints and indexes to drop so we can process them + * in a single call. That way we don't have to worry about dependencies + * among them. + */ + objects = new_object_addresses(); + + /* * Re-parse the index and constraint definitions, and attach them to the * appropriate work queue entries. 
We do this before dropping because in * the case of a FOREIGN KEY constraint, we might not yet have exclusive @@ -11498,6 +11485,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) conislocal = con->conislocal; ReleaseSysCache(tup); + ObjectAddressSet(obj, ConstraintRelationId, lfirst_oid(oid_item)); + add_exact_object_address(&obj, objects); + /* * If the constraint is inherited (only), we don't want to inject a * new definition here; it'll get recreated when ATAddCheckConstraint @@ -11521,31 +11511,18 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); + + ObjectAddressSet(obj, RelationRelationId, lfirst_oid(oid_item)); + add_exact_object_address(&obj, objects); } /* - * Now we can drop the existing constraints and indexes --- constraints - * first, since some of them might depend on the indexes. In fact, we - * have to delete FOREIGN KEY constraints before UNIQUE constraints, but - * we already ordered the constraint list to ensure that would happen. It - * should be okay to use DROP_RESTRICT here, since nothing else should be - * depending on these objects. + * It should be okay to use DROP_RESTRICT here, since nothing else should + * be depending on these objects. */ - foreach(oid_item, tab->changedConstraintOids) - { - obj.classId = ConstraintRelationId; - obj.objectId = lfirst_oid(oid_item); - obj.objectSubId = 0; - performDeletion(&obj, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - } + performMultipleDeletions(objects, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - foreach(oid_item, tab->changedIndexOids) - { - obj.classId = RelationRelationId; - obj.objectId = lfirst_oid(oid_item); - obj.objectSubId = 0; - performDeletion(&obj, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - } + free_object_addresses(objects); /* * The objects will get recreated during subsequent passes over the work diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 59e95190..5a958f3a 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1431,3 +1431,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". 
+DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/expected/foreign_key_1.out b/src/test/regress/expected/foreign_key_1.out index e5861d11..cb069e3a 100644 --- a/src/test/regress/expected/foreign_key_1.out +++ b/src/test/regress/expected/foreign_key_1.out @@ -1426,3 +1426,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 8b8ac8ac..27e9026d 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1438,3 +1438,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 2fcd7d60..8c887eb9 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1060,3 +1060,14 @@ alter table fktable2 drop constraint fktable2_f1_fkey; commit; drop table pktable2, fktable2; + +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. 
+CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; From 873bafeff4e7918c0f3ba710d3f43a7a38be32b0 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 15:36:08 +0800 Subject: [PATCH 264/578] Fix event triggers for partitioned tables. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/index.c | 10 +- src/backend/commands/event_trigger.c | 13 +- src/backend/commands/indexcmds.c | 3 +- src/backend/commands/tablecmds.c | 2 +- src/backend/commands/view.c | 4 + src/backend/executor/execPartition.c | 31 +++++ src/include/catalog/index.h | 3 +- src/include/executor/execPartition.h | 30 +---- src/include/tcop/deparse_utility.h | 122 +++++++++--------- .../test_ddl_deparse/expected/alter_table.out | 12 ++ .../test_ddl_deparse/sql/alter_table.sql | 8 ++ src/test/regress/expected/event_trigger.out | 20 ++- src/test/regress/sql/event_trigger.sql | 13 ++ 13 files changed, 170 insertions(+), 101 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 04d3d181..7f01e417 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -48,6 +48,7 @@ #include "catalog/pg_type.h" #include "catalog/storage.h" #include "commands/tablecmds.h" +#include "commands/event_trigger.h" #include "commands/trigger.h" #include "executor/executor.h" #include "miscadmin.h" @@ -198,8 +199,9 @@ relationHasPrimaryKey(Relation rel) void index_check_primary_key(Relation heapRel, IndexInfo *indexInfo, - bool is_alter_table) -{// #lizard forgives + bool is_alter_table, + IndexStmt *stmt) +{ List *cmds; int i; #ifdef __TBASE__ @@ -295,7 +297,11 @@ index_check_primary_key(Relation heapRel, * unduly. */ if (cmds) + { + EventTriggerAlterTableStart((Node *) stmt); AlterTableInternal(RelationGetRelid(heapRel), cmds, true); + EventTriggerAlterTableEnd(); + } } /* diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index d289c395..742e23d4 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -1813,11 +1813,6 @@ EventTriggerCollectSimpleCommand(ObjectAddress address, * Note we don't collect the command immediately; instead we keep it in * currentCommand, and only when we're done processing the subcommands we will * add it to the command list. - * - * XXX -- this API isn't considering the possibility of an ALTER TABLE command - * being called reentrantly by an event trigger function. Do we need stackable - * commands at this level? Perhaps at least we should detect the condition and - * raise an error. 
*/ void EventTriggerAlterTableStart(Node *parsetree) @@ -1842,6 +1837,7 @@ EventTriggerAlterTableStart(Node *parsetree) command->d.alterTable.subcmds = NIL; command->parsetree = copyObject(parsetree); + command->parent = currentEventTriggerState->currentCommand; currentEventTriggerState->currentCommand = command; MemoryContextSwitchTo(oldcxt); @@ -1882,6 +1878,7 @@ EventTriggerCollectAlterTableSubcmd(Node *subcmd, ObjectAddress address) return; Assert(IsA(subcmd, AlterTableCmd)); + Assert(OidIsValid(currentEventTriggerState->currentCommand)); Assert(OidIsValid(currentEventTriggerState->currentCommand->d.alterTable.objectId)); oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); @@ -1907,11 +1904,15 @@ EventTriggerCollectAlterTableSubcmd(Node *subcmd, ObjectAddress address) void EventTriggerAlterTableEnd(void) { + CollectedCommand *parent; + /* ignore if event trigger context not set, or collection disabled */ if (!currentEventTriggerState || currentEventTriggerState->commandCollectionInhibited) return; + parent = currentEventTriggerState->currentCommand->parent; + /* If no subcommands, don't collect */ if (list_length(currentEventTriggerState->currentCommand->d.alterTable.subcmds) != 0) { @@ -1922,7 +1923,7 @@ EventTriggerAlterTableEnd(void) else pfree(currentEventTriggerState->currentCommand); - currentEventTriggerState->currentCommand = NULL; + currentEventTriggerState->currentCommand = parent; } /* diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 234d4e26..ad99f3e2 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -33,6 +33,7 @@ #include "commands/comment.h" #include "commands/dbcommands.h" #include "commands/defrem.h" +#include "commands/event_trigger.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" #include "mb/pg_wchar.h" @@ -664,7 +665,7 @@ DefineIndex(Oid relationId, * Extra checks when creating a PRIMARY KEY index. */ if (stmt->primary) - index_check_primary_key(rel, indexInfo, is_alter_table); + index_check_primary_key(rel, indexInfo, is_alter_table, stmt); /* * We disallow indexes on system columns other than OID. They would not diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b4f3ddb8..cf6b6896 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8569,7 +8569,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, /* Extra checks needed if making primary key */ if (stmt->primary) - index_check_primary_key(rel, indexInfo, true); + index_check_primary_key(rel, indexInfo, true, stmt); /* Note we currently don't support EXCLUSION constraints here */ if (stmt->primary) diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index ae927e9e..b2a9ebc6 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -65,6 +65,8 @@ validateWithCheckOption(char *value) * * Create a view relation and use the rules system to store the query * for the view. + * + * EventTriggerAlterTableStart must have been called already. 
*--------------------------------------------------------------------- */ static ObjectAddress @@ -190,6 +192,7 @@ DefineVirtualRelation(RangeVar *relation, List *tlist, bool replace, atcmds = lappend(atcmds, atcmd); } + /* EventTriggerAlterTableStart called by ProcessUtilitySlow */ AlterTableInternal(viewOid, atcmds, true); /* Make the new view columns visible */ @@ -221,6 +224,7 @@ DefineVirtualRelation(RangeVar *relation, List *tlist, bool replace, atcmd->def = (Node *) options; atcmds = list_make1(atcmd); + /* EventTriggerAlterTableStart called by ProcessUtilitySlow */ AlterTableInternal(viewOid, atcmds, true); ObjectAddressSet(address, RelationRelationId, viewOid); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 1835d52a..60221c6b 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -23,6 +23,37 @@ #include "utils/rls.h" #include "utils/ruleutils.h" + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to one of its partitions + * + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * get_partition_dispatch_recurse()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; +} PartitionDispatchData; + + static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids); static void get_partition_dispatch_recurse(Relation rel, Relation parent, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index c2ee59fa..3afe88f8 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -100,7 +100,8 @@ typedef enum extern void index_check_primary_key(Relation heapRel, IndexInfo *indexInfo, - bool is_alter_table); + bool is_alter_table, + IndexStmt *stmt); extern Oid index_create(Relation heapRelation, const char *indexRelationName, diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 6996258a..d4d1be1d 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -18,35 +18,7 @@ #include "nodes/parsenodes.h" #include "nodes/plannodes.h" -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when 
extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * get_partition_dispatch_recurse()) - *----------------------- - */ -typedef struct PartitionDispatchData -{ - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; - +/* See execPartition.c for the definition. */ typedef struct PartitionDispatchData *PartitionDispatch; /*----------------------- diff --git a/src/include/tcop/deparse_utility.h b/src/include/tcop/deparse_utility.h index 3818a858..0d709d06 100644 --- a/src/include/tcop/deparse_utility.h +++ b/src/include/tcop/deparse_utility.h @@ -23,13 +23,13 @@ */ typedef enum CollectedCommandType { - SCT_Simple, - SCT_AlterTable, - SCT_Grant, - SCT_AlterOpFamily, - SCT_AlterDefaultPrivileges, - SCT_CreateOpClass, - SCT_AlterTSConfig + SCT_Simple, + SCT_AlterTable, + SCT_Grant, + SCT_AlterOpFamily, + SCT_AlterDefaultPrivileges, + SCT_CreateOpClass, + SCT_AlterTSConfig } CollectedCommandType; /* @@ -37,69 +37,71 @@ typedef enum CollectedCommandType */ typedef struct CollectedATSubcmd { - ObjectAddress address; /* affected column, constraint, index, ... */ - Node *parsetree; + ObjectAddress address; /* affected column, constraint, index, ... */ + Node *parsetree; } CollectedATSubcmd; typedef struct CollectedCommand { - CollectedCommandType type; - bool in_extension; - Node *parsetree; + CollectedCommandType type; - union - { - /* most commands */ - struct - { - ObjectAddress address; - ObjectAddress secondaryObject; - } simple; + bool in_extension; + Node *parsetree; - /* ALTER TABLE, and internal uses thereof */ - struct - { - Oid objectId; - Oid classId; - List *subcmds; - } alterTable; + union + { + /* most commands */ + struct + { + ObjectAddress address; + ObjectAddress secondaryObject; + } simple; - /* GRANT / REVOKE */ - struct - { - InternalGrant *istmt; - } grant; + /* ALTER TABLE, and internal uses thereof */ + struct + { + Oid objectId; + Oid classId; + List *subcmds; + } alterTable; - /* ALTER OPERATOR FAMILY */ - struct - { - ObjectAddress address; - List *operators; - List *procedures; - } opfam; + /* GRANT / REVOKE */ + struct + { + InternalGrant *istmt; + } grant; - /* CREATE OPERATOR CLASS */ - struct - { - ObjectAddress address; - List *operators; - List *procedures; - } createopc; + /* ALTER OPERATOR FAMILY */ + struct + { + ObjectAddress address; + List *operators; + List *procedures; + } opfam; - /* ALTER TEXT SEARCH CONFIGURATION ADD/ALTER/DROP MAPPING */ - struct - { - ObjectAddress address; - Oid *dictIds; - int ndicts; - } atscfg; + /* CREATE OPERATOR CLASS */ + struct + { + ObjectAddress address; + List *operators; + List *procedures; + } createopc; - /* ALTER DEFAULT PRIVILEGES */ - struct - { - GrantObjectType objtype; - } defprivs; - } d; + /* ALTER TEXT SEARCH CONFIGURATION ADD/ALTER/DROP MAPPING */ + struct + { + ObjectAddress address; + Oid *dictIds; + int ndicts; + } atscfg; + + /* ALTER DEFAULT PRIVILEGES */ + struct + { + GrantObjectType objtype; + } defprivs; + } d; + struct CollectedCommand *parent; /* when nested */ } CollectedCommand; -#endif /* DEPARSE_UTILITY_H */ +#endif /* DEPARSE_UTILITY_H */ diff --git a/src/test/modules/test_ddl_deparse/expected/alter_table.out b/src/test/modules/test_ddl_deparse/expected/alter_table.out index 
e304787b..7da847d4 100644 --- a/src/test/modules/test_ddl_deparse/expected/alter_table.out +++ b/src/test/modules/test_ddl_deparse/expected/alter_table.out @@ -16,3 +16,15 @@ NOTICE: DDL test: type simple, tag ALTER TABLE ALTER TABLE parent ADD CONSTRAINT a_pos CHECK (a > 0); NOTICE: DDL test: type alter table, tag ALTER TABLE NOTICE: subcommand: ADD CONSTRAINT (and recurse) +CREATE TABLE part ( + a int +) PARTITION BY RANGE (a); +NOTICE: DDL test: type simple, tag CREATE TABLE +CREATE TABLE part1 PARTITION OF part FOR VALUES FROM (1) to (100); +NOTICE: DDL test: type simple, tag CREATE TABLE +ALTER TABLE part ADD PRIMARY KEY (a); +NOTICE: DDL test: type alter table, tag CREATE INDEX +NOTICE: subcommand: SET NOT NULL +NOTICE: subcommand: SET NOT NULL +NOTICE: DDL test: type alter table, tag ALTER TABLE +NOTICE: subcommand: ADD INDEX diff --git a/src/test/modules/test_ddl_deparse/sql/alter_table.sql b/src/test/modules/test_ddl_deparse/sql/alter_table.sql index 6e2cca75..dec53a06 100644 --- a/src/test/modules/test_ddl_deparse/sql/alter_table.sql +++ b/src/test/modules/test_ddl_deparse/sql/alter_table.sql @@ -11,3 +11,11 @@ ALTER TABLE parent ADD COLUMN b serial; ALTER TABLE parent RENAME COLUMN b TO c; ALTER TABLE parent ADD CONSTRAINT a_pos CHECK (a > 0); + +CREATE TABLE part ( + a int +) PARTITION BY RANGE (a); + +CREATE TABLE part1 PARTITION OF part FOR VALUES FROM (1) to (100); + +ALTER TABLE part ADD PRIMARY KEY (a); diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 085eb207..2537e6f1 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -283,14 +283,32 @@ CREATE SCHEMA evttrig CREATE TABLE one (col_a SERIAL PRIMARY KEY, col_b text DEFAULT 'forty two') CREATE INDEX one_idx ON one (col_b) CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); +-- Partitioned tables with a partitioned index +CREATE TABLE evttrig.parted ( + id int PRIMARY KEY) + PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (1) TO (10); +CREATE TABLE evttrig.part_10_20 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (10) TO (20) PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_10_15 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (10) TO (15); +CREATE TABLE evttrig.part_15_20 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (15) TO (20); ALTER TABLE evttrig.two DROP COLUMN col_c; ALTER TABLE evttrig.one ALTER COLUMN col_b DROP DEFAULT; ALTER TABLE evttrig.one DROP CONSTRAINT one_pkey; DROP INDEX evttrig.one_idx; DROP SCHEMA evttrig CASCADE; -NOTICE: drop cascades to 2 other objects +NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table evttrig.one drop cascades to table evttrig.two +drop cascades to table evttrig.parted +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted name={evttrig,parted} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 name={evttrig,part_1_10} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 name={evttrig,part_10_20} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 name={evttrig,part_10_15} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; DROP EVENT TRIGGER regress_event_trigger_report_dropped; ERROR: event trigger 
"regress_event_trigger_report_dropped" does not exist diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index b65bf3ec..9c8fa5f6 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -263,6 +263,19 @@ CREATE SCHEMA evttrig CREATE INDEX one_idx ON one (col_b) CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); +-- Partitioned tables with a partitioned index +CREATE TABLE evttrig.parted ( + id int PRIMARY KEY) + PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (1) TO (10); +CREATE TABLE evttrig.part_10_20 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (10) TO (20) PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_10_15 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (10) TO (15); +CREATE TABLE evttrig.part_15_20 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (15) TO (20); + ALTER TABLE evttrig.two DROP COLUMN col_c; ALTER TABLE evttrig.one ALTER COLUMN col_b DROP DEFAULT; ALTER TABLE evttrig.one DROP CONSTRAINT one_pkey; From 40d30a952b94ef13273936fedb953c0ad3df9e66 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 15:53:50 +0800 Subject: [PATCH 265/578] Fix catalog insertion order for ATTACH PARTITION. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 6 +++--- src/test/regress/expected/alter_table.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_1.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_2.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_3.out | 15 +++++++++++++++ src/test/regress/sql/alter_table.sql | 18 +++++++++++++++++- 6 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cf6b6896..f976badd 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16688,9 +16688,6 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) trigger_name, RelationGetRelationName(attachrel)), errdetail("ROW triggers with transition tables are not supported on partitions"))); - /* OK to create inheritance. Rest of the checks performed there */ - CreateInheritance(attachrel, rel); - /* Update the default partition oid */ if (cmd->bound->is_default) update_default_partition_oid(RelationGetRelid(rel), @@ -16704,6 +16701,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) check_new_partition_bound(RelationGetRelationName(attachrel), rel, cmd->bound); + /* OK to create inheritance. Rest of the checks performed there */ + CreateInheritance(attachrel, rel); + /* Update the pg_class entry. 
*/ StorePartitionBound(attachrel, rel, cmd->bound); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 088474cf..ea00b3ae 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3745,3 +3745,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 8e1053bc..744691c9 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index 19a9d000..a958aa64 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a 
SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 5cdf3e7a..9d426e3c 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index e1c6772c..7b5f2409 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2515,4 +2515,20 @@ alter table defpart_attach_test_d add check (a > 1); -- should be attached successfully and without needing to be scanned alter table defpart_attach_test attach partition defpart_attach_test_d default; -drop table defpart_attach_test; \ No newline at end of file +drop table defpart_attach_test; + +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by 
range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; \ No newline at end of file From 58fd2898036f5773284c5c92efaffc4361917106 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 17:20:43 +0800 Subject: [PATCH 266/578] Add pg_partition_tree to display information about partitions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 43 ++++++ src/backend/utils/adt/Makefile | 2 +- src/backend/utils/adt/partitionfuncs.c | 154 +++++++++++++++++++ src/include/catalog/pg_proc.h | 3 + src/test/regress/expected/partition_info.out | 114 ++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_info.sql | 68 ++++++++ 8 files changed, 385 insertions(+), 2 deletions(-) create mode 100644 src/backend/utils/adt/partitionfuncs.c create mode 100644 src/test/regress/expected/partition_info.out create mode 100644 src/test/regress/sql/partition_info.sql diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 6729c562..cb4821d5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19824,6 +19824,49 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); The function returns the number of new collation objects it created. + + Partitioning Information Functions + + + Name Return Type Description + + + + + pg_partition_tree(regclass) + setof record + + List information about tables or indexes in a partition tree for a + given partitioned table or partitioned index, with one row for each + partition. Information provided includes the name of the partition, + the name of its immediate parent, a boolean value telling if the + partition is a leaf, and an integer telling its level in the hierarchy. + The value of level begins at 0 for the input table + or index in its role as the root of the partition tree, + 1 for its partitions, 2 for + their partitions, and so on. + + + + +
+ + + To check the total size of the data contained in + measurement table described in + , one could use the + following query: + + + +=# SELECT pg_size_pretty(sum(pg_relation_size(relid))) AS total_size + FROM pg_partition_tree('measurement'); + total_size +------------ + 24 kB +(1 row) + + diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 1fb01841..1e6765fc 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -19,7 +19,7 @@ OBJS = acl.o amutils.o arrayfuncs.o array_expanded.o array_selfuncs.o \ jsonfuncs.o like.o lockfuncs.o mac.o mac8.o misc.o nabstime.o name.o \ network.o network_gist.o network_selfuncs.o network_spgist.o \ numeric.o numutils.o oid.o oracle_compat.o \ - orderedsetaggs.o pg_locale.o pg_lsn.o pg_upgrade_support.o \ + orderedsetaggs.o partitionfuncs.o pg_locale.o pg_lsn.o pg_upgrade_support.o \ pgstatfuncs.o \ pseudotypes.o quote.o rangetypes.o rangetypes_gist.o \ rangetypes_selfuncs.o rangetypes_spgist.o rangetypes_typanalyze.o \ diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c new file mode 100644 index 00000000..8f9218ad --- /dev/null +++ b/src/backend/utils/adt/partitionfuncs.c @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * partitionfuncs.c + * Functions for accessing partition-related metadata + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/partitionfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/partition.h" +#include "catalog/pg_class.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" + + +/* + * pg_partition_tree + * + * Produce a view with one row per member of a partition tree, beginning + * from the top-most parent given by the caller. This gives information + * about each partition, its immediate partitioned parent, if it is + * a leaf partition and its level in the hierarchy. + */ +Datum +pg_partition_tree(PG_FUNCTION_ARGS) +{ +#define PG_PARTITION_TREE_COLS 4 + Oid rootrelid = PG_GETARG_OID(0); + char relkind = get_rel_relkind(rootrelid); + FuncCallContext *funcctx; + ListCell **next; + + /* Only allow relation types that can appear in partition trees. */ + if (relkind != RELKIND_RELATION && + relkind != RELKIND_FOREIGN_TABLE && + relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a table, a foreign table, or an index", + get_rel_name(rootrelid)))); + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + List *partitions; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* + * Find all members of inheritance set. We only need AccessShareLock + * on the children for the partition information lookup. 
+ */ + partitions = find_all_inheritors(rootrelid, AccessShareLock, NULL); + + tupdesc = CreateTemplateTupleDesc(PG_PARTITION_TREE_COLS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "relid", + REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "parentid", + REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "isleaf", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "level", + INT4OID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* allocate memory for user context */ + next = (ListCell **) palloc(sizeof(ListCell *)); + *next = list_head(partitions); + funcctx->user_fctx = (void *) next; + + MemoryContextSwitchTo(oldcxt); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + next = (ListCell **) funcctx->user_fctx; + + if (*next != NULL) + { + Datum result; + Datum values[PG_PARTITION_TREE_COLS]; + bool nulls[PG_PARTITION_TREE_COLS]; + HeapTuple tuple; + Oid parentid = InvalidOid; + Oid relid = lfirst_oid(*next); + char relkind = get_rel_relkind(relid); + int level = 0; + List *ancestors = get_partition_ancestors(lfirst_oid(*next)); + ListCell *lc; + + /* + * Form tuple with appropriate data. + */ + MemSet(nulls, 0, sizeof(nulls)); + MemSet(values, 0, sizeof(values)); + + /* relid */ + values[0] = ObjectIdGetDatum(relid); + + /* parentid */ + if (ancestors != NIL) + parentid = linitial_oid(ancestors); + if (OidIsValid(parentid)) + values[1] = ObjectIdGetDatum(parentid); + else + nulls[1] = true; + + /* isleaf */ + values[2] = BoolGetDatum(relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX); + + /* level */ + if (relid != rootrelid) + { + foreach(lc, ancestors) + { + level++; + if (lfirst_oid(lc) == rootrelid) + break; + } + } + values[3] = Int32GetDatum(level); + + *next = lnext(*next); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + /* done when there are no more elements left */ + SRF_RETURN_DONE(funcctx); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 51adc65a..bd1481cf 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5710,6 +5710,9 @@ DESCR("list of files in the WAL directory"); /* hash partitioning constraint function */ DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ "{i,i,i,v}" _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); DESCR("hash partition CHECK constraint"); +/* information about a partition tree */ +DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); +DESCR("view partition tree tables"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git 
a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out new file mode 100644 index 00000000..6b116125 --- /dev/null +++ b/src/test/regress/expected/partition_info.out @@ -0,0 +1,114 @@ +-- +-- Tests for pg_partition_tree +-- +SELECT * FROM pg_partition_tree(NULL); + relid | parentrelid | isleaf | level +-------+-------------+--------+------- +(0 rows) + +-- Test table partition trees +CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); +CREATE TABLE ptif_test0 PARTITION OF ptif_test + FOR VALUES FROM (minvalue) TO (0) PARTITION BY list (b); +CREATE TABLE ptif_test01 PARTITION OF ptif_test0 FOR VALUES IN (1); +CREATE TABLE ptif_test1 PARTITION OF ptif_test + FOR VALUES FROM (0) TO (100) PARTITION BY list (b); +CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); +CREATE TABLE ptif_test2 PARTITION OF ptif_test + FOR VALUES FROM (100) TO (maxvalue); +-- Test index partition tree +CREATE INDEX ptif_test_index ON ONLY ptif_test (a); +CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test0_index; +CREATE INDEX ptif_test01_index ON ptif_test01 (a); +ALTER INDEX ptif_test0_index ATTACH PARTITION ptif_test01_index; +CREATE INDEX ptif_test1_index ON ONLY ptif_test1 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test1_index; +CREATE INDEX ptif_test11_index ON ptif_test11 (a); +ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; +CREATE INDEX ptif_test2_index ON ptif_test2 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +-- List all tables members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test'); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test | | 0 | f + ptif_test0 | ptif_test | 1 | f + ptif_test1 | ptif_test | 1 | f + ptif_test2 | ptif_test | 1 | t + ptif_test01 | ptif_test0 | 2 | t + ptif_test11 | ptif_test1 | 2 | t +(6 rows) + +-- List tables from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test0 | ptif_test | 0 | f + ptif_test01 | ptif_test0 | 1 | t +(2 rows) + +-- List from leaf table +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test01 | ptif_test0 | 0 | t +(1 row) + +-- List all indexes members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test_index'); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test_index | | 0 | f + ptif_test0_index | ptif_test_index | 1 | f + ptif_test1_index | ptif_test_index | 1 | f + ptif_test2_index | ptif_test_index | 1 | t + ptif_test01_index | ptif_test0_index | 2 | t + ptif_test11_index | ptif_test1_index | 2 | t +(6 rows) + +-- List indexes from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test0_index | ptif_test_index | 0 | f + ptif_test01_index | ptif_test0_index | 1 | t +(2 rows) + +-- List from leaf index +SELECT relid, parentrelid, level, isleaf + FROM 
pg_partition_tree('ptif_test01_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test01_index | ptif_test0_index | 0 | t +(1 row) + +DROP TABLE ptif_test; +-- A table not part of a partition tree works is the only member listed. +CREATE TABLE ptif_normal_table(a int); +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_normal_table'); + relid | parentrelid | level | isleaf +-------------------+-------------+-------+-------- + ptif_normal_table | | 0 | t +(1 row) + +DROP TABLE ptif_normal_table; +-- Views and materialized viewS cannot be part of a partition tree. +CREATE VIEW ptif_test_view AS SELECT 1; +CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; +SELECT * FROM pg_partition_tree('ptif_test_view'); +ERROR: "ptif_test_view" is not a table, a foreign table, or an index +SELECT * FROM pg_partition_tree('ptif_test_matview'); +ERROR: "ptif_test_matview" is not a table, a foreign table, or an index +DROP VIEW ptif_test_view; +DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 905cb00a..95fafcd7 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune partition_prune_hash hash_part +test: identity partition_join partition_prune partition_prune_hash hash_part partition_info # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 1f00bfbc..f91b37b9 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -194,6 +194,7 @@ test: identity test: partition_join test: partition_prune test: partition_prune_hash +test: partition_info test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql new file mode 100644 index 00000000..5a76f22b --- /dev/null +++ b/src/test/regress/sql/partition_info.sql @@ -0,0 +1,68 @@ +-- +-- Tests for pg_partition_tree +-- +SELECT * FROM pg_partition_tree(NULL); + +-- Test table partition trees +CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); +CREATE TABLE ptif_test0 PARTITION OF ptif_test + FOR VALUES FROM (minvalue) TO (0) PARTITION BY list (b); +CREATE TABLE ptif_test01 PARTITION OF ptif_test0 FOR VALUES IN (1); +CREATE TABLE ptif_test1 PARTITION OF ptif_test + FOR VALUES FROM (0) TO (100) PARTITION BY list (b); +CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); +CREATE TABLE ptif_test2 PARTITION OF ptif_test + FOR VALUES FROM (100) TO (maxvalue); + +-- Test index partition tree +CREATE INDEX ptif_test_index ON ONLY ptif_test (a); +CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test0_index; +CREATE INDEX ptif_test01_index ON ptif_test01 (a); +ALTER INDEX ptif_test0_index ATTACH PARTITION ptif_test01_index; +CREATE INDEX ptif_test1_index ON ONLY ptif_test1 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test1_index; +CREATE INDEX ptif_test11_index ON ptif_test11 (a); +ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; +CREATE INDEX ptif_test2_index ON ptif_test2 (a); +ALTER INDEX 
ptif_test_index ATTACH PARTITION ptif_test2_index; + +-- List all tables members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test'); +-- List tables from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0') p + JOIN pg_class c ON (p.relid = c.oid); +-- List from leaf table +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01') p + JOIN pg_class c ON (p.relid = c.oid); + +-- List all indexes members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test_index'); +-- List indexes from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0_index') p + JOIN pg_class c ON (p.relid = c.oid); +-- List from leaf index +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01_index') p + JOIN pg_class c ON (p.relid = c.oid); + +DROP TABLE ptif_test; + +-- A table not part of a partition tree works is the only member listed. +CREATE TABLE ptif_normal_table(a int); +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_normal_table'); +DROP TABLE ptif_normal_table; + +-- Views and materialized viewS cannot be part of a partition tree. +CREATE VIEW ptif_test_view AS SELECT 1; +CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; +SELECT * FROM pg_partition_tree('ptif_test_view'); +SELECT * FROM pg_partition_tree('ptif_test_matview'); +DROP VIEW ptif_test_view; +DROP MATERIALIZED VIEW ptif_test_matview; From 62b8223aa66cb27f23294f6cd9f184a12de4c41b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 17:52:18 +0800 Subject: [PATCH 267/578] Fix tablespace handling for partitioned indexes --- src/backend/catalog/heap.c | 9 ++++ src/backend/commands/tablecmds.c | 60 +++++++++++++++++++++-- src/test/regress/input/tablespace.source | 10 ++++ src/test/regress/output/tablespace.source | 19 ++++++- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 57e486f1..ff83af36 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -419,6 +419,15 @@ heap_create(const char *relname, */ reltablespace = InvalidOid; break; + + case RELKIND_PARTITIONED_INDEX: + /* + * Preserve tablespace so that it's used as tablespace for indexes + * on future partitions. 
+ */ + create_storage = false; + break; + case RELKIND_SEQUENCE: create_storage = true; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f976badd..fa871fd8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -495,6 +495,7 @@ static bool ATPrepChangePersistence(Relation rel, bool toLogged); static void ATPrepSetTableSpace(AlteredTableInfo *tab, Relation rel, char *tablespacename, LOCKMODE lockmode); static void ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode); +static void ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace); static void ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, LOCKMODE lockmode); @@ -4904,7 +4905,8 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, pass = AT_PASS_DROP; break; case AT_SetTableSpace: /* SET TABLESPACE */ - ATSimplePermissions(rel, ATT_TABLE | ATT_MATVIEW | ATT_INDEX); + ATSimplePermissions(rel, ATT_TABLE | ATT_MATVIEW | ATT_INDEX | + ATT_PARTITIONED_INDEX); /* This command never recurses */ ATPrepSetTableSpace(tab, rel, cmd->name, lockmode); pass = AT_PASS_MISC; /* doesn't actually matter */ @@ -5278,10 +5280,13 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, */ break; case AT_SetTableSpace: /* SET TABLESPACE */ - /* - * Nothing to do here; Phase 3 does the work + * Only do this for partitioned indexes, for which this is just + * a catalog change. Other relation types are handled by Phase 3. */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ATExecPartedIdxSetTableSpace(rel, tab->newTableSpace); + break; case AT_SetRelOptions: /* SET (...) */ case AT_ResetRelOptions: /* RESET (...) */ @@ -12590,6 +12595,55 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) list_free(reltoastidxids); } +/* + * Special handling of ALTER TABLE SET TABLESPACE for partitioned indexes, + * which have no storage (so not handled in Phase 3 like other relation types) + */ +static void +ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) +{ + HeapTuple tuple; + Oid oldTableSpace; + Relation pg_class; + Form_pg_class rd_rel; + Oid indexOid = RelationGetRelid(rel); + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * No work if no change in tablespace. + */ + oldTableSpace = rel->rd_rel->reltablespace; + if (newTableSpace == oldTableSpace || + (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) + { + InvokeObjectPostAlterHook(RelationRelationId, + indexOid, 0); + return; + } + + /* Get a modifiable copy of the relation's pg_class row */ + pg_class = heap_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(indexOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", indexOid); + rd_rel = (Form_pg_class) GETSTRUCT(tuple); + + /* update the pg_class row */ + rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace; + CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); + + InvokeObjectPostAlterHook(RelationRelationId, indexOid, 0); + + heap_freetuple(tuple); + + heap_close(pg_class, RowExclusiveLock); + + /* Make sure the reltablespace change is visible */ + CommandCounterIncrement(); +} + /* * Alter Table ALL ... 
SET TABLESPACE * diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 03a62bd7..1454e433 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,6 +44,14 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -93,6 +101,8 @@ CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok INSERT INTO testschema.atable VALUES(1); -- fail (checks index) diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 40f8a72f..a1a615de 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,6 +61,20 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -200,6 +214,8 @@ CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok INSERT INTO testschema.atable VALUES(1); -- fail (checks index) ERROR: duplicate key value violates unique constraint "anindex" @@ -241,9 +257,10 @@ NOTICE: no matching relations in tablespace "regress_tblspace_renamed" found -- Should succeed DROP TABLESPACE regress_tblspace_renamed; DROP SCHEMA testschema CASCADE; -NOTICE: drop cascades 
to 4 other objects +NOTICE: drop cascades to 5 other objects DETAIL: drop cascades to table testschema.foo drop cascades to table testschema.asselect +drop cascades to table testschema.part drop cascades to table testschema.atable drop cascades to table testschema.tablespace_acl DROP ROLE regress_tablespace_user1; From e07fc7615b31017ae3554d2a7a8166b247389765 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 19:16:12 +0800 Subject: [PATCH 268/578] Optimize nested ConvertRowtypeExpr nodes. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/util/clauses.c | 46 +++++++++++++++++++++++++ src/test/regress/expected/inherit.out | 18 ++++++++++ src/test/regress/expected/inherit_1.out | 16 +++++++++ src/test/regress/expected/inherit_2.out | 16 +++++++++ src/test/regress/expected/inherit_3.out | 16 +++++++++ src/test/regress/sql/inherit.sql | 5 +++ 6 files changed, 117 insertions(+) diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 82a9f2ba..697b7dcc 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -3614,6 +3614,52 @@ eval_const_expressions_mutator(Node *node, context); } break; + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *cre = castNode(ConvertRowtypeExpr, node); + Node *arg; + ConvertRowtypeExpr *newcre; + + arg = eval_const_expressions_mutator((Node *) cre->arg, + context); + + newcre = makeNode(ConvertRowtypeExpr); + newcre->resulttype = cre->resulttype; + newcre->convertformat = cre->convertformat; + newcre->location = cre->location; + + /* + * In case of a nested ConvertRowtypeExpr, we can convert the + * leaf row directly to the topmost row format without any + * intermediate conversions. (This works because + * ConvertRowtypeExpr is used only for child->parent + * conversion in inheritance trees, which works by exact match + * of column name, and a column absent in an intermediate + * result can't be present in the final result.) + * + * No need to check more than one level deep, because the + * above recursion will have flattened anything else. + */ + if (arg != NULL && IsA(arg, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *argcre = (ConvertRowtypeExpr *) arg; + + arg = (Node *) argcre->arg; + + /* + * Make sure an outer implicit conversion can't hide an + * inner explicit one. + */ + if (newcre->convertformat == COERCE_IMPLICIT_CAST) + newcre->convertformat = argcre->convertformat; + } + + newcre->arg = (Expr *) arg; + + if (arg != NULL && IsA(arg, Const)) + return ece_evaluate_expr((Node *) newcre); + return (Node *) newcre; + } default: break; } diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index be0a774d..91c39448 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -1001,6 +1001,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived @@ -1014,6 +1016,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. 
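-- Why the flattening is safe: each child->parent conversion matches columns by
-- name, so the column b added by more_derived can never survive into the base
-- result.  A hypothetical spot check of that claim (it reuses the tables above
-- but is not part of the regression test itself):
--   select (row(1, 2)::more_derived::derived)::base = row(1, 2)::more_derived::base;
-- Both sides evaluate to the same '(1)'::base row, which is what allows the
-- intermediate ::derived conversion to be dropped in the plans below.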
+explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index d16ab5d6..8f7d0e3a 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -994,6 +994,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 0502f335..65ff71fe 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -996,6 +996,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 955a1170..707a6f63 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -994,6 +994,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index ea17dd86..e58bfd36 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -306,9 +306,14 @@ drop table p1 cascade; -- tables. See the pgsql-hackers thread beginning Dec. 
4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); insert into derived (i) values (0); select derived::base from derived; select NULL::derived::base; +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; +drop table more_derived; drop table derived; drop table base; From 98a2dfe642fc9aa07b7550fee520bb13f4b3e847 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 19:40:52 +0800 Subject: [PATCH 269/578] Fix dependency handling of partitions and inheritance for ON COMMIT. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/create_table.sgml | 7 ++- src/backend/commands/tablecmds.c | 63 ++++++++++++++++------ src/test/regress/expected/temp.out | 85 ++++++++++++++++++++++++++++++ src/test/regress/sql/temp.sql | 59 +++++++++++++++++++++ 4 files changed, 196 insertions(+), 18 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 62792897..70d3dcfc 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1107,7 +1107,8 @@ All rows in the temporary table will be deleted at the end of each transaction block. Essentially, an automatic is done - at each commit. + at each commit. When used on a partitioned table, this + is not cascaded to its partitions. @@ -1117,7 +1118,9 @@ The temporary table will be dropped at the end of the current - transaction block. + transaction block. When used on a partitioned table, this action + drops its partitions and when used on tables with inheritance + children, it drops the dependent children. diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index fa871fd8..062a6439 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -15617,6 +15617,7 @@ PreCommit_on_commit_actions(void) {// #lizard forgives ListCell *l; List *oids_to_truncate = NIL; + List *oids_to_drop = NIL; #ifdef XCP /* @@ -15659,35 +15660,65 @@ PreCommit_on_commit_actions(void) oids_to_truncate = lappend_oid(oids_to_truncate, oc->relid); break; case ONCOMMIT_DROP: + oids_to_drop = lappend_oid(oids_to_drop, oc->relid); + break; + } + } + + /* + * Truncate relations before dropping so that all dependencies between + * relations are removed after they are worked on. Doing it like this + * might be a waste as it is possible that a relation being truncated will + * be dropped anyway due to its parent being dropped, but this makes the + * code more robust because of not having to re-check that the relation + * exists at truncation time. + */ + if (oids_to_truncate != NIL) + { + heap_truncate(oids_to_truncate); + CommandCounterIncrement(); /* XXX needed? */ + } + if (oids_to_drop != NIL) + { + ObjectAddresses *targetObjects = new_object_addresses(); + ListCell *l; + + foreach(l, oids_to_drop) { ObjectAddress object; object.classId = RelationRelationId; - object.objectId = oc->relid; + object.objectId = lfirst_oid(l); object.objectSubId = 0; + Assert(!object_address_present(&object, targetObjects)); + + add_exact_object_address(&object, targetObjects); + } + /* - * Since this is an automatic drop, rather than one - * directly initiated by the user, we pass the - * PERFORM_DELETION_INTERNAL flag. 
+ * Since this is an automatic drop, rather than one directly initiated + * by the user, we pass the PERFORM_DELETION_INTERNAL flag. */ - performDeletion(&object, - DROP_CASCADE, PERFORM_DELETION_INTERNAL); + performMultipleDeletions(targetObjects, DROP_CASCADE, + PERFORM_DELETION_INTERNAL | PERFORM_DELETION_QUIETLY); + +#ifdef USE_ASSERT_CHECKING /* - * Note that table deletion will call - * remove_on_commit_action, so the entry should get marked - * as deleted. + * Note that table deletion will call remove_on_commit_action, so the + * entry should get marked as deleted. */ + foreach(l, on_commits) + { + OnCommitItem *oc = (OnCommitItem *) lfirst(l); + + if (oc->oncommit != ONCOMMIT_DROP) + continue; + Assert(oc->deleting_subid != InvalidSubTransactionId); - break; - } } - } - if (oids_to_truncate != NIL) - { - heap_truncate(oids_to_truncate); - CommandCounterIncrement(); /* XXX needed? */ +#endif } } diff --git a/src/test/regress/expected/temp.out b/src/test/regress/expected/temp.out index 0c4ac2ea..ee8f251d 100644 --- a/src/test/regress/expected/temp.out +++ b/src/test/regress/expected/temp.out @@ -191,3 +191,88 @@ select pg_temp.whoami(); (1 row) drop table public.whereami; +-- Check dependencies between ON COMMIT actions with a partitioned +-- table and its partitions. Using ON COMMIT DROP on a parent removes +-- the whole set. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit drop; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit delete rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- no relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; + relname +--------- +(0 rows) + +-- Using ON COMMIT DELETE on a partitioned table does not remove +-- all rows if partitions preserve their data. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit delete rows; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit preserve rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- Data from the remaining partition is still here as its rows are +-- preserved. +select * from temp_parted_oncommit_test; + a +--- + 1 +(1 row) + +-- two relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; + relname +---------------------------- + temp_parted_oncommit_test + temp_parted_oncommit_test1 +(2 rows) + +drop table temp_parted_oncommit_test; +-- Check dependencies between ON COMMIT actions with inheritance trees. +-- Using ON COMMIT DROP on a parent removes the whole set. +begin; +create temp table temp_inh_oncommit_test (a int) on commit drop; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit delete rows; +insert into temp_inh_oncommit_test1 values (1); +commit; +-- no relations remain in this case +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; + relname +--------- +(0 rows) + +-- Data on the parent is removed, and the child goes away. 
+begin; +create temp table temp_inh_oncommit_test (a int) on commit delete rows; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit drop; +insert into temp_inh_oncommit_test1 values (1); +insert into temp_inh_oncommit_test values (1); +commit; +select * from temp_inh_oncommit_test; + a +--- +(0 rows) + +-- one relation remains +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; + relname +------------------------ + temp_inh_oncommit_test +(1 row) + +drop table temp_inh_oncommit_test; diff --git a/src/test/regress/sql/temp.sql b/src/test/regress/sql/temp.sql index 6c3fc018..efac176f 100644 --- a/src/test/regress/sql/temp.sql +++ b/src/test/regress/sql/temp.sql @@ -151,3 +151,62 @@ select whoami(); select pg_temp.whoami(); drop table public.whereami; + +-- Check dependencies between ON COMMIT actions with a partitioned +-- table and its partitions. Using ON COMMIT DROP on a parent removes +-- the whole set. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit drop; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit delete rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- no relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; +-- Using ON COMMIT DELETE on a partitioned table does not remove +-- all rows if partitions preserve their data. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit delete rows; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit preserve rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- Data from the remaining partition is still here as its rows are +-- preserved. +select * from temp_parted_oncommit_test; +-- two relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; +drop table temp_parted_oncommit_test; + +-- Check dependencies between ON COMMIT actions with inheritance trees. +-- Using ON COMMIT DROP on a parent removes the whole set. +begin; +create temp table temp_inh_oncommit_test (a int) on commit drop; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit delete rows; +insert into temp_inh_oncommit_test1 values (1); +commit; +-- no relations remain in this case +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; +-- Data on the parent is removed, and the child goes away. 
+begin; +create temp table temp_inh_oncommit_test (a int) on commit delete rows; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit drop; +insert into temp_inh_oncommit_test1 values (1); +insert into temp_inh_oncommit_test values (1); +commit; +select * from temp_inh_oncommit_test; +-- one relation remains +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; +drop table temp_inh_oncommit_test; \ No newline at end of file From 10f8d2a0f55abe970a3c9ae8bcf2df4f994d83f8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:09:23 +0800 Subject: [PATCH 270/578] Disallow COPY FREEZE on partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/perform.sgml | 3 ++- doc/src/sgml/ref/copy.sgml | 4 +++- src/backend/commands/copy.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 454c3f1f..70483f73 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1546,7 +1546,8 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; needs to be written, because in case of an error, the files containing the newly loaded data will be removed anyway. However, this consideration only applies when - is minimal as all commands + is minimal for + non-partitioned tables as all commands must write WAL otherwise. diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 48f0c5c7..84edcac5 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -230,7 +230,9 @@ COPY { table_name [ ( COPY FREEZE on + a partitioned table. Note that all other sessions will immediately be able to see the data diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5b7eb4b9..533187a0 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3064,11 +3064,20 @@ CopyFrom(CopyState cstate) * go into pages containing tuples from any other transactions --- but this * must be the case if we have a new table or new relfilenode, so we need * no additional work to enforce that. + * + * We currently don't support this optimization if the COPY target is a + * partitioned table as we currently only lazily initialize partition + * information when routing the first tuple to the partition. We cannot + * know at this stage if we can perform this optimization. It should be + * possible to improve on this, but it does mean maintaining heap insert + * option flags per partition and setting them when we first open the + * partition. *---------- */ /* createSubid is creation check, newRelfilenodeSubid is truncation check */ - if (cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId) + if (cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + (cstate->rel->rd_createSubid != InvalidSubTransactionId || + cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) { hi_options |= HEAP_INSERT_SKIP_FSM; if (!XLogIsNeeded()) @@ -3085,6 +3094,22 @@ CopyFrom(CopyState cstate) */ if (cstate->freeze) { + /* + * We currently disallow COPY FREEZE on partitioned tables. The + * reason for this is that we've simply not yet opened the partitions + * to determine if the optimization can be applied to them. We could + * go and open them all here, but doing so may be quite a costly + * overhead for small copies. 
In any case, we may just end up routing + * tuples to a small number of partitions. It seems better just to + * raise an ERROR for partitioned tables. + */ + if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot perform FREEZE on a partitioned table"))); + } + if (!ThereAreNoPriorRegisteredSnapshots() || !ThereAreNoReadyPortals()) ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), From c2687280400bd3d646ccc43e5fd4bd33a9dd9cc1 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 23 Nov 2018 08:44:15 -0300 Subject: [PATCH 271/578] Don't allow partitioned indexes in pg_global tablespace Missing in dfa608141982. Author: David Rowley Discussion: https://postgr.es/m/CAKJS1f-M3NMTCpv=vDfkoqHbMPFf=3-Z1ud=+1DHH00tC+zLaQ@mail.gmail.com --- src/backend/commands/tablecmds.c | 6 ++++++ src/test/regress/input/tablespace.source | 1 + src/test/regress/output/tablespace.source | 2 ++ 3 files changed, 9 insertions(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 062a6439..0bc4e296 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -12610,6 +12610,12 @@ ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + /* Can't allow a non-shared relation in pg_global */ + if (newTableSpace == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); + /* * No work if no change in tablespace. */ diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 1454e433..e4e4cf0e 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -101,6 +101,7 @@ CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index a1a615de..8fa26db8 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -214,6 +214,8 @@ CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok From eab56b88c0a7264119e14f0ea99822b79bb846fc Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 12 Dec 2018 09:49:39 +0900 Subject: [PATCH 272/578] Tweak pg_partition_tree for undefined relations and unsupported relkinds This fixes a crash which happened when calling the function directly with a relation OID referring to a non-existing object, and changes the behavior so as NULL is returned for unsupported relkinds instead of generating an error. 
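With the lenient behavior, a query that applies the function to every row of
pg_class simply gets NULL columns back for relations that cannot be part of a
partition tree, so they are easy to filter out.  One possible formulation of
such a scan (a sketch only, not part of this patch or its tests):

    SELECT c.oid::regclass AS rel, p.*
      FROM pg_class c, LATERAL pg_partition_tree(c.oid) p
     WHERE p.relid IS NOT NULL;
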
This puts the new function in line with many other system functions, and eases actions like full scans of pg_class. Author: Michael Paquier Reviewed-by: Amit Langote, Stephen Frost Discussion: https://postgr.es/m/20181207010406.GO2407@paquier.xyz --- src/backend/utils/adt/partitionfuncs.c | 11 ++++++----- src/test/regress/expected/partition_info.out | 18 ++++++++++++++++-- src/test/regress/sql/partition_info.sql | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 8f9218ad..2c9fcd1f 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -23,6 +23,7 @@ #include "funcapi.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" /* @@ -42,16 +43,16 @@ pg_partition_tree(PG_FUNCTION_ARGS) FuncCallContext *funcctx; ListCell **next; - /* Only allow relation types that can appear in partition trees. */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(rootrelid))) + PG_RETURN_NULL(); + + /* Return NULL for relation types that cannot appear in partition trees */ if (relkind != RELKIND_RELATION && relkind != RELKIND_FOREIGN_TABLE && relkind != RELKIND_INDEX && relkind != RELKIND_PARTITIONED_TABLE && relkind != RELKIND_PARTITIONED_INDEX) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is not a table, a foreign table, or an index", - get_rel_name(rootrelid)))); + PG_RETURN_NULL(); /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 6b116125..202d8208 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -6,6 +6,12 @@ SELECT * FROM pg_partition_tree(NULL); -------+-------------+--------+------- (0 rows) +SELECT * FROM pg_partition_tree(0); + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); CREATE TABLE ptif_test0 PARTITION OF ptif_test @@ -107,8 +113,16 @@ DROP TABLE ptif_normal_table; CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); -ERROR: "ptif_test_view" is not a table, a foreign table, or an index + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + SELECT * FROM pg_partition_tree('ptif_test_matview'); -ERROR: "ptif_test_matview" is not a table, a foreign table, or an index + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 5a76f22b..9b55a7fe 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -2,6 +2,7 @@ -- Tests for pg_partition_tree -- SELECT * FROM pg_partition_tree(NULL); +SELECT * FROM pg_partition_tree(0); -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); From 4e7e2780297c3c933d2350fbf794d6836a2c4ec7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:46:08 +0800 Subject: [PATCH 273/578] Fix tablespace handling for partitioned tables. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/create_table.sgml | 8 +- src/backend/catalog/heap.c | 6 +- src/backend/commands/tablecmds.c | 152 ++++++++++++++-------- src/include/catalog/pg_class.h | 13 ++ src/test/regress/input/tablespace.source | 12 ++ src/test/regress/output/tablespace.source | 19 +++ 6 files changed, 148 insertions(+), 62 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 70d3dcfc..47f82c50 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1136,8 +1136,12 @@ of the tablespace in which the new table is to be created. If not specified, is consulted, or - if the table is temporary. - + if the table is temporary. For + partitioned tables, since no storage is required for the table itself, + the tablespace specified here only serves to mark the default tablespace + for any newly created partitions when no other tablespace is explicitly + specified. + diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index ff83af36..56e4d7f1 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -410,7 +410,6 @@ heap_create(const char *relname, case RELKIND_VIEW: case RELKIND_COMPOSITE_TYPE: case RELKIND_FOREIGN_TABLE: - case RELKIND_PARTITIONED_TABLE: create_storage = false; /* @@ -420,10 +419,11 @@ heap_create(const char *relname, reltablespace = InvalidOid; break; + case RELKIND_PARTITIONED_TABLE: case RELKIND_PARTITIONED_INDEX: /* - * Preserve tablespace so that it's used as tablespace for indexes - * on future partitions. + * For partitioned tables and indexes, preserve tablespace so that + * it's used as the tablespace for future partitions. */ create_storage = false; break; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0bc4e296..b24611be 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -340,7 +340,7 @@ struct DropRelationCallbackState static void truncate_check_rel(Relation rel); static List *MergeAttributes(List *schema, List *supers, char relpersistence, - bool is_partition, List **supOids, List **supconstr, + bool is_partition, List **supconstr, int *supOidCount); static bool MergeCheckConstraint(List *constraints, char *name, Node *expr); static void MergeAttributesIntoExisting(Relation child_rel, Relation parent_rel); @@ -495,7 +495,7 @@ static bool ATPrepChangePersistence(Relation rel, bool toLogged); static void ATPrepSetTableSpace(AlteredTableInfo *tab, Relation rel, char *tablespacename, LOCKMODE lockmode); static void ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode); -static void ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace); +static void ATExecSetTableSpaceNoStorage(Relation rel, Oid newTableSpace); static void ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, LOCKMODE lockmode); @@ -593,6 +593,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, static char *validnsps[] = HEAP_RELOPT_NAMESPACES; Oid ofTypeId; ObjectAddress address; + LOCKMODE parentLockmode; #ifdef _SHARDING_ bool has_extent = false; @@ -668,6 +669,46 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); + /* + * Determine the lockmode to use when scanning parents. A self-exclusive + * lock is needed here. 
+ * + * For regular inheritance, if two backends attempt to add children to the + * same parent simultaneously, and that parent has no pre-existing + * children, then both will attempt to update the parent's relhassubclass + * field, leading to a "tuple concurrently updated" error. Also, this + * interlocks against a concurrent ANALYZE on the parent table, which + * might otherwise be attempting to clear the parent's relhassubclass + * field, if its previous children were recently dropped. + * + * If the child table is a partition, then we instead grab an exclusive + * lock on the parent because its partition descriptor will be changed by + * addition of the new partition. + */ + parentLockmode = (stmt->partbound != NULL ? AccessExclusiveLock : + ShareUpdateExclusiveLock); + + /* Determine the list of OIDs of the parents. */ + inheritOids = NIL; + foreach(listptr, stmt->inhRelations) + { + RangeVar *rv = (RangeVar *) lfirst(listptr); + Oid parentOid; + + parentOid = RangeVarGetRelid(rv, parentLockmode, false); + + /* + * Reject duplications in the list of parents. + */ + if (list_member_oid(inheritOids, parentOid)) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" would be inherited from more than once", + get_rel_name(parentOid)))); + + inheritOids = lappend_oid(inheritOids, parentOid); + } + /* * Select tablespace to use. If not specified, use default tablespace * (which may in turn default to database's default). @@ -676,6 +717,25 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, { tablespaceId = get_tablespace_oid(stmt->tablespacename, false); } + else if (stmt->partbound) + { + HeapTuple tup; + + /* + * For partitions, when no other tablespace is specified, we default + * the tablespace to the parent partitioned table's. + */ + Assert(list_length(inheritOids) == 1); + tup = SearchSysCache1(RELOID, + DatumGetObjectId(linitial_oid(inheritOids))); + + tablespaceId = ((Form_pg_class) GETSTRUCT(tup))->reltablespace; + + if (!OidIsValid(tablespaceId)) + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); + + ReleaseSysCache(tup); + } else { tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); @@ -734,10 +794,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * modified by MergeAttributes.) */ stmt->tableElts = - MergeAttributes(stmt->tableElts, stmt->inhRelations, + MergeAttributes(stmt->tableElts, inheritOids, stmt->relation->relpersistence, stmt->partbound != NULL, - &inheritOids, &old_constraints, &parentOidCount); + &old_constraints, &parentOidCount); /* * Create a tuple descriptor from the relation schema. Note that this @@ -2206,12 +2266,11 @@ storage_name(char c) * Input arguments: * 'schema' is the column/attribute definition for the table. (It's a list * of ColumnDef's.) It is destructively changed. - * 'supers' is a list of names (as RangeVar nodes) of parent relations. + * 'supers' is a list of OIDs of parent relations, already locked by caller. * 'relpersistence' is a persistence type of the table. * 'is_partition' tells if the table is a partition * * Output arguments: - * 'supOids' receives a list of the OIDs of the parent relations. * 'supconstr' receives a list of constraints belonging to the parents, * updated as necessary to be valid for the child. * 'supOidCount' is set to the number of parents that have OID columns. 
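The user-visible effect of the DefineRelation and create_table.sgml changes
above is that a partition created without an explicit TABLESPACE clause now
defaults to its partitioned parent's tablespace instead of default_tablespace.
A minimal sketch of that behaviour (the tablespace name and location are
placeholders; the regression test added below exercises the same thing with
regress_tblspace):

    CREATE TABLESPACE some_tblspc LOCATION '/path/to/dir';  -- placeholder location
    CREATE TABLE parted (a int) PARTITION BY LIST (a) TABLESPACE some_tblspc;
    -- no TABLESPACE clause here, so the new partition lands in some_tblspc
    CREATE TABLE parted_1 PARTITION OF parted FOR VALUES IN (1);
    SELECT c.relname, t.spcname
      FROM pg_catalog.pg_class c
      LEFT JOIN pg_catalog.pg_tablespace t ON t.oid = c.reltablespace
     WHERE c.relname IN ('parted', 'parted_1');
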
@@ -2260,12 +2319,11 @@ storage_name(char c) */ static List * MergeAttributes(List *schema, List *supers, char relpersistence, - bool is_partition, List **supOids, List **supconstr, + bool is_partition, List **supconstr, int *supOidCount) {// #lizard forgives ListCell *entry; List *inhSchema = NIL; - List *parentOids = NIL; List *constraints = NIL; int parentsWithOids = 0; bool have_bogus_defaults = false; @@ -2372,31 +2430,15 @@ MergeAttributes(List *schema, List *supers, char relpersistence, child_attno = 0; foreach(entry, supers) { - RangeVar *parent = (RangeVar *) lfirst(entry); + Oid parent = lfirst_oid(entry); Relation relation; TupleDesc tupleDesc; TupleConstr *constr; AttrNumber *newattno; AttrNumber parent_attno; - /* - * A self-exclusive lock is needed here. If two backends attempt to - * add children to the same parent simultaneously, and that parent has - * no pre-existing children, then both will attempt to update the - * parent's relhassubclass field, leading to a "tuple concurrently - * updated" error. Also, this interlocks against a concurrent ANALYZE - * on the parent table, which might otherwise be attempting to clear - * the parent's relhassubclass field, if its previous children were - * recently dropped. - * - * If the child table is a partition, then we instead grab an - * exclusive lock on the parent because its partition descriptor will - * be changed by addition of the new partition. - */ - if (!is_partition) - relation = heap_openrv(parent, ShareUpdateExclusiveLock); - else - relation = heap_openrv(parent, AccessExclusiveLock); + /* caller already got lock */ + relation = heap_open(parent, NoLock); /* * We do not allow partitioned tables and partitions to participate in @@ -2407,12 +2449,12 @@ MergeAttributes(List *schema, List *supers, char relpersistence, ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from partitioned table \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); if (relation->rd_rel->relispartition && !is_partition) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from partition \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); if (relation->rd_rel->relkind != RELKIND_RELATION && relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE && @@ -2420,7 +2462,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence, ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("inherited relation \"%s\" is not a table or foreign table", - parent->relname))); + RelationGetRelationName(relation)))); /* Permanent rels cannot inherit from temporary ones */ if (relpersistence != RELPERSISTENCE_TEMP && relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP) @@ -2429,7 +2471,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence, errmsg(!is_partition ? "cannot inherit from temporary relation \"%s\"" : "cannot create a permanent relation as partition of temporary relation \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); /* If existing rel is temp, it must belong to this session */ if (relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP && @@ -2448,17 +2490,6 @@ MergeAttributes(List *schema, List *supers, char relpersistence, aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, RelationGetRelationName(relation)); - /* - * Reject duplications in the list of parents. 
- */ - if (list_member_oid(parentOids, RelationGetRelid(relation))) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_TABLE), - errmsg("relation \"%s\" would be inherited from more than once", - parent->relname))); - - parentOids = lappend_oid(parentOids, RelationGetRelid(relation)); - if (relation->rd_rel->relhasoids) parentsWithOids++; @@ -2926,7 +2957,6 @@ MergeAttributes(List *schema, List *supers, char relpersistence, } } - *supOids = parentOids; *supconstr = constraints; *supOidCount = parentsWithOids; return schema; @@ -5281,11 +5311,13 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, break; case AT_SetTableSpace: /* SET TABLESPACE */ /* - * Only do this for partitioned indexes, for which this is just - * a catalog change. Other relation types are handled by Phase 3. + * Only do this for partitioned tables and indexes, for which this + * is just a catalog change. Other relation types which have + * storage are handled by Phase 3. */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) - ATExecPartedIdxSetTableSpace(rel, tab->newTableSpace); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ATExecSetTableSpaceNoStorage(rel, tab->newTableSpace); break; case AT_SetRelOptions: /* SET (...) */ @@ -12596,19 +12628,26 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) } /* - * Special handling of ALTER TABLE SET TABLESPACE for partitioned indexes, - * which have no storage (so not handled in Phase 3 like other relation types) + * Special handling of ALTER TABLE SET TABLESPACE for relations with no + * storage that have an interest in preserving tablespace. + * + * Since these have no storage the tablespace can be updated with a simple + * metadata only operation to update the tablespace. */ static void -ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) +ATExecSetTableSpaceNoStorage(Relation rel, Oid newTableSpace) { HeapTuple tuple; Oid oldTableSpace; Relation pg_class; Form_pg_class rd_rel; - Oid indexOid = RelationGetRelid(rel); + Oid reloid = RelationGetRelid(rel); - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + /* + * Shouldn't be called on relations having storage; these are processed + * in phase 3. + */ + Assert(!RELKIND_CAN_HAVE_STORAGE(rel->rd_rel->relkind)); /* Can't allow a non-shared relation in pg_global */ if (newTableSpace == GLOBALTABLESPACE_OID) @@ -12623,24 +12662,23 @@ ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) if (newTableSpace == oldTableSpace || (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) { - InvokeObjectPostAlterHook(RelationRelationId, - indexOid, 0); + InvokeObjectPostAlterHook(RelationRelationId, reloid, 0); return; } /* Get a modifiable copy of the relation's pg_class row */ pg_class = heap_open(RelationRelationId, RowExclusiveLock); - tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(indexOid)); + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", indexOid); + elog(ERROR, "cache lookup failed for relation %u", reloid); rd_rel = (Form_pg_class) GETSTRUCT(tuple); /* update the pg_class row */ rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? 
InvalidOid : newTableSpace; CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); - InvokeObjectPostAlterHook(RelationRelationId, indexOid, 0); + InvokeObjectPostAlterHook(RelationRelationId, reloid, 0); heap_freetuple(tuple); diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index f69dd5e3..15929163 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -205,6 +205,19 @@ DESCR(""); */ #define REPLICA_IDENTITY_INDEX 'i' +/* + * Relation kinds that have physical storage. These relations normally have + * relfilenode set to non-zero, but it can also be zero if the relation is + * mapped. + */ +#define RELKIND_CAN_HAVE_STORAGE(relkind) \ + ((relkind) == RELKIND_RELATION || \ + (relkind) == RELKIND_INDEX || \ + (relkind) == RELKIND_SEQUENCE || \ + (relkind) == RELKIND_TOASTVALUE || \ + (relkind) == RELKIND_MATVIEW) + + #ifdef _MLS_ /* enum for relkindext column */ #define RELKIND_AUDIT_SYS_TABLE 'a' diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index e4e4cf0e..abad2716 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,6 +44,18 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- partitioned table +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); +ALTER TABLE testschema.part12 SET TABLESPACE pg_default; +CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); +-- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. +SELECT relname, spcname FROM pg_catalog.pg_class c + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' order by relname; +DROP TABLE testschema.part; + -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 8fa26db8..03383fd4 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,6 +61,25 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- partitioned table +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); +ALTER TABLE testschema.part12 SET TABLESPACE pg_default; +CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); +-- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
+SELECT relname, spcname FROM pg_catalog.pg_class c + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' order by relname; + relname | spcname +----------+------------------ + part | + part12 | + part12_1 | regress_tblspace + part12_2 | +(4 rows) + +DROP TABLE testschema.part; -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); From 0548b6623b865cac379d7f8ad30f328062dc1412 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:50:55 +0800 Subject: [PATCH 274/578] Include partitioned indexes to system view pg_indexes.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/system_views.sql | 2 +- src/test/regress/expected/rules.out | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 0e7d14ff..c5326deb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -162,7 +162,7 @@ CREATE VIEW pg_indexes AS JOIN pg_class I ON (I.oid = X.indexrelid) LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) LEFT JOIN pg_tablespace T ON (T.oid = I.reltablespace) - WHERE C.relkind IN ('r', 'm') AND I.relkind = 'i'; + WHERE C.relkind IN ('r', 'm', 'p') AND I.relkind IN ('i', 'I'); CREATE OR REPLACE VIEW pg_sequences AS SELECT diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f750332d..ba5666ef 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1414,7 +1414,7 @@ pg_indexes| SELECT n.nspname AS schemaname, JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace t ON ((t.oid = i.reltablespace))) - WHERE ((c.relkind = ANY (ARRAY['r'::"char", 'm'::"char"])) AND (i.relkind = 'i'::"char")); + WHERE ((c.relkind = ANY (ARRAY['r'::"char", 'm'::"char", 'p'::"char"])) AND (i.relkind = ANY (ARRAY['i'::"char", 'I'::"char"]))); pg_locks| SELECT l.locktype, l.database, l.relation, From af0fcaefa0f8b0bc253cf9e36e9e605e3c44577b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 21:21:09 +0800 Subject: [PATCH 275/578] Delay lock acquisition for partitions until we route a tuple to them.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/execPartition.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 60221c6b..dd60cbc8 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -73,9 +73,6 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * tuple routing for partitioned tables, encapsulates it in * PartitionTupleRouting, and returns it. * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. - * * While we allocate the arrays of pointers of ResultRelInfo and * TupleConversionMap for all partitions here, actual objects themselves are * lazily allocated for a given partition if a tuple is actually routed to it; @@ -100,7 +97,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) * Get the information about the partition tree after locking all the * partitions. 
*/ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); proute->partition_dispatch_info = RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, @@ -329,8 +325,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, /* * ExecInitPartitionInfo - * Initialize ResultRelInfo and other information for a partition if not - * already done + * Lock the partition and initialize ResultRelInfo. Also setup other + * information for the partition and store it in the next empty slot in + * the proute->partitions array. * * Returns the ResultRelInfo */ @@ -346,11 +343,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; MemoryContext oldContext; - /* - * We locked all the partitions in ExecSetupPartitionTupleRouting - * including the leaf partitions. - */ - partrel = heap_open(proute->partition_oids[partidx], NoLock); + partrel = table_open(dispatch->partdesc->oids[partidx], RowExclusiveLock); /* * Keep ResultRelInfo and other information for this partition in the From a376a42f535439761e491db12c8dbfff406c91d4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 10:40:31 +0800 Subject: [PATCH 276/578] pg_partition_ancestors. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 11 ++++ src/backend/utils/adt/partitionfuncs.c | 49 +++++++++++++++++ src/include/catalog/pg_proc.h | 2 + src/test/regress/expected/partition_info.out | 55 ++++++++++++++++++++ src/test/regress/sql/partition_info.sql | 11 ++++ 5 files changed, 128 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index cb4821d5..8cefea34 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19847,6 +19847,17 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); their partitions, and so on. + + + pg_partition_ancestors + pg_partition_ancestors(regclass) + + setof regclass + + List the ancestor relations of the given partition, + including the partition itself. + + diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 2c9fcd1f..1020c2c3 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -153,3 +153,52 @@ pg_partition_tree(PG_FUNCTION_ARGS) /* done when there are no more elements left */ SRF_RETURN_DONE(funcctx); } + +/* + * pg_partition_ancestors + * + * Produces a view with one row per ancestor of the given partition, + * including the input relation itself. 
+ */ +Datum +pg_partition_ancestors(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + FuncCallContext *funcctx; + ListCell **next; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + List *ancestors; + + funcctx = SRF_FIRSTCALL_INIT(); + + if (!check_rel_can_be_partition(relid)) + SRF_RETURN_DONE(funcctx); + + oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + ancestors = get_partition_ancestors(relid); + ancestors = lcons_oid(relid, ancestors); + + next = (ListCell **) palloc(sizeof(ListCell *)); + *next = list_head(ancestors); + funcctx->user_fctx = (void *) next; + + MemoryContextSwitchTo(oldcxt); + } + + funcctx = SRF_PERCALL_SETUP(); + next = (ListCell **) funcctx->user_fctx; + + if (*next != NULL) + { + Oid relid = lfirst_oid(*next); + + *next = lnext(*next); + SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(relid)); + } + + SRF_RETURN_DONE(funcctx); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index bd1481cf..9af8050d 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5713,6 +5713,8 @@ DESCR("hash partition CHECK constraint"); /* information about a partition tree */ DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); DESCR("view partition tree tables"); +DATA(insert OID = 4689 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); +DESCR("view ancestors of the partition"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 202d8208..5916eca2 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -12,6 +12,16 @@ SELECT * FROM pg_partition_tree(0); | | | (1 row) +SELECT * FROM pg_partition_ancestors(NULL); + relid +------- +(0 rows) + +SELECT * FROM pg_partition_ancestors(0); + relid +------- +(0 rows) + -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); CREATE TABLE ptif_test0 PARTITION OF ptif_test @@ -66,6 +76,21 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List all ancestors of root and leaf tables +SELECT * FROM pg_partition_ancestors('ptif_test01'); + relid +------------- + ptif_test01 + ptif_test0 + ptif_test +(3 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test'); + relid +----------- + ptif_test +(1 row) + -- List all indexes members of the tree SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test_index'); @@ -98,6 +123,21 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List all ancestors of root and leaf 
indexes +SELECT * FROM pg_partition_ancestors('ptif_test01_index'); + relid +------------------- + ptif_test01_index + ptif_test0_index + ptif_test_index +(3 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test_index'); + relid +----------------- + ptif_test_index +(1 row) + DROP TABLE ptif_test; -- A table not part of a partition tree works is the only member listed. CREATE TABLE ptif_normal_table(a int); @@ -108,6 +148,11 @@ SELECT relid, parentrelid, level, isleaf ptif_normal_table | | 0 | t (1 row) +SELECT * FROM pg_partition_ancestors('ptif_normal_table'); + relid +------- +(0 rows) + DROP TABLE ptif_normal_table; -- Views and materialized viewS cannot be part of a partition tree. CREATE VIEW ptif_test_view AS SELECT 1; @@ -124,5 +169,15 @@ SELECT * FROM pg_partition_tree('ptif_test_matview'); | | | (1 row) +SELECT * FROM pg_partition_ancestors('ptif_test_view'); + relid +------- +(0 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test_matview'); + relid +------- +(0 rows) + DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 9b55a7fe..6e2ec675 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -3,6 +3,8 @@ -- SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); +SELECT * FROM pg_partition_ancestors(NULL); +SELECT * FROM pg_partition_ancestors(0); -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); @@ -39,6 +41,9 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List all ancestors of root and leaf tables +SELECT * FROM pg_partition_ancestors('ptif_test01'); +SELECT * FROM pg_partition_ancestors('ptif_test'); -- List all indexes members of the tree SELECT relid, parentrelid, level, isleaf @@ -51,6 +56,9 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List all ancestors of root and leaf indexes +SELECT * FROM pg_partition_ancestors('ptif_test01_index'); +SELECT * FROM pg_partition_ancestors('ptif_test_index'); DROP TABLE ptif_test; @@ -58,6 +66,7 @@ DROP TABLE ptif_test; CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); +SELECT * FROM pg_partition_ancestors('ptif_normal_table'); DROP TABLE ptif_normal_table; -- Views and materialized viewS cannot be part of a partition tree. 
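-- A note on output order, useful when composing the function into larger
-- queries: the partition itself is emitted first, then each ancestor up to the
-- top-most parent (the ordering the expected output above relies on).  That
-- lets the root of a tree be picked out directly, e.g. with a sketch such as
--   SELECT relid
--     FROM pg_partition_ancestors('some_partition') WITH ORDINALITY AS a(relid, ord)
--    ORDER BY ord DESC LIMIT 1;
-- where 'some_partition' is a placeholder; a later patch adds pg_partition_root,
-- which returns the same relation directly.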
@@ -65,5 +74,7 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); SELECT * FROM pg_partition_tree('ptif_test_matview'); +SELECT * FROM pg_partition_ancestors('ptif_test_view'); +SELECT * FROM pg_partition_ancestors('ptif_test_matview'); DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; From 2e3345d3059a8936cb688e1f72b0aa9c52959343 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 11:00:50 +0800 Subject: [PATCH 277/578] pg_upgrade: Ignore TOAST for partitioned tables --- src/bin/pg_dump/pg_dump.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 10a8ce5f..97384c01 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4054,14 +4054,20 @@ binary_upgrade_set_type_oids_by_rel_oid(Archive *fout, Oid pg_type_oid; bool toast_set = false; - /* we only support old >= 8.3 for binary upgrades */ + /* + * We only support old >= 8.3 for binary upgrades. + * + * We purposefully ignore toast OIDs for partitioned tables; the reason is + * that versions 10 and 11 have them, but 12 does not, so emitting them + * causes the upgrade to fail. + */ appendPQExpBuffer(upgrade_query, "SELECT c.reltype AS crel, t.reltype AS trel " "FROM pg_catalog.pg_class c " "LEFT JOIN pg_catalog.pg_class t ON " - " (c.reltoastrelid = t.oid) " + " (c.reltoastrelid = t.oid AND c.relkind <> '%c') " "WHERE c.oid = '%u'::pg_catalog.oid;", - pg_rel_oid); + RELKIND_PARTITIONED_TABLE, pg_rel_oid); upgrade_res = ExecuteSqlQueryForSingleRow(fout, upgrade_query->data); @@ -5789,6 +5795,10 @@ getTables(Archive *fout, int *numTables) * information about each table, basically just enough to decide if it is * interesting. We must fetch all tables in this phase because otherwise * we cannot correctly identify inherited columns, owned sequences, etc. + * + * We purposefully ignore toast OIDs for partitioned tables; the reason is + * that versions 10 and 11 have them, but 12 does not, so emitting them + * causes the upgrade to fail. */ if (fout->remoteVersion >= 90600) @@ -5902,7 +5912,7 @@ getTables(Archive *fout, int *numTables) "d.classid = c.tableoid AND d.objid = c.oid AND " "d.objsubid = 0 AND " "d.refclassid = c.tableoid AND d.deptype IN ('a', 'i')) " - "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) " + "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid AND c.relkind <> '%c') " "LEFT JOIN pg_init_privs pip ON " "(c.oid = pip.objoid " "AND pip.classoid = 'pg_class'::regclass " @@ -5929,6 +5939,7 @@ getTables(Archive *fout, int *numTables) ispartition, partbound, RELKIND_SEQUENCE, + RELKIND_PARTITIONED_TABLE, RELKIND_RELATION, RELKIND_SEQUENCE, RELKIND_VIEW, RELKIND_COMPOSITE_TYPE, RELKIND_MATVIEW, RELKIND_FOREIGN_TABLE, From 1740d570fac7cf324d7ff7f0253afd227c90c03f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 11:43:19 +0800 Subject: [PATCH 278/578] Add pg_partition_root to display top-most parent of a partition tree. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 11 +++ src/backend/utils/adt/partitionfuncs.c | 75 +++++++++++++++++--- src/include/catalog/pg_proc.h | 7 +- src/test/regress/expected/partition_info.out | 58 +++++++++++++++ src/test/regress/sql/partition_info.sql | 13 ++++ 5 files changed, 153 insertions(+), 11 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 8cefea34..08a9ec2a 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19846,6 +19846,17 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); 1 for its partitions, 2 for their partitions, and so on. + + + pg_partition_root + pg_partition_root(regclass) + + regclass + + Return the top-most parent of a partition tree to which the given + relation belongs. + + diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 1020c2c3..13ddec59 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -25,6 +25,33 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +/* +* Checks if a given relation can be part of a partition tree. Returns +* false if the relation cannot be processed, in which case it is up to +* the caller to decide what to do, by either raising an error or doing +* something else. +*/ +static bool +check_rel_can_be_partition(Oid relid) +{ +char relkind; + +/* Check if relation exists */ +if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) + return false; + +relkind = get_rel_relkind(relid); + +/* Only allow relation types that can appear in partition trees. */ +if (relkind != RELKIND_RELATION && + relkind != RELKIND_FOREIGN_TABLE && + relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + return false; + +return true; +} /* * pg_partition_tree @@ -39,19 +66,10 @@ pg_partition_tree(PG_FUNCTION_ARGS) { #define PG_PARTITION_TREE_COLS 4 Oid rootrelid = PG_GETARG_OID(0); - char relkind = get_rel_relkind(rootrelid); FuncCallContext *funcctx; ListCell **next; - if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(rootrelid))) - PG_RETURN_NULL(); - - /* Return NULL for relation types that cannot appear in partition trees */ - if (relkind != RELKIND_RELATION && - relkind != RELKIND_FOREIGN_TABLE && - relkind != RELKIND_INDEX && - relkind != RELKIND_PARTITIONED_TABLE && - relkind != RELKIND_PARTITIONED_INDEX) + if (!check_rel_can_be_partition(rootrelid)) PG_RETURN_NULL(); /* stuff done only on the first call of the function */ @@ -154,6 +172,43 @@ pg_partition_tree(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * pg_partition_root + * + * Returns the top-most parent of the partition tree to which a given + * relation belongs, or NULL if it's not (or cannot be) part of any + * partition tree. + */ +Datum +pg_partition_root(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + Oid rootrelid; + List *ancestors; + + if (!check_rel_can_be_partition(relid)) + PG_RETURN_NULL(); + + /* + * If the relation is not a partition (it may be the partition parent), + * return itself as a result. + */ + if (!get_rel_relispartition(relid)) + PG_RETURN_OID(relid); + + /* Fetch the top-most parent */ + ancestors = get_partition_ancestors(relid); + rootrelid = llast_oid(ancestors); + list_free(ancestors); + + /* + * "rootrelid" must contain a valid OID, given that the input relation is + * a valid partition tree member as checked above. 
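+ *
+ * For illustration (not taken from this patch): with a leaf partition p11
+ * attached under p1, itself under root p, get_partition_ancestors(p11)
+ * yields the list (p1, p), so llast_oid() returns p.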
+ */ + Assert(OidIsValid(rootrelid)); + PG_RETURN_OID(rootrelid); +} + /* * pg_partition_ancestors * diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 9af8050d..76881d68 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5713,7 +5713,12 @@ DESCR("hash partition CHECK constraint"); /* information about a partition tree */ DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); DESCR("view partition tree tables"); -DATA(insert OID = 4689 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); + +/* function to get the top-most partition root parent */ +DATA(insert OID = 4689 ( pg_partition_root PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ pg_partition_root _null_ _null_ _null_ )); +DESCR("get top-most partition root parent"); + +DATA(insert OID = 4690 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); DESCR("view ancestors of the partition"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 5916eca2..00a0ed80 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -12,6 +12,18 @@ SELECT * FROM pg_partition_tree(0); | | | (1 row) +SELECT pg_partition_root(NULL); + pg_partition_root +------------------- + +(1 row) + +SELECT pg_partition_root(0); + pg_partition_root +------------------- + +(1 row) + SELECT * FROM pg_partition_ancestors(NULL); relid ------- @@ -76,6 +88,20 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List all members using pg_partition_root with leaf table reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01')) p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test | | 0 | f + ptif_test0 | ptif_test | 1 | f + ptif_test1 | ptif_test | 1 | f + ptif_test2 | ptif_test | 1 | t + ptif_test01 | ptif_test0 | 2 | t + ptif_test11 | ptif_test1 | 2 | t +(6 rows) + -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); relid @@ -123,6 +149,20 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List all members using pg_partition_root with leaf index reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test_index | | 0 | f + ptif_test0_index | ptif_test_index | 1 | f + ptif_test1_index | ptif_test_index | 1 | f + 
ptif_test2_index | ptif_test_index | 1 | t + ptif_test01_index | ptif_test0_index | 2 | t + ptif_test11_index | ptif_test1_index | 2 | t +(6 rows) + -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); relid @@ -148,6 +188,12 @@ SELECT relid, parentrelid, level, isleaf ptif_normal_table | | 0 | t (1 row) +SELECT pg_partition_root('ptif_normal_table'); + pg_partition_root +------------------- + ptif_normal_table +(1 row) + SELECT * FROM pg_partition_ancestors('ptif_normal_table'); relid ------- @@ -169,6 +215,18 @@ SELECT * FROM pg_partition_tree('ptif_test_matview'); | | | (1 row) +SELECT pg_partition_root('ptif_test_view'); + pg_partition_root +------------------- + +(1 row) + +SELECT pg_partition_root('ptif_test_matview'); + pg_partition_root +------------------- + +(1 row) + SELECT * FROM pg_partition_ancestors('ptif_test_view'); relid ------- diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 6e2ec675..f49688eb 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -3,6 +3,8 @@ -- SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); +SELECT pg_partition_root(NULL); +SELECT pg_partition_root(0); SELECT * FROM pg_partition_ancestors(NULL); SELECT * FROM pg_partition_ancestors(0); @@ -41,6 +43,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List all members using pg_partition_root with leaf table reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01')) p + JOIN pg_class c ON (p.relid = c.oid); -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); SELECT * FROM pg_partition_ancestors('ptif_test'); @@ -56,6 +62,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List all members using pg_partition_root with leaf index reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p + JOIN pg_class c ON (p.relid = c.oid); -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); SELECT * FROM pg_partition_ancestors('ptif_test_index'); @@ -66,6 +76,7 @@ DROP TABLE ptif_test; CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); +SELECT pg_partition_root('ptif_normal_table'); SELECT * FROM pg_partition_ancestors('ptif_normal_table'); DROP TABLE ptif_normal_table; @@ -74,6 +85,8 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); SELECT * FROM pg_partition_tree('ptif_test_matview'); +SELECT pg_partition_root('ptif_test_view'); +SELECT pg_partition_root('ptif_test_matview'); SELECT * FROM pg_partition_ancestors('ptif_test_view'); SELECT * FROM pg_partition_ancestors('ptif_test_matview'); DROP VIEW ptif_test_view; From e98d386320bdbf17cffb0249d8e79bee099b6fad Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 13:09:41 +0800 Subject: [PATCH 279/578] Fix crash with pg_partition_root.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/utils/adt/partitionfuncs.c | 43 
+++---- src/test/regress/expected/partition_info.out | 113 +++++++++++++++---- src/test/regress/sql/partition_info.sql | 29 ++++- 3 files changed, 142 insertions(+), 43 deletions(-) diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 13ddec59..87f1cced 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -34,23 +34,23 @@ static bool check_rel_can_be_partition(Oid relid) { -char relkind; + char relkind; + bool relispartition; -/* Check if relation exists */ -if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) - return false; + /* Check if relation exists */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) + return false; -relkind = get_rel_relkind(relid); + relkind = get_rel_relkind(relid); + relispartition = get_rel_relispartition(relid); -/* Only allow relation types that can appear in partition trees. */ -if (relkind != RELKIND_RELATION && - relkind != RELKIND_FOREIGN_TABLE && - relkind != RELKIND_INDEX && - relkind != RELKIND_PARTITIONED_TABLE && - relkind != RELKIND_PARTITIONED_INDEX) - return false; + /* Only allow relation types that can appear in partition trees. */ + if (!relispartition && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + return false; -return true; + return true; } /* @@ -69,9 +69,6 @@ pg_partition_tree(PG_FUNCTION_ARGS) FuncCallContext *funcctx; ListCell **next; - if (!check_rel_can_be_partition(rootrelid)) - PG_RETURN_NULL(); - /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) { @@ -82,6 +79,9 @@ pg_partition_tree(PG_FUNCTION_ARGS) /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); + if (!check_rel_can_be_partition(rootrelid)) + SRF_RETURN_DONE(funcctx); + /* switch to memory context appropriate for multiple function calls */ oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); @@ -189,15 +189,16 @@ pg_partition_root(PG_FUNCTION_ARGS) if (!check_rel_can_be_partition(relid)) PG_RETURN_NULL(); + /* fetch the list of ancestors */ + ancestors = get_partition_ancestors(relid); + /* - * If the relation is not a partition (it may be the partition parent), - * return itself as a result. + * If the input relation is already the top-most parent, just return + * itself. */ - if (!get_rel_relispartition(relid)) + if (ancestors == NIL) PG_RETURN_OID(relid); - /* Fetch the top-most parent */ - ancestors = get_partition_ancestors(relid); rootrelid = llast_oid(ancestors); list_free(ancestors); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 00a0ed80..c26d02a5 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -8,9 +8,8 @@ SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT pg_partition_root(NULL); pg_partition_root @@ -43,7 +42,35 @@ CREATE TABLE ptif_test1 PARTITION OF ptif_test FOR VALUES FROM (0) TO (100) PARTITION BY list (b); CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); CREATE TABLE ptif_test2 PARTITION OF ptif_test - FOR VALUES FROM (100) TO (maxvalue); + FOR VALUES FROM (100) TO (200); +-- This partitioned table should remain with no partitions. 
+CREATE TABLE ptif_test3 PARTITION OF ptif_test + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); +-- Test pg_partition_root for tables +SELECT pg_partition_root('ptif_test'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test0'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test01'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test3'); + pg_partition_root +------------------- + ptif_test +(1 row) + -- Test index partition tree CREATE INDEX ptif_test_index ON ONLY ptif_test (a); CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); @@ -56,6 +83,33 @@ CREATE INDEX ptif_test11_index ON ptif_test11 (a); ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; CREATE INDEX ptif_test2_index ON ptif_test2 (a); ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +CREATE INDEX ptif_test3_index ON ptif_test3 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; +-- Test pg_partition_root for indexes +SELECT pg_partition_root('ptif_test_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test0_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test01_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test3_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + -- List all tables members of the tree SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test'); @@ -65,9 +119,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0 | ptif_test | 1 | f ptif_test1 | ptif_test | 1 | f ptif_test2 | ptif_test | 1 | t + ptif_test3 | ptif_test | 1 | f ptif_test01 | ptif_test0 | 2 | t ptif_test11 | ptif_test1 | 2 | t -(6 rows) +(7 rows) -- List tables from an intermediate level SELECT relid, parentrelid, level, isleaf @@ -88,6 +143,15 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List from partitioned table with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +------------+-------------+-------+-------- + ptif_test3 | ptif_test | 0 | f +(1 row) + -- List all members using pg_partition_root with leaf table reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01')) p @@ -98,9 +162,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0 | ptif_test | 1 | f ptif_test1 | ptif_test | 1 | f ptif_test2 | ptif_test | 1 | t + ptif_test3 | ptif_test | 1 | f ptif_test01 | ptif_test0 | 2 | t ptif_test11 | ptif_test1 | 2 | t -(6 rows) +(7 rows) -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); @@ -126,9 +191,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0_index | ptif_test_index | 1 | f ptif_test1_index | ptif_test_index | 1 | f ptif_test2_index | ptif_test_index | 1 | t + ptif_test3_index | ptif_test_index | 1 | f ptif_test01_index | ptif_test0_index | 2 | t ptif_test11_index | ptif_test1_index | 2 | t -(6 rows) +(7 rows) -- List indexes from an intermediate level SELECT relid, parentrelid, level, isleaf @@ -149,6 +215,15 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List from 
partitioned index with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +------------------+-----------------+-------+-------- + ptif_test3_index | ptif_test_index | 0 | f +(1 row) + -- List all members using pg_partition_root with leaf index reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p @@ -159,9 +234,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0_index | ptif_test_index | 1 | f ptif_test1_index | ptif_test_index | 1 | f ptif_test2_index | ptif_test_index | 1 | t + ptif_test3_index | ptif_test_index | 1 | f ptif_test01_index | ptif_test0_index | 2 | t ptif_test11_index | ptif_test1_index | 2 | t -(6 rows) +(7 rows) -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); @@ -179,19 +255,18 @@ SELECT * FROM pg_partition_ancestors('ptif_test_index'); (1 row) DROP TABLE ptif_test; --- A table not part of a partition tree works is the only member listed. +-- A table not part of a partition tree works is not listed. CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); - relid | parentrelid | level | isleaf --------------------+-------------+-------+-------- - ptif_normal_table | | 0 | t -(1 row) + relid | parentrelid | level | isleaf +-------+-------------+-------+-------- +(0 rows) SELECT pg_partition_root('ptif_normal_table'); pg_partition_root ------------------- - ptif_normal_table + (1 row) SELECT * FROM pg_partition_ancestors('ptif_normal_table'); @@ -205,15 +280,13 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT * FROM pg_partition_tree('ptif_test_matview'); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT pg_partition_root('ptif_test_view'); pg_partition_root diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index f49688eb..afa16c07 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -17,7 +17,16 @@ CREATE TABLE ptif_test1 PARTITION OF ptif_test FOR VALUES FROM (0) TO (100) PARTITION BY list (b); CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); CREATE TABLE ptif_test2 PARTITION OF ptif_test - FOR VALUES FROM (100) TO (maxvalue); + FOR VALUES FROM (100) TO (200); +-- This partitioned table should remain with no partitions. 
+CREATE TABLE ptif_test3 PARTITION OF ptif_test + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); + +-- Test pg_partition_root for tables +SELECT pg_partition_root('ptif_test'); +SELECT pg_partition_root('ptif_test0'); +SELECT pg_partition_root('ptif_test01'); +SELECT pg_partition_root('ptif_test3'); -- Test index partition tree CREATE INDEX ptif_test_index ON ONLY ptif_test (a); @@ -31,6 +40,14 @@ CREATE INDEX ptif_test11_index ON ptif_test11 (a); ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; CREATE INDEX ptif_test2_index ON ptif_test2 (a); ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +CREATE INDEX ptif_test3_index ON ptif_test3 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; + +-- Test pg_partition_root for indexes +SELECT pg_partition_root('ptif_test_index'); +SELECT pg_partition_root('ptif_test0_index'); +SELECT pg_partition_root('ptif_test01_index'); +SELECT pg_partition_root('ptif_test3_index'); -- List all tables members of the tree SELECT relid, parentrelid, level, isleaf @@ -43,6 +60,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List from partitioned table with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3') p + JOIN pg_class c ON (p.relid = c.oid); -- List all members using pg_partition_root with leaf table reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01')) p @@ -62,6 +83,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List from partitioned index with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3_index') p + JOIN pg_class c ON (p.relid = c.oid); -- List all members using pg_partition_root with leaf index reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p @@ -72,7 +97,7 @@ SELECT * FROM pg_partition_ancestors('ptif_test_index'); DROP TABLE ptif_test; --- A table not part of a partition tree works is the only member listed. +-- A table not part of a partition tree works is not listed. CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); From 31f9b1f23520319f12e2a075ef0608854ba45cca Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 19:27:24 +0800 Subject: [PATCH 280/578] psql \dP: list partitioned tables and indexes. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/psql-ref.sgml | 33 +++++ src/bin/psql/command.c | 17 +++ src/bin/psql/describe.c | 214 +++++++++++++++++++++++++++++ src/bin/psql/describe.h | 11 +- src/bin/psql/help.c | 3 +- src/bin/psql/tab-complete.c | 41 +++++- src/test/regress/expected/psql.out | 131 ++++++++++++++++++ src/test/regress/sql/psql.sql | 69 ++++++++++ 8 files changed, 513 insertions(+), 6 deletions(-) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index c592edac..db3109f7 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1609,6 +1609,39 @@ testdb=> + + + \dP[itn+] [ pattern ] + + + Lists partitioned relations. + If pattern + is specified, only entries whose name matches the pattern are listed. 
+ The modifiers t (tables) and i + (indexes) can be appended to the command, filtering the kind of + relations to list. By default, partitioned tables and indexes are + listed. + + + + If the modifier n (nested) is used, + or a pattern is specified, then non-root partitioned tables are + included, and a column is shown displaying the parent of each + partitioned relation. + + + + If + is appended to the command, the sum of sizes of + table's partitions (including that of their indexes) is also displayed, + along with the associated description. + If n is combined with +, two + sizes are shown: one including the total size of directly-attached + leaf partitions, and another showing the total size of all partitions, + including indirectly attached sub-partitions. + + + + \drds [ role-pattern [ database-pattern ] ] diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 49813637..30ee6793 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -797,6 +797,23 @@ exec_command_d(PsqlScanState scan_state, bool active_branch, const char *cmd) case 'p': success = permissionsList(pattern); break; + case 'P': + { + switch (cmd[2]) + { + case '\0': + case '+': + case 't': + case 'i': + case 'n': + success = listPartitionedTables(&cmd[2], pattern, show_verbose); + break; + default: + status = PSQL_CMD_UNKNOWN; + break; + } + } + break; case 'T': success = describeTypes(pattern, show_verbose, show_system); break; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 1c671aca..b7023ae5 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3771,6 +3771,220 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys return true; } +/* + * \dP + * Takes an optional regexp to select particular relations + * + * As with \d, you can specify the kinds of relations you want: + * + * t for tables + * i for indexes + * + * And there's additional flags: + * + * n to list non-leaf partitioned tables + * + * and you can mix and match these in any order. + */ +bool +listPartitionedTables(const char *reltypes, const char *pattern, bool verbose) +{ + bool showTables = strchr(reltypes, 't') != NULL; + bool showIndexes = strchr(reltypes, 'i') != NULL; + bool showNested = strchr(reltypes, 'n') != NULL; + PQExpBufferData buf; + PQExpBufferData title; + PGresult *res; + printQueryOpt myopt = pset.popt; + bool translate_columns[] = {false, false, false, false, false, false, false, false, false}; + const char *tabletitle; + bool mixed_output = false; + + /* + * Note: Declarative table partitioning is only supported as of Pg 10.0. 
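+ * On an older server the command just reports an error and runs no query.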
+ */ + if (pset.sversion < 100000) + { + char sverbuf[32]; + + pg_log_error("The server (version %s) does not support declarative table partitioning.", + formatPGVersionNumber(pset.sversion, false, + sverbuf, sizeof(sverbuf))); + return true; + } + + /* If no relation kind was selected, show them all */ + if (!showTables && !showIndexes) + showTables = showIndexes = true; + + if (showIndexes && !showTables) + tabletitle = _("List of partitioned indexes"); /* \dPi */ + else if (showTables && !showIndexes) + tabletitle = _("List of partitioned tables"); /* \dPt */ + else + { + /* show all kinds */ + tabletitle = _("List of partitioned relations"); + mixed_output = true; + } + + initPQExpBuffer(&buf); + + printfPQExpBuffer(&buf, + "SELECT n.nspname as \"%s\",\n" + " c.relname as \"%s\",\n" + " pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"", + gettext_noop("Schema"), + gettext_noop("Name"), + gettext_noop("Owner")); + + if (mixed_output) + { + appendPQExpBuffer(&buf, + ",\n CASE c.relkind" + " WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'" + " WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'" + " END as \"%s\"", + gettext_noop("partitioned table"), + gettext_noop("partitioned index"), + gettext_noop("Type")); + + translate_columns[3] = true; + } + + if (showNested || pattern) + appendPQExpBuffer(&buf, + ",\n c3.oid::regclass as \"%s\"", + gettext_noop("Parent name")); + + if (showIndexes) + appendPQExpBuffer(&buf, + ",\n c2.oid::regclass as \"%s\"", + gettext_noop("On table")); + + if (verbose) + { + if (showNested) + { + appendPQExpBuffer(&buf, + ",\n s.dps as \"%s\"", + gettext_noop("Leaf partition size")); + appendPQExpBuffer(&buf, + ",\n s.tps as \"%s\"", + gettext_noop("Total size")); + } + else + /* Sizes of all partitions are considered in this case. 
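Only the single "Total size" column is shown, summing pg_catalog.pg_table_size() across the partitions of the tree.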
*/ + appendPQExpBuffer(&buf, + ",\n s.tps as \"%s\"", + gettext_noop("Total size")); + + appendPQExpBuffer(&buf, + ",\n pg_catalog.obj_description(c.oid, 'pg_class') as \"%s\"", + gettext_noop("Description")); + } + + appendPQExpBufferStr(&buf, + "\nFROM pg_catalog.pg_class c" + "\n LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace"); + + if (showIndexes) + appendPQExpBufferStr(&buf, + "\n LEFT JOIN pg_catalog.pg_index i ON i.indexrelid = c.oid" + "\n LEFT JOIN pg_catalog.pg_class c2 ON i.indrelid = c2.oid"); + + if (showNested || pattern) + appendPQExpBufferStr(&buf, + "\n LEFT JOIN pg_catalog.pg_inherits inh ON c.oid = inh.inhrelid" + "\n LEFT JOIN pg_catalog.pg_class c3 ON c3.oid = inh.inhparent"); + + if (verbose) + { + if (pset.sversion < 120000) + { + appendPQExpBuffer(&buf, + ",\n LATERAL (WITH RECURSIVE d\n" + " AS (SELECT inhrelid AS oid, 1 AS level\n" + " FROM pg_catalog.pg_inherits\n" + " WHERE inhparent = c.oid\n" + " UNION ALL\n" + " SELECT inhrelid, level + 1\n" + " FROM pg_catalog.pg_inherits i\n" + " JOIN d ON i.inhparent = d.oid)\n" + " SELECT pg_catalog.pg_size_pretty(sum(pg_catalog.pg_table_size(" + "d.oid))) AS tps,\n" + " pg_catalog.pg_size_pretty(sum(" + "\n CASE WHEN d.level = 1" + " THEN pg_catalog.pg_table_size(d.oid) ELSE 0 END)) AS dps\n" + " FROM d) s"); + } + else + { + /* PostgreSQL 12 has pg_partition_tree function */ + appendPQExpBuffer(&buf, + ",\n LATERAL (SELECT pg_catalog.pg_size_pretty(sum(" + "\n CASE WHEN ppt.isleaf AND ppt.level = 1" + "\n THEN pg_catalog.pg_table_size(ppt.relid)" + " ELSE 0 END)) AS dps" + ",\n pg_catalog.pg_size_pretty(sum(" + "pg_catalog.pg_table_size(ppt.relid))) AS tps" + "\n FROM pg_catalog.pg_partition_tree(c.oid) ppt) s"); + } + } + + appendPQExpBufferStr(&buf, "\nWHERE c.relkind IN ("); + if (showTables) + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_PARTITIONED_TABLE) ","); + if (showIndexes) + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_PARTITIONED_INDEX) ","); + appendPQExpBufferStr(&buf, "''"); /* dummy */ + appendPQExpBufferStr(&buf, ")\n"); + + appendPQExpBufferStr(&buf, !showNested && !pattern ? + " AND NOT c.relispartition\n" : ""); + + if (!pattern) + appendPQExpBufferStr(&buf, " AND n.nspname <> 'pg_catalog'\n" + " AND n.nspname <> 'information_schema'\n"); + + /* + * TOAST objects are suppressed unconditionally. Since we don't provide + * any way to select RELKIND_TOASTVALUE above, we would never show toast + * tables in any case; it seems a bit confusing to allow their indexes to + * be shown. Use plain \d if you really need to look at a TOAST + * table/index. + */ + appendPQExpBufferStr(&buf, " AND n.nspname !~ '^pg_toast'\n"); + + processSQLNamePattern(pset.db, &buf, pattern, true, false, + "n.nspname", "c.relname", NULL, + "pg_catalog.pg_table_is_visible(c.oid)"); + + appendPQExpBuffer(&buf, "ORDER BY \"Schema\", %s%s\"Name\";", + mixed_output ? "\"Type\" DESC, " : "", + showNested || pattern ? 
"\"Parent name\" NULLS FIRST, " : ""); + + res = PSQLexec(buf.data); + termPQExpBuffer(&buf); + if (!res) + return false; + + initPQExpBuffer(&title); + appendPQExpBuffer(&title, "%s", tabletitle); + + myopt.nullPrint = NULL; + myopt.title = title.data; + myopt.translate_header = true; + myopt.translate_columns = translate_columns; + myopt.n_translate_columns = lengthof(translate_columns); + + printQuery(res, &myopt, pset.queryFout, false, pset.logfile); + + termPQExpBuffer(&title); + + PQclear(res); + return true; +} /* * \dL diff --git a/src/bin/psql/describe.h b/src/bin/psql/describe.h index da6046c9..2224397f 100644 --- a/src/bin/psql/describe.h +++ b/src/bin/psql/describe.h @@ -63,6 +63,9 @@ extern bool listAllDbs(const char *pattern, bool verbose); /* \dt, \di, \ds, \dS, etc. */ extern bool listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSystem); +/* \dP */ +extern bool listPartitionedTables(const char *reltypes, const char *pattern, bool verbose); + /* \dD */ extern bool listDomains(const char *pattern, bool verbose, bool showSystem); @@ -103,12 +106,12 @@ extern bool listExtensionContents(const char *pattern); extern bool listEventTriggers(const char *pattern, bool verbose); /* \dRp */ -bool listPublications(const char *pattern); +bool listPublications(const char *pattern); /* \dRp+ */ -bool describePublications(const char *pattern); +bool describePublications(const char *pattern); /* \dRs */ -bool describeSubscriptions(const char *pattern, bool verbose); +bool describeSubscriptions(const char *pattern, bool verbose); -#endif /* DESCRIBE_H */ +#endif /* DESCRIBE_H */ diff --git a/src/bin/psql/help.c b/src/bin/psql/help.c index a793c9f9..ec7d0efe 100644 --- a/src/bin/psql/help.c +++ b/src/bin/psql/help.c @@ -167,7 +167,7 @@ slashUsage(unsigned short int pager) * Use "psql --help=commands | wc" to count correctly. It's okay to count * the USE_READLINE line even in builds without that. */ - output = PageOutput(122, pager ? &(pset.popt.topt) : NULL); + output = PageOutput(123, pager ? 
&(pset.popt.topt) : NULL); fprintf(output, _("General\n")); fprintf(output, _(" \\copyright show PostgreSQL usage and distribution terms\n")); @@ -248,6 +248,7 @@ slashUsage(unsigned short int pager) fprintf(output, _(" \\do[S] [PATTERN] list operators\n")); fprintf(output, _(" \\dO[S+] [PATTERN] list collations\n")); fprintf(output, _(" \\dp [PATTERN] list table, view, and sequence access privileges\n")); + fprintf(output, _(" \\dP[tin+] [PATTERN] list [only table/index] partitioned relations\n")); fprintf(output, _(" \\drds [PATRN1 [PATRN2]] list per-database role settings\n")); fprintf(output, _(" \\dRp[+] [PATTERN] list replication publications\n")); fprintf(output, _(" \\dRs[+] [PATTERN] list replication subscriptions\n")); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 29eaf18b..fd1c4a5e 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -499,6 +499,23 @@ static const SchemaQuery Query_for_list_of_constraints_with_schema = { NULL }; +/* partitioned relations */ +static const SchemaQuery Query_for_list_of_partitioned_relations = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind IN (" CppAsString2(RELKIND_PARTITIONED_TABLE) + ", " CppAsString2(RELKIND_PARTITIONED_INDEX) ")", + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + /* Relations supporting INSERT, UPDATE or DELETE */ static const SchemaQuery Query_for_list_of_updatables = { /* catname */ @@ -518,6 +535,22 @@ static const SchemaQuery Query_for_list_of_updatables = { NULL }; +static const SchemaQuery Query_for_list_of_partitioned_indexes = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind = " CppAsString2(RELKIND_PARTITIONED_INDEX), + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + +/* All relations */ static const SchemaQuery Query_for_list_of_relations = { /* catname */ "pg_catalog.pg_class c", @@ -1444,7 +1477,7 @@ psql_completion(const char *text, int start, int end) "\\d", "\\da", "\\dA", "\\db", "\\dc", "\\dC", "\\dd", "\\ddp", "\\dD", "\\des", "\\det", "\\deu", "\\dew", "\\dE", "\\df", "\\dF", "\\dFd", "\\dFp", "\\dFt", "\\dg", "\\di", "\\dl", "\\dL", - "\\dm", "\\dn", "\\do", "\\dO", "\\dp", + "\\dm", "\\dn", "\\do", "\\dO", "\\dp", "\\dP", "\\dPi", "\\dPt", "\\drds", "\\dRs", "\\dRp", "\\ds", "\\dS", "\\dt", "\\dT", "\\dv", "\\du", "\\dx", "\\dy", "\\e", "\\echo", "\\ef", "\\elif", "\\else", "\\encoding", @@ -3471,6 +3504,12 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); else if (TailMatchesCS1("\\dp") || TailMatchesCS1("\\z")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tsvmf, NULL); + else if (TailMatchesCS("\\dPi*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_indexes, NULL); + else if (TailMatchesCS("\\dPt*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, NULL); + else if (TailMatchesCS("\\dP*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_relations, NULL); else if (TailMatchesCS1("\\ds*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_sequences, NULL); else if (TailMatchesCS1("\\dt*")) diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index d602aeef..3e0eae21 100644 --- 
a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -2964,3 +2964,134 @@ SELECT 3 UNION SELECT 4 UNION SELECT 5 ORDER BY 1; +create schema testpart; +create role testrole_partitioning; +alter schema testpart owner to testrole_partitioning; +set role to testrole_partitioning; +-- run test inside own schema and hide other partitions +set search_path to testpart; +create table testtable_apple(logdate date); +create table testtable_orange(logdate date); +create index testtable_apple_index on testtable_apple(logdate); +create index testtable_orange_index on testtable_orange(logdate); +create table testpart_apple(logdate date) partition by range(logdate); +create table testpart_orange(logdate date) partition by range(logdate); +create index testpart_apple_index on testpart_apple(logdate); +create index testpart_orange_index on testpart_orange(logdate); +-- only partition related object should be displayed +\dP test*apple* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+----------------------+-----------------------+-------------------+-------------+---------------- + testpart | testpart_apple | testrole_partitioning | partitioned table | | + testpart | testpart_apple_index | testrole_partitioning | partitioned index | | testpart_apple +(2 rows) + +\dPt test*apple* + List of partitioned tables + Schema | Name | Owner | Parent name +----------+----------------+-----------------------+------------- + testpart | testpart_apple | testrole_partitioning | +(1 row) + +\dPi test*apple* + List of partitioned indexes + Schema | Name | Owner | Parent name | On table +----------+----------------------+-----------------------+-------------+---------------- + testpart | testpart_apple_index | testrole_partitioning | | testpart_apple +(1 row) + +drop table testtable_apple; +drop table testtable_orange; +drop table testpart_apple; +drop table testpart_orange; +create table parent_tab (id int) partition by range (id); +create index parent_index on parent_tab (id); +create table child_0_10 partition of parent_tab + for values from (0) to (10); +create table child_10_20 partition of parent_tab + for values from (10) to (20); +create table child_20_30 partition of parent_tab + for values from (20) to (30); +insert into parent_tab values (generate_series(0,29)); +create table child_30_40 partition of parent_tab +for values from (30) to (40) + partition by range(id); +create table child_30_35 partition of child_30_40 + for values from (30) to (35); +create table child_35_40 partition of child_30_40 + for values from (35) to (40); +insert into parent_tab values (generate_series(30,39)); +\dPt + List of partitioned tables + Schema | Name | Owner +----------+------------+----------------------- + testpart | parent_tab | testrole_partitioning +(1 row) + +\dPi + List of partitioned indexes + Schema | Name | Owner | On table +----------+--------------+-----------------------+------------ + testpart | parent_index | testrole_partitioning | parent_tab +(1 row) + +\dP testpart.* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | 
testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +\dP + List of partitioned relations + Schema | Name | Owner | Type | On table +----------+--------------+-----------------------+-------------------+------------ + testpart | parent_tab | testrole_partitioning | partitioned table | + testpart | parent_index | testrole_partitioning | partitioned index | parent_tab +(2 rows) + +\dPtn + List of partitioned tables + Schema | Name | Owner | Parent name +----------+-------------+-----------------------+------------- + testpart | parent_tab | testrole_partitioning | + testpart | child_30_40 | testrole_partitioning | parent_tab +(2 rows) + +\dPin + List of partitioned indexes + Schema | Name | Owner | Parent name | On table +----------+--------------------+-----------------------+--------------+------------- + testpart | parent_index | testrole_partitioning | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | parent_index | child_30_40 +(2 rows) + +\dPn + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +\dPn testpart.* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +drop table parent_tab cascade; +drop schema testpart; +set search_path to default; +set role to default; +drop role testrole_partitioning; \ No newline at end of file diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql index b56a05f7..60b34177 100644 --- a/src/test/regress/sql/psql.sql +++ b/src/test/regress/sql/psql.sql @@ -560,3 +560,72 @@ UNION SELECT 5 ORDER BY 1; \r \p + +create schema testpart; +create role testrole_partitioning; + +alter schema testpart owner to testrole_partitioning; + +set role to testrole_partitioning; + +-- run test inside own schema and hide other partitions +set search_path to testpart; + +create table testtable_apple(logdate date); +create table testtable_orange(logdate date); +create index testtable_apple_index on testtable_apple(logdate); +create index testtable_orange_index on testtable_orange(logdate); + +create table testpart_apple(logdate date) partition by range(logdate); +create table testpart_orange(logdate date) partition by range(logdate); + +create index testpart_apple_index on testpart_apple(logdate); +create index testpart_orange_index on testpart_orange(logdate); + +-- only partition related object should be displayed +\dP test*apple* +\dPt test*apple* +\dPi test*apple* + +drop table testtable_apple; +drop table testtable_orange; +drop table testpart_apple; +drop table testpart_orange; + +create table parent_tab (id int) 
partition by range (id); +create index parent_index on parent_tab (id); +create table child_0_10 partition of parent_tab + for values from (0) to (10); +create table child_10_20 partition of parent_tab + for values from (10) to (20); +create table child_20_30 partition of parent_tab + for values from (20) to (30); +insert into parent_tab values (generate_series(0,29)); +create table child_30_40 partition of parent_tab +for values from (30) to (40) + partition by range(id); +create table child_30_35 partition of child_30_40 + for values from (30) to (35); +create table child_35_40 partition of child_30_40 + for values from (35) to (40); +insert into parent_tab values (generate_series(30,39)); + +\dPt +\dPi + +\dP testpart.* +\dP + +\dPtn +\dPin +\dPn +\dPn testpart.* + +drop table parent_tab cascade; + +drop schema testpart; + +set search_path to default; + +set role to default; +drop role testrole_partitioning; From 219faaf03d6ca29fce4eb774d27de63ca68cc53c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 20:07:49 +0800 Subject: [PATCH 281/578] Fix EvalPlanQualStart to handle partitioned result rels correctly. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/execMain.c | 14 +++++++++++++- src/test/isolation/expected/eval-plan-qual.out | 12 ++++++++++++ src/test/isolation/specs/eval-plan-qual.spec | 18 ++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 5a082133..4140f135 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3510,7 +3510,7 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) * es_param_exec_vals, etc. * * The ResultRelInfo array management is trickier than it looks. We - * create a fresh array for the child but copy all the content from the + * create fresh arrays for the child but copy all the content from the * parent. This is because it's okay for the child to share any * per-relation state the parent has already created --- but if the child * sets up any ResultRelInfo fields, such as its own junkfilter, that @@ -3527,6 +3527,7 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) if (parentestate->es_num_result_relations > 0) { int numResultRelations = parentestate->es_num_result_relations; + int numRootResultRels = parentestate->es_num_root_result_relations; ResultRelInfo *resultRelInfos; resultRelInfos = (ResultRelInfo *) @@ -3535,6 +3536,17 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) numResultRelations * sizeof(ResultRelInfo)); estate->es_result_relations = resultRelInfos; estate->es_num_result_relations = numResultRelations; + + /* Also transfer partitioned root result relations. 
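These need the same shallow copy as es_result_relations above; otherwise EPQ rechecks on partitioned result relations fail, which the new eval-plan-qual isolation permutation below exercises.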
*/ + if (numRootResultRels > 0) + { + resultRelInfos = (ResultRelInfo *) + palloc(numRootResultRels * sizeof(ResultRelInfo)); + memcpy(resultRelInfos, parentestate->es_root_result_relations, + numRootResultRels * sizeof(ResultRelInfo)); + estate->es_root_result_relations = resultRelInfos; + estate->es_num_root_result_relations = numRootResultRels; + } } /* es_result_relation_info must NOT be copied */ /* es_trig_target_relations must NOT be copied */ diff --git a/src/test/isolation/expected/eval-plan-qual.out b/src/test/isolation/expected/eval-plan-qual.out index 10c784a0..6be164fe 100644 --- a/src/test/isolation/expected/eval-plan-qual.out +++ b/src/test/isolation/expected/eval-plan-qual.out @@ -184,3 +184,15 @@ step readwcte: <... completed> id value 1 tableAValue2 + +starting permutation: simplepartupdate complexpartupdate c1 c2 +step simplepartupdate: + update parttbl set a = a; + +step complexpartupdate: + with u as (update parttbl set a = a returning parttbl.*) + update parttbl set a = u.a from u; + +step c1: COMMIT; +step complexpartupdate: <... completed> +step c2: COMMIT; \ No newline at end of file diff --git a/src/test/isolation/specs/eval-plan-qual.spec b/src/test/isolation/specs/eval-plan-qual.spec index 5e1fce05..a2f6948b 100644 --- a/src/test/isolation/specs/eval-plan-qual.spec +++ b/src/test/isolation/specs/eval-plan-qual.spec @@ -47,10 +47,17 @@ setup INSERT INTO table_a VALUES (1, 'tableAValue'); INSERT INTO table_b VALUES (1, 'tableBValue'); } +setup +{ + CREATE TABLE parttbl (a int) PARTITION BY LIST (a); + CREATE TABLE parttbl1 PARTITION OF parttbl FOR VALUES IN (1); + INSERT INTO parttbl VALUES (1); +} teardown { DROP TABLE accounts, p, table_a, table_b CASCADE; + DROP TABLE parttbl; } session "s1" @@ -101,6 +108,11 @@ step "updateforss" { UPDATE table_b SET value = 'newTableBValue' WHERE id = 1; } +# test for EPQ on a partitioned result table + +step "simplepartupdate" { + update parttbl set a = a; +} session "s2" setup { BEGIN ISOLATION LEVEL READ COMMITTED; } @@ -127,6 +139,10 @@ step "readforss" { WHERE ta.id = 1 FOR UPDATE OF ta; } step "wrtwcte" { UPDATE table_a SET value = 'tableAValue2' WHERE id = 1; } +step "complexpartupdate" { + with u as (update parttbl set a = a returning parttbl.*) + update parttbl set a = u.a from u; +} step "c2" { COMMIT; } session "s3" @@ -158,3 +174,5 @@ permutation "wx2" "partiallock" "c2" "c1" "read" permutation "wx2" "lockwithvalues" "c2" "c1" "read" permutation "updateforss" "readforss" "c1" "c2" permutation "wrtwcte" "readwcte" "c1" "c2" + +permutation "simplepartupdate" "complexpartupdate" "c1" "c2" From c8c453ee8547a85a445bc5718e94058cf38487f6 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 17 Apr 2019 18:17:43 -0400 Subject: [PATCH 282/578] psql: display tablespace for partitioned indexes Nothing was shown previously. 
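A sketch of the resulting behavior, using the regression objects touched below: \d on a partitioned index stored in a non-default tablespace now ends with a Tablespace footer.

\d testschema.part_a_idx
-- footer now includes: Tablespace: "regress_tblspace"

Previously add_tablespace_footer() skipped RELKIND_PARTITIONED_INDEX, so nothing was printed.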
--- src/bin/psql/describe.c | 3 ++- src/test/regress/input/tablespace.source | 1 + src/test/regress/output/tablespace.source | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index b7023ae5..4fd3864b 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3281,7 +3281,8 @@ add_tablespace_footer(printTableContent *const cont, char relkind, if (relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW || relkind == RELKIND_INDEX || - relkind == RELKIND_PARTITIONED_TABLE) + relkind == RELKIND_PARTITIONED_TABLE || + relkind == RELKIND_PARTITIONED_INDEX) { /* * We ignore the database default tablespace so that users not using diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index abad2716..4bf5302d 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -63,6 +63,7 @@ CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; +\d testschema.part_a_idx -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 03383fd4..6688ae7e 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -94,6 +94,14 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c part_a_idx | regress_tblspace (3 rows) +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Tablespace: "regress_tblspace" + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); From aac238dfe88c46dfaa5c234f90973a1dea5358eb Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 21:43:00 +0800 Subject: [PATCH 283/578] Fix tablespace inheritance for partitioned rels. 
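A minimal sketch of the behavior this patch establishes (p, p1 and q are illustrative names; regress_tblspace comes from the regression tests, and pg_default is assumed to be the database's default tablespace):

CREATE TABLE p (a int) PARTITION BY LIST (a) TABLESPACE regress_tblspace;
CREATE TABLE p1 PARTITION OF p FOR VALUES IN (1);
-- p1 is placed in regress_tblspace, inherited from its parent
CREATE TABLE q (a int) PARTITION BY LIST (a) TABLESPACE pg_default;
-- ERROR:  cannot specify default tablespace for partitioned relations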
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/bootstrap/bootparse.y | 2 + src/backend/commands/indexcmds.c | 28 +- src/backend/commands/matview.c | 2 +- src/backend/commands/tablecmds.c | 32 +-- src/backend/commands/tablespace.c | 16 +- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 2 + src/backend/nodes/outfuncs.c | 4 + src/backend/parser/gram.y | 2 + src/backend/parser/parse_utilcmd.c | 2 + src/backend/utils/adt/ruleutils.c | 11 +- src/include/commands/tablespace.h | 30 +- src/include/nodes/parsenodes.h | 4 + src/test/regress/input/tablespace.source | 90 ++++++ src/test/regress/output/tablespace.source | 334 ++++++++++++++++++++++ 15 files changed, 523 insertions(+), 38 deletions(-) diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 78267925..137c2dad 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -377,6 +377,7 @@ Boot_DeclareIndexStmt: stmt->transformed = false; stmt->concurrent = false; stmt->if_not_exists = false; + stmt->reset_default_tblspc = false; /* locks and races need not concern us in bootstrap mode */ relationId = RangeVarGetRelid(stmt->relation, NoLock, @@ -421,6 +422,7 @@ Boot_DeclareUniqueIndexStmt: stmt->transformed = false; stmt->concurrent = false; stmt->if_not_exists = false; + stmt->reset_default_tblspc = false; /* locks and races need not concern us in bootstrap mode */ relationId = RangeVarGetRelid(stmt->relation, NoLock, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ad99f3e2..4596a9f4 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -349,9 +349,23 @@ DefineIndex(Oid relationId, LOCKTAG heaplocktag; LOCKMODE lockmode; Snapshot snapshot; + int save_nestlevel = -1; int i; /* + * Some callers need us to run with an empty default_tablespace; this is a + * necessary hack to be able to reproduce catalog state accurately when + * recreating indexes after table-rewriting ALTER TABLE. + */ + if (stmt->reset_default_tblspc) + { + save_nestlevel = NewGUCNestLevel(); + (void) set_config_option("default_tablespace", "", + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + } + + /* * count attributes in index */ numberOfAttributes = list_length(stmt->indexParams); @@ -448,10 +462,15 @@ DefineIndex(Oid relationId, if (stmt->tableSpace) { tablespaceId = get_tablespace_oid(stmt->tableSpace, false); + if (partitioned && tablespaceId == MyDatabaseTableSpace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relation"))); } else { - tablespaceId = GetDefaultTablespace(rel->rd_rel->relpersistence); + tablespaceId = GetDefaultTablespace(rel->rd_rel->relpersistence, + partitioned); /* note InvalidOid is OK in this case */ } @@ -753,6 +772,13 @@ DefineIndex(Oid relationId, ObjectAddressSet(address, RelationRelationId, indexRelationId); + /* + * Revert to original default_tablespace. Must do this before any return + * from this function, but after index_create, so this is a good time. 
+ */ + if (save_nestlevel >= 0) + AtEOXact_GUC(true, save_nestlevel); + if (!OidIsValid(indexRelationId)) { heap_close(rel, NoLock); diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 0144ee68..102e2f36 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -303,7 +303,7 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, /* Concurrent refresh builds new data in temp tablespace, and does diff. */ if (concurrent) { - tableSpace = GetDefaultTablespace(RELPERSISTENCE_TEMP); + tableSpace = GetDefaultTablespace(RELPERSISTENCE_TEMP, false); relpersistence = RELPERSISTENCE_TEMP; } else diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b24611be..8f8a76d8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -590,6 +590,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Datum reloptions; ListCell *listptr; AttrNumber attnum; + bool partitioned; static char *validnsps[] = HEAP_RELOPT_NAMESPACES; Oid ofTypeId; ObjectAddress address; @@ -634,7 +635,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, else #endif relkind = RELKIND_PARTITIONED_TABLE; + partitioned = true; } + else + partitioned = false; /* * Look up the namespace in which we are supposed to create the relation, @@ -716,31 +720,24 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (stmt->tablespacename) { tablespaceId = get_tablespace_oid(stmt->tablespacename, false); + + if (partitioned && tablespaceId == MyDatabaseTableSpace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relations"))); } else if (stmt->partbound) { - HeapTuple tup; - /* * For partitions, when no other tablespace is specified, we default * the tablespace to the parent partitioned table's. */ Assert(list_length(inheritOids) == 1); - tup = SearchSysCache1(RELOID, - DatumGetObjectId(linitial_oid(inheritOids))); - - tablespaceId = ((Form_pg_class) GETSTRUCT(tup))->reltablespace; - - if (!OidIsValid(tablespaceId)) - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - - ReleaseSysCache(tup); + tablespaceId = get_rel_tablespace(linitial_oid(inheritOids)); } else - { - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - /* note InvalidOid is OK in this case */ - } + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence, + partitioned); /* Check permissions except when using database's default */ if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) @@ -1183,7 +1180,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * Process the partitioning specification (if any) and store the partition * key information into the catalog. 
*/ - if (stmt->partspec) + if (partitioned) { ParseState *pstate; char strategy; @@ -11628,6 +11625,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, if (!rewrite) TryReuseIndex(oldId, stmt); + stmt->reset_default_tblspc = true; /* keep the index's comment */ stmt->idxcomment = GetComment(oldId, RelationRelationId, 0); @@ -11659,6 +11657,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, /* keep any comment on the index */ indstmt->idxcomment = GetComment(indoid, RelationRelationId, 0); + indstmt->reset_default_tblspc = true; cmd->subtype = AT_ReAddIndex; tab->subcmds[AT_PASS_OLD_INDEX] = @@ -11680,6 +11679,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, if (con->contype == CONSTR_FOREIGN && !rewrite && tab->rewrite == 0) TryReuseForeignKey(oldId, con); + con->reset_default_tblspc = true; cmd->subtype = AT_ReAddConstraint; tab->subcmds[AT_PASS_OLD_CONSTR] = lappend(tab->subcmds[AT_PASS_OLD_CONSTR], cmd); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 28892609..1b208c6c 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -1206,7 +1206,9 @@ check_default_tablespace(char **newval, void **extra, GucSource source) * GetDefaultTablespace -- get the OID of the current default tablespace * * Temporary objects have different default tablespaces, hence the - * relpersistence parameter must be specified. + * relpersistence parameter must be specified. Also, for partitioned tables, + * we disallow specifying the database default, so that needs to be specified + * too. * * May return InvalidOid to indicate "use the database's default tablespace". * @@ -1217,7 +1219,7 @@ check_default_tablespace(char **newval, void **extra, GucSource source) * default_tablespace GUC variable. */ Oid -GetDefaultTablespace(char relpersistence) +GetDefaultTablespace(char relpersistence, bool partitioned) { Oid result; @@ -1243,10 +1245,18 @@ GetDefaultTablespace(char relpersistence) /* * Allow explicit specification of database's default tablespace in - * default_tablespace without triggering permissions checks. + * default_tablespace without triggering permissions checks. Don't + * allow specifying that when creating a partitioned table, however, + * since the result is confusing. 
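In SQL terms, both ways of naming the database default for a partitioned table are now rejected. A condensed sketch of the two failure modes (same pattern as the testschema.dflt cases added below, minus the PRIMARY KEY):

CREATE TABLE testschema.dflt (a int) PARTITION BY LIST (a) TABLESPACE pg_default;
-- ERROR:  cannot specify default tablespace for partitioned relations
SET default_tablespace TO 'pg_default';
CREATE TABLE testschema.dflt (a int) PARTITION BY LIST (a);
-- ERROR:  cannot specify default tablespace for partitioned relations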
*/ if (result == MyDatabaseTableSpace) + { + if (partitioned) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relations"))); result = InvalidOid; + } return result; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e87c8463..ea4a0c71 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3153,6 +3153,7 @@ _copyConstraint(const Constraint *from) COPY_NODE_FIELD(options); COPY_STRING_FIELD(indexname); COPY_STRING_FIELD(indexspace); + COPY_SCALAR_FIELD(reset_default_tblspc); COPY_STRING_FIELD(access_method); COPY_NODE_FIELD(where_clause); COPY_NODE_FIELD(pktable); @@ -3745,6 +3746,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_SCALAR_FIELD(transformed); COPY_SCALAR_FIELD(concurrent); COPY_SCALAR_FIELD(if_not_exists); + COPY_SCALAR_FIELD(reset_default_tblspc); #ifdef __TBASE__ COPY_SCALAR_FIELD(parentIndexOid); #endif diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 8abab4bb..c92cbd30 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1365,6 +1365,7 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) COMPARE_SCALAR_FIELD(transformed); COMPARE_SCALAR_FIELD(concurrent); COMPARE_SCALAR_FIELD(if_not_exists); + COMPARE_SCALAR_FIELD(reset_default_tblspc); return true; } @@ -2644,6 +2645,7 @@ _equalConstraint(const Constraint *a, const Constraint *b) COMPARE_NODE_FIELD(options); COMPARE_STRING_FIELD(indexname); COMPARE_STRING_FIELD(indexspace); + COMPARE_SCALAR_FIELD(reset_default_tblspc); COMPARE_STRING_FIELD(access_method); COMPARE_NODE_FIELD(where_clause); COMPARE_NODE_FIELD(pktable); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 24ca2109..092a7dd5 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4027,6 +4027,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_BOOL_FIELD(transformed); WRITE_BOOL_FIELD(concurrent); WRITE_BOOL_FIELD(if_not_exists); + WRITE_BOOL_FIELD(reset_default_tblspc); } static void @@ -4898,6 +4899,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); /* access_method and where_clause not currently used */ break; @@ -4907,6 +4909,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); /* access_method and where_clause not currently used */ break; @@ -4916,6 +4919,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); WRITE_STRING_FIELD(access_method); WRITE_NODE_FIELD(where_clause); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index a3eb7514..f8e17e4e 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7758,6 +7758,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->initdeferred = false; n->transformed = false; n->if_not_exists = false; + n->reset_default_tblspc = false; $$ = (Node *)n; } | CREATE opt_unique INDEX opt_concurrently IF_P NOT EXISTS index_name @@ -7784,6 +7785,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->initdeferred = false; n->transformed = false; n->if_not_exists = true; + 
n->reset_default_tblspc = false; $$ = (Node *)n; } ; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index a5c17b8b..af249f71 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1795,6 +1795,7 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, index->transformed = true; /* don't need transformIndexStmt */ index->concurrent = false; index->if_not_exists = false; + index->reset_default_tblspc = false; /* * We don't try to preserve the name of the source index; instead, just @@ -2268,6 +2269,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index->transformed = false; index->concurrent = false; index->if_not_exists = false; + index->reset_default_tblspc = constraint->reset_default_tblspc; /* * If it's ALTER TABLE ADD CONSTRAINT USING INDEX, look up the index and diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 984ace45..9acf184a 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1472,13 +1472,14 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, Oid tblspc; tblspc = get_rel_tablespace(indexrelid); - if (!OidIsValid(tblspc)) - tblspc = MyDatabaseTableSpace; + if (OidIsValid(tblspc)) + { if (isConstraint) appendStringInfoString(&buf, " USING INDEX"); appendStringInfo(&buf, " TABLESPACE %s", quote_identifier(get_tablespace_name(tblspc))); } + } /* * If it's a partial index, decompile and append the predicate @@ -2126,6 +2127,12 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, pfree(options); } + /* + * Print the tablespace, unless it's the database default. + * This is to help ALTER TABLE usage of this facility, + * which needs this behavior to recreate exact catalog + * state. + */ tblspc = get_rel_tablespace(indexId); if (OidIsValid(tblspc)) appendStringInfo(&buf, " USING INDEX TABLESPACE %s", diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 9a933cca..32805ab4 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablespace.h - * Tablespace management commands (create/drop tablespace). + * Tablespace management commands (create/drop tablespace). * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -20,40 +20,40 @@ #include "nodes/parsenodes.h" /* XLOG stuff */ -#define XLOG_TBLSPC_CREATE 0x00 -#define XLOG_TBLSPC_DROP 0x10 +#define XLOG_TBLSPC_CREATE 0x00 +#define XLOG_TBLSPC_DROP 0x10 typedef struct xl_tblspc_create_rec { - Oid ts_id; - char ts_path[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string */ + Oid ts_id; + char ts_path[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string */ } xl_tblspc_create_rec; typedef struct xl_tblspc_drop_rec { - Oid ts_id; + Oid ts_id; } xl_tblspc_drop_rec; typedef struct TableSpaceOpts { - int32 vl_len_; /* varlena header (do not touch directly!) */ - float8 random_page_cost; - float8 seq_page_cost; - int effective_io_concurrency; + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + float8 random_page_cost; + float8 seq_page_cost; + int effective_io_concurrency; } TableSpaceOpts; -extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); +extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); extern void DropTableSpace(DropTableSpaceStmt *stmt); extern ObjectAddress RenameTableSpace(const char *oldname, const char *newname); -extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); +extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); extern void TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo); -extern Oid GetDefaultTablespace(char relpersistence); +extern Oid GetDefaultTablespace(char relpersistence, bool partitioned); extern void PrepareTempTablespaces(void); -extern Oid get_tablespace_oid(const char *tablespacename, bool missing_ok); +extern Oid get_tablespace_oid(const char *tablespacename, bool missing_ok); extern char *get_tablespace_name(Oid spc_oid); extern bool directory_is_empty(const char *path); @@ -63,4 +63,4 @@ extern void tblspc_redo(XLogReaderState *rptr); extern void tblspc_desc(StringInfo buf, XLogReaderState *rptr); extern const char *tblspc_identify(uint8 info); -#endif /* TABLESPACE_H */ +#endif /* TABLESPACE_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 983a1ab0..acc64eb0 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2252,6 +2252,8 @@ typedef struct Constraint List *options; /* options from WITH clause */ char *indexname; /* existing index to use; otherwise NULL */ char *indexspace; /* index tablespace; NULL for default */ + bool reset_default_tblspc; /* reset default_tablespace prior to + * creating the index */ /* These could be, but currently are not, used for UNIQUE/PKEY: */ char *access_method; /* index access method; NULL for default */ Node *where_clause; /* partial index predicate */ @@ -2881,6 +2883,8 @@ typedef struct IndexStmt bool transformed; /* true when transformIndexStmt is finished */ bool concurrent; /* should this be a concurrent index build? */ bool if_not_exists; /* just do nothing if index already exists? */ + bool reset_default_tblspc; /* reset default_tablespace prior to + * executing */ #ifdef __TBASE__ /* used for interval partition */ Oid parentIndexOid; diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 4bf5302d..5323f07e 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -65,24 +65,45 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; \d testschema.part_a_idx +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab (id); CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; + \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -90,12 +111,64 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; + +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +-- use a 
custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +SELECT * FROM testschema.test_default_tab_p; +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +SELECT * FROM testschema.test_default_tab_p; +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +DROP TABLE testschema.test_default_tab_p; + -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; INSERT INTO testschema.test_tab VALUES (1); @@ -108,6 +181,23 @@ ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); SELECT * FROM testschema.test_tab; DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique +\d testschema.test_tab_a_idx +\d testschema.test_tab_b_idx +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique +\d testschema.test_tab_a_idx +\d testschema.test_tab_b_idx +DROP TABLE testschema.test_tab; + -- let's try moving a table from one place to another CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 6688ae7e..21e8baff 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -102,11 +102,28 @@ Partitioned index "testschema.part_a_idx" btree, for table "testschema.part" Tablespace: "regress_tblspace" +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relation +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab (id); CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; \d testschema.test_index1 Index "testschema.test_index1" Column | Type | Definition @@ -122,6 +139,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite @@ -141,6 +173,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + SELECT * FROM testschema.test_default_tab; id ---- @@ -164,6 +211,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + SELECT * FROM testschema.test_default_tab; id ---- @@ -189,6 +251,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 @@ -206,7 +283,208 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; INSERT INTO testschema.test_tab VALUES (1); @@ -235,6 +513,62 @@ SELECT * FROM testschema.test_tab; 1 (1 row) +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + DROP TABLE testschema.test_tab; -- let's try moving a table from one place to another CREATE TABLE testschema.atable AS VALUES (1), (2); From 577e6c7d3b530dbc9dee23db132f5aeba33da48b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 10:31:37 +0800 Subject: [PATCH 284/578] Fix bogus logic for combining range-partitioned columns during pruning. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 46 ++++--------------- src/test/regress/expected/partition_prune.out | 27 +++++++++++ src/test/regress/sql/partition_prune.sql | 19 ++++++++ 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 242267f2..38c1e8ea 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -806,9 +806,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, List *opsteps = NIL; List *btree_clauses[BTMaxStrategyNumber + 1], *hash_clauses[HTMaxStrategyNumber + 1]; - bool need_next_less, - need_next_eq, - need_next_greater; int i; memset(btree_clauses, 0, sizeof(btree_clauses)); @@ -819,9 +816,8 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, bool consider_next_key = true; /* - * To be useful for pruning, we must have clauses for a prefix of - * partition keys in the case of range partitioning. So, ignore - * clauses for keys after this one. + * For range partitioning, if we have no clauses for the current key, + * we can't consider any later keys either, so we can stop here. */ if (part_scheme->strategy == PARTITION_STRATEGY_RANGE && clauselist == NIL) @@ -836,7 +832,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, clauselist == NIL && !bms_is_member(i, nullkeys)) return NULL; - need_next_eq = need_next_less = need_next_greater = true; foreach(lc, clauselist) { PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc); @@ -858,7 +853,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, case PARTITION_STRATEGY_RANGE: { PartClauseInfo *last = NULL; - bool inclusive = false; /* * Add this clause to the list of clauses to be used @@ -876,35 +870,13 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, lappend(btree_clauses[pc->op_strategy], pc); /* - * We may not need the next clause if they're of - * certain strategy. + * We can't consider subsequent partition keys if the + * clause for the current key contains a non-inclusive + * operator. */ - switch (pc->op_strategy) - { - case BTLessEqualStrategyNumber: - inclusive = true; - /* fall through */ - case BTLessStrategyNumber: - if (!inclusive) - need_next_eq = need_next_less = false; - break; - case BTEqualStrategyNumber: - /* always accept clauses for the next key. */ - break; - case BTGreaterEqualStrategyNumber: - inclusive = true; - /* fall through */ - case BTGreaterStrategyNumber: - if (!inclusive) - need_next_eq = need_next_greater = false; - break; - } - - /* We may want to change our mind. 
*/ - if (consider_next_key) - consider_next_key = (need_next_eq || - need_next_less || - need_next_greater); + if (pc->op_strategy == BTLessStrategyNumber || + pc->op_strategy == BTGreaterStrategyNumber) + consider_next_key = false; break; } @@ -2340,7 +2312,7 @@ get_matching_range_bounds(PartitionPruneContext *context, /* * Look for the greatest bound that is < or <= lookup value and - * set minoff to its offset. + * set maxoff to its offset. */ off = partition_range_datum_bsearch(partsupfunc, partcollation, diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 94bceb8d..95a64972 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1461,3 +1461,30 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; (3 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +-- +-- Check that pruning with composite range partitioning works correctly when +-- it must ignore clauses for trailing keys once it has seen a clause with +-- non-inclusive operator for an earlier key +-- +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p0 partition of mc3p + for values from (0, 0, 0) to (0, maxvalue, maxvalue); +create table mc3p1 partition of mc3p + for values from (1, 1, 1) to (2, minvalue, minvalue); +create table mc3p2 partition of mc3p + for values from (2, minvalue, minvalue) to (3, maxvalue, maxvalue); +insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); +explain (analyze, costs off, summary off, timing off) +select * from mc3p where a < 3 and abs(b) = 1; + QUERY PLAN +------------------------------------------------- + Append (actual rows=3 loops=1) + -> Seq Scan on mc3p0 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) +(7 rows) + +drop table mc3p; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 4862cdfd..4b5acbe1 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -265,3 +265,22 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr explain (costs off) select * from rparted_by_int2 where a > 100000000000000; drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; + +-- +-- Check that pruning with composite range partitioning works correctly when +-- it must ignore clauses for trailing keys once it has seen a clause with +-- non-inclusive operator for an earlier key +-- +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p0 partition of mc3p + for values from (0, 0, 0) to (0, maxvalue, maxvalue); +create table mc3p1 partition of mc3p + for values from (1, 1, 1) to (2, minvalue, minvalue); +create table mc3p2 partition of mc3p + for values from (2, minvalue, minvalue) to (3, maxvalue, maxvalue); +insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); + +explain (analyze, costs off, summary off, timing off) +select * from mc3p where a < 3 and abs(b) = 1; + +drop table mc3p; From f756f4e13415ba36120130930d38466f05512010 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 7 Jun 2019 00:44:17 -0400 Subject: 
[PATCH 285/578] Fix default_tablespace usage for partitioned tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 87259588d0ab I (Álvaro) tried to rationalize the determination of tablespace to use for partitioned tables, but failed to handle the default_tablespace case. Repair and add proper tests. Author: Amit Langote, Rushabh Lathia Reported-by: Rushabh Lathia Reviewed-by: Amit Langote, Álvaro Herrera Discussion: https://postgr.es/m/CAGPqQf0cYjm1=rjxk_6gU0SjUS70=yFUAdCJLwWzh9bhNJnyVg@mail.gmail.com --- src/backend/commands/tablecmds.c | 8 +++- src/test/regress/input/tablespace.source | 34 ++++++++++++--- src/test/regress/output/tablespace.source | 50 ++++++++++++++++++----- 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 8f8a76d8..393d30c4 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -714,8 +714,8 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, } /* - * Select tablespace to use. If not specified, use default tablespace - * (which may in turn default to database's default). + * Select tablespace to use: an explicitly indicated one, or (in the case + * of a partitioned table) the parent's, if it has one. */ if (stmt->tablespacename) { @@ -736,6 +736,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, tablespaceId = get_rel_tablespace(linitial_oid(inheritOids)); } else + tablespaceId = InvalidOid; + + /* still nothing? use the default */ + if (!OidIsValid(tablespaceId)) tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence, partitioned); diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 5323f07e..d46f0e4c 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,16 +44,38 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- -- partitioned table +-- CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); -CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); -ALTER TABLE testschema.part12 SET TABLESPACE pg_default; -CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); --- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
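The precedence this establishes is: an explicit TABLESPACE clause, else the parent partitioned table's tablespace, else default_tablespace, else the database default. A condensed sketch of that ordering (p, p1..p3 are hypothetical names; regress_tblspace as in the tests):

CREATE TABLE p (a int) PARTITION BY LIST (a) TABLESPACE regress_tblspace;
CREATE TABLE p1 PARTITION OF p FOR VALUES IN (1);  -- regress_tblspace, inherited from the parent
ALTER TABLE p SET TABLESPACE pg_default;           -- parent no longer pins a tablespace
SET default_tablespace TO regress_tblspace;
CREATE TABLE p2 PARTITION OF p FOR VALUES IN (2);  -- regress_tblspace, via default_tablespace
RESET default_tablespace;
CREATE TABLE p3 PARTITION OF p FOR VALUES IN (3);  -- database default tablespace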
+SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); + SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid - where c.relname LIKE 'part%' order by relname; + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; +RESET default_tablespace; DROP TABLE testschema.part; -- partitioned index diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 21e8baff..15c0d3e0 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,24 +61,52 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- -- partitioned table +-- CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); -CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); -ALTER TABLE testschema.part12 SET TABLESPACE pg_default; -CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); --- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
+SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid - where c.relname LIKE 'part%' order by relname; + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; relname | spcname ----------+------------------ part | - part12 | - part12_1 | regress_tblspace - part12_2 | -(4 rows) - + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; DROP TABLE testschema.part; -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); From 015a1b4073770b195ec00941ce95d9467d5833e3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:04:21 +0800 Subject: [PATCH 286/578] Fix RANGE partition pruning with multiple boolean partition keys.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 50 ++++++++++++++----- src/test/regress/expected/partition_prune.out | 15 +++++- src/test/regress/sql/partition_prune.sql | 11 +++- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 38c1e8ea..7adf41f3 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -140,8 +140,10 @@ static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context, PartitionPruneStepCombine *cstep, PruneStepResult **step_results); -static bool match_boolean_partition_clause(Oid partopfamily, Expr *clause, - Expr *partkey, Expr **outconst); +static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily, + Expr *clause, + Expr *partkey, + Expr **outconst); static bool partkey_datum_from_expr(PartitionPruneContext *context, Expr *expr, Datum *value); @@ -1166,6 +1168,7 @@ 
match_clause_to_partition_key(RelOptInfo *rel, bool *clause_is_not_null, PartClauseInfo **pc, List **clause_steps) { + PartClauseMatchStatus boolmatchstatus; PartitionScheme part_scheme = rel->part_scheme; Expr *expr; Oid partopfamily = part_scheme->partopfamily[partkeyidx], @@ -1175,7 +1178,10 @@ match_clause_to_partition_key(RelOptInfo *rel, * Recognize specially shaped clauses that match with the Boolean * partition key. */ - if (match_boolean_partition_clause(partopfamily, clause, partkey, &expr)) + boolmatchstatus = match_boolean_partition_clause(partopfamily, clause, + partkey, &expr); + + if (boolmatchstatus == PARTCLAUSE_MATCH_CLAUSE) { PartClauseInfo *partclause; @@ -1529,7 +1535,21 @@ match_clause_to_partition_key(RelOptInfo *rel, return PARTCLAUSE_MATCH_NULLNESS; } - return PARTCLAUSE_UNSUPPORTED; + /* + * If we get here then the return value depends on the result of the + * match_boolean_partition_clause call above. If the call returned + * PARTCLAUSE_UNSUPPORTED then we're either not dealing with a bool qual + * or the bool qual is not suitable for pruning. Since the qual didn't + * match up to any of the other qual types supported here, then trying to + * match it against any other partition key is a waste of time, so just + * return PARTCLAUSE_UNSUPPORTED. If the qual just couldn't be matched to + * this partition key, then it may match another, so return + * PARTCLAUSE_NOMATCH. The only other value that + * match_boolean_partition_clause can return is PARTCLAUSE_MATCH_CLAUSE, + * and since that value was already dealt with above, then we can just + * return boolmatchstatus. + */ + return boolmatchstatus; } /* @@ -2670,11 +2690,15 @@ perform_pruning_combine_step(PartitionPruneContext *context, /* * match_boolean_partition_clause * - * Sets *outconst to a Const containing true or false value and returns true if - * we're able to match the clause to the partition key as specially-shaped - * Boolean clause. Returns false otherwise with *outconst set to NULL. + * If we're able to match the clause to the partition key as specially-shaped + * boolean clause, set *outconst to a Const containing a true or false value + * and return PARTCLAUSE_MATCH_CLAUSE. Returns PARTCLAUSE_UNSUPPORTED if the + * clause is not a boolean clause or if the boolean clause is unsuitable for + * partition pruning. Returns PARTCLAUSE_NOMATCH if it's a bool quals but + * just does not match this partition key. *outconst is set to NULL in the + * latter two cases. 
*/ -static bool +static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, Expr **outconst) { @@ -2683,7 +2707,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, *outconst = NULL; if (!IsBooleanOpfamily(partopfamily)) - return false; + return PARTCLAUSE_UNSUPPORTED; if (IsA(clause, BooleanTest)) { @@ -2692,7 +2716,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, /* Only IS [NOT] TRUE/FALSE are any good to us */ if (btest->booltesttype == IS_UNKNOWN || btest->booltesttype == IS_NOT_UNKNOWN) - return false; + return PARTCLAUSE_UNSUPPORTED; leftop = btest->arg; if (IsA(leftop, RelabelType)) @@ -2705,7 +2729,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, : (Expr *) makeBoolConst(false, false); if (*outconst) - return true; + return PARTCLAUSE_MATCH_CLAUSE; } else { @@ -2725,10 +2749,10 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, *outconst = (Expr *) makeBoolConst(false, false); if (*outconst) - return true; + return PARTCLAUSE_MATCH_CLAUSE; } - return false; + return PARTCLAUSE_NOMATCH; } /* diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 95a64972..a08f303d 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1122,6 +1122,19 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) +create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); +create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); +create table boolrangep_ft partition of boolrangep for values from ('false', 'true', 0) to ('false', 'true', 100); +create table boolrangep_ff1 partition of boolrangep for values from ('false', 'false', 0) to ('false', 'false', 50); +create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); +-- try a more complex case that's been known to trip up pruning in the past +explain (costs off) select * from boolrangep where not a and not b and c = 25; + QUERY PLAN +---------------------------------------------- + Seq Scan on boolrangep_ff1 + Filter: ((NOT a) AND (NOT b) AND (c = 25)) +(2 rows) + -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); create table coercepart_ab partition of coercepart for values in ('ab'); @@ -1460,7 +1473,7 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; Filter: (a > '100000000000000'::bigint) (3 rows) -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- -- Check that pruning with composite range partitioning works correctly when -- it must ignore clauses for trailing keys once it has seen a clause with diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 4b5acbe1..1cd151e2 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -159,6 +159,15 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain 
(costs off) select * from boolpart where a is not unknown; +create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); +create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); +create table boolrangep_ft partition of boolrangep for values from ('false', 'true', 0) to ('false', 'true', 100); +create table boolrangep_ff1 partition of boolrangep for values from ('false', 'false', 0) to ('false', 'false', 50); +create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); + +-- try a more complex case that's been known to trip up pruning in the past +explain (costs off) select * from boolrangep where not a and not b and c = 25; + -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); create table coercepart_ab partition of coercepart for values in ('ab'); @@ -264,7 +273,7 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr -- all partitions but rparted_by_int2_maxvalue pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- -- Check that pruning with composite range partitioning works correctly when From 80d2048d3e70e4e31ec32accf51fb626b43fbc60 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:28:00 +0800 Subject: [PATCH 287/578] Install dependencies to prevent dropping partition key columns. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/dependency.c | 53 ++++++++++++++++++--- src/backend/catalog/heap.c | 28 +++++++++-- src/backend/commands/tablecmds.c | 27 +++++------ src/include/catalog/dependency.h | 2 +- src/test/regress/expected/alter_table.out | 10 ++-- src/test/regress/expected/alter_table_1.out | 6 +-- src/test/regress/expected/alter_table_2.out | 6 +-- src/test/regress/expected/alter_table_3.out | 6 +-- src/test/regress/expected/create_table.out | 36 ++++++++++++++ src/test/regress/sql/create_table.sql | 33 +++++++++++++ 10 files changed, 168 insertions(+), 39 deletions(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 5975a9bb..924d7f35 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -656,6 +656,7 @@ findDependentObjects(const ObjectAddress *object, ObjectIdGetDatum(object->objectId)); if (object->objectSubId != 0) { + /* Consider only dependencies of this sub-object */ ScanKeyInit(&key[2], Anum_pg_depend_objsubid, BTEqualStrategyNumber, F_INT4EQ, @@ -663,7 +664,10 @@ findDependentObjects(const ObjectAddress *object, nkeys = 3; } else + { + /* Consider dependencies of this object and any sub-objects it has */ nkeys = 2; + } scan = systable_beginscan(*depRel, DependDependerIndexId, true, NULL, nkeys, key); @@ -676,6 +680,18 @@ findDependentObjects(const ObjectAddress *object, otherObject.objectId = foundDep->refobjid; otherObject.objectSubId = foundDep->refobjsubid; + /* + * When scanning dependencies of a whole object, we may find rows + * linking sub-objects of the object to the object itself. (Normally, + * such a dependency is implicit, but we must make explicit ones in + * some cases involving partitioning.) 
We must ignore such rows to + * avoid infinite recursion. + */ + if (otherObject.classId == object->classId && + otherObject.objectId == object->objectId && + object->objectSubId == 0) + continue; + switch (foundDep->deptype) { case DEPENDENCY_NORMAL: @@ -863,6 +879,16 @@ findDependentObjects(const ObjectAddress *object, otherObject.objectSubId = foundDep->objsubid; /* + * If what we found is a sub-object of the current object, just ignore + * it. (Normally, such a dependency is implicit, but we must make + * explicit ones in some cases involving partitioning.) + */ + if (otherObject.classId == object->classId && + otherObject.objectId == object->objectId && + object->objectSubId == 0) + continue; + + /* * Must lock the dependent object before recursing to it. */ AcquireDeletionLock(&otherObject, 0); @@ -1601,8 +1627,10 @@ recordDependencyOnExpr(const ObjectAddress *depender, * As above, but only one relation is expected to be referenced (with * varno = 1 and varlevelsup = 0). Pass the relation OID instead of a * range table. An additional frammish is that dependencies on that - * relation (or its component columns) will be marked with 'self_behavior', - * whereas 'behavior' is used for everything else. + * relation's component columns will be marked with 'self_behavior', + * whereas 'behavior' is used for everything else; also, if 'reverse_self' + * is true, those dependencies are reversed so that the columns are made + * to depend on the table not vice versa. * * NOTE: the caller should ensure that a whole-table dependency on the * specified relation is created separately, if one is needed. In particular, @@ -1615,7 +1643,7 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, Node *expr, Oid relId, DependencyType behavior, DependencyType self_behavior, - bool ignore_self) + bool reverse_self) { find_expr_references_context context; RangeTblEntry rte; @@ -1638,7 +1666,8 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, eliminate_duplicate_dependencies(context.addrs); /* Separate self-dependencies if necessary */ - if (behavior != self_behavior && context.addrs->numrefs > 0) + if ((behavior != self_behavior || reverse_self) && + context.addrs->numrefs > 0) { ObjectAddresses *self_addrs; ObjectAddress *outobj; @@ -1669,11 +1698,23 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, } context.addrs->numrefs = outrefs; - /* Record the self-dependencies */ - if (!ignore_self) + /* Record the self-dependencies with the appropriate direction */ + if (!reverse_self) recordMultipleDependencies(depender, self_addrs->refs, self_addrs->numrefs, self_behavior); + else + { + /* Can't use recordMultipleDependencies, so do it the hard way */ + int selfref; + + for (selfref = 0; selfref < self_addrs->numrefs; selfref++) + { + ObjectAddress *thisobj = self_addrs->refs + selfref; + + recordDependencyOn(thisobj, depender, self_behavior); + } + } free_object_addresses(self_addrs); } diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 56e4d7f1..39a9c235 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -4323,16 +4323,36 @@ StorePartitionKey(Relation rel, } /* - * Anything mentioned in the expressions. We must ignore the column - * references, which will depend on the table itself; there is no separate - * partition key object. + * The partitioning columns are made internally dependent on the table, + * because we cannot drop any of them without dropping the whole table. 
+ * (ATExecDropColumn independently enforces that, but it's not bulletproof + * so we need the dependencies too.) + */ + for (i = 0; i < partnatts; i++) + { + if (partattrs[i] == 0) + continue; /* ignore expressions here */ + + referenced.classId = RelationRelationId; + referenced.objectId = RelationGetRelid(rel); + referenced.objectSubId = partattrs[i]; + + recordDependencyOn(&referenced, &myself, DEPENDENCY_INTERNAL); + } + + /* + * Also consider anything mentioned in partition expressions. External + * references (e.g. functions) get NORMAL dependencies. Table columns + * mentioned in the expressions are handled the same as plain partitioning + * columns, i.e. they become internally dependent on the whole table. */ if (partexprs) recordDependencyOnSingleRelExpr(&myself, (Node *) partexprs, RelationGetRelid(rel), DEPENDENCY_NORMAL, - DEPENDENCY_AUTO, true); + DEPENDENCY_INTERNAL, + true /* reverse the self-deps */ ); /* * We must invalidate the relcache so that the next diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 393d30c4..43ccc2f3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8220,26 +8220,29 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, errmsg("cannot drop system column \"%s\"", colName))); - /* Don't drop inherited columns */ + /* + * Don't drop inherited columns, unless recursing (presumably from a drop + * of the parent column) + */ if (targetatt->attinhcount > 0 && !recursing) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("cannot drop inherited column \"%s\"", colName))); - /* Don't drop columns used in the partition key */ + /* + * Don't drop columns used in the partition key, either. (If we let this + * go through, the key column's dependencies would cause a cascaded drop + * of the whole table, which is surely not what the user expected.) 
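+	 *
+	 * Illustrative sketch (editor's note, not part of the original change;
+	 * the table name is hypothetical):
+	 *
+	 *   CREATE TABLE pt (a int, b text) PARTITION BY RANGE (a);
+	 *   ALTER TABLE pt DROP COLUMN a;
+	 *   -- now rejected with the error reported just below, instead of the
+	 *   -- cascaded drop of the whole table described above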
+ */ if (has_partition_attrs(rel, bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), &is_expr)) { - if (!is_expr) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot drop column named in partition key"))); - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot drop column referenced in partition key expression"))); + errmsg("cannot drop column \"%s\" because it is part of the partition key of relation \"%s\"", + colName, RelationGetRelationName(rel)))); } #ifdef __TBASE__ @@ -10683,14 +10686,10 @@ ATPrepAlterColumnType(List **wqueue, bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), &is_expr)) { - if (!is_expr) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot alter type of column named in partition key"))); - else ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot alter type of column referenced in partition key expression"))); + errmsg("cannot alter column \"%s\" because it is part of the partition key of relation \"%s\"", + colName, RelationGetRelationName(rel)))); } #ifdef __TBASE__ diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 83aa4d05..6af0f85d 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -279,7 +279,7 @@ extern void recordDependencyOnSingleRelExpr(const ObjectAddress *depender, Node *expr, Oid relId, DependencyType behavior, DependencyType self_behavior, - bool ignore_self); + bool reverse_self); extern ObjectClass getObjectClass(const ObjectAddress *object); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index ea00b3ae..9ace1f7a 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3141,11 +3141,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, @@ -3669,9 +3669,9 @@ ERROR: cannot change inheritance of a partition -- partitioned tables; for example, part_5, which is list_parted2's -- partition, is partitioned on b; ALTER TABLE list_parted2 DROP COLUMN b; -ERROR: cannot drop column named in partition key +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" -- dropping non-partition key columns should be allowed on the parent table. 
ALTER TABLE list_parted DROP COLUMN b; SELECT * FROM list_parted; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 744691c9..aa4082c8 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index a958aa64..773bd445 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 9d426e3c..3287e360 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git 
a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index e9bf8784..3290fe55 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -461,6 +461,42 @@ Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) DROP TABLE partitioned, partitioned2; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ -- -- Partitions -- diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 68482d79..d00a5935 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -432,6 +432,39 @@ CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO DROP TABLE partitioned, partitioned2; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; + +create table partitioned ( + a intdom1, + b text +) partition by range (a); + +alter table partitioned drop column a; -- fail + +drop domain intdom1; -- fail, requires cascade + +drop domain intdom1 cascade; + +table partitioned; -- gone + +-- likewise for columns used in partition expressions +create domain intdom1 as int; + +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); + +alter table partitioned drop column a; -- fail + +drop domain intdom1; -- fail, requires cascade + +drop domain intdom1 cascade; + +table partitioned; -- gone + + -- -- Partitions -- From 3e54f3574fad0481d1946d019a074e0a33e39d69 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:49:42 +0800 Subject: [PATCH 288/578] Check that partitions are not in use when dropping constraints. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 56 +++++++++++++++++---- src/test/regress/expected/foreign_key.out | 15 ++++++ src/test/regress/expected/foreign_key_1.out | 15 ++++++ src/test/regress/expected/foreign_key_2.out | 15 ++++++ src/test/regress/sql/foreign_key.sql | 12 +++++ 5 files changed, 102 insertions(+), 11 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 43ccc2f3..31c4aa91 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -405,6 +405,7 @@ static void ATSimplePermissions(Relation rel, int allowed_targets); static void ATWrongRelkindError(Relation rel, int allowed_targets); static void ATSimpleRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, bool recurse, LOCKMODE lockmode); +static void ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode); static void ATTypedTableRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, LOCKMODE lockmode); static List *find_typed_table_dependencies(Oid typeOid, const char *typeName, @@ -4151,8 +4152,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), /* translator: first %s is a SQL command, eg ALTER TABLE */ - errmsg("cannot %s \"%s\" because " - "it is being used by active queries in this session", + errmsg("cannot %s \"%s\" because it is being used by active queries in this session", stmt, RelationGetRelationName(rel)))); if (rel->rd_rel->relkind != RELKIND_INDEX && @@ -4160,8 +4160,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), /* translator: first %s is a SQL command, eg ALTER TABLE */ - errmsg("cannot %s \"%s\" because " - "it has pending trigger events", + errmsg("cannot %s \"%s\" because it has pending trigger events", stmt, RelationGetRelationName(rel)))); } @@ -4790,16 +4789,19 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; case AT_AddIdentity: ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ pass = AT_PASS_ADD_CONSTR; break; - case AT_DropIdentity: - ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); - pass = AT_PASS_DROP; - break; case AT_SetIdentity: ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ pass = AT_PASS_COL_ATTRS; break; + case AT_DropIdentity: + ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ + pass = AT_PASS_DROP; + break; case AT_DropNotNull: /* ALTER COLUMN DROP NOT NULL */ ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); ATPrepDropNotNull(rel, recurse, recursing); @@ -4861,7 +4863,8 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; case AT_DropConstraint: /* DROP CONSTRAINT */ ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); - /* Recursion occurs during execution phase */ + ATCheckPartitionsNotInUse(rel, lockmode); + /* Other recursion occurs during execution phase */ /* No command-specific prep needed except saving recurse flag */ if (recurse) cmd->subtype = AT_DropConstraintRecurse; @@ -6477,8 +6480,9 @@ ATSimpleRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, bool recurse, LOCKMODE lockmode) {// #lizard forgives /* - * Propagate to children if desired. Only plain tables and foreign tables - * have children, so no need to search for other relkinds. 
+ * Propagate to children if desired. Only plain tables, foreign tables + * and partitioned tables have children, so no need to search for other + * relkinds. */ if (recurse && (rel->rd_rel->relkind == RELKIND_RELATION || @@ -6550,6 +6554,36 @@ ATSimpleRecursion(List **wqueue, Relation rel, } } +/* + * Obtain list of partitions of the given table, locking them all at the given + * lockmode and ensuring that they all pass CheckTableNotInUse. + * + * This function is a no-op if the given relation is not a partitioned table; + * in particular, nothing is done if it's a legacy inheritance parent. + */ +static void +ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode) +{ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + List *inh; + ListCell *cell; + + inh = find_all_inheritors(RelationGetRelid(rel), lockmode, NULL); + /* first element is the parent rel; must ignore it */ + for_each_cell(cell, lnext(list_head(inh))) + { + Relation childrel; + + /* find_all_inheritors already got lock */ + childrel = table_open(lfirst_oid(cell), NoLock); + CheckTableNotInUse(childrel, "ALTER TABLE"); + table_close(childrel, NoLock); + } + list_free(inh); + } +} + /* * ATTypedTableRecursion * diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 5a958f3a..c3d40a45 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1443,3 +1443,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/expected/foreign_key_1.out b/src/test/regress/expected/foreign_key_1.out index cb069e3a..2d28c086 100644 --- a/src/test/regress/expected/foreign_key_1.out +++ b/src/test/regress/expected/foreign_key_1.out @@ -1438,3 +1438,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". 
DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 27e9026d..e3b7210b 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1450,3 +1450,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 8c887eb9..a1d62828 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1071,3 +1071,15 @@ INSERT INTO fk_partitioned_fk VALUES (1); ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; DELETE FROM fk_notpartitioned_pk WHERE a = 1; DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; + +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +COMMIT; +DROP SCHEMA fkpart8 CASCADE; \ No newline at end of file From 1f127fe76c8020f68bbcd02140df46bf9a8a9828 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 12:56:51 +0800 Subject: [PATCH 289/578] Fix infelicities in describeOneTableDetails' partitioned-table handling. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/bin/psql/describe.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 4fd3864b..c67d2570 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1877,42 +1877,40 @@ describeOneTableDetails(const char *schemaname, } /* Make footers */ - if (pset.sversion >= 100000) + + if (tableinfo.ispartition) { - /* Get the partition information */ + /* Footer information for a partition child table */ PGresult *result; - char *parent_name; - char *partdef; - char *partconstraintdef = NULL; printfPQExpBuffer(&buf, "SELECT inhparent::pg_catalog.regclass,\n" - " pg_catalog.pg_get_expr(c.relpartbound, inhrelid)"); + " pg_catalog.pg_get_expr(c.relpartbound, c.oid)"); /* If verbose, also request the partition constraint definition */ if (verbose) - appendPQExpBuffer(&buf, - ",\n pg_catalog.pg_get_partition_constraintdef(inhrelid)"); + appendPQExpBufferStr(&buf, + ",\n pg_catalog.pg_get_partition_constraintdef(c.oid)"); appendPQExpBuffer(&buf, "\nFROM pg_catalog.pg_class c" " JOIN pg_catalog.pg_inherits i" " ON c.oid = inhrelid" - "\nWHERE c.oid = '%s' AND c.relispartition;", oid); + "\nWHERE c.oid = '%s';", oid); result = PSQLexec(buf.data); if (!result) goto error_return; if (PQntuples(result) > 0) { - parent_name = PQgetvalue(result, 0, 0); - partdef = PQgetvalue(result, 0, 1); - - if (PQnfields(result) == 3 && !PQgetisnull(result, 0, 2)) - partconstraintdef = PQgetvalue(result, 0, 2); + char *parent_name = PQgetvalue(result, 0, 0); + char *partdef = PQgetvalue(result, 0, 1); + char *partconstraintdef = NULL; printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); + if (!PQgetisnull(result, 0, 2)) + partconstraintdef = PQgetvalue(result, 0, 2); /* If there isn't any constraint, show that explicitly */ if (partconstraintdef == NULL || partconstraintdef[0] == '\0') printfPQExpBuffer(&tmpbuf, _("No partition constraint")); @@ -1921,26 +1919,29 @@ describeOneTableDetails(const char *schemaname, partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - PQclear(result); } + PQclear(result); } if (tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { - /* Get the partition key information */ + /* Footer information for a partitioned table (partitioning parent) */ PGresult *result; - char *partkeydef; printfPQExpBuffer(&buf, "SELECT pg_catalog.pg_get_partkeydef('%s'::pg_catalog.oid);", oid); result = PSQLexec(buf.data); - if (!result || PQntuples(result) != 1) + if (!result) goto error_return; - partkeydef = PQgetvalue(result, 0, 0); + if (PQntuples(result) == 1) + { + char *partkeydef = PQgetvalue(result, 0, 0); + printfPQExpBuffer(&tmpbuf, _("Partition key: %s"), partkeydef); printTableAddFooter(&cont, tmpbuf.data); + } PQclear(result); } From 9813e44a44ee359b8799dbf43f55eaed09b06972 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 15:12:56 +0800 Subject: [PATCH 290/578] Improve pruning of a default partition.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 218 ++++++++---------- src/include/partitioning/partbounds.h | 1 - src/test/regress/expected/partition_prune.out | 20 +- src/test/regress/sql/partition_prune.sql | 1 + 4 files changed, 111 insertions(+), 129 deletions(-) diff --git a/src/backend/partitioning/partprune.c 
b/src/backend/partitioning/partprune.c index 7adf41f3..5d84bf2a 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -269,6 +269,7 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) PruneStepResult **results, *final_result; ListCell *lc; + bool scan_default; /* If there are no pruning steps then all partitions match. */ if (num_steps == 0) @@ -317,30 +318,39 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) Assert(final_result != NULL); i = -1; result = NULL; + scan_default = final_result->scan_default; while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0) { int partindex = context->boundinfo->indexes[i]; - /* - * In range and hash partitioning cases, some slots may contain -1, - * indicating that no partition has been defined to accept a given - * range of data or for a given remainder, respectively. The default - * partition, if any, in case of range partitioning, will be added to - * the result, because the specified range still satisfies the query's - * conditions. - */ - if (partindex >= 0) - result = bms_add_member(result, partindex); + if (partindex < 0) + { + /* + * In range partitioning cases, if a partition index is -1 it + * means that the bound at the offset is the upper bound for a + * range not covered by any partition (other than a possible + * default partition). In hash partitioning, the same means no + * partition has been defined for the corresponding remainder + * value. + * + * In either case, the value is still part of the queried range of + * values, so mark to scan the default partition if one exists. + */ + scan_default |= partition_bound_has_default(context->boundinfo); + continue; + } + + result = bms_add_member(result, partindex); } - /* Add the null and/or default partition if needed and if present. */ + /* Add the null and/or default partition if needed and present. */ if (final_result->scan_null) { Assert(context->strategy == PARTITION_STRATEGY_LIST); Assert(partition_bound_accepts_nulls(context->boundinfo)); result = bms_add_member(result, context->boundinfo->null_index); } - if (final_result->scan_default) + if (scan_default) { Assert(context->strategy == PARTITION_STRATEGY_LIST || context->strategy == PARTITION_STRATEGY_RANGE); @@ -1809,6 +1819,11 @@ get_matching_hash_bounds(PartitionPruneContext *context, * get_matching_list_bounds * Determine the offsets of list bounds matching the specified value, * according to the semantics of the given operator strategy + * + * scan_default will be set in the returned struct, if the default partition + * needs to be scanned, provided one exists at all. scan_null will be set if + * the special null-accepting partition needs to be scanned. + * * 'opstrategy' if non-zero must be a btree strategy number. * * 'value' contains the value to use for pruning. @@ -2008,8 +2023,13 @@ get_matching_list_bounds(PartitionPruneContext *context, * Each datum whose offset is in result is to be treated as the upper bound of * the partition that will contain the desired values. * - * If default partition needs to be scanned for given values, set scan_default - * in result if present. + * scan_default is set in the returned struct if a default partition exists + * and we're absolutely certain that it needs to be scanned. 
We do *not* set + * it just because values match portions of the key space uncovered by + * partitions other than default (space which we normally assume to belong to + * the default partition): the final set of bounds obtained after combining + * multiple pruning steps might exclude it, so we infer its inclusion + * elsewhere. * * 'opstrategy' if non-zero must be a btree strategy number. * @@ -2035,8 +2055,7 @@ get_matching_range_bounds(PartitionPruneContext *context, int *partindices = boundinfo->indexes; int off, minoff, - maxoff, - i; + maxoff; bool is_equal; bool inclusive = false; @@ -2066,12 +2085,15 @@ get_matching_range_bounds(PartitionPruneContext *context, */ if (nvalues == 0) { + /* ignore key space not covered by any partitions */ if (partindices[minoff] < 0) minoff++; if (partindices[maxoff] < 0) maxoff--; result->scan_default = partition_bound_has_default(boundinfo); + Assert(partindices[minoff] >= 0 && + partindices[maxoff] >= 0); result->bound_offsets = bms_add_range(NULL, minoff, maxoff); return result; @@ -2099,11 +2121,7 @@ get_matching_range_bounds(PartitionPruneContext *context, if (nvalues == partnatts) { /* There can only be zero or one matching partition. */ - if (partindices[off + 1] >= 0) - result->bound_offsets = bms_make_singleton(off + 1); - else - result->scan_default = - partition_bound_has_default(boundinfo); + result->bound_offsets = bms_make_singleton(off + 1); return result; } else @@ -2191,57 +2209,21 @@ get_matching_range_bounds(PartitionPruneContext *context, maxoff = off + 1; } - /* - * Skip if minoff/maxoff are actually the upper bound of a - * un-assigned portion of values. - */ - if (partindices[minoff] < 0 && minoff < boundinfo->ndatums) - minoff++; - if (partindices[maxoff] < 0 && maxoff >= 1) - maxoff--; - - /* - * There may exist a range of values unassigned to any - * non-default partition between the datums at minoff and - * maxoff. Add the default partition in that case. - */ - if (partition_bound_has_default(boundinfo)) - { - for (i = minoff; i <= maxoff; i++) - { - if (partindices[i] < 0) - { - result->scan_default = true; - break; - } - } - } - Assert(minoff >= 0 && maxoff >= 0); result->bound_offsets = bms_add_range(NULL, minoff, maxoff); } - else if (off >= 0) /* !is_equal */ + else { /* * The lookup value falls in the range between some bounds in * boundinfo. 'off' would be the offset of the greatest bound * that is <= lookup value, so add off + 1 to the result * instead as the offset of the upper bound of the only - * partition that may contain the lookup value. - */ - if (partindices[off + 1] >= 0) - result->bound_offsets = bms_make_singleton(off + 1); - else - result->scan_default = - partition_bound_has_default(boundinfo); - } - else - { - /* - * off < 0: the lookup value is smaller than all bounds, so - * only the default partition qualifies, if there is one. + * partition that may contain the lookup value. If 'off' is + * -1 indicating that all bounds are greater, then we simply + * end up adding the first bound's offset, that is, 0. */ - result->scan_default = partition_bound_has_default(boundinfo); + result->bound_offsets = bms_make_singleton(off + 1); } return result; @@ -2312,16 +2294,18 @@ get_matching_range_bounds(PartitionPruneContext *context, minoff = inclusive ? off : off + 1; } - - /* - * lookup value falls in the range between some bounds in - * boundinfo. 
off would be the offset of the greatest bound - * that is <= lookup value, so add off + 1 to the result - * instead as the offset of the upper bound of the smallest - * partition that may contain the lookup value. - */ else + { + + /* + * lookup value falls in the range between some bounds in + * boundinfo. off would be the offset of the greatest + * bound that is <= lookup value, so add off + 1 to the + * result instead as the offset of the upper bound of the + * smallest partition that may contain the lookup value. + */ minoff = off + 1; + } } break; @@ -2339,16 +2323,7 @@ get_matching_range_bounds(PartitionPruneContext *context, boundinfo, nvalues, values, &is_equal); - if (off < 0) - { - /* - * All bounds are greater than the key, so we could only - * expect to find the lookup key in the default partition. - */ - result->scan_default = partition_bound_has_default(boundinfo); - return result; - } - else + if (off >= 0) { /* * See the comment above. @@ -2396,6 +2371,14 @@ get_matching_range_bounds(PartitionPruneContext *context, else maxoff = off; } + else + { + /* + * 'off' is -1 indicating that all bounds are greater, so just + * set the first bound's offset as maxoff. + */ + maxoff = off + 1; + } break; default: @@ -2403,58 +2386,43 @@ get_matching_range_bounds(PartitionPruneContext *context, break; } + Assert(minoff >= 0 && minoff <= boundinfo->ndatums); + Assert(maxoff >= 0 && maxoff <= boundinfo->ndatums); + /* - * Skip a gap and when doing so, check if the bound contains a finite - * value to decide if we need to add the default partition. If it's an - * infinite bound, we need not add the default partition, as having an - * infinite bound means the partition in question catches any values that - * would otherwise be in the default partition. + * If the smallest partition to return has MINVALUE (negative infinity) as + * its lower bound, increment it to point to the next finite bound + * (supposedly its upper bound), so that we don't advertently end up + * scanning the default partition. */ - if (partindices[minoff] < 0) + if (minoff < boundinfo->ndatums && partindices[minoff] < 0) { int lastkey = nvalues - 1; - if (minoff >= 0 && - minoff < boundinfo->ndatums && - boundinfo->kind[minoff][lastkey] == - PARTITION_RANGE_DATUM_VALUE) - result->scan_default = partition_bound_has_default(boundinfo); - - minoff++; + if (boundinfo->kind[minoff][lastkey] == + PARTITION_RANGE_DATUM_MINVALUE) + { + minoff++; + Assert(boundinfo->indexes[minoff] >= 0); + } } /* - * Skip a gap. See the above comment about how we decide whether or or - * not to scan the default partition based whether the datum that will - * become the maximum datum is finite or not. + * If the previous greatest partition has MAXVALUE (positive infinity) as + * its upper bound (something only possible to do with multi-column range + * partitioning), we scan switch to it as the greatest partition to + * return. Again, so that we don't advertently end up scanning the + * default partition. */ if (maxoff >= 1 && partindices[maxoff] < 0) { int lastkey = nvalues - 1; - if (maxoff >= 0 && - maxoff <= boundinfo->ndatums && - boundinfo->kind[maxoff - 1][lastkey] == - PARTITION_RANGE_DATUM_VALUE) - result->scan_default = partition_bound_has_default(boundinfo); - - maxoff--; - } - - if (partition_bound_has_default(boundinfo)) - { - /* - * There may exist a range of values unassigned to any non-default - * partition between the datums at minoff and maxoff. Add the default - * partition in that case. 
- */ - for (i = minoff; i <= maxoff; i++) + if (boundinfo->kind[maxoff - 1][lastkey] == + PARTITION_RANGE_DATUM_MAXVALUE) { - if (partindices[i] < 0) - { - result->scan_default = true; - break; - } + maxoff--; + Assert(boundinfo->indexes[maxoff] >= 0); } } @@ -2599,14 +2567,24 @@ perform_pruning_combine_step(PartitionPruneContext *context, /* * A combine step without any source steps is an indication to not perform - * any partition pruning, we just return all partitions. + * any partition pruning. Return all datum indexes in that case. */ result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); if (list_length(cstep->source_stepids) == 0) { PartitionBoundInfo boundinfo = context->boundinfo; + int rangemax; + + /* + * Add all valid offsets into the boundinfo->indexes array. For range + * partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1) + * valid entries; otherwise there are boundinfo->ndatums. + */ + rangemax = context->strategy == PARTITION_STRATEGY_RANGE ? + boundinfo->ndatums : boundinfo->ndatums - 1; - result->bound_offsets = bms_add_range(NULL, 0, boundinfo->ndatums - 1); + result->bound_offsets = + bms_add_range(result->bound_offsets, 0, rangemax); result->scan_default = partition_bound_has_default(boundinfo); result->scan_null = partition_bound_accepts_nulls(boundinfo); return result; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index c76014d4..45df3fb8 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -52,7 +52,6 @@ * pointed by remainder produced when hash value of the datum-tuple is divided * by the greatest modulus. */ - typedef struct PartitionBoundInfoData { char strategy; /* hash, list or range? */ diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index a08f303d..ff388472 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -517,15 +517,13 @@ explain (costs off) select * from rlp where a <= 31; Filter: (a <= 31) -> Seq Scan on rlp5_1 Filter: (a <= 31) - -> Seq Scan on rlp5_default - Filter: (a <= 31) -> Seq Scan on rlp_default_10 Filter: (a <= 31) -> Seq Scan on rlp_default_30 Filter: (a <= 31) -> Seq Scan on rlp_default_default Filter: (a <= 31) -(29 rows) +(27 rows) explain (costs off) select * from rlp where a = 1 or a = 7; QUERY PLAN @@ -573,11 +571,7 @@ explain (costs off) select * from rlp where a > 20 and a < 27; Filter: ((a > 20) AND (a < 27)) -> Seq Scan on rlp4_2 Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp4_default - Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp_default_default - Filter: ((a > 20) AND (a < 27)) -(9 rows) +(5 rows) explain (costs off) select * from rlp where a = 29; QUERY PLAN @@ -603,6 +597,16 @@ explain (costs off) select * from rlp where a >= 29; Filter: (a >= 29) (11 rows) +explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); + QUERY PLAN +------------------------------------------------------ + Append + -> Seq Scan on rlp1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) + -> Seq Scan on rlp4_1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) +(5 rows) + -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ QUERY PLAN diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 1cd151e2..55fda489 100644 --- a/src/test/regress/sql/partition_prune.sql +++ 
b/src/test/regress/sql/partition_prune.sql @@ -83,6 +83,7 @@ explain (costs off) select * from rlp where a = 1 or b = 'ab'; explain (costs off) select * from rlp where a > 20 and a < 27; explain (costs off) select * from rlp where a = 29; explain (costs off) select * from rlp where a >= 29; +explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ From c072d696d1b205959f88bb169ab62dbaf44fb393 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 15:25:18 +0800 Subject: [PATCH 291/578] Fix dependency handling of column drop with partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 43 ++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 31c4aa91..9ab2e3ec 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -447,7 +447,8 @@ static void ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool rec static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *colName, DropBehavior behavior, bool recurse, bool recursing, - bool missing_ok, LOCKMODE lockmode); + bool missing_ok, LOCKMODE lockmode, + ObjectAddresses *addrs); static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel, IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode); static ObjectAddress ATExecAddConstraint(List **wqueue, @@ -5209,12 +5210,14 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, case AT_DropColumn: /* DROP COLUMN */ address = ATExecDropColumn(wqueue, rel, cmd->name, cmd->behavior, false, false, - cmd->missing_ok, lockmode); + cmd->missing_ok, lockmode, + NULL); break; case AT_DropColumnRecurse: /* DROP COLUMN with recursion */ address = ATExecDropColumn(wqueue, rel, cmd->name, cmd->behavior, true, false, - cmd->missing_ok, lockmode); + cmd->missing_ok, lockmode, + NULL); break; case AT_AddIndex: /* ADD INDEX */ address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, false, @@ -8203,14 +8206,23 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing, } /* - * Return value is the address of the dropped column. + * Drops column 'colName' from relation 'rel' and returns the address of the + * dropped column. The column is also dropped (or marked as no longer + * inherited from relation) from the relation's inheritance children, if any. + * + * In the recursive invocations for inheritance child relations, instead of + * dropping the column directly (if to be dropped at all), its object address + * is added to 'addrs', which must be non-NULL in such invocations. All + * columns are dropped at the same time after all the children have been + * checked recursively. 
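+ *
+ * Rough call pattern (editor's sketch of the code below, for a parent with
+ * one child that also drops the column):
+ *
+ *   ATExecDropColumn(parent, recursing=false, addrs=NULL)
+ *     addrs = new_object_addresses();
+ *     ATExecDropColumn(child, recursing=true, addrs);   collects child column
+ *     add_exact_object_address(parent column, addrs);
+ *     performMultipleDeletions(addrs, behavior, 0);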
*/ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *colName, DropBehavior behavior, bool recurse, bool recursing, - bool missing_ok, LOCKMODE lockmode) -{// #lizard forgives + bool missing_ok, LOCKMODE lockmode, + ObjectAddresses *addrs) +{ HeapTuple tuple; Form_pg_attribute targetatt; AttrNumber attnum; @@ -8222,6 +8234,11 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, if (recursing) ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); + /* Initialize addrs on the first invocation */ + Assert(!recursing || addrs != NULL); + if (!recursing) + addrs = new_object_addresses(); + /* * get the number of the attribute */ @@ -8362,7 +8379,7 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, /* Time to delete this child column, too */ ATExecDropColumn(wqueue, childrel, colName, behavior, true, true, - false, lockmode); + false, lockmode, addrs); } else { @@ -8421,14 +8438,18 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, heap_close(attr_rel, RowExclusiveLock); } - /* - * Perform the actual column deletion - */ + /* Add object to delete */ object.classId = RelationRelationId; object.objectId = RelationGetRelid(rel); object.objectSubId = attnum; + add_exact_object_address(&object, addrs); - performDeletion(&object, behavior, 0); + if (!recursing) + { + /* Recursion has ended, drop everything that was collected */ + performMultipleDeletions(addrs, behavior, 0); + free_object_addresses(addrs); + } /* * If we dropped the OID column, must adjust pg_class.relhasoids and tell From 26b10a96796d14d7b62b1749edec738b3c75f36f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 17:04:48 +0800 Subject: [PATCH 292/578] Simplify index_[constraint_]create API. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/index.c | 107 ++++++++++++++++--------------- src/backend/catalog/toasting.c | 3 +- src/backend/commands/indexcmds.c | 33 +++++++--- src/backend/commands/tablecmds.c | 13 ++-- src/include/catalog/index.h | 29 +++++---- 5 files changed, 105 insertions(+), 80 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 7f01e417..db5d16ee 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -719,19 +719,25 @@ UpdateIndexRelation(Oid indexoid, * classObjectId: array of index opclass OIDs, one per index column * coloptions: array of per-index-column indoption settings * reloptions: AM-specific options - * isprimary: index is a PRIMARY KEY - * isconstraint: index is owned by PRIMARY KEY, UNIQUE, or EXCLUSION constraint - * deferrable: constraint is DEFERRABLE - * initdeferred: constraint is INITIALLY DEFERRED - * allow_system_table_mods: allow table to be a system catalog - * skip_build: true to skip the index_build() step for the moment; caller - * must do it later (typically via reindex_index()) - * concurrent: if true, do not lock the table against writers. The index - * will be marked "invalid" and the caller must take additional steps + * flags: bitmask that can include any combination of these bits: + * INDEX_CREATE_IS_PRIMARY + * the index is a primary key + * INDEX_CREATE_ADD_CONSTRAINT: + * invoke index_constraint_create also + * INDEX_CREATE_SKIP_BUILD: + * skip the index_build() step for the moment; caller must do it + * later (typically via reindex_index()) + * INDEX_CREATE_CONCURRENT: + * do not lock the table against writers. 
The index will be + * marked "invalid" and the caller must take additional steps * to fix it up. + * INDEX_CREATE_IF_NOT_EXISTS: + * do not throw an error if a relation with the same name + * already exists. + * constr_flags: flags passed to index_constraint_create + * (only if INDEX_CREATE_ADD_CONSTRAINT is set) + * allow_system_table_mods: allow table to be a system catalog * is_internal: if true, post creation hook for new index - * if_not_exists: if true, do not throw an error if a relation with - * the same name already exists. * * Returns the OID of the created index. */ @@ -748,16 +754,11 @@ index_create(Relation heapRelation, Oid *classObjectId, int16 *coloptions, Datum reloptions, - bool isprimary, - bool isconstraint, - bool deferrable, - bool initdeferred, + bits16 flags, + bits16 constr_flags, bool allow_system_table_mods, - bool skip_build, - bool concurrent, - bool is_internal, - bool if_not_exists) -{// #lizard forgives + bool is_internal) +{ Oid heapRelationId = RelationGetRelid(heapRelation); Relation pg_class; Relation indexRelation; @@ -768,6 +769,12 @@ index_create(Relation heapRelation, Oid namespaceId; int i; char relpersistence; + bool isprimary = (flags & INDEX_CREATE_IS_PRIMARY) != 0; + bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0; + + /* constraint flags can only be set when a constraint is requested */ + Assert((constr_flags == 0) || + ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0)); is_exclusion = (indexInfo->ii_ExclusionOps != NULL); @@ -833,7 +840,7 @@ index_create(Relation heapRelation, if (get_relname_relid(indexRelationName, namespaceId)) { - if (if_not_exists) + if ((flags & INDEX_CREATE_IF_NOT_EXISTS) != 0) { ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), @@ -956,7 +963,7 @@ index_create(Relation heapRelation, UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo, collationObjectId, classObjectId, coloptions, isprimary, is_exclusion, - !deferrable, + (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) == 0, !concurrent); /* @@ -982,7 +989,7 @@ index_create(Relation heapRelation, myself.objectId = indexRelationId; myself.objectSubId = 0; - if (isconstraint) + if ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0) { char constraintType; @@ -1003,11 +1010,7 @@ index_create(Relation heapRelation, indexInfo, indexRelationName, constraintType, - deferrable, - initdeferred, - false, /* already marked primary */ - false, /* pg_index entry is OK */ - false, /* no old dependencies */ + constr_flags, allow_system_table_mods, is_internal); } @@ -1044,10 +1047,6 @@ index_create(Relation heapRelation, recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); } - - /* Non-constraint indexes can't be deferrable */ - Assert(!deferrable); - Assert(!initdeferred); } /* Store dependency on collations */ @@ -1098,9 +1097,7 @@ index_create(Relation heapRelation, else { /* Bootstrap mode - assert we weren't asked for constraint support */ - Assert(!isconstraint); - Assert(!deferrable); - Assert(!initdeferred); + Assert((flags & INDEX_CREATE_ADD_CONSTRAINT) == 0); } /* Post creation hook for new index */ @@ -1128,15 +1125,16 @@ index_create(Relation heapRelation, * If this is bootstrap (initdb) time, then we don't actually fill in the * index yet. We'll be creating more indexes and classes later, so we * delay filling them in until just before we're done with bootstrapping. - * Similarly, if the caller specified skip_build then filling the index is - * delayed till later (ALTER TABLE can save work in some cases with this). 
- * Otherwise, we call the AM routine that constructs the index. + * Similarly, if the caller specified to skip the build then filling the + * index is delayed till later (ALTER TABLE can save work in some cases + * with this). Otherwise, we call the AM routine that constructs the + * index. */ if (IsBootstrapProcessingMode()) { index_register(heapRelationId, indexRelationId, indexInfo); } - else if (skip_build) + else if ((flags & INDEX_CREATE_SKIP_BUILD) != 0) { /* * Caller is responsible for filling the index later on. However, @@ -1176,12 +1174,13 @@ index_create(Relation heapRelation, * constraintName: what it say (generally, should match name of index) * constraintType: one of CONSTRAINT_PRIMARY, CONSTRAINT_UNIQUE, or * CONSTRAINT_EXCLUSION - * deferrable: constraint is DEFERRABLE - * initdeferred: constraint is INITIALLY DEFERRED - * mark_as_primary: if true, set flags to mark index as primary key - * update_pgindex: if true, update pg_index row (else caller's done that) - * remove_old_dependencies: if true, remove existing dependencies of index - * on table's columns + * flags: bitmask that can include any combination of these bits: + * INDEX_CONSTR_CREATE_MARK_AS_PRIMARY: index is a PRIMARY KEY + * INDEX_CONSTR_CREATE_DEFERRABLE: constraint is DEFERRABLE + * INDEX_CONSTR_CREATE_INIT_DEFERRED: constraint is INITIALLY DEFERRED + * INDEX_CONSTR_CREATE_UPDATE_INDEX: update the pg_index row + * INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS: remove existing dependencies + * of index on table's columns * allow_system_table_mods: allow table to be a system catalog * is_internal: index is constructed due to internal process */ @@ -1191,11 +1190,7 @@ index_constraint_create(Relation heapRelation, IndexInfo *indexInfo, const char *constraintName, char constraintType, - bool deferrable, - bool initdeferred, - bool mark_as_primary, - bool update_pgindex, - bool remove_old_dependencies, + bits16 constr_flags, bool allow_system_table_mods, bool is_internal) {// #lizard forgives @@ -1203,6 +1198,13 @@ index_constraint_create(Relation heapRelation, ObjectAddress myself, referenced; Oid conOid; + bool deferrable; + bool initdeferred; + bool mark_as_primary; + + deferrable = (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) != 0; + initdeferred = (constr_flags & INDEX_CONSTR_CREATE_INIT_DEFERRED) != 0; + mark_as_primary = (constr_flags & INDEX_CONSTR_CREATE_MARK_AS_PRIMARY) != 0; /* constraint creation support doesn't work while bootstrapping */ Assert(!IsBootstrapProcessingMode()); @@ -1229,7 +1231,7 @@ index_constraint_create(Relation heapRelation, * has any expressions or predicate, but we'd never be turning such an * index into a UNIQUE or PRIMARY KEY constraint. */ - if (remove_old_dependencies) + if (constr_flags & INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS) deleteDependencyRecordsForClass(RelationRelationId, indexRelationId, RelationRelationId, DEPENDENCY_AUTO); @@ -1334,7 +1336,8 @@ index_constraint_create(Relation heapRelation, * is a risk that concurrent readers of the table will miss seeing this * index at all. 
*/ - if (update_pgindex && (mark_as_primary || deferrable)) + if ((constr_flags & INDEX_CONSTR_CREATE_UPDATE_INDEX) && + (mark_as_primary || deferrable)) { Relation pg_index; HeapTuple indexTuple; diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index d908bfc3..325b72e9 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -396,8 +396,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, - true, false, false, false, - true, false, false, true, false); + INDEX_CREATE_IS_PRIMARY, 0, true, true); heap_close(toast_rel, NoLock); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 4596a9f4..af0b9947 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -340,6 +340,8 @@ DefineIndex(Oid relationId, Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; + bits16 flags; + bits16 constr_flags; int numberOfAttributes; TransactionId limitXmin; VirtualTransactionId *old_snapshots; @@ -755,20 +757,35 @@ DefineIndex(Oid relationId, Assert(!OidIsValid(stmt->oldNode) || (skip_build && !stmt->concurrent)); /* - * Make the catalog entries for the index, including constraints. Then, if - * not skip_build || concurrent, actually build the index. + * Make the catalog entries for the index, including constraints. This + * step also actually builds the index, except if caller requested not to + * or in concurrent mode, in which case it'll be done later. */ + flags = constr_flags = 0; + if (stmt->isconstraint) + flags |= INDEX_CREATE_ADD_CONSTRAINT; + if (skip_build || stmt->concurrent) + flags |= INDEX_CREATE_SKIP_BUILD; + if (stmt->if_not_exists) + flags |= INDEX_CREATE_IF_NOT_EXISTS; + if (stmt->concurrent) + flags |= INDEX_CREATE_CONCURRENT; + if (stmt->primary) + flags |= INDEX_CREATE_IS_PRIMARY; + + if (stmt->deferrable) + constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; + if (stmt->initdeferred) + constr_flags |= INDEX_CONSTR_CREATE_INIT_DEFERRED; + indexRelationId = index_create(rel, indexRelationName, indexRelationId, stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, - coloptions, reloptions, stmt->primary, - stmt->isconstraint, stmt->deferrable, stmt->initdeferred, - allowSystemTableMods, - skip_build || stmt->concurrent, - stmt->concurrent, !check_rights, - stmt->if_not_exists); + coloptions, reloptions, + flags, constr_flags, + allowSystemTableMods, !check_rights); ObjectAddressSet(address, RelationRelationId, indexRelationId); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 9ab2e3ec..c6e42c3d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8630,6 +8630,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, char *constraintName; char constraintType; ObjectAddress address; + bits16 flags; Assert(IsA(stmt, IndexStmt)); Assert(OidIsValid(index_oid)); @@ -8674,16 +8675,18 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, constraintType = CONSTRAINT_UNIQUE; /* Create the catalog entries for the constraint */ + flags = INDEX_CONSTR_CREATE_UPDATE_INDEX | + INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS | + (stmt->initdeferred ? INDEX_CONSTR_CREATE_INIT_DEFERRED : 0) | + (stmt->deferrable ? INDEX_CONSTR_CREATE_DEFERRABLE : 0) | + (stmt->primary ? 
INDEX_CONSTR_CREATE_MARK_AS_PRIMARY : 0); + address = index_constraint_create(rel, index_oid, indexInfo, constraintName, constraintType, - stmt->deferrable, - stmt->initdeferred, - stmt->primary, - true, /* update pg_index */ - true, /* remove old dependencies */ + flags, allowSystemTableMods, false); /* is_internal */ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3afe88f8..4928dfd1 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -103,6 +103,12 @@ extern void index_check_primary_key(Relation heapRel, bool is_alter_table, IndexStmt *stmt); +#define INDEX_CREATE_IS_PRIMARY (1 << 0) +#define INDEX_CREATE_ADD_CONSTRAINT (1 << 1) +#define INDEX_CREATE_SKIP_BUILD (1 << 2) +#define INDEX_CREATE_CONCURRENT (1 << 3) +#define INDEX_CREATE_IF_NOT_EXISTS (1 << 4) + extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, @@ -115,26 +121,23 @@ extern Oid index_create(Relation heapRelation, Oid *classObjectId, int16 *coloptions, Datum reloptions, - bool isprimary, - bool isconstraint, - bool deferrable, - bool initdeferred, + bits16 flags, + bits16 constr_flags, bool allow_system_table_mods, - bool skip_build, - bool concurrent, - bool is_internal, - bool if_not_exists); + bool is_internal); + +#define INDEX_CONSTR_CREATE_MARK_AS_PRIMARY (1 << 0) +#define INDEX_CONSTR_CREATE_DEFERRABLE (1 << 1) +#define INDEX_CONSTR_CREATE_INIT_DEFERRED (1 << 2) +#define INDEX_CONSTR_CREATE_UPDATE_INDEX (1 << 3) +#define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4) extern ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, IndexInfo *indexInfo, const char *constraintName, char constraintType, - bool deferrable, - bool initdeferred, - bool mark_as_primary, - bool update_pgindex, - bool remove_old_dependencies, + bits16 constr_flags, bool allow_system_table_mods, bool is_internal); From 521f269f55bd55e22da864eae943816ebf9cf6ee Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 19:59:12 +0800 Subject: [PATCH 293/578] Local partitioned indexes. 
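The feature added by this patch makes CREATE INDEX on a partitioned table recurse to every partition, and lets an index that already exists on a partition be attached to the parent index afterwards. A minimal sketch of the user-visible behavior, with hypothetical table and index names (distribution clauses omitted):

    CREATE TABLE measurement (city_id int, logdate date)
        PARTITION BY RANGE (logdate);
    CREATE TABLE measurement_y2020 PARTITION OF measurement
        FOR VALUES FROM ('2020-01-01') TO ('2021-01-01');

    -- Recurses: a matching index is created on (or attached for) each
    -- existing partition, and future partitions get one automatically.
    CREATE INDEX measurement_logdate_idx ON measurement (logdate);

    -- ON ONLY creates just the parent index, marked invalid; attaching a
    -- matching index for every partition turns it valid.
    CREATE INDEX measurement_city_idx ON ONLY measurement (city_id);
    CREATE INDEX measurement_y2020_city_idx ON measurement_y2020 (city_id);
    ALTER INDEX measurement_city_idx
        ATTACH PARTITION measurement_y2020_city_idx;
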
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/catalogs.sgml | 23 + doc/src/sgml/ref/alter_index.sgml | 14 + doc/src/sgml/ref/alter_table.sgml | 8 +- doc/src/sgml/ref/create_index.sgml | 33 +- doc/src/sgml/ref/reindex.sgml | 5 + src/backend/access/common/reloptions.c | 1 + src/backend/access/heap/heapam.c | 9 +- src/backend/access/index/indexam.c | 3 +- src/backend/bootstrap/bootparse.y | 2 + src/backend/catalog/aclchk.c | 9 +- src/backend/catalog/dependency.c | 14 +- src/backend/catalog/index.c | 201 +++++- src/backend/catalog/objectaddress.c | 5 +- src/backend/catalog/pg_depend.c | 13 +- src/backend/catalog/pg_inherits.c | 80 +++ src/backend/catalog/toasting.c | 2 + src/backend/commands/indexcmds.c | 394 ++++++++++- src/backend/commands/tablecmds.c | 665 +++++++++++++++++-- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/parser/gram.y | 33 +- src/backend/parser/parse_utilcmd.c | 55 +- src/backend/tcop/utility.c | 22 + src/backend/utils/adt/amutils.c | 3 +- src/backend/utils/adt/ruleutils.c | 17 +- src/backend/utils/cache/relcache.c | 40 +- src/bin/pg_dump/common.c | 102 +++ src/bin/pg_dump/pg_dump.c | 102 ++- src/bin/pg_dump/pg_dump.h | 11 + src/bin/pg_dump/pg_dump_sort.c | 56 +- src/bin/pg_dump/t/002_pg_dump.pl | 95 +++ src/bin/psql/describe.c | 20 +- src/bin/psql/tab-complete.c | 34 +- src/include/catalog/dependency.h | 15 + src/include/catalog/index.h | 10 + src/include/catalog/pg_class.h | 1 + src/include/catalog/pg_inherits_fn.h | 9 +- src/include/commands/defrem.h | 3 +- src/include/nodes/execnodes.h | 1 + src/include/nodes/parsenodes.h | 7 +- src/include/parser/parse_utilcmd.h | 3 + src/test/regress/expected/alter_table.out | 65 +- src/test/regress/expected/indexing.out | 757 ++++++++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/alter_table.sql | 16 + src/test/regress/sql/indexing.sql | 388 +++++++++++ 48 files changed, 3176 insertions(+), 176 deletions(-) create mode 100644 src/test/regress/expected/indexing.out create mode 100644 src/test/regress/sql/indexing.sql diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index fdac2074..399f8275 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -3008,6 +3008,29 @@ SCRAM-SHA-256$<iteration count>:<salt>< + + DEPENDENCY_INTERNAL_AUTO (I) + + + The dependent object was created as part of creation of the + referenced object, and is really just a part of its internal + implementation. A DROP of the dependent object + will be disallowed outright (we'll tell the user to issue a + DROP against the referenced object, instead). + While a regular internal dependency will prevent + the dependent object from being dropped while any such dependencies + remain, DEPENDENCY_INTERNAL_AUTO will allow such + a drop as long as the object can be found by following any of such + dependencies. + Example: an index on a partition is made internal-auto-dependent on + both the partition itself as well as on the index on the parent + partitioned table; so the partition index is dropped together with + either the partition it indexes, or with the parent index it is + attached to. 
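Continuing the hypothetical example above, the internal-auto dependency is what produces this drop behavior for a partition's index:

    -- Rejected: the partition index can only go away via an object it
    -- depends on (the parent index or the partition itself).
    DROP INDEX measurement_y2020_city_idx;

    -- Either of these removes it implicitly:
    DROP INDEX measurement_city_idx;   -- drop the parent index tree
    DROP TABLE measurement_y2020;      -- drop the partition it indexes
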
+ + + + DEPENDENCY_EXTENSION (e) diff --git a/doc/src/sgml/ref/alter_index.sgml b/doc/src/sgml/ref/alter_index.sgml index ad77b574..149a16bc 100644 --- a/doc/src/sgml/ref/alter_index.sgml +++ b/doc/src/sgml/ref/alter_index.sgml @@ -23,6 +23,7 @@ PostgreSQL documentation ALTER INDEX [ IF EXISTS ] name RENAME TO new_name ALTER INDEX [ IF EXISTS ] name SET TABLESPACE tablespace_name +ALTER INDEX name ATTACH PARTITION index_name ALTER INDEX name DEPENDS ON EXTENSION extension_name ALTER INDEX [ IF EXISTS ] name SET ( storage_parameter = value [, ... ] ) ALTER INDEX [ IF EXISTS ] name RESET ( storage_parameter [, ... ] ) @@ -73,6 +74,19 @@ ALTER INDEX ALL IN TABLESPACE name + + ATTACH PARTITION + + + Causes the named index to become attached to the altered index. + The named index must be on a partition of the table containing the + index being altered, and have an equivalent definition. An attached + index cannot be dropped by itself, and will automatically be dropped + if its parent index is dropped. + + + + DEPENDS ON EXTENSION diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index d9ddbd01..ba4c3d04 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -837,7 +837,10 @@ ALTER TABLE [ IF EXISTS ] name as a partition of the target table. The table can be attached as a partition for specific values using FOR VALUES or as a default partition by using DEFAULT - . + . For each index in the target table, a corresponding + one will be created in the attached table; or, if an equivalent + index already exists, will be attached to the target table's index, + as if ALTER INDEX ATTACH PARTITION had been executed. @@ -929,7 +932,8 @@ ALTER TABLE [ IF EXISTS ] name This form detaches specified partition of the target table. The detached partition continues to exist as a standalone table, but no longer has any - ties to the table from which it was detached. + ties to the table from which it was detached. Any indexes that were + attached to the target table's indexes are detached. diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 6e59d73a..85634e5f 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] name ] ON table_name [ USING method ] +CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] name ] ON [ ONLY ] table_name [ USING method ] ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [, ...] ) [ WITH ( storage_parameter = value [, ... ] ) ] [ TABLESPACE tablespace_name ] @@ -155,6 +155,16 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] + + ONLY + + + Indicates not to recurse creating indexes on partitions, if the + table is partitioned. The default is to recurse. + + + + table_name @@ -549,6 +559,27 @@ Indexes: linkend="xindex">. + + When CREATE INDEX is invoked on a partitioned + table, the default behavior is to recurse to all partitions to ensure + they all have matching indexes. + Each partition is first checked to determine whether an equivalent + index already exists, and if so, that index will become attached as a + partition index to the index being created, which will become its + parent index. 
+ If no matching index exists, a new index will be created and + automatically attached; the name of the new index in each partition + will be determined as if no index name had been specified in the + command. + If the ONLY option is specified, no recursion + is done, and the index is marked invalid + (ALTER INDEX ... ATTACH PARTITION turns the index + valid, once all partitions acquire the index.) Note, however, that + any partition that is created in the future using + CREATE TABLE ... PARTITION OF will automatically + contain the index regardless of whether this option was specified. + + For index methods that support ordered scans (currently, only B-tree), the optional clauses ASC, DESC, NULLS diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index 3908ade3..61d4c1e1 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -231,6 +231,11 @@ REINDEX [ ( VERBOSE ) ] { INDEX | TABLE | SCHEMA | DATABASE | SYSTEM } + + Reindexing partitioned tables or partitioned indexes is not supported. + Each individual partition can be reindexed separately instead. + + diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 25b6394c..f3602fb6 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -1052,6 +1052,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, options = view_reloptions(datum, false); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: options = index_reloptions(amoptions, datum, false); break; case RELKIND_FOREIGN_TABLE: diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 97064050..b0129032 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1484,7 +1484,8 @@ heap_open(Oid relationId, LOCKMODE lockmode) r = relation_open(relationId, lockmode); - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -1512,7 +1513,8 @@ heap_openrv(const RangeVar *relation, LOCKMODE lockmode) r = relation_openrv(relation, lockmode); - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -1544,7 +1546,8 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, if (r) { - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index c7be5d3a..931f71cc 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -215,7 +215,8 @@ index_open(Oid relationId, LOCKMODE lockmode) r = relation_open(relationId, lockmode); - if (r->rd_rel->relkind != RELKIND_INDEX) + if (r->rd_rel->relkind != RELKIND_INDEX && + r->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 137c2dad..128b2e6c 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -386,6 +386,7 @@ Boot_DeclareIndexStmt: 
DefineIndex(relationId, stmt, $4, + InvalidOid, false, false, false, @@ -431,6 +432,7 @@ Boot_DeclareUniqueIndexStmt: DefineIndex(relationId, stmt, $5, + InvalidOid, false, false, false, diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index 73fdd150..26e9dfb9 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1789,7 +1789,8 @@ ExecGrant_Relation(InternalGrant *istmt) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Not sensible to grant on an index */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -5384,7 +5385,8 @@ recordExtObjInitPriv(Oid objoid, Oid classoid) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Indexes don't have permissions */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) return; /* Composite types don't have permissions either */ @@ -5669,7 +5671,8 @@ removeExtObjInitPriv(Oid objoid, Oid classoid) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Indexes don't have permissions */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) return; /* Composite types don't have permissions either */ diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 924d7f35..737e549d 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -728,6 +728,7 @@ findDependentObjects(const ObjectAddress *object, /* FALL THRU */ case DEPENDENCY_INTERNAL: + case DEPENDENCY_INTERNAL_AUTO: /* * This object is part of the internal implementation of @@ -779,6 +780,14 @@ findDependentObjects(const ObjectAddress *object, * transform this deletion request into a delete of this * owning object. * + * For INTERNAL_AUTO dependencies, we don't enforce this; + * in other words, we don't follow the links back to the + * owning object. + */ + if (foundDep->deptype == DEPENDENCY_INTERNAL_AUTO) + break; + + /* * First, release caller's lock on this object and get * deletion lock on the owning object. (We must release * caller's lock to avoid deadlock against a concurrent @@ -821,6 +830,7 @@ findDependentObjects(const ObjectAddress *object, /* And we're done here. 
*/ systable_endscan(scan); return; + case DEPENDENCY_PIN: /* @@ -918,6 +928,7 @@ findDependentObjects(const ObjectAddress *object, case DEPENDENCY_AUTO_EXTENSION: subflags = DEPFLAG_AUTO; break; + case DEPENDENCY_INTERNAL_AUTO: case DEPENDENCY_INTERNAL: subflags = DEPFLAG_INTERNAL; break; @@ -1267,7 +1278,8 @@ doDeletion(const ObjectAddress *object, int flags) { char relKind = get_rel_relkind(object->objectId); - if (relKind == RELKIND_INDEX) + if (relKind == RELKIND_INDEX || + relKind == RELKIND_PARTITIONED_INDEX) { bool concurrent = ((flags & PERFORM_DELETION_CONCURRENTLY) != 0); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index db5d16ee..81c91015 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -41,6 +41,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" #include "catalog/pg_constraint_fn.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_operator.h" #include "catalog/pg_opclass.h" #include "catalog/pg_tablespace.h" @@ -56,6 +58,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "parser/parser.h" +#include "rewrite/rewriteManip.h" #include "pgxc/pgxc.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -103,6 +106,7 @@ static void InitializeAttributeOids(Relation indexRelation, int numatts, Oid indexoid); static void AppendAttributeTuples(Relation indexRelation, int numatts); static void UpdateIndexRelation(Oid indexoid, Oid heapoid, + Oid parentIndexId, IndexInfo *indexInfo, Oid *collationOids, Oid *classOids, @@ -110,7 +114,8 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool primary, bool isexclusion, bool immediate, - bool isvalid); + bool isvalid, + bool isready); static void index_update_stats(Relation rel, bool hasindex, bool isprimary, double reltuples); @@ -590,6 +595,7 @@ AppendAttributeTuples(Relation indexRelation, int numatts) static void UpdateIndexRelation(Oid indexoid, Oid heapoid, + Oid parentIndexOid, IndexInfo *indexInfo, Oid *collationOids, Oid *classOids, @@ -597,7 +603,8 @@ UpdateIndexRelation(Oid indexoid, bool primary, bool isexclusion, bool immediate, - bool isvalid) + bool isvalid, + bool isready) { int2vector *indkey; oidvector *indcollation; @@ -671,8 +678,7 @@ UpdateIndexRelation(Oid indexoid, values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false); values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid); values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false); - /* we set isvalid and isready the same way */ - values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid); + values[Anum_pg_index_indisready - 1] = BoolGetDatum(isready); values[Anum_pg_index_indislive - 1] = BoolGetDatum(true); values[Anum_pg_index_indisreplident - 1] = BoolGetDatum(false); values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey); @@ -709,6 +715,8 @@ UpdateIndexRelation(Oid indexoid, * indexRelationId: normally, pass InvalidOid to let this routine * generate an OID for the index. During bootstrap this may be * nonzero to specify a preselected OID. + * parentIndexRelid: if creating an index partition, the OID of the + * parent index; otherwise InvalidOid. * relFileNode: normally, pass InvalidOid to get new storage. May be * nonzero to attach an existing valid build. * indexInfo: same info executor uses to insert into the index @@ -734,6 +742,8 @@ UpdateIndexRelation(Oid indexoid, * INDEX_CREATE_IF_NOT_EXISTS: * do not throw an error if a relation with the same name * already exists. 
+ * INDEX_CREATE_PARTITIONED: + * create a partitioned index (table must be partitioned) * constr_flags: flags passed to index_constraint_create * (only if INDEX_CREATE_ADD_CONSTRAINT is set) * allow_system_table_mods: allow table to be a system catalog @@ -745,6 +755,7 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, + Oid parentIndexRelid, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -770,12 +781,18 @@ index_create(Relation heapRelation, int i; char relpersistence; bool isprimary = (flags & INDEX_CREATE_IS_PRIMARY) != 0; + bool invalid = (flags & INDEX_CREATE_INVALID) != 0; bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0; + bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0; + char relkind; /* constraint flags can only be set when a constraint is requested */ Assert((constr_flags == 0) || ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0)); + /* partitioned indexes must never be "built" by themselves */ + Assert(!partitioned || (flags & INDEX_CREATE_SKIP_BUILD)); + relkind = partitioned ? RELKIND_PARTITIONED_INDEX : RELKIND_INDEX; is_exclusion = (indexInfo->ii_ExclusionOps != NULL); pg_class = heap_open(RelationRelationId, RowExclusiveLock); @@ -893,9 +910,9 @@ index_create(Relation heapRelation, } /* - * create the index relation's relcache entry and physical disk file. (If - * we fail further down, it's the smgr's responsibility to remove the disk - * file again.) + * create the index relation's relcache entry and, if necessary, the + * physical disk file. (If we fail further down, it's the smgr's + * responsibility to remove the disk file again, if any.) */ indexRelation = heap_create(indexRelationName, namespaceId, @@ -903,7 +920,7 @@ index_create(Relation heapRelation, indexRelationId, relFileNode, indexTupDesc, - RELKIND_INDEX, + relkind, relpersistence, shared_relation, mapped_relation, @@ -960,12 +977,18 @@ index_create(Relation heapRelation, * (Or, could define a rule to maintain the predicate) --Nels, Feb '92 * ---------------- */ - UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo, + UpdateIndexRelation(indexRelationId, heapRelationId, parentIndexRelid, + indexInfo, collationObjectId, classObjectId, coloptions, isprimary, is_exclusion, (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) == 0, + !concurrent && !invalid, !concurrent); + /* update pg_inherits, if needed */ + if (OidIsValid(parentIndexRelid)) + StoreSingleInheritance(indexRelationId, parentIndexRelid, 1); + /* * Register constraint and dependencies for the index. * @@ -1017,6 +1040,9 @@ index_create(Relation heapRelation, else { bool have_simple_col = false; + DependencyType deptype; + + deptype = OidIsValid(parentIndexRelid) ? 
DEPENDENCY_INTERNAL_AUTO : DEPENDENCY_AUTO; /* Create auto dependencies on simply-referenced columns */ for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) @@ -1027,7 +1053,7 @@ index_create(Relation heapRelation, referenced.objectId = heapRelationId; referenced.objectSubId = indexInfo->ii_KeyAttrNumbers[i]; - recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + recordDependencyOn(&myself, &referenced, deptype); have_simple_col = true; } @@ -1045,8 +1071,18 @@ index_create(Relation heapRelation, referenced.objectId = heapRelationId; referenced.objectSubId = 0; - recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + recordDependencyOn(&myself, &referenced, deptype); + } } + + /* Store dependency on parent index, if any */ + if (OidIsValid(parentIndexRelid)) + { + referenced.classId = RelationRelationId; + referenced.objectId = parentIndexRelid; + referenced.objectSubId = 0; + + recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL_AUTO); } /* Store dependency on collations */ @@ -1598,8 +1634,9 @@ index_drop(Oid indexId, bool concurrent) } /* - * Schedule physical removal of the files + * Schedule physical removal of the files (if any) */ + if (userIndexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) RelationDropStorage(userIndexRelation); /* @@ -1649,6 +1686,11 @@ index_drop(Oid indexId, bool concurrent) DeleteRelationTuple(indexId); /* + * fix INHERITS relation + */ + DeleteInheritsTuple(indexId, InvalidOid); + + /* * We are presently too lazy to attempt to compute the new correct value * of relhasindex (the next VACUUM will fix it if necessary). So there is * no need to update the pg_class tuple for the owning relation. But we @@ -1741,12 +1783,120 @@ BuildIndexInfo(Relation index) ii->ii_BrokenHotChain = false; /* set up for possible use by index AM */ + ii->ii_Am = index->rd_rel->relam; ii->ii_AmCache = NULL; ii->ii_Context = CurrentMemoryContext; return ii; } +/* + * CompareIndexInfo + * Return whether the properties of two indexes (in different tables) + * indicate that they have the "same" definitions. + * + * Note: passing collations and opfamilies separately is a kludge. Adding + * them to IndexInfo may result in better coding here and elsewhere. + * + * Use convert_tuples_by_name_map(index2, index1) to build the attmap. + */ +bool +CompareIndexInfo(IndexInfo *info1, IndexInfo *info2, + Oid *collations1, Oid *collations2, + Oid *opfamilies1, Oid *opfamilies2, + AttrNumber *attmap, int maplen) +{ + int i; + + if (info1->ii_Unique != info2->ii_Unique) + return false; + + /* indexes are only equivalent if they have the same access method */ + if (info1->ii_Am != info2->ii_Am) + return false; + + /* and same number of attributes */ + if (info1->ii_NumIndexAttrs != info2->ii_NumIndexAttrs) + return false; + + /* + * and columns match through the attribute map (actual attribute numbers + * might differ!) Note that this implies that index columns that are + * expressions appear in the same positions. We will next compare the + * expressions themselves. + */ + for (i = 0; i < info1->ii_NumIndexAttrs; i++) + { + if (maplen < info2->ii_KeyAttrNumbers[i]) + elog(ERROR, "incorrect attribute map"); + + if (attmap[info2->ii_KeyAttrNumbers[i] - 1] != + info1->ii_KeyAttrNumbers[i]) + return false; + + if (collations1[i] != collations2[i]) + return false; + if (opfamilies1[i] != opfamilies2[i]) + return false; + } + + /* + * For expression indexes: either both are expression indexes, or neither + * is; if they are, make sure the expressions match. 
+ */ + if ((info1->ii_Expressions != NIL) != (info2->ii_Expressions != NIL)) + return false; + if (info1->ii_Expressions != NIL) + { + bool found_whole_row; + Node *mapped; + + mapped = map_variable_attnos((Node *) info2->ii_Expressions, + 1, 0, attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + { + /* + * we could throw an error here, but seems out of scope for this + * routine. + */ + return false; + } + + if (!equal(info1->ii_Expressions, mapped)) + return false; + } + + /* Partial index predicates must be identical, if they exist */ + if ((info1->ii_Predicate == NULL) != (info2->ii_Predicate == NULL)) + return false; + if (info1->ii_Predicate != NULL) + { + bool found_whole_row; + Node *mapped; + + mapped = map_variable_attnos((Node *) info2->ii_Predicate, + 1, 0, attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + { + /* + * we could throw an error here, but seems out of scope for this + * routine. + */ + return false; + } + if (!equal(info1->ii_Predicate, mapped)) + return false; + } + + /* No support currently for comparing exclusion indexes. */ + if (info1->ii_ExclusionOps != NULL || info2->ii_ExclusionOps != NULL) + return false; + + return true; +} + /* ---------------- * BuildSpeculativeIndexInfo * Add extra state to IndexInfo record @@ -1969,6 +2119,9 @@ index_update_stats(Relation rel, elog(ERROR, "could not find tuple for relation %u", relid); rd_rel = (Form_pg_class) GETSTRUCT(tuple); + /* Should this be a more comprehensive test? */ + Assert(rd_rel->relkind != RELKIND_PARTITIONED_INDEX); + /* Apply required updates, if any, to copied tuple */ dirty = false; @@ -3416,6 +3569,14 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence, iRel = index_open(indexId, AccessExclusiveLock); /* + * The case of reindexing partitioned tables and indexes is handled + * differently by upper layers, so this case shouldn't arise. + */ + if (iRel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + elog(ERROR, "unsupported relation kind for index \"%s\"", + RelationGetRelationName(iRel)); + + /* * Don't allow reindex on temp tables of other backends ... their local * buffer manager is not going to cope. */ @@ -3614,6 +3775,22 @@ reindex_relation(Oid relid, int flags, int options) */ rel = heap_open(relid, ShareLock); + /* + * This may be useful when implemented someday; but that day is not today. + * For now, avoid erroring out when called in a multi-table context + * (REINDEX SCHEMA) and happen to come across a partitioned table. The + * partitions may be reindexed on their own anyway. 
+ */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("REINDEX of partitioned tables is not yet implemented, skipping \"%s\"", + RelationGetRelationName(rel)))); + heap_close(rel, ShareLock); + return false; + } + toast_relid = rel->rd_rel->reltoastrelid; /* diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 7ff21d2d..1a45c53b 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -1299,7 +1299,8 @@ get_relation_by_qualified_name(ObjectType objtype, List *object, switch (objtype) { case OBJECT_INDEX: - if (relation->rd_rel->relkind != RELKIND_INDEX) + if (relation->rd_rel->relkind != RELKIND_INDEX && + relation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", @@ -3628,6 +3629,7 @@ getRelationDescription(StringInfo buffer, Oid relid) relname); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: appendStringInfo(buffer, _("index %s"), relname); break; @@ -4144,6 +4146,7 @@ getRelationTypeDescription(StringInfo buffer, Oid relid, int32 objectSubId) appendStringInfoString(buffer, "table"); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: appendStringInfoString(buffer, "index"); break; case RELKIND_SEQUENCE: diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 68711d22..bf20cc54 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -656,14 +656,19 @@ get_constraint_index(Oid constraintId) /* * We assume any internal dependency of an index on the constraint - * must be what we are looking for. (The relkind test is just - * paranoia; there shouldn't be any such dependencies otherwise.) + * must be what we are looking for. */ if (deprec->classid == RelationRelationId && deprec->objsubid == 0 && - deprec->deptype == DEPENDENCY_INTERNAL && - get_rel_relkind(deprec->objid) == RELKIND_INDEX) + deprec->deptype == DEPENDENCY_INTERNAL) { + char relkind = get_rel_relkind(deprec->objid); + + /* This is pure paranoia; there shouldn't be any such */ + if (relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX) + break; + indexId = deprec->objid; break; } diff --git a/src/backend/catalog/pg_inherits.c b/src/backend/catalog/pg_inherits.c index 84e2fa04..351bd788 100644 --- a/src/backend/catalog/pg_inherits.c +++ b/src/backend/catalog/pg_inherits.c @@ -400,3 +400,83 @@ typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId) return result; } + +/* + * Create a single pg_inherits row with the given data + */ +void +StoreSingleInheritance(Oid relationId, Oid parentOid, int32 seqNumber) +{ + Datum values[Natts_pg_inherits]; + bool nulls[Natts_pg_inherits]; + HeapTuple tuple; + Relation inhRelation; + + inhRelation = heap_open(InheritsRelationId, RowExclusiveLock); + + /* + * Make the pg_inherits entry + */ + values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(relationId); + values[Anum_pg_inherits_inhparent - 1] = ObjectIdGetDatum(parentOid); + values[Anum_pg_inherits_inhseqno - 1] = Int32GetDatum(seqNumber); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(RelationGetDescr(inhRelation), values, nulls); + + CatalogTupleInsert(inhRelation, tuple); + + heap_freetuple(tuple); + + heap_close(inhRelation, RowExclusiveLock); +} + +/* + * DeleteInheritsTuple + * + * Delete pg_inherits tuples with the given inhrelid. 
inhparent may be given + * as InvalidOid, in which case all tuples matching inhrelid are deleted; + * otherwise only delete tuples with the specified inhparent. + * + * Returns whether at least one row was deleted. + */ +bool +DeleteInheritsTuple(Oid inhrelid, Oid inhparent) +{ + bool found = false; + Relation catalogRelation; + ScanKeyData key; + SysScanDesc scan; + HeapTuple inheritsTuple; + + /* + * Find pg_inherits entries by inhrelid. + */ + catalogRelation = heap_open(InheritsRelationId, RowExclusiveLock); + ScanKeyInit(&key, + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(inhrelid)); + scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, + true, NULL, 1, &key); + + while (HeapTupleIsValid(inheritsTuple = systable_getnext(scan))) + { + Oid parent; + + /* Compare inhparent if it was given, and do the actual deletion. */ + parent = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhparent; + if (!OidIsValid(inhparent) || parent == inhparent) + { + CatalogTupleDelete(catalogRelation, &inheritsTuple->t_self); + found = true; + } + } + + /* Done */ + systable_endscan(scan); + heap_close(catalogRelation, RowExclusiveLock); + + return found; +} diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 325b72e9..a82b2037 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -378,6 +378,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; + indexInfo->ii_Am = BTREE_AM_OID; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; @@ -391,6 +392,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, + InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index af0b9947..22c2348e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -25,7 +25,10 @@ #include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" +#include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_tablespace.h" @@ -38,6 +41,7 @@ #include "commands/tablespace.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/planner.h" @@ -45,6 +49,7 @@ #include "parser/parse_coerce.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "rewrite/rewriteManip.h" #ifdef PGXC #include "parser/parse_utilcmd.h" #include "pgxc/pgxc.h" @@ -84,6 +89,7 @@ static char *ChooseIndexNameAddition(List *colnames); static List *ChooseIndexColumnNames(List *indexElems); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, Oid relId, Oid oldRelId, void *arg); +static void ReindexPartitionedIndex(Relation parentIdx); /* * CheckIndexCompatible @@ -190,6 +196,7 @@ CheckIndexCompatible(Oid oldId, indexInfo->ii_ExclusionOps = NULL; indexInfo->ii_ExclusionProcs = NULL; indexInfo->ii_ExclusionStrats = NULL; + indexInfo->ii_Am = accessMethodId; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; typeObjectId = (Oid *) palloc(numberOfAttributes 
* sizeof(Oid)); @@ -299,14 +306,15 @@ CheckIndexCompatible(Oid oldId, * 'stmt': IndexStmt describing the properties of the new index. * 'indexRelationId': normally InvalidOid, but during bootstrap can be * nonzero to specify a preselected OID for the index. + * 'parentIndexId': the OID of the parent index; InvalidOid if not the child + * of a partitioned index. * 'is_alter_table': this is due to an ALTER rather than a CREATE operation. * 'check_rights': check for CREATE rights in namespace and tablespace. (This * should be true except when ALTER is deleting/recreating an index.) * 'check_not_in_use': check for table not already in use in current session. * This should be true unless caller is holding the table open, in which * case the caller had better have checked it earlier. - * 'skip_build': make the catalog entries but leave the index file empty; - * it will be filled later. + * 'skip_build': make the catalog entries but don't create the index files * 'quiet': suppress the NOTICE chatter ordinarily provided for constraints. * * Returns the object address of the created index. @@ -315,6 +323,7 @@ ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, + Oid parentIndexId, bool is_alter_table, bool check_rights, bool check_not_in_use, @@ -337,6 +346,7 @@ DefineIndex(Oid relationId, IndexAmRoutine *amRoutine; bool amcanorder; amoptions_function amoptions; + bool partitioned; Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; @@ -399,7 +409,8 @@ DefineIndex(Oid relationId, namespaceId = RelationGetNamespace(rel); if (rel->rd_rel->relkind != RELKIND_RELATION && - rel->rd_rel->relkind != RELKIND_MATVIEW) + rel->rd_rel->relkind != RELKIND_MATVIEW && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) { if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) @@ -411,11 +422,6 @@ DefineIndex(Oid relationId, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot create index on foreign table \"%s\"", RelationGetRelationName(rel)))); - else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot create index on partitioned table \"%s\"", - RelationGetRelationName(rel)))); else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -424,6 +430,38 @@ DefineIndex(Oid relationId, } /* + * Establish behavior for partitioned tables, and verify sanity of + * parameters. + * + * We do not build an actual index in this case; we only create a few + * catalog entries. The actual indexes are built by recursing for each + * partition. + */ + partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE; + if (partitioned) + { + if (stmt->concurrent) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create index on partitioned table \"%s\" concurrently", + RelationGetRelationName(rel)))); + if (stmt->unique) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create unique index on partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (stmt->excludeOpNames) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create exclusion constraints on partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (stmt->primary || stmt->isconstraint) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create constraints on partitioned tables"))); + } + + /* * Don't try to CREATE INDEX on temp tables of other backends. 
*/ if (RELATION_IS_OTHER_TEMP(rel)) @@ -668,6 +706,7 @@ DefineIndex(Oid relationId, indexInfo->ii_ReadyForInserts = !stmt->concurrent; indexInfo->ii_Concurrent = stmt->concurrent; indexInfo->ii_BrokenHotChain = false; + indexInfo->ii_Am = accessMethodId; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; @@ -759,19 +798,24 @@ DefineIndex(Oid relationId, /* * Make the catalog entries for the index, including constraints. This * step also actually builds the index, except if caller requested not to - * or in concurrent mode, in which case it'll be done later. + * or in concurrent mode, in which case it'll be done later, or + * doing a partitioned index (because those don't have storage). */ flags = constr_flags = 0; if (stmt->isconstraint) flags |= INDEX_CREATE_ADD_CONSTRAINT; - if (skip_build || stmt->concurrent) + if (skip_build || stmt->concurrent || partitioned) flags |= INDEX_CREATE_SKIP_BUILD; if (stmt->if_not_exists) flags |= INDEX_CREATE_IF_NOT_EXISTS; if (stmt->concurrent) flags |= INDEX_CREATE_CONCURRENT; + if (partitioned) + flags |= INDEX_CREATE_PARTITIONED; if (stmt->primary) flags |= INDEX_CREATE_IS_PRIMARY; + if (partitioned && stmt->relation && !stmt->relation->inh) + flags |= INDEX_CREATE_INVALID; if (stmt->deferrable) constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; @@ -779,8 +823,8 @@ DefineIndex(Oid relationId, constr_flags |= INDEX_CONSTR_CREATE_INIT_DEFERRED; indexRelationId = - index_create(rel, indexRelationName, indexRelationId, stmt->oldNode, - indexInfo, indexColNames, + index_create(rel, indexRelationName, indexRelationId, parentIndexId, + stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, coloptions, reloptions, @@ -807,6 +851,160 @@ DefineIndex(Oid relationId, CreateComments(indexRelationId, RelationRelationId, 0, stmt->idxcomment); + if (partitioned) + { + /* + * Unless caller specified to skip this step (via ONLY), process + * each partition to make sure they all contain a corresponding index. + * + * If we're called internally (no stmt->relation), recurse always. + */ + if (!stmt->relation || stmt->relation->inh) + { + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + int nparts = partdesc->nparts; + Oid *part_oids = palloc(sizeof(Oid) * nparts); + bool invalidate_parent = false; + TupleDesc parentDesc; + Oid *opfamOids; + + memcpy(part_oids, partdesc->oids, sizeof(Oid) * nparts); + + parentDesc = CreateTupleDescCopy(RelationGetDescr(rel)); + opfamOids = palloc(sizeof(Oid) * numberOfAttributes); + for (i = 0; i < numberOfAttributes; i++) + opfamOids[i] = get_opclass_family(classObjectId[i]); + + heap_close(rel, NoLock); + + /* + * For each partition, scan all existing indexes; if one matches + * our index definition and is not already attached to some other + * parent index, attach it to the one we just created. + * + * If none matches, build a new index by calling ourselves + * recursively with the same options (except for the index name). 
+ */ + for (i = 0; i < nparts; i++) + { + Oid childRelid = part_oids[i]; + Relation childrel; + List *childidxs; + ListCell *cell; + AttrNumber *attmap; + bool found = false; + int maplen; + + childrel = heap_open(childRelid, lockmode); + childidxs = RelationGetIndexList(childrel); + attmap = + convert_tuples_by_name_map(RelationGetDescr(childrel), + parentDesc, + gettext_noop("could not convert row type")); + maplen = parentDesc->natts; + + + foreach(cell, childidxs) + { + Oid cldidxid = lfirst_oid(cell); + Relation cldidx; + IndexInfo *cldIdxInfo; + + /* this index is already partition of another one */ + if (has_superclass(cldidxid)) + continue; + + cldidx = index_open(cldidxid, lockmode); + cldIdxInfo = BuildIndexInfo(cldidx); + if (CompareIndexInfo(cldIdxInfo, indexInfo, + cldidx->rd_indcollation, + collationObjectId, + cldidx->rd_opfamily, + opfamOids, + attmap, maplen)) + { + /* + * Found a match. Attach index to parent and we're + * done, but keep lock till commit. + */ + IndexSetParentIndex(cldidx, indexRelationId); + + if (!IndexIsValid(cldidx->rd_index)) + invalidate_parent = true; + + found = true; + index_close(cldidx, NoLock); + break; + } + + index_close(cldidx, lockmode); + } + + list_free(childidxs); + heap_close(childrel, NoLock); + + /* + * If no matching index was found, create our own. + */ + if (!found) + { + IndexStmt *childStmt = copyObject(stmt); + bool found_whole_row; + + childStmt->whereClause = + map_variable_attnos(stmt->whereClause, 1, 0, + attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + elog(ERROR, "cannot convert whole-row table reference"); + + childStmt->idxname = NULL; + childStmt->relationId = childRelid; + DefineIndex(childRelid, childStmt, + InvalidOid, /* no predefined OID */ + indexRelationId, /* this is our child */ + false, check_rights, check_not_in_use, + false, quiet); + } + + pfree(attmap); + } + + /* + * The pg_index row we inserted for this index was marked + * indisvalid=true. But if we attached an existing index that + * is invalid, this is incorrect, so update our row to + * invalid too. + */ + if (invalidate_parent) + { + Relation pg_index = heap_open(IndexRelationId, RowExclusiveLock); + HeapTuple tup, + newtup; + + tup = SearchSysCache1(INDEXRELID, + ObjectIdGetDatum(indexRelationId)); + if (!tup) + elog(ERROR, "cache lookup failed for index %u", + indexRelationId); + newtup = heap_copytuple(tup); + ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false; + CatalogTupleUpdate(pg_index, &tup->t_self, newtup); + ReleaseSysCache(tup); + heap_close(pg_index, RowExclusiveLock); + heap_freetuple(newtup); + } + } + else + heap_close(rel, NoLock); + + /* + * Indexes on partitioned tables are not themselves built, so we're + * done here. + */ + return address; + } + if (!stmt->concurrent) { /* Close the heap and we're done, in the non-concurrent case */ @@ -1904,7 +2102,7 @@ ChooseIndexColumnNames(List *indexElems) * ReindexIndex * Recreate a specific index. */ -Oid +void ReindexIndex(RangeVar *indexRelation, int options) { Oid indOid; @@ -1927,12 +2125,17 @@ ReindexIndex(RangeVar *indexRelation, int options) * lock on the index. 
*/ irel = index_open(indOid, NoLock); + + if (irel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + { + ReindexPartitionedIndex(irel); + return; + } + persistence = irel->rd_rel->relpersistence; index_close(irel, NoLock); reindex_index(indOid, false, persistence, options); - - return indOid; } /* @@ -1971,7 +2174,8 @@ RangeVarCallbackForReindexIndex(const RangeVar *relation, relkind = get_rel_relkind(relId); if (!relkind) return; - if (relkind != RELKIND_INDEX) + if (relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", relation->relname))); @@ -2115,6 +2319,12 @@ ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, /* * Only regular tables and matviews can have indexes, so ignore any * other kind of relation. + * + * It is tempting to also consider partitioned tables here, but that + * has the problem that if the children are in the same schema, they + * would be processed twice. Maybe we could have a separate list of + * partitioned tables, and expand that afterwards into relids, + * ignoring any duplicates. */ if (classtuple->relkind != RELKIND_RELATION && classtuple->relkind != RELKIND_MATVIEW) @@ -2177,3 +2387,155 @@ ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, MemoryContextDelete(private_context); } + +/* + * ReindexPartitionedIndex + * Reindex each child of the given partitioned index. + * + * Not yet implemented. + */ +static void +ReindexPartitionedIndex(Relation parentIdx) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("REINDEX is not yet implemented for partitioned indexes"))); +} + +/* + * Insert or delete an appropriate pg_inherits tuple to make the given index + * be a partition of the indicated parent index. + * + * This also corrects the pg_depend information for the affected index. + */ +void +IndexSetParentIndex(Relation partitionIdx, Oid parentOid) +{ + Relation pg_inherits; + ScanKeyData key[2]; + SysScanDesc scan; + Oid partRelid = RelationGetRelid(partitionIdx); + HeapTuple tuple; + bool fix_dependencies; + + /* Make sure this is an index */ + Assert(partitionIdx->rd_rel->relkind == RELKIND_INDEX || + partitionIdx->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * Scan pg_inherits for rows linking our index to some parent. + */ + pg_inherits = relation_open(InheritsRelationId, RowExclusiveLock); + ScanKeyInit(&key[0], + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(partRelid)); + ScanKeyInit(&key[1], + Anum_pg_inherits_inhseqno, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(1)); + scan = systable_beginscan(pg_inherits, InheritsRelidSeqnoIndexId, true, + NULL, 2, key); + tuple = systable_getnext(scan); + + if (!HeapTupleIsValid(tuple)) + { + if (parentOid == InvalidOid) + { + /* + * No pg_inherits row, and no parent wanted: nothing to do in + * this case. + */ + fix_dependencies = false; + } + else + { + Datum values[Natts_pg_inherits]; + bool isnull[Natts_pg_inherits]; + + /* + * No pg_inherits row exists, and we want a parent for this index, + * so insert it. 
+ */ + values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(partRelid); + values[Anum_pg_inherits_inhparent - 1] = + ObjectIdGetDatum(parentOid); + values[Anum_pg_inherits_inhseqno - 1] = Int32GetDatum(1); + memset(isnull, false, sizeof(isnull)); + + tuple = heap_form_tuple(RelationGetDescr(pg_inherits), + values, isnull); + CatalogTupleInsert(pg_inherits, tuple); + + fix_dependencies = true; + } + } + else + { + Form_pg_inherits inhForm = (Form_pg_inherits) GETSTRUCT(tuple); + + if (parentOid == InvalidOid) + { + /* + * There exists a pg_inherits row, which we want to clear; do so. + */ + CatalogTupleDelete(pg_inherits, &tuple->t_self); + fix_dependencies = true; + } + else + { + /* + * A pg_inherits row exists. If it's the same we want, then we're + * good; if it differs, that amounts to a corrupt catalog and + * should not happen. + */ + if (inhForm->inhparent != parentOid) + { + /* unexpected: we should not get called in this case */ + elog(ERROR, "bogus pg_inherit row: inhrelid %u inhparent %u", + inhForm->inhrelid, inhForm->inhparent); + } + + /* already in the right state */ + fix_dependencies = false; + } + } + + /* done with pg_inherits */ + systable_endscan(scan); + relation_close(pg_inherits, RowExclusiveLock); + + if (fix_dependencies) + { + ObjectAddress partIdx; + + /* + * Insert/delete pg_depend rows. If setting a parent, add an + * INTERNAL_AUTO dependency to the parent index; if making standalone, + * remove all existing rows and put back the regular dependency on the + * table. + */ + ObjectAddressSet(partIdx, RelationRelationId, partRelid); + + if (OidIsValid(parentOid)) + { + ObjectAddress parentIdx; + + ObjectAddressSet(parentIdx, RelationRelationId, parentOid); + recordDependencyOn(&partIdx, &parentIdx, DEPENDENCY_INTERNAL_AUTO); + } + else + { + ObjectAddress partitionTbl; + + ObjectAddressSet(partitionTbl, RelationRelationId, + partitionIdx->rd_index->indrelid); + + deleteDependencyRecordsForClass(RelationRelationId, partRelid, + RelationRelationId, + DEPENDENCY_INTERNAL_AUTO); + + recordDependencyOn(&partIdx, &partitionTbl, DEPENDENCY_AUTO); + } + } +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index c6e42c3d..834f2840 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -311,6 +311,12 @@ static const struct dropmsgstrings dropmsgstringarray[] = { gettext_noop("table \"%s\" does not exist, skipping"), gettext_noop("\"%s\" is not a table"), gettext_noop("Use DROP TABLE to remove a table.")}, + {RELKIND_PARTITIONED_INDEX, + ERRCODE_UNDEFINED_OBJECT, + gettext_noop("index \"%s\" does not exist"), + gettext_noop("index \"%s\" does not exist, skipping"), + gettext_noop("\"%s\" is not an index"), + gettext_noop("Use DROP INDEX to remove an index.")}, {'\0', 0, NULL, NULL, NULL, NULL} }; @@ -329,6 +335,7 @@ struct DropRelationCallbackState #define ATT_INDEX 0x0008 #define ATT_COMPOSITE_TYPE 0x0010 #define ATT_FOREIGN_TABLE 0x0020 +#define ATT_PARTITIONED_INDEX 0x0040 /* * Partition tables are expected to be dropped when the parent partitioned @@ -542,10 +549,16 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); +static void AttachPartitionEnsureIndexes(Relation rel, Relation attachrel); static void QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default); 
static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); +static ObjectAddress ATExecAttachPartitionIdx(List **wqueue, Relation rel, + RangeVar *name); +static void validatePartitionedIndex(Relation partedIdx, Relation partedTbl); +static void refuseDupeIndexAttach(Relation parentIdx, Relation partIdx, + Relation partitionTbl); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); #endif @@ -1237,12 +1250,59 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, StorePartitionKey(rel, strategy, partnatts, partattrs, partexprs, partopclass, partcollation); + + /* make it all visible */ + CommandCounterIncrement(); #ifdef __TBASE__ } #endif } /* + * If we're creating a partition, create now all the indexes defined in + * the parent. We can't do it earlier, because DefineIndex wants to know + * the partition key which we just stored. + */ + if (stmt->partbound) + { + Oid parentId = linitial_oid(inheritOids); + Relation parent; + List *idxlist; + ListCell *cell; + + /* Already have strong enough lock on the parent */ + parent = heap_open(parentId, NoLock); + idxlist = RelationGetIndexList(parent); + + /* + * For each index in the parent table, create one in the partition + */ + foreach(cell, idxlist) + { + Relation idxRel = index_open(lfirst_oid(cell), AccessShareLock); + AttrNumber *attmap; + IndexStmt *idxstmt; + + attmap = convert_tuples_by_name_map(RelationGetDescr(rel), + RelationGetDescr(parent), + gettext_noop("could not convert row type")); + idxstmt = + generateClonedIndexStmt(NULL, RelationGetRelid(rel), idxRel, + attmap, RelationGetDescr(rel)->natts); + DefineIndex(RelationGetRelid(rel), + idxstmt, + InvalidOid, + RelationGetRelid(idxRel), + false, false, false, false, false); + + index_close(idxRel, AccessShareLock); + } + + list_free(idxlist); + heap_close(parent, NoLock); + } + + /* * Now add any newly specified column default values and CHECK constraints * to the new relation. These are passed to us in the form of raw * parsetrees; we need to transform them to executable expression trees @@ -1728,10 +1788,13 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * but RemoveRelations() can only pass one relkind for a given relation. * It chooses RELKIND_RELATION for both regular and partitioned tables. * That means we must be careful before giving the wrong type error when - * the relation is RELKIND_PARTITIONED_TABLE. + * the relation is RELKIND_PARTITIONED_TABLE. An equivalent problem + * exists with indexes. */ if (classform->relkind == RELKIND_PARTITIONED_TABLE) expected_relkind = RELKIND_RELATION; + else if (classform->relkind == RELKIND_PARTITIONED_INDEX) + expected_relkind = RELKIND_INDEX; else expected_relkind = classform->relkind; @@ -1759,7 +1822,8 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * we do it the other way around. No error if we don't find a pg_index * entry, though --- the relation may have been dropped. 
*/ - if (relkind == RELKIND_INDEX && relOid != oldRelOid) + if ((relkind == RELKIND_INDEX || relkind == RELKIND_PARTITIONED_INDEX) && + relOid != oldRelOid) { state->heapOid = IndexGetRelation(relOid, true); if (OidIsValid(state->heapOid)) @@ -3065,27 +3129,11 @@ StoreCatalogInheritance1(Oid relationId, Oid parentOid, int16 seqNumber, Relation inhRelation, bool child_is_partition) { - TupleDesc desc = RelationGetDescr(inhRelation); - Datum values[Natts_pg_inherits]; - bool nulls[Natts_pg_inherits]; ObjectAddress childobject, parentobject; - HeapTuple tuple; - - /* - * Make the pg_inherits entry - */ - values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(relationId); - values[Anum_pg_inherits_inhparent - 1] = ObjectIdGetDatum(parentOid); - values[Anum_pg_inherits_inhseqno - 1] = Int16GetDatum(seqNumber); - - memset(nulls, 0, sizeof(nulls)); - - tuple = heap_form_tuple(desc, values, nulls); - CatalogTupleInsert(inhRelation, tuple); - - heap_freetuple(tuple); + /* store the pg_inherits row */ + StoreSingleInheritance(relationId, parentOid, seqNumber); /* * Store a dependency too @@ -3415,6 +3463,7 @@ renameatt_check(Oid myrelid, Form_pg_class classform, bool recursing) relkind != RELKIND_MATVIEW && relkind != RELKIND_COMPOSITE_TYPE && relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX && relkind != RELKIND_FOREIGN_TABLE && relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, @@ -4077,7 +4126,8 @@ RenameRelationInternal(Oid myrelid, const char *newrelname, bool is_internal) /* * Also rename the associated constraint, if any. */ - if (targetrelation->rd_rel->relkind == RELKIND_INDEX) + if (targetrelation->rd_rel->relkind == RELKIND_INDEX || + targetrelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) { Oid constraintId = get_index_constraint(myrelid); @@ -4157,6 +4207,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) stmt, RelationGetRelationName(rel)))); if (rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && AfterTriggerPendingOnRel(RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), @@ -5051,6 +5102,10 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; #endif case AT_AttachPartition: + ATSimplePermissions(rel, ATT_TABLE | ATT_PARTITIONED_INDEX); + /* No command-specific prep needed */ + pass = AT_PASS_MISC; + break; case AT_DetachPartition: ATSimplePermissions(rel, ATT_TABLE); /* No command-specific prep needed */ @@ -5430,9 +5485,15 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, break; #endif case AT_AttachPartition: + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) ATExecAttachPartition(wqueue, rel, (PartitionCmd *) cmd->def); + else + ATExecAttachPartitionIdx(wqueue, rel, + ((PartitionCmd *) cmd->def)->name); break; case AT_DetachPartition: + /* ATPrepCmd ensures it must be a table */ + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); ATExecDetachPartition(rel, ((PartitionCmd *) cmd->def)->name); break; #ifdef __TBASE__ @@ -5750,9 +5811,13 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode) errmsg("Incompatible operation with data redistribution"))); #endif - /* Foreign tables have no storage, nor do partitioned tables. */ + /* + * Foreign tables have no storage, nor do partitioned tables and + * indexes. 
+ */ if (tab->relkind == RELKIND_FOREIGN_TABLE || - tab->relkind == RELKIND_PARTITIONED_TABLE) + tab->relkind == RELKIND_PARTITIONED_TABLE || + tab->relkind == RELKIND_PARTITIONED_INDEX) continue; /* @@ -6383,6 +6448,9 @@ ATSimplePermissions(Relation rel, int allowed_targets) case RELKIND_INDEX: actual_target = ATT_INDEX; break; + case RELKIND_PARTITIONED_INDEX: + actual_target = ATT_PARTITIONED_INDEX; + break; case RELKIND_COMPOSITE_TYPE: actual_target = ATT_COMPOSITE_TYPE; break; @@ -7940,6 +8008,7 @@ ATPrepSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_MATVIEW && rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, @@ -7947,6 +8016,17 @@ ATPrepSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE errmsg("\"%s\" is not a table, materialized view, index, or foreign table", RelationGetRelationName(rel)))); + /* + * We allow referencing columns by numbers only for indexes, since table + * column numbers could contain gaps if columns are later dropped. + */ + if (rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && + !colName) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot refer to non-index column by number"))); + /* Permissions checks */ if (!pg_class_ownercheck(RelationGetRelid(rel), GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, @@ -8006,6 +8086,15 @@ ATExecSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE errmsg("cannot alter system column \"%s\"", colName))); + if ((rel->rd_rel->relkind == RELKIND_INDEX || + rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && + rel->rd_index->indkey.values[attnum - 1] != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot alter statistics on non-expression column \"%s\" of index \"%s\"", + NameStr(attrtuple->attname), RelationGetRelationName(rel)), + errhint("Alter statistics on table column instead."))); + attrtuple->attstattarget = newtarget; CatalogTupleUpdate(attrelation, &tuple->t_self, tuple); @@ -8521,6 +8610,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, address = DefineIndex(RelationGetRelid(rel), stmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ true, /* is_alter_table */ check_rights, false, /* check_not_in_use - we did it already */ @@ -11140,7 +11230,8 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, { char relKind = get_rel_relkind(foundObject.objectId); - if (relKind == RELKIND_INDEX) + if (relKind == RELKIND_INDEX || + relKind == RELKIND_PARTITIONED_INDEX) { Assert(foundObject.objectSubId == 0); if (!list_member_oid(tab->changedIndexOids, foundObject.objectId)) @@ -11924,6 +12015,15 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock newOwnerId = tuple_class->relowner; } break; + case RELKIND_PARTITIONED_INDEX: + if (recursing) + break; + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change owner of index \"%s\"", + NameStr(tuple_class->relname)), + errhint("Change the ownership of the index's table, instead."))); + break; case RELKIND_SEQUENCE: if (!recursing && tuple_class->relowner != newOwnerId) @@ -12045,6 +12145,7 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock */ if 
(tuple_class->relkind != RELKIND_COMPOSITE_TYPE && tuple_class->relkind != RELKIND_INDEX && + tuple_class->relkind != RELKIND_PARTITIONED_INDEX && tuple_class->relkind != RELKIND_TOASTVALUE) changeDependencyOnOwner(RelationRelationId, relationOid, newOwnerId); @@ -12052,7 +12153,8 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock /* * Also change the ownership of the table's row type, if it has one */ - if (tuple_class->relkind != RELKIND_INDEX) + if (tuple_class->relkind != RELKIND_INDEX && + tuple_class->relkind != RELKIND_PARTITIONED_INDEX) AlterTypeOwnerInternal(tuple_class->reltype, newOwnerId); /* @@ -12061,6 +12163,7 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock * relation, as well as its toast table (if it has one). */ if (tuple_class->relkind == RELKIND_RELATION || + tuple_class->relkind == RELKIND_PARTITIONED_TABLE || tuple_class->relkind == RELKIND_MATVIEW || tuple_class->relkind == RELKIND_TOASTVALUE) { @@ -12386,6 +12489,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, (void) view_reloptions(newOptions, true); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: (void) index_reloptions(rel->rd_amroutine->amoptions, newOptions, true); break; default: @@ -12859,7 +12963,8 @@ AlterTableMoveAll(AlterTableMoveAllStmt *stmt) relForm->relkind != RELKIND_RELATION && relForm->relkind != RELKIND_PARTITIONED_TABLE) || (stmt->objtype == OBJECT_INDEX && - relForm->relkind != RELKIND_INDEX) || + relForm->relkind != RELKIND_INDEX && + relForm->relkind != RELKIND_PARTITIONED_INDEX) || (stmt->objtype == OBJECT_MATVIEW && relForm->relkind != RELKIND_MATVIEW)) continue; @@ -13797,45 +13902,18 @@ RemoveInheritance(Relation child_rel, Relation parent_rel) Relation catalogRelation; SysScanDesc scan; ScanKeyData key[3]; - HeapTuple inheritsTuple, - attributeTuple, + HeapTuple attributeTuple, constraintTuple; List *connames; - bool found = false; + bool found; bool child_is_partition = false; /* If parent_rel is a partitioned table, child_rel must be a partition */ if (parent_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) child_is_partition = true; - /* - * Find and destroy the pg_inherits entry linking the two, or error out if - * there is none. 
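With this patch, partition indexes are linked to their parent index through pg_inherits, just as partitions are linked to their parent table, which is why RemoveInheritance can delegate to DeleteInheritsTuple() here and why IndexSetParentIndex() is used later in the series. An illustrative catalog query, assuming the measurement objects sketched earlier:

    SELECT inhrelid::regclass AS child, inhparent::regclass AS parent, inhseqno
    FROM   pg_inherits
    WHERE  inhparent IN ('measurement'::regclass,
                         'measurement_logdate_idx'::regclass);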
- */ - catalogRelation = heap_open(InheritsRelationId, RowExclusiveLock); - ScanKeyInit(&key[0], - Anum_pg_inherits_inhrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationGetRelid(child_rel))); - scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, - true, NULL, 1, key); - - while (HeapTupleIsValid(inheritsTuple = systable_getnext(scan))) - { - Oid inhparent; - - inhparent = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhparent; - if (inhparent == RelationGetRelid(parent_rel)) - { - CatalogTupleDelete(catalogRelation, &inheritsTuple->t_self); - found = true; - break; - } - } - - systable_endscan(scan); - heap_close(catalogRelation, RowExclusiveLock); - + found = DeleteInheritsTuple(RelationGetRelid(child_rel), + RelationGetRelid(parent_rel)); if (!found) { if (child_is_partition) @@ -16073,7 +16151,8 @@ RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a composite type", rv->relname))); - if (reltype == OBJECT_INDEX && relkind != RELKIND_INDEX + if (reltype == OBJECT_INDEX && relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX && !IsA(stmt, RenameStmt)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -16897,6 +16976,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) /* Update the pg_class entry. */ StorePartitionBound(attachrel, rel, cmd->bound); + /* Ensure there exists a correct set of indexes in the partition. */ + AttachPartitionEnsureIndexes(rel, attachrel); + /* * Generate partition constraint from the partition bound specification. * If the parent itself is a partition, make sure to include its @@ -16964,6 +17046,127 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) return address; } +/* + * AttachPartitionEnsureIndexes + * subroutine for ATExecAttachPartition to create/match indexes + * + * Enforce the indexing rule for partitioned tables during ALTER TABLE / ATTACH + * PARTITION: every partition must have an index attached to each index on the + * partitioned table. + */ +static void +AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) +{ + List *idxes; + List *attachRelIdxs; + Relation *attachrelIdxRels; + IndexInfo **attachInfos; + int i; + ListCell *cell; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, + "AttachPartitionEnsureIndexes", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + + idxes = RelationGetIndexList(rel); + attachRelIdxs = RelationGetIndexList(attachrel); + attachrelIdxRels = palloc(sizeof(Relation) * list_length(attachRelIdxs)); + attachInfos = palloc(sizeof(IndexInfo *) * list_length(attachRelIdxs)); + + /* Build arrays of all existing indexes and their IndexInfos */ + i = 0; + foreach(cell, attachRelIdxs) + { + Oid cldIdxId = lfirst_oid(cell); + + attachrelIdxRels[i] = index_open(cldIdxId, AccessShareLock); + attachInfos[i] = BuildIndexInfo(attachrelIdxRels[i]); + i++; + } + + /* + * For each index on the partitioned table, find a matching one in the + * partition-to-be; if one is not found, create one. + */ + foreach(cell, idxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexInfo *info; + AttrNumber *attmap; + bool found = false; + + /* + * Ignore indexes in the partitioned table other than partitioned + * indexes. 
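Before the function body continues below, a rough sketch of the behaviour AttachPartitionEnsureIndexes() implements during ALTER TABLE ... ATTACH PARTITION (names illustrative, continuing the measurement example):

    CREATE TABLE measurement_y2006m4 (LIKE measurement);
    CREATE INDEX ON measurement_y2006m4 (logdate);     -- compatible, unattached
    ALTER TABLE measurement ATTACH PARTITION measurement_y2006m4
        FOR VALUES FROM ('2006-04-01') TO ('2006-05-01');
    -- The pre-existing index is matched by CompareIndexInfo and attached to the
    -- parent index; had no compatible index existed, ATTACH PARTITION would
    -- have built one.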
+ */ + if (idxRel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) + { + index_close(idxRel, AccessShareLock); + continue; + } + + /* construct an indexinfo to compare existing indexes against */ + info = BuildIndexInfo(idxRel); + attmap = convert_tuples_by_name_map(RelationGetDescr(attachrel), + RelationGetDescr(rel), + gettext_noop("could not convert row type")); + + /* + * Scan the list of existing indexes in the partition-to-be, and mark + * the first matching, unattached one we find, if any, as partition of + * the parent index. If we find one, we're done. + */ + for (i = 0; i < list_length(attachRelIdxs); i++) + { + /* does this index have a parent? if so, can't use it */ + if (has_superclass(RelationGetRelid(attachrelIdxRels[i]))) + continue; + + if (CompareIndexInfo(attachInfos[i], info, + attachrelIdxRels[i]->rd_indcollation, + idxRel->rd_indcollation, + attachrelIdxRels[i]->rd_opfamily, + idxRel->rd_opfamily, + attmap, + RelationGetDescr(rel)->natts)) + { + /* bingo. */ + IndexSetParentIndex(attachrelIdxRels[i], idx); + found = true; + break; + } + } + + /* + * If no suitable index was found in the partition-to-be, create one + * now. + */ + if (!found) + { + IndexStmt *stmt; + + stmt = generateClonedIndexStmt(NULL, RelationGetRelid(attachrel), + idxRel, attmap, + RelationGetDescr(rel)->natts); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + false, false, false, false, false); + } + + index_close(idxRel, AccessShareLock); + } + + /* Clean up. */ + for (i = 0; i < list_length(attachRelIdxs); i++) + index_close(attachrelIdxRels[i], AccessShareLock); + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(cxt); +} + /* * ALTER TABLE DETACH PARTITION * @@ -16982,6 +17185,8 @@ ATExecDetachPartition(Relation rel, RangeVar *name) new_repl[Natts_pg_class]; ObjectAddress address; Oid defaultPartOid; + List *indexes; + ListCell *cell; #ifdef _MLS_ bool schema_bound; Oid partoid; @@ -17067,6 +17272,24 @@ ATExecDetachPartition(Relation rel, RangeVar *name) } } + /* detach indexes too */ + indexes = RelationGetIndexList(partRel); + foreach(cell, indexes) + { + Oid idxid = lfirst_oid(cell); + Relation idx; + + if (!has_superclass(idxid)) + continue; + + Assert((IndexGetRelation(get_partition_parent(idxid), false) == + RelationGetRelid(rel))); + + idx = index_open(idxid, AccessExclusiveLock); + IndexSetParentIndex(idx, InvalidOid); + relation_close(idx, AccessExclusiveLock); + } + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. @@ -17081,6 +17304,332 @@ ATExecDetachPartition(Relation rel, RangeVar *name) return address; } + +/* + * Before acquiring lock on an index, acquire the same lock on the owning + * table. + */ +struct AttachIndexCallbackState +{ + Oid partitionOid; + Oid parentTblOid; + bool lockedParentTbl; +}; + +static void +RangeVarCallbackForAttachIndex(const RangeVar *rv, Oid relOid, Oid oldRelOid, + void *arg) +{ + struct AttachIndexCallbackState *state; + Form_pg_class classform; + HeapTuple tuple; + + state = (struct AttachIndexCallbackState *) arg; + + if (!state->lockedParentTbl) + { + LockRelationOid(state->parentTblOid, AccessShareLock); + state->lockedParentTbl = true; + } + + /* + * If we previously locked some other heap, and the name we're looking up + * no longer refers to an index on that relation, release the now-useless + * lock. XXX maybe we should do *after* we verify whether the index does + * not actually belong to the same relation ... 
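The ATExecDetachPartition hunk above is the mirror image: the partition's indexes are kept, but IndexSetParentIndex(idx, InvalidOid) removes their pg_inherits link, leaving ordinary standalone indexes. Sketch:

    ALTER TABLE measurement DETACH PARTITION measurement_y2006m2;
    -- measurement_y2006m2's index on (logdate) still exists, but it is no
    -- longer attached to measurement_logdate_idx.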
+ */ + if (relOid != oldRelOid && OidIsValid(state->partitionOid)) + { + UnlockRelationOid(state->partitionOid, AccessShareLock); + state->partitionOid = InvalidOid; + } + + /* Didn't find a relation, so no need for locking or permission checks. */ + if (!OidIsValid(relOid)) + return; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); + if (!HeapTupleIsValid(tuple)) + return; /* concurrently dropped, so nothing to do */ + classform = (Form_pg_class) GETSTRUCT(tuple); + if (classform->relkind != RELKIND_PARTITIONED_INDEX && + classform->relkind != RELKIND_INDEX) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("\"%s\" is not an index", rv->relname))); + ReleaseSysCache(tuple); + + /* + * Since we need only examine the heap's tupledesc, an access share lock + * on it (preventing any DDL) is sufficient. + */ + state->partitionOid = IndexGetRelation(relOid, false); + LockRelationOid(state->partitionOid, AccessShareLock); +} + +/* + * ALTER INDEX i1 ATTACH PARTITION i2 + */ +static ObjectAddress +ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) +{ + Relation partIdx; + Relation partTbl; + Relation parentTbl; + ObjectAddress address; + Oid partIdxId; + Oid currParent; + struct AttachIndexCallbackState state; + + /* + * We need to obtain lock on the index 'name' to modify it, but we also + * need to read its owning table's tuple descriptor -- so we need to lock + * both. To avoid deadlocks, obtain lock on the table before doing so on + * the index. Furthermore, we need to examine the parent table of the + * partition, so lock that one too. + */ + state.partitionOid = InvalidOid; + state.parentTblOid = parentIdx->rd_index->indrelid; + state.lockedParentTbl = false; + partIdxId = + RangeVarGetRelidExtended(name, AccessExclusiveLock, false, false, + RangeVarCallbackForAttachIndex, + (void *) &state); + /* Not there? */ + if (!OidIsValid(partIdxId)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" does not exist", name->relname))); + + /* no deadlock risk: RangeVarGetRelidExtended already acquired the lock */ + partIdx = relation_open(partIdxId, AccessExclusiveLock); + + /* we already hold locks on both tables, so this is safe: */ + parentTbl = relation_open(parentIdx->rd_index->indrelid, AccessShareLock); + partTbl = relation_open(partIdx->rd_index->indrelid, NoLock); + + ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partIdx)); + + /* Silently do nothing if already in the right state */ + currParent = !has_superclass(partIdxId) ? InvalidOid : + get_partition_parent(partIdxId); + if (currParent != RelationGetRelid(parentIdx)) + { + IndexInfo *childInfo; + IndexInfo *parentInfo; + AttrNumber *attmap; + bool found; + int i; + PartitionDesc partDesc; + + /* + * If this partition already has an index attached, refuse the operation. 
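RangeVarCallbackForAttachIndex above also rejects targets that are not indexes at all, so, illustratively, naming a table on the partition side fails early:

    ALTER INDEX measurement_logdate_idx ATTACH PARTITION measurement_y2006m2;
    -- ERROR:  "measurement_y2006m2" is not an index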
+ */ + refuseDupeIndexAttach(parentIdx, partIdx, partTbl); + + if (OidIsValid(currParent)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Index \"%s\" is already attached to another index.", + RelationGetRelationName(partIdx)))); + + /* Make sure it indexes a partition of the other index's table */ + partDesc = RelationGetPartitionDesc(parentTbl); + found = false; + for (i = 0; i < partDesc->nparts; i++) + { + if (partDesc->oids[i] == state.partitionOid) + { + found = true; + break; + } + } + if (!found) + ereport(ERROR, + (errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Index \"%s\" is not an index on any partition of table \"%s\".", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentTbl)))); + + /* Ensure the indexes are compatible */ + childInfo = BuildIndexInfo(partIdx); + parentInfo = BuildIndexInfo(parentIdx); + attmap = convert_tuples_by_name_map(RelationGetDescr(partTbl), + RelationGetDescr(parentTbl), + gettext_noop("could not convert row type")); + if (!CompareIndexInfo(childInfo, parentInfo, + partIdx->rd_indcollation, + parentIdx->rd_indcollation, + partIdx->rd_opfamily, + parentIdx->rd_opfamily, + attmap, + RelationGetDescr(partTbl)->natts)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("The index definitions do not match."))); + + /* All good -- do it */ + IndexSetParentIndex(partIdx, RelationGetRelid(parentIdx)); + pfree(attmap); + + CommandCounterIncrement(); + + validatePartitionedIndex(parentIdx, parentTbl); + } + + relation_close(parentTbl, AccessShareLock); + /* keep these locks till commit */ + relation_close(partTbl, NoLock); + relation_close(partIdx, NoLock); + + return address; +} + +/* + * Verify whether the given partition already contains an index attached + * to the given partitioned index. If so, raise an error. + */ +static void +refuseDupeIndexAttach(Relation parentIdx, Relation partIdx, Relation partitionTbl) +{ + Relation pg_inherits; + ScanKeyData key; + HeapTuple tuple; + SysScanDesc scan; + + pg_inherits = heap_open(InheritsRelationId, AccessShareLock); + ScanKeyInit(&key, Anum_pg_inherits_inhparent, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(parentIdx))); + scan = systable_beginscan(pg_inherits, InheritsParentIndexId, true, + NULL, 1, &key); + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_inherits inhForm; + Oid tab; + + inhForm = (Form_pg_inherits) GETSTRUCT(tuple); + tab = IndexGetRelation(inhForm->inhrelid, false); + if (tab == RelationGetRelid(partitionTbl)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Another index is already attached for partition \"%s\".", + RelationGetRelationName(partitionTbl)))); + } + + systable_endscan(scan); + heap_close(pg_inherits, AccessShareLock); +} + +/* + * Verify whether the set of attached partition indexes to a parent index on + * a partitioned table is complete. If it is, mark the parent index valid. 
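The checks in ATExecAttachPartitionIdx and refuseDupeIndexAttach above produce errors along these lines (illustrative object names; wording taken from the errdetail strings in the hunk):

    ALTER INDEX measurement_logdate_idx ATTACH PARTITION unrelated_tbl_idx;
    -- ERROR:  cannot attach index "unrelated_tbl_idx" as a partition of index
    --         "measurement_logdate_idx"
    -- DETAIL: Index "unrelated_tbl_idx" is not an index on any partition of
    --         table "measurement".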
+ * + * This should be called each time a partition index is attached. + */ +static void +validatePartitionedIndex(Relation partedIdx, Relation partedTbl) +{ + Relation inheritsRel; + SysScanDesc scan; + ScanKeyData key; + int tuples = 0; + HeapTuple inhTup; + bool updated = false; + + Assert(partedIdx->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * Scan pg_inherits for this parent index. Count each valid index we find + * (verifying the pg_index entry for each), and if we reach the total + * amount we expect, we can mark this parent index as valid. + */ + inheritsRel = heap_open(InheritsRelationId, AccessShareLock); + ScanKeyInit(&key, Anum_pg_inherits_inhparent, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(partedIdx))); + scan = systable_beginscan(inheritsRel, InheritsParentIndexId, true, + NULL, 1, &key); + while ((inhTup = systable_getnext(scan)) != NULL) + { + Form_pg_inherits inhForm = (Form_pg_inherits) GETSTRUCT(inhTup); + HeapTuple indTup; + Form_pg_index indexForm; + + indTup = SearchSysCache1(INDEXRELID, + ObjectIdGetDatum(inhForm->inhrelid)); + if (!indTup) + elog(ERROR, "cache lookup failed for index %u", + inhForm->inhrelid); + indexForm = (Form_pg_index) GETSTRUCT(indTup); + if (IndexIsValid(indexForm)) + tuples += 1; + ReleaseSysCache(indTup); + } + + /* Done with pg_inherits */ + systable_endscan(scan); + heap_close(inheritsRel, AccessShareLock); + + /* + * If we found as many inherited indexes as the partitioned table has + * partitions, we're good; update pg_index to set indisvalid. + */ + if (tuples == RelationGetPartitionDesc(partedTbl)->nparts) + { + Relation idxRel; + HeapTuple newtup; + + idxRel = heap_open(IndexRelationId, RowExclusiveLock); + + newtup = heap_copytuple(partedIdx->rd_indextuple); + ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = true; + updated = true; + + CatalogTupleUpdate(idxRel, &partedIdx->rd_indextuple->t_self, newtup); + + heap_close(idxRel, RowExclusiveLock); + } + + /* + * If this index is in turn a partition of a larger index, validating it + * might cause the parent to become valid also. Try that. 
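A sketch of what validatePartitionedIndex means for users, assuming measurement already has partitions that lack a matching index:

    CREATE INDEX measurement_city_id_idx ON ONLY measurement (city_id);
    SELECT indisvalid FROM pg_index
    WHERE  indexrelid = 'measurement_city_id_idx'::regclass;   -- f
    -- Once a matching index has been created or attached for every partition,
    -- validatePartitionedIndex marks the parent index valid and the query
    -- returns t.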
+ */ + if (updated && + has_superclass(RelationGetRelid(partedIdx))) + { + Oid parentIdxId, + parentTblId; + Relation parentIdx, + parentTbl; + + /* make sure we see the validation we just did */ + CommandCounterIncrement(); + + parentIdxId = get_partition_parent(RelationGetRelid(partedIdx)); + parentTblId = get_partition_parent(RelationGetRelid(partedTbl)); + parentIdx = relation_open(parentIdxId, AccessExclusiveLock); + parentTbl = relation_open(parentTblId, AccessExclusiveLock); + Assert(!parentIdx->rd_index->indisvalid); + + validatePartitionedIndex(parentIdx, parentTbl); + + relation_close(parentIdx, AccessExclusiveLock); + relation_close(parentTbl, AccessExclusiveLock); + } +} + #ifdef _MIGRATE_ bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index ea4a0c71..8bc360f1 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3729,6 +3729,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_STRING_FIELD(idxname); COPY_NODE_FIELD(relation); + COPY_SCALAR_FIELD(relationId); COPY_STRING_FIELD(accessMethod); COPY_STRING_FIELD(tableSpace); COPY_NODE_FIELD(indexParams); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c92cbd30..7bbe8255 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1348,6 +1348,7 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) { COMPARE_STRING_FIELD(idxname); COMPARE_NODE_FIELD(relation); + COMPARE_SCALAR_FIELD(relationId); COMPARE_STRING_FIELD(accessMethod); COMPARE_STRING_FIELD(tableSpace); COMPARE_NODE_FIELD(indexParams); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 092a7dd5..7df4571b 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4010,6 +4010,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_STRING_FIELD(idxname); WRITE_NODE_FIELD(relation); + WRITE_OID_FIELD(relationId); WRITE_STRING_FIELD(accessMethod); WRITE_STRING_FIELD(tableSpace); WRITE_NODE_FIELD(indexParams); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index f8e17e4e..4e74a77d 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -316,7 +316,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type add_drop opt_asc_desc opt_nulls_order %type alter_table_cmd alter_type_cmd opt_collate_clause - replica_identity partition_cmd alter_group_cmd + replica_identity partition_cmd alter_group_cmd index_partition_cmd %type alter_table_cmds alter_type_cmds alter_group_cmds %type alter_identity_column_option_list %type alter_identity_column_option @@ -1996,6 +1996,15 @@ AlterTableStmt: n->missing_ok = true; $$ = (Node *)n; } + | ALTER INDEX qualified_name index_partition_cmd + { + AlterTableStmt *n = makeNode(AlterTableStmt); + n->relation = $3; + n->cmds = list_make1($4); + n->relkind = OBJECT_INDEX; + n->missing_ok = false; + $$ = (Node *)n; + } | ALTER INDEX ALL IN_P TABLESPACE name SET TABLESPACE name opt_nowait { AlterTableMoveAllStmt *n = @@ -2146,6 +2155,22 @@ alter_group_cmd: } ; +index_partition_cmd: + /* ALTER INDEX ATTACH PARTITION */ + ATTACH PARTITION qualified_name + { + AlterTableCmd *n = makeNode(AlterTableCmd); + PartitionCmd *cmd = makeNode(PartitionCmd); + + n->subtype = AT_AttachPartition; + cmd->name = $3; + cmd->bound = NULL; + n->def = (Node *) cmd; + + $$ = (Node *) n; + } + ; + alter_table_cmd: /* ALTER TABLE ADD */ ADD_P columnDef @@ -7735,7 +7760,7 
@@ defacl_privilege_target: *****************************************************************************/ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name - ON qualified_name access_method_clause '(' index_params ')' + ON relation_expr access_method_clause '(' index_params ')' opt_reloptions OptTableSpace where_clause { IndexStmt *n = makeNode(IndexStmt); @@ -7743,6 +7768,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->concurrent = $4; n->idxname = $5; n->relation = $7; + n->relationId = InvalidOid; n->accessMethod = $8; n->indexParams = $10; n->options = $12; @@ -7762,7 +7788,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name $$ = (Node *)n; } | CREATE opt_unique INDEX opt_concurrently IF_P NOT EXISTS index_name - ON qualified_name access_method_clause '(' index_params ')' + ON relation_expr access_method_clause '(' index_params ')' opt_reloptions OptTableSpace where_clause { IndexStmt *n = makeNode(IndexStmt); @@ -7770,6 +7796,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->concurrent = $4; n->idxname = $8; n->relation = $10; + n->relationId = InvalidOid; n->accessMethod = $11; n->indexParams = $13; n->options = $15; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index af249f71..cf1bc20a 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -169,9 +169,6 @@ static void transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_clause); static void transformOfType(CreateStmtContext *cxt, TypeName *ofTypename); -static IndexStmt *generateClonedIndexStmt(CreateStmtContext *cxt, - Relation source_idx, - const AttrNumber *attmap, int attmap_length); static List *get_collation(Oid collation, Oid actual_datatype); static List *get_opclass(Oid opclass, Oid actual_datatype); static void transformIndexConstraints(CreateStmtContext *cxt); @@ -1632,7 +1629,8 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla parent_index = index_open(parent_index_oid, AccessShareLock); /* Build CREATE INDEX statement to recreate the parent_index */ - index_stmt = generateClonedIndexStmt(cxt, parent_index, + index_stmt = generateClonedIndexStmt(cxt->relation, InvalidOid, + parent_index, attmap, tupleDesc->natts); #ifdef __TBASE__ @@ -1720,10 +1718,12 @@ transformOfType(CreateStmtContext *cxt, TypeName *ofTypename) /* * Generate an IndexStmt node using information from an already existing index - * "source_idx". Attribute numbers should be adjusted according to attmap. + * "source_idx", for the rel identified either by heapRel or heapRelid. + * + * Attribute numbers should be adjusted according to attmap. */ -static IndexStmt * -generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, +IndexStmt * +generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, const AttrNumber *attmap, int attmap_length) {// #lizard forgives Oid source_relid = RelationGetRelid(source_idx); @@ -1745,6 +1745,9 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, Datum datum; bool isnull; + Assert((heapRel == NULL && OidIsValid(heapRelid)) || + (heapRel != NULL && !OidIsValid(heapRelid))); + /* * Fetch pg_class tuple of source index. We can't use the copy in the * relcache entry because it doesn't include optional fields. 
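The grammar change above (qualified_name -> relation_expr in the ON clause) is what lets ONLY appear there, giving the two behaviours this series relies on (sketch):

    CREATE INDEX ON measurement (city_id, logdate);    -- recurses to partitions
    CREATE INDEX ON ONLY measurement (city_id);        -- parent only, left invalid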
@@ -1780,7 +1783,8 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, /* Begin building the IndexStmt */ index = makeNode(IndexStmt); - index->relation = cxt->relation; + index->relation = heapRel; + index->relationId = heapRelid; index->accessMethod = pstrdup(NameStr(amrec->amname)); if (OidIsValid(idxrelrec->reltablespace)) index->tableSpace = get_tablespace_name(idxrelrec->reltablespace); @@ -4947,18 +4951,39 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) { Relation parentRel = cxt->rel; - /* the table must be partitioned */ - if (parentRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("\"%s\" is not partitioned", - RelationGetRelationName(parentRel)))); - + switch (parentRel->rd_rel->relkind) + { + case RELKIND_PARTITIONED_TABLE: /* transform the partition bound, if any */ Assert(RelationGetPartitionKey(parentRel) != NULL); if (cmd->bound != NULL) cxt->partbound = transformPartitionBound(cxt->pstate, parentRel, cmd->bound); + break; + case RELKIND_PARTITIONED_INDEX: + /* nothing to check */ + Assert(cmd->bound == NULL); + break; + case RELKIND_RELATION: + /* the table must be partitioned */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("table \"%s\" is not partitioned", + RelationGetRelationName(parentRel)))); + break; + case RELKIND_INDEX: + /* the index must be partitioned */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("index \"%s\" is not partitioned", + RelationGetRelationName(parentRel)))); + break; + default: + /* parser shouldn't let this case through */ + elog(ERROR, "\"%s\" is not a partitioned table or index", + RelationGetRelationName(parentRel)); + break; + } } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index c99d090e..a6536b13 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -25,6 +25,7 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/namespace.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/toasting.h" #include "commands/alter.h" #include "commands/async.h" @@ -3677,6 +3678,7 @@ ProcessUtilitySlow(ParseState *pstate, IndexStmt *stmt = (IndexStmt *) parsetree; Oid relid; LOCKMODE lockmode; + List *inheritors = NIL; #ifdef __TBASE__ Relation rel = NULL; #endif @@ -3719,6 +3721,23 @@ ProcessUtilitySlow(ParseState *pstate, } #endif + /* + * CREATE INDEX on partitioned tables (but not regular + * inherited tables) recurses to partitions, so we must + * acquire locks early to avoid deadlocks. + */ + if (stmt->relation->inh) + { + Relation rel; + + /* already locked by RangeVarGetRelidExtended */ + rel = heap_open(relid, NoLock); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + inheritors = find_all_inheritors(relid, lockmode, + NULL); + heap_close(rel, NoLock); + } + /* Run parse analysis ... 
*/ stmt = transformIndexStmt(relid, stmt, queryString); @@ -3728,6 +3747,7 @@ ProcessUtilitySlow(ParseState *pstate, DefineIndex(relid, /* OID of heap relation */ stmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ @@ -3879,6 +3899,8 @@ ProcessUtilitySlow(ParseState *pstate, parsetree); commandCollected = true; EventTriggerAlterTableEnd(); + + list_free(inheritors); } break; diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index b05d24d0..31367fb1 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -183,7 +183,8 @@ indexam_property(FunctionCallInfo fcinfo, if (!HeapTupleIsValid(tuple)) PG_RETURN_NULL(); rd_rel = (Form_pg_class) GETSTRUCT(tuple); - if (rd_rel->relkind != RELKIND_INDEX) + if (rd_rel->relkind != RELKIND_INDEX && + rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { ReleaseSysCache(tuple); PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 9acf184a..8ce8cefe 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -360,7 +360,7 @@ static void decompile_column_index_array(Datum column_index_array, Oid relId, static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); static char *pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok); static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, @@ -1142,7 +1142,7 @@ pg_get_indexdef(PG_FUNCTION_ARGS) prettyFlags = PRETTYFLAG_INDENT; - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, prettyFlags, true); if (res == NULL) @@ -1163,7 +1163,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - prettyFlags, true); + false, prettyFlags, true); if (res == NULL) PG_RETURN_NULL(); @@ -1179,7 +1179,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) char * pg_get_indexdef_string(Oid indexrelid) { - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); } /* Internal version that just reports the column definitions */ @@ -1189,7 +1189,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) int prettyFlags; prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, prettyFlags, false); } @@ -1202,7 +1202,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) static char * pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok) {// #lizard forgives /* might want a separate isConstraint parameter later */ @@ -1318,9 +1318,11 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, if (!attrsOnly) { if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", idxrec->indisunique ? "UNIQUE " : "", quote_identifier(NameStr(idxrelrec->relname)), + idxrelrec->relkind == RELKIND_PARTITIONED_INDEX + && !inherits ? "ONLY " : "", generate_relation_name(indrelid, NIL), quote_identifier(NameStr(amrec->amname))); else /* currently, must be EXCLUDE constraint */ @@ -2237,6 +2239,7 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, operators, false, false, + false, prettyFlags, false)); break; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f9520010..55943dff 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -490,18 +490,26 @@ static void RelationParseRelOptions(Relation relation, HeapTuple tuple) {// #lizard forgives bytea *options; + amoptions_function amoptsfn; relation->rd_options = NULL; - /* Fall out if relkind should not have options */ + /* + * Look up any AM-specific parse function; fall out if relkind should not + * have options. + */ switch (relation->rd_rel->relkind) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_INDEX: case RELKIND_VIEW: case RELKIND_MATVIEW: case RELKIND_PARTITIONED_TABLE: + amoptsfn = NULL; + break; + case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: + amoptsfn = relation->rd_amroutine->amoptions; break; default: return; @@ -516,10 +524,8 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { return; } - options = extractRelOptions(tuple, - GetPgClassDescriptor(), - relation->rd_rel->relkind == RELKIND_INDEX ? - relation->rd_amroutine->amoptions : NULL); + + options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -2379,7 +2385,8 @@ RelationIdGetRelation(Oid relationId) * and we don't want to use the full-blown procedure because it's * a headache for indexes that reload itself depends on. 
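With the pg_get_indexdef_worker change above, the deparsed definition of a partitioned index carries ONLY unless the new "inherits" flag is passed, roughly:

    SELECT pg_get_indexdef('measurement_city_id_logdate_idx'::regclass);
    -- CREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement
    --     USING btree (city_id, logdate)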
*/ - if (rd->rd_rel->relkind == RELKIND_INDEX) + if (rd->rd_rel->relkind == RELKIND_INDEX || + rd->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) RelationReloadIndexInfo(rd); else RelationClearRelation(rd, true); @@ -2585,7 +2592,8 @@ RelationReloadIndexInfo(Relation relation) Form_pg_class relp; /* Should be called only for invalidated indexes */ - Assert(relation->rd_rel->relkind == RELKIND_INDEX && + Assert((relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && !relation->rd_isvalid); /* Ensure it's closed at smgr level */ @@ -2816,7 +2824,8 @@ RelationClearRelation(Relation relation, bool rebuild) { RelationInitPhysicalAddr(relation); - if (relation->rd_rel->relkind == RELKIND_INDEX) + if (relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ if (relation->rd_refcnt > 1 && IsTransactionState()) @@ -2832,7 +2841,8 @@ RelationClearRelation(Relation relation, bool rebuild) * re-read the pg_class row to handle possible physical relocation of the * index, and we check for pg_index updates too. */ - if (relation->rd_rel->relkind == RELKIND_INDEX && + if ((relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && relation->rd_refcnt > 0 && relation->rd_indexcxt != NULL) { @@ -5904,7 +5914,10 @@ load_relcache_init_file(bool shared) rel->rd_att->constr = constr; } - /* If it's an index, there's more to do */ + /* + * If it's an index, there's more to do. Note we explicitly ignore + * partitioned indexes here. + */ if (rel->rd_rel->relkind == RELKIND_INDEX) { MemoryContext indexcxt; @@ -6268,7 +6281,10 @@ write_relcache_init_file(bool shared) (rel->rd_options ? VARSIZE(rel->rd_options) : 0), fp); - /* If it's an index, there's more to do */ + /* + * If it's an index, there's more to do. Note we explicitly ignore + * partitioned indexes here. + */ if (rel->rd_rel->relkind == RELKIND_INDEX) { /* write the pg_index tuple */ diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index a03dd76c..0942fa5b 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -68,6 +68,7 @@ static int numextmembers; static void flagInhTables(TableInfo *tbinfo, int numTables, InhInfo *inhinfo, int numInherits); +static void flagInhIndexes(Archive *fout, TableInfo *tblinfo, int numTables); static void flagInhAttrs(DumpOptions *dopt, TableInfo *tblinfo, int numTables); static DumpableObject **buildIndexArray(void *objArray, int numObjs, Size objSize); @@ -76,6 +77,8 @@ static int ExtensionMemberIdCompare(const void *p1, const void *p2); static void findParentsByOid(TableInfo *self, InhInfo *inhinfo, int numInherits); static int strInArray(const char *pattern, char **arr, int arr_size); +static IndxInfo *findIndexByOid(Oid oid, DumpableObject **idxinfoindex, + int numIndexes); /* @@ -258,6 +261,10 @@ getSchemaData(Archive *fout, int *numTablesPtr) getIndexes(fout, tblinfo, numTables); if (g_verbose) + write_msg(NULL, "flagging indexes in partitioned tables\n"); + flagInhIndexes(fout, tblinfo, numTables); + + if (g_verbose) write_msg(NULL, "reading extended statistics\n"); getExtendedStatistics(fout, tblinfo, numTables); @@ -335,6 +342,89 @@ flagInhTables(TableInfo *tblinfo, int numTables, } } +/* + * flagInhIndexes - + * Create AttachIndexInfo objects for partitioned indexes, and add + * appropriate dependency links. 
+ */ +static void +flagInhIndexes(Archive *fout, TableInfo tblinfo[], int numTables) +{ + int i, + j, + k; + DumpableObject ***parentIndexArray; + + parentIndexArray = (DumpableObject ***) + pg_malloc0(getMaxDumpId() * sizeof(DumpableObject **)); + + for (i = 0; i < numTables; i++) + { + TableInfo *parenttbl; + IndexAttachInfo *attachinfo; + + if (!tblinfo[i].ispartition || tblinfo[i].numParents == 0) + continue; + + Assert(tblinfo[i].numParents == 1); + parenttbl = tblinfo[i].parents[0]; + + /* + * We need access to each parent table's index list, but there is no + * index to cover them outside of this function. To avoid having to + * sort every parent table's indexes each time we come across each of + * its partitions, create an indexed array for each parent the first + * time it is required. + */ + if (parentIndexArray[parenttbl->dobj.dumpId] == NULL) + parentIndexArray[parenttbl->dobj.dumpId] = + buildIndexArray(parenttbl->indexes, + parenttbl->numIndexes, + sizeof(IndxInfo)); + + attachinfo = (IndexAttachInfo *) + pg_malloc0(tblinfo[i].numIndexes * sizeof(IndexAttachInfo)); + for (j = 0, k = 0; j < tblinfo[i].numIndexes; j++) + { + IndxInfo *index = &(tblinfo[i].indexes[j]); + IndxInfo *parentidx; + + if (index->parentidx == 0) + continue; + + parentidx = findIndexByOid(index->parentidx, + parentIndexArray[parenttbl->dobj.dumpId], + parenttbl->numIndexes); + if (parentidx == NULL) + continue; + + attachinfo[k].dobj.objType = DO_INDEX_ATTACH; + attachinfo[k].dobj.catId.tableoid = 0; + attachinfo[k].dobj.catId.oid = 0; + AssignDumpId(&attachinfo[k].dobj); + attachinfo[k].dobj.name = pg_strdup(index->dobj.name); + attachinfo[k].parentIdx = parentidx; + attachinfo[k].partitionIdx = index; + + /* + * We want dependencies from parent to partition (so that the + * partition index is created first), and another one from + * attach object to parent (so that the partition index is + * attached once the parent index has been created). + */ + addObjectDependency(&parentidx->dobj, index->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, parentidx->dobj.dumpId); + + k++; + } + } + + for (i = 0; i < numTables; i++) + if (parentIndexArray[i]) + pg_free(parentIndexArray[i]); + pg_free(parentIndexArray); +} + /* flagInhAttrs - * for each dumpable table in tblinfo, flag its inherited attributes * @@ -808,6 +898,18 @@ findExtensionByOid(Oid oid) return (ExtensionInfo *) findObjectByOid(oid, extinfoindex, numExtensions); } +/* + * findIndexByOid + * find the entry of the index with the given oid + * + * This one's signature is different from the previous ones because we lack a + * global array of all indexes, so caller must pass their array as argument. 
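With flagInhIndexes wiring these dependencies, a dump of the measurement example ends up containing statements along these lines (order simplified, schema qualification dropped; compare the TAP test expectations later in this patch):

    CREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement
        USING btree (city_id, logdate);
    CREATE INDEX measurement_y2006m2_city_id_logdate_idx ON measurement_y2006m2
        USING btree (city_id, logdate);
    -- emitted as an "INDEX ATTACH" entry in the post-data section:
    ALTER INDEX measurement_city_id_logdate_idx
        ATTACH PARTITION measurement_y2006m2_city_id_logdate_idx;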
+ */ +static IndxInfo * +findIndexByOid(Oid oid, DumpableObject **idxinfoindex, int numIndexes) +{ + return (IndxInfo *) findObjectByOid(oid, idxinfoindex, numIndexes); +} /* * setExtensionMembership diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 97384c01..b11b02ae 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -200,6 +200,7 @@ static void dumpAttrDef(Archive *fout, AttrDefInfo *adinfo); static void dumpSequence(Archive *fout, TableInfo *tbinfo); static void dumpSequenceData(Archive *fout, TableDataInfo *tdinfo); static void dumpIndex(Archive *fout, IndxInfo *indxinfo); +static void dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo); static void dumpStatisticsExt(Archive *fout, StatsExtInfo *statsextinfo); static void dumpConstraint(Archive *fout, ConstraintInfo *coninfo); static void dumpTableConstraintComment(Archive *fout, ConstraintInfo *coninfo); @@ -6770,6 +6771,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) int i_tableoid, i_oid, i_indexname, + i_parentidx, i_indexdef, i_indnkeys, i_indkey, @@ -6791,15 +6793,17 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) { TableInfo *tbinfo = &tblinfo[i]; - /* Only plain tables and materialized views have indexes. */ - if (tbinfo->relkind != RELKIND_RELATION && - tbinfo->relkind != RELKIND_MATVIEW) - continue; if (!tbinfo->hasindex) continue; - /* Ignore indexes of tables whose definitions are not to be dumped */ - if (!(tbinfo->dobj.dump & DUMP_COMPONENT_DEFINITION)) + /* + * Ignore indexes of tables whose definitions are not to be dumped. + * + * We also need indexes on partitioned tables which have partitions to + * be dumped, in order to dump the indexes on the partitions. + */ + if (!(tbinfo->dobj.dump & DUMP_COMPONENT_DEFINITION) && + !tbinfo->interesting) continue; if (g_verbose) @@ -6822,7 +6826,39 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) * is not. 
*/ resetPQExpBuffer(query); - if (fout->remoteVersion >= 90400) + if (fout->remoteVersion >= 11000) + { + appendPQExpBuffer(query, + "SELECT t.tableoid, t.oid, " + "t.relname AS indexname, " + "inh.inhparent AS parentidx, " + "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " + "t.relnatts AS indnkeys, " + "i.indkey, i.indisclustered, " + "i.indisreplident, t.relpages, " + "c.contype, c.conname, " + "c.condeferrable, c.condeferred, " + "c.tableoid AS contableoid, " + "c.oid AS conoid, " + "pg_catalog.pg_get_constraintdef(c.oid, false) AS condef, " + "(SELECT spcname FROM pg_catalog.pg_tablespace s WHERE s.oid = t.reltablespace) AS tablespace, " + "t.reloptions AS indreloptions " + "FROM pg_catalog.pg_index i " + "JOIN pg_catalog.pg_class t ON (t.oid = i.indexrelid) " + "JOIN pg_catalog.pg_class t2 ON (t2.oid = i.indrelid) " + "LEFT JOIN pg_catalog.pg_constraint c " + "ON (i.indrelid = c.conrelid AND " + "i.indexrelid = c.conindid AND " + "c.contype IN ('p','u','x')) " + "LEFT JOIN pg_catalog.pg_inherits inh " + "ON (inh.inhrelid = indexrelid) " + "WHERE i.indrelid = '%u'::pg_catalog.oid " + "AND (i.indisvalid OR t2.relkind = 'p') " + "AND i.indisready " + "ORDER BY indexname", + tbinfo->dobj.catId.oid); + } + else if (fout->remoteVersion >= 90400) { /* * the test on indisready is necessary in 9.2, and harmless in @@ -6831,6 +6867,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6862,6 +6899,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6889,6 +6927,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6919,6 +6958,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6951,6 +6991,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); i_indexname = PQfnumber(res, "indexname"); + i_parentidx = PQfnumber(res, "parentidx"); i_indexdef = PQfnumber(res, "indexdef"); i_indnkeys = PQfnumber(res, "indnkeys"); i_indkey = PQfnumber(res, "indkey"); @@ -6967,8 +7008,10 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_tablespace = PQfnumber(res, "tablespace"); i_indreloptions = PQfnumber(res, "indreloptions"); - indxinfo = (IndxInfo *) pg_malloc(ntups * sizeof(IndxInfo)); + tbinfo->indexes = indxinfo = + (IndxInfo *) pg_malloc(ntups * sizeof(IndxInfo)); constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); + tbinfo->numIndexes = ntups; for (j = 0; j < ntups; j++) { @@ -6978,6 +7021,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) indxinfo[j].dobj.catId.tableoid = atooid(PQgetvalue(res, j, i_tableoid)); indxinfo[j].dobj.catId.oid 
= atooid(PQgetvalue(res, j, i_oid)); AssignDumpId(&indxinfo[j].dobj); + indxinfo[j].dobj.dump = tbinfo->dobj.dump; indxinfo[j].dobj.name = pg_strdup(PQgetvalue(res, j, i_indexname)); indxinfo[j].dobj.namespace = tbinfo->dobj.namespace; indxinfo[j].indextable = tbinfo; @@ -6990,6 +7034,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) indxinfo[j].indkeys, indxinfo[j].indnkeys); indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't'); indxinfo[j].indisreplident = (PQgetvalue(res, j, i_indisreplident)[0] == 't'); + indxinfo[j].parentidx = atooid(PQgetvalue(res, j, i_parentidx)); indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages)); contype = *(PQgetvalue(res, j, i_contype)); @@ -7003,6 +7048,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) constrinfo[j].dobj.catId.tableoid = atooid(PQgetvalue(res, j, i_contableoid)); constrinfo[j].dobj.catId.oid = atooid(PQgetvalue(res, j, i_conoid)); AssignDumpId(&constrinfo[j].dobj); + constrinfo[j].dobj.dump = tbinfo->dobj.dump; constrinfo[j].dobj.name = pg_strdup(PQgetvalue(res, j, i_conname)); constrinfo[j].dobj.namespace = tbinfo->dobj.namespace; constrinfo[j].contable = tbinfo; @@ -9773,6 +9819,9 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj) case DO_INDEX: dumpIndex(fout, (IndxInfo *) dobj); break; + case DO_INDEX_ATTACH: + dumpIndexAttach(fout, (IndexAttachInfo *) dobj); + break; case DO_STATSEXT: dumpStatisticsExt(fout, (StatsExtInfo *) dobj); break; @@ -16528,6 +16577,42 @@ dumpIndex(Archive *fout, IndxInfo *indxinfo) destroyPQExpBuffer(labelq); } +/* + * dumpIndexAttach + * write out to fout a partitioned-index attachment clause + */ +void +dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) +{ + if (fout->dopt->dataOnly) + return; + + if (attachinfo->partitionIdx->dobj.dump & DUMP_COMPONENT_DEFINITION) + { + PQExpBuffer q = createPQExpBuffer(); + + appendPQExpBuffer(q, "\nALTER INDEX %s ", + fmtQualifiedId(fout->remoteVersion, + attachinfo->parentIdx->dobj.namespace->dobj.name, + attachinfo->parentIdx->dobj.name)); + appendPQExpBuffer(q, "ATTACH PARTITION %s;\n", + fmtQualifiedId(fout->remoteVersion, + attachinfo->partitionIdx->dobj.namespace->dobj.name, + attachinfo->partitionIdx->dobj.name)); + + ArchiveEntry(fout, attachinfo->dobj.catId, attachinfo->dobj.dumpId, + attachinfo->dobj.name, + NULL, NULL, + "", + false, "INDEX ATTACH", SECTION_POST_DATA, + q->data, "", NULL, + NULL, 0, + NULL, NULL); + + destroyPQExpBuffer(q); + } +} + /* * dumpStatisticsExt * write out to fout an extended statistics object @@ -18188,6 +18273,7 @@ addBoundaryDependencies(DumpableObject **dobjs, int numObjs, addObjectDependency(postDataBound, dobj->dumpId); break; case DO_INDEX: + case DO_INDEX_ATTACH: case DO_STATSEXT: case DO_REFRESH_MATVIEW: case DO_TRIGGER: diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 1cff1a8b..133a66a9 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -116,6 +116,7 @@ typedef enum DO_TABLE, DO_ATTRDEF, DO_INDEX, + DO_INDEX_ATTACH, DO_STATSEXT, DO_RULE, DO_TRIGGER, @@ -418,6 +419,8 @@ typedef struct _tableInfo */ int numParents; /* number of (immediate) parent tables */ struct _tableInfo **parents; /* TableInfos of immediate parents */ + int numIndexes; /* number of indexes */ + struct _indxInfo *indexes; /* indexes */ struct _tableDataInfo *dataObj; /* TableDataInfo, if dumping its data */ int numTriggers; /* number of triggers for table */ struct _triggerInfo *triggers; /* array of TriggerInfo structs */ @@ 
-451,11 +454,19 @@ typedef struct _indxInfo Oid *indkeys; bool indisclustered; bool indisreplident; + Oid parentidx; /* if partitioned, parent index OID */ /* if there is an associated constraint object, its dumpId: */ DumpId indexconstraint; int relpages; /* relpages of the underlying table */ } IndxInfo; +typedef struct _indexAttachInfo +{ + DumpableObject dobj; + IndxInfo *parentIdx; /* link to index on partitioned table */ + IndxInfo *partitionIdx; /* link to index on partition */ +} IndexAttachInfo; + typedef struct _statsExtInfo { DumpableObject dobj; diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 3bf17ece..46b38750 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -35,6 +35,10 @@ static const char *modulename = gettext_noop("sorter"); * pg_dump.c; that is, PRE_DATA objects must sort before DO_PRE_DATA_BOUNDARY, * POST_DATA objects must sort after DO_POST_DATA_BOUNDARY, and DATA objects * must sort between them. + * + * Note: sortDataAndIndexObjectsBySize wants to have all DO_TABLE_DATA and + * DO_INDEX objects in contiguous chunks, so do not reuse the values for those + * for other object types. */ static const int dbObjectTypePriority[] = { @@ -53,11 +57,12 @@ static const int dbObjectTypePriority[] = 18, /* DO_TABLE */ 20, /* DO_ATTRDEF */ 28, /* DO_INDEX */ - 29, /* DO_STATSEXT */ - 30, /* DO_RULE */ - 31, /* DO_TRIGGER */ + 29, /* DO_INDEX_ATTACH */ + 30, /* DO_STATSEXT */ + 31, /* DO_RULE */ + 32, /* DO_TRIGGER */ 27, /* DO_CONSTRAINT */ - 32, /* DO_FK_CONSTRAINT */ + 33, /* DO_FK_CONSTRAINT */ 2, /* DO_PROCLANG */ 10, /* DO_CAST */ 23, /* DO_TABLE_DATA */ @@ -69,18 +74,18 @@ static const int dbObjectTypePriority[] = 15, /* DO_TSCONFIG */ 16, /* DO_FDW */ 17, /* DO_FOREIGN_SERVER */ - 32, /* DO_DEFAULT_ACL */ + 33, /* DO_DEFAULT_ACL */ 3, /* DO_TRANSFORM */ 21, /* DO_BLOB */ 25, /* DO_BLOB_DATA */ 22, /* DO_PRE_DATA_BOUNDARY */ 26, /* DO_POST_DATA_BOUNDARY */ - 33, /* DO_EVENT_TRIGGER */ - 38, /* DO_REFRESH_MATVIEW */ - 34, /* DO_POLICY */ - 35, /* DO_PUBLICATION */ - 36, /* DO_PUBLICATION_REL */ - 37 /* DO_SUBSCRIPTION */ + 34, /* DO_EVENT_TRIGGER */ + 39, /* DO_REFRESH_MATVIEW */ + 35, /* DO_POLICY */ + 36, /* DO_PUBLICATION */ + 37, /* DO_PUBLICATION_REL */ + 38 /* DO_SUBSCRIPTION */ }; static DumpId preDataBoundId; @@ -937,6 +942,13 @@ repairDomainConstraintMultiLoop(DumpableObject *domainobj, addObjectDependency(constraintobj, postDataBoundId); } +static void +repairIndexLoop(DumpableObject *partedindex, + DumpableObject *partindex) +{ + removeObjectDependency(partedindex, partindex->dumpId); +} + /* * Fix a dependency loop, or die trying ... 
* @@ -1099,6 +1111,23 @@ repairDependencyLoop(DumpableObject **loop, return; } + /* index on partitioned table and corresponding index on partition */ + if (nLoop == 2 && + loop[0]->objType == DO_INDEX && + loop[1]->objType == DO_INDEX) + { + if (((IndxInfo *) loop[0])->parentidx == loop[1]->catId.oid) + { + repairIndexLoop(loop[0], loop[1]); + return; + } + else if (((IndxInfo *) loop[1])->parentidx == loop[0]->catId.oid) + { + repairIndexLoop(loop[1], loop[0]); + return; + } + } + /* Indirect loop involving table and attribute default */ if (nLoop > 2) { @@ -1292,6 +1321,11 @@ describeDumpableObject(DumpableObject *obj, char *buf, int bufsize) "INDEX %s (ID %d OID %u)", obj->name, obj->dumpId, obj->catId.oid); return; + case DO_INDEX_ATTACH: + snprintf(buf, bufsize, + "INDEX ATTACH %s (ID %d)", + obj->name, obj->dumpId); + return; case DO_STATSEXT: snprintf(buf, bufsize, "STATISTICS %s (ID %d OID %u)", diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index c492fbdc..360d5954 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5163,6 +5163,101 @@ section_pre_data => 1, test_schema_plus_blobs => 1, }, }, + 'CREATE INDEX ON ONLY measurement' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + create_order => 92, + create_sql => 'CREATE INDEX ON dump_test.measurement (city_id, logdate);', + regexp => qr/^ + \QCREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement USING\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + only_dump_test_schema => 1, + pg_dumpall_dbprivs => 1, + schema_only => 1, + section_post_data => 1, + test_schema_plus_blobs => 1, + with_oids => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + role => 1, + section_pre_data => 1, }, }, + + 'CREATE INDEX ... ON measurement_y2006_m2' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + regexp => qr/^ + \QCREATE INDEX measurement_y2006m2_city_id_logdate_idx ON measurement_y2006m2 \E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + + 'ALTER INDEX ... ATTACH PARTITION' => { + all_runs => 1, + catch_all => 'CREATE ... 
commands', + regexp => qr/^ + \QALTER INDEX dump_test.measurement_city_id_logdate_idx ATTACH PARTITION dump_test_second_schema.measurement_y2006m2_city_id_logdate_idx\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + 'CREATE VIEW test_view' => { all_runs => 1, catch_all => 'CREATE ... commands', diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index c67d2570..64bb8794 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1622,7 +1622,8 @@ describeOneTableDetails(const char *schemaname, appendPQExpBufferStr(&buf, ",\n a.attidentity"); else appendPQExpBufferStr(&buf, ",\n ''::pg_catalog.char AS attidentity"); - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) appendPQExpBufferStr(&buf, ",\n pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE) AS indexdef"); else appendPQExpBufferStr(&buf, ",\n NULL AS indexdef"); @@ -1687,6 +1688,7 @@ describeOneTableDetails(const char *schemaname, schemaname, relationname); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: if (tableinfo.relpersistence == 'u') printfPQExpBuffer(&title, _("Unlogged index \"%s.%s\""), schemaname, relationname); @@ -1747,7 +1749,8 @@ describeOneTableDetails(const char *schemaname, if (tableinfo.relkind == RELKIND_SEQUENCE) headers[cols++] = gettext_noop("Value"); - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) headers[cols++] = gettext_noop("Definition"); if (tableinfo.relkind == RELKIND_FOREIGN_TABLE && pset.sversion >= 90200) @@ -1757,6 +1760,7 @@ describeOneTableDetails(const char *schemaname, { headers[cols++] = gettext_noop("Storage"); if (tableinfo.relkind == RELKIND_RELATION || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || tableinfo.relkind == RELKIND_MATVIEW || tableinfo.relkind == RELKIND_FOREIGN_TABLE || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) @@ -1833,7 +1837,8 @@ describeOneTableDetails(const char *schemaname, printTableAddCell(&cont, seq_values[i], false, false); /* Expression for index column */ - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) printTableAddCell(&cont, PQgetvalue(res, i, 7), false, false); /* FDW options for foreign table column, only for 9.2 or later */ @@ -1856,6 +1861,7 @@ describeOneTableDetails(const char *schemaname, /* Statistics target, if the relkind supports this feature */ if (tableinfo.relkind == RELKIND_RELATION || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || tableinfo.relkind == RELKIND_MATVIEW || tableinfo.relkind == RELKIND_FOREIGN_TABLE || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) @@ -1945,7 +1951,8 @@ describeOneTableDetails(const char *schemaname, PQclear(result); } - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) { /* Footer information about an index */ PGresult 
*result; @@ -3631,6 +3638,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys " WHEN 's' THEN '%s'" " WHEN " CppAsString2(RELKIND_FOREIGN_TABLE) " THEN '%s'" " WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'" + " WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'" " END as \"%s\",\n" " pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"", gettext_noop("Schema"), @@ -3643,6 +3651,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys gettext_noop("special"), gettext_noop("foreign table"), gettext_noop("table"), /* partitioned table */ + gettext_noop("index"), /* partitioned index */ gettext_noop("Type"), gettext_noop("Owner")); @@ -3699,7 +3708,8 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys if (showMatViews) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_MATVIEW) ","); if (showIndexes) - appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) ","); + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) "," + CppAsString2(RELKIND_PARTITIONED_INDEX) ","); if (showSeq) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_SEQUENCE) ","); if (showSystem || pattern) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index fd1c4a5e..db21cc50 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -412,7 +412,8 @@ static const SchemaQuery Query_for_list_of_indexes = { /* catname */ "pg_catalog.pg_class c", /* selcondition */ - "c.relkind IN (" CppAsString2(RELKIND_INDEX) ")", + "c.relkind IN (" CppAsString2(RELKIND_INDEX) ", " + CppAsString2(RELKIND_PARTITIONED_INDEX) ")", /* viscondition */ "pg_catalog.pg_table_is_visible(c.oid)", /* namespace */ @@ -604,6 +605,23 @@ static const SchemaQuery Query_for_list_of_tmf = { NULL }; +static const SchemaQuery Query_for_list_of_tpm = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind IN (" CppAsString2(RELKIND_RELATION) ", " + CppAsString2(RELKIND_PARTITIONED_TABLE) ", " + CppAsString2(RELKIND_MATVIEW) ")", + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + static const SchemaQuery Query_for_list_of_tm = { /* catname */ "pg_catalog.pg_class c", @@ -1706,7 +1724,12 @@ psql_completion(const char *text, int start, int end) "UNION SELECT 'ALL IN TABLESPACE'"); /* ALTER INDEX */ else if (Matches3("ALTER", "INDEX", MatchAny)) - COMPLETE_WITH_LIST4("OWNER TO", "RENAME TO", "SET", "RESET"); + COMPLETE_WITH_LIST5("OWNER TO", "RENAME TO", "SET", + "RESET", "ATTACH PARTITION"); + else if (Matches4("ALTER", "INDEX", MatchAny, "ATTACH")) + COMPLETE_WITH_CONST("PARTITION"); + else if (Matches5("ALTER", "INDEX", MatchAny, "ATTACH", "PARTITION")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, NULL); /* ALTER INDEX SET */ else if (Matches4("ALTER", "INDEX", MatchAny, "SET")) COMPLETE_WITH_LIST2("(", "TABLESPACE"); @@ -2372,10 +2395,13 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, " UNION SELECT 'ON'" " UNION SELECT 'CONCURRENTLY'"); - /* Complete ... INDEX|CONCURRENTLY [] ON with a list of tables */ + /* + * Complete ... 
INDEX|CONCURRENTLY [] ON with a list of relations + * that can indexes can be created on + */ else if (TailMatches3("INDEX|CONCURRENTLY", MatchAny, "ON") || TailMatches2("INDEX|CONCURRENTLY", "ON")) - COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tm, NULL); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tpm, NULL); /* * Complete CREATE|UNIQUE INDEX CONCURRENTLY with "ON" and existing diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 6af0f85d..c2c0c9b1 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -110,6 +110,20 @@ * Example: a trigger that's created to enforce a foreign-key constraint * is made internally dependent on the constraint's pg_constraint entry. * + * DEPENDENCY_INTERNAL_AUTO ('I'): the dependent object was created as + * part of creation of the referenced object, and is really just a part + * of its internal implementation. A DROP of the dependent object will + * be disallowed outright (we'll tell the user to issue a DROP against the + * referenced object, instead). While a regular internal dependency will + * prevent the dependent object from being dropped while any such + * dependencies remain, DEPENDENCY_INTERNAL_AUTO will allow such a drop as + * long as the object can be found by following any of such dependencies. + * Example: an index on a partition is made internal-auto-dependent on + * both the partition itself as well as on the index on the parent + * partitioned table; so the partition index is dropped together with + * either the partition it indexes, or with the parent index it is attached + * to. + * DEPENDENCY_EXTENSION ('e'): the dependent object is a member of the * extension that is the referenced object. The dependent object can be * dropped only via DROP EXTENSION on the referenced object. 
Functionally @@ -136,6 +150,7 @@ typedef enum DependencyType DEPENDENCY_NORMAL = 'n', DEPENDENCY_AUTO = 'a', DEPENDENCY_INTERNAL = 'i', + DEPENDENCY_INTERNAL_AUTO = 'I', DEPENDENCY_EXTENSION = 'e', DEPENDENCY_AUTO_EXTENSION = 'x', DEPENDENCY_PIN = 'p' diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 4928dfd1..3a7ed05f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -108,10 +108,13 @@ extern void index_check_primary_key(Relation heapRel, #define INDEX_CREATE_SKIP_BUILD (1 << 2) #define INDEX_CREATE_CONCURRENT (1 << 3) #define INDEX_CREATE_IF_NOT_EXISTS (1 << 4) +#define INDEX_CREATE_PARTITIONED (1 << 5) +#define INDEX_CREATE_INVALID (1 << 6) extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, + Oid parentIndexRelid, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -145,6 +148,11 @@ extern void index_drop(Oid indexId, bool concurrent); extern IndexInfo *BuildIndexInfo(Relation index); +extern bool CompareIndexInfo(IndexInfo *info1, IndexInfo *info2, + Oid *collations1, Oid *collations2, + Oid *opfamilies1, Oid *opfamilies2, + AttrNumber *attmap, int maplen); + extern void BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii); extern void FormIndexDatum(IndexInfo *indexInfo, @@ -199,4 +207,6 @@ extern Oid IndexGetRelation(Oid indexId, bool missing_ok); extern bool index_is_interval(Oid indexId); #endif +extern void IndexSetParentIndex(Relation idx, Oid parentOid); + #endif /* INDEX_H */ diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index 15929163..4ad3a5a6 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -183,6 +183,7 @@ DESCR(""); #define RELKIND_COMPOSITE_TYPE 'c' /* composite type */ #define RELKIND_FOREIGN_TABLE 'f' /* foreign table */ #define RELKIND_PARTITIONED_TABLE 'p' /* partitioned table */ +#define RELKIND_PARTITIONED_INDEX 'I' /* partitioned index */ #define RELPERSISTENCE_PERMANENT 'p' /* regular table */ #define RELPERSISTENCE_UNLOGGED 'u' /* unlogged permanent table */ diff --git a/src/include/catalog/pg_inherits_fn.h b/src/include/catalog/pg_inherits_fn.h index 3c371890..09663312 100644 --- a/src/include/catalog/pg_inherits_fn.h +++ b/src/include/catalog/pg_inherits_fn.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_inherits_fn.h - * prototypes for functions in catalog/pg_inherits.c + * prototypes for functions in catalog/pg_inherits.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -19,9 +19,12 @@ extern List *find_inheritance_children(Oid parentrelId, LOCKMODE lockmode); extern List *find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, - List **parents); + List **parents); extern bool has_subclass(Oid relationId); extern bool has_superclass(Oid relationId); extern bool typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId); +extern void StoreSingleInheritance(Oid relationId, Oid parentOid, + int32 seqNumber); +extern bool DeleteInheritsTuple(Oid inhrelid, Oid inhparent); -#endif /* PG_INHERITS_FN_H */ +#endif /* PG_INHERITS_FN_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 2e4f2c44..377f9f94 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -25,12 +25,13 @@ extern void RemoveObjects(DropStmt *stmt); extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, + Oid parentIndexId, bool is_alter_table, bool 
check_rights, bool check_not_in_use, bool skip_build, bool quiet); -extern Oid ReindexIndex(RangeVar *indexRelation, int options); +extern void ReindexIndex(RangeVar *indexRelation, int options); extern Oid ReindexTable(RangeVar *relation, int options); extern void ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, int options); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index bed56a23..ddb99ddf 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -152,6 +152,7 @@ typedef struct IndexInfo bool ii_ReadyForInserts; bool ii_Concurrent; bool ii_BrokenHotChain; + Oid ii_Am; void *ii_AmCache; MemoryContext ii_Context; } IndexInfo; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index acc64eb0..df2746c9 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -918,7 +918,7 @@ typedef struct PartitionRangeDatum } PartitionRangeDatum; /* - * PartitionCmd - info for ALTER TABLE ATTACH/DETACH PARTITION commands + * PartitionCmd - info for ALTER TABLE/INDEX ATTACH/DETACH PARTITION commands */ typedef struct PartitionCmd { @@ -2859,6 +2859,10 @@ typedef struct FetchStmt * index, just a UNIQUE/PKEY constraint using an existing index. isconstraint * must always be true in this case, and the fields describing the index * properties are empty. + * + * The relation to build the index on can be represented either by name + * (in which case the RangeVar indicates whether to recurse or not) or by OID + * (in which case the command is always recursive). * ---------------------- */ typedef struct IndexStmt @@ -2866,6 +2870,7 @@ typedef struct IndexStmt NodeTag type; char *idxname; /* name of new index, or NULL for default */ RangeVar *relation; /* relation to build index on */ + Oid relationId; /* OID of relation to build index on */ char *accessMethod; /* name of access method (eg. 
btree) */ char *tableSpace; /* tablespace, or NULL for default */ List *indexParams; /* columns to index: a list of IndexElem */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index adde3238..e527a119 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -101,5 +101,8 @@ extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexc #endif extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent, PartitionBoundSpec *spec); +extern IndexStmt *generateClonedIndexStmt(RangeVar *heapRel, Oid heapOid, + Relation source_idx, + const AttrNumber *attmap, int attmap_length); #endif /* PARSE_UTILCMD_H */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 9ace1f7a..626f34f7 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -1967,6 +1967,67 @@ create table tab1 (a int, b text); create table tab2 (x int, y tab1); alter table tab1 alter column b type varchar; -- fails ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + -- disallow recursive containment of row types create temp table recur1 (f1 int); alter table recur1 add column f2 recur1; -- fails @@ -3168,7 +3229,7 @@ CREATE TABLE unparted ( ); CREATE TABLE fail_part (like unparted); ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); -ERROR: "unparted" is not partitioned +ERROR: table 
"unparted" is not partitioned DROP TABLE unparted, fail_part; -- check that partition bound is compatible CREATE TABLE list_parted ( @@ -3563,7 +3624,7 @@ DROP TABLE fail_part; -- check that the table is partitioned at all CREATE TABLE regular_table (a int); ALTER TABLE regular_table DETACH PARTITION any_name; -ERROR: "regular_table" is not partitioned +ERROR: table "regular_table" is not partitioned DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out new file mode 100644 index 00000000..e9cccca8 --- /dev/null +++ b/src/test/regress/expected/indexing.out @@ -0,0 +1,757 @@ +-- Creating an index on a partitioned table makes the partitions +-- automatically get the index +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create table idxpart2 partition of idxpart for values from (10) to (100) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (100); +create index on idxpart (a); +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +-----------------+---------+---------------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | idxpart_a_idx + idxpart2 | p | + idxpart21 | r | + idxpart21_a_idx | i | idxpart2_a_idx + idxpart2_a_idx | I | idxpart_a_idx + idxpart_a_idx | I | +(8 rows) + +drop table idxpart; +-- Some unsupported features +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create unique index on idxpart (a); +ERROR: cannot create unique index on partitioned table "idxpart" +create index concurrently on idxpart (a); +ERROR: cannot create index on partitioned table "idxpart" concurrently +drop table idxpart; +-- If a table without index is attached as partition to a table with +-- an index, the index is automatically created +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +drop table idxpart; +-- If a partition already has an index, don't create a duplicative one +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index on idxpart1 (a, b); +create index on idxpart (a, b); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition of: idxpart FOR 
VALUES FROM (0, 0) TO (10, 10) +Indexes: + "idxpart1_a_b_idx" btree (a, b) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------------- + idxpart | p | + idxpart1 | r | + idxpart1_a_b_idx | i | idxpart_a_b_idx + idxpart_a_b_idx | I | +(4 rows) + +drop table idxpart; +-- DROP behavior for partitioned indexes +create table idxpart (a int) partition by range (a); +create index on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +drop index idxpart1_a_idx; -- no way +ERROR: cannot drop index idxpart1_a_idx because index idxpart_a_idx requires it +HINT: You can drop index idxpart_a_idx instead. +drop index idxpart_a_idx; -- both indexes go away +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; + relname | relkind +----------+--------- + idxpart | p + idxpart1 | r +(2 rows) + +create index on idxpart (a); +drop table idxpart1; -- the index on partition goes away too +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; + relname | relkind +---------------+--------- + idxpart | p + idxpart_a_idx | I +(2 rows) + +drop table idxpart; +-- ALTER INDEX .. ATTACH, error cases +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index idxpart_a_b_idx on only idxpart (a, b); +create index idxpart1_a_b_idx on idxpart1 (a, b); +create index idxpart1_tst1 on idxpart1 (b, a); +create index idxpart1_tst2 on idxpart1 using hash (a); +create index idxpart1_tst3 on idxpart1 (a, b) where a > 10; +alter index idxpart attach partition idxpart1; +ERROR: "idxpart" is not an index +alter index idxpart_a_b_idx attach partition idxpart1; +ERROR: "idxpart1" is not an index +alter index idxpart_a_b_idx attach partition idxpart_a_b_idx; +ERROR: cannot attach index "idxpart_a_b_idx" as a partition of index "idxpart_a_b_idx" +DETAIL: Index "idxpart_a_b_idx" is not an index on any partition of table "idxpart". +alter index idxpart_a_b_idx attach partition idxpart1_b_idx; +ERROR: relation "idxpart1_b_idx" does not exist +alter index idxpart_a_b_idx attach partition idxpart1_tst1; +ERROR: cannot attach index "idxpart1_tst1" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +alter index idxpart_a_b_idx attach partition idxpart1_tst2; +ERROR: cannot attach index "idxpart1_tst2" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +alter index idxpart_a_b_idx attach partition idxpart1_tst3; +ERROR: cannot attach index "idxpart1_tst3" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +-- OK +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; -- quiet +-- reject dupe +create index idxpart1_2_a_b on idxpart1 (a, b); +alter index idxpart_a_b_idx attach partition idxpart1_2_a_b; +ERROR: cannot attach index "idxpart1_2_a_b" as a partition of index "idxpart_a_b_idx" +DETAIL: Another index is already attached for partition "idxpart1". 
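-- (Editorial aside, not part of the captured test output: a minimal sketch of the
-- successful ALTER INDEX ... ATTACH PARTITION workflow that the error cases above
-- are probing.  All object names here are hypothetical.)
create table sk_parted (a int, b int) partition by range (a, b);
create table sk_part1 partition of sk_parted for values from (0, 0) to (10, 10);
create index sk_parted_a_b_idx on only sk_parted (a, b);   -- ON ONLY: created INVALID on the parent
create index sk_part1_a_b_idx on sk_part1 (a, b);          -- matching index on the partition
alter index sk_parted_a_b_idx attach partition sk_part1_a_b_idx;  -- parent index becomes valid
drop table sk_parted;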
+drop table idxpart; +-- make sure everything's gone +select indexrelid::regclass, indrelid::regclass + from pg_index where indexrelid::regclass::text like 'idxpart%'; + indexrelid | indrelid +------------+---------- +(0 rows) + +-- Don't auto-attach incompatible indexes +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create index on idxpart1 using hash (a); +create index on idxpart1 (a) where b > 1; +create index on idxpart1 ((a + 0)); +create index on idxpart1 (a, a); +create index on idxpart (a); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_a_a1_idx" btree (a, a) + "idxpart1_a_idx" hash (a) + "idxpart1_a_idx1" btree (a) WHERE b > 1 + "idxpart1_a_idx2" btree (a) + "idxpart1_expr_idx" btree ((a + 0)) + +drop table idxpart; +-- If CREATE INDEX ONLY, don't create indexes on partitions; and existing +-- indexes on partitions don't change parent. ALTER INDEX ATTACH can change +-- the parent after the fact. +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +create index on idxpart (a); +-- Here we expect that idxpart1 and idxpart2 have a new index, but idxpart21 +-- does not; also, idxpart22 is not attached. +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (100) +Indexes: + "idxpart1_a_idx" btree (a) + +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) INVALID +Number of partitions: 2 (Use \d+ to list them.) 
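-- (Aside, not part of the captured output: idxpart2_a_idx shows as INVALID above
-- because it was created with CREATE INDEX ... ON ONLY and no index has yet been
-- attached for its partitions idxpart21 and idxpart22.  An illustrative way to
-- check validity from the catalogs:)
select c.relname, i.indisvalid
  from pg_class c
  join pg_index i on i.indexrelid = c.oid
 where c.relname = 'idxpart2_a_idx';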
+ +\d idxpart21 + Table "public.idxpart21" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart2 FOR VALUES FROM (100) TO (200) + +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + indexrelid | indrelid | inhparent +-----------------+-----------+--------------- + idxpart_a_idx | idxpart | + idxpart1_a_idx | idxpart1 | idxpart_a_idx + idxpart2_a_idx | idxpart2 | idxpart_a_idx + idxpart22_a_idx | idxpart22 | +(4 rows) + +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + indexrelid | indrelid | inhparent +-----------------+-----------+---------------- + idxpart_a_idx | idxpart | + idxpart1_a_idx | idxpart1 | idxpart_a_idx + idxpart2_a_idx | idxpart2 | idxpart_a_idx + idxpart22_a_idx | idxpart22 | idxpart2_a_idx +(4 rows) + +-- attaching idxpart22 is not enough to set idxpart22_a_idx valid ... +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) INVALID +Number of partitions: 2 (Use \d+ to list them.) + +-- ... but this one is. +create index on idxpart21 (a); +alter index idxpart2_a_idx attach partition idxpart21_a_idx; +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) +Number of partitions: 2 (Use \d+ to list them.) + +drop table idxpart; +-- When a table is attached a partition and it already has an index, a +-- duplicate index should not get created, but rather the index becomes +-- attached to the parent's index. 
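-- (Compact, hypothetical illustration of the rule stated above; the actual
-- regression test follows below.)
create table demo_parted (a int) partition by range (a);
create index demo_parted_a_idx on demo_parted (a);
create table demo_part1 (a int);
create index demo_part1_a_idx on demo_part1 (a);   -- pre-existing, matching index
alter table demo_parted attach partition demo_part1 for values from (0) to (10);
-- demo_part1_a_idx is now attached under demo_parted_a_idx; no duplicate index is created.
drop table demo_parted;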
+create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart including indexes); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | + idxpart1_b_c_idx | i | + idxparti | I | + idxparti2 | I | +(6 rows) + +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | idxparti + idxpart1_b_c_idx | i | idxparti2 + idxparti | I | + idxparti2 | I | +(6 rows) + +drop table idxpart; +-- Verify that attaching an invalid index does not mark the parent index valid. 
+-- On the other hand, attaching a valid index marks not only its direct +-- ancestor valid, but also any indirect ancestor that was only missing the one +-- that was just made valid +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 partition of idxpart for values from (1) to (1000) partition by range (a); +create table idxpart11 partition of idxpart1 for values from (1) to (100); +create index on only idxpart1 (a); +create index on only idxpart (a); +-- this results in two invalid indexes: +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +----------------+------------ + idxpart1_a_idx | f + idxpart_a_idx | f +(2 rows) + +-- idxpart1_a_idx is not valid, so idxpart_a_idx should not become valid: +alter index idxpart_a_idx attach partition idxpart1_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +----------------+------------ + idxpart1_a_idx | f + idxpart_a_idx | f +(2 rows) + +-- after creating and attaching this, both idxpart1_a_idx and idxpart_a_idx +-- should become valid +create index on idxpart11 (a); +alter index idxpart1_a_idx attach partition idxpart11_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +-----------------+------------ + idxpart11_a_idx | t + idxpart1_a_idx | t + idxpart_a_idx | t +(3 rows) + +drop table idxpart; +-- verify dependency handling during ALTER TABLE DETACH PARTITION +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i + idxpart_a_idx | I +(8 rows) + +-- a) after detaching partitions, the indexes can be dropped independently +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart1_a_idx; +drop index idxpart2_a_idx; +drop index idxpart3_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------------+--------- + idxpart | p + idxpart1 | r + idxpart2 | r + idxpart3 | r + idxpart_a_idx | I +(5 rows) + +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------+--------- +(0 rows) + +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +-- b) 
after detaching, dropping the index on parent does not remove the others +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i + idxpart_a_idx | I +(8 rows) + +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i +(7 rows) + +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------+--------- +(0 rows) + +-- Verify that expression indexes inherit correctly +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 ((a + b)); +create index on idxpart ((a + b)); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-------------------+------------------+-------------------------------------------------------------------- + idxpart1_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart1_expr_idx ON idxpart1 USING btree (((a + b))) + idxpart2_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart2_expr_idx ON idxpart2 USING btree (((a + b))) + idxpart3_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart3_expr_idx ON idxpart3 USING btree (((a + b))) +(3 rows) + +drop table idxpart; +-- Verify behavior for collation (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a collate "POSIX"); +create index on idxpart2 (a); +create index on idxpart2 (a collate "C"); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a collate "C"); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+------------------------------------------------------------------------- + idxpart1_a_idx | idxpart_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a COLLATE "C") + idxpart2_a_idx | | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a COLLATE "POSIX") + idxpart2_a_idx1 | | CREATE INDEX 
idxpart2_a_idx1 ON idxpart2 USING btree (a) + idxpart2_a_idx2 | idxpart_a_idx | CREATE INDEX idxpart2_a_idx2 ON idxpart2 USING btree (a COLLATE "C") + idxpart3_a_idx | idxpart_a_idx | CREATE INDEX idxpart3_a_idx ON idxpart3 USING btree (a COLLATE "C") + idxpart4_a_idx | idxpart_a_idx | CREATE INDEX idxpart4_a_idx ON idxpart4 USING btree (a COLLATE "C") + idxpart_a_idx | | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a COLLATE "C") +(7 rows) + +drop table idxpart; +-- Verify behavior for opclass (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a text_pattern_ops); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +-- must *not* have attached the index we created on idxpart2 +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+----------------------------------------------------------------------------- + idxpart1_a_idx | idxpart_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a text_pattern_ops) + idxpart2_a_idx | | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) + idxpart2_a_idx1 | idxpart_a_idx | CREATE INDEX idxpart2_a_idx1 ON idxpart2 USING btree (a text_pattern_ops) + idxpart3_a_idx | idxpart_a_idx | CREATE INDEX idxpart3_a_idx ON idxpart3 USING btree (a text_pattern_ops) + idxpart4_a_idx | idxpart_a_idx | CREATE INDEX idxpart4_a_idx ON idxpart4 USING btree (a text_pattern_ops) + idxpart_a_idx | | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a text_pattern_ops) +(6 rows) + +drop index idxpart_a_idx; +create index on only idxpart (a text_pattern_ops); +-- must reject +alter index idxpart_a_idx attach partition idxpart2_a_idx; +ERROR: cannot attach index "idxpart2_a_idx" as a partition of index "idxpart_a_idx" +DETAIL: The index definitions do not match. +drop table idxpart; +-- Verify that attaching indexes maps attribute numbers correctly +create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); +create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2, drop column col3; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create index idxpart_1_idx on only idxpart (b, a); +create index idxpart1_1_idx on idxpart1 (b, a); +create index idxpart1_1b_idx on idxpart1 (b); +-- test expressions and partial-index predicate, too +create index idxpart_2_idx on only idxpart ((b + a)) where a > 1; +create index idxpart1_2_idx on idxpart1 ((b + a)) where a > 1; +create index idxpart1_2b_idx on idxpart1 ((a + b)) where a > 1; +create index idxpart1_2c_idx on idxpart1 ((b + a)) where b > 1; +alter index idxpart_1_idx attach partition idxpart1_1b_idx; -- fail +ERROR: cannot attach index "idxpart1_1b_idx" as a partition of index "idxpart_1_idx" +DETAIL: The index definitions do not match. 
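-- (Aside: the ATTACH above is rejected because idxpart1_1b_idx indexes (b) only,
-- while idxpart_1_idx is defined on (b, a); the column list, any expressions and
-- the predicate must all match once attribute numbers are mapped.  Illustrative
-- comparison, not part of the test output:)
select pg_get_indexdef('idxpart_1_idx'::regclass)   as parent_def,
       pg_get_indexdef('idxpart1_1b_idx'::regclass) as partition_def;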
+alter index idxpart_1_idx attach partition idxpart1_1_idx; +alter index idxpart_2_idx attach partition idxpart1_2b_idx; -- fail +ERROR: cannot attach index "idxpart1_2b_idx" as a partition of index "idxpart_2_idx" +DETAIL: The index definitions do not match. +alter index idxpart_2_idx attach partition idxpart1_2c_idx; -- fail +ERROR: cannot attach index "idxpart1_2c_idx" as a partition of index "idxpart_2_idx" +DETAIL: The index definitions do not match. +alter index idxpart_2_idx attach partition idxpart1_2_idx; -- ok +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+---------------------------------------------------------------------------------- + idxpart1_1_idx | idxpart_1_idx | CREATE INDEX idxpart1_1_idx ON idxpart1 USING btree (b, a) + idxpart1_1b_idx | | CREATE INDEX idxpart1_1b_idx ON idxpart1 USING btree (b) + idxpart1_2_idx | idxpart_2_idx | CREATE INDEX idxpart1_2_idx ON idxpart1 USING btree (((b + a))) WHERE (a > 1) + idxpart1_2b_idx | | CREATE INDEX idxpart1_2b_idx ON idxpart1 USING btree (((a + b))) WHERE (a > 1) + idxpart1_2c_idx | | CREATE INDEX idxpart1_2c_idx ON idxpart1 USING btree (((b + a))) WHERE (b > 1) + idxpart_1_idx | | CREATE INDEX idxpart_1_idx ON ONLY idxpart USING btree (b, a) + idxpart_2_idx | | CREATE INDEX idxpart_2_idx ON ONLY idxpart USING btree (((b + a))) WHERE (a > 1) +(7 rows) + +drop table idxpart; +-- Make sure the partition columns are mapped correctly +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (c, b); +create table idxpart1 (c text, a int, b int); +alter table idxpart attach partition idxpart1 for values from (0) to (10); +create table idxpart2 (c text, a int, b int); +create index on idxpart2 (a); +create index on idxpart2 (c, b); +alter table idxpart attach partition idxpart2 for values from (10) to (20); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +------------------+-------------------------------------------------------------- + idxparti | CREATE INDEX idxparti ON ONLY idxpart USING btree (a) + idxparti2 | CREATE INDEX idxparti2 ON ONLY idxpart USING btree (c, b) + idxpart1_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a) + idxpart1_c_b_idx | CREATE INDEX idxpart1_c_b_idx ON idxpart1 USING btree (c, b) + idxpart2_a_idx | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) + idxpart2_c_b_idx | CREATE INDEX idxpart2_c_b_idx ON idxpart2 USING btree (c, b) +(6 rows) + +drop table idxpart; +-- Verify that columns are mapped correctly in expression indexes +create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); +create table idxpart1 (col2 int, b int, col1 int, a int); +create table idxpart2 (col1 int, col2 int, b int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2; +alter table idxpart2 drop column col1, drop column col2; +create index on idxpart2 (abs(b)); +alter table idxpart attach partition idxpart2 for values from (0) to (1); +create index on idxpart (abs(b)); +alter table 
idxpart attach partition idxpart1 for values from (1) to (2); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +------------------+------------------------------------------------------------------- + idxpart_abs_idx | CREATE INDEX idxpart_abs_idx ON ONLY idxpart USING btree (abs(b)) + idxpart1_abs_idx | CREATE INDEX idxpart1_abs_idx ON idxpart1 USING btree (abs(b)) + idxpart2_abs_idx | CREATE INDEX idxpart2_abs_idx ON idxpart2 USING btree (abs(b)) +(3 rows) + +drop table idxpart; +-- Verify that columns are mapped correctly for WHERE in a partial index +create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); +alter table idxpart drop column col1, drop column col3; +create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); +alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create table idxpart2 (col1 int, col2 int, b int, a int); +create index on idxpart2 (a) where b > 1000; +alter table idxpart2 drop column col1, drop column col2; +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create index on idxpart (a) where b > 1000; +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +----------------+----------------------------------------------------------------------------- + idxpart_a_idx | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a) WHERE (b > 1000) + idxpart1_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a) WHERE (b > 1000) + idxpart2_a_idx | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) WHERE (b > 1000) +(3 rows) + +drop table idxpart; +-- Column number mapping: dropped columns in the partition +create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +alter table idxpart1 drop column drop_1; +alter table idxpart1 drop column drop_2; +alter table idxpart1 drop column drop_3; +create index on idxpart1 (col_keep); +create table idxpart (col_keep int) partition by range (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition key: RANGE (col_keep) +Indexes: + "idxpart_col_keep_idx" btree (col_keep) +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_col_keep_idx" btree (col_keep) + +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; + attrelid | attname | attnum +-----------------------+------------------------------+-------- + idxpart1 | ........pg.dropped.1........ | 1 + idxpart1 | ........pg.dropped.2........ | 2 + idxpart1 | col_keep | 3 + idxpart1 | ........pg.dropped.4........ 
| 4 + idxpart1_col_keep_idx | col_keep | 1 + idxpart | col_keep | 1 + idxpart_col_keep_idx | col_keep | 1 +(7 rows) + +drop table idxpart; +-- Column number mapping: dropped columns in the parent table +create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +alter table idxpart drop column drop_1; +alter table idxpart drop column drop_2; +alter table idxpart drop column drop_3; +create table idxpart1 (col_keep int); +create index on idxpart1 (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition key: RANGE (col_keep) +Indexes: + "idxpart_col_keep_idx" btree (col_keep) +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_col_keep_idx" btree (col_keep) + +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; + attrelid | attname | attnum +-----------------------+------------------------------+-------- + idxpart | ........pg.dropped.1........ | 1 + idxpart | ........pg.dropped.2........ | 2 + idxpart | col_keep | 3 + idxpart | ........pg.dropped.4........ | 4 + idxpart1 | col_keep | 1 + idxpart1_col_keep_idx | col_keep | 1 + idxpart_col_keep_idx | col_keep | 1 +(7 rows) + +drop table idxpart; +-- intentionally leave some objects around +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +create index on idxpart (a); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 95fafcd7..58485cf1 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune partition_prune_hash hash_part partition_info +test: identity partition_join partition_prune partition_prune_hash hash_part partition_info indexing # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f91b37b9..3e5da44f 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -196,6 +196,7 @@ test: partition_prune test: partition_prune_hash test: partition_info test: hash_part +test: indexing test: event_trigger test: fast_default test: stats diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 7b5f2409..b53af1c0 100644 --- a/src/test/regress/sql/alter_table.sql +++ 
b/src/test/regress/sql/alter_table.sql @@ -1328,6 +1328,22 @@ create table tab1 (a int, b text); create table tab2 (x int, y tab1); alter table tab1 alter column b type varchar; -- fails +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 +\d at_part_2 +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 +\d at_part_2 + -- disallow recursive containment of row types create temp table recur1 (f1 int); alter table recur1 add column f2 recur1; -- fails diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql new file mode 100644 index 00000000..33be7186 --- /dev/null +++ b/src/test/regress/sql/indexing.sql @@ -0,0 +1,388 @@ +-- Creating an index on a partitioned table makes the partitions +-- automatically get the index +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create table idxpart2 partition of idxpart for values from (10) to (100) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (100); +create index on idxpart (a); +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Some unsupported features +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create unique index on idxpart (a); +create index concurrently on idxpart (a); +drop table idxpart; + +-- If a table without index is attached as partition to a table with +-- an index, the index is automatically created +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart); +\d idxpart1 +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 +drop table idxpart; + +-- If a partition already has an index, don't create a duplicative one +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index on idxpart1 (a, b); +create index on idxpart (a, b); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- DROP behavior for partitioned indexes +create table idxpart (a int) partition by range (a); +create index on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +drop index idxpart1_a_idx; -- no way +drop index idxpart_a_idx; -- both indexes go away +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; +create index on idxpart (a); +drop table idxpart1; -- the index on partition goes 
away too +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- ALTER INDEX .. ATTACH, error cases +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index idxpart_a_b_idx on only idxpart (a, b); +create index idxpart1_a_b_idx on idxpart1 (a, b); +create index idxpart1_tst1 on idxpart1 (b, a); +create index idxpart1_tst2 on idxpart1 using hash (a); +create index idxpart1_tst3 on idxpart1 (a, b) where a > 10; + +alter index idxpart attach partition idxpart1; +alter index idxpart_a_b_idx attach partition idxpart1; +alter index idxpart_a_b_idx attach partition idxpart_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_tst1; +alter index idxpart_a_b_idx attach partition idxpart1_tst2; +alter index idxpart_a_b_idx attach partition idxpart1_tst3; +-- OK +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; -- quiet + +-- reject dupe +create index idxpart1_2_a_b on idxpart1 (a, b); +alter index idxpart_a_b_idx attach partition idxpart1_2_a_b; +drop table idxpart; +-- make sure everything's gone +select indexrelid::regclass, indrelid::regclass + from pg_index where indexrelid::regclass::text like 'idxpart%'; + +-- Don't auto-attach incompatible indexes +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create index on idxpart1 using hash (a); +create index on idxpart1 (a) where b > 1; +create index on idxpart1 ((a + 0)); +create index on idxpart1 (a, a); +create index on idxpart (a); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart1 +drop table idxpart; + +-- If CREATE INDEX ONLY, don't create indexes on partitions; and existing +-- indexes on partitions don't change parent. ALTER INDEX ATTACH can change +-- the parent after the fact. +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +create index on idxpart (a); +-- Here we expect that idxpart1 and idxpart2 have a new index, but idxpart21 +-- does not; also, idxpart22 is not attached. +\d idxpart1 +\d idxpart2 +\d idxpart21 +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +-- attaching idxpart22 is not enough to set idxpart22_a_idx valid ... +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +\d idxpart2 +-- ... but this one is. 
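-- (Aside: creating the missing index on idxpart21 below and attaching it gives the
-- parent index idxpart2_a_idx an attached partition index for every partition, so
-- it is finally marked valid.)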
+create index on idxpart21 (a); +alter index idxpart2_a_idx attach partition idxpart21_a_idx; +\d idxpart2 +drop table idxpart; + +-- When a table is attached a partition and it already has an index, a +-- duplicate index should not get created, but rather the index becomes +-- attached to the parent's index. +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart including indexes); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify that attaching an invalid index does not mark the parent index valid. +-- On the other hand, attaching a valid index marks not only its direct +-- ancestor valid, but also any indirect ancestor that was only missing the one +-- that was just made valid +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 partition of idxpart for values from (1) to (1000) partition by range (a); +create table idxpart11 partition of idxpart1 for values from (1) to (100); +create index on only idxpart1 (a); +create index on only idxpart (a); +-- this results in two invalid indexes: +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +-- idxpart1_a_idx is not valid, so idxpart_a_idx should not become valid: +alter index idxpart_a_idx attach partition idxpart1_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +-- after creating and attaching this, both idxpart1_a_idx and idxpart_a_idx +-- should become valid +create index on idxpart11 (a); +alter index idxpart1_a_idx attach partition idxpart11_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- verify dependency handling during ALTER TABLE DETACH PARTITION +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +-- a) after detaching partitions, the indexes can be dropped independently +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart1_a_idx; +drop index idxpart2_a_idx; +drop index idxpart3_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + +create table idxpart (a int) 
partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +-- b) after detaching, dropping the index on parent does not remove the others +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + +-- Verify that expression indexes inherit correctly +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 ((a + b)); +create index on idxpart ((a + b)); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify behavior for collation (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a collate "POSIX"); +create index on idxpart2 (a); +create index on idxpart2 (a collate "C"); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a collate "C"); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify behavior for opclass (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a text_pattern_ops); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +-- must *not* have attached the index we created on idxpart2 +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') 
and relname like 'idxpart%' order by relname; +drop index idxpart_a_idx; +create index on only idxpart (a text_pattern_ops); +-- must reject +alter index idxpart_a_idx attach partition idxpart2_a_idx; +drop table idxpart; + +-- Verify that attaching indexes maps attribute numbers correctly +create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); +create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2, drop column col3; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create index idxpart_1_idx on only idxpart (b, a); +create index idxpart1_1_idx on idxpart1 (b, a); +create index idxpart1_1b_idx on idxpart1 (b); +-- test expressions and partial-index predicate, too +create index idxpart_2_idx on only idxpart ((b + a)) where a > 1; +create index idxpart1_2_idx on idxpart1 ((b + a)) where a > 1; +create index idxpart1_2b_idx on idxpart1 ((a + b)) where a > 1; +create index idxpart1_2c_idx on idxpart1 ((b + a)) where b > 1; +alter index idxpart_1_idx attach partition idxpart1_1b_idx; -- fail +alter index idxpart_1_idx attach partition idxpart1_1_idx; +alter index idxpart_2_idx attach partition idxpart1_2b_idx; -- fail +alter index idxpart_2_idx attach partition idxpart1_2c_idx; -- fail +alter index idxpart_2_idx attach partition idxpart1_2_idx; -- ok +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Make sure the partition columns are mapped correctly +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (c, b); +create table idxpart1 (c text, a int, b int); +alter table idxpart attach partition idxpart1 for values from (0) to (10); +create table idxpart2 (c text, a int, b int); +create index on idxpart2 (a); +create index on idxpart2 (c, b); +alter table idxpart attach partition idxpart2 for values from (10) to (20); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Verify that columns are mapped correctly in expression indexes +create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); +create table idxpart1 (col2 int, b int, col1 int, a int); +create table idxpart2 (col1 int, col2 int, b int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2; +alter table idxpart2 drop column col1, drop column col2; +create index on idxpart2 (abs(b)); +alter table idxpart attach partition idxpart2 for values from (0) to (1); +create index on idxpart (abs(b)); +alter table idxpart attach partition idxpart1 for values from (1) to (2); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Verify that columns are mapped correctly for WHERE in a partial index +create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); +alter table idxpart drop column col1, 
drop column col3; +create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); +alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create table idxpart2 (col1 int, col2 int, b int, a int); +create index on idxpart2 (a) where b > 1000; +alter table idxpart2 drop column col1, drop column col2; +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create index on idxpart (a) where b > 1000; +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Column number mapping: dropped columns in the partition +create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +alter table idxpart1 drop column drop_1; +alter table idxpart1 drop column drop_2; +alter table idxpart1 drop column drop_3; +create index on idxpart1 (col_keep); +create table idxpart (col_keep int) partition by range (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart +\d idxpart1 +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; +drop table idxpart; + +-- Column number mapping: dropped columns in the parent table +create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +alter table idxpart drop column drop_1; +alter table idxpart drop column drop_2; +alter table idxpart drop column drop_3; +create table idxpart1 (col_keep int); +create index on idxpart1 (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart +\d idxpart1 +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; +drop table idxpart; + +-- intentionally leave some objects around +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +create index on idxpart (a); From 2d2f13f0a28a10c9186765837ff0e53ae3941fd3 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 22 Aug 2017 15:36:49 -0700 Subject: [PATCH 294/578] Add a hash_combine function for mixing hash values. This hash function is derived from Boost's function of the same name. 
Author: Andres Freund, Thomas Munro Discussion: https://postgr.es/m/CAEepm%3D3rdgjfxW4cKvJ0OEmya2-34B0qHNG1xV0vK7TGPJGMUQ%40mail.gmail.com Discussion: https://postgr.es/m/20170731210844.3cwrkmsmbbpt4rjc%40alap3.anarazel.de --- src/include/utils/hashutils.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/include/utils/hashutils.h diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h new file mode 100644 index 00000000..56b7bfc9 --- /dev/null +++ b/src/include/utils/hashutils.h @@ -0,0 +1,23 @@ +/* + * Utilities for working with hash values. + * + * Portions Copyright (c) 2017, PostgreSQL Global Development Group + */ + +#ifndef HASHUTILS_H +#define HASHUTILS_H + +/* + * Combine two hash values, resulting in another hash value, with decent bit + * mixing. + * + * Similar to boost's hash_combine(). + */ +static inline uint32 +hash_combine(uint32 a, uint32 b) +{ + a ^= b + 0x9e3779b9 + (a << 6) + (a >> 2); + return a; +} + +#endif /* HASHUTILS_H */ From fbed5652ec7a2d4189f265659fb494374b22bfd4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 22 Sep 2017 13:38:42 -0700 Subject: [PATCH 295/578] Add inline murmurhash32(uint32) function. The function already existed in tidbitmap.c but more users requiring fast hashing of 32bit ints are coming up. Author: Andres Freund Discussion: https://postgr.es/m/20170914061207.zxotvyopetm7lrrp@alap3.anarazel.de --- src/backend/nodes/tidbitmap.c | 20 ++------------------ src/include/utils/hashutils.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 5be82536..73820707 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -45,6 +45,7 @@ #include "nodes/tidbitmap.h" #include "storage/lwlock.h" #include "utils/dsa.h" +#include "utils/hashutils.h" /* * The maximum number of tuples per page is not large (typically 256 with @@ -237,30 +238,13 @@ static int tbm_comparator(const void *left, const void *right); static int tbm_shared_comparator(const void *left, const void *right, void *arg); -/* - * Simple inline murmur hash implementation for the exact width required, for - * performance. - */ -static inline uint32 -hash_blockno(BlockNumber b) -{ - uint32 h = b; - - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; -} - /* define hashtable mapping block numbers to PagetableEntry's */ #define SH_USE_NONDEFAULT_ALLOCATOR #define SH_PREFIX pagetable #define SH_ELEMENT_TYPE PagetableEntry #define SH_KEY_TYPE BlockNumber #define SH_KEY blockno -#define SH_HASH_KEY(tb, key) hash_blockno(key) +#define SH_HASH_KEY(tb, key) murmurhash32(key) #define SH_EQUAL(tb, a, b) a == b #define SH_SCOPE static inline #define SH_DEFINE diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 56b7bfc9..35281689 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -20,4 +20,22 @@ hash_combine(uint32 a, uint32 b) return a; } + +/* + * Simple inline murmur hash implementation hashing a 32 bit ingeger, for + * performance. + */ +static inline uint32 +murmurhash32(uint32 data) +{ + uint32 h = data; + + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + #endif /* HASHUTILS_H */ From 330f9f9673a3f9bac14bc4bf241fc0e0f3f69f14 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 29 Sep 2017 15:52:55 -0700 Subject: [PATCH 296/578] Fix typo. 
Reported-By: Thomas Munro and Jesper Pedersen --- src/include/utils/hashutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 35281689..366bd0e7 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -22,7 +22,7 @@ hash_combine(uint32 a, uint32 b) /* - * Simple inline murmur hash implementation hashing a 32 bit ingeger, for + * Simple inline murmur hash implementation hashing a 32 bit integer, for * performance. */ static inline uint32 From a974c3771ccc22b6bf8387f09f41d92ff27ef12f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sun, 29 Oct 2017 12:41:43 +0530 Subject: [PATCH 297/578] Add hash_combine64. Extracted from a larger patch by Amul Sul, with some comment additions by me. Discussion: http://postgr.es/m/20171024113004.hn5qajypin4dy5sw@alap3.anarazel.de --- src/include/utils/hashutils.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 366bd0e7..3a5c21f5 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -8,8 +8,8 @@ #define HASHUTILS_H /* - * Combine two hash values, resulting in another hash value, with decent bit - * mixing. + * Combine two 32-bit hash values, resulting in another hash value, with + * decent bit mixing. * * Similar to boost's hash_combine(). */ @@ -20,6 +20,18 @@ hash_combine(uint32 a, uint32 b) return a; } +/* + * Combine two 64-bit hash values, resulting in another hash value, using the + * same kind of technique as hash_combine(). Testing shows that this also + * produces good bit mixing. + */ +static inline uint64 +hash_combine64(uint64 a, uint64 b) +{ + /* 0x49a0f4dd15e5a8e3 is 64bit random data */ + a ^= b + 0x49a0f4dd15e5a8e3 + (a << 54) + (a >> 7); + return a; +} /* * Simple inline murmur hash implementation hashing a 32 bit integer, for From bff015b6c1b4bd4fcc154514215024b9608bb9e5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 7 Nov 2017 13:54:36 -0500 Subject: [PATCH 298/578] Fix unportable spelling of int64 constant. Per buildfarm member pademelon. 
--- src/include/utils/hashutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 3a5c21f5..0a2620be 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -29,7 +29,7 @@ static inline uint64 hash_combine64(uint64 a, uint64 b) { /* 0x49a0f4dd15e5a8e3 is 64bit random data */ - a ^= b + 0x49a0f4dd15e5a8e3 + (a << 54) + (a >> 7); + a ^= b + UINT64CONST(0x49a0f4dd15e5a8e3) + (a << 54) + (a >> 7); return a; } From 8d4128c29b72c99f855d7598f1e293b7de35eb8a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 3 Jul 2020 19:30:35 +0800 Subject: [PATCH 299/578] update select_parallel expect result --- .../regress/expected/select_parallel_5.out | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/test/regress/expected/select_parallel_5.out b/src/test/regress/expected/select_parallel_5.out index 6b20689d..d5be6ecb 100644 --- a/src/test/regress/expected/select_parallel_5.out +++ b/src/test/regress/expected/select_parallel_5.out @@ -95,6 +95,30 @@ explain (costs off) -> Parallel Seq Scan on tenk1 (10 rows) +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: (count(stringu1)) + -> Finalize HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Parallel Seq Scan on tenk1 +(10 rows) + +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + num | islong +-------+-------- + 10000 | LONG +(1 row) + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. explain (costs off) From fbda1355519b5c6a5275fb419eefcd1aa3630b5a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 3 Jul 2020 20:17:32 +0800 Subject: [PATCH 300/578] fix compile errors.20200703. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 2 +- src/backend/commands/indexcmds.c | 1 + src/backend/commands/tablecmds.c | 7 +-- src/backend/executor/execMain.c | 2 +- src/backend/executor/execPartition.c | 24 +++------ src/backend/executor/nodeModifyTable.c | 9 ++-- src/backend/optimizer/path/allpaths.c | 35 ++++++++++++- src/backend/optimizer/path/joinrels.c | 2 +- src/backend/optimizer/prep/prepunion.c | 65 +++++++++++++++++++++++- src/backend/optimizer/util/placeholder.c | 2 +- src/backend/optimizer/util/relnode.c | 2 +- src/include/executor/execPartition.h | 2 +- src/include/optimizer/prep.h | 3 ++ 13 files changed, 122 insertions(+), 34 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 533187a0..8bf02419 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3541,7 +3541,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. 
*/ - tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple = ConvertPartitionTupleSlot(resultRelInfo->ri_RelationDesc, proute->parent_child_tupconv_maps[leaf_part_index], tuple, proute->partition_tuple_slot, &slot); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 22c2348e..be45e453 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -67,6 +67,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" +#include "utils/guc.h" /* non-export function prototypes */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 834f2840..430141a2 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6647,9 +6647,9 @@ ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode) Relation childrel; /* find_all_inheritors already got lock */ - childrel = table_open(lfirst_oid(cell), NoLock); + childrel = heap_open(lfirst_oid(cell), NoLock); CheckTableNotInUse(childrel, "ALTER TABLE"); - table_close(childrel, NoLock); + heap_close(childrel, NoLock); } list_free(inh); } @@ -8489,7 +8489,7 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, /* Time to delete this child column, too */ ATExecDropColumn(wqueue, childrel, colName, behavior, true, true, - false, lockmode); + false, lockmode, addrs); } else { @@ -8658,6 +8658,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, addr = DefineIndex(partOid, /* OID of heap relation */ partidxstmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ true, /* is_alter_table */ check_rights, /* check_rights */ false, /* check_not_in_use */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4140f135..3bc95f7d 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2287,7 +2287,7 @@ ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, gettext_noop("could not convert row type")); if (map != NULL) { - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, rel); ExecSetSlotDescriptor(slot, tupdesc); ExecStoreTuple(tuple, slot, InvalidBuffer, false); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index dd60cbc8..f5d69874 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -189,7 +189,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) * UPDATE of a partition-key becomes a DELETE+INSERT operation, so * this check is required even when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); + CheckValidResultRel(leaf_part_rri->ri_RelationDesc, CMD_INSERT); } proute->partitions[i] = leaf_part_rri; @@ -257,7 +257,7 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, HeapTuple tuple = ExecFetchSlotTuple(slot); ExecClearTuple(myslot); - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, NULL); ExecStoreTuple(tuple, myslot, InvalidBuffer, true); slot = myslot; } @@ -343,7 +343,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ModifyTable *node = mtstate ? 
(ModifyTable *) mtstate->ps.plan : NULL; MemoryContext oldContext; - partrel = table_open(dispatch->partdesc->oids[partidx], RowExclusiveLock); + partrel = heap_open(proute->partition_oids[partidx], RowExclusiveLock); /* * Keep ResultRelInfo and other information for this partition in the @@ -363,19 +363,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * partition-key becomes a DELETE+INSERT operation, so this check is still * required when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Since we've just initialized this ResultRelInfo, it's not in any list - * attached to the estate as yet. Add it, so that it can be found later. - * - * Note that the entries in this list appear in no predetermined order, - * because partition result rels are initialized as and when they're - * needed. - */ - estate->es_tuple_routing_result_relations = - lappend(estate->es_tuple_routing_result_relations, - leaf_part_rri); + CheckValidResultRel(leaf_part_rri->ri_RelationDesc, CMD_INSERT); /* * Open partition indices. The user may have asked to check for conflicts @@ -589,7 +577,7 @@ TupConvMapForLeaf(PartitionTupleRouting *proute, * tuple is returned unmodified. */ HeapTuple -ConvertPartitionTupleSlot(TupleConversionMap *map, +ConvertPartitionTupleSlot(Relation partrel, TupleConversionMap *map, HeapTuple tuple, TupleTableSlot *new_slot, TupleTableSlot **p_my_slot) @@ -597,7 +585,7 @@ ConvertPartitionTupleSlot(TupleConversionMap *map, if (!map) return tuple; - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, partrel); /* * Change the partition tuple slot descriptor, as per converted tuple. diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 34a53370..3d6b9769 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -286,6 +286,7 @@ ExecInsert(ModifyTableState *mtstate, {// #lizard forgives HeapTuple tuple; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -329,7 +330,7 @@ ExecInsert(ModifyTableState *mtstate, #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ - else if (resultRelInfo->ispartparent) + if (resultRelInfo->ispartparent) { AttrNumber partkey; Datum partvalue; @@ -1386,7 +1387,7 @@ lreplace:; * Row movement, part 1. Delete the tuple, but skip RETURNING * processing. We want to return rows from INSERT. */ - ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + ExecDelete(mtstate, tupleid, oldtuple, slot, planSlot, epqstate, estate, &tuple_deleted, false, false); /* @@ -1433,7 +1434,7 @@ lreplace:; map_index = resultRelInfo - mtstate->resultRelInfo; Assert(map_index >= 0 && map_index < mtstate->mt_nplans); tupconv_map = tupconv_map_for_subplan(mtstate, map_index); - tuple = ConvertPartitionTupleSlot(tupconv_map, + tuple = ConvertPartitionTupleSlot(resultRelInfo->ri_RelationDesc, tupconv_map, tuple, proute->root_tuple_slot, &slot); @@ -2040,7 +2041,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, /* * Convert the tuple, if necessary. 
*/ - ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx], + ConvertPartitionTupleSlot(partrel->ri_RelationDesc, proute->parent_child_tupconv_maps[partidx], tuple, proute->partition_tuple_slot, &slot); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 59663d81..73159be1 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - 1, &appinfo); + &appinfo); /* * We have to make child entries in the EquivalenceClass data @@ -2169,10 +2169,41 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, subpath->pathkeys, make_tlist_from_pathtarget(subpath->pathtarget)); + if (subpath->distribution && subpath->distribution->distributionExpr) + { + ListCell *lc; + + /* FIXME Could we use pathtarget directly? */ + List *targetlist = make_tlist_from_pathtarget(subpath->pathtarget); + + /* + * The distribution expression from the subplan's tlist, but it should + * be from the rel, need conversion. + */ + distribution = makeNode(Distribution); + distribution->distributionType = subpath->distribution->distributionType; + distribution->nodes = bms_copy(subpath->distribution->nodes); + distribution->restrictNodes = bms_copy(subpath->distribution->restrictNodes); + + foreach(lc, targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + if (equal(tle->expr, subpath->distribution->distributionExpr)) + { + distribution->distributionExpr = (Node *) + makeVarFromTargetEntry(rel->relid, tle); + break; + } + } + } + else + distribution = subpath->distribution; + /* Generate outer path using this subpath */ add_partial_path(rel, (Path *) create_subqueryscan_path(root, rel, subpath, - pathkeys, required_outer)); + pathkeys, required_outer, + distribution)); } } diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d8afa3ef..715036b9 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1405,7 +1405,7 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * applicable to the parent join. */ child_restrictlist = - (List *) adjust_appendrel_attrs(root, + (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) parent_restrictlist, nappinfos, appinfos); pfree(appinfos); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index a9c117f1..ea16dfee 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -2037,6 +2037,69 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) return result; } +/* + * adjust_appendrel_attrs + * Copy the specified query or expression and translate Vars referring to a + * parent rel to refer to the corresponding child rel instead. We also + * update rtindexes appearing outside Vars, such as resultRelation and + * jointree relids. + * + * Note: this is only applied after conversion of sublinks to subplans, + * so we don't need to cope with recursion into sub-queries. + * + * Note: this is not hugely different from what pullup_replace_vars() does; + * maybe we should try to fold the two routines together. 
+ */ +Node * +adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, + AppendRelInfo **appinfos) +{ + Node *result; + adjust_appendrel_attrs_context context; + + context.root = root; + context.nappinfos = nappinfos; + context.appinfos = appinfos; + + /* If there's nothing to adjust, don't call this function. */ + Assert(nappinfos >= 1 && appinfos != NULL); + + /* + * Must be prepared to start with a Query or a bare expression tree. + */ + if (node && IsA(node, Query)) + { + Query *newnode; + int cnt; + + newnode = query_tree_mutator((Query *) node, + adjust_appendrel_attrs_mutator, + (void *) &context, + QTW_IGNORE_RC_SUBQUERIES); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + if (newnode->resultRelation == appinfo->parent_relid) + { + newnode->resultRelation = appinfo->child_relid; + /* Fix tlist resnos too, if it's inherited UPDATE */ + if (newnode->commandType == CMD_UPDATE) + newnode->targetList = + adjust_inherited_tlist(newnode->targetList, + appinfo); + break; + } + } + + result = (Node *) newnode; + } + else + result = adjust_appendrel_attrs_mutator(node, &context); + + return result; +} + static Node * adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context) @@ -2467,7 +2530,7 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, right_nappinfos, right_appinfos); - sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) sjinfo->semi_rhs_exprs, right_nappinfos, right_appinfos); diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 0d5351a6..a344dbe8 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -499,7 +499,7 @@ add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, if (bms_overlap(phv->phrels, parentrel->relids) && childrel->reloptkind == RELOPT_OTHER_JOINREL) { - phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, + phv = (PlaceHolderVar *) adjust_appendrel_attrs_nappinfos(root, (Node *) phv, nappinfos, appinfos); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 1f6fb286..39100cae 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -819,7 +819,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Construct joininfo list. 
*/ appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); - joinrel->joininfo = (List *) adjust_appendrel_attrs(root, + joinrel->joininfo = (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) parent_joinrel->joininfo, nappinfos, appinfos); diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index d4d1be1d..0cd7b1b5 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -93,7 +93,7 @@ extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, ResultRelInfo *rootRelInfo, int leaf_index); -extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map, +extern HeapTuple ConvertPartitionTupleSlot(Relation partrel, TupleConversionMap *map, HeapTuple tuple, TupleTableSlot *new_slot, TupleTableSlot **p_my_slot); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index e51066ed..f560052d 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -55,6 +55,9 @@ extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo); +extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, + AppendRelInfo **appinfos); + extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *child_rel); From cafa1ab65a4455970dd1c2f5fb6d09f4c261c9b4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 7 Jul 2020 21:35:37 +0800 Subject: [PATCH 301/578] fix compile errors.20200707. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 85 +++++++++++++++++++++---- src/backend/optimizer/plan/createplan.c | 2 +- src/backend/optimizer/plan/planner.c | 38 ++++++----- src/backend/optimizer/plan/subselect.c | 19 +++++- src/backend/optimizer/prep/prepunion.c | 74 +++++++++++++++++++++ src/backend/optimizer/util/clauses.c | 7 ++ src/backend/optimizer/util/pathnode.c | 3 +- src/backend/tcop/utility.c | 1 + src/backend/utils/adt/partitionfuncs.c | 1 + src/include/catalog/partition.h | 1 + src/include/optimizer/prep.h | 3 + 11 files changed, 200 insertions(+), 34 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 5c85918f..74d045dc 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -56,7 +56,9 @@ #include "utils/ruleutils.h" #include "utils/syscache.h" - +static Oid get_partition_parent_worker(Relation inhRel, Oid relid); +static void get_partition_ancestors_worker(Relation inhRel, Oid relid, + List **ancestors); static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); @@ -1335,15 +1337,34 @@ check_default_allows_bound(Relation parent, Relation default_rel, Oid get_partition_parent(Oid relid) { - Form_pg_inherits form; Relation catalogRelation; - SysScanDesc scan; - ScanKeyData key[2]; - HeapTuple tuple; Oid result; catalogRelation = heap_open(InheritsRelationId, AccessShareLock); + result = get_partition_parent_worker(catalogRelation, relid); + + if (!OidIsValid(result)) + elog(ERROR, "could not find tuple for parent of relation %u", relid); + + heap_close(catalogRelation, AccessShareLock); + + return result; +} + +/* + * 
get_partition_parent_worker + * Scan the pg_inherits relation to return the OID of the parent of the + * given relation + */ +static Oid +get_partition_parent_worker(Relation inhRel, Oid relid) +{ + SysScanDesc scan; + ScanKeyData key[2]; + Oid result = InvalidOid; + HeapTuple tuple; + ScanKeyInit(&key[0], Anum_pg_inherits_inhrelid, BTEqualStrategyNumber, F_OIDEQ, @@ -1353,22 +1374,64 @@ get_partition_parent(Oid relid) BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(1)); - scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true, + scan = systable_beginscan(inhRel, InheritsRelidSeqnoIndexId, true, NULL, 2, key); - tuple = systable_getnext(scan); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "could not find tuple for parent of relation %u", relid); + if (HeapTupleIsValid(tuple)) + { + Form_pg_inherits form = (Form_pg_inherits) GETSTRUCT(tuple); - form = (Form_pg_inherits) GETSTRUCT(tuple); result = form->inhparent; + } systable_endscan(scan); - heap_close(catalogRelation, AccessShareLock); return result; } +/* + * get_partition_ancestors + * Obtain ancestors of given relation + * + * Returns a list of ancestors of the given relation. + * + * Note: Because this function assumes that the relation whose OID is passed + * as an argument and each ancestor will have precisely one parent, it should + * only be called when it is known that the relation is a partition. + */ +List * +get_partition_ancestors(Oid relid) +{ + List *result = NIL; + Relation inhRel; + + inhRel = heap_open(InheritsRelationId, AccessShareLock); + + get_partition_ancestors_worker(inhRel, relid, &result); + + heap_close(inhRel, AccessShareLock); + + return result; +} + +/* + * get_partition_ancestors_worker + * recursive worker for get_partition_ancestors + */ +static void +get_partition_ancestors_worker(Relation inhRel, Oid relid, List **ancestors) +{ + Oid parentOid; + + /* Recursion ends at the topmost level, ie., when there's no parent */ + parentOid = get_partition_parent_worker(inhRel, relid); + if (parentOid == InvalidOid) + return; + + *ancestors = lappend_oid(*ancestors, parentOid); + get_partition_ancestors_worker(inhRel, parentOid, ancestors); +} + /* * get_qual_from_partbound * Given a parser node for partition bound, return the list of executable diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index bf38bafc..706b3340 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6728,7 +6728,7 @@ make_remotesubplan(PlannerInfo *root, /* need sort */ if (distributionType == LOCATOR_TYPE_NONE && pathkeys && need_sort) { - subplan = (Plan *)make_sort_from_pathkeys(subplan, pathkeys); + subplan = (Plan *)make_sort_from_pathkeys(subplan, pathkeys, NULL); subplan->startup_cost = gather_plan->plan.startup_cost; subplan->total_cost = gather_plan->plan.total_cost; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 3e3065cd..2b736fd4 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -223,7 +223,7 @@ static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, const AggClauseCosts *agg_costs, const AggClauseCosts *agg_final_costs, grouping_sets_data *gd, bool can_sort, bool can_hash, - double dNumGroups, List *havingQual); + double dNumGroups, List *havingQual, bool *try_distributed_aggregation); static void add_partial_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *grouped_rel, @@ 
-4118,12 +4118,9 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, path = (Path *) create_append_path(grouped_rel, paths, - NIL, NULL, 0, - false, - NIL, - -1); + NIL); path->pathtarget = target; } else @@ -4157,13 +4154,17 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, Path *cheapest_path = input_rel->cheapest_total_path; AggClauseCosts agg_partial_costs; /* parallel only */ AggClauseCosts agg_final_costs; /* parallel only */ + Size hashaggtablesize; double dNumGroups; + double dNumPartialGroups = 0; bool can_hash; bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; PathTarget *partial_grouping_target = NULL; + ListCell *lc; + /* * Estimate number of groups. */ @@ -4347,7 +4348,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, add_paths_to_grouping_rel(root, input_rel, grouped_rel, target, partial_grouping_target, agg_costs, &agg_final_costs, gd, can_sort, can_hash, - dNumGroups, (List *) parse->havingQual); + dNumGroups, (List *) parse->havingQual, &try_distributed_aggregation); /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ @@ -4381,7 +4382,8 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); /* * Collect statistics about aggregates for estimating costs of @@ -6840,7 +6842,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, const AggClauseCosts *agg_costs, const AggClauseCosts *agg_final_costs, grouping_sets_data *gd, bool can_sort, bool can_hash, - double dNumGroups, List *havingQual) + double dNumGroups, List *havingQual, bool *try_distributed_aggregation) { Query *parse = root->parse; Path *cheapest_path = input_rel->cheapest_total_path; @@ -6872,12 +6874,13 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, { #ifdef __TBASE__ bool try_redistribute_grouping = false; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + PathTarget * local_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); /* Estimate number of partial groups. */ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); #endif #ifdef __TBASE__ @@ -6971,7 +6974,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_INITIAL_SERIAL, parse->groupClause, NIL, - &agg_partial_costs, + agg_costs, dNumLocalGroups); } else if (parse->groupClause) @@ -7003,7 +7006,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ if(try_redistribute_grouping) @@ -7335,7 +7338,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ /* @@ -7624,12 +7627,13 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, * final grouping */ AggClauseCosts hashagg_partial_costs; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + PathTarget * local_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); /* Estimate number of partial groups. 
*/ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); try_redistribute_grouping = true; MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); @@ -7667,7 +7671,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, path = create_remotesubplan_path(root, path, NULL); #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; /* * We just need an Agg over the cheapest-total input path, @@ -7825,7 +7829,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, path = create_remotesubplan_path(root, path, NULL); #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ if (!redistribute_group) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index bcdbe3da..3aa0c9f4 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -172,6 +172,7 @@ static Node *process_sublinks_mutator(Node *node, process_sublinks_context *context); static Bitmapset *finalize_plan(PlannerInfo *root, Plan *plan, + int gather_param, Bitmapset *valid_params, Bitmapset *scan_params); static bool finalize_primnode(Node *node, finalize_primnode_context *context); @@ -4974,12 +4975,15 @@ void SS_finalize_plan(PlannerInfo *root, Plan *plan) { /* No setup needed, just recurse through plan tree. */ - (void) finalize_plan(root, plan, root->outer_params, NULL); + (void) finalize_plan(root, plan, -1, root->outer_params, NULL); } /* * Recursive processing of all nodes in the plan tree * + * gather_param is the rescan_param of an ancestral Gather/GatherMerge, + * or -1 if there is none. + * * valid_params is the set of param IDs supplied by outer plan levels * that are valid to reference in this plan node or its children. * @@ -5006,7 +5010,7 @@ SS_finalize_plan(PlannerInfo *root, Plan *plan) * can be handled more cleanly. 
*/ static Bitmapset * -finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, +finalize_plan(PlannerInfo *root, Plan *plan, int gather_param, Bitmapset *valid_params, Bitmapset *scan_params) {// #lizard forgives finalize_primnode_context context; @@ -5137,7 +5141,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, context.paramids = bms_add_members(context.paramids, scan_params); break; - case T_SubqueryScan: + { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; @@ -5287,6 +5291,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(lc), + gather_param, valid_params, scan_params)); } @@ -5317,6 +5322,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5344,6 +5350,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5360,6 +5367,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5376,6 +5384,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5392,6 +5401,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5503,6 +5513,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* Process left and right child plans, if any */ child_params = finalize_plan(root, plan->lefttree, + gather_param, valid_params, scan_params); context.paramids = bms_add_members(context.paramids, child_params); @@ -5512,6 +5523,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* right child can reference nestloop_params as well as valid_params */ child_params = finalize_plan(root, plan->righttree, + gather_param, bms_union(nestloop_params, valid_params), scan_params); /* ... 
and they don't count as parameters used at my level */ @@ -5523,6 +5535,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* easy case */ child_params = finalize_plan(root, plan->righttree, + gather_param, valid_params, scan_params); } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ea16dfee..62bc5200 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -59,6 +59,8 @@ typedef struct { PlannerInfo *root; AppendRelInfo *appinfo; + AppendRelInfo **appinfos; + int nappinfos; } adjust_appendrel_attrs_context; static Path *recurse_set_operations(Node *setOp, PlannerInfo *root, @@ -121,6 +123,8 @@ static Bitmapset *translate_col_privs(const Bitmapset *parent_privs, List *translated_vars); static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); +static Relids adjust_child_relids(Relids relids, int nappinfos, + AppendRelInfo **appinfos); static Relids adjust_relid_set(Relids relids, Index oldrelid, Index newrelid); static List *adjust_inherited_tlist(List *tlist, AppendRelInfo *context); @@ -2309,6 +2313,40 @@ adjust_appendrel_attrs_mutator(Node *node, (void *) context); } +/* + * Substitute child relids for parent relids in a Relid set. The array of + * appinfos specifies the substitutions to be performed. + */ +static Relids +adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos) +{ + Bitmapset *result = NULL; + int cnt; + + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + /* Remove parent, add child */ + if (bms_is_member(appinfo->parent_relid, relids)) + { + /* Make a copy if we are changing the set. */ + if (!result) + result = bms_copy(relids); + + result = bms_del_member(result, appinfo->parent_relid); + result = bms_add_member(result, appinfo->child_relid); + } + } + + /* If we made any changes, return the modified copy. */ + if (result) + return result; + + /* Otherwise, return the original set without modification. */ + return relids; +} + /* * Substitute newrelid for oldrelid in a Relid set */ @@ -2541,3 +2579,39 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, return sjinfo; } +/* + * find_appinfos_by_relids + * Find AppendRelInfo structures for all relations specified by relids. + * + * The AppendRelInfos are returned in an array, which can be pfree'd by the + * caller. *nappinfos is set to the number of entries in the array. + */ +AppendRelInfo ** +find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) +{ + ListCell *lc; + AppendRelInfo **appinfos; + int cnt = 0; + + *nappinfos = bms_num_members(relids); + appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * *nappinfos); + + foreach(lc, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(lc); + + if (bms_is_member(appinfo->child_relid, relids)) + { + appinfos[cnt] = appinfo; + cnt++; + + /* Stop when we have gathered all the AppendRelInfos. */ + if (cnt == *nappinfos) + return appinfos; + } + } + + /* Should have found the entries ... 
*/ + elog(ERROR, "did not find all requested child rels in append_rel_list"); + return NULL; /* not reached */ +} diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 697b7dcc..ef96602f 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -2510,6 +2510,13 @@ estimate_expression_value(PlannerInfo *root, Node *node) return eval_const_expressions_mutator(node, &context); } +/* Generic macro for applying evaluate_expr */ +#define ece_evaluate_expr(node) \ + ((Node *) evaluate_expr((Expr *) (node), \ + exprType((Node *) (node)), \ + exprTypmod((Node *) (node)), \ + exprCollation((Node *) (node)))) + static Node * eval_const_expressions_mutator(Node *node, eval_const_expressions_context *context) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4d2a1f32..2d4a5d2b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6967,8 +6967,7 @@ reparameterize_path_by_child(PlannerInfo *root, Path *path, #define ADJUST_CHILD_ATTRS(node) \ ((node) = \ (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ - child_rel->relids, \ - child_rel->top_parent_relids)) + child_rel)) #define REPARAMETERIZE_CHILD_PATH(path) \ do { \ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a6536b13..1ef4a799 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3817,6 +3817,7 @@ ProcessUtilitySlow(ParseState *pstate, addr = DefineIndex(partOid, /* OID of heap relation */ partidxstmt, InvalidOid, /* no predefined OID */ + InvalidOid, false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 87f1cced..1e77f172 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -19,6 +19,7 @@ #include "catalog/partition.h" #include "catalog/pg_class.h" #include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_type.h" #include "funcapi.h" #include "utils/fmgrprotos.h" diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 6cade9aa..4265fd50 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -52,6 +52,7 @@ extern PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec); extern Oid get_partition_parent(Oid relid); +extern List *get_partition_ancestors(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); extern List *map_partition_varattnos(List *expr, int fromrel_varno, diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index f560052d..99c87a2d 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -61,6 +61,9 @@ extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *child_rel); +extern AppendRelInfo **find_appinfos_by_relids(PlannerInfo *root, + Relids relids, int *nappinfos); + extern SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, Relids left_relids, Relids right_relids); From 369c85820770ca15d2a36a777e0302ffc3608a29 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 7 Jul 2020 
21:47:49 +0800 Subject: [PATCH 302/578] Fix assorted bugs in pg_get_partition_constraintdef(). http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 43 ++++++++++++++++---------- src/backend/utils/cache/lsyscache.c | 24 ++++++++++++++ src/include/utils/lsyscache.h | 1 + src/test/regress/expected/indexing.out | 19 ++++++++++++ src/test/regress/sql/indexing.sql | 2 ++ 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 74d045dc..1b46a503 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1535,31 +1535,38 @@ RelationGetPartitionQual(Relation rel) * get_partition_qual_relid * * Returns an expression tree describing the passed-in relation's partition - * constraint. If there is no partition constraint returns NULL; this can - * happen if the default partition is the only partition. + * constraint. + * + * If the relation is not found, or is not a partition, or there is no + * partition constraint, return NULL. We must guard against the first two + * cases because this supports a SQL function that could be passed any OID. + * The last case can happen even if relispartition is true, when a default + * partition is the only partition. */ Expr * get_partition_qual_relid(Oid relid) { - Relation rel = heap_open(relid, AccessShareLock); Expr *result = NULL; - List *and_args; - /* Do the work only if this relation is a partition. */ - if (rel->rd_rel->relispartition) + /* Do the work only if this relation exists and is a partition. */ + if (get_rel_relispartition(relid)) { + Relation rel = relation_open(relid, AccessShareLock); + List *and_args; + and_args = generate_partition_qual(rel); + /* Convert implicit-AND list format to boolean expression */ if (and_args == NIL) result = NULL; else if (list_length(and_args) > 1) result = makeBoolExpr(AND_EXPR, and_args, -1); else result = linitial(and_args); - } - /* Keep the lock. */ - heap_close(rel, NoLock); + /* Keep the lock, to allow safe deparsing against the rel by caller. 
*/ + relation_close(rel, NoLock); + } return result; } @@ -2455,7 +2462,6 @@ generate_partition_qual(Relation rel) MemoryContext oldcxt; Datum boundDatum; bool isnull; - PartitionBoundSpec *bound; List *my_qual = NIL, *result = NIL; Relation parent; @@ -2469,7 +2475,7 @@ generate_partition_qual(Relation rel) return copyObject(rel->rd_partcheck); /* Grab at least an AccessShareLock on the parent table */ - parent = heap_open(get_partition_parent(RelationGetRelid(rel)), + parent = relation_open(get_partition_parent(RelationGetRelid(rel)), AccessShareLock); /* Get pg_class.relpartbound */ @@ -2481,14 +2487,19 @@ generate_partition_qual(Relation rel) boundDatum = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound, &isnull); - if (isnull) /* should not happen */ - elog(ERROR, "relation \"%s\" has relpartbound = null", - RelationGetRelationName(rel)); + + if (!isnull) + { + PartitionBoundSpec *bound; + bound = castNode(PartitionBoundSpec, stringToNode(TextDatumGetCString(boundDatum))); - ReleaseSysCache(tuple); my_qual = get_qual_from_partbound(rel, parent, bound); + } + + ReleaseSysCache(tuple); + /* Add the parent's quals to the list (if any) */ if (parent->rd_rel->relispartition) @@ -2514,7 +2525,7 @@ generate_partition_qual(Relation rel) MemoryContextSwitchTo(oldcxt); /* Keep the parent locked until commit */ - heap_close(parent, NoLock); + relation_close(parent, NoLock); return result; } diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index d8a59308..9061c0ed 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1934,6 +1934,30 @@ get_rel_relkind(Oid relid) return '\0'; } +/* + * get_rel_relispartition + * + * Returns the relispartition flag associated with a given relation. + */ +bool +get_rel_relispartition(Oid relid) +{ + HeapTuple tp; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_class reltup = (Form_pg_class) GETSTRUCT(tp); + bool result; + + result = reltup->relispartition; + ReleaseSysCache(tp); + return result; + } + else + return false; +} + /* * get_rel_tablespace * diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index e0d757b0..e94c510b 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -141,6 +141,7 @@ extern char *get_rel_name(Oid relid); extern Oid get_rel_namespace(Oid relid); extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); +extern bool get_rel_relispartition(Oid relid); extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index e9cccca8..804aa2eb 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -58,6 +58,25 @@ Indexes: "idxpart1_a_idx" btree (a) "idxpart1_b_c_idx" btree (b, c) +\d+ idxpart1_a_idx + Index "public.idxpart1_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +Partition of: idxparti +No partition constraint +btree, for table "public.idxpart1" + +\d+ idxpart1_b_c_idx + Index "public.idxpart1_b_c_idx" + Column | Type | Key? 
| Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + c | text | yes | c | extended | +Partition of: idxparti2 +No partition constraint +btree, for table "public.idxpart1" + drop table idxpart; -- If a partition already has an index, don't create a duplicative one create table idxpart (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 33be7186..cd1dd3b0 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -28,6 +28,8 @@ create table idxpart1 (like idxpart); \d idxpart1 alter table idxpart attach partition idxpart1 for values from (0) to (10); \d idxpart1 +\d+ idxpart1_a_idx +\d+ idxpart1_b_c_idx drop table idxpart; -- If a partition already has an index, don't create a duplicative one From b2297103dc452b257132e86e152d7f679bb512d4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Jul 2020 19:50:00 +0800 Subject: [PATCH 303/578] fix pg_amproc.h error,\d+ partition table error, default partition error.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 57 +++++++------------- src/backend/optimizer/path/allpaths.c | 2 +- src/backend/optimizer/plan/subselect.c | 2 +- src/bin/psql/describe.c | 47 ++++++++++------- src/bin/psql/tab-complete.c | 6 +-- src/include/catalog/pg_amproc.h | 72 +++++++++++++------------- 6 files changed, 88 insertions(+), 98 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 1b46a503..263a426a 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -331,21 +331,18 @@ RelationBuildPartitionDesc(Relation rel) } else if (key->strategy == PARTITION_STRATEGY_RANGE) { - int j, - k; + int k; PartitionRangeBound **all_bounds, *prev; - bool *distinct_indexes; all_bounds = (PartitionRangeBound **) palloc0(2 * nparts * sizeof(PartitionRangeBound *)); - distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool)); /* * Create a unified list of range bounds across all the * partitions. */ - i = j = 0; + i = ndatums = 0; foreach(cell, boundspecs) { PartitionBoundSpec *spec = castNode(PartitionBoundSpec, @@ -371,28 +368,26 @@ RelationBuildPartitionDesc(Relation rel) true); upper = make_one_range_bound(key, i, spec->upperdatums, false); - all_bounds[j] = lower; - all_bounds[j + 1] = upper; - j += 2; + all_bounds[ndatums++] = lower; + all_bounds[ndatums++] = upper; i++; } - Assert(j == nparts * 2 || - (default_index != -1 && j == (nparts - 1) * 2)); + Assert(ndatums == nparts * 2 || + (default_index != -1 && ndatums == (nparts - 1) * 2)); /* Sort all the bounds in ascending order */ - qsort_arg(all_bounds, j, + qsort_arg(all_bounds, ndatums, sizeof(PartitionRangeBound *), qsort_partition_rbound_cmp, (void *) key); - /* - * Count the number of distinct bounds to allocate an array of - * that size. - */ - ndatums = 0; + /* Save distinct bounds from all_bounds into rbounds. */ + rbounds = (PartitionRangeBound **) + palloc(ndatums * sizeof(PartitionRangeBound *)); + k = 0; prev = NULL; - for (i = 0; i < 2 * nparts; i++) + for (i = 0; i < ndatums; i++) { PartitionRangeBound *cur = all_bounds[i]; bool is_distinct = false; @@ -429,34 +424,18 @@ RelationBuildPartitionDesc(Relation rel) } /* - * Count the current bound if it is distinct from the previous - * one. 
Also, store if the index i contains a distinct bound - * that we'd like put in the relcache array. + * Only if the bound is distinct save it into a temporary + * array i.e. rbounds which is later copied into boundinfo + * datums array. */ if (is_distinct) - { - distinct_indexes[i] = true; - ndatums++; - } - else - distinct_indexes[i] = false; + rbounds[k++] = all_bounds[i]; prev = cur; } - /* - * Finally save them in an array from where they will be copied - * into the relcache. - */ - rbounds = (PartitionRangeBound **) palloc(ndatums * - sizeof(PartitionRangeBound *)); - k = 0; - for (i = 0; i < 2 * nparts; i++) - { - if (distinct_indexes[i]) - rbounds[k++] = all_bounds[i]; - } - Assert(k == ndatums); + /* Update ndatums to hold the count of distinct datums. */ + ndatums = k; } else elog(ERROR, "unexpected partition strategy: %d", diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 73159be1..fbeeb3e5 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - &appinfo); + appinfo); /* * We have to make child entries in the EquivalenceClass data diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3aa0c9f4..c1583491 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -5141,7 +5141,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, int gather_param, Bitmapset *valid_ context.paramids = bms_add_members(context.paramids, scan_params); break; - + case T_SubqueryScan: { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 64bb8794..4c2e07ac 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1400,6 +1400,7 @@ describeOneTableDetails(const char *schemaname, bool rowsecurity; bool forcerowsecurity; bool hasoids; + bool ispartition; Oid tablespace; char *reloptions; char *reloftype; @@ -1428,7 +1429,7 @@ describeOneTableDetails(const char *schemaname, printfPQExpBuffer(&buf, "SELECT c.relchecks, c.relkind, c.relhasindex, c.relhasrules, " "c.relhastriggers, c.relrowsecurity, c.relforcerowsecurity, " - "c.relhasoids, %s, c.reltablespace, " + "c.relhasoids, c.relispartition, %s, c.reltablespace, " "CASE WHEN c.reloftype = 0 THEN '' ELSE c.reloftype::pg_catalog.regtype::pg_catalog.text END, " #ifdef __TBASE__ "c.relpersistence, c.relreplident, c.relpartkind\n" @@ -1559,20 +1560,21 @@ describeOneTableDetails(const char *schemaname, tableinfo.rowsecurity = strcmp(PQgetvalue(res, 0, 5), "t") == 0; tableinfo.forcerowsecurity = strcmp(PQgetvalue(res, 0, 6), "t") == 0; tableinfo.hasoids = strcmp(PQgetvalue(res, 0, 7), "t") == 0; + tableinfo.ispartition = strcmp(PQgetvalue(res, 0, 8), "t") == 0; tableinfo.reloptions = (pset.sversion >= 80200) ? - pg_strdup(PQgetvalue(res, 0, 8)) : NULL; + pg_strdup(PQgetvalue(res, 0, 9)) : NULL; tableinfo.tablespace = (pset.sversion >= 80000) ? - atooid(PQgetvalue(res, 0, 9)) : 0; + atooid(PQgetvalue(res, 0, 10)) : 0; tableinfo.reloftype = (pset.sversion >= 90000 && - strcmp(PQgetvalue(res, 0, 10), "") != 0) ? - pg_strdup(PQgetvalue(res, 0, 10)) : NULL; + strcmp(PQgetvalue(res, 0, 11), "") != 0) ? + pg_strdup(PQgetvalue(res, 0, 11)) : NULL; tableinfo.relpersistence = (pset.sversion >= 90100) ? 
- *(PQgetvalue(res, 0, 11)) : 0; + *(PQgetvalue(res, 0, 12)) : 0; tableinfo.relreplident = (pset.sversion >= 90400) ? - *(PQgetvalue(res, 0, 12)) : 'd'; + *(PQgetvalue(res, 0, 13)) : 'd'; #ifdef __TBASE__ tableinfo.relpartkind = (pset.sversion >= 90500)? - *PQgetvalue(res, 0, 13) : 'n'; + *PQgetvalue(res, 0, 14) : 'n'; #endif PQclear(res); res = NULL; @@ -2257,11 +2259,15 @@ describeOneTableDetails(const char *schemaname, tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { printfPQExpBuffer(&buf, - "SELECT conname,\n" - " pg_catalog.pg_get_constraintdef(r.oid, true) as condef\n" - "FROM pg_catalog.pg_constraint r\n" - "WHERE r.conrelid = '%s' AND r.contype = 'f' ORDER BY 1;", - oid); + "SELECT conrelid = '%s'::pg_catalog.regclass AS sametable,\n" + " conname,\n" + " pg_catalog.pg_get_constraintdef(oid, true) as condef,\n" + " conrelid::pg_catalog.regclass AS ontable\n" + " FROM pg_catalog.pg_constraint,\n" + " pg_catalog.pg_partition_ancestors('%s')\n" + " WHERE conrelid = relid AND contype = 'f'\n" + " ORDER BY sametable DESC, conname;", + oid, oid); result = PSQLexec(buf.data); if (!result) goto error_return; @@ -2273,10 +2279,15 @@ describeOneTableDetails(const char *schemaname, printTableAddFooter(&cont, _("Foreign-key constraints:")); for (i = 0; i < tuples; i++) { - /* untranslated constraint name and def */ - printfPQExpBuffer(&buf, " \"%s\" %s", - PQgetvalue(result, i, 0), - PQgetvalue(result, i, 1)); + /* + * Print untranslated constraint name and definition. Use + * a "TABLE tab" prefix when the constraint is defined in + * a parent partitioned table. + */ + printfPQExpBuffer(&buf, " TABLE \"%s\" CONSTRAINT \"%s\" %s", + PQgetvalue(result, i, 1), + PQgetvalue(result, i, 2), + PQgetvalue(result, i, 3)); printTableAddFooter(&cont, buf.data); } @@ -3819,7 +3830,7 @@ listPartitionedTables(const char *reltypes, const char *pattern, bool verbose) { char sverbuf[32]; - pg_log_error("The server (version %s) does not support declarative table partitioning.", + psql_error("The server (version %s) does not support declarative table partitioning.", formatPGVersionNumber(pset.sversion, false, sverbuf, sizeof(sverbuf))); return true; diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index db21cc50..638f04f5 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -3530,11 +3530,11 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); else if (TailMatchesCS1("\\dp") || TailMatchesCS1("\\z")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tsvmf, NULL); - else if (TailMatchesCS("\\dPi*")) + else if (TailMatchesCS1("\\dPi*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_indexes, NULL); - else if (TailMatchesCS("\\dPt*")) + else if (TailMatchesCS1("\\dPt*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, NULL); - else if (TailMatchesCS("\\dP*")) + else if (TailMatchesCS1("\\dP*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_relations, NULL); else if (TailMatchesCS1("\\ds*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_sequences, NULL); diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index b239bbec..b02d0d0d 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -153,77 +153,77 @@ DATA(insert ( 4033 3802 3802 1 4044 )); /* hash */ DATA(insert ( 427 1042 1042 1 1080 )); -DATA(insert ( 427 1042 1042 2 972 )); +DATA(insert ( 427 1042 1042 2 4676 )); DATA(insert ( 431 18 18 1 454 )); -DATA(insert ( 431 18 18 2 
446 )); +DATA(insert ( 431 18 18 2 4666 )); DATA(insert ( 435 1082 1082 1 450 )); -DATA(insert ( 435 1082 1082 2 425 )); +DATA(insert ( 435 1082 1082 2 4661 )); DATA(insert ( 627 2277 2277 1 626 )); -DATA(insert ( 627 2277 2277 2 782 )); +DATA(insert ( 627 2277 2277 2 4686 )); DATA(insert ( 1971 700 700 1 451 )); -DATA(insert ( 1971 700 700 2 443 )); +DATA(insert ( 1971 700 700 2 4663 )); DATA(insert ( 1971 701 701 1 452 )); -DATA(insert ( 1971 701 701 2 444 )); +DATA(insert ( 1971 701 701 2 4664 )); DATA(insert ( 1975 869 869 1 422 )); -DATA(insert ( 1975 869 869 2 779 )); +DATA(insert ( 1975 869 869 2 4673 )); DATA(insert ( 1977 21 21 1 449 )); -DATA(insert ( 1977 21 21 2 441 )); +DATA(insert ( 1977 21 21 2 4660 )); DATA(insert ( 1977 23 23 1 450 )); -DATA(insert ( 1977 23 23 2 425 )); +DATA(insert ( 1977 23 23 2 4661 )); DATA(insert ( 1977 20 20 1 949 )); -DATA(insert ( 1977 20 20 2 442 )); +DATA(insert ( 1977 20 20 2 4662 )); DATA(insert ( 1983 1186 1186 1 1697 )); -DATA(insert ( 1983 1186 1186 2 3418 )); +DATA(insert ( 1983 1186 1186 2 4679 )); DATA(insert ( 1985 829 829 1 399 )); -DATA(insert ( 1985 829 829 2 778 )); +DATA(insert ( 1985 829 829 2 4672 )); DATA(insert ( 1987 19 19 1 455 )); -DATA(insert ( 1987 19 19 2 447 )); +DATA(insert ( 1987 19 19 2 4667 )); DATA(insert ( 1990 26 26 1 453 )); -DATA(insert ( 1990 26 26 2 445 )); +DATA(insert ( 1990 26 26 2 4665 )); DATA(insert ( 1992 30 30 1 457 )); -DATA(insert ( 1992 30 30 2 776 )); +DATA(insert ( 1992 30 30 2 4670 )); DATA(insert ( 1995 25 25 1 400 )); -DATA(insert ( 1995 25 25 2 448)); +DATA(insert ( 1995 25 25 2 4668)); DATA(insert ( 1997 1083 1083 1 1688 )); -DATA(insert ( 1997 1083 1083 2 3409 )); +DATA(insert ( 1997 1083 1083 2 4677 )); DATA(insert ( 1998 1700 1700 1 432 )); -DATA(insert ( 1998 1700 1700 2 780 )); +DATA(insert ( 1998 1700 1700 2 4674 )); DATA(insert ( 1999 1184 1184 1 2039 )); -DATA(insert ( 1999 1184 1184 2 3411 )); +DATA(insert ( 1999 1184 1184 2 4680 )); DATA(insert ( 2001 1266 1266 1 1696 )); -DATA(insert ( 2001 1266 1266 2 3410 )); +DATA(insert ( 2001 1266 1266 2 4678 )); DATA(insert ( 2040 1114 1114 1 2039 )); -DATA(insert ( 2040 1114 1114 2 3411 )); +DATA(insert ( 2040 1114 1114 2 4680 )); DATA(insert ( 2222 16 16 1 454 )); -DATA(insert ( 2222 16 16 2 446 )); +DATA(insert ( 2222 16 16 2 4666 )); DATA(insert ( 2223 17 17 1 456 )); -DATA(insert ( 2223 17 17 2 772 )); +DATA(insert ( 2223 17 17 2 4669 )); DATA(insert ( 2225 28 28 1 450 )); -DATA(insert ( 2225 28 28 2 425)); +DATA(insert ( 2225 28 28 2 4661)); DATA(insert ( 2226 29 29 1 450 )); -DATA(insert ( 2226 29 29 2 425 )); +DATA(insert ( 2226 29 29 2 4661 )); DATA(insert ( 2227 702 702 1 450 )); -DATA(insert ( 2227 702 702 2 425 )); +DATA(insert ( 2227 702 702 2 4661 )); DATA(insert ( 2228 703 703 1 450 )); -DATA(insert ( 2228 703 703 2 425 )); +DATA(insert ( 2228 703 703 2 4661 )); DATA(insert ( 2229 25 25 1 400 )); -DATA(insert ( 2229 25 25 2 448 )); +DATA(insert ( 2229 25 25 2 4668 )); DATA(insert ( 2231 1042 1042 1 1080 )); -DATA(insert ( 2231 1042 1042 2 972 )); +DATA(insert ( 2231 1042 1042 2 4676 )); DATA(insert ( 2235 1033 1033 1 329 )); -DATA(insert ( 2235 1033 1033 2 777 )); +DATA(insert ( 2235 1033 1033 2 4671 )); DATA(insert ( 2969 2950 2950 1 2963 )); -DATA(insert ( 2969 2950 2950 2 3412 )); +DATA(insert ( 2969 2950 2950 2 4681 )); DATA(insert ( 3254 3220 3220 1 3252 )); -DATA(insert ( 3254 3220 3220 2 3413 )); +DATA(insert ( 3254 3220 3220 2 4682 )); DATA(insert ( 3372 774 774 1 328 )); -DATA(insert ( 3372 774 774 2 781 )); 
+DATA(insert ( 3372 774 774 2 4675 )); DATA(insert ( 3523 3500 3500 1 3515 )); -DATA(insert ( 3523 3500 3500 2 3414 )); +DATA(insert ( 3523 3500 3500 2 4683 )); DATA(insert ( 3903 3831 3831 1 3902 )); -DATA(insert ( 3903 3831 3831 2 3417 )); +DATA(insert ( 3903 3831 3831 2 4685 )); DATA(insert ( 4034 3802 3802 1 4045 )); -DATA(insert ( 4034 3802 3802 2 3416)); +DATA(insert ( 4034 3802 3802 2 4684)); /* gist */ DATA(insert ( 1029 600 600 1 2179 )); From ac8c2434c5dc5be3570e150ea6ca3c4d78082bdb Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 10 Jul 2020 11:58:37 +0800 Subject: [PATCH 304/578] Fix crash when ALTER TABLE recreates indexes on partitions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/indexcmds.c | 4 ++-- src/backend/commands/tablecmds.c | 2 +- src/test/regress/expected/indexing.out | 8 ++++++++ src/test/regress/sql/indexing.sql | 9 +++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index be45e453..76701f4a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -964,8 +964,8 @@ DefineIndex(Oid relationId, DefineIndex(childRelid, childStmt, InvalidOid, /* no predefined OID */ indexRelationId, /* this is our child */ - false, check_rights, check_not_in_use, - false, quiet); + is_alter_table, check_rights, check_not_in_use, + skip_build, quiet); } pfree(attmap); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 430141a2..989fb062 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -17155,7 +17155,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) RelationGetDescr(rel)->natts); DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, RelationGetRelid(idxRel), - false, false, false, false, false); + true, false, false, false, false); } index_close(idxRel, AccessShareLock); diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index 804aa2eb..d4326a87 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -31,6 +31,14 @@ ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); ERROR: cannot create index on partitioned table "idxpart" concurrently drop table idxpart; +-- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE +-- https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com +CREATE TABLE idxpart (a INT, b TEXT, c INT) PARTITION BY RANGE(a); +CREATE TABLE idxpart1 PARTITION OF idxpart FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +CREATE INDEX partidx_abc_idx ON idxpart (a, b, c); +INSERT INTO idxpart (a, b, c) SELECT i, i, i FROM generate_series(1, 50) i; +ALTER TABLE idxpart ALTER COLUMN c TYPE numeric; +DROP TABLE idxpart; -- If a table without index is attached as partition to a table with -- an index, the index is automatically created create table idxpart (a int, b int, c text) partition by range (a); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index cd1dd3b0..4762e687 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -19,6 +19,15 @@ create unique index on idxpart (a); create index concurrently on idxpart (a); drop table idxpart; +-- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE +-- 
https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com +CREATE TABLE idxpart (a INT, b TEXT, c INT) PARTITION BY RANGE(a); +CREATE TABLE idxpart1 PARTITION OF idxpart FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +CREATE INDEX partidx_abc_idx ON idxpart (a, b, c); +INSERT INTO idxpart (a, b, c) SELECT i, i, i FROM generate_series(1, 50) i; +ALTER TABLE idxpart ALTER COLUMN c TYPE numeric; +DROP TABLE idxpart; + -- If a table without index is attached as partition to a table with -- an index, the index is automatically created create table idxpart (a int, b int, c text) partition by range (a); From d10527dd94a348df085f2c41864b0b0a2b0088f1 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 16 Jul 2020 19:30:57 +0800 Subject: [PATCH 305/578] fix regress error related partition table. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 6 +- src/backend/optimizer/path/equivclass.c | 5 +- src/backend/optimizer/path/joinrels.c | 2 +- src/backend/optimizer/plan/planner.c | 2 +- src/backend/optimizer/prep/prepunion.c | 273 +- src/backend/optimizer/util/pathnode.c | 3 +- src/backend/optimizer/util/placeholder.c | 2 +- src/backend/optimizer/util/relnode.c | 2 +- src/bin/psql/describe.c | 45 +- src/include/optimizer/prep.h | 8 +- src/test/regress/expected/alter_table.out | 21 +- src/test/regress/expected/alter_table_3.out | 3 +- src/test/regress/expected/create_table.out | 6 + src/test/regress/expected/event_trigger.out | 7 +- src/test/regress/expected/foreign_data.out | 3 + src/test/regress/expected/foreign_key_2.out | 25 +- src/test/regress/expected/identity_1.out | 12 + src/test/regress/expected/indexing.out | 65 +- src/test/regress/expected/inherit_2.out | 24 + src/test/regress/expected/inherit_3.out | 78 +- src/test/regress/expected/insert.out | 50 +- .../regress/expected/insert_conflict_1.out | 21 + src/test/regress/expected/partition_info.out | 94 +- .../regress/expected/partition_join_1.out | 2102 ++++++++++++++ src/test/regress/expected/partition_prune.out | 2477 +++++++++-------- .../regress/expected/partition_prune_hash.out | 288 +- src/test/regress/expected/psql.out | 26 +- src/test/regress/expected/sanity_check_1.out | 6 + src/test/regress/expected/sysviews.out | 2 +- src/test/regress/expected/temp.out | 2 +- src/test/regress/expected/truncate.out | 22 +- src/test/regress/expected/update.out | 355 +-- src/test/regress/input/tablespace.source | 29 +- src/test/regress/output/tablespace.source | 326 +-- src/test/regress/sql/alter_table.sql | 4 +- src/test/regress/sql/event_trigger.sql | 2 +- src/test/regress/sql/indexing.sql | 32 +- src/test/regress/sql/inherit.sql | 2 +- src/test/regress/sql/insert.sql | 6 +- src/test/regress/sql/partition_info.sql | 24 - src/test/regress/sql/partition_prune.sql | 2 +- src/test/regress/sql/select_parallel.sql | 2 + src/test/regress/sql/temp.sql | 2 +- src/test/regress/sql/truncate.sql | 4 +- src/test/regress/sql/update.sql | 23 +- 45 files changed, 4348 insertions(+), 2147 deletions(-) create mode 100644 src/test/regress/expected/partition_join_1.out diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index fbeeb3e5..4326a646 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - appinfo); + 1, 
&appinfo); /* * We have to make child entries in the EquivalenceClass data @@ -1073,7 +1073,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(IsA(rinfo, RestrictInfo)); childqual = adjust_appendrel_attrs(root, (Node *) rinfo->clause, - appinfo); + 1, &appinfo); childqual = eval_const_expressions(root, childqual); /* check for flat-out constant */ if (childqual && IsA(childqual, Const)) @@ -1190,7 +1190,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) rel->joininfo, - appinfo); + 1, &appinfo); /* * If parallelism is allowable for this query in general, see whether diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index 4ad19a55..bb16aff0 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -1344,7 +1344,8 @@ generate_join_implied_equalities_broken(PlannerInfo *root, if (IS_OTHER_REL(inner_rel) && result != NIL) result = (List *) adjust_appendrel_attrs_multilevel(root, (Node *) result, - inner_rel); + inner_rel->relids, + inner_rel->top_parent_relids); return result; } @@ -2127,7 +2128,7 @@ add_child_rel_equivalences(PlannerInfo *root, child_expr = (Expr *) adjust_appendrel_attrs(root, (Node *) cur_em->em_expr, - appinfo); + 1, &appinfo); /* * Transform em_relids to match. Note we do *not* do diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 715036b9..d8afa3ef 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1405,7 +1405,7 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * applicable to the parent join. */ child_restrictlist = - (List *) adjust_appendrel_attrs_nappinfos(root, + (List *) adjust_appendrel_attrs(root, (Node *) parent_restrictlist, nappinfos, appinfos); pfree(appinfos); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2b736fd4..55c28ea3 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1402,7 +1402,7 @@ inheritance_planner(PlannerInfo *root) subroot->parse = (Query *) adjust_appendrel_attrs(parent_root, (Node *) parent_parse, - appinfo); + 1, &appinfo); /* * If there are securityQuals attached to the parent, move them to the diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 62bc5200..e5ddbc5c 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -58,9 +58,8 @@ typedef struct { PlannerInfo *root; - AppendRelInfo *appinfo; - AppendRelInfo **appinfos; int nappinfos; + AppendRelInfo **appinfos; } adjust_appendrel_attrs_context; static Path *recurse_set_operations(Node *setOp, PlannerInfo *root, @@ -125,7 +124,8 @@ static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); static Relids adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos); -static Relids adjust_relid_set(Relids relids, Index oldrelid, Index newrelid); +static Relids adjust_child_relids(Relids relids, int nappinfos, + AppendRelInfo **appinfos); static List *adjust_inherited_tlist(List *tlist, AppendRelInfo *context); @@ -1991,56 +1991,6 @@ translate_col_privs(const Bitmapset *parent_privs, return child_privs; } -/* - * adjust_appendrel_attrs - * Copy the specified query or expression and translate Vars referring - * to the parent rel of the specified 
AppendRelInfo to refer to the - * child rel instead. We also update rtindexes appearing outside Vars, - * such as resultRelation and jointree relids. - * - * Note: this is only applied after conversion of sublinks to subplans, - * so we don't need to cope with recursion into sub-queries. - * - * Note: this is not hugely different from what pullup_replace_vars() does; - * maybe we should try to fold the two routines together. - */ -Node * -adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) -{ - Node *result; - adjust_appendrel_attrs_context context; - - context.root = root; - context.appinfo = appinfo; - - /* - * Must be prepared to start with a Query or a bare expression tree. - */ - if (node && IsA(node, Query)) - { - Query *newnode; - - newnode = query_tree_mutator((Query *) node, - adjust_appendrel_attrs_mutator, - (void *) &context, - QTW_IGNORE_RC_SUBQUERIES); - if (newnode->resultRelation == appinfo->parent_relid) - { - newnode->resultRelation = appinfo->child_relid; - /* Fix tlist resnos too, if it's inherited UPDATE */ - if (newnode->commandType == CMD_UPDATE) - newnode->targetList = - adjust_inherited_tlist(newnode->targetList, - appinfo); - } - result = (Node *) newnode; - } - else - result = adjust_appendrel_attrs_mutator(node, &context); - - return result; -} - /* * adjust_appendrel_attrs * Copy the specified query or expression and translate Vars referring to a @@ -2055,7 +2005,7 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) * maybe we should try to fold the two routines together. */ Node * -adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, +adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, AppendRelInfo **appinfos) { Node *result; @@ -2107,17 +2057,28 @@ adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, static Node * adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context) -{// #lizard forgives - AppendRelInfo *appinfo = context->appinfo; +{ + AppendRelInfo **appinfos = context->appinfos; + int nappinfos = context->nappinfos; + int cnt; if (node == NULL) return NULL; if (IsA(node, Var)) { Var *var = (Var *) copyObject(node); + AppendRelInfo *appinfo = NULL; + + for (cnt = 0; cnt < nappinfos; cnt++) + { + if (var->varno == appinfos[cnt]->parent_relid) + { + appinfo = appinfos[cnt]; + break; + } + } - if (var->varlevelsup == 0 && - var->varno == appinfo->parent_relid) + if (var->varlevelsup == 0 && appinfo) { var->varno = appinfo->child_relid; var->varnoold = appinfo->child_relid; @@ -2197,29 +2158,54 @@ adjust_appendrel_attrs_mutator(Node *node, { CurrentOfExpr *cexpr = (CurrentOfExpr *) copyObject(node); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + if (cexpr->cvarno == appinfo->parent_relid) + { cexpr->cvarno = appinfo->child_relid; + break; + } + } return (Node *) cexpr; } if (IsA(node, RangeTblRef)) { RangeTblRef *rtr = (RangeTblRef *) copyObject(node); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + if (rtr->rtindex == appinfo->parent_relid) + { rtr->rtindex = appinfo->child_relid; + break; + } + } return (Node *) rtr; } if (IsA(node, JoinExpr)) { /* Copy the JoinExpr node with correct mutation of subnodes */ JoinExpr *j; + AppendRelInfo *appinfo; j = (JoinExpr *) expression_tree_mutator(node, adjust_appendrel_attrs_mutator, (void *) context); /* now fix JoinExpr's rtindex (probably never happens) */ + for (cnt = 0; cnt < 
nappinfos; cnt++) + { + appinfo = appinfos[cnt]; + if (j->rtindex == appinfo->parent_relid) + { j->rtindex = appinfo->child_relid; + break; + } + } return (Node *) j; } if (IsA(node, PlaceHolderVar)) @@ -2232,9 +2218,8 @@ adjust_appendrel_attrs_mutator(Node *node, (void *) context); /* now fix PlaceHolderVar's relid sets */ if (phv->phlevelsup == 0) - phv->phrels = adjust_relid_set(phv->phrels, - appinfo->parent_relid, - appinfo->child_relid); + phv->phrels = adjust_child_relids(phv->phrels, context->nappinfos, + context->appinfos); return (Node *) phv; } /* Shouldn't need to handle planner auxiliary nodes here */ @@ -2265,24 +2250,24 @@ adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_mutator((Node *) oldinfo->orclause, context); /* adjust relid sets too */ - newinfo->clause_relids = adjust_relid_set(oldinfo->clause_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->required_relids = adjust_relid_set(oldinfo->required_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->outer_relids = adjust_relid_set(oldinfo->outer_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->nullable_relids = adjust_relid_set(oldinfo->nullable_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->left_relids = adjust_relid_set(oldinfo->left_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->right_relids = adjust_relid_set(oldinfo->right_relids, - appinfo->parent_relid, - appinfo->child_relid); + newinfo->clause_relids = adjust_child_relids(oldinfo->clause_relids, + context->nappinfos, + context->appinfos); + newinfo->required_relids = adjust_child_relids(oldinfo->required_relids, + context->nappinfos, + context->appinfos); + newinfo->outer_relids = adjust_child_relids(oldinfo->outer_relids, + context->nappinfos, + context->appinfos); + newinfo->nullable_relids = adjust_child_relids(oldinfo->nullable_relids, + context->nappinfos, + context->appinfos); + newinfo->left_relids = adjust_child_relids(oldinfo->left_relids, + context->nappinfos, + context->appinfos); + newinfo->right_relids = adjust_child_relids(oldinfo->right_relids, + context->nappinfos, + context->appinfos); /* * Reset cached derivative fields, since these might need to have @@ -2347,23 +2332,6 @@ adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos) return relids; } -/* - * Substitute newrelid for oldrelid in a Relid set - */ -static Relids -adjust_relid_set(Relids relids, Index oldrelid, Index newrelid) -{ - if (bms_is_member(oldrelid, relids)) - { - /* Ensure we have a modifiable copy */ - relids = bms_copy(relids); - /* Remove old, add new */ - relids = bms_del_member(relids, oldrelid); - relids = bms_add_member(relids, newrelid); - } - return relids; -} - /* * Replace any relid present in top_parent_relids with its child in * child_relids. Members of child_relids can be multiple levels below top @@ -2518,65 +2486,42 @@ adjust_inherited_tlist(List *tlist, AppendRelInfo *context) * adjust_appendrel_attrs_multilevel * Apply Var translations from a toplevel appendrel parent down to a child. * - * In some cases we need to translate expressions referencing a baserel + * In some cases we need to translate expressions referencing a parent relation * to reference an appendrel child that's multiple levels removed from it. 
*/ Node * adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel) + Relids child_relids, + Relids top_parent_relids) { - AppendRelInfo *appinfo = find_childrel_appendrelinfo(root, child_rel); - RelOptInfo *parent_rel = find_base_rel(root, appinfo->parent_relid); + AppendRelInfo **appinfos; + Bitmapset *parent_relids = NULL; + int nappinfos; + int cnt; - /* If parent is also a child, first recurse to apply its translations */ - if (IS_OTHER_REL(parent_rel)) - node = adjust_appendrel_attrs_multilevel(root, node, parent_rel); - else - Assert(parent_rel->reloptkind == RELOPT_BASEREL); - /* Now translate for this child */ - return adjust_appendrel_attrs(root, node, appinfo); -} + Assert(bms_num_members(child_relids) == bms_num_members(top_parent_relids)); -/* - * Construct the SpecialJoinInfo for a child-join by translating - * SpecialJoinInfo for the join between parents. left_relids and right_relids - * are the relids of left and right side of the join respectively. - */ -SpecialJoinInfo * -build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, - Relids left_relids, Relids right_relids) + appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); + + /* Construct relids set for the immediate parent of given child. */ + for (cnt = 0; cnt < nappinfos; cnt++) { - SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); - AppendRelInfo **left_appinfos; - int left_nappinfos; - AppendRelInfo **right_appinfos; - int right_nappinfos; + AppendRelInfo *appinfo = appinfos[cnt]; - memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); - left_appinfos = find_appinfos_by_relids(root, left_relids, - &left_nappinfos); - right_appinfos = find_appinfos_by_relids(root, right_relids, - &right_nappinfos); + parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); + } - sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, - left_nappinfos, left_appinfos); - sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, - right_nappinfos, - right_appinfos); - sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, - left_nappinfos, left_appinfos); - sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, - right_nappinfos, - right_appinfos); - sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs_nappinfos(root, - (Node *) sjinfo->semi_rhs_exprs, - right_nappinfos, - right_appinfos); + /* Recurse if immediate parent is not the top parent. */ + if (!bms_equal(parent_relids, top_parent_relids)) + node = adjust_appendrel_attrs_multilevel(root, node, parent_relids, + top_parent_relids); - pfree(left_appinfos); - pfree(right_appinfos); + /* Now translate for this child */ + node = adjust_appendrel_attrs(root, node, nappinfos, appinfos); - return sjinfo; + pfree(appinfos); + + return node; } /* @@ -2615,3 +2560,45 @@ find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) elog(ERROR, "did not find all requested child rels in append_rel_list"); return NULL; /* not reached */ } + +/* + * Construct the SpecialJoinInfo for a child-join by translating + * SpecialJoinInfo for the join between parents. left_relids and right_relids + * are the relids of left and right side of the join respectively. 
+ */ +SpecialJoinInfo * +build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + AppendRelInfo **left_appinfos; + int left_nappinfos; + AppendRelInfo **right_appinfos; + int right_nappinfos; + + memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); + left_appinfos = find_appinfos_by_relids(root, left_relids, + &left_nappinfos); + right_appinfos = find_appinfos_by_relids(root, right_relids, + &right_nappinfos); + + sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, + left_nappinfos, left_appinfos); + sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, + right_nappinfos, + right_appinfos); + sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, + left_nappinfos, left_appinfos); + sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, + right_nappinfos, + right_appinfos); + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + (Node *) sjinfo->semi_rhs_exprs, + right_nappinfos, + right_appinfos); + + pfree(left_appinfos); + pfree(right_appinfos); + + return sjinfo; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 2d4a5d2b..4d2a1f32 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6967,7 +6967,8 @@ reparameterize_path_by_child(PlannerInfo *root, Path *path, #define ADJUST_CHILD_ATTRS(node) \ ((node) = \ (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ - child_rel)) + child_rel->relids, \ + child_rel->top_parent_relids)) #define REPARAMETERIZE_CHILD_PATH(path) \ do { \ diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index a344dbe8..0d5351a6 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -499,7 +499,7 @@ add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, if (bms_overlap(phv->phrels, parentrel->relids) && childrel->reloptkind == RELOPT_OTHER_JOINREL) { - phv = (PlaceHolderVar *) adjust_appendrel_attrs_nappinfos(root, + phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, (Node *) phv, nappinfos, appinfos); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 39100cae..1f6fb286 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -819,7 +819,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Construct joininfo list. 
*/ appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); - joinrel->joininfo = (List *) adjust_appendrel_attrs_nappinfos(root, + joinrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) parent_joinrel->joininfo, nappinfos, appinfos); diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 4c2e07ac..ff427084 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1911,12 +1911,15 @@ describeOneTableDetails(const char *schemaname, { char *parent_name = PQgetvalue(result, 0, 0); char *partdef = PQgetvalue(result, 0, 1); - char *partconstraintdef = NULL; printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); + if (verbose) + { + char *partconstraintdef = NULL; + if (!PQgetisnull(result, 0, 2)) partconstraintdef = PQgetvalue(result, 0, 2); /* If there isn't any constraint, show that explicitly */ @@ -1926,7 +1929,7 @@ describeOneTableDetails(const char *schemaname, printfPQExpBuffer(&tmpbuf, _("Partition constraint: %s"), partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - + } } PQclear(result); } @@ -2258,6 +2261,13 @@ describeOneTableDetails(const char *schemaname, if (tableinfo.hastriggers || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { + if (pset.sversion >= 100000 && + (tableinfo.ispartition || tableinfo.relkind == RELKIND_PARTITIONED_TABLE)) + { + /* + * Put the constraints defined in this table first, followed + * by the constraints defined in ancestor partitioned tables. + */ printfPQExpBuffer(&buf, "SELECT conrelid = '%s'::pg_catalog.regclass AS sametable,\n" " conname,\n" @@ -2268,6 +2278,21 @@ describeOneTableDetails(const char *schemaname, " WHERE conrelid = relid AND contype = 'f'\n" " ORDER BY sametable DESC, conname;", oid, oid); + + } + else + { + printfPQExpBuffer(&buf, + "SELECT true as sametable, conname,\n" + " pg_catalog.pg_get_constraintdef(r.oid, true) as condef,\n" + " conrelid::pg_catalog.regclass AS ontable\n" + " FROM pg_catalog.pg_constraint r\n" + " WHERE r.conrelid = '%s' AND r.contype = 'f'\n", + oid); + + appendPQExpBuffer(&buf, " ORDER BY conname"); + } + result = PSQLexec(buf.data); if (!result) goto error_return; @@ -2276,6 +2301,11 @@ describeOneTableDetails(const char *schemaname, if (tuples > 0) { + int i_sametable = PQfnumber(result, "sametable"), + i_conname = PQfnumber(result, "conname"), + i_condef = PQfnumber(result, "condef"), + i_ontable = PQfnumber(result, "ontable"); + printTableAddFooter(&cont, _("Foreign-key constraints:")); for (i = 0; i < tuples; i++) { @@ -2284,10 +2314,15 @@ describeOneTableDetails(const char *schemaname, * a "TABLE tab" prefix when the constraint is defined in * a parent partitioned table. 
*/ + if (strcmp(PQgetvalue(result, i, i_sametable), "f") == 0) printfPQExpBuffer(&buf, " TABLE \"%s\" CONSTRAINT \"%s\" %s", - PQgetvalue(result, i, 1), - PQgetvalue(result, i, 2), - PQgetvalue(result, i, 3)); + PQgetvalue(result, i, i_ontable), + PQgetvalue(result, i, i_conname), + PQgetvalue(result, i, i_condef)); + else + printfPQExpBuffer(&buf, " \"%s\" %s", + PQgetvalue(result, i, i_conname), + PQgetvalue(result, i, i_condef)); printTableAddFooter(&cont, buf.data); } diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 99c87a2d..e35330d7 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -53,13 +53,11 @@ extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, - AppendRelInfo *appinfo); - -extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, - AppendRelInfo **appinfos); + int nappinfos, AppendRelInfo **appinfos); extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel); + Relids child_relids, + Relids top_parent_relids); extern AppendRelInfo **find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 626f34f7..737710bc 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -1448,7 +1448,7 @@ copy test("........pg.dropped.1........") to stdout; ERROR: column "........pg.dropped.1........" of relation "test" does not exist copy test from stdin; ERROR: extra data after last expected column -CONTEXT: COPY test, line 1: "10 11 12" +CONTEXT: COPY test, line 1: "10 11 12", nodetype:1(1:cn,0:dn) select * from test order by b; b | c ---+--- @@ -1971,8 +1971,8 @@ ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type create table at_partitioned (a int, b text) partition by range (a); create table at_part_1 partition of at_partitioned for values from (0) to (1000); insert into at_partitioned values (512, '0.123'); -create table at_part_2 (b text, a int); -insert into at_part_2 values ('1.234', 1024); +create table at_part_2 (a int, b text); +insert into at_part_2 values (1024, '1.234'); create index on at_partitioned (b); create index on at_partitioned (a); \d at_part_1 @@ -1990,16 +1990,16 @@ Indexes: Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | text | | | a | integer | | | + b | text | | | alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); \d at_part_2 Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | text | | | a | integer | | | + b | text | | | Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) Indexes: "at_part_2_a_idx" btree (a) @@ -2021,8 +2021,8 @@ Indexes: Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | numeric | | | a | integer | | | + b | numeric | | | Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) Indexes: "at_part_2_a_idx" btree (a) @@ -2992,8 +2992,7 @@ ALTER TABLE new_system_table RENAME TO old_system_table; CREATE INDEX old_system_table__othercol ON old_system_table (othercol); INSERT INTO old_system_table(othercol) VALUES 
('somedata'), ('otherdata'); UPDATE old_system_table SET id = -id; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: Distributed column or partition column "id" can't be updated in current version DELETE FROM old_system_table WHERE othercol = 'somedata'; TRUNCATE old_system_table; ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; @@ -3577,8 +3576,8 @@ CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); -- check that the new partition won't overlap with an existing partition CREATE TABLE hash_parted ( - a int, - b int + a int, + b int ) PARTITION BY HASH (a custom_opclass); CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); CREATE TABLE fail_part (LIKE hpart_1); @@ -3599,7 +3598,7 @@ ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REM -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE hpart_5 ( - LIKE hash_parted + LIKE hash_parted ) PARTITION BY LIST (b); -- check that violating rows are correctly reported CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 3287e360..50bc6605 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -2931,7 +2931,8 @@ ALTER TABLE new_system_table RENAME TO old_system_table; CREATE INDEX old_system_table__othercol ON old_system_table (othercol); INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); UPDATE old_system_table SET id = -id; -ERROR: Distributed column or partition column "id" can't be updated in current version +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
DELETE FROM old_system_table WHERE othercol = 'somedata'; TRUNCATE old_system_table; ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 3290fe55..4f679633 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -446,6 +446,8 @@ Number of partitions: 0 b | text | | | | extended | | Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 +Distribute By: HASH(a) +Location Nodes: ALL DATANODES INSERT INTO partitioned2 VALUES (1, 'hello'); ERROR: no partition of relation "partitioned2" found for row @@ -459,6 +461,8 @@ CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO b | text | | | | extended | | Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) +Distribute By: HASH(a) +Location Nodes: ALL DATANODES DROP TABLE partitioned, partitioned2; -- check that dependencies of partition columns are handled correctly @@ -953,5 +957,7 @@ create table boolspart_f partition of boolspart for values in (false); Partition key: LIST (a) Partitions: boolspart_f FOR VALUES IN (false), boolspart_t FOR VALUES IN (true) +Distribute By: HASH(a) +Location Nodes: ALL DATANODES drop table boolspart; diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 2537e6f1..6ff64a59 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -285,7 +285,7 @@ CREATE SCHEMA evttrig CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); -- Partitioned tables with a partitioned index CREATE TABLE evttrig.parted ( - id int PRIMARY KEY) + id int) PARTITION BY RANGE (id); CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) FOR VALUES FROM (1) TO (10); @@ -304,11 +304,6 @@ NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table evttrig.one drop cascades to table evttrig.two drop cascades to table evttrig.parted -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted name={evttrig,parted} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 name={evttrig,part_1_10} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 name={evttrig,part_10_20} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 name={evttrig,part_10_15} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; DROP EVENT TRIGGER regress_event_trigger_report_dropped; ERROR: event trigger "regress_event_trigger_report_dropped" does not exist diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index a5326254..dec9af7f 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -1491,6 +1491,7 @@ ERROR: server "s0" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1540,6 +1541,7 @@ 
ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1560,6 +1562,7 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index e3b7210b..c64ced2f 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1442,26 +1442,39 @@ drop table pktable2, fktable2; -- Ensure that works. CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +ERROR: foreign key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notparti... + ^ CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +ERROR: relation "fk_partitioned_fk" does not exist INSERT INTO fk_notpartitioned_pk VALUES (1); INSERT INTO fk_partitioned_fk VALUES (1); +ERROR: relation "fk_partitioned_fk" does not exist +LINE 1: INSERT INTO fk_partitioned_fk VALUES (1); + ^ ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; DELETE FROM fk_notpartitioned_pk WHERE a = 1; -ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" -DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +ERROR: table "fk_partitioned_fk" does not exist -- ensure we check partitions are "not used" when dropping constraints CREATE SCHEMA fkpart8 CREATE TABLE tbl1(f1 int PRIMARY KEY) CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +ERROR: foreign key constraints are not supported on partitioned tables +LINE 3: CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIA... 
+ ^ INSERT INTO fkpart8.tbl1 VALUES(1); +ERROR: relation "fkpart8.tbl1" does not exist +LINE 1: INSERT INTO fkpart8.tbl1 VALUES(1); + ^ BEGIN; INSERT INTO fkpart8.tbl2 VALUES(1); +ERROR: relation "fkpart8.tbl2" does not exist +LINE 1: INSERT INTO fkpart8.tbl2 VALUES(1); + ^ ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; -ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +ERROR: current transaction is aborted, commands ignored until end of transaction block COMMIT; DROP SCHEMA fkpart8 CASCADE; -NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to table fkpart8.tbl1 -drop cascades to table fkpart8.tbl2 +ERROR: schema "fkpart8" does not exist diff --git a/src/test/regress/expected/identity_1.out b/src/test/regress/expected/identity_1.out index facf2230..e07bcfa2 100644 --- a/src/test/regress/expected/identity_1.out +++ b/src/test/regress/expected/identity_1.out @@ -322,3 +322,15 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; +-- typed tables (currently not supported) +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +ERROR: identity colums are not supported on typed tables +DROP TYPE itest_type CASCADE; +-- table partitions (currently not supported) +CREATE TABLE itest_parent (f1 date NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +ERROR: identify columns are not supported on partitions +DROP TABLE itest_parent; diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index d4326a87..f996a88d 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -29,7 +29,8 @@ create table idxpart1 partition of idxpart for values from (0) to (10); create unique index on idxpart (a); ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); -ERROR: cannot create index on partitioned table "idxpart" concurrently +ERROR: PGXC does not support concurrent INDEX yet +DETAIL: The feature is not currently supported drop table idxpart; -- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE -- https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com @@ -67,22 +68,18 @@ Indexes: "idxpart1_b_c_idx" btree (b, c) \d+ idxpart1_a_idx - Index "public.idxpart1_a_idx" - Column | Type | Key? | Definition | Storage | Stats target ---------+---------+------+------------+---------+-------------- - a | integer | yes | a | plain | -Partition of: idxparti -No partition constraint + Index "public.idxpart1_a_idx" + Column | Type | Definition | Storage +--------+---------+------------+--------- + a | integer | a | plain btree, for table "public.idxpart1" \d+ idxpart1_b_c_idx - Index "public.idxpart1_b_c_idx" - Column | Type | Key? 
| Definition | Storage | Stats target ---------+---------+------+------------+----------+-------------- - b | integer | yes | b | plain | - c | text | yes | c | extended | -Partition of: idxparti2 -No partition constraint + Index "public.idxpart1_b_c_idx" + Column | Type | Definition | Storage +--------+---------+------------+---------- + b | integer | b | plain + c | text | c | extended btree, for table "public.idxpart1" drop table idxpart; @@ -574,10 +571,8 @@ ERROR: cannot attach index "idxpart2_a_idx" as a partition of index "idxpart_a_ DETAIL: The index definitions do not match. drop table idxpart; -- Verify that attaching indexes maps attribute numbers correctly -create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); -create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2, drop column col3; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); create index idxpart_1_idx on only idxpart (b, a); create index idxpart1_1_idx on idxpart1 (b, a); @@ -618,9 +613,9 @@ drop table idxpart; create table idxpart (a int, b int, c text) partition by range (a); create index idxparti on idxpart (a); create index idxparti2 on idxpart (c, b); -create table idxpart1 (c text, a int, b int); +create table idxpart1 (a int, b int, c text); alter table idxpart attach partition idxpart1 for values from (0) to (10); -create table idxpart2 (c text, a int, b int); +create table idxpart2 (a int, b int, c text); create index on idxpart2 (a); create index on idxpart2 (c, b); alter table idxpart attach partition idxpart2 for values from (10) to (20); @@ -640,12 +635,9 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly in expression indexes -create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); -create table idxpart1 (col2 int, b int, col1 int, a int); -create table idxpart2 (col1 int, col2 int, b int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2; -alter table idxpart2 drop column col1, drop column col2; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create table idxpart2 (a int, b int); create index on idxpart2 (abs(b)); alter table idxpart attach partition idxpart2 for values from (0) to (1); create index on idxpart (abs(b)); @@ -663,14 +655,11 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly for WHERE in a partial index -create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); -alter table idxpart drop column col1, drop column col3; -create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); -alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); -create table idxpart2 (col1 int, col2 int, b int, a int); +create table idxpart2 (a int, b int); create index on idxpart2 (a) where b > 1000; -alter table idxpart2 drop column col1, drop column col2; alter table idxpart attach partition idxpart2 for values from (1000) to 
(2000); create index on idxpart (a) where b > 1000; select c.relname, pg_get_indexdef(indexrelid) @@ -686,7 +675,7 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Column number mapping: dropped columns in the partition -create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +create table idxpart1 (col_keep int, drop_1 int, drop_2 int, drop_3 int); alter table idxpart1 drop column drop_1; alter table idxpart1 drop column drop_2; alter table idxpart1 drop column drop_3; @@ -718,9 +707,9 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; attrelid | attname | attnum -----------------------+------------------------------+-------- - idxpart1 | ........pg.dropped.1........ | 1 + idxpart1 | col_keep | 1 idxpart1 | ........pg.dropped.2........ | 2 - idxpart1 | col_keep | 3 + idxpart1 | ........pg.dropped.3........ | 3 idxpart1 | ........pg.dropped.4........ | 4 idxpart1_col_keep_idx | col_keep | 1 idxpart | col_keep | 1 @@ -729,7 +718,7 @@ select attrelid::regclass, attname, attnum from pg_attribute drop table idxpart; -- Column number mapping: dropped columns in the parent table -create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +create table idxpart(col_keep int, drop_1 int, drop_2 int, drop_3 int) partition by range (col_keep); alter table idxpart drop column drop_1; alter table idxpart drop column drop_2; alter table idxpart drop column drop_3; @@ -761,9 +750,9 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; attrelid | attname | attnum -----------------------+------------------------------+-------- - idxpart | ........pg.dropped.1........ | 1 + idxpart | col_keep | 1 idxpart | ........pg.dropped.2........ | 2 - idxpart | col_keep | 3 + idxpart | ........pg.dropped.3........ | 3 idxpart | ........pg.dropped.4........ 
| 4 idxpart1 | col_keep | 1 idxpart1_col_keep_idx | col_keep | 1 diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 65ff71fe..b6a72418 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -866,6 +866,28 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+----- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ @@ -983,6 +1005,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 707a6f63..251ee257 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -864,6 +864,30 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+--- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | + mlparted_tab_part3 | 3 | a | +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ @@ -981,6 +1005,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived @@ -997,16 +1023,20 @@ select NULL::derived::base; -- remove redundant conversions. explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; QUERY PLAN -------------------------------------------- - Seq Scan on public.more_derived +---------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: ((ROW(more_derived.i, more_derived.b)::more_derived)::derived)::base + Node/s: datanode_1, datanode_2 + Remote query: SELECT ((ROW(i, b)::more_derived)::derived)::base AS "row" FROM more_derived + -> Seq Scan on public.more_derived Output: (ROW(i, b)::more_derived)::base -(2 rows) +(6 rows) explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; QUERY PLAN ------------------------ +------------------------------------------- Result - Output: '(1)'::base + Output: (ROW(1, 2)::more_derived)::base (2 rows) drop table more_derived; @@ -1997,19 +2027,17 @@ explain (costs off) select * from list_parted where a in ('ab', 'cd', 'ef'); Filter: ((a)::text = ANY ('{ab,cd,ef}'::text[])) (7 rows) -explain (costs off) select * from list_parted where a = 'ab' or a in (null, 'cd'); +explain (costs off) select * from list_parted where a = 'ab' or a is null or a ='cd'; QUERY PLAN --------------------------------------------------------------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 -> Append -> Seq Scan on part_ab_cd - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) - -> Seq Scan on part_ef_gh - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) + Filter: (((a)::text = 'ab'::text) OR (a IS NULL) OR ((a)::text = 'cd'::text)) -> Seq Scan on part_null_xy - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) -(9 rows) + Filter: (((a)::text = 'ab'::text) OR (a IS NULL) OR ((a)::text = 'cd'::text)) +(7 rows) explain (costs off) select * from list_parted where a = 'ab'; QUERY PLAN @@ -2172,13 +2200,15 @@ create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to ( create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------- +--------------------------------------- Remote Fast Query Execution Node/s: datanode_2 -> Append -> Seq Scan on mcrparted0 Filter: (a = 0) -(5 rows) + -> Seq Scan on mcrparted_def + Filter: (a = 0) +(7 rows) explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN 
@@ -2188,7 +2218,9 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan -> Append -> Seq Scan on mcrparted1 Filter: ((a = 10) AND (abs(b) < 5)) -(5 rows) + -> Seq Scan on mcrparted_def + Filter: ((a = 10) AND (abs(b) < 5)) +(7 rows) explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN @@ -2200,11 +2232,13 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scan Filter: ((a = 10) AND (abs(b) = 5)) -> Seq Scan on mcrparted2 Filter: ((a = 10) AND (abs(b) = 5)) -(7 rows) + -> Seq Scan on mcrparted_def + Filter: ((a = 10) AND (abs(b) = 5)) +(9 rows) explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions QUERY PLAN ------------------------------------- +--------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 -> Append @@ -2220,7 +2254,9 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(15 rows) + -> Seq Scan on mcrparted_def + Filter: (abs(b) = 5) +(17 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN @@ -2240,7 +2276,9 @@ explain (costs off) select * from mcrparted where a > -1; -- scans all partition Filter: (a > '-1'::integer) -> Seq Scan on mcrparted5 Filter: (a > '-1'::integer) -(15 rows) + -> Seq Scan on mcrparted_def + Filter: (a > '-1'::integer) +(17 rows) explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > 10; -- scans mcrparted4 QUERY PLAN @@ -2264,7 +2302,9 @@ explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mc Filter: ((c > 20) AND (a = 20)) -> Seq Scan on mcrparted5 Filter: ((c > 20) AND (a = 20)) -(9 rows) + -> Seq Scan on mcrparted_def + Filter: ((c > 20) AND (a = 20)) +(11 rows) drop table mcrparted; -- check that partitioned table Appends cope with being referenced in diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index e1a74c4a..d12e3494 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -268,17 +268,17 @@ insert into part_default_p2 values ('de', 35); insert into list_parted values ('ab', 21); insert into list_parted values ('xx', 1); insert into list_parted values ('yy', 2); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; tableoid | a | b --------------------+----+---- part_cc_dd | cC | 1 + part_null | | 0 part_ee_ff1 | ff | 1 part_ee_ff2 | ff | 11 part_xx_yy_p1 | xx | 1 part_xx_yy_defpart | yy | 2 - part_null | | 0 - part_default_p1 | cd | 25 part_default_p1 | ab | 21 + part_default_p1 | cd | 25 part_default_p2 | de | 35 (9 rows) @@ -322,11 +322,11 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 - part_def | c | 10 - part_def | | part_def | a | - part_def | | 19 part_def | b | 20 + part_def | c | 10 + part_def | | 19 + part_def | | (11 rows) -- ok @@ -342,21 +342,21 @@ DETAIL: Partition key of the failing row contains (b) = (0). 
-- ok insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; tableoid | a | b --------------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_ee_ff1 | ff | 1 + part_null | | 0 + part_null | | 1 part_ee_ff1 | EE | 1 - part_ee_ff2 | ff | 11 + part_ee_ff1 | ff | 1 part_ee_ff2 | EE | 10 + part_ee_ff2 | ff | 11 part_xx_yy_p1 | xx | 1 part_xx_yy_defpart | yy | 2 - part_null | | 0 - part_null | | 1 - part_default_p1 | cd | 25 part_default_p1 | ab | 21 + part_default_p1 | cd | 25 part_default_p2 | de | 35 (13 rows) @@ -413,7 +413,7 @@ DETAIL: Failing row contains (11). insert into hpart3 values(11); -- view data select tableoid::regclass as part, a, a%4 as "remainder = a % 4" -from hash_parted order by part; +from hash_parted order by part,a; part | a | remainder = a % 4 --------+----+------------------- hpart0 | 4 | 0 @@ -447,6 +447,8 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), part_null FOR VALUES IN (NULL), part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED, part_default DEFAULT, PARTITIONED +Distribute By: HASH(a) +Location Nodes: ALL DATANODES -- cleanup drop table range_parted, list_parted; @@ -464,6 +466,8 @@ create table part_default partition of list_parted default; a | integer | | | | plain | | Partition of: list_parted DEFAULT No partition constraint +Distribute By: HASH(a) +Location Nodes: ALL DATANODES insert into part_default values (null); insert into part_default values (1); @@ -639,7 +643,7 @@ DETAIL: Failing row contains (34, 50, null). -- ok create table mlparted_defd partition of mlparted_def default; insert into mlparted values (70, 100); -select tableoid::regclass, * from mlparted_def; +select tableoid::regclass, * from mlparted_def order by 1; tableoid | a | b | c ---------------+----+-----+--- mlparted_def1 | 40 | 100 | @@ -785,17 +789,27 @@ create table donothingbrtrig_test (a int, b text) partition by list (a); create table donothingbrtrig_test1 (b text, a int); create table donothingbrtrig_test2 (c text, b text, a int); alter table donothingbrtrig_test2 drop column c; +ERROR: Distribution column cannot be dropped create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported create trigger donothingbrtrig2 before insert on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +ERROR: table "donothingbrtrig_test1" contains column "a" at position 2, but parent "donothingbrtrig_test" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +ERROR: table "donothingbrtrig_test2" contains column "c" not found in parent "donothingbrtrig_test" +DETAIL: New partition should contain only the columns present in parent. 
insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); -NOTICE: b: foo -NOTICE: b: bar +ERROR: no partition of relation "donothingbrtrig_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). copy donothingbrtrig_test from stdout; -NOTICE: b: baz -NOTICE: b: qux +ERROR: no partition of relation "donothingbrtrig_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). select tableoid::regclass, * from donothingbrtrig_test; tableoid | a | b ----------+---+--- diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 1a544406..40048bfb 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -830,3 +830,24 @@ select * from selfconflict order by 1; (3 rows) drop table selfconflict; +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +ERROR: Unique index of partitioned table must contain the hash/modulo distribution column. +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +ERROR: no partition of relation "parted_conflict_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +ERROR: no partition of relation "parted_conflict_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +ERROR: Distributed column or partition column "a" can't be updated in current version +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +ERROR: relation "parted_conflict_test_1" does not exist +LINE 1: insert into parted_conflict_test_1 values (1) on conflict (b... + ^ +drop table parted_conflict_test; diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index c26d02a5..d26fb257 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -8,8 +8,8 @@ SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT pg_partition_root(NULL); pg_partition_root @@ -45,7 +45,7 @@ CREATE TABLE ptif_test2 PARTITION OF ptif_test FOR VALUES FROM (100) TO (200); -- This partitioned table should remain with no partitions. 
CREATE TABLE ptif_test3 PARTITION OF ptif_test - FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); -- Test pg_partition_root for tables SELECT pg_partition_root('ptif_test'); pg_partition_root @@ -98,12 +98,6 @@ SELECT pg_partition_root('ptif_test0_index'); ptif_test_index (1 row) -SELECT pg_partition_root('ptif_test01_index'); - pg_partition_root -------------------- - ptif_test_index -(1 row) - SELECT pg_partition_root('ptif_test3_index'); pg_partition_root ------------------- @@ -182,78 +176,6 @@ SELECT * FROM pg_partition_ancestors('ptif_test'); ptif_test (1 row) --- List all indexes members of the tree -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test_index'); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test_index | | 0 | f - ptif_test0_index | ptif_test_index | 1 | f - ptif_test1_index | ptif_test_index | 1 | f - ptif_test2_index | ptif_test_index | 1 | t - ptif_test3_index | ptif_test_index | 1 | f - ptif_test01_index | ptif_test0_index | 2 | t - ptif_test11_index | ptif_test1_index | 2 | t -(7 rows) - --- List indexes from an intermediate level -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test0_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test0_index | ptif_test_index | 0 | f - ptif_test01_index | ptif_test0_index | 1 | t -(2 rows) - --- List from leaf index -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test01_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test01_index | ptif_test0_index | 0 | t -(1 row) - --- List from partitioned index with no partitions -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test3_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf -------------------+-----------------+-------+-------- - ptif_test3_index | ptif_test_index | 0 | f -(1 row) - --- List all members using pg_partition_root with leaf index reference -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test_index | | 0 | f - ptif_test0_index | ptif_test_index | 1 | f - ptif_test1_index | ptif_test_index | 1 | f - ptif_test2_index | ptif_test_index | 1 | t - ptif_test3_index | ptif_test_index | 1 | f - ptif_test01_index | ptif_test0_index | 2 | t - ptif_test11_index | ptif_test1_index | 2 | t -(7 rows) - --- List all ancestors of root and leaf indexes -SELECT * FROM pg_partition_ancestors('ptif_test01_index'); - relid -------------------- - ptif_test01_index - ptif_test0_index - ptif_test_index -(3 rows) - -SELECT * FROM pg_partition_ancestors('ptif_test_index'); - relid ------------------ - ptif_test_index -(1 row) - DROP TABLE ptif_test; -- A table not part of a partition tree works is not listed. 
CREATE TABLE ptif_normal_table(a int); @@ -266,7 +188,7 @@ SELECT relid, parentrelid, level, isleaf SELECT pg_partition_root('ptif_normal_table'); pg_partition_root ------------------- - + (1 row) SELECT * FROM pg_partition_ancestors('ptif_normal_table'); @@ -280,13 +202,13 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT * FROM pg_partition_tree('ptif_test_matview'); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT pg_partition_root('ptif_test_view'); pg_partition_root diff --git a/src/test/regress/expected/partition_join_1.out b/src/test/regress/expected/partition_join_1.out new file mode 100644 index 00000000..83d35561 --- /dev/null +++ b/src/test/regress/expected/partition_join_1.out @@ -0,0 +1,2102 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (b = t1.a) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (b = t1_1.a) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (b = t1_2.a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = t1_2.a) +(24 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 
0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (t1_2.a = b) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (t1_2.a = b) +(31 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t2.b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t2_1.b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) +(26 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a, prt2_p1.b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + Filter: (((50) = 
prt1_p1.a) OR ((75) = prt2_p1.b)) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1 + Filter: ((a < 450) AND (b = 0)) + -> Index Scan using iprt2_p2_b on prt2_p2 t2 + Index Cond: ((b = t1.a) AND (b > 250)) +(10 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Left Join + Hash Cond: (prt1_p1.a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Hash + -> Result + One-Time Filter: false + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 + Index Cond: ((prt1_p2.a = b) AND (b > 250)) +(19 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = b) + Filter: 
((prt1_p1.b = 0) OR (a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Hash + -> Result + One-Time Filter: false + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash Full Join + Hash Cond: (prt2_p3.b = a) + Filter: ((b = 0) OR (prt2_p3.a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Result + One-Time Filter: false +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t2.b) + Filter: (b = 0) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_1.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t2_1.b) + Filter: (b = 0) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_2.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) + Filter: (b = 0) +(46 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Append + -> Hash Anti Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + -> Hash Anti Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 
t1_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + -> Nested Loop Anti Join + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Only Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (b = t1_2.a) +(25 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; +ERROR: could not devise a query plan for the given query +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Right Join + Hash Cond: ((c)::text = (c)::text) + Filter: ((b + COALESCE(b, 0)) = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Hash Join + Hash Cond: (t2.a = t3.b) + -> Seq Scan on prt1_p1 t2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t2_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t3_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t3_2 + -> Index Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = t3_2.b) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 +(36 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, 
c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + -> Seq Scan on prt2_e_p1 t2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) + -> Seq Scan on prt2_e_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Nested Loop + -> Seq Scan on prt2_e_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_2 + Index Cond: (((a + b) / 2) = ((t2_2.b + t2_2.a) / 2)) + Filter: (c = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop + Join Filter: (t1.a = (((t3.a + t3.b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (b = t1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = t2.b) + -> Nested Loop + Join Filter: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (b = 
t1_1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = t2_1.b) + -> Nested Loop + Join Filter: (t1_2.a = t2_2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 + Index Cond: (((a + b) / 2) = t1_2.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (b = ((t3_2.a + t3_2.b) / 2)) +(50 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (t1.a = ((a + b) / 2)) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (t1_1.a = ((a + b) / 2)) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (t1_2.a = b) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (t1_2.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 + Index Cond: (t1_2.a = ((a + b) / 2)) +(49 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t3.a + t3.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t3_1.a + t3_1.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t3_2.a + t3_2.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1_2.a = b) +(50 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) + -> Result + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = (((prt1_e_p1.a + prt1_e_p1.b) / 2))) + Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = (((prt1_e_p2.a + prt1_e_p2.b) / 2))) + Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = (((prt1_e_p3.a + prt1_e_p3.b) / 2))) + Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(62 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | 
phv | b | phv | ?column? | phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + Join Filter: (t1.a = t1_3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_3.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_3.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = t1_3.b) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t2.a + t2.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_1.a = t1_4.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_4.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_4.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = t1_4.b) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_2.a = t1_5.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_5.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_5.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = t1_5.b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) + Filter: (b = 0) +(61 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop Semi Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Index Only Scan using iprt2_p1_b on prt2_p1 t1_3 + Index Cond: (b = t1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t1_6 + Index Cond: (((a + b) / 2) = t1_3.b) + Filter: (c = 0) + -> Nested Loop Semi Join + -> Seq Scan on prt1_p2 t1_1 + 
Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Index Only Scan using iprt2_p2_b on prt2_p2 t1_4 + Index Cond: (b = t1_1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t1_7 + Index Cond: (((a + b) / 2) = t1_4.b) + Filter: (c = 0) + -> Nested Loop Semi Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Bitmap Heap Scan on prt2_p3 t1_5 + Recheck Cond: (b = t1_2.a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = t1_2.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_8 + Index Cond: (((a + b) / 2) = t1_5.b) + Filter: (c = 0) +(48 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Merge Semi Join + Merge Cond: (t1.a = t1_3.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Seq Scan on prt2_p1 t1_3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_1.a = t1_4.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) + -> Sort + Sort Key: t1_4.b + -> Seq Scan on prt2_p2 t1_4 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_2.a = t1_5.b) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) + -> Sort + Sort Key: t1_5.b + -> Seq Scan on prt2_p3 t1_5 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(58 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 
LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Merge Right Join + Merge Cond: (t1.a = (((t3.a + t3.b) / 2))) + -> Merge Left Join + Merge Cond: (t1.a = t2.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2.b + -> Seq Scan on prt2_p1 t2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Merge Right Join + Merge Cond: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) + -> Merge Left Join + Merge Cond: (t1_1.a = t2_1.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Merge Right Join + Merge Cond: (t2_2.b = t1_2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1_2.a + -> Merge Left Join + Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 +(68 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Merge Left Join + Merge Cond: (prt1_p1.a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: b + -> Result + One-Time Filter: false + -> Merge Right Join + Merge Cond: (prt2_p2.b = prt1_p2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: prt2_p2.b + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Sort + Sort Key: prt1_p2.a + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(26 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_m_p1.a, prt2_m_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_m_p1.a = (((prt2_m_p1.b + prt2_m_p1.a) / 2))) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p2.a = (((prt2_m_p2.b + prt2_m_p2.a) / 2))) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) + -> Seq Scan on 
prt1_m_p2 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p3.a = (((prt2_m_p3.b + prt2_m_p3.a) / 2))) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p1 t2 + -> Hash + -> Seq Scan on plt1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Seq Scan on plt1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p2 t2_1 + -> Hash + -> Seq Scan on plt1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Seq Scan on plt1_e_p3 t3_2 +(44 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +----------------------------------------------------------------------------------- + Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Hash Join + Hash Cond: (t3.a = t2.b) + -> Seq Scan on prt1_p1 t3 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + -> Hash Join + Hash Cond: (t3_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t3_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t2_2 + -> Index Only Scan using iprt1_p3_a on prt1_p3 t3_2 + Index Cond: (a = t2_2.b) + -> Hash + -> Result + One-Time Filter: false +(27 rows) + 
+EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------ + Sort + Sort Key: a, b + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(15 rows) + +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p1 t2 + -> Hash + -> Seq Scan on pht1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Hash Join + Hash Cond: (t1_1.c = ltrim(t3_1.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p2 t1_1 + -> Hash + -> Seq Scan on pht1_e_p2 t3_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p2 t2_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Hash Join + Hash Cond: (t1_2.c = ltrim(t3_2.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p3 t1_2 + -> Hash + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p3 t2_2 +(44 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_l_p1 t2 + 
-> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = a) + -> Append + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Nested Loop + Join Filter: (a = t2_3.b) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(33 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Right Join + Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Nested Loop Left Join + Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 +(41 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + -> Seq Scan on prt1_l_p1 t1 + -> Hash + -> Remote Subquery Scan on all 
(datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Nested Loop Left Join + Join Filter: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Hash Right Join + Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Nested Loop Left Join + Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 +(41 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_l_p1.a, prt2_l_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(44 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 
0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; +ERROR: could not devise a query plan for the given query +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +---------------------------------------------------------------- + Hash Left Join + Hash Cond: ((b = a) AND (a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(12 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- 
partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Hash Join + Hash Cond: (t1.a = t3.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t1_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t3_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t3_2 + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t3_2.b) + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(29 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +----------------------------------------------------------------- + Nested Loop Left Join + Join Filter: (a < b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(13 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: ((((b + a) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(17 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_m_p1 t1 + 
-> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.c = c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Hash Join + Hash Cond: (t2.c = (c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(19 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Full Join + Hash Cond: ((c)::text = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(15 rows) + diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index ff388472..61bbdf23 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -9,140 +9,164 @@ create table lp_bc partition of lp for values in ('b', 'c'); create table lp_g partition of lp for values in ('g'); 
create table lp_null partition of lp for values in (null); explain (costs off) select * from lp; - QUERY PLAN ------------------------------- - Append - -> Seq Scan on lp_ad - -> Seq Scan on lp_bc - -> Seq Scan on lp_ef - -> Seq Scan on lp_g - -> Seq Scan on lp_null - -> Seq Scan on lp_default -(7 rows) + QUERY PLAN +------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + -> Seq Scan on lp_bc + -> Seq Scan on lp_ef + -> Seq Scan on lp_g + -> Seq Scan on lp_null + -> Seq Scan on lp_default +(9 rows) explain (costs off) select * from lp where a > 'a' and a < 'd'; - QUERY PLAN ------------------------------------------------------------ - Append - -> Seq Scan on lp_bc - Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) -(5 rows) + QUERY PLAN +----------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) +(7 rows) explain (costs off) select * from lp where a > 'a' and a <= 'd'; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) - -> Seq Scan on lp_bc - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) -(7 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) +(9 rows) explain (costs off) select * from lp where a = 'a'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on lp_ad - Filter: (a = 'a'::bpchar) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a = 'a'::bpchar) +(5 rows) explain (costs off) select * from lp where 'a' = a; /* commuted */ - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on lp_ad - Filter: ('a'::bpchar = a) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ('a'::bpchar = a) +(5 rows) explain (costs off) select * from lp where a is not null; - QUERY PLAN ---------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a IS NOT NULL) - -> Seq Scan on lp_bc - Filter: (a IS NOT NULL) - -> Seq Scan on lp_ef - Filter: (a IS NOT NULL) - -> Seq Scan on lp_g - Filter: (a IS NOT NULL) - -> Seq Scan on lp_default - Filter: (a IS NOT NULL) -(11 rows) + QUERY PLAN +--------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a IS NOT NULL) + -> Seq Scan on lp_bc + Filter: (a IS NOT NULL) + -> Seq Scan on lp_ef + Filter: (a IS NOT NULL) + -> Seq Scan on lp_g + Filter: (a IS NOT NULL) + -> Seq Scan on lp_default + Filter: (a IS NOT NULL) +(13 rows) explain (costs off) select * from lp where a is null; - QUERY 
PLAN ------------------------------ - Append - -> Seq Scan on lp_null - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_null + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from lp where a = 'a' or a = 'c'; - QUERY PLAN ----------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) - -> Seq Scan on lp_bc - Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) -(5 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) +(7 rows) explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); - QUERY PLAN --------------------------------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) - -> Seq Scan on lp_bc - Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) -(5 rows) + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) + -> Seq Scan on lp_bc + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) +(7 rows) explain (costs off) select * from lp where a <> 'g'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_bc - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_ef - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_default - Filter: (a <> 'g'::bpchar) -(9 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'g'::bpchar) +(11 rows) explain (costs off) select * from lp where a <> 'a' and a <> 'd'; - QUERY PLAN -------------------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_ef - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_g - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) -(9 rows) + QUERY PLAN +------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_ef + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_g + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) +(11 rows) explain (costs off) select * from lp where a not in ('a', 'd'); - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: (a <> ALL ('{a,d}'::bpchar[])) - -> Seq Scan on lp_ef - Filter: (a <> ALL 
('{a,d}'::bpchar[])) - -> Seq Scan on lp_g - Filter: (a <> ALL ('{a,d}'::bpchar[])) - -> Seq Scan on lp_default - Filter: (a <> ALL ('{a,d}'::bpchar[])) -(9 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_ef + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_g + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_default + Filter: (a <> ALL ('{a,d}'::bpchar[])) +(11 rows) -- collation matches the partitioning collation, pruning works create table coll_pruning (a text collate "C") partition by list (a); @@ -150,25 +174,29 @@ create table coll_pruning_a partition of coll_pruning for values in ('a'); create table coll_pruning_b partition of coll_pruning for values in ('b'); create table coll_pruning_def partition of coll_pruning default; explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; - QUERY PLAN ---------------------------------------------- - Append - -> Seq Scan on coll_pruning_a - Filter: (a = 'a'::text COLLATE "C") -(3 rows) + QUERY PLAN +--------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_a + Filter: (a = 'a'::text COLLATE "C") +(5 rows) -- collation doesn't match the partitioning collation, no pruning occurs explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_a - Filter: ((a)::text = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_b - Filter: ((a)::text = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_def - Filter: ((a)::text = 'a'::text COLLATE "POSIX") -(7 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_a + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_b + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_def + Filter: ((a)::text = 'a'::text COLLATE "POSIX") +(9 rows) create table rlp (a int, b varchar) partition by range (a); create table rlp_default partition of rlp default partition by list (a); @@ -178,7 +206,7 @@ create table rlp_default_30 partition of rlp_default for values in (30); create table rlp_default_null partition of rlp_default for values in (null); create table rlp1 partition of rlp for values from (minvalue) to (1); create table rlp2 partition of rlp for values from (1) to (10); -create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3 (a int, b varchar) partition by list (b varchar_ops); create table rlp3_default partition of rlp3 default; create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); @@ -192,480 +220,544 @@ create table rlp5 partition of rlp for values from (31) to (maxvalue) partition create table rlp5_default partition of rlp5 default; create table rlp5_1 partition of rlp5 for values from (31) to (40); explain (costs off) select * from rlp where a < 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a < 1) -(3 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + 
Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a < 1) +(5 rows) explain (costs off) select * from rlp where 1 > a; /* commuted */ - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp1 - Filter: (1 > a) -(3 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (1 > a) +(5 rows) explain (costs off) select * from rlp where a <= 1; - QUERY PLAN --------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 1) - -> Seq Scan on rlp2 - Filter: (a <= 1) -(5 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 1) + -> Seq Scan on rlp2 + Filter: (a <= 1) +(7 rows) explain (costs off) select * from rlp where a = 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp2 - Filter: (a = 1) -(3 rows) + QUERY PLAN +------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp2 + Filter: (a = 1) +(5 rows) explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on rlp2 - Filter: (a = '1'::bigint) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp2 + Filter: (a = '1'::bigint) +(5 rows) explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on rlp1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp2 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3abcd - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3efgh - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3nullxy - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_2 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp5_1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp5_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_10 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_30 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_null - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_default - Filter: ((a)::numeric = '1'::numeric) -(31 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3abcd + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3efgh + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3nullxy + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_default + Filter: 
((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_10 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_30 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_null + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_default + Filter: ((a)::numeric = '1'::numeric) +(33 rows) explain (costs off) select * from rlp where a <= 10; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 10) - -> Seq Scan on rlp2 - Filter: (a <= 10) - -> Seq Scan on rlp_default_10 - Filter: (a <= 10) - -> Seq Scan on rlp_default_default - Filter: (a <= 10) -(9 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 10) + -> Seq Scan on rlp2 + Filter: (a <= 10) + -> Seq Scan on rlp_default_10 + Filter: (a <= 10) + -> Seq Scan on rlp_default_default + Filter: (a <= 10) +(11 rows) explain (costs off) select * from rlp where a > 10; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (a > 10) - -> Seq Scan on rlp3efgh - Filter: (a > 10) - -> Seq Scan on rlp3nullxy - Filter: (a > 10) - -> Seq Scan on rlp3_default - Filter: (a > 10) - -> Seq Scan on rlp4_1 - Filter: (a > 10) - -> Seq Scan on rlp4_2 - Filter: (a > 10) - -> Seq Scan on rlp4_default - Filter: (a > 10) - -> Seq Scan on rlp5_1 - Filter: (a > 10) - -> Seq Scan on rlp5_default - Filter: (a > 10) - -> Seq Scan on rlp_default_30 - Filter: (a > 10) - -> Seq Scan on rlp_default_default - Filter: (a > 10) -(23 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (a > 10) + -> Seq Scan on rlp3efgh + Filter: (a > 10) + -> Seq Scan on rlp3nullxy + Filter: (a > 10) + -> Seq Scan on rlp3_default + Filter: (a > 10) + -> Seq Scan on rlp4_1 + Filter: (a > 10) + -> Seq Scan on rlp4_2 + Filter: (a > 10) + -> Seq Scan on rlp4_default + Filter: (a > 10) + -> Seq Scan on rlp5_1 + Filter: (a > 10) + -> Seq Scan on rlp5_default + Filter: (a > 10) + -> Seq Scan on rlp_default_30 + Filter: (a > 10) + -> Seq Scan on rlp_default_default + Filter: (a > 10) +(25 rows) explain (costs off) select * from rlp where a < 15; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a < 15) - -> Seq Scan on rlp2 - Filter: (a < 15) - -> Seq Scan on rlp_default_10 - Filter: (a < 15) - -> Seq Scan on rlp_default_default - Filter: (a < 15) -(9 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a < 15) + -> Seq Scan on rlp2 + Filter: (a < 15) + -> Seq Scan on rlp_default_10 + Filter: (a < 15) + -> Seq Scan on rlp_default_default + Filter: (a < 15) +(11 rows) explain (costs off) select * from rlp where a <= 15; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 15) - -> Seq Scan on rlp2 - Filter: (a <= 15) - -> Seq Scan on rlp3abcd - Filter: (a <= 15) - -> Seq Scan on rlp3efgh - Filter: (a <= 15) - -> Seq Scan on rlp3nullxy - Filter: (a <= 15) - -> Seq Scan on rlp3_default - Filter: (a <= 15) - -> Seq Scan on rlp_default_10 - Filter: (a <= 15) - -> Seq Scan on rlp_default_default - Filter: (a <= 15) -(17 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution 
+ Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 15) + -> Seq Scan on rlp2 + Filter: (a <= 15) + -> Seq Scan on rlp3abcd + Filter: (a <= 15) + -> Seq Scan on rlp3efgh + Filter: (a <= 15) + -> Seq Scan on rlp3nullxy + Filter: (a <= 15) + -> Seq Scan on rlp3_default + Filter: (a <= 15) + -> Seq Scan on rlp_default_10 + Filter: (a <= 15) + -> Seq Scan on rlp_default_default + Filter: (a <= 15) +(19 rows) explain (costs off) select * from rlp where a > 15 and b = 'ab'; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_1 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_2 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_1 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_30 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) -(17 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) +(19 rows) explain (costs off) select * from rlp where a = 16; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (a = 16) - -> Seq Scan on rlp3efgh - Filter: (a = 16) - -> Seq Scan on rlp3nullxy - Filter: (a = 16) - -> Seq Scan on rlp3_default - Filter: (a = 16) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (a = 16) + -> Seq Scan on rlp3efgh + Filter: (a = 16) + -> Seq Scan on rlp3nullxy + Filter: (a = 16) + -> Seq Scan on rlp3_default + Filter: (a = 16) +(11 rows) explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); - QUERY PLAN ----------------------------------------------------------------------------- - Append - -> Seq Scan on rlp3_default - Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) -(3 rows) + QUERY PLAN +---------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3_default + Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) +(5 rows) explain (costs off) select * from rlp where a = 16 and b < 'ab'; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on rlp3_default - Filter: (((b)::text < 'ab'::text) AND (a = 16)) -(3 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: 
datanode_2 + -> Append + -> Seq Scan on rlp3_default + Filter: (((b)::text < 'ab'::text) AND (a = 16)) +(5 rows) explain (costs off) select * from rlp where a = 16 and b <= 'ab'; - QUERY PLAN ----------------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (((b)::text <= 'ab'::text) AND (a = 16)) - -> Seq Scan on rlp3_default - Filter: (((b)::text <= 'ab'::text) AND (a = 16)) -(5 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) +(7 rows) explain (costs off) select * from rlp where a = 16 and b is null; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on rlp3nullxy - Filter: ((b IS NULL) AND (a = 16)) -(3 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3nullxy + Filter: ((b IS NULL) AND (a = 16)) +(5 rows) explain (costs off) select * from rlp where a = 16 and b is not null; - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3efgh - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3nullxy - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3_default - Filter: ((b IS NOT NULL) AND (a = 16)) -(9 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3nullxy + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND (a = 16)) +(11 rows) explain (costs off) select * from rlp where a is null; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on rlp_default_null - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp_default_null + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from rlp where a is not null; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp2 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3abcd - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3efgh - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3nullxy - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_2 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp5_1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp5_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_10 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_30 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_default - Filter: (a IS NOT NULL) -(29 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3abcd + Filter: (a IS NOT NULL) + -> Seq Scan 
on rlp3efgh + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3nullxy + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_10 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_30 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_default + Filter: (a IS NOT NULL) +(31 rows) explain (costs off) select * from rlp where a > 30; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp5_1 - Filter: (a > 30) - -> Seq Scan on rlp5_default - Filter: (a > 30) - -> Seq Scan on rlp_default_default - Filter: (a > 30) -(7 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp5_1 + Filter: (a > 30) + -> Seq Scan on rlp5_default + Filter: (a > 30) + -> Seq Scan on rlp_default_default + Filter: (a > 30) +(9 rows) explain (costs off) select * from rlp where a = 30; /* only default is scanned */ - QUERY PLAN ----------------------------------- - Append - -> Seq Scan on rlp_default_30 - Filter: (a = 30) -(3 rows) + QUERY PLAN +---------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp_default_30 + Filter: (a = 30) +(5 rows) explain (costs off) select * from rlp where a <= 31; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 31) - -> Seq Scan on rlp2 - Filter: (a <= 31) - -> Seq Scan on rlp3abcd - Filter: (a <= 31) - -> Seq Scan on rlp3efgh - Filter: (a <= 31) - -> Seq Scan on rlp3nullxy - Filter: (a <= 31) - -> Seq Scan on rlp3_default - Filter: (a <= 31) - -> Seq Scan on rlp4_1 - Filter: (a <= 31) - -> Seq Scan on rlp4_2 - Filter: (a <= 31) - -> Seq Scan on rlp4_default - Filter: (a <= 31) - -> Seq Scan on rlp5_1 - Filter: (a <= 31) - -> Seq Scan on rlp_default_10 - Filter: (a <= 31) - -> Seq Scan on rlp_default_30 - Filter: (a <= 31) - -> Seq Scan on rlp_default_default - Filter: (a <= 31) -(27 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 31) + -> Seq Scan on rlp2 + Filter: (a <= 31) + -> Seq Scan on rlp3abcd + Filter: (a <= 31) + -> Seq Scan on rlp3efgh + Filter: (a <= 31) + -> Seq Scan on rlp3nullxy + Filter: (a <= 31) + -> Seq Scan on rlp3_default + Filter: (a <= 31) + -> Seq Scan on rlp4_1 + Filter: (a <= 31) + -> Seq Scan on rlp4_2 + Filter: (a <= 31) + -> Seq Scan on rlp4_default + Filter: (a <= 31) + -> Seq Scan on rlp5_1 + Filter: (a <= 31) + -> Seq Scan on rlp_default_10 + Filter: (a <= 31) + -> Seq Scan on rlp_default_30 + Filter: (a <= 31) + -> Seq Scan on rlp_default_default + Filter: (a <= 31) +(29 rows) explain (costs off) select * from rlp where a = 1 or a = 7; - QUERY PLAN --------------------------------------- - Append - -> Seq Scan on rlp2 - Filter: ((a = 1) OR (a = 7)) -(3 rows) + QUERY PLAN +-------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp2 + Filter: ((a = 1) OR (a = 7)) +(5 rows) explain (costs off) select * from rlp where a = 1 or b = 'ab'; - QUERY PLAN 
-------------------------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp2 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp3abcd - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_2 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_10 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_30 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_null - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) -(25 rows) + QUERY PLAN +------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp3abcd + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_10 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_null + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) +(27 rows) explain (costs off) select * from rlp where a > 20 and a < 27; - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rlp4_1 - Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp4_2 - Filter: ((a > 20) AND (a < 27)) -(5 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp4_1 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_2 + Filter: ((a > 20) AND (a < 27)) +(7 rows) explain (costs off) select * from rlp where a = 29; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on rlp4_default - Filter: (a = 29) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp4_default + Filter: (a = 29) +(5 rows) explain (costs off) select * from rlp where a >= 29; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp4_default - Filter: (a >= 29) - -> Seq Scan on rlp5_1 - Filter: (a >= 29) - -> Seq Scan on rlp5_default - Filter: (a >= 29) - -> Seq Scan on rlp_default_30 - Filter: (a >= 29) - -> Seq Scan on rlp_default_default - Filter: (a >= 29) -(11 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp4_default + Filter: (a >= 29) + -> Seq Scan on rlp5_1 + Filter: (a >= 29) + -> Seq Scan on 
rlp5_default + Filter: (a >= 29) + -> Seq Scan on rlp_default_30 + Filter: (a >= 29) + -> Seq Scan on rlp_default_default + Filter: (a >= 29) +(13 rows) explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); - QUERY PLAN ------------------------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: ((a < 1) OR ((a > 20) AND (a < 25))) - -> Seq Scan on rlp4_1 - Filter: ((a < 1) OR ((a > 20) AND (a < 25))) -(5 rows) + QUERY PLAN +------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) + -> Seq Scan on rlp4_1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) +(7 rows) -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ - QUERY PLAN ----------------------------------------- - Append - -> Seq Scan on rlp_default_10 - Filter: ((a > 1) AND (a = 10)) -(3 rows) + QUERY PLAN +---------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp_default_10 + Filter: ((a > 1) AND (a = 10)) +(5 rows) explain (costs off) select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rlp3abcd - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3efgh - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3nullxy - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_1 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_2 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp5_1 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp5_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp_default_30 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp_default_default - Filter: ((a > 1) AND (a >= 15)) -(23 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3efgh + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3nullxy + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_2 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_default + Filter: ((a > 1) AND (a >= 15)) +(25 rows) explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +-------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); - QUERY PLAN -------------------------------------------------------------------- - Append - -> Seq Scan on rlp2 - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3abcd - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - 
-> Seq Scan on rlp3efgh - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3nullxy - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3_default - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp2 + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3abcd + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3efgh + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3nullxy + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3_default + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) +(13 rows) -- multi-column keys create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); @@ -679,268 +771,306 @@ create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mc3p where a = 1; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (a = 1) - -> Seq Scan on mc3p1 - Filter: (a = 1) - -> Seq Scan on mc3p_default - Filter: (a = 1) -(7 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: (a = 1) + -> Seq Scan on mc3p1 + Filter: (a = 1) + -> Seq Scan on mc3p_default + Filter: (a = 1) +(9 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) AND (abs(b) < 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) AND (abs(b) < 1)) -(5 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) < 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) < 1)) +(7 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 - Filter: ((a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) AND (abs(b) = 1)) -(7 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) = 1)) +(9 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -(5 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 
1)) +(7 rows) explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; - QUERY PLAN ------------------------------------------------------------------ - Append - -> Seq Scan on mc3p1 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p2 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p3 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p4 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p_default - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) -(11 rows) + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p2 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p3 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p4 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p_default + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) +(13 rows) explain (costs off) select * from mc3p where a > 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p5 - Filter: (a > 10) - -> Seq Scan on mc3p6 - Filter: (a > 10) - -> Seq Scan on mc3p7 - Filter: (a > 10) - -> Seq Scan on mc3p_default - Filter: (a > 10) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p5 + Filter: (a > 10) + -> Seq Scan on mc3p6 + Filter: (a > 10) + -> Seq Scan on mc3p7 + Filter: (a > 10) + -> Seq Scan on mc3p_default + Filter: (a > 10) +(11 rows) explain (costs off) select * from mc3p where a >= 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p1 - Filter: (a >= 10) - -> Seq Scan on mc3p2 - Filter: (a >= 10) - -> Seq Scan on mc3p3 - Filter: (a >= 10) - -> Seq Scan on mc3p4 - Filter: (a >= 10) - -> Seq Scan on mc3p5 - Filter: (a >= 10) - -> Seq Scan on mc3p6 - Filter: (a >= 10) - -> Seq Scan on mc3p7 - Filter: (a >= 10) - -> Seq Scan on mc3p_default - Filter: (a >= 10) -(17 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: (a >= 10) + -> Seq Scan on mc3p2 + Filter: (a >= 10) + -> Seq Scan on mc3p3 + Filter: (a >= 10) + -> Seq Scan on mc3p4 + Filter: (a >= 10) + -> Seq Scan on mc3p5 + Filter: (a >= 10) + -> Seq Scan on mc3p6 + Filter: (a >= 10) + -> Seq Scan on mc3p7 + Filter: (a >= 10) + -> Seq Scan on mc3p_default + Filter: (a >= 10) +(19 rows) explain (costs off) select * from mc3p where a < 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (a < 10) - -> Seq Scan on mc3p1 - Filter: (a < 10) - -> Seq Scan on mc3p_default - Filter: (a < 10) -(7 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (a < 10) + -> Seq Scan on mc3p1 + Filter: (a < 10) + -> Seq Scan on mc3p_default + Filter: (a < 10) +(9 rows) explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on mc3p0 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan on mc3p1 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan 
on mc3p2 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan on mc3p_default - Filter: ((a <= 10) AND (abs(b) < 10)) -(9 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p1 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p2 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p_default + Filter: ((a <= 10) AND (abs(b) < 10)) +(11 rows) explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; - QUERY PLAN ---------------------------------------------- - Append - -> Seq Scan on mc3p_default - Filter: ((a = 11) AND (abs(b) = 0)) -(3 rows) + QUERY PLAN +--------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p_default + Filter: ((a = 11) AND (abs(b) = 0)) +(5 rows) explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on mc3p6 - Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) -(3 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p6 + Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) +(5 rows) explain (costs off) select * from mc3p where a > 20; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p7 - Filter: (a > 20) - -> Seq Scan on mc3p_default - Filter: (a > 20) -(5 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p7 + Filter: (a > 20) + -> Seq Scan on mc3p_default + Filter: (a > 20) +(7 rows) explain (costs off) select * from mc3p where a >= 20; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p5 - Filter: (a >= 20) - -> Seq Scan on mc3p6 - Filter: (a >= 20) - -> Seq Scan on mc3p7 - Filter: (a >= 20) - -> Seq Scan on mc3p_default - Filter: (a >= 20) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p5 + Filter: (a >= 20) + -> Seq Scan on mc3p6 + Filter: (a >= 20) + -> Seq Scan on mc3p7 + Filter: (a >= 20) + -> Seq Scan on mc3p_default + Filter: (a >= 20) +(11 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) -(9 rows) + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast 
Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) +(11 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) -(11 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) +(13 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 
10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) +(13 rows) explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; - QUERY PLAN ------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p1 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p2 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p3 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p4 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p5 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p6 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p7 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -(19 rows) + QUERY PLAN +------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p2 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p3 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p4 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p5 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p6 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p7 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) +(21 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); - QUERY PLAN ------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p2 - 
Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p3 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p4 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) -(13 rows) + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p3 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p4 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) +(15 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); - QUERY PLAN ------------------------------------------------------------------------------ - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) +(11 rows) -- a simpler multi-column keys case create table mc2p (a int, b int) partition by range (a, b); @@ -952,91 +1082,109 @@ create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); explain (costs off) select * from mc2p where a < 2; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p0 - Filter: (a < 2) - -> Seq Scan on mc2p1 - Filter: (a < 2) - -> Seq Scan on mc2p2 - Filter: (a < 2) - -> Seq Scan on mc2p_default - Filter: (a < 2) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p0 + Filter: (a < 2) + -> Seq Scan on mc2p1 + Filter: (a < 2) + -> Seq Scan on mc2p2 + Filter: (a < 2) + -> Seq Scan on mc2p_default + Filter: (a < 2) +(11 rows) explain (costs off) select * from mc2p where a = 2 and b < 1; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on mc2p3 - Filter: ((b < 1) AND (a = 2)) -(3 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append 
+ -> Seq Scan on mc2p3 + Filter: ((b < 1) AND (a = 2)) +(5 rows) explain (costs off) select * from mc2p where a > 1; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p2 - Filter: (a > 1) - -> Seq Scan on mc2p3 - Filter: (a > 1) - -> Seq Scan on mc2p4 - Filter: (a > 1) - -> Seq Scan on mc2p5 - Filter: (a > 1) - -> Seq Scan on mc2p_default - Filter: (a > 1) -(11 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p2 + Filter: (a > 1) + -> Seq Scan on mc2p3 + Filter: (a > 1) + -> Seq Scan on mc2p4 + Filter: (a > 1) + -> Seq Scan on mc2p5 + Filter: (a > 1) + -> Seq Scan on mc2p_default + Filter: (a > 1) +(13 rows) explain (costs off) select * from mc2p where a = 1 and b > 1; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on mc2p2 - Filter: ((b > 1) AND (a = 1)) -(3 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc2p2 + Filter: ((b > 1) AND (a = 1)) +(5 rows) -- all partitions but the default one should be pruned explain (costs off) select * from mc2p where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) + QUERY PLAN +------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc2p_default + Filter: ((b IS NULL) AND (a = 1)) +(5 rows) explain (costs off) select * from mc2p where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on mc2p_default - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b IS NULL)) +(5 rows) explain (costs off) select * from mc2p where a is null and b = 1; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: ((a IS NULL) AND (b = 1)) -(3 rows) + QUERY PLAN +------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b = 1)) +(5 rows) explain (costs off) select * from mc2p where a is null; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from mc2p where b is null; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: (b IS NULL) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: (b IS NULL) +(5 rows) -- boolean partitioning create table boolpart (a bool) partition by list (a); @@ -1044,87 +1192,95 @@ create table boolpart_default partition of boolpart default; create table boolpart_t partition of boolpart for values in ('true'); create table boolpart_f partition of boolpart for values in ('false'); explain (costs off) select * from boolpart where a in (true, 
false); - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a = ANY ('{t,f}'::boolean[])) - -> Seq Scan on boolpart_t - Filter: (a = ANY ('{t,f}'::boolean[])) -(5 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a = ANY ('{t,f}'::boolean[])) + -> Seq Scan on boolpart_t + Filter: (a = ANY ('{t,f}'::boolean[])) +(7 rows) explain (costs off) select * from boolpart where a = false; QUERY PLAN ------------------------------------ - Append - -> Seq Scan on boolpart_f - Filter: (NOT a) - -> Seq Scan on boolpart_t - Filter: (NOT a) - -> Seq Scan on boolpart_default - Filter: (NOT a) -(7 rows) + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (NOT a) +(5 rows) explain (costs off) select * from boolpart where not a = false; QUERY PLAN ------------------------------------ - Append - -> Seq Scan on boolpart_f - Filter: a - -> Seq Scan on boolpart_t - Filter: a - -> Seq Scan on boolpart_default - Filter: a -(7 rows) + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_t + Filter: a +(5 rows) explain (costs off) select * from boolpart where a is true or a is not true; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) - -> Seq Scan on boolpart_t - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -(5 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_t + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) +(7 rows) explain (costs off) select * from boolpart where a is not true; - QUERY PLAN ---------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS NOT TRUE) -(3 rows) + QUERY PLAN +--------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT TRUE) +(5 rows) explain (costs off) select * from boolpart where a is not true and a is not false; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from boolpart where a is unknown; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS UNKNOWN) - -> Seq Scan on boolpart_t - Filter: (a IS UNKNOWN) - -> Seq Scan on boolpart_default - Filter: (a IS UNKNOWN) -(7 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS UNKNOWN) +(9 rows) explain (costs off) select * from boolpart where a is not unknown; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS NOT UNKNOWN) - -> Seq Scan on boolpart_t - Filter: (a IS NOT UNKNOWN) - -> Seq Scan on boolpart_default - Filter: (a IS NOT UNKNOWN) -(7 rows) + QUERY PLAN 
+------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS NOT UNKNOWN) +(9 rows) create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); @@ -1133,11 +1289,14 @@ create table boolrangep_ff1 partition of boolrangep for values from ('false', 'f create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); -- try a more complex case that's been known to trip up pruning in the past explain (costs off) select * from boolrangep where not a and not b and c = 25; - QUERY PLAN ----------------------------------------------- - Seq Scan on boolrangep_ff1 - Filter: ((NOT a) AND (NOT b) AND (c = 25)) -(2 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolrangep_ff1 + Filter: ((NOT a) AND (NOT b) AND (c = 25)) +(5 rows) -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); @@ -1145,64 +1304,74 @@ create table coercepart_ab partition of coercepart for values in ('ab'); create table coercepart_bc partition of coercepart for values in ('bc'); create table coercepart_cd partition of coercepart for values in ('cd'); explain (costs off) select * from coercepart where a in ('ab', to_char(125, '999')); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) -(7 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) +(9 rows) explain (costs off) select * from coercepart where a ~ any ('{ab}'); - QUERY PLAN ----------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text ~ ANY ('{ab}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text ~ ANY ('{ab}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text ~ ANY ('{ab}'::text[])) -(7 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: 
((a)::text ~ ANY ('{ab}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a !~ all ('{ab}'); - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text !~ ALL ('{ab}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text !~ ALL ('{ab}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text !~ ALL ('{ab}'::text[])) -(7 rows) + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); - QUERY PLAN -------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) -(7 rows) + QUERY PLAN +------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) -(7 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) +(9 rows) drop table coercepart; CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); @@ -1211,14 +1380,16 @@ CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; - QUERY PLAN ---------------------------------------------------------------------------- - Sort - Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b - -> Append - -> Seq Scan on part_p2_p1 - Filter: (a IS NULL) -(5 rows) + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b + -> Result + -> Append + -> Seq Scan on part_p2_p1 + Filter: (a IS NULL) +(7 rows) -- -- some more cases @@ -1228,78 +1399,90 @@ EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS -- -- pruning won't work for mc3p, 
because some keys are Params explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------------- Nested Loop - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) - -> Aggregate + -> Remote Subquery Scan on all (datanode_1) -> Append - -> Seq Scan on mc3p0 t2 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 t2_1 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p2 t2_2 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p3 t2_3 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p4 t2_4 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p5 t2_5 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p6 t2_6 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p7 t2_7 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default t2_8 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) -(28 rows) + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Materialize + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 t2_2 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p3 t2_3 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p4 t2_4 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p5 t2_5 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p6 t2_6 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p7 t2_7 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_8 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) +(32 rows) -- pruning should work fine, because values for a prefix of keys (a, b) are -- available explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------------- Nested Loop - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) - -> Aggregate + -> Remote Subquery Scan on all (datanode_1) -> Append - -> Seq Scan on mc3p0 t2 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 t2_1 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default t2_2 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) -(16 rows) + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Materialize + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((c = b) 
AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((c = b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_2 + Filter: ((c = b) AND (a = 1) AND (abs(b) = 1)) +(20 rows) -- also here, because values for all keys are provided explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Nested Loop - -> Aggregate - -> Append - -> Seq Scan on mc3p1 t2 - Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) -(12 rows) + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p1 t2 + Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) +(16 rows) -- -- pruning with clauses containing <> operator @@ -1310,82 +1493,94 @@ create table rp0 partition of rp for values from (minvalue) to (1); create table rp1 partition of rp for values from (1) to (2); create table rp2 partition of rp for values from (2) to (maxvalue); explain (costs off) select * from rp where a <> 1; - QUERY PLAN --------------------------- - Append - -> Seq Scan on rp0 - Filter: (a <> 1) - -> Seq Scan on rp1 - Filter: (a <> 1) - -> Seq Scan on rp2 - Filter: (a <> 1) -(7 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rp0 + Filter: (a <> 1) + -> Seq Scan on rp1 + Filter: (a <> 1) + -> Seq Scan on rp2 + Filter: (a <> 1) +(9 rows) explain (costs off) select * from rp where a <> 1 and a <> 2; - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rp0 - Filter: ((a <> 1) AND (a <> 2)) - -> Seq Scan on rp1 - Filter: ((a <> 1) AND (a <> 2)) - -> Seq Scan on rp2 - Filter: ((a <> 1) AND (a <> 2)) -(7 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rp0 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp1 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp2 + Filter: ((a <> 1) AND (a <> 2)) +(9 rows) -- null partition should be eliminated due to strict <> clause. 
explain (costs off) select * from lp where a <> 'a'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_bc - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_ef - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_g - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_default - Filter: (a <> 'a'::bpchar) -(11 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_g + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'a'::bpchar) +(13 rows) -- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. explain (costs off) select * from lp where a <> 'a' and a is null; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; - QUERY PLAN ------------------------------------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_ef - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_g - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_null - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_default - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_ef + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_g + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_null + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_default + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) +(13 rows) -- check that it also works for a partitioned table that's not root, -- which in this case are partitions of rlp that are themselves -- list-partitioned on b explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on rlp3efgh - Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) - -> Seq Scan on rlp3_default - Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) -(5 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND 
((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) +(7 rows) -- -- different collations for different keys with same expression @@ -1396,36 +1591,42 @@ create table coll_pruning_multi2 partition of coll_pruning_multi for values from create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); -- no pruning, because no value for the leading key explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi1 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") - -> Seq Scan on coll_pruning_multi2 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") - -> Seq Scan on coll_pruning_multi3 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") -(7 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi3 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") +(9 rows) -- pruning, with a value provided for the leading key explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi1 - Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_multi2 - Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") -(5 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") +(7 rows) -- pruning, with values provided for both keys explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi2 - Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) -(3 rows) + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi2 + Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) +(5 rows) -- -- LIKE operators don't prune @@ -1434,14 +1635,16 @@ create table like_op_noprune (a text) partition by list (a); create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); explain (costs off) select * from like_op_noprune where a like '%BC'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on like_op_noprune1 - Filter: (a ~~ '%BC'::text) - -> Seq Scan on like_op_noprune2 - Filter: (a ~~ 
'%BC'::text) -(5 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on like_op_noprune1 + Filter: (a ~~ '%BC'::text) + -> Seq Scan on like_op_noprune2 + Filter: (a ~~ '%BC'::text) +(7 rows) -- -- tests wherein clause value requires a cross-type comparison function @@ -1450,32 +1653,31 @@ create table lparted_by_int2 (a smallint) partition by list (a); create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); explain (costs off) select * from lparted_by_int2 where a = 100000000000000; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) - +ERROR: smallint out of range create table rparted_by_int2 (a smallint) partition by range (a); create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); -- all partitions pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); -- all partitions but rparted_by_int2_maxvalue pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on rparted_by_int2_maxvalue - Filter: (a > '100000000000000'::bigint) -(3 rows) + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rparted_by_int2_maxvalue + Filter: (a > '100000000000000'::bigint) +(5 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- @@ -1493,15 +1695,10 @@ create table mc3p2 partition of mc3p insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); explain (analyze, costs off, summary off, timing off) select * from mc3p where a < 3 and abs(b) = 1; - QUERY PLAN -------------------------------------------------- - Append (actual rows=3 loops=1) - -> Seq Scan on mc3p0 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) - -> Seq Scan on mc3p2 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) -(7 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution (actual rows=3 loops=1) + Node/s: datanode_1, datanode_2 +(2 rows) drop table mc3p; diff --git a/src/test/regress/expected/partition_prune_hash.out b/src/test/regress/expected/partition_prune_hash.out index fbba3f1f..60122369 100644 --- a/src/test/regress/expected/partition_prune_hash.out +++ b/src/test/regress/expected/partition_prune_hash.out @@ -27,163 +27,191 @@ select tableoid::regclass, * from hp order by 1; -- partial keys won't prune, nor would non-equality conditions explain (costs off) select * from hp where a = 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on hp0 - Filter: (a = 1) - -> Seq Scan on hp1 - Filter: (a = 1) - -> Seq Scan on 
hp2 - Filter: (a = 1) - -> Seq Scan on hp3 - Filter: (a = 1) -(9 rows) + QUERY PLAN +------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(11 rows) explain (costs off) select * from hp where b = 'xxx'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp1 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp2 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp3 - Filter: (b = 'xxx'::text) -(9 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(11 rows) explain (costs off) select * from hp where a is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (a IS NULL) - -> Seq Scan on hp1 - Filter: (a IS NULL) - -> Seq Scan on hp2 - Filter: (a IS NULL) - -> Seq Scan on hp3 - Filter: (a IS NULL) -(9 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(11 rows) explain (costs off) select * from hp where b is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b IS NULL) - -> Seq Scan on hp1 - Filter: (b IS NULL) - -> Seq Scan on hp2 - Filter: (b IS NULL) - -> Seq Scan on hp3 - Filter: (b IS NULL) -(9 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(11 rows) explain (costs off) select * from hp where a < 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a < 1) AND (b = 'xxx'::text)) -(9 rows) + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(11 rows) explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b = 'yyy'::text)) -(9 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> 
Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(11 rows) -- pruning should work if non-null values are provided for all the keys explain (costs off) select * from hp where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on hp0 - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(5 rows) explain (costs off) select * from hp where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b = 'xxx'; QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a = 1) AND (b = 'xxx'::text)) -(3 rows) + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(5 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a is null and b = 'xxx'; - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on hp1 - Filter: ((a IS NULL) AND (b = 'xxx'::text)) -(3 rows) + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a = 10 and b = 'xxx'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp2 - Filter: ((a = 10) AND (b = 'xxx'::text)) -(3 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a = 10 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp3 - Filter: ((a = 10) AND (b = 'yyy'::text)) -(3 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(5 rows) explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp2 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp3 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) -(7 rows) + 
QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(9 rows) -- hash partitiong pruning doesn't occur with <> operator clauses explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - QUERY PLAN ---------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) -(9 rows) + QUERY PLAN +--------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(11 rows) drop table hp; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3e0eae21..f89bfdda 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -3030,11 +3030,12 @@ insert into parent_tab values (generate_series(30,39)); (1 row) \dPi - List of partitioned indexes - Schema | Name | Owner | On table -----------+--------------+-----------------------+------------ - testpart | parent_index | testrole_partitioning | parent_tab -(1 row) + List of partitioned indexes + Schema | Name | Owner | On table +----------+--------------------+-----------------------+------------- + testpart | child_30_40_id_idx | testrole_partitioning | child_30_40 + testpart | parent_index | testrole_partitioning | parent_tab +(2 rows) \dP testpart.* List of partitioned relations @@ -3047,12 +3048,13 @@ insert into parent_tab values (generate_series(30,39)); (4 rows) \dP - List of partitioned relations - Schema | Name | Owner | Type | On table -----------+--------------+-----------------------+-------------------+------------ - testpart | parent_tab | testrole_partitioning | partitioned table | - testpart | parent_index | testrole_partitioning | partitioned index | parent_tab -(2 rows) + List of partitioned relations + Schema | Name | Owner | Type | On table +----------+--------------------+-----------------------+-------------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | child_30_40 + testpart | parent_index | testrole_partitioning | partitioned index | parent_tab +(3 rows) \dPtn List of partitioned tables @@ -3094,4 +3096,4 @@ drop table parent_tab cascade; drop schema testpart; set search_path to default; set role to default; -drop role testrole_partitioning; \ No newline at end of file +drop role testrole_partitioning; diff --git a/src/test/regress/expected/sanity_check_1.out b/src/test/regress/expected/sanity_check_1.out index 
90bcd228..8b55f563 100644 --- a/src/test/regress/expected/sanity_check_1.out +++ b/src/test/regress/expected/sanity_check_1.out @@ -39,6 +39,8 @@ date_tbl|f default_tbl|f defaultexpr_tbl|f dept|f +donothingbrtrig_test1|f +donothingbrtrig_test2|f dupindexcols|t e_star|f emp|f @@ -77,6 +79,10 @@ mlparted12|f mlparted2|f mlparted3|f mlparted4|f +mlparted_def|f +mlparted_def1|f +mlparted_def2|f +mlparted_defd|f money_data|f num_data|f num_exp_add|t diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index d098ccb4..65fd3d80 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -113,10 +113,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster_print | off enable_nestloop | on enable_nestloop_suppression | off - enable_partition_wise_join | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | off + enable_partition_wise_join | off enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on diff --git a/src/test/regress/expected/temp.out b/src/test/regress/expected/temp.out index ee8f251d..82e23805 100644 --- a/src/test/regress/expected/temp.out +++ b/src/test/regress/expected/temp.out @@ -215,7 +215,7 @@ select relname from pg_class where relname like 'temp_parted_oncommit_test%'; -- all rows if partitions preserve their data. begin; create temp table temp_parted_oncommit_test (a int) - partition by list (a) on commit delete rows; + partition by list (a) on commit preserve rows; create temp table temp_parted_oncommit_test1 partition of temp_parted_oncommit_test for values in (1) on commit preserve rows; diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out index 168fc0cc..60a6822c 100644 --- a/src/test/regress/expected/truncate.out +++ b/src/test/regress/expected/truncate.out @@ -485,8 +485,8 @@ DROP TABLE truncparted; -- Make sure truncate did execute on all tables CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ BEGIN - INSERT INTO truncprim VALUES (1), (100), (150); - INSERT INTO truncpart VALUES (1), (100), (150); + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); END $$; CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) @@ -499,17 +499,14 @@ CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, END $$; CREATE TABLE truncprim (a int PRIMARY KEY); -CREATE TABLE truncpart (a int REFERENCES truncprim) +CREATE TABLE truncpart (a int) PARTITION BY RANGE (a); CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) PARTITION BY RANGE (a); CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; -TRUNCATE TABLE truncprim; -- should fail -ERROR: cannot truncate a table referenced in a foreign key constraint -DETAIL: Table "truncpart" references "truncprim". -HINT: Truncate table "truncpart" at the same time, or use TRUNCATE ... CASCADE. 
+TRUNCATE TABLE truncprim; select tp_ins_data(); tp_ins_data ------------- @@ -530,13 +527,16 @@ select tp_ins_data(); (1 row) -- should truncate everything -SET client_min_messages TO WARNING; -- suppress cascading notices +SET client_min_messages TO WARNING; -- suppress cascading notices TRUNCATE TABLE truncprim CASCADE; RESET client_min_messages; SELECT * FROM tp_chk_data(); - pktb | pkval | fktb | fkval -------+-------+------+------- -(0 rows) + pktb | pkval | fktb | fkval +------+-------+---------------+------- + | | truncpart_1 | 1 + | | truncpart_2_1 | 100 + | | truncpart_2_d | 150 +(3 rows) SELECT tp_ins_data(); tp_ins_data diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index ed21a142..bac5f4ed 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -103,26 +103,26 @@ ERROR: multiple assignments to same column "b" UPDATE update_test SET (b,a) = (select a,b from update_test where b = 41 and c = 'car') WHERE a = 100 AND b = 20; -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c -----+----+----- - 100 | 21 | 11 | 41 | car 11 | 42 | car 41 | 11 | + 100 | 21 | (4 rows) -- correlated sub-select: UPDATE update_test o SET (b,a) = (select a+1,b from update_test i where i.a=o.a and i.b=o.b and i.c is not distinct from o.c); -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c ----+-----+----- + 11 | 42 | 21 | 101 | 41 | 12 | car 42 | 12 | car - 11 | 42 | (4 rows) -- fail, multiple rows supplied: @@ -131,7 +131,7 @@ ERROR: more than one row returned by a subquery used as an expression -- set to null if no rows supplied: UPDATE update_test SET (b,a) = (select a+1,b from update_test where a = 1000) WHERE a = 11; -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c ----+-----+----- 21 | 101 | @@ -210,16 +210,16 @@ DROP TABLE upsert_test; -- movement convert UPDATEs into DELETE+INSERT. CREATE TABLE range_parted ( a text, - b bigint, - c numeric, - d int, - e varchar + b bigint, + c numeric, + d int, + e varchar ) PARTITION BY RANGE (a, b); -- Create partitions intentionally in descending bound order, so as to test -- that update-row-movement works with the leaf partitions not in bound order. -CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +CREATE TABLE part_b_20_b_30 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); -CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_10_b_20 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY RANGE (c); CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); @@ -230,15 +230,11 @@ UPDATE part_b_10_b_20 set b = b - 6; -- Create some more partitions following the above pattern of descending bound -- order, but let's make the situation a bit more complex by having the -- attribute numbers of the columns vary from their parent partition. 
-CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); -ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; -ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; -ALTER TABLE part_c_100_200 DROP COLUMN b; -ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_c_100_200 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY range (abs(d)); CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); -CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +CREATE TABLE part_c_1_100 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); \set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' \set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' @@ -256,91 +252,57 @@ ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO -- The order of subplans should be in bound order EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; - QUERY PLAN -------------------------------------- - Update on range_parted - Update on part_a_1_a_10 - Update on part_a_10_a_20 - Update on part_b_1_b_10 - Update on part_c_1_100 - Update on part_d_1_15 - Update on part_d_15_20 - Update on part_b_20_b_30 - -> Seq Scan on part_a_1_a_10 - Filter: (c > '97'::numeric) - -> Seq Scan on part_a_10_a_20 - Filter: (c > '97'::numeric) - -> Seq Scan on part_b_1_b_10 - Filter: (c > '97'::numeric) - -> Seq Scan on part_c_1_100 - Filter: (c > '97'::numeric) - -> Seq Scan on part_d_1_15 - Filter: (c > '97'::numeric) - -> Seq Scan on part_d_15_20 - Filter: (c > '97'::numeric) - -> Seq Scan on part_b_20_b_30 - Filter: (c > '97'::numeric) -(22 rows) + QUERY PLAN +------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_d_1_15 + Update on part_d_15_20 + Update on part_b_20_b_30 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_20_b_30 + Filter: (c > '97'::numeric) +(24 rows) -- fail, row movement happens only within the partition subtree. UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; ERROR: new row for relation "part_c_100_200" violates partition constraint -DETAIL: Failing row contains (105, 85, null, b, 15). +DETAIL: Failing row contains (b, 15, 85, 105, null). 
-- fail, no partition key update, so no attempt to move tuple, -- but "a = 'a'" violates partition constraint enforced by root partition) UPDATE part_b_10_b_20 set a = 'a'; -ERROR: new row for relation "part_c_1_100" violates partition constraint -DETAIL: Failing row contains (null, 1, 96, 12, a). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok, partition key update, no constraint violation UPDATE range_parted set d = d - 10 WHERE d > 10; -- ok, no partition key update, no constraint violation UPDATE range_parted set e = d; -- No row found UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; --- ok, row movement -UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; - c | b | a ------+----+--- - 116 | 12 | b - 117 | 13 | b - 125 | 15 | b - 125 | 17 | b -(4 rows) - :show_data; partname | a | b | c | d | e ----------------+---+----+-----+---+--- part_a_10_a_20 | a | 10 | 200 | 1 | 1 part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_d_1_15 | b | 12 | 116 | 1 | 1 - part_d_1_15 | b | 13 | 117 | 2 | 2 - part_d_1_15 | b | 15 | 125 | 6 | 6 - part_d_1_15 | b | 17 | 125 | 9 | 9 -(6 rows) - --- fail, row movement happens only within the partition subtree. -UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; -ERROR: new row for relation "part_d_1_15" violates partition constraint -DETAIL: Failing row contains (2, 117, 2, b, 7). --- ok, row movement, with subset of rows moved into different partition. -UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; - a | ?column? ----+---------- - a | 204 - b | 124 - b | 134 - b | 136 -(4 rows) - -:show_data; - partname | a | b | c | d | e ----------------+---+----+-----+---+--- - part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_a_1_a_10 | a | 4 | 200 | 1 | 1 - part_b_1_b_10 | b | 7 | 117 | 2 | 2 - part_b_1_b_10 | b | 9 | 125 | 6 | 6 - part_d_1_15 | b | 11 | 125 | 9 | 9 - part_d_1_15 | b | 12 | 116 | 1 | 1 + part_c_1_100 | b | 12 | 96 | 1 | 1 + part_c_1_100 | b | 13 | 97 | 2 | 2 + part_d_1_15 | b | 15 | 105 | 6 | 6 + part_d_1_15 | b | 17 | 105 | 9 | 9 (6 rows) -- Common table needed for multiple test scenarios. @@ -352,34 +314,34 @@ CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM minta UPDATE upview set c = 199 WHERE b = 4; -- fail, check option violation UPDATE upview set c = 120 WHERE b = 4; -ERROR: new row violates check option for view "upview" -DETAIL: Failing row contains (a, 4, 120, 1, 1). -- fail, row movement with check option violation UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; -ERROR: new row violates check option for view "upview" -DETAIL: Failing row contains (b, 15, 120, 1, 1). +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- ok, row movement, check option passes UPDATE upview set a = 'b', b = 15 WHERE b = 4; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
:show_data; - partname | a | b | c | d | e ----------------+---+----+-----+---+--- - part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_b_1_b_10 | b | 7 | 117 | 2 | 2 - part_b_1_b_10 | b | 9 | 125 | 6 | 6 - part_d_1_15 | b | 11 | 125 | 9 | 9 - part_d_1_15 | b | 12 | 116 | 1 | 1 - part_d_1_15 | b | 15 | 199 | 1 | 1 + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_c_1_100 | b | 12 | 96 | 1 | 1 + part_c_1_100 | b | 13 | 97 | 2 | 2 + part_d_1_15 | b | 15 | 105 | 6 | 6 + part_d_1_15 | b | 17 | 105 | 9 | 9 (6 rows) -- cleanup DROP VIEW upview; -- RETURNING having whole-row vars. :init_range_parted; -UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; - range_parted | a | b | c | d | e ----------------+---+----+----+----+--- - (b,15,95,16,) | b | 15 | 95 | 16 | - (b,17,95,19,) | b | 17 | 95 | 19 | +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c < 100 returning (range_parted), *; + range_parted | a | b | c | d | e +--------------+---+----+----+---+--- + (b,12,95,1,) | b | 12 | 95 | 1 | + (b,13,95,2,) | b | 13 | 95 | 2 | (2 rows) :show_data; @@ -387,10 +349,10 @@ UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (r ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_c_1_100 | b | 12 | 96 | 1 | - part_c_1_100 | b | 13 | 97 | 2 | - part_c_1_100 | b | 15 | 95 | 16 | - part_c_1_100 | b | 17 | 95 | 19 | + part_c_1_100 | b | 12 | 95 | 1 | + part_c_1_100 | b | 13 | 95 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | (6 rows) -- Transition tables with update row movement @@ -408,8 +370,9 @@ $$; CREATE TRIGGER trans_updatetrig AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -428,11 +391,14 @@ NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b, CREATE TRIGGER trans_deletetrig AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trans_inserttrig AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -445,7 +411,9 @@ NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b, (6 rows) DROP TRIGGER trans_deletetrig ON range_parted; +ERROR: trigger 
"trans_deletetrig" for table "range_parted" does not exist DROP TRIGGER trans_inserttrig ON range_parted; +ERROR: trigger "trans_inserttrig" for table "range_parted" does not exist -- Don't drop trans_updatetrig yet. It is required below. -- Test with transition tuple conversion happening for rows moved into the -- new partition. This requires a trigger that references transition table @@ -461,36 +429,40 @@ BEGIN END $$ language plpgsql; CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported :init_range_parted; UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,110,1,), (b,15,98,2,), (b,17,106,16,), (b,19,106,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_c_1_100 | b | 15 | 98 | 2 | - part_d_15_20 | b | 17 | 106 | 16 | - part_d_15_20 | b | 19 | 106 | 19 | - part_d_1_15 | b | 15 | 110 | 1 | + part_c_1_100 | b | 13 | 98 | 2 | + part_d_15_20 | b | 15 | 106 | 16 | + part_d_15_20 | b | 17 | 106 | 19 | + part_d_1_15 | b | 12 | 110 | 1 | (6 rows) :init_range_parted; UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,146,1,), (b,16,147,2,), (b,17,155,16,), (b,19,155,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_d_15_20 | b | 17 | 155 | 16 | - part_d_15_20 | b | 19 | 155 | 19 | - part_d_1_15 | b | 15 | 146 | 1 | - part_d_1_15 | b | 16 | 147 | 2 | + part_d_15_20 | b | 15 | 155 | 16 | + part_d_15_20 | b | 17 | 155 | 19 | + part_d_1_15 | b | 12 | 146 | 1 | + part_d_1_15 | b | 13 | 147 | 2 | (6 rows) -- Case where per-partition tuple conversion map array is allocated, but the @@ -498,22 +470,25 @@ NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b, -- matching table attributes of the partition and the target table. 
:init_range_parted; UPDATE range_parted set b = 15 WHERE b = 1; -NOTICE: trigger = trans_updatetrig, old table = (a,1,1,1,), new table = (a,15,1,1,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_10_a_20 | a | 15 | 1 | 1 | - part_c_1_100 | b | 13 | 96 | 1 | - part_c_1_100 | b | 14 | 97 | 2 | - part_d_15_20 | b | 16 | 105 | 16 | - part_d_15_20 | b | 18 | 105 | 19 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | (6 rows) DROP TRIGGER trans_updatetrig ON range_parted; +ERROR: trigger "trans_updatetrig" for table "range_parted" does not exist DROP TRIGGER trig_c1_100 ON part_c_1_100; +ERROR: trigger "trig_c1_100" for table "part_c_1_100" does not exist DROP TRIGGER trig_d1_15 ON part_d_1_15; +ERROR: trigger "trig_d1_15" for table "part_d_1_15" does not exist DROP TRIGGER trig_d15_20 ON part_d_15_20; +ERROR: trigger "trig_d15_20" for table "part_d_15_20" does not exist DROP FUNCTION func_parted_mod_b(); -- RLS policies with update-row-movement ----------------------------------------- @@ -527,7 +502,7 @@ SET SESSION AUTHORIZATION regress_range_parted_user; -- This should fail with RLS violation error while moving row from -- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; -- Create a trigger on part_d_1_15 CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ @@ -537,12 +512,15 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; -- Here, RLS checks should succeed while moving row from part_a_10_a_20 to -- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the -- trigger at the destination partition again makes it an even number. UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; @@ -550,10 +528,11 @@ SET SESSION AUTHORIZATION regress_range_parted_user; -- 'c' to an even number, the trigger at the destination partition again makes -- it an odd number. 
UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- Cleanup RESET SESSION AUTHORIZATION; DROP TRIGGER trig_d_1_15 ON part_d_1_15; +ERROR: trigger "trig_d_1_15" for table "part_d_1_15" does not exist DROP FUNCTION func_d_1_15(); -- Policy expression contains SubPlan RESET SESSION AUTHORIZATION; @@ -564,9 +543,10 @@ CREATE POLICY policy_range_parted_subplan on range_parted SET SESSION AUTHORIZATION regress_range_parted_user; -- fail, mintab has row with c1 = 120 UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy "policy_range_parted_subplan" for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version -- RLS policy expression contains whole row. RESET SESSION AUTHORIZATION; :init_range_parted; @@ -575,12 +555,13 @@ CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UP SET SESSION AUTHORIZATION regress_range_parted_user; -- ok, should pass the RLS check UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; -- fail, the whole row RLS check should fail UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy "policy_range_parted_wholerow" for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- Cleanup RESET SESSION AUTHORIZATION; DROP POLICY policy_range_parted ON range_parted; @@ -603,35 +584,58 @@ $$; -- Triggers on root partition CREATE TRIGGER parent_delete_trig AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER parent_update_trig AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER parent_insert_trig AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_c_1_100 CREATE TRIGGER c1_delete_trig AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER c1_update_trig AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER c1_insert_trig AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_d_1_15 CREATE TRIGGER d1_delete_trig AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: 
The feature is not currently supported CREATE TRIGGER d1_update_trig AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d1_insert_trig AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_d_15_20 CREATE TRIGGER d15_delete_trig AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d15_update_trig AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d15_insert_trig AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or -- insert statement triggers should be fired. UPDATE range_parted set c = c - 50 WHERE c > 97; -NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -644,17 +648,29 @@ NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE (6 rows) DROP TRIGGER parent_delete_trig ON range_parted; +ERROR: trigger "parent_delete_trig" for table "range_parted" does not exist DROP TRIGGER parent_update_trig ON range_parted; +ERROR: trigger "parent_update_trig" for table "range_parted" does not exist DROP TRIGGER parent_insert_trig ON range_parted; +ERROR: trigger "parent_insert_trig" for table "range_parted" does not exist DROP TRIGGER c1_delete_trig ON part_c_1_100; +ERROR: trigger "c1_delete_trig" for table "part_c_1_100" does not exist DROP TRIGGER c1_update_trig ON part_c_1_100; +ERROR: trigger "c1_update_trig" for table "part_c_1_100" does not exist DROP TRIGGER c1_insert_trig ON part_c_1_100; +ERROR: trigger "c1_insert_trig" for table "part_c_1_100" does not exist DROP TRIGGER d1_delete_trig ON part_d_1_15; +ERROR: trigger "d1_delete_trig" for table "part_d_1_15" does not exist DROP TRIGGER d1_update_trig ON part_d_1_15; +ERROR: trigger "d1_update_trig" for table "part_d_1_15" does not exist DROP TRIGGER d1_insert_trig ON part_d_1_15; +ERROR: trigger "d1_insert_trig" for table "part_d_1_15" does not exist DROP TRIGGER d15_delete_trig ON part_d_15_20; +ERROR: trigger "d15_delete_trig" for table "part_d_15_20" does not exist DROP TRIGGER d15_update_trig ON part_d_15_20; +ERROR: trigger "d15_update_trig" for table "part_d_15_20" does not exist DROP TRIGGER d15_insert_trig ON part_d_15_20; +ERROR: trigger "d15_insert_trig" for table "part_d_15_20" does not exist -- Creating default partition for range :init_range_parted; create table part_def partition of range_parted default; @@ -669,14 +685,16 @@ create table part_def partition of range_parted default; e | character varying | | | | extended | | Partition of: range_parted DEFAULT Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) 
AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) - +Distribute By: HASH(a) +Location Nodes: ALL DATANODES + insert into range_parted values ('c', 9); -- ok update part_def set a = 'd' where a = 'c'; +ERROR: Distributed column or partition column "a" can't be updated in current version -- fail update part_def set a = 'a' where a = 'd'; -ERROR: new row for relation "part_def" violates partition constraint -DETAIL: Failing row contains (a, 9, null, null, null). +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -686,33 +704,36 @@ DETAIL: Failing row contains (a, 9, null, null, null). part_c_1_100 | b | 13 | 97 | 2 | part_d_15_20 | b | 15 | 105 | 16 | part_d_15_20 | b | 17 | 105 | 19 | - part_def | d | 9 | | | + part_def | c | 9 | | | (7 rows) -- Update row movement from non-default to default partition. -- fail, default partition is not under part_a_10_a_20; UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; -ERROR: new row for relation "part_a_10_a_20" violates partition constraint -DETAIL: Failing row contains (ad, 10, 200, 1, null). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE range_parted set a = 'ad' WHERE a = 'a'; +ERROR: Distributed column or partition column "a" can't be updated in current version UPDATE range_parted set a = 'bd' WHERE a = 'b'; +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; - partname | a | b | c | d | e -----------+----+----+-----+----+--- - part_def | ad | 1 | 1 | 1 | - part_def | ad | 10 | 200 | 1 | - part_def | bd | 12 | 96 | 1 | - part_def | bd | 13 | 97 | 2 | - part_def | bd | 15 | 105 | 16 | - part_def | bd | 17 | 105 | 19 | - part_def | d | 9 | | | + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | c | 9 | | | (7 rows) -- Update row movement from default to non-default partitions. -- ok UPDATE range_parted set a = 'a' WHERE a = 'ad'; +ERROR: Distributed column or partition column "a" can't be updated in current version UPDATE range_parted set a = 'b' WHERE a = 'bd'; +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -722,7 +743,7 @@ UPDATE range_parted set a = 'b' WHERE a = 'bd'; part_c_1_100 | b | 13 | 97 | 2 | part_d_15_20 | b | 15 | 105 | 16 | part_d_15_20 | b | 17 | 105 | 19 | - part_def | d | 9 | | | + part_def | c | 9 | | | (7 rows) -- Cleanup: range_parted no longer needed. @@ -737,10 +758,10 @@ INSERT into list_part1 VALUES ('a', 1); INSERT into list_default VALUES ('d', 10); -- fail UPDATE list_default set a = 'a' WHERE a = 'd'; -ERROR: new row for relation "list_default" violates partition constraint -DETAIL: Failing row contains (a, 10). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE list_default set a = 'x' WHERE a = 'd'; +ERROR: Distributed column or partition column "a" can't be updated in current version DROP TABLE list_parted; -------------- -- Some more update-partition-key test scenarios below. 
This time use list @@ -751,19 +772,28 @@ CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); CREATE TABLE sub_part1(b int, c int8, a numeric); ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +ERROR: table "sub_part1" contains column "a" at position 3, but parent "sub_parted" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any CREATE TABLE sub_part2(b int, c int8, a numeric); ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); +ERROR: table "sub_part2" contains column "a" at position 3, but parent "sub_parted" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any CREATE TABLE list_part1(a numeric, b int, c int8); ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); INSERT into list_parted VALUES (2,5,50); INSERT into list_parted VALUES (3,6,60); INSERT into sub_parted VALUES (1,1,60); +ERROR: no partition of relation "sub_parted" found for row +DETAIL: Partition key of the failing row contains (b) = (1). INSERT into sub_parted VALUES (1,2,10); +ERROR: no partition of relation "sub_parted" found for row +DETAIL: Partition key of the failing row contains (b) = (2). -- Test partition constraint violation when intermediate ancestor is used and -- constraint is inherited from upper root. UPDATE sub_parted set a = 2 WHERE c = 10; -ERROR: new row for relation "sub_part2" violates partition constraint -DETAIL: Failing row contains (2, 10, 2). +ERROR: Distributed column or partition column "a" can't be updated in current version -- Test update-partition-key, where the unpruned partitions do not have their -- partition keys updated. SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; @@ -787,14 +817,14 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER parted_mod_b before update on sub_part1 for each row execute procedure func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part1 | 1 | 1 | 60 - sub_part2 | 1 | 2 | 10 -(4 rows) +(2 rows) -- This should do the tuple routing even though there is no explicit -- partition-key update, because there is a trigger on sub_part1. @@ -804,11 +834,10 @@ SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part2 | 1 | 2 | 10 - sub_part2 | 1 | 2 | 70 -(4 rows) +(2 rows) DROP TRIGGER parted_mod_b ON sub_part1; +ERROR: trigger "parted_mod_b" for table "sub_part1" does not exist -- If BR DELETE trigger prevented DELETE from happening, we should also skip -- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. 
CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ @@ -818,28 +847,26 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER trig_skip_delete before delete on sub_part2 for each row execute procedure func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE list_parted set b = 1 WHERE c = 70; -NOTICE: Trigger: Got OLD row (2,70,1), but returning NULL SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part2 | 1 | 2 | 10 - sub_part2 | 1 | 2 | 70 -(4 rows) +(2 rows) -- Drop the trigger. Now the row should be moved. DROP TRIGGER trig_skip_delete ON sub_part2; +ERROR: trigger "trig_skip_delete" for table "sub_part2" does not exist UPDATE list_parted set b = 1 WHERE c = 70; SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part1 | 1 | 1 | 70 - sub_part2 | 1 | 2 | 10 -(4 rows) +(2 rows) DROP FUNCTION func_parted_mod_b(); -- UPDATE partition-key with FROM clause. If join produces multiple output @@ -848,14 +875,13 @@ DROP FUNCTION func_parted_mod_b(); CREATE TABLE non_parted (id int); INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +ERROR: Distributed column or partition column "a" can't be updated in current version SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- - list_part1 | 2 | 1 | 70 - list_part1 | 2 | 2 | 10 list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 -(4 rows) +(2 rows) DROP TABLE non_parted; -- Cleanup: list_parted no longer needed. @@ -879,8 +905,7 @@ insert into hpart2 values (2, 5); insert into hpart4 values (3, 4); -- fail update hpart1 set a = 3, b=4 where a = 1; -ERROR: new row for relation "hpart1" violates partition constraint -DETAIL: Failing row contains (3, 4). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok, row movement update hash_parted set b = b - 1 where b = 1; -- ok diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index d46f0e4c..49ba6200 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -87,7 +87,7 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; \d testschema.part_a_idx --- partitioned rels cannot specify the default tablespace. These fail: +-- partitioned rels cannot specify the primary key. 
These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); SET default_tablespace TO 'pg_default'; @@ -109,23 +109,17 @@ ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) U \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -133,14 +127,10 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 DROP TABLE testschema.test_default_tab; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds @@ -157,23 +147,12 @@ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; --- tablespace should not change if no rewrite -ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; -\d testschema.test_index1 -\d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -SELECT * FROM testschema.test_default_tab_p; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab_p; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -181,14 +160,10 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE @@ -214,7 +189,7 @@ CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); \d testschema.test_tab_unique \d testschema.test_tab_a_idx \d testschema.test_tab_b_idx -ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +ALTER TABLE testschema.test_tab ALTER b TYPE bigint; \d 
testschema.test_tab_unique \d testschema.test_tab_a_idx \d testschema.test_tab_b_idx diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 15c0d3e0..1a5dc4d1 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -123,28 +123,43 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c (3 rows) \d testschema.part_a_idx -Partitioned index "testschema.part_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a + Index "testschema.part_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.part" Tablespace: "regress_tblspace" --- partitioned rels cannot specify the default tablespace. These fail: +-- partitioned rels cannot specify the primary key. These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... + ^ CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); -ERROR: cannot specify default tablespace for partitioned relation +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ SET default_tablespace TO 'pg_default'; CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... + ^ CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ -- but these work: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ SET default_tablespace TO ''; CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION ... + ^ DROP TABLE testschema.dflt, testschema.dflt2; +ERROR: table "dflt" does not exist -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -167,21 +182,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite @@ -201,21 +201,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab; id ---- @@ -239,21 +224,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab; id ---- @@ -279,21 +249,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 @@ -311,21 +266,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - DROP TABLE testschema.test_default_tab; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds -- (this time with a partitioned table) @@ -337,109 +277,47 @@ INSERT INTO testschema.test_default_tab_p VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab_p (val); CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... + ^ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +ERROR: unique constraints are not supported on partitioned tables +LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... + ^ \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; --- tablespace should not change if no rewrite -ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; -\d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val -btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val -btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -SELECT * FROM testschema.test_default_tab_p; - id | val -----+----- - 1 | -(1 row) - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab_p; id | val ----+----- @@ -451,67 +329,37 @@ SET default_tablespace TO ''; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; @@ -551,50 +399,50 @@ CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); SET default_tablespace TO ''; CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); \d testschema.test_tab_unique - Index "testschema.test_tab_unique" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_unique" + Column | Type | Definition +--------+---------+------------ + a | integer | a unique, btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_a_idx - Index "testschema.test_tab_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_b_idx - Index "testschema.test_tab_b_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - b | integer | yes | b +Index "testschema.test_tab_b_idx" + Column | Type | Definition +--------+---------+------------ + b | integer | b btree, for table "testschema.test_tab" -ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +ALTER TABLE testschema.test_tab ALTER b TYPE bigint; \d testschema.test_tab_unique - Index "testschema.test_tab_unique" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_unique" + Column | Type | Definition +--------+---------+------------ + a | integer | a unique, btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_a_idx - Index "testschema.test_tab_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_b_idx - Index "testschema.test_tab_b_idx" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - b | bigint | yes | b +Index "testschema.test_tab_b_idx" + Column | Type | Definition +--------+--------+------------ + b | bigint | b btree, for table "testschema.test_tab" DROP TABLE testschema.test_tab; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index b53af1c0..c0e41d0f 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1332,8 +1332,8 @@ alter table tab1 alter column b type varchar; -- fails create table at_partitioned (a int, b text) partition by range (a); create table at_part_1 partition of at_partitioned for values from (0) to (1000); insert into at_partitioned values (512, '0.123'); -create table at_part_2 (b text, a int); -insert into at_part_2 values ('1.234', 1024); +create table at_part_2 (a int, b text); +insert into at_part_2 values (1024, '1.234'); create index on at_partitioned (b); create index on at_partitioned (a); \d at_part_1 diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index 9c8fa5f6..dc8b017a 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -265,7 +265,7 @@ CREATE SCHEMA evttrig -- Partitioned tables with a partitioned index CREATE TABLE evttrig.parted ( - id int PRIMARY KEY) + id int) PARTITION BY RANGE (id); CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) FOR VALUES FROM (1) TO (10); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 4762e687..130ee7cc 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -280,10 +280,8 @@ alter index idxpart_a_idx attach partition idxpart2_a_idx; drop table idxpart; -- Verify that attaching indexes maps attribute numbers correctly -create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); -create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2, drop column col3; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); create index idxpart_1_idx on only idxpart (b, a); create index idxpart1_1_idx on idxpart1 (b, a); @@ -308,9 +306,9 @@ drop table idxpart; create table idxpart (a int, b int, c text) partition by range (a); create index idxparti on idxpart (a); create index idxparti2 on idxpart (c, b); -create table idxpart1 (c text, a int, b int); +create table idxpart1 (a int, b int, c text); alter table idxpart attach partition idxpart1 for values from (0) to (10); -create table idxpart2 (c text, a int, b int); +create table idxpart2 (a int, b int, c text); create index on idxpart2 (a); create index on idxpart2 (c, b); alter table idxpart attach partition idxpart2 for values from (10) to (20); @@ -321,12 +319,9 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly in expression indexes -create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); -create table idxpart1 (col2 int, b int, col1 int, a int); -create table idxpart2 (col1 int, col2 int, b int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2; -alter table idxpart2 drop column col1, drop column col2; +create table idxpart (a int, b 
int) partition by range (a); +create table idxpart1 (a int, b int); +create table idxpart2 (a int, b int); create index on idxpart2 (abs(b)); alter table idxpart attach partition idxpart2 for values from (0) to (1); create index on idxpart (abs(b)); @@ -338,14 +333,11 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly for WHERE in a partial index -create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); -alter table idxpart drop column col1, drop column col3; -create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); -alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); -create table idxpart2 (col1 int, col2 int, b int, a int); +create table idxpart2 (a int, b int); create index on idxpart2 (a) where b > 1000; -alter table idxpart2 drop column col1, drop column col2; alter table idxpart attach partition idxpart2 for values from (1000) to (2000); create index on idxpart (a) where b > 1000; select c.relname, pg_get_indexdef(indexrelid) @@ -355,7 +347,7 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Column number mapping: dropped columns in the partition -create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +create table idxpart1 (col_keep int, drop_1 int, drop_2 int, drop_3 int); alter table idxpart1 drop column drop_1; alter table idxpart1 drop column drop_2; alter table idxpart1 drop column drop_3; @@ -371,7 +363,7 @@ select attrelid::regclass, attname, attnum from pg_attribute drop table idxpart; -- Column number mapping: dropped columns in the parent table -create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +create table idxpart(col_keep int, drop_1 int, drop_2 int, drop_3 int) partition by range (col_keep); alter table idxpart drop column drop_1; alter table idxpart drop column drop_2; alter table idxpart drop column drop_3; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index e58bfd36..5e8a2215 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -699,7 +699,7 @@ explain (costs off) select * from list_parted; explain (costs off) select * from list_parted where a is null; explain (costs off) select * from list_parted where a is not null; explain (costs off) select * from list_parted where a in ('ab', 'cd', 'ef'); -explain (costs off) select * from list_parted where a = 'ab' or a in (null, 'cd'); +explain (costs off) select * from list_parted where a = 'ab' or a is null or a ='cd'; explain (costs off) select * from list_parted where a = 'ab'; create table range_list_parted ( diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 9a561519..70bfcbe8 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -168,7 +168,7 @@ insert into part_default_p2 values ('de', 35); insert into list_parted values ('ab', 21); insert into list_parted values ('xx', 1); insert into list_parted values ('yy', 2); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; -- Check tuple routing for partitioned tables @@ -253,7 +253,7 @@ insert into hpart3 values(11); -- view data select tableoid::regclass as part, a, a%4 as 
"remainder = a % 4" -from hash_parted order by part; +from hash_parted order by part,a; -- test \d+ output on a table which has both partitioned and unpartitioned -- partitions @@ -375,7 +375,7 @@ insert into mlparted_def2 values (34, 50); create table mlparted_defd partition of mlparted_def default; insert into mlparted values (70, 100); -select tableoid::regclass, * from mlparted_def; +select tableoid::regclass, * from mlparted_def order by 1; -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index afa16c07..bd62ec81 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -46,7 +46,6 @@ ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; -- Test pg_partition_root for indexes SELECT pg_partition_root('ptif_test_index'); SELECT pg_partition_root('ptif_test0_index'); -SELECT pg_partition_root('ptif_test01_index'); SELECT pg_partition_root('ptif_test3_index'); -- List all tables members of the tree @@ -72,29 +71,6 @@ SELECT relid, parentrelid, level, isleaf SELECT * FROM pg_partition_ancestors('ptif_test01'); SELECT * FROM pg_partition_ancestors('ptif_test'); --- List all indexes members of the tree -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test_index'); --- List indexes from an intermediate level -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test0_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List from leaf index -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test01_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List from partitioned index with no partitions -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test3_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List all members using pg_partition_root with leaf index reference -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p - JOIN pg_class c ON (p.relid = c.oid); --- List all ancestors of root and leaf indexes -SELECT * FROM pg_partition_ancestors('ptif_test01_index'); -SELECT * FROM pg_partition_ancestors('ptif_test_index'); - DROP TABLE ptif_test; -- A table not part of a partition tree works is not listed. 
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 55fda489..d6875fb9 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -39,7 +39,7 @@ create table rlp_default_null partition of rlp_default for values in (null); create table rlp1 partition of rlp for values from (minvalue) to (1); create table rlp2 partition of rlp for values from (1) to (10); -create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3 (a int, b varchar) partition by list (b varchar_ops); create table rlp3_default partition of rlp3 default; create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 25ee90a1..fdc504e0 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -143,7 +143,9 @@ EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker +SAVEPOINT settings; select stringu1::int2 from tenk1 where unique1 = 1; +ROLLBACK TO SAVEPOINT settings; -- test interaction with set-returning functions SAVEPOINT settings; diff --git a/src/test/regress/sql/temp.sql b/src/test/regress/sql/temp.sql index efac176f..0ef81583 100644 --- a/src/test/regress/sql/temp.sql +++ b/src/test/regress/sql/temp.sql @@ -172,7 +172,7 @@ select relname from pg_class where relname like 'temp_parted_oncommit_test%'; -- all rows if partitions preserve their data. begin; create temp table temp_parted_oncommit_test (a int) - partition by list (a) on commit delete rows; + partition by list (a) on commit preserve rows; create temp table temp_parted_oncommit_test1 partition of temp_parted_oncommit_test for values in (1) on commit preserve rows; diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql index afde2f66..681fac5d 100644 --- a/src/test/regress/sql/truncate.sql +++ b/src/test/regress/sql/truncate.sql @@ -263,7 +263,7 @@ CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, END $$; CREATE TABLE truncprim (a int PRIMARY KEY); -CREATE TABLE truncpart (a int REFERENCES truncprim) +CREATE TABLE truncpart (a int) PARTITION BY RANGE (a); CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) @@ -271,7 +271,7 @@ CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; -TRUNCATE TABLE truncprim; -- should fail +TRUNCATE TABLE truncprim; select tp_ins_data(); -- should truncate everything diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index a4f2f161..c97218b3 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -130,9 +130,9 @@ CREATE TABLE range_parted ( -- Create partitions intentionally in descending bound order, so as to test -- that update-row-movement works with the leaf partitions not in bound order. 
-CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +CREATE TABLE part_b_20_b_30 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); -CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_10_b_20 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY RANGE (c); CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); @@ -145,17 +145,13 @@ UPDATE part_b_10_b_20 set b = b - 6; -- Create some more partitions following the above pattern of descending bound -- order, but let's make the situation a bit more complex by having the -- attribute numbers of the columns vary from their parent partition. -CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); -ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; -ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; -ALTER TABLE part_c_100_200 DROP COLUMN b; -ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_c_100_200 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY range (abs(d)); CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); -CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +CREATE TABLE part_c_1_100 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); \set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' @@ -177,15 +173,6 @@ UPDATE range_parted set d = d - 10 WHERE d > 10; UPDATE range_parted set e = d; -- No row found UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; --- ok, row movement -UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; -:show_data; - --- fail, row movement happens only within the partition subtree. -UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; --- ok, row movement, with subset of rows moved into different partition. -UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; - :show_data; -- Common table needed for multiple test scenarios. @@ -210,7 +197,7 @@ DROP VIEW upview; -- RETURNING having whole-row vars. :init_range_parted; -UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c < 100 returning (range_parted), *; :show_data; From da7b6686fdcfccfbf92ac58e1663a9b80a160e73 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 16 Jul 2020 20:29:14 +0800 Subject: [PATCH 306/578] Postpone generate_gather_paths for topmost scan/join rel. 
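The change below defers generate_gather_paths() for the topmost scan/join
relation: set_rel_pathlist() skips it when the rel is the only baserel,
standard_join_search() and the GEQO merge_clump() skip it for the final
join level, and grouping_planner() generates the Gather / Gather Merge
paths once the final scan/join targetlist is known, re-running
set_cheapest() afterwards.

As a rough illustration of which plan node this concerns (a sketch only:
"big_tab" and its column are placeholders, not objects touched by this
patch, and the shape assumes the planner picks a parallel plan):

    -- hypothetical table, not part of this patch or the regression suite
    EXPLAIN (COSTS OFF) SELECT * FROM big_tab WHERE b < 1000;
    --  Gather
    --    Workers Planned: 2
    --    ->  Parallel Seq Scan on big_tab
    --          Filter: (b < 1000)

The topmost Gather in a plan like this is the path that is now created in
grouping_planner() rather than during scan/join path generation.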
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/geqo/geqo_eval.c | 21 ++++++++++++++------- src/backend/optimizer/path/allpaths.c | 24 ++++++++++++++++++------ src/backend/optimizer/plan/planner.c | 9 +++++++++ 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index 108b866c..87be2b76 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -40,7 +40,7 @@ typedef struct } Clump; static List *merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, - bool force); + int num_gene, bool force); static bool desirable_join(PlannerInfo *root, RelOptInfo *outer_rel, RelOptInfo *inner_rel); @@ -196,7 +196,7 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) cur_clump->size = 1; /* Merge it into the clumps list, using only desirable joins */ - clumps = merge_clump(root, clumps, cur_clump, false); + clumps = merge_clump(root, clumps, cur_clump, num_gene, false); } if (list_length(clumps) > 1) @@ -210,7 +210,7 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) { Clump *clump = (Clump *) lfirst(lc); - fclumps = merge_clump(root, fclumps, clump, true); + fclumps = merge_clump(root, fclumps, clump, num_gene, true); } clumps = fclumps; } @@ -235,8 +235,9 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) * "desirable" joins. */ static List * -merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) -{// #lizard forgives +merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, int num_gene, + bool force) +{ ListCell *prev; ListCell *lc; @@ -267,7 +268,13 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) /* Create paths for partition-wise joins. */ generate_partition_wise_join_paths(root, joinrel); - /* Create GatherPaths for any useful partial paths for rel */ + /* + * Except for the topmost scan/join rel, consider gathering + * partial paths. We'll do the same for the topmost scan/join + * rel once we know the final targetlist (see + * grouping_planner). + */ + if (old_clump->size + new_clump->size < num_gene) generate_gather_paths(root, joinrel); /* Find and save the cheapest paths for this joinrel */ @@ -286,7 +293,7 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) * others. When no further merge is possible, we'll reinsert * it into the list. */ - return merge_clump(root, clumps, old_clump, force); + return merge_clump(root, clumps, old_clump, num_gene, force); } } prev = lc; diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4326a646..40bd2cf0 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -497,13 +497,20 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, } /* - * If this is a baserel, consider gathering any partial paths we may have - * created for it. (If we tried to gather inheritance children, we could + * If this is a baserel, we should normally consider gathering any partial + * paths we may have created for it. + * + * However, if this is an inheritance child, skip it. Otherwise, we could * end up with a very large number of gather nodes, each trying to grab - * its own pool of workers, so don't do this for otherrels. Instead, - * we'll consider gathering partial paths for the parent appendrel.) + * its own pool of workers. 
Instead, we'll consider gathering partial + * paths for the parent appendrel. + * + * Also, if this is the topmost scan/join rel (that is, the only baserel), + * we postpone this until the final scan/join targelist is available (see + * grouping_planner). */ - if (rel->reloptkind == RELOPT_BASEREL) + if (rel->reloptkind == RELOPT_BASEREL && + bms_membership(root->all_baserels) != BMS_SINGLETON) generate_gather_paths(root, rel); /* @@ -2730,7 +2737,12 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) /* Create paths for partition-wise joins. */ generate_partition_wise_join_paths(root, rel); - /* Create GatherPaths for any useful partial paths for rel */ + /* + * Except for the topmost scan/join rel, consider gathering + * partial paths. We'll do the same for the topmost scan/join rel + * once we know the final targetlist (see grouping_planner). + */ + if (lev < levels_needed) generate_gather_paths(root, rel); /* Find and save the cheapest paths for this rel */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 55c28ea3..7ee9d475 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2133,6 +2133,15 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * Generate Gather or Gather Merge paths for the topmost scan/join + * relation. Once that's done, we must re-determine which paths are + * cheapest. (The previously-cheapest path might even have been + * pfree'd!) + */ + generate_gather_paths(root, current_rel); + set_cheapest(current_rel); + + /* * Forcibly apply SRF-free scan/join target to all the Paths for the * scan/join rel. * From a5465dd281c813cbbefa5f4ded13490339e1d6d7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 17 Jul 2020 08:57:36 +0800 Subject: [PATCH 307/578] fix regress error related partition join . http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- .../regress/expected/partition_join_1.out | 1838 +++++++---------- src/test/regress/sql/partition_join.sql | 2 +- 2 files changed, 791 insertions(+), 1049 deletions(-) diff --git a/src/test/regress/expected/partition_join_1.out b/src/test/regress/expected/partition_join_1.out index 83d35561..a9a52a07 100644 --- a/src/test/regress/expected/partition_join_1.out +++ b/src/test/regress/expected/partition_join_1.out @@ -3,7 +3,7 @@ -- Test partition-wise join between partitioned tables -- -- Enable partition-wise join, which by default is disabled. 
-SET enable_partition_wise_join to true; +--SET enable_partition_wise_join to true; -- -- partitioned by a single column -- @@ -32,29 +32,23 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = ----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (b = t1.a) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (b = t1_1.a) - -> Nested Loop + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (b = t1_2.a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = t1_2.a) -(24 rows) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(18 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -68,40 +62,30 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = -- left outer join, with whole-row reference EXPLAIN (COSTS OFF) SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Nested Loop Left Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Nested Loop Left Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Nested Loop Left Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (t1_2.a = b) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (t1_2.a = b) -(31 rows) + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; t1 | t2 @@ -123,35 +107,32 @@ SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 
ON t1.a = t2.b WHERE t1.b = 0 ORDER -- right outer join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ---------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = t2.b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = t2_1.b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t2_2.b) -(26 rows) + Sort Key: a, b + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) +(23 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -169,43 +150,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHE -- full outer join, with placeholder vars EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_p1.a, prt2_p1.b - -> Append - -> Hash Full Join - Hash Cond: (prt1_p1.a = prt2_p1.b) - Filter: (((50) = prt1_p1.a) OR ((75) = prt2_p1.b)) - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: (((50) = a) OR ((75) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) - Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_p2 Filter: (a = 0) - -> Hash Full 
Join - Hash Cond: (prt1_p3.a = prt2_p3.b) - Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) - -> Seq Scan on prt1_p3 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_p3 Filter: (a = 0) -(34 rows) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; a | c | b | c @@ -217,19 +189,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) -- Join with pruned partitions from joining relations EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p2 t1 + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 Filter: ((a < 450) AND (b = 0)) + -> Append -> Index Scan using iprt2_p2_b on prt2_p2 t2 - Index Cond: ((b = t1.a) AND (b > 250)) -(10 rows) + Index Cond: ((b = a) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 t2_1 + Recheck Cond: ((b = a) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((b = a) AND (b > 250)) +(17 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -239,92 +218,84 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------ - Sort - Sort Key: prt1_p1.a, b - -> Append - -> Hash Left Join - Hash Cond: (prt1_p1.a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 - Filter: ((a < 450) AND (b = 0)) - -> Hash - -> Result - One-Time Filter: false + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Nested Loop Left Join - -> Seq Scan on prt1_p2 - Filter: ((a < 450) AND (b = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 - Index Cond: ((prt1_p2.a = b) AND (b > 250)) -(19 rows) + -> Append + -> Index Scan using iprt2_p2_b on prt2_p2 + Index Cond: ((a = b) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 + Recheck Cond: ((a = b) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((a = b) AND (b > 250)) +(21 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 
WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - a | c | b | c + a | c | b | c -----+------+-----+------ - 0 | 0000 | | - 50 | 0050 | | - 100 | 0100 | | - 150 | 0150 | | - 200 | 0200 | | - 250 | 0250 | | + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | 300 | 0300 | 300 | 0300 - 350 | 0350 | | - 400 | 0400 | | + 350 | 0350 | | + 400 | 0400 | | (9 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; QUERY PLAN ----------------------------------------------------------------------------- - Sort - Sort Key: prt1_p1.a, b - -> Append + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Hash Full Join - Hash Cond: (prt1_p1.a = b) - Filter: ((prt1_p1.b = 0) OR (a = 0)) + Hash Cond: (a = b) + Filter: ((b = 0) OR (a = 0)) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 - Filter: (a < 450) - -> Hash - -> Result - One-Time Filter: false - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) - Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) - -> Seq Scan on prt1_p2 - Filter: (a < 450) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Seq Scan on prt1_p2 + Filter: (a < 450) -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Seq Scan on prt2_p2 - Filter: (b > 250) - -> Hash Full Join - Hash Cond: (prt2_p3.b = a) - Filter: ((b = 0) OR (prt2_p3.a = 0)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt2_p3 - Filter: (b > 250) - -> Hash - -> Result - One-Time Filter: false -(31 rows) + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) +(21 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; - a | c | b | c + a | c | b | c -----+------+-----+------ - 0 | 0000 | | - 50 | 0050 | | - 100 | 0100 | | - 150 | 0150 | | - 200 | 0200 | | - 250 | 0250 | | + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | 300 | 0300 | 300 | 0300 - 350 | 0350 | | - 400 | 0400 | | + 350 | 0350 | | + 400 | 0400 | | | | 375 | 0375 | | 450 | 0450 | | 525 | 0525 @@ -333,55 +304,37 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JO -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Append -> Seq Scan on prt2_p1 t2 Filter: (a = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index 
Cond: (a = t2.b) - Filter: (b = 0) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_1.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_1.b -> Seq Scan on prt2_p2 t2_1 Filter: (a = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = t2_1.b) - Filter: (b = 0) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_2.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_2.b -> Seq Scan on prt2_p3 t2_2 Filter: (a = 0) + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t2_2.b) + Index Cond: (a = b) Filter: (b = 0) -(46 rows) +(28 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -395,34 +348,24 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) -- Anti-join with aggregates EXPLAIN (COSTS OFF) SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); - QUERY PLAN --------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Append - -> Hash Anti Join - Hash Cond: (t1.a = t2.b) + -> Hash Anti Join + Hash Cond: (t1.a = b) + -> Append -> Seq Scan on prt1_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - -> Hash Anti Join - Hash Cond: (t1_1.a = t2_1.b) -> Seq Scan on prt1_p2 t1_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - -> Nested Loop Anti Join -> Seq Scan on prt1_p3 t1_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Only Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (b = t1_2.a) -(25 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(15 rows) SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); sum | avg | sum | avg @@ -435,24 +378,61 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; -ERROR: could not devise a query plan for the given query + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = a) + -> Bitmap Heap Scan on prt2_p3 t3_2 + Recheck Cond: (b = a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = a) +(34 rows) + SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; - a | b | c | t2a | t3a | least + a | b | c | t2a | t3a | least -----+---+------+-----+-----+------- 0 | 0 | 0000 | 0 | 0 | 0 - 50 | 0 | 0050 | | | - 100 | 0 | 0100 | | | + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | 150 | 0 | 0150 | 150 | 0 | 150 - 200 | 0 | 0200 | | | - 250 | 0 | 0250 | | | + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | 300 | 0 | 0300 | 300 | 0 | 300 - 350 | 0 | 0350 | | | - 400 | 0 | 0400 | | | + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | 450 | 0 | 0450 | 450 | 0 | 450 - 500 | 0 | 0500 | | | - 550 | 0 | 0550 | | | + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | (12 rows) EXPLAIN (COSTS OFF) @@ -464,42 +444,33 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a - -> Hash Right Join + -> Hash Left Join Hash Cond: ((c)::text = (c)::text) Filter: ((b + COALESCE(b, 0)) = 0) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Append - -> Hash Join - Hash Cond: (t2.a = t3.b) - -> Seq Scan on prt1_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t3 - -> Hash Join - Hash Cond: (t2_1.a = t3_1.b) - -> Seq Scan on prt1_p2 t2_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t3_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t3_2 - -> Index Scan using iprt1_p3_a on prt1_p3 t2_2 - Index Cond: (a = t3_2.b) + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c - -> Append - -> Seq Scan on prt1_p1 t1 - -> Seq Scan on prt1_p2 t1_1 - -> Seq Scan on prt1_p3 t1_2 -(36 rows) - -SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + -> Hash Join + Hash Cond: (t2.a = b) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(27 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; a | t2a | t2c @@ -538,34 +509,27 @@ INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; ANALYZE prt2_e; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 
WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ---------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Append - -> Hash Join - Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + Sort Key: a, t2.b + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((a + b) / 2)) + -> Append -> Seq Scan on prt2_e_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_e_p2 t2_1 + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_e_p1 t1 Filter: (c = 0) - -> Hash Join - Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) - -> Seq Scan on prt2_e_p2 t2_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_e_p2 t1_1 Filter: (c = 0) - -> Nested Loop - -> Seq Scan on prt2_e_p3 t2_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_2 - Index Cond: (((a + b) / 2) = ((t2_2.b + t2_2.a) / 2)) + -> Seq Scan on prt1_e_p3 t1_2 Filter: (c = 0) -(25 rows) +(18 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -581,59 +545,42 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = -- EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Result - -> Append - -> Nested Loop - Join Filter: (t1.a = (((t3.a + t3.b) / 2))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Sort Key: a + -> Nested Loop + Join Filter: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (b = t1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 - Index Cond: (((a + b) / 2) = t2.b) - -> Nested Loop - Join Filter: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (b = t1_1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 - Index Cond: (((a + 
b) / 2) = t2_1.b) - -> Nested Loop - Join Filter: (t1_2.a = t2_2.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = b) -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 - Index Cond: (((a + b) / 2) = t1_2.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (b = ((t3_2.a + t3_2.b) / 2)) -(50 rows) + Index Cond: (((a + b) / 2) = b) +(33 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c | ?column? | c @@ -646,58 +593,40 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 - Index Cond: (t1.a = ((a + b) / 2)) - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 - Index Cond: (t1_1.a = ((a + b) / 2)) - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (t1_2.a = b) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (t1_2.a = b) - -> Materialize + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: ((((a + b) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> 
Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 - Index Cond: (t1_2.a = ((a + b) / 2)) -(49 rows) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(31 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? | c @@ -718,59 +647,44 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p1 t3 - Filter: (c = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = ((t3.a + t3.b) / 2)) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p2 t3_1 - Filter: (c = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = ((t3_1.a + t3_1.b) / 2)) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Nested Loop Left Join + Sort Key: a, b, ((a + b)) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Right Join + Hash Cond: (a = (((a + b) / 2))) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p3 t3_2 - Filter: (c = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = ((t3_2.a + t3_2.b) / 2)) - -> Materialize + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (t1_2.a = b) -(50 rows) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c 
= 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (a = b) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (a = b) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (a = b) +(35 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? | c @@ -793,71 +707,49 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- make sure these go to null as expected EXPLAIN (COSTS OFF) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) - -> Result - -> Append + Sort Key: a, b, ((a + b)) + -> Hash Full Join + Hash Cond: (a = (((a + b) / 2))) + Filter: ((a = (50)) OR (b = (75)) OR (((a + b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Hash Full Join - Hash Cond: (prt1_p1.a = (((prt1_e_p1.a + prt1_e_p1.b) / 2))) - Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + Hash Cond: (a = b) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Append -> Seq Scan on prt1_p1 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p1 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: (prt1_p2.a = (((prt1_e_p2.a + prt1_e_p2.b) / 2))) - Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) -> Seq Scan on prt1_p2 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p2 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: (prt1_p3.a = (((prt1_e_p3.a + prt1_e_p3.b) / 2))) - Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p3.a = prt2_p3.b) -> Seq Scan on prt1_p3 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 - Filter: (a = 0) -> Hash - -> Remote 
Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) -> Seq Scan on prt1_e_p3 Filter: (c = 0) -(62 rows) +(40 rows) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; a | phv | b | phv | ?column? | phv @@ -873,66 +765,47 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER --------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - Join Filter: (t1.a = t1_3.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_3.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_3.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) + Sort Key: a + -> Nested Loop + Join Filter: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Append -> Seq Scan on prt2_p1 t1_3 Filter: (a = 0) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 - Index Cond: (((a + b) / 2) = t1_3.b) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = ((t2.a + t2.b) / 2)) - Filter: (b = 0) - -> Nested Loop - Join Filter: (t1_1.a = t1_4.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_4.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_4.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) -> Seq Scan on prt2_p2 t1_4 Filter: (a = 0) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 - Index Cond: (((a + b) / 2) = t1_4.b) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) - Filter: (b = 0) - -> Nested Loop - Join Filter: (t1_2.a = t1_5.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_5.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_5.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) -> Seq Scan on prt2_p3 t1_5 Filter: (a = 0) + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = b) -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 - Index Cond: (((a + b) / 
2) = t1_5.b) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) - Filter: (b = 0) -(61 rows) + Index Cond: (((a + b) / 2) = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) +(42 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -945,57 +818,49 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop Semi Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Nested Loop Semi Join - -> Index Only Scan using iprt2_p1_b on prt2_p1 t1_3 - Index Cond: (b = t1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t1_6 - Index Cond: (((a + b) / 2) = t1_3.b) - Filter: (c = 0) - -> Nested Loop Semi Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Nested Loop Semi Join - -> Index Only Scan using iprt2_p2_b on prt2_p2 t1_4 - Index Cond: (b = t1_1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t1_7 - Index Cond: (((a + b) / 2) = t1_4.b) - Filter: (c = 0) - -> Nested Loop Semi Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Nested Loop Semi Join - -> Bitmap Heap Scan on prt2_p3 t1_5 - Recheck Cond: (b = t1_2.a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = t1_2.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_8 - Index Cond: (((a + b) / 2) = t1_5.b) - Filter: (c = 0) -(48 rows) + -> HashAggregate + Group Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using 
iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) + Filter: (b = 0) +(40 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1011,67 +876,44 @@ SET enable_hashjoin TO off; SET enable_nestloop TO off; EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: t1.a - -> Append - -> Merge Semi Join - Merge Cond: (t1.a = t1_3.b) - -> Sort - Sort Key: t1.a + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) - -> Sort - Sort Key: t1_3.b - -> Seq Scan on prt2_p1 t1_3 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_6.a + t1_6.b) / 2)) - -> Seq Scan on prt1_e_p1 t1_6 - Filter: (c = 0) - -> Merge Semi Join - Merge Cond: (t1_1.a = t1_4.b) - -> Sort - Sort Key: t1_1.a -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) - -> Sort - Sort Key: t1_4.b - -> Seq Scan on prt2_p2 t1_4 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_7.a + t1_7.b) / 2)) - -> Seq Scan on prt1_e_p2 t1_7 - Filter: (c = 0) - -> Merge Semi Join - Merge Cond: (t1_2.a = t1_5.b) - -> Sort - Sort Key: t1_2.a -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((a + b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) - -> Sort - Sort Key: t1_5.b - -> Seq Scan on prt2_p3 t1_5 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) -> Seq Scan on prt1_e_p3 t1_8 Filter: (c = 0) -(58 rows) +(35 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1084,77 +926,50 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 
0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Merge Right Join - Merge Cond: (t1.a = (((t3.a + t3.b) / 2))) - -> Merge Left Join - Merge Cond: (t1.a = t2.b) - -> Sort - Sort Key: t1.a - -> Seq Scan on prt1_p1 t1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t2.b - -> Seq Scan on prt2_p1 t2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3.a + t3.b) / 2)) - -> Seq Scan on prt1_e_p1 t3 - Filter: (c = 0) - -> Merge Right Join - Merge Cond: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) - -> Merge Left Join - Merge Cond: (t1_1.a = t2_1.b) - -> Sort - Sort Key: t1_1.a - -> Seq Scan on prt1_p2 t1_1 - -> Materialize + Sort Key: a, b, ((a + b)) + -> Merge Right Join + Merge Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: a + -> Merge Right Join + Merge Cond: (a = (((a + b) / 2))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + Distribute results by H: a -> Sort - Sort Key: t2_1.b - -> Seq Scan on prt2_p2 t2_1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3_1.a + t3_1.b) / 2)) - -> Seq Scan on prt1_e_p2 t3_1 - Filter: (c = 0) - -> Merge Right Join - Merge Cond: (t2_2.b = t1_2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t2_2.b - -> Seq Scan on prt2_p3 t2_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1_2.a - -> Merge Left Join - Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3_2.a + t3_2.b) / 2)) - -> Seq Scan on prt1_e_p3 t3_2 - Filter: (c = 0) - -> Sort - Sort Key: t1_2.a + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 -> Seq Scan on prt1_p3 t1_2 -(68 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(41 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? 
| c @@ -1176,48 +991,46 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- MergeAppend on nullable column EXPLAIN (COSTS OFF) SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ - Sort - Sort Key: prt1_p1.a, b - -> Append - -> Merge Left Join - Merge Cond: (prt1_p1.a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: prt1_p1.a - -> Seq Scan on prt1_p1 - Filter: ((a < 450) AND (b = 0)) - -> Sort - Sort Key: b - -> Result - One-Time Filter: false + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Merge Right Join - Merge Cond: (prt2_p2.b = prt1_p2.a) + Merge Cond: (b = a) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Sort Sort Key: prt2_p2.b - -> Seq Scan on prt2_p2 - Filter: (b > 250) - -> Sort - Sort Key: prt1_p2.a - -> Seq Scan on prt1_p2 - Filter: ((a < 450) AND (b = 0)) -(26 rows) + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: prt1_p1.a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(24 rows) SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - a | b + a | b -----+----- - 0 | - 50 | - 100 | - 150 | - 200 | - 250 | + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | 300 | 300 - 350 | - 400 | + 350 | + 400 | (9 rows) RESET enable_hashjoin; @@ -1239,40 +1052,34 @@ INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; ANALYZE prt2_m; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: prt1_m_p1.a, prt2_m_p1.b - -> Append - -> Hash Full Join - Hash Cond: ((prt1_m_p1.a = (((prt2_m_p1.b + prt2_m_p1.a) / 2))) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) - -> Seq Scan on prt1_m_p1 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on prt2_m_p1 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: ((prt1_m_p2.a = (((prt2_m_p2.b + prt2_m_p2.a) / 2))) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) - -> Seq Scan on prt1_m_p2 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on prt2_m_p2 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: ((prt1_m_p3.a = (((prt2_m_p3.b + prt2_m_p3.a) / 2))) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) - -> Seq Scan on prt1_m_p3 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on 
prt2_m_p3 - Filter: (c = 0) -(31 rows) + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = (((b + a) / 2))) AND (((a + b) / 2) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; a | c | b | c @@ -1322,8 +1129,8 @@ ANALYZE plt1_e; -- test partition matching with N-way join EXPLAIN (COSTS OFF) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Finalize GroupAggregate Group Key: c, c, c @@ -1332,43 +1139,28 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, pl -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Partial HashAggregate - Group Key: t1.c, t2.c, t3.c - -> Result - -> Append - -> Hash Join - Hash Cond: (t1.c = t2.c) + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (ltrim(t3.c, 'A'::text) = c) + -> Append + -> Seq Scan on plt1_e_p1 t3 + -> Seq Scan on plt1_e_p2 t3_1 + -> Seq Scan on plt1_e_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt2_p1 t2 + -> Seq Scan on plt2_p2 t2_1 + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on plt1_p1 t1 - -> Hash - -> Hash Join - Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p1 t2 - -> Hash - -> Seq Scan on plt1_e_p1 t3 - -> Hash Join - Hash Cond: (t1_1.c = t2_1.c) - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on plt1_p2 t1_1 - -> Hash - -> Hash Join - Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p2 t2_1 - -> Hash - -> Seq Scan on plt1_e_p2 t3_1 - -> Hash Join - Hash Cond: (t1_2.c = t2_2.c) - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on plt1_p3 t1_2 - -> Hash - -> Hash Join - Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p3 t2_2 - -> Hash - -> Seq Scan on plt1_e_p3 t3_2 -(44 rows) +(29 rows) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; avg 
| avg | avg | c | c | c @@ -1406,36 +1198,27 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Hash Left Join Hash Cond: (b = a) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Hash Join - Hash Cond: (t3.a = t2.b) - -> Seq Scan on prt1_p1 t3 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - -> Hash Join - Hash Cond: (t3_1.a = t2_1.b) - -> Seq Scan on prt1_p2 t3_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + -> Hash Join + Hash Cond: (a = t2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 -> Seq Scan on prt2_p3 t2_2 - -> Index Only Scan using iprt1_p3_a on prt1_p3 t3_2 - Index Cond: (a = t2_2.b) -> Hash -> Result One-Time Filter: false -(27 rows) +(18 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; @@ -1485,8 +1268,8 @@ ANALYZE pht1_e; -- test partition matching with N-way join EXPLAIN (COSTS OFF) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Finalize GroupAggregate Group Key: c, c, c @@ -1495,43 +1278,28 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, ph -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Partial HashAggregate - Group Key: t1.c, t2.c, t3.c - -> Result - -> Append - -> Hash Join - Hash Cond: (t1.c = t2.c) + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (ltrim(t3.c, 'A'::text) = c) + -> Append + -> Seq Scan on pht1_e_p1 t3 + -> Seq Scan on pht1_e_p2 t3_1 + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p1 t1 - -> Hash - -> Hash Join - Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht2_p1 t2 - -> Hash - -> Seq Scan on pht1_e_p1 t3 - -> Hash Join - Hash Cond: (t1_1.c = t2_1.c) - -> Hash Join - Hash Cond: (t1_1.c = ltrim(t3_1.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p2 t1_1 - -> Hash - -> Seq Scan on pht1_e_p2 t3_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht2_p1 t2 -> Seq Scan on pht2_p2 t2_1 - -> Hash Join - Hash Cond: 
(t1_2.c = t2_2.c) - -> Hash Join - Hash Cond: (t1_2.c = ltrim(t3_2.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p3 t1_2 - -> Hash - -> Seq Scan on pht1_e_p3 t3_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on pht2_p3 t2_2 -(44 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht1_p1 t1 + -> Seq Scan on pht1_p2 t1_1 + -> Seq Scan on pht1_p3 t1_2 +(29 rows) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; avg | avg | avg | c | c | c @@ -1576,42 +1344,31 @@ ANALYZE prt2_l; -- inner join, qual covering only top-level partitions EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Hash Join - Hash Cond: (t2.b = t1.a) + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append -> Seq Scan on prt2_l_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_l_p1 t1 Filter: (b = 0) - -> Hash Join - Hash Cond: (t2_1.b = a) - -> Append - -> Seq Scan on prt2_l_p2_p1 t2_1 - -> Seq Scan on prt2_l_p2_p2 t2_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Nested Loop - Join Filter: (a = t2_3.b) - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - -> Seq Scan on prt2_l_p3_p2 t2_4 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_l_p3_p1 t1_3 - Filter: (b = 0) -(33 rows) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(22 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1625,50 +1382,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1 -- left join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Append - -> Hash Right Join - Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_l_p1 t2 - -> Hash - -> Seq Scan on prt1_l_p1 t1 - 
Filter: (b = 0) - -> Hash Right Join - Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p1 t2_1 - -> Hash - -> Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Hash Right Join - Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p2 t2_2 - -> Hash - -> Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Nested Loop Left Join - Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) -> Seq Scan on prt1_l_p3_p1 t1_3 Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - -> Seq Scan on prt2_l_p3_p2 t2_4 -(41 rows) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1690,50 +1431,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b -- right join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Hash Right Join - Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append -> Seq Scan on prt1_l_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p1 t2 - Filter: (a = 0) - -> Nested Loop Left Join - Join Filter: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p2_p1 t2_1 - Filter: (a = 0) -> Seq Scan on prt1_l_p2_p1 t1_1 - -> Hash Right Join - Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) -> Seq Scan on prt1_l_p2_p2 t1_2 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p2_p2 t2_2 - Filter: (a = 0) - -> Nested Loop Left Join - Join Filter: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - Filter: (a = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Append - -> Seq Scan on prt1_l_p3_p1 t1_3 - -> Seq Scan on prt1_l_p3_p2 t1_4 -(41 rows) + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Seq Scan on 
prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1751,53 +1476,37 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b -- full join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_l_p1.a, prt2_l_p1.b - -> Append - -> Hash Full Join - Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) - -> Seq Scan on prt1_l_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_l_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) - -> Seq Scan on prt1_l_p2_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) - -> Seq Scan on prt1_l_p2_p2 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p2 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Append - -> Seq Scan on prt1_l_p3_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 - Filter: (a = 0) -(44 rows) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(28 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; a | c | b | c @@ -1825,24 +1534,66 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; -ERROR: could not devise a query plan for the given query + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on 
all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(39 rows) + SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; - a | b | c | t2a | t2c | t2b | t3b | least + a | b | c | t2a | t2c | t2b | t3b | least -----+---+------+-----+------+-----+-----+------- 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 - 50 | 0 | 0002 | | | | | - 100 | 0 | 0000 | | | | | + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 - 200 | 0 | 0000 | | | | | - 250 | 0 | 0002 | | | | | + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 - 350 | 0 | 0002 | | | | | - 400 | 0 | 0000 | | | | | + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 - 500 | 0 | 0000 | | | | | - 550 | 0 | 0002 | | | | | + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | (12 rows) -- join with one side empty @@ -1914,34 +1665,25 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a ----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t1.a = t2.a) + Hash Cond: (t2.a = t1.a) -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash -> Hash Join - Hash Cond: (t1.a = t3.b) - -> Seq Scan on prt1_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t3 - -> Hash Join - Hash Cond: (t1_1.a = t3_1.b) - -> Seq Scan on prt1_p2 t1_1 + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t3_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t3_2 - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t3_2.b) - -> Hash - -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 -(29 rows) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(20 rows) -- 
partition-wise join can not be applied if there are no equi-join conditions -- between partition keys diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 4aa775e7..f84075e7 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -4,7 +4,7 @@ -- -- Enable partition-wise join, which by default is disabled. -SET enable_partition_wise_join to true; +--SET enable_partition_wise_join to true; -- -- partitioned by a single column From 622bf643e530d597d4897f26504e61f75120c848 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 17 Jul 2020 11:33:08 +0800 Subject: [PATCH 308/578] fix regress executor plan error --- src/test/regress/expected/join_6.out | 6227 ++++++++++++++ .../regress/expected/partition_join_2.out | 1819 ++++ src/test/regress/expected/subselect_2.out | 1164 +++ src/test/regress/expected/xc_groupby_3.out | 7513 +++++++++++++++++ 4 files changed, 16723 insertions(+) create mode 100644 src/test/regress/expected/join_6.out create mode 100644 src/test/regress/expected/partition_join_2.out create mode 100644 src/test/regress/expected/subselect_2.out create mode 100644 src/test/regress/expected/xc_groupby_3.out diff --git a/src/test/regress/expected/join_6.out b/src/test/regress/expected/join_6.out new file mode 100644 index 00000000..58736aa0 --- /dev/null +++ b/src/test/regress/expected/join_6.out @@ -0,0 +1,6227 @@ +-- +-- JOIN +-- Test JOIN clauses +-- +CREATE TABLE J1_TBL ( + i integer, + j integer, + t text +); +CREATE TABLE J2_TBL ( + i integer, + k integer +); +INSERT INTO J1_TBL VALUES (1, 4, 'one'); +INSERT INTO J1_TBL VALUES (2, 3, 'two'); +INSERT INTO J1_TBL VALUES (3, 2, 'three'); +INSERT INTO J1_TBL VALUES (4, 1, 'four'); +INSERT INTO J1_TBL VALUES (5, 0, 'five'); +INSERT INTO J1_TBL VALUES (6, 6, 'six'); +INSERT INTO J1_TBL VALUES (7, 7, 'seven'); +INSERT INTO J1_TBL VALUES (8, 8, 'eight'); +INSERT INTO J1_TBL VALUES (0, NULL, 'zero'); +INSERT INTO J1_TBL VALUES (NULL, NULL, 'null'); +INSERT INTO J1_TBL VALUES (NULL, 0, 'zero'); +INSERT INTO J2_TBL VALUES (1, -1); +INSERT INTO J2_TBL VALUES (2, 2); +INSERT INTO J2_TBL VALUES (3, -3); +INSERT INTO J2_TBL VALUES (2, 4); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (0, NULL); +INSERT INTO J2_TBL VALUES (NULL, NULL); +INSERT INTO J2_TBL VALUES (NULL, 0); +-- +-- CORRELATION NAMES +-- Make sure that table/column aliases are supported +-- before diving into more complex join syntax. 
+-- +SELECT '' AS "xxx", * + FROM J1_TBL AS tx + ORDER BY i, j, t; + xxx | i | j | t +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL tx + ORDER BY i, j, t; + xxx | i | j | t +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL AS t1 (a, b, c) + ORDER BY a, b, c; + xxx | a | b | c +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) + ORDER BY a, b, c; + xxx | a | b | c +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) + ORDER BY a, b, c, d, e; + xxx | a | b | c | d | e +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 0 | | zero | 1 | -1 + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | 3 | -3 + | 0 | | zero | 5 | -5 + | 0 | | zero | 5 | -5 + | 0 | | zero | | 0 + | 0 | | zero | | + | 1 | 4 | one | 0 | + | 1 | 4 | one | 1 | -1 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 1 | 4 | one | 3 | -3 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | | 0 + | 1 | 4 | one | | + | 2 | 3 | two | 0 | + | 2 | 3 | two | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 2 | 3 | two | 3 | -3 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | | 0 + | 2 | 3 | two | | + | 3 | 2 | three | 0 | + | 3 | 2 | three | 1 | -1 + | 3 | 2 | three | 2 | 2 + | 3 | 2 | three | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | | 0 + | 3 | 2 | three | | + | 4 | 1 | four | 0 | + | 4 | 1 | four | 1 | -1 + | 4 | 1 | four | 2 | 2 + | 4 | 1 | four | 2 | 4 + | 4 | 1 | four | 3 | -3 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | | 0 + | 4 | 1 | four | | + | 5 | 0 | five | 0 | + | 5 | 0 | five | 1 | -1 + | 5 | 0 | five | 2 | 2 + | 5 | 0 | five | 2 | 4 + | 5 | 0 | five | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | | 0 + | 5 | 0 | five | | + | 6 | 6 | six | 0 | + | 6 | 6 | six | 1 | -1 + | 6 | 6 | six | 2 | 2 + | 6 | 6 | six | 2 | 4 + | 6 | 6 | six | 3 | -3 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | | 0 + | 6 | 6 | six | | + | 7 | 7 | seven | 0 | + | 7 | 7 | seven | 1 | -1 + | 7 | 7 | seven | 2 | 2 + | 7 | 7 | seven | 2 | 4 + | 7 | 7 | seven | 3 | -3 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | | 0 + | 7 | 7 | seven | | + | 8 | 8 | eight | 0 | + | 8 | 8 | eight | 1 | -1 + | 8 | 8 | eight | 2 | 2 + | 8 | 8 | eight | 2 | 4 + | 8 | 8 | eight | 3 | -3 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | | 0 + | 8 | 8 | eight | | + | | 0 | zero | 0 | + | | 0 | zero | 1 | -1 + | | 0 | zero | 2 | 2 + | | 0 | zero | 2 | 4 + | | 0 | zero | 3 | -3 + | | 0 | zero | 5 | -5 + | | 0 | zero | 5 | -5 + | | 0 | zero | | 0 + | | 0 | zero | | + | | | null | 0 | + | | | 
null | 1 | -1 + | | | null | 2 | 2 + | | | null | 2 | 4 + | | | null | 3 | -3 + | | | null | 5 | -5 + | | | null | 5 | -5 + | | | null | | 0 + | | | null | | +(99 rows) + +SELECT '' AS "xxx", t1.a, t2.e + FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) + WHERE t1.a = t2.d + ORDER BY a, e; + xxx | a | e +-----+---+---- + | 0 | + | 1 | -1 + | 2 | 2 + | 2 | 4 + | 3 | -3 + | 5 | -5 + | 5 | -5 +(7 rows) + +-- +-- CROSS JOIN +-- Qualifications are not allowed on cross joins, +-- which degenerate into a standard unqualified inner join. +-- +SELECT '' AS "xxx", * + FROM J1_TBL CROSS JOIN J2_TBL + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 0 | | zero | 1 | -1 + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | 3 | -3 + | 0 | | zero | 5 | -5 + | 0 | | zero | 5 | -5 + | 0 | | zero | | 0 + | 0 | | zero | | + | 1 | 4 | one | 0 | + | 1 | 4 | one | 1 | -1 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 1 | 4 | one | 3 | -3 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | | 0 + | 1 | 4 | one | | + | 2 | 3 | two | 0 | + | 2 | 3 | two | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 2 | 3 | two | 3 | -3 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | | 0 + | 2 | 3 | two | | + | 3 | 2 | three | 0 | + | 3 | 2 | three | 1 | -1 + | 3 | 2 | three | 2 | 2 + | 3 | 2 | three | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | | 0 + | 3 | 2 | three | | + | 4 | 1 | four | 0 | + | 4 | 1 | four | 1 | -1 + | 4 | 1 | four | 2 | 2 + | 4 | 1 | four | 2 | 4 + | 4 | 1 | four | 3 | -3 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | | 0 + | 4 | 1 | four | | + | 5 | 0 | five | 0 | + | 5 | 0 | five | 1 | -1 + | 5 | 0 | five | 2 | 2 + | 5 | 0 | five | 2 | 4 + | 5 | 0 | five | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | | 0 + | 5 | 0 | five | | + | 6 | 6 | six | 0 | + | 6 | 6 | six | 1 | -1 + | 6 | 6 | six | 2 | 2 + | 6 | 6 | six | 2 | 4 + | 6 | 6 | six | 3 | -3 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | | 0 + | 6 | 6 | six | | + | 7 | 7 | seven | 0 | + | 7 | 7 | seven | 1 | -1 + | 7 | 7 | seven | 2 | 2 + | 7 | 7 | seven | 2 | 4 + | 7 | 7 | seven | 3 | -3 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | | 0 + | 7 | 7 | seven | | + | 8 | 8 | eight | 0 | + | 8 | 8 | eight | 1 | -1 + | 8 | 8 | eight | 2 | 2 + | 8 | 8 | eight | 2 | 4 + | 8 | 8 | eight | 3 | -3 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | | 0 + | 8 | 8 | eight | | + | | 0 | zero | 0 | + | | 0 | zero | 1 | -1 + | | 0 | zero | 2 | 2 + | | 0 | zero | 2 | 4 + | | 0 | zero | 3 | -3 + | | 0 | zero | 5 | -5 + | | 0 | zero | 5 | -5 + | | 0 | zero | | 0 + | | 0 | zero | | + | | | null | 0 | + | | | null | 1 | -1 + | | | null | 2 | 2 + | | | null | 2 | 4 + | | | null | 3 | -3 + | | | null | 5 | -5 + | | | null | 5 | -5 + | | | null | | 0 + | | | null | | +(99 rows) + +-- ambiguous column +SELECT '' AS "xxx", i, k, t + FROM J1_TBL CROSS JOIN J2_TBL; +ERROR: column reference "i" is ambiguous +LINE 1: SELECT '' AS "xxx", i, k, t + ^ +-- resolve previous ambiguity by specifying the table name +SELECT '' AS "xxx", t1.i, k, t + FROM J1_TBL t1 CROSS JOIN J2_TBL t2 + ORDER BY i, k, t; + xxx | i | k | t +-----+---+----+------- + | 0 | -5 | zero + | 0 | -5 | zero + | 0 | -3 | zero + | 0 | -1 | zero + | 0 | 0 | zero + | 0 | 2 | zero + | 0 | 4 
| zero + | 0 | | zero + | 0 | | zero + | 1 | -5 | one + | 1 | -5 | one + | 1 | -3 | one + | 1 | -1 | one + | 1 | 0 | one + | 1 | 2 | one + | 1 | 4 | one + | 1 | | one + | 1 | | one + | 2 | -5 | two + | 2 | -5 | two + | 2 | -3 | two + | 2 | -1 | two + | 2 | 0 | two + | 2 | 2 | two + | 2 | 4 | two + | 2 | | two + | 2 | | two + | 3 | -5 | three + | 3 | -5 | three + | 3 | -3 | three + | 3 | -1 | three + | 3 | 0 | three + | 3 | 2 | three + | 3 | 4 | three + | 3 | | three + | 3 | | three + | 4 | -5 | four + | 4 | -5 | four + | 4 | -3 | four + | 4 | -1 | four + | 4 | 0 | four + | 4 | 2 | four + | 4 | 4 | four + | 4 | | four + | 4 | | four + | 5 | -5 | five + | 5 | -5 | five + | 5 | -3 | five + | 5 | -1 | five + | 5 | 0 | five + | 5 | 2 | five + | 5 | 4 | five + | 5 | | five + | 5 | | five + | 6 | -5 | six + | 6 | -5 | six + | 6 | -3 | six + | 6 | -1 | six + | 6 | 0 | six + | 6 | 2 | six + | 6 | 4 | six + | 6 | | six + | 6 | | six + | 7 | -5 | seven + | 7 | -5 | seven + | 7 | -3 | seven + | 7 | -1 | seven + | 7 | 0 | seven + | 7 | 2 | seven + | 7 | 4 | seven + | 7 | | seven + | 7 | | seven + | 8 | -5 | eight + | 8 | -5 | eight + | 8 | -3 | eight + | 8 | -1 | eight + | 8 | 0 | eight + | 8 | 2 | eight + | 8 | 4 | eight + | 8 | | eight + | 8 | | eight + | | -5 | null + | | -5 | null + | | -5 | zero + | | -5 | zero + | | -3 | null + | | -3 | zero + | | -1 | null + | | -1 | zero + | | 0 | null + | | 0 | zero + | | 2 | null + | | 2 | zero + | | 4 | null + | | 4 | zero + | | | null + | | | null + | | | zero + | | | zero +(99 rows) + +SELECT '' AS "xxx", ii, tt, kk + FROM (J1_TBL CROSS JOIN J2_TBL) + AS tx (ii, jj, tt, ii2, kk) + ORDER BY ii, tt, kk; + xxx | ii | tt | kk +-----+----+-------+---- + | 0 | zero | -5 + | 0 | zero | -5 + | 0 | zero | -3 + | 0 | zero | -1 + | 0 | zero | 0 + | 0 | zero | 2 + | 0 | zero | 4 + | 0 | zero | + | 0 | zero | + | 1 | one | -5 + | 1 | one | -5 + | 1 | one | -3 + | 1 | one | -1 + | 1 | one | 0 + | 1 | one | 2 + | 1 | one | 4 + | 1 | one | + | 1 | one | + | 2 | two | -5 + | 2 | two | -5 + | 2 | two | -3 + | 2 | two | -1 + | 2 | two | 0 + | 2 | two | 2 + | 2 | two | 4 + | 2 | two | + | 2 | two | + | 3 | three | -5 + | 3 | three | -5 + | 3 | three | -3 + | 3 | three | -1 + | 3 | three | 0 + | 3 | three | 2 + | 3 | three | 4 + | 3 | three | + | 3 | three | + | 4 | four | -5 + | 4 | four | -5 + | 4 | four | -3 + | 4 | four | -1 + | 4 | four | 0 + | 4 | four | 2 + | 4 | four | 4 + | 4 | four | + | 4 | four | + | 5 | five | -5 + | 5 | five | -5 + | 5 | five | -3 + | 5 | five | -1 + | 5 | five | 0 + | 5 | five | 2 + | 5 | five | 4 + | 5 | five | + | 5 | five | + | 6 | six | -5 + | 6 | six | -5 + | 6 | six | -3 + | 6 | six | -1 + | 6 | six | 0 + | 6 | six | 2 + | 6 | six | 4 + | 6 | six | + | 6 | six | + | 7 | seven | -5 + | 7 | seven | -5 + | 7 | seven | -3 + | 7 | seven | -1 + | 7 | seven | 0 + | 7 | seven | 2 + | 7 | seven | 4 + | 7 | seven | + | 7 | seven | + | 8 | eight | -5 + | 8 | eight | -5 + | 8 | eight | -3 + | 8 | eight | -1 + | 8 | eight | 0 + | 8 | eight | 2 + | 8 | eight | 4 + | 8 | eight | + | 8 | eight | + | | null | -5 + | | null | -5 + | | null | -3 + | | null | -1 + | | null | 0 + | | null | 2 + | | null | 4 + | | null | + | | null | + | | zero | -5 + | | zero | -5 + | | zero | -3 + | | zero | -1 + | | zero | 0 + | | zero | 2 + | | zero | 4 + | | zero | + | | zero | +(99 rows) + +SELECT '' AS "xxx", tx.ii, tx.jj, tx.kk + FROM (J1_TBL t1 (a, b, c) CROSS JOIN J2_TBL t2 (d, e)) + AS tx (ii, jj, tt, ii2, kk) + ORDER BY ii, jj, kk; + xxx | ii | jj | kk 
+-----+----+----+---- + | 0 | | -5 + | 0 | | -5 + | 0 | | -3 + | 0 | | -1 + | 0 | | 0 + | 0 | | 2 + | 0 | | 4 + | 0 | | + | 0 | | + | 1 | 4 | -5 + | 1 | 4 | -5 + | 1 | 4 | -3 + | 1 | 4 | -1 + | 1 | 4 | 0 + | 1 | 4 | 2 + | 1 | 4 | 4 + | 1 | 4 | + | 1 | 4 | + | 2 | 3 | -5 + | 2 | 3 | -5 + | 2 | 3 | -3 + | 2 | 3 | -1 + | 2 | 3 | 0 + | 2 | 3 | 2 + | 2 | 3 | 4 + | 2 | 3 | + | 2 | 3 | + | 3 | 2 | -5 + | 3 | 2 | -5 + | 3 | 2 | -3 + | 3 | 2 | -1 + | 3 | 2 | 0 + | 3 | 2 | 2 + | 3 | 2 | 4 + | 3 | 2 | + | 3 | 2 | + | 4 | 1 | -5 + | 4 | 1 | -5 + | 4 | 1 | -3 + | 4 | 1 | -1 + | 4 | 1 | 0 + | 4 | 1 | 2 + | 4 | 1 | 4 + | 4 | 1 | + | 4 | 1 | + | 5 | 0 | -5 + | 5 | 0 | -5 + | 5 | 0 | -3 + | 5 | 0 | -1 + | 5 | 0 | 0 + | 5 | 0 | 2 + | 5 | 0 | 4 + | 5 | 0 | + | 5 | 0 | + | 6 | 6 | -5 + | 6 | 6 | -5 + | 6 | 6 | -3 + | 6 | 6 | -1 + | 6 | 6 | 0 + | 6 | 6 | 2 + | 6 | 6 | 4 + | 6 | 6 | + | 6 | 6 | + | 7 | 7 | -5 + | 7 | 7 | -5 + | 7 | 7 | -3 + | 7 | 7 | -1 + | 7 | 7 | 0 + | 7 | 7 | 2 + | 7 | 7 | 4 + | 7 | 7 | + | 7 | 7 | + | 8 | 8 | -5 + | 8 | 8 | -5 + | 8 | 8 | -3 + | 8 | 8 | -1 + | 8 | 8 | 0 + | 8 | 8 | 2 + | 8 | 8 | 4 + | 8 | 8 | + | 8 | 8 | + | | 0 | -5 + | | 0 | -5 + | | 0 | -3 + | | 0 | -1 + | | 0 | 0 + | | 0 | 2 + | | 0 | 4 + | | 0 | + | | 0 | + | | | -5 + | | | -5 + | | | -3 + | | | -1 + | | | 0 + | | | 2 + | | | 4 + | | | + | | | +(99 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL CROSS JOIN J2_TBL a CROSS JOIN J2_TBL b + ORDER BY J1_TBL.i,J1_TBL.j,J1_TBL.t,a.i,a.k,b.i,b.k; + xxx | i | j | t | i | k | i | k +-----+---+---+-------+---+----+---+---- + | 0 | | zero | 0 | | 0 | + | 0 | | zero | 0 | | 1 | -1 + | 0 | | zero | 0 | | 2 | 2 + | 0 | | zero | 0 | | 2 | 4 + | 0 | | zero | 0 | | 3 | -3 + | 0 | | zero | 0 | | 5 | -5 + | 0 | | zero | 0 | | 5 | -5 + | 0 | | zero | 0 | | | 0 + | 0 | | zero | 0 | | | + | 0 | | zero | 1 | -1 | 0 | + | 0 | | zero | 1 | -1 | 1 | -1 + | 0 | | zero | 1 | -1 | 2 | 2 + | 0 | | zero | 1 | -1 | 2 | 4 + | 0 | | zero | 1 | -1 | 3 | -3 + | 0 | | zero | 1 | -1 | 5 | -5 + | 0 | | zero | 1 | -1 | 5 | -5 + | 0 | | zero | 1 | -1 | | 0 + | 0 | | zero | 1 | -1 | | + | 0 | | zero | 2 | 2 | 0 | + | 0 | | zero | 2 | 2 | 1 | -1 + | 0 | | zero | 2 | 2 | 2 | 2 + | 0 | | zero | 2 | 2 | 2 | 4 + | 0 | | zero | 2 | 2 | 3 | -3 + | 0 | | zero | 2 | 2 | 5 | -5 + | 0 | | zero | 2 | 2 | 5 | -5 + | 0 | | zero | 2 | 2 | | 0 + | 0 | | zero | 2 | 2 | | + | 0 | | zero | 2 | 4 | 0 | + | 0 | | zero | 2 | 4 | 1 | -1 + | 0 | | zero | 2 | 4 | 2 | 2 + | 0 | | zero | 2 | 4 | 2 | 4 + | 0 | | zero | 2 | 4 | 3 | -3 + | 0 | | zero | 2 | 4 | 5 | -5 + | 0 | | zero | 2 | 4 | 5 | -5 + | 0 | | zero | 2 | 4 | | 0 + | 0 | | zero | 2 | 4 | | + | 0 | | zero | 3 | -3 | 0 | + | 0 | | zero | 3 | -3 | 1 | -1 + | 0 | | zero | 3 | -3 | 2 | 2 + | 0 | | zero | 3 | -3 | 2 | 4 + | 0 | | zero | 3 | -3 | 3 | -3 + | 0 | | zero | 3 | -3 | 5 | -5 + | 0 | | zero | 3 | -3 | 5 | -5 + | 0 | | zero | 3 | -3 | | 0 + | 0 | | zero | 3 | -3 | | + | 0 | | zero | 5 | -5 | 0 | + | 0 | | zero | 5 | -5 | 0 | + | 0 | | zero | 5 | -5 | 1 | -1 + | 0 | | zero | 5 | -5 | 1 | -1 + | 0 | | zero | 5 | -5 | 2 | 2 + | 0 | | zero | 5 | -5 | 2 | 2 + | 0 | | zero | 5 | -5 | 2 | 4 + | 0 | | zero | 5 | -5 | 2 | 4 + | 0 | | zero | 5 | -5 | 3 | -3 + | 0 | | zero | 5 | -5 | 3 | -3 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | | 0 + | 0 | | zero | 5 | -5 | | 0 + | 0 | | zero | 5 | -5 | | + | 0 | | zero | 5 | -5 | | + | 0 | | zero | | 0 | 0 | + | 0 | | zero | | 0 | 1 | 
-1 + | 0 | | zero | | 0 | 2 | 2 + | 0 | | zero | | 0 | 2 | 4 + | 0 | | zero | | 0 | 3 | -3 + | 0 | | zero | | 0 | 5 | -5 + | 0 | | zero | | 0 | 5 | -5 + | 0 | | zero | | 0 | | 0 + | 0 | | zero | | 0 | | + | 0 | | zero | | | 0 | + | 0 | | zero | | | 1 | -1 + | 0 | | zero | | | 2 | 2 + | 0 | | zero | | | 2 | 4 + | 0 | | zero | | | 3 | -3 + | 0 | | zero | | | 5 | -5 + | 0 | | zero | | | 5 | -5 + | 0 | | zero | | | | 0 + | 0 | | zero | | | | + | 1 | 4 | one | 0 | | 0 | + | 1 | 4 | one | 0 | | 1 | -1 + | 1 | 4 | one | 0 | | 2 | 2 + | 1 | 4 | one | 0 | | 2 | 4 + | 1 | 4 | one | 0 | | 3 | -3 + | 1 | 4 | one | 0 | | 5 | -5 + | 1 | 4 | one | 0 | | 5 | -5 + | 1 | 4 | one | 0 | | | 0 + | 1 | 4 | one | 0 | | | + | 1 | 4 | one | 1 | -1 | 0 | + | 1 | 4 | one | 1 | -1 | 1 | -1 + | 1 | 4 | one | 1 | -1 | 2 | 2 + | 1 | 4 | one | 1 | -1 | 2 | 4 + | 1 | 4 | one | 1 | -1 | 3 | -3 + | 1 | 4 | one | 1 | -1 | 5 | -5 + | 1 | 4 | one | 1 | -1 | 5 | -5 + | 1 | 4 | one | 1 | -1 | | 0 + | 1 | 4 | one | 1 | -1 | | + | 1 | 4 | one | 2 | 2 | 0 | + | 1 | 4 | one | 2 | 2 | 1 | -1 + | 1 | 4 | one | 2 | 2 | 2 | 2 + | 1 | 4 | one | 2 | 2 | 2 | 4 + | 1 | 4 | one | 2 | 2 | 3 | -3 + | 1 | 4 | one | 2 | 2 | 5 | -5 + | 1 | 4 | one | 2 | 2 | 5 | -5 + | 1 | 4 | one | 2 | 2 | | 0 + | 1 | 4 | one | 2 | 2 | | + | 1 | 4 | one | 2 | 4 | 0 | + | 1 | 4 | one | 2 | 4 | 1 | -1 + | 1 | 4 | one | 2 | 4 | 2 | 2 + | 1 | 4 | one | 2 | 4 | 2 | 4 + | 1 | 4 | one | 2 | 4 | 3 | -3 + | 1 | 4 | one | 2 | 4 | 5 | -5 + | 1 | 4 | one | 2 | 4 | 5 | -5 + | 1 | 4 | one | 2 | 4 | | 0 + | 1 | 4 | one | 2 | 4 | | + | 1 | 4 | one | 3 | -3 | 0 | + | 1 | 4 | one | 3 | -3 | 1 | -1 + | 1 | 4 | one | 3 | -3 | 2 | 2 + | 1 | 4 | one | 3 | -3 | 2 | 4 + | 1 | 4 | one | 3 | -3 | 3 | -3 + | 1 | 4 | one | 3 | -3 | 5 | -5 + | 1 | 4 | one | 3 | -3 | 5 | -5 + | 1 | 4 | one | 3 | -3 | | 0 + | 1 | 4 | one | 3 | -3 | | + | 1 | 4 | one | 5 | -5 | 0 | + | 1 | 4 | one | 5 | -5 | 0 | + | 1 | 4 | one | 5 | -5 | 1 | -1 + | 1 | 4 | one | 5 | -5 | 1 | -1 + | 1 | 4 | one | 5 | -5 | 2 | 2 + | 1 | 4 | one | 5 | -5 | 2 | 2 + | 1 | 4 | one | 5 | -5 | 2 | 4 + | 1 | 4 | one | 5 | -5 | 2 | 4 + | 1 | 4 | one | 5 | -5 | 3 | -3 + | 1 | 4 | one | 5 | -5 | 3 | -3 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | | 0 + | 1 | 4 | one | 5 | -5 | | 0 + | 1 | 4 | one | 5 | -5 | | + | 1 | 4 | one | 5 | -5 | | + | 1 | 4 | one | | 0 | 0 | + | 1 | 4 | one | | 0 | 1 | -1 + | 1 | 4 | one | | 0 | 2 | 2 + | 1 | 4 | one | | 0 | 2 | 4 + | 1 | 4 | one | | 0 | 3 | -3 + | 1 | 4 | one | | 0 | 5 | -5 + | 1 | 4 | one | | 0 | 5 | -5 + | 1 | 4 | one | | 0 | | 0 + | 1 | 4 | one | | 0 | | + | 1 | 4 | one | | | 0 | + | 1 | 4 | one | | | 1 | -1 + | 1 | 4 | one | | | 2 | 2 + | 1 | 4 | one | | | 2 | 4 + | 1 | 4 | one | | | 3 | -3 + | 1 | 4 | one | | | 5 | -5 + | 1 | 4 | one | | | 5 | -5 + | 1 | 4 | one | | | | 0 + | 1 | 4 | one | | | | + | 2 | 3 | two | 0 | | 0 | + | 2 | 3 | two | 0 | | 1 | -1 + | 2 | 3 | two | 0 | | 2 | 2 + | 2 | 3 | two | 0 | | 2 | 4 + | 2 | 3 | two | 0 | | 3 | -3 + | 2 | 3 | two | 0 | | 5 | -5 + | 2 | 3 | two | 0 | | 5 | -5 + | 2 | 3 | two | 0 | | | 0 + | 2 | 3 | two | 0 | | | + | 2 | 3 | two | 1 | -1 | 0 | + | 2 | 3 | two | 1 | -1 | 1 | -1 + | 2 | 3 | two | 1 | -1 | 2 | 2 + | 2 | 3 | two | 1 | -1 | 2 | 4 + | 2 | 3 | two | 1 | -1 | 3 | -3 + | 2 | 3 | two | 1 | -1 | 5 | -5 + | 2 | 3 | two | 1 | -1 | 5 | -5 + | 2 | 3 | two | 1 | -1 | | 0 + | 2 | 3 | two | 1 | -1 | | + | 2 | 3 | two | 2 | 2 | 0 | + | 
2 | 3 | two | 2 | 2 | 1 | -1 + | 2 | 3 | two | 2 | 2 | 2 | 2 + | 2 | 3 | two | 2 | 2 | 2 | 4 + | 2 | 3 | two | 2 | 2 | 3 | -3 + | 2 | 3 | two | 2 | 2 | 5 | -5 + | 2 | 3 | two | 2 | 2 | 5 | -5 + | 2 | 3 | two | 2 | 2 | | 0 + | 2 | 3 | two | 2 | 2 | | + | 2 | 3 | two | 2 | 4 | 0 | + | 2 | 3 | two | 2 | 4 | 1 | -1 + | 2 | 3 | two | 2 | 4 | 2 | 2 + | 2 | 3 | two | 2 | 4 | 2 | 4 + | 2 | 3 | two | 2 | 4 | 3 | -3 + | 2 | 3 | two | 2 | 4 | 5 | -5 + | 2 | 3 | two | 2 | 4 | 5 | -5 + | 2 | 3 | two | 2 | 4 | | 0 + | 2 | 3 | two | 2 | 4 | | + | 2 | 3 | two | 3 | -3 | 0 | + | 2 | 3 | two | 3 | -3 | 1 | -1 + | 2 | 3 | two | 3 | -3 | 2 | 2 + | 2 | 3 | two | 3 | -3 | 2 | 4 + | 2 | 3 | two | 3 | -3 | 3 | -3 + | 2 | 3 | two | 3 | -3 | 5 | -5 + | 2 | 3 | two | 3 | -3 | 5 | -5 + | 2 | 3 | two | 3 | -3 | | 0 + | 2 | 3 | two | 3 | -3 | | + | 2 | 3 | two | 5 | -5 | 0 | + | 2 | 3 | two | 5 | -5 | 0 | + | 2 | 3 | two | 5 | -5 | 1 | -1 + | 2 | 3 | two | 5 | -5 | 1 | -1 + | 2 | 3 | two | 5 | -5 | 2 | 2 + | 2 | 3 | two | 5 | -5 | 2 | 2 + | 2 | 3 | two | 5 | -5 | 2 | 4 + | 2 | 3 | two | 5 | -5 | 2 | 4 + | 2 | 3 | two | 5 | -5 | 3 | -3 + | 2 | 3 | two | 5 | -5 | 3 | -3 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | | 0 + | 2 | 3 | two | 5 | -5 | | 0 + | 2 | 3 | two | 5 | -5 | | + | 2 | 3 | two | 5 | -5 | | + | 2 | 3 | two | | 0 | 0 | + | 2 | 3 | two | | 0 | 1 | -1 + | 2 | 3 | two | | 0 | 2 | 2 + | 2 | 3 | two | | 0 | 2 | 4 + | 2 | 3 | two | | 0 | 3 | -3 + | 2 | 3 | two | | 0 | 5 | -5 + | 2 | 3 | two | | 0 | 5 | -5 + | 2 | 3 | two | | 0 | | 0 + | 2 | 3 | two | | 0 | | + | 2 | 3 | two | | | 0 | + | 2 | 3 | two | | | 1 | -1 + | 2 | 3 | two | | | 2 | 2 + | 2 | 3 | two | | | 2 | 4 + | 2 | 3 | two | | | 3 | -3 + | 2 | 3 | two | | | 5 | -5 + | 2 | 3 | two | | | 5 | -5 + | 2 | 3 | two | | | | 0 + | 2 | 3 | two | | | | + | 3 | 2 | three | 0 | | 0 | + | 3 | 2 | three | 0 | | 1 | -1 + | 3 | 2 | three | 0 | | 2 | 2 + | 3 | 2 | three | 0 | | 2 | 4 + | 3 | 2 | three | 0 | | 3 | -3 + | 3 | 2 | three | 0 | | 5 | -5 + | 3 | 2 | three | 0 | | 5 | -5 + | 3 | 2 | three | 0 | | | 0 + | 3 | 2 | three | 0 | | | + | 3 | 2 | three | 1 | -1 | 0 | + | 3 | 2 | three | 1 | -1 | 1 | -1 + | 3 | 2 | three | 1 | -1 | 2 | 2 + | 3 | 2 | three | 1 | -1 | 2 | 4 + | 3 | 2 | three | 1 | -1 | 3 | -3 + | 3 | 2 | three | 1 | -1 | 5 | -5 + | 3 | 2 | three | 1 | -1 | 5 | -5 + | 3 | 2 | three | 1 | -1 | | 0 + | 3 | 2 | three | 1 | -1 | | + | 3 | 2 | three | 2 | 2 | 0 | + | 3 | 2 | three | 2 | 2 | 1 | -1 + | 3 | 2 | three | 2 | 2 | 2 | 2 + | 3 | 2 | three | 2 | 2 | 2 | 4 + | 3 | 2 | three | 2 | 2 | 3 | -3 + | 3 | 2 | three | 2 | 2 | 5 | -5 + | 3 | 2 | three | 2 | 2 | 5 | -5 + | 3 | 2 | three | 2 | 2 | | 0 + | 3 | 2 | three | 2 | 2 | | + | 3 | 2 | three | 2 | 4 | 0 | + | 3 | 2 | three | 2 | 4 | 1 | -1 + | 3 | 2 | three | 2 | 4 | 2 | 2 + | 3 | 2 | three | 2 | 4 | 2 | 4 + | 3 | 2 | three | 2 | 4 | 3 | -3 + | 3 | 2 | three | 2 | 4 | 5 | -5 + | 3 | 2 | three | 2 | 4 | 5 | -5 + | 3 | 2 | three | 2 | 4 | | 0 + | 3 | 2 | three | 2 | 4 | | + | 3 | 2 | three | 3 | -3 | 0 | + | 3 | 2 | three | 3 | -3 | 1 | -1 + | 3 | 2 | three | 3 | -3 | 2 | 2 + | 3 | 2 | three | 3 | -3 | 2 | 4 + | 3 | 2 | three | 3 | -3 | 3 | -3 + | 3 | 2 | three | 3 | -3 | 5 | -5 + | 3 | 2 | three | 3 | -3 | 5 | -5 + | 3 | 2 | three | 3 | -3 | | 0 + | 3 | 2 | three | 3 | -3 | | + | 3 | 2 | three | 5 | -5 | 0 | + | 3 | 2 | three | 5 | -5 | 0 | + | 3 | 2 | three | 5 | -5 | 1 | -1 + | 3 | 2 | 
three | 5 | -5 | 1 | -1 + | 3 | 2 | three | 5 | -5 | 2 | 2 + | 3 | 2 | three | 5 | -5 | 2 | 2 + | 3 | 2 | three | 5 | -5 | 2 | 4 + | 3 | 2 | three | 5 | -5 | 2 | 4 + | 3 | 2 | three | 5 | -5 | 3 | -3 + | 3 | 2 | three | 5 | -5 | 3 | -3 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | | 0 + | 3 | 2 | three | 5 | -5 | | 0 + | 3 | 2 | three | 5 | -5 | | + | 3 | 2 | three | 5 | -5 | | + | 3 | 2 | three | | 0 | 0 | + | 3 | 2 | three | | 0 | 1 | -1 + | 3 | 2 | three | | 0 | 2 | 2 + | 3 | 2 | three | | 0 | 2 | 4 + | 3 | 2 | three | | 0 | 3 | -3 + | 3 | 2 | three | | 0 | 5 | -5 + | 3 | 2 | three | | 0 | 5 | -5 + | 3 | 2 | three | | 0 | | 0 + | 3 | 2 | three | | 0 | | + | 3 | 2 | three | | | 0 | + | 3 | 2 | three | | | 1 | -1 + | 3 | 2 | three | | | 2 | 2 + | 3 | 2 | three | | | 2 | 4 + | 3 | 2 | three | | | 3 | -3 + | 3 | 2 | three | | | 5 | -5 + | 3 | 2 | three | | | 5 | -5 + | 3 | 2 | three | | | | 0 + | 3 | 2 | three | | | | + | 4 | 1 | four | 0 | | 0 | + | 4 | 1 | four | 0 | | 1 | -1 + | 4 | 1 | four | 0 | | 2 | 2 + | 4 | 1 | four | 0 | | 2 | 4 + | 4 | 1 | four | 0 | | 3 | -3 + | 4 | 1 | four | 0 | | 5 | -5 + | 4 | 1 | four | 0 | | 5 | -5 + | 4 | 1 | four | 0 | | | 0 + | 4 | 1 | four | 0 | | | + | 4 | 1 | four | 1 | -1 | 0 | + | 4 | 1 | four | 1 | -1 | 1 | -1 + | 4 | 1 | four | 1 | -1 | 2 | 2 + | 4 | 1 | four | 1 | -1 | 2 | 4 + | 4 | 1 | four | 1 | -1 | 3 | -3 + | 4 | 1 | four | 1 | -1 | 5 | -5 + | 4 | 1 | four | 1 | -1 | 5 | -5 + | 4 | 1 | four | 1 | -1 | | 0 + | 4 | 1 | four | 1 | -1 | | + | 4 | 1 | four | 2 | 2 | 0 | + | 4 | 1 | four | 2 | 2 | 1 | -1 + | 4 | 1 | four | 2 | 2 | 2 | 2 + | 4 | 1 | four | 2 | 2 | 2 | 4 + | 4 | 1 | four | 2 | 2 | 3 | -3 + | 4 | 1 | four | 2 | 2 | 5 | -5 + | 4 | 1 | four | 2 | 2 | 5 | -5 + | 4 | 1 | four | 2 | 2 | | 0 + | 4 | 1 | four | 2 | 2 | | + | 4 | 1 | four | 2 | 4 | 0 | + | 4 | 1 | four | 2 | 4 | 1 | -1 + | 4 | 1 | four | 2 | 4 | 2 | 2 + | 4 | 1 | four | 2 | 4 | 2 | 4 + | 4 | 1 | four | 2 | 4 | 3 | -3 + | 4 | 1 | four | 2 | 4 | 5 | -5 + | 4 | 1 | four | 2 | 4 | 5 | -5 + | 4 | 1 | four | 2 | 4 | | 0 + | 4 | 1 | four | 2 | 4 | | + | 4 | 1 | four | 3 | -3 | 0 | + | 4 | 1 | four | 3 | -3 | 1 | -1 + | 4 | 1 | four | 3 | -3 | 2 | 2 + | 4 | 1 | four | 3 | -3 | 2 | 4 + | 4 | 1 | four | 3 | -3 | 3 | -3 + | 4 | 1 | four | 3 | -3 | 5 | -5 + | 4 | 1 | four | 3 | -3 | 5 | -5 + | 4 | 1 | four | 3 | -3 | | 0 + | 4 | 1 | four | 3 | -3 | | + | 4 | 1 | four | 5 | -5 | 0 | + | 4 | 1 | four | 5 | -5 | 0 | + | 4 | 1 | four | 5 | -5 | 1 | -1 + | 4 | 1 | four | 5 | -5 | 1 | -1 + | 4 | 1 | four | 5 | -5 | 2 | 2 + | 4 | 1 | four | 5 | -5 | 2 | 2 + | 4 | 1 | four | 5 | -5 | 2 | 4 + | 4 | 1 | four | 5 | -5 | 2 | 4 + | 4 | 1 | four | 5 | -5 | 3 | -3 + | 4 | 1 | four | 5 | -5 | 3 | -3 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | | 0 + | 4 | 1 | four | 5 | -5 | | 0 + | 4 | 1 | four | 5 | -5 | | + | 4 | 1 | four | 5 | -5 | | + | 4 | 1 | four | | 0 | 0 | + | 4 | 1 | four | | 0 | 1 | -1 + | 4 | 1 | four | | 0 | 2 | 2 + | 4 | 1 | four | | 0 | 2 | 4 + | 4 | 1 | four | | 0 | 3 | -3 + | 4 | 1 | four | | 0 | 5 | -5 + | 4 | 1 | four | | 0 | 5 | -5 + | 4 | 1 | four | | 0 | | 0 + | 4 | 1 | four | | 0 | | + | 4 | 1 | four | | | 0 | + | 4 | 1 | four | | | 1 | -1 + | 4 | 1 | four | | | 2 | 2 + | 4 | 1 | four | | | 2 | 4 + | 4 | 1 | four | | | 3 | -3 + | 4 | 
1 | four | | | 5 | -5 + | 4 | 1 | four | | | 5 | -5 + | 4 | 1 | four | | | | 0 + | 4 | 1 | four | | | | + | 5 | 0 | five | 0 | | 0 | + | 5 | 0 | five | 0 | | 1 | -1 + | 5 | 0 | five | 0 | | 2 | 2 + | 5 | 0 | five | 0 | | 2 | 4 + | 5 | 0 | five | 0 | | 3 | -3 + | 5 | 0 | five | 0 | | 5 | -5 + | 5 | 0 | five | 0 | | 5 | -5 + | 5 | 0 | five | 0 | | | 0 + | 5 | 0 | five | 0 | | | + | 5 | 0 | five | 1 | -1 | 0 | + | 5 | 0 | five | 1 | -1 | 1 | -1 + | 5 | 0 | five | 1 | -1 | 2 | 2 + | 5 | 0 | five | 1 | -1 | 2 | 4 + | 5 | 0 | five | 1 | -1 | 3 | -3 + | 5 | 0 | five | 1 | -1 | 5 | -5 + | 5 | 0 | five | 1 | -1 | 5 | -5 + | 5 | 0 | five | 1 | -1 | | 0 + | 5 | 0 | five | 1 | -1 | | + | 5 | 0 | five | 2 | 2 | 0 | + | 5 | 0 | five | 2 | 2 | 1 | -1 + | 5 | 0 | five | 2 | 2 | 2 | 2 + | 5 | 0 | five | 2 | 2 | 2 | 4 + | 5 | 0 | five | 2 | 2 | 3 | -3 + | 5 | 0 | five | 2 | 2 | 5 | -5 + | 5 | 0 | five | 2 | 2 | 5 | -5 + | 5 | 0 | five | 2 | 2 | | 0 + | 5 | 0 | five | 2 | 2 | | + | 5 | 0 | five | 2 | 4 | 0 | + | 5 | 0 | five | 2 | 4 | 1 | -1 + | 5 | 0 | five | 2 | 4 | 2 | 2 + | 5 | 0 | five | 2 | 4 | 2 | 4 + | 5 | 0 | five | 2 | 4 | 3 | -3 + | 5 | 0 | five | 2 | 4 | 5 | -5 + | 5 | 0 | five | 2 | 4 | 5 | -5 + | 5 | 0 | five | 2 | 4 | | 0 + | 5 | 0 | five | 2 | 4 | | + | 5 | 0 | five | 3 | -3 | 0 | + | 5 | 0 | five | 3 | -3 | 1 | -1 + | 5 | 0 | five | 3 | -3 | 2 | 2 + | 5 | 0 | five | 3 | -3 | 2 | 4 + | 5 | 0 | five | 3 | -3 | 3 | -3 + | 5 | 0 | five | 3 | -3 | 5 | -5 + | 5 | 0 | five | 3 | -3 | 5 | -5 + | 5 | 0 | five | 3 | -3 | | 0 + | 5 | 0 | five | 3 | -3 | | + | 5 | 0 | five | 5 | -5 | 0 | + | 5 | 0 | five | 5 | -5 | 0 | + | 5 | 0 | five | 5 | -5 | 1 | -1 + | 5 | 0 | five | 5 | -5 | 1 | -1 + | 5 | 0 | five | 5 | -5 | 2 | 2 + | 5 | 0 | five | 5 | -5 | 2 | 2 + | 5 | 0 | five | 5 | -5 | 2 | 4 + | 5 | 0 | five | 5 | -5 | 2 | 4 + | 5 | 0 | five | 5 | -5 | 3 | -3 + | 5 | 0 | five | 5 | -5 | 3 | -3 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | | 0 + | 5 | 0 | five | 5 | -5 | | 0 + | 5 | 0 | five | 5 | -5 | | + | 5 | 0 | five | 5 | -5 | | + | 5 | 0 | five | | 0 | 0 | + | 5 | 0 | five | | 0 | 1 | -1 + | 5 | 0 | five | | 0 | 2 | 2 + | 5 | 0 | five | | 0 | 2 | 4 + | 5 | 0 | five | | 0 | 3 | -3 + | 5 | 0 | five | | 0 | 5 | -5 + | 5 | 0 | five | | 0 | 5 | -5 + | 5 | 0 | five | | 0 | | 0 + | 5 | 0 | five | | 0 | | + | 5 | 0 | five | | | 0 | + | 5 | 0 | five | | | 1 | -1 + | 5 | 0 | five | | | 2 | 2 + | 5 | 0 | five | | | 2 | 4 + | 5 | 0 | five | | | 3 | -3 + | 5 | 0 | five | | | 5 | -5 + | 5 | 0 | five | | | 5 | -5 + | 5 | 0 | five | | | | 0 + | 5 | 0 | five | | | | + | 6 | 6 | six | 0 | | 0 | + | 6 | 6 | six | 0 | | 1 | -1 + | 6 | 6 | six | 0 | | 2 | 2 + | 6 | 6 | six | 0 | | 2 | 4 + | 6 | 6 | six | 0 | | 3 | -3 + | 6 | 6 | six | 0 | | 5 | -5 + | 6 | 6 | six | 0 | | 5 | -5 + | 6 | 6 | six | 0 | | | 0 + | 6 | 6 | six | 0 | | | + | 6 | 6 | six | 1 | -1 | 0 | + | 6 | 6 | six | 1 | -1 | 1 | -1 + | 6 | 6 | six | 1 | -1 | 2 | 2 + | 6 | 6 | six | 1 | -1 | 2 | 4 + | 6 | 6 | six | 1 | -1 | 3 | -3 + | 6 | 6 | six | 1 | -1 | 5 | -5 + | 6 | 6 | six | 1 | -1 | 5 | -5 + | 6 | 6 | six | 1 | -1 | | 0 + | 6 | 6 | six | 1 | -1 | | + | 6 | 6 | six | 2 | 2 | 0 | + | 6 | 6 | six | 2 | 2 | 1 | -1 + | 6 | 6 | six | 2 | 2 | 2 | 2 + | 6 | 6 | six | 2 | 2 | 2 | 4 + | 6 | 6 | six | 2 | 2 | 3 | -3 + | 6 | 6 | six | 2 | 2 | 5 | -5 + | 6 | 6 | six | 2 | 2 | 5 | -5 + | 6 | 6 | six | 2 | 2 | | 0 + | 6 | 6 | six | 2 | 2 | | + | 6 
| 6 | six | 2 | 4 | 0 | + | 6 | 6 | six | 2 | 4 | 1 | -1 + | 6 | 6 | six | 2 | 4 | 2 | 2 + | 6 | 6 | six | 2 | 4 | 2 | 4 + | 6 | 6 | six | 2 | 4 | 3 | -3 + | 6 | 6 | six | 2 | 4 | 5 | -5 + | 6 | 6 | six | 2 | 4 | 5 | -5 + | 6 | 6 | six | 2 | 4 | | 0 + | 6 | 6 | six | 2 | 4 | | + | 6 | 6 | six | 3 | -3 | 0 | + | 6 | 6 | six | 3 | -3 | 1 | -1 + | 6 | 6 | six | 3 | -3 | 2 | 2 + | 6 | 6 | six | 3 | -3 | 2 | 4 + | 6 | 6 | six | 3 | -3 | 3 | -3 + | 6 | 6 | six | 3 | -3 | 5 | -5 + | 6 | 6 | six | 3 | -3 | 5 | -5 + | 6 | 6 | six | 3 | -3 | | 0 + | 6 | 6 | six | 3 | -3 | | + | 6 | 6 | six | 5 | -5 | 0 | + | 6 | 6 | six | 5 | -5 | 0 | + | 6 | 6 | six | 5 | -5 | 1 | -1 + | 6 | 6 | six | 5 | -5 | 1 | -1 + | 6 | 6 | six | 5 | -5 | 2 | 2 + | 6 | 6 | six | 5 | -5 | 2 | 2 + | 6 | 6 | six | 5 | -5 | 2 | 4 + | 6 | 6 | six | 5 | -5 | 2 | 4 + | 6 | 6 | six | 5 | -5 | 3 | -3 + | 6 | 6 | six | 5 | -5 | 3 | -3 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | | 0 + | 6 | 6 | six | 5 | -5 | | 0 + | 6 | 6 | six | 5 | -5 | | + | 6 | 6 | six | 5 | -5 | | + | 6 | 6 | six | | 0 | 0 | + | 6 | 6 | six | | 0 | 1 | -1 + | 6 | 6 | six | | 0 | 2 | 2 + | 6 | 6 | six | | 0 | 2 | 4 + | 6 | 6 | six | | 0 | 3 | -3 + | 6 | 6 | six | | 0 | 5 | -5 + | 6 | 6 | six | | 0 | 5 | -5 + | 6 | 6 | six | | 0 | | 0 + | 6 | 6 | six | | 0 | | + | 6 | 6 | six | | | 0 | + | 6 | 6 | six | | | 1 | -1 + | 6 | 6 | six | | | 2 | 2 + | 6 | 6 | six | | | 2 | 4 + | 6 | 6 | six | | | 3 | -3 + | 6 | 6 | six | | | 5 | -5 + | 6 | 6 | six | | | 5 | -5 + | 6 | 6 | six | | | | 0 + | 6 | 6 | six | | | | + | 7 | 7 | seven | 0 | | 0 | + | 7 | 7 | seven | 0 | | 1 | -1 + | 7 | 7 | seven | 0 | | 2 | 2 + | 7 | 7 | seven | 0 | | 2 | 4 + | 7 | 7 | seven | 0 | | 3 | -3 + | 7 | 7 | seven | 0 | | 5 | -5 + | 7 | 7 | seven | 0 | | 5 | -5 + | 7 | 7 | seven | 0 | | | 0 + | 7 | 7 | seven | 0 | | | + | 7 | 7 | seven | 1 | -1 | 0 | + | 7 | 7 | seven | 1 | -1 | 1 | -1 + | 7 | 7 | seven | 1 | -1 | 2 | 2 + | 7 | 7 | seven | 1 | -1 | 2 | 4 + | 7 | 7 | seven | 1 | -1 | 3 | -3 + | 7 | 7 | seven | 1 | -1 | 5 | -5 + | 7 | 7 | seven | 1 | -1 | 5 | -5 + | 7 | 7 | seven | 1 | -1 | | 0 + | 7 | 7 | seven | 1 | -1 | | + | 7 | 7 | seven | 2 | 2 | 0 | + | 7 | 7 | seven | 2 | 2 | 1 | -1 + | 7 | 7 | seven | 2 | 2 | 2 | 2 + | 7 | 7 | seven | 2 | 2 | 2 | 4 + | 7 | 7 | seven | 2 | 2 | 3 | -3 + | 7 | 7 | seven | 2 | 2 | 5 | -5 + | 7 | 7 | seven | 2 | 2 | 5 | -5 + | 7 | 7 | seven | 2 | 2 | | 0 + | 7 | 7 | seven | 2 | 2 | | + | 7 | 7 | seven | 2 | 4 | 0 | + | 7 | 7 | seven | 2 | 4 | 1 | -1 + | 7 | 7 | seven | 2 | 4 | 2 | 2 + | 7 | 7 | seven | 2 | 4 | 2 | 4 + | 7 | 7 | seven | 2 | 4 | 3 | -3 + | 7 | 7 | seven | 2 | 4 | 5 | -5 + | 7 | 7 | seven | 2 | 4 | 5 | -5 + | 7 | 7 | seven | 2 | 4 | | 0 + | 7 | 7 | seven | 2 | 4 | | + | 7 | 7 | seven | 3 | -3 | 0 | + | 7 | 7 | seven | 3 | -3 | 1 | -1 + | 7 | 7 | seven | 3 | -3 | 2 | 2 + | 7 | 7 | seven | 3 | -3 | 2 | 4 + | 7 | 7 | seven | 3 | -3 | 3 | -3 + | 7 | 7 | seven | 3 | -3 | 5 | -5 + | 7 | 7 | seven | 3 | -3 | 5 | -5 + | 7 | 7 | seven | 3 | -3 | | 0 + | 7 | 7 | seven | 3 | -3 | | + | 7 | 7 | seven | 5 | -5 | 0 | + | 7 | 7 | seven | 5 | -5 | 0 | + | 7 | 7 | seven | 5 | -5 | 1 | -1 + | 7 | 7 | seven | 5 | -5 | 1 | -1 + | 7 | 7 | seven | 5 | -5 | 2 | 2 + | 7 | 7 | seven | 5 | -5 | 2 | 2 + | 7 | 7 | seven | 5 | -5 | 2 | 4 + | 7 | 7 | seven | 5 | -5 | 2 | 4 + | 7 | 7 | seven | 5 | -5 | 3 | -3 + | 7 | 7 | seven | 5 | -5 | 3 | -3 + | 7 | 7 | seven | 5 | 
-5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | | 0 + | 7 | 7 | seven | 5 | -5 | | 0 + | 7 | 7 | seven | 5 | -5 | | + | 7 | 7 | seven | 5 | -5 | | + | 7 | 7 | seven | | 0 | 0 | + | 7 | 7 | seven | | 0 | 1 | -1 + | 7 | 7 | seven | | 0 | 2 | 2 + | 7 | 7 | seven | | 0 | 2 | 4 + | 7 | 7 | seven | | 0 | 3 | -3 + | 7 | 7 | seven | | 0 | 5 | -5 + | 7 | 7 | seven | | 0 | 5 | -5 + | 7 | 7 | seven | | 0 | | 0 + | 7 | 7 | seven | | 0 | | + | 7 | 7 | seven | | | 0 | + | 7 | 7 | seven | | | 1 | -1 + | 7 | 7 | seven | | | 2 | 2 + | 7 | 7 | seven | | | 2 | 4 + | 7 | 7 | seven | | | 3 | -3 + | 7 | 7 | seven | | | 5 | -5 + | 7 | 7 | seven | | | 5 | -5 + | 7 | 7 | seven | | | | 0 + | 7 | 7 | seven | | | | + | 8 | 8 | eight | 0 | | 0 | + | 8 | 8 | eight | 0 | | 1 | -1 + | 8 | 8 | eight | 0 | | 2 | 2 + | 8 | 8 | eight | 0 | | 2 | 4 + | 8 | 8 | eight | 0 | | 3 | -3 + | 8 | 8 | eight | 0 | | 5 | -5 + | 8 | 8 | eight | 0 | | 5 | -5 + | 8 | 8 | eight | 0 | | | 0 + | 8 | 8 | eight | 0 | | | + | 8 | 8 | eight | 1 | -1 | 0 | + | 8 | 8 | eight | 1 | -1 | 1 | -1 + | 8 | 8 | eight | 1 | -1 | 2 | 2 + | 8 | 8 | eight | 1 | -1 | 2 | 4 + | 8 | 8 | eight | 1 | -1 | 3 | -3 + | 8 | 8 | eight | 1 | -1 | 5 | -5 + | 8 | 8 | eight | 1 | -1 | 5 | -5 + | 8 | 8 | eight | 1 | -1 | | 0 + | 8 | 8 | eight | 1 | -1 | | + | 8 | 8 | eight | 2 | 2 | 0 | + | 8 | 8 | eight | 2 | 2 | 1 | -1 + | 8 | 8 | eight | 2 | 2 | 2 | 2 + | 8 | 8 | eight | 2 | 2 | 2 | 4 + | 8 | 8 | eight | 2 | 2 | 3 | -3 + | 8 | 8 | eight | 2 | 2 | 5 | -5 + | 8 | 8 | eight | 2 | 2 | 5 | -5 + | 8 | 8 | eight | 2 | 2 | | 0 + | 8 | 8 | eight | 2 | 2 | | + | 8 | 8 | eight | 2 | 4 | 0 | + | 8 | 8 | eight | 2 | 4 | 1 | -1 + | 8 | 8 | eight | 2 | 4 | 2 | 2 + | 8 | 8 | eight | 2 | 4 | 2 | 4 + | 8 | 8 | eight | 2 | 4 | 3 | -3 + | 8 | 8 | eight | 2 | 4 | 5 | -5 + | 8 | 8 | eight | 2 | 4 | 5 | -5 + | 8 | 8 | eight | 2 | 4 | | 0 + | 8 | 8 | eight | 2 | 4 | | + | 8 | 8 | eight | 3 | -3 | 0 | + | 8 | 8 | eight | 3 | -3 | 1 | -1 + | 8 | 8 | eight | 3 | -3 | 2 | 2 + | 8 | 8 | eight | 3 | -3 | 2 | 4 + | 8 | 8 | eight | 3 | -3 | 3 | -3 + | 8 | 8 | eight | 3 | -3 | 5 | -5 + | 8 | 8 | eight | 3 | -3 | 5 | -5 + | 8 | 8 | eight | 3 | -3 | | 0 + | 8 | 8 | eight | 3 | -3 | | + | 8 | 8 | eight | 5 | -5 | 0 | + | 8 | 8 | eight | 5 | -5 | 0 | + | 8 | 8 | eight | 5 | -5 | 1 | -1 + | 8 | 8 | eight | 5 | -5 | 1 | -1 + | 8 | 8 | eight | 5 | -5 | 2 | 2 + | 8 | 8 | eight | 5 | -5 | 2 | 2 + | 8 | 8 | eight | 5 | -5 | 2 | 4 + | 8 | 8 | eight | 5 | -5 | 2 | 4 + | 8 | 8 | eight | 5 | -5 | 3 | -3 + | 8 | 8 | eight | 5 | -5 | 3 | -3 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | | 0 + | 8 | 8 | eight | 5 | -5 | | 0 + | 8 | 8 | eight | 5 | -5 | | + | 8 | 8 | eight | 5 | -5 | | + | 8 | 8 | eight | | 0 | 0 | + | 8 | 8 | eight | | 0 | 1 | -1 + | 8 | 8 | eight | | 0 | 2 | 2 + | 8 | 8 | eight | | 0 | 2 | 4 + | 8 | 8 | eight | | 0 | 3 | -3 + | 8 | 8 | eight | | 0 | 5 | -5 + | 8 | 8 | eight | | 0 | 5 | -5 + | 8 | 8 | eight | | 0 | | 0 + | 8 | 8 | eight | | 0 | | + | 8 | 8 | eight | | | 0 | + | 8 | 8 | eight | | | 1 | -1 + | 8 | 8 | eight | | | 2 | 2 + | 8 | 8 | eight | | | 2 | 4 + | 8 | 8 | eight | | | 3 | -3 + | 8 | 8 | eight | | | 5 | -5 + | 8 | 8 | eight | | | 5 | -5 + | 8 | 8 | eight | | | | 0 + | 8 | 8 | eight | | | | + | | 0 | zero | 0 | | 0 | + | | 0 | zero | 0 | | 1 | -1 + | | 0 | zero | 
0 | | 2 | 2 + | | 0 | zero | 0 | | 2 | 4 + | | 0 | zero | 0 | | 3 | -3 + | | 0 | zero | 0 | | 5 | -5 + | | 0 | zero | 0 | | 5 | -5 + | | 0 | zero | 0 | | | 0 + | | 0 | zero | 0 | | | + | | 0 | zero | 1 | -1 | 0 | + | | 0 | zero | 1 | -1 | 1 | -1 + | | 0 | zero | 1 | -1 | 2 | 2 + | | 0 | zero | 1 | -1 | 2 | 4 + | | 0 | zero | 1 | -1 | 3 | -3 + | | 0 | zero | 1 | -1 | 5 | -5 + | | 0 | zero | 1 | -1 | 5 | -5 + | | 0 | zero | 1 | -1 | | 0 + | | 0 | zero | 1 | -1 | | + | | 0 | zero | 2 | 2 | 0 | + | | 0 | zero | 2 | 2 | 1 | -1 + | | 0 | zero | 2 | 2 | 2 | 2 + | | 0 | zero | 2 | 2 | 2 | 4 + | | 0 | zero | 2 | 2 | 3 | -3 + | | 0 | zero | 2 | 2 | 5 | -5 + | | 0 | zero | 2 | 2 | 5 | -5 + | | 0 | zero | 2 | 2 | | 0 + | | 0 | zero | 2 | 2 | | + | | 0 | zero | 2 | 4 | 0 | + | | 0 | zero | 2 | 4 | 1 | -1 + | | 0 | zero | 2 | 4 | 2 | 2 + | | 0 | zero | 2 | 4 | 2 | 4 + | | 0 | zero | 2 | 4 | 3 | -3 + | | 0 | zero | 2 | 4 | 5 | -5 + | | 0 | zero | 2 | 4 | 5 | -5 + | | 0 | zero | 2 | 4 | | 0 + | | 0 | zero | 2 | 4 | | + | | 0 | zero | 3 | -3 | 0 | + | | 0 | zero | 3 | -3 | 1 | -1 + | | 0 | zero | 3 | -3 | 2 | 2 + | | 0 | zero | 3 | -3 | 2 | 4 + | | 0 | zero | 3 | -3 | 3 | -3 + | | 0 | zero | 3 | -3 | 5 | -5 + | | 0 | zero | 3 | -3 | 5 | -5 + | | 0 | zero | 3 | -3 | | 0 + | | 0 | zero | 3 | -3 | | + | | 0 | zero | 5 | -5 | 0 | + | | 0 | zero | 5 | -5 | 0 | + | | 0 | zero | 5 | -5 | 1 | -1 + | | 0 | zero | 5 | -5 | 1 | -1 + | | 0 | zero | 5 | -5 | 2 | 2 + | | 0 | zero | 5 | -5 | 2 | 2 + | | 0 | zero | 5 | -5 | 2 | 4 + | | 0 | zero | 5 | -5 | 2 | 4 + | | 0 | zero | 5 | -5 | 3 | -3 + | | 0 | zero | 5 | -5 | 3 | -3 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | | 0 + | | 0 | zero | 5 | -5 | | 0 + | | 0 | zero | 5 | -5 | | + | | 0 | zero | 5 | -5 | | + | | 0 | zero | | 0 | 0 | + | | 0 | zero | | 0 | 1 | -1 + | | 0 | zero | | 0 | 2 | 2 + | | 0 | zero | | 0 | 2 | 4 + | | 0 | zero | | 0 | 3 | -3 + | | 0 | zero | | 0 | 5 | -5 + | | 0 | zero | | 0 | 5 | -5 + | | 0 | zero | | 0 | | 0 + | | 0 | zero | | 0 | | + | | 0 | zero | | | 0 | + | | 0 | zero | | | 1 | -1 + | | 0 | zero | | | 2 | 2 + | | 0 | zero | | | 2 | 4 + | | 0 | zero | | | 3 | -3 + | | 0 | zero | | | 5 | -5 + | | 0 | zero | | | 5 | -5 + | | 0 | zero | | | | 0 + | | 0 | zero | | | | + | | | null | 0 | | 0 | + | | | null | 0 | | 1 | -1 + | | | null | 0 | | 2 | 2 + | | | null | 0 | | 2 | 4 + | | | null | 0 | | 3 | -3 + | | | null | 0 | | 5 | -5 + | | | null | 0 | | 5 | -5 + | | | null | 0 | | | 0 + | | | null | 0 | | | + | | | null | 1 | -1 | 0 | + | | | null | 1 | -1 | 1 | -1 + | | | null | 1 | -1 | 2 | 2 + | | | null | 1 | -1 | 2 | 4 + | | | null | 1 | -1 | 3 | -3 + | | | null | 1 | -1 | 5 | -5 + | | | null | 1 | -1 | 5 | -5 + | | | null | 1 | -1 | | 0 + | | | null | 1 | -1 | | + | | | null | 2 | 2 | 0 | + | | | null | 2 | 2 | 1 | -1 + | | | null | 2 | 2 | 2 | 2 + | | | null | 2 | 2 | 2 | 4 + | | | null | 2 | 2 | 3 | -3 + | | | null | 2 | 2 | 5 | -5 + | | | null | 2 | 2 | 5 | -5 + | | | null | 2 | 2 | | 0 + | | | null | 2 | 2 | | + | | | null | 2 | 4 | 0 | + | | | null | 2 | 4 | 1 | -1 + | | | null | 2 | 4 | 2 | 2 + | | | null | 2 | 4 | 2 | 4 + | | | null | 2 | 4 | 3 | -3 + | | | null | 2 | 4 | 5 | -5 + | | | null | 2 | 4 | 5 | -5 + | | | null | 2 | 4 | | 0 + | | | null | 2 | 4 | | + | | | null | 3 | -3 | 0 | + | | | null | 3 | -3 | 1 | -1 + | | | null | 3 | -3 | 2 | 2 + | | | null | 3 | -3 | 2 | 4 + | | | null | 3 | -3 | 3 | -3 + | | | null | 3 | 
-3 | 5 | -5 + | | | null | 3 | -3 | 5 | -5 + | | | null | 3 | -3 | | 0 + | | | null | 3 | -3 | | + | | | null | 5 | -5 | 0 | + | | | null | 5 | -5 | 0 | + | | | null | 5 | -5 | 1 | -1 + | | | null | 5 | -5 | 1 | -1 + | | | null | 5 | -5 | 2 | 2 + | | | null | 5 | -5 | 2 | 2 + | | | null | 5 | -5 | 2 | 4 + | | | null | 5 | -5 | 2 | 4 + | | | null | 5 | -5 | 3 | -3 + | | | null | 5 | -5 | 3 | -3 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | | 0 + | | | null | 5 | -5 | | 0 + | | | null | 5 | -5 | | + | | | null | 5 | -5 | | + | | | null | | 0 | 0 | + | | | null | | 0 | 1 | -1 + | | | null | | 0 | 2 | 2 + | | | null | | 0 | 2 | 4 + | | | null | | 0 | 3 | -3 + | | | null | | 0 | 5 | -5 + | | | null | | 0 | 5 | -5 + | | | null | | 0 | | 0 + | | | null | | 0 | | + | | | null | | | 0 | + | | | null | | | 1 | -1 + | | | null | | | 2 | 2 + | | | null | | | 2 | 4 + | | | null | | | 3 | -3 + | | | null | | | 5 | -5 + | | | null | | | 5 | -5 + | | | null | | | | 0 + | | | null | | | | +(891 rows) + +-- +-- +-- Inner joins (equi-joins) +-- +-- +-- +-- Inner joins (equi-joins) with USING clause +-- The USING syntax changes the shape of the resulting table +-- by including a column in the USING clause only once in the result. +-- +-- Inner equi-join on specified column +SELECT '' AS "xxx", * + FROM J1_TBL INNER JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +-- Same as above, slightly different syntax +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, d) USING (a) + ORDER BY a, d; + xxx | a | b | c | d +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, b) USING (b) + ORDER BY b, t1.a; + xxx | b | a | c | a +-----+---+---+-------+--- + | 0 | 5 | five | + | 0 | | zero | + | 2 | 3 | three | 2 + | 4 | 1 | one | 2 +(4 rows) + +-- +-- NATURAL JOIN +-- Inner equi-join on all columns with the same name +-- +SELECT '' AS "xxx", * + FROM J1_TBL NATURAL JOIN J2_TBL + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (a, d) + ORDER BY a, b, c, d; + xxx | a | b | c | d +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (d, a) + ORDER BY a, b, c, d; + xxx | a | b | c | d +-----+---+---+------+--- + | 0 | | zero | + | 2 | 3 | two | 2 + | 4 | 1 | four | 2 +(3 rows) + +-- mismatch number of columns +-- currently, Postgres will fill in with underlying names +SELECT '' AS 
"xxx", * + FROM J1_TBL t1 (a, b) NATURAL JOIN J2_TBL t2 (a) + ORDER BY a, b, t, k; + xxx | a | b | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +-- +-- Inner joins (equi-joins) +-- +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.i) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 1 | 4 | one | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.k) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+------+---+--- + | 0 | | zero | | 0 + | 2 | 3 | two | 2 | 2 + | 4 | 1 | four | 2 | 4 +(3 rows) + +-- +-- Non-equi-joins +-- +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i <= J2_TBL.k) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+--- + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | | 0 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 3 | 2 | three | 2 | 4 + | 4 | 1 | four | 2 | 4 +(9 rows) + +-- +-- Outer joins +-- Note that OUTER is a noise word +-- +SELECT '' AS "xxx", * + FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | null | + | | 0 | zero | +(13 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | null | + | | 0 | zero | +(13 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL RIGHT OUTER JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | | | | 0 + | | | | +(9 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL RIGHT JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | | | | 0 + | | | | +(9 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | | 0 + | | | null | + | | 0 | zero | + | | | | +(15 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL FULL JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + 
| 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | | 0 + | | | null | + | | 0 | zero | + | | | | +(15 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (k = 1); + xxx | i | j | t | k +-----+---+---+---+--- +(0 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1); + xxx | i | j | t | k +-----+---+---+-----+---- + | 1 | 4 | one | -1 +(1 row) + +-- +-- More complicated constructs +-- +-- +-- Multiway full join +-- +CREATE TABLE t1 (name TEXT, n INTEGER); +CREATE TABLE t2 (name TEXT, n INTEGER); +CREATE TABLE t3 (name TEXT, n INTEGER); +INSERT INTO t1 VALUES ( 'bb', 11 ); +INSERT INTO t2 VALUES ( 'bb', 12 ); +INSERT INTO t2 VALUES ( 'cc', 22 ); +INSERT INTO t2 VALUES ( 'ee', 42 ); +INSERT INTO t3 VALUES ( 'bb', 13 ); +INSERT INTO t3 VALUES ( 'cc', 23 ); +INSERT INTO t3 VALUES ( 'dd', 33 ); +SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name) +ORDER BY name,t1.n, t2.n, t3.n; + name | n | n | n +------+----+----+---- + bb | 11 | 12 | 13 + cc | | 22 | 23 + dd | | | 33 + ee | | 42 | +(4 rows) + +-- +-- Test interactions of join syntax and subqueries +-- +-- Basic cases (we expect planner to pull up the subquery here) +SELECT * FROM +(SELECT * FROM t2) as s2 +INNER JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 +(2 rows) + +SELECT * FROM +(SELECT * FROM t2) as s2 +LEFT JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 + ee | 42 | +(3 rows) + +SELECT * FROM +(SELECT * FROM t2) as s2 +FULL JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 + dd | | 33 + ee | 42 | +(4 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 +(2 rows) + +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL LEFT JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 + ee | 42 | 2 | | +(3 rows) + +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 + dd | | | 33 | 3 + ee | 42 | 2 | | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 +NATURAL INNER JOIN +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s1_1 | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------+------+------ + bb | 11 | 1 | 12 | 2 | 13 | 3 +(1 row) + +SELECT * FROM +(SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 +NATURAL FULL JOIN +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 
+ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s1_1 | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------+------+------ + bb | 11 | 1 | 12 | 2 | 13 | 3 + cc | | | 22 | 2 | 23 | 3 + dd | | | | | 33 | 3 + ee | | | 42 | 2 | | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, n as s2_n FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, n as s3_n FROM t3) as s3 + ) ss2 + ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s2_n | s3_n +------+------+------+------ + bb | 11 | 12 | 13 + cc | | 22 | 23 + dd | | | 33 + ee | | 42 | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, n as s3_n FROM t3) as s3 + ) ss2 + ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s2_n | s2_2 | s3_n +------+------+------+------+------ + bb | 11 | 12 | 2 | 13 + cc | | 22 | 2 | 23 + dd | | | | 33 + ee | | 42 | 2 | +(4 rows) + +-- Test for propagation of nullability constraints into sub-joins +create temp table x (x1 int, x2 int); +insert into x values (1,11); +insert into x values (2,22); +insert into x values (3,null); +insert into x values (4,44); +insert into x values (5,null); +create temp table y (y1 int, y2 int); +insert into y values (1,111); +insert into y values (2,222); +insert into y values (3,333); +insert into y values (4,null); +select * from x ORDER BY x1; + x1 | x2 +----+---- + 1 | 11 + 2 | 22 + 3 | + 4 | 44 + 5 | +(5 rows) + +select * from y ORDER BY y1; + y1 | y2 +----+----- + 1 | 111 + 2 | 222 + 3 | 333 + 4 | +(4 rows) + +select * from x left join y on (x1 = y1 and x2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 +----+----+----+----- + 1 | 11 | 1 | 111 + 2 | 22 | 2 | 222 + 3 | | | + 4 | 44 | 4 | + 5 | | | +(5 rows) + +select * from x left join y on (x1 = y1 and y2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 +----+----+----+----- + 1 | 11 | 1 | 111 + 2 | 22 | 2 | 222 + 3 | | 3 | 333 + 4 | 44 | | + 5 | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | 5 | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and x2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and y2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | + 4 | 44 | 4 | | | + 5 | | | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and xx2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | | +(5 rows) + +-- these should NOT give the same answers as above +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (x2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | 
xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 4 | 44 | 4 | | 4 | 44 +(3 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (y2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | +(3 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (xx2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 4 | 44 | 4 | | 4 | 44 +(3 rows) + +-- +-- regression test: check for bug with propagation of implied equality +-- to outside an IN +-- +select count(*) from tenk1 a where unique1 in + (select unique1 from tenk1 b join tenk1 c using (unique1) + where b.unique2 = 42); + count +------- + 1 +(1 row) + +-- +-- regression test: check for failure to generate a plan with multiple +-- degenerate IN clauses +-- +select count(*) from tenk1 x where + x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and + x.unique1 = 0 and + x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1); + count +------- + 1 +(1 row) + +-- try that with GEQO too +begin; +set geqo = on; +set geqo_threshold = 2; +select count(*) from tenk1 x where + x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and + x.unique1 = 0 and + x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1); + count +------- + 1 +(1 row) + +rollback; +-- +-- regression test: be sure we cope with proven-dummy append rels +-- +explain (costs off) +select aa, bb, unique1, unique1 + from tenk1 right join b on aa = unique1 + where bb < bb and bb is null; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +select aa, bb, unique1, unique1 + from tenk1 right join b on aa = unique1 + where bb < bb and bb is null; + aa | bb | unique1 | unique1 +----+----+---------+--------- +(0 rows) + +-- +-- regression test: check handling of empty-FROM subquery underneath outer join +-- +set enable_nestloop to off; +explain (costs off) +select * from int8_tbl i1 left join (int8_tbl i2 join + (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 +order by 1, 2; + QUERY PLAN +------------------------------------------------------------------ + Sort + Sort Key: i1.q1, i1.q2 + -> Hash Left Join + Hash Cond: (i1.q2 = i2.q2) + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on int8_tbl i1 + -> Hash + -> Hash Join + Hash Cond: (i2.q1 = (123)) + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on int8_tbl i2 + -> Hash + -> Result +(13 rows) + +select * from int8_tbl i1 left join (int8_tbl i2 join + (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 +order by 1, 2; + q1 | q2 | q1 | q2 | x +------------------+-------------------+-----+------------------+----- + 123 | 456 | 123 | 456 | 123 + 123 | 4567890123456789 | 123 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | + 4567890123456789 | 123 | | | + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 123 +(5 rows) + +reset enable_nestloop; +-- +-- regression test: check a case where join_clause_is_movable_into() gives +-- an imprecise result, causing an assertion failure +-- +select count(*) +from + (select t3.tenthous as x1, coalesce(t1.stringu1, t2.stringu1) as x2 + from tenk1 t1 + left join tenk1 t2 on 
t1.unique1 = t2.unique1 + join tenk1 t3 on t1.unique2 = t3.unique2) ss, + tenk1 t4, + tenk1 t5 +where t4.thousand = t5.unique1 and ss.x1 = t4.tenthous and ss.x2 = t5.stringu1; + count +------- + 1000 +(1 row) + +-- +-- regression test: check a case where we formerly missed including an EC +-- enforcement clause because it was expected to be handled at scan level +-- +explain (costs off) +select a.f1, b.f1, t.thousand, t.tenthous from + tenk1 t, + (select sum(f1)+1 as f1 from int4_tbl i4a) a, + (select sum(f1) as f1 from int4_tbl i4b) b +where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Aggregate + -> Seq Scan on int4_tbl i4b + -> Nested Loop + Join Filter: ((sum(i4b.f1)) = ((sum(i4a.f1) + 1))) + -> Aggregate + -> Seq Scan on int4_tbl i4a + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t + Index Cond: ((thousand = (sum(i4b.f1))) AND (tenthous = ((((sum(i4a.f1) + 1)) + (sum(i4b.f1))) + 999))) +(10 rows) + +select a.f1, b.f1, t.thousand, t.tenthous from + tenk1 t, + (select sum(f1)+1 as f1 from int4_tbl i4a) a, + (select sum(f1) as f1 from int4_tbl i4b) b +where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous; + f1 | f1 | thousand | tenthous +----+----+----------+---------- +(0 rows) + +-- +-- Clean up +-- +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP TABLE J1_TBL; +DROP TABLE J2_TBL; +-- Both DELETE and UPDATE allow the specification of additional tables +-- to "join" against to determine which rows should be modified. +CREATE TEMP TABLE t1 (a int, b int); +CREATE TEMP TABLE t2 (a int, b int); +CREATE TEMP TABLE t3 (x int, y int); +INSERT INTO t1 VALUES (5, 10); +INSERT INTO t1 VALUES (15, 20); +INSERT INTO t1 VALUES (100, 100); +INSERT INTO t1 VALUES (200, 1000); +INSERT INTO t2 VALUES (200, 2000); +INSERT INTO t3 VALUES (5, 20); +INSERT INTO t3 VALUES (6, 7); +INSERT INTO t3 VALUES (7, 8); +INSERT INTO t3 VALUES (500, 100); +DELETE FROM t3 USING t1 table1 WHERE t3.x = table1.a; +SELECT * FROM t3 ORDER By x, y; + x | y +-----+----- + 6 | 7 + 7 | 8 + 500 | 100 +(3 rows) + +DELETE FROM t3 USING t1 JOIN t2 USING (a) WHERE t3.x > t1.a; +ERROR: could not plan this distributed delete +DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. +SELECT * FROM t3 ORDER By x, y; + x | y +-----+----- + 6 | 7 + 7 | 8 + 500 | 100 +(3 rows) + +DELETE FROM t3 USING t3 t3_other WHERE t3.x = t3_other.x AND t3.y = t3_other.y; +SELECT * FROM t3 ORDER By x, y; + x | y +---+--- +(0 rows) + +-- Test join against inheritance tree +create temp table t2a () inherits (t2); +insert into t2a values (200, 2001); +select * from t1 left join t2 on (t1.a = t2.a) order by 1,2,3,4; + a | b | a | b +-----+------+-----+------ + 5 | 10 | | + 15 | 20 | | + 100 | 100 | | + 200 | 1000 | 200 | 2000 + 200 | 1000 | 200 | 2001 +(5 rows) + +-- Test matching of column name with wrong alias +select t1.x from t1 join t3 on (t1.a = t3.x); +ERROR: column t1.x does not exist +LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x); + ^ +HINT: Perhaps you meant to reference the column "t3.x". 
+-- +-- regression test for 8.1 merge right join bug +-- +CREATE TEMP TABLE tt1 ( tt1_id int4, joincol int4 ); +INSERT INTO tt1 VALUES (1, 11); +INSERT INTO tt1 VALUES (2, NULL); +CREATE TEMP TABLE tt2 ( tt2_id int4, joincol int4 ); +INSERT INTO tt2 VALUES (21, 11); +INSERT INTO tt2 VALUES (22, 11); +set enable_hashjoin to off; +set enable_nestloop to off; +-- these should give the same results +select tt1.*, tt2.* from tt1 left join tt2 on tt1.joincol = tt2.joincol + ORDER BY tt1_id, tt2_id; + tt1_id | joincol | tt2_id | joincol +--------+---------+--------+--------- + 1 | 11 | 21 | 11 + 1 | 11 | 22 | 11 + 2 | | | +(3 rows) + +select tt1.*, tt2.* from tt2 right join tt1 on tt1.joincol = tt2.joincol + ORDER BY tt1_id, tt2_id; + tt1_id | joincol | tt2_id | joincol +--------+---------+--------+--------- + 1 | 11 | 21 | 11 + 1 | 11 | 22 | 11 + 2 | | | +(3 rows) + +reset enable_hashjoin; +reset enable_nestloop; +-- +-- regression test for bug #13908 (hash join with skew tuples & nbatch increase) +-- +set work_mem to '64kB'; +set enable_mergejoin to off; +explain (costs off) +select count(*) from tenk1 a, tenk1 b + where a.hundred = b.thousand and (b.fivethous % 10) < 10; + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (a.hundred = b.thousand) + -> Index Only Scan using tenk1_hundred on tenk1 a + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + Filter: ((fivethous % 10) < 10) +(10 rows) + +select count(*) from tenk1 a, tenk1 b + where a.hundred = b.thousand and (b.fivethous % 10) < 10; + count +-------- + 100000 +(1 row) + +reset work_mem; +reset enable_mergejoin; +-- +-- regression test for 8.2 bug with improper re-ordering of left joins +-- +create temp table tt3(f1 int, f2 text); +insert into tt3 select x, repeat('xyzzy', 100) from generate_series(1,10000) x; +create index tt3i on tt3(f1); +analyze tt3; +create temp table tt4(f1 int); +insert into tt4 values (0),(1),(9999); +analyze tt4; +SELECT a.f1 +FROM tt4 a +LEFT JOIN ( + SELECT b.f1 + FROM tt3 b LEFT JOIN tt3 c ON (b.f1 = c.f1) + WHERE c.f1 IS NULL +) AS d ON (a.f1 = d.f1) +WHERE d.f1 IS NULL ORDER BY f1; + f1 +------ + 0 + 1 + 9999 +(3 rows) + +-- +-- regression test for proper handling of outer joins within antijoins +-- +create temp table tt4x(c1 int, c2 int, c3 int); +explain (costs off) +select * from tt4x t1 +where not exists ( + select 1 from tt4x t2 + left join tt4x t3 on t2.c3 = t3.c1 + left join ( select t5.c1 as c1 + from tt4x t4 left join tt4x t5 on t4.c2 = t5.c1 + ) a1 on t3.c2 = a1.c1 + where t1.c1 = t2.c2 +); + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Anti Join + Hash Cond: (t1.c1 = t2.c2) + -> Seq Scan on tt4x t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 + -> Merge Right Join + Merge Cond: (t5.c1 = t3.c2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c1 + -> Merge Join + Merge Cond: (t4.c2 = t5.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 + -> Sort + Sort Key: t4.c2 + -> Seq Scan on tt4x t4 + -> Sort + Sort Key: t5.c1 + -> Seq Scan on tt4x t5 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + 
Distribute results by H: c2 + -> Sort + Sort Key: t3.c2 + -> Hash Left Join + Hash Cond: (t2.c3 = t3.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c3 + -> Seq Scan on tt4x t2 + -> Hash + -> Seq Scan on tt4x t3 +(33 rows) + +-- +-- regression test for problems of the sort depicted in bug #3494 +-- +create temp table tt5(f1 int, f2 int); +create temp table tt6(f1 int, f2 int); +insert into tt5 values(1, 10); +insert into tt5 values(1, 11); +insert into tt6 values(1, 9); +insert into tt6 values(1, 2); +insert into tt6 values(2, 9); +select * from tt5,tt6 where tt5.f1 = tt6.f1 and tt5.f1 = tt5.f2 - tt6.f2 + ORDER BY tt5.f1, tt5.f2, tt6.f1, tt6.f2; + f1 | f2 | f1 | f2 +----+----+----+---- + 1 | 10 | 1 | 9 +(1 row) + +-- +-- regression test for problems of the sort depicted in bug #3588 +-- +create temp table xx (pkxx int); +create temp table yy (pkyy int, pkxx int); +insert into xx values (1); +insert into xx values (2); +insert into xx values (3); +insert into yy values (101, 1); +insert into yy values (201, 2); +insert into yy values (301, NULL); +select yy.pkyy as yy_pkyy, yy.pkxx as yy_pkxx, yya.pkyy as yya_pkyy, + xxa.pkxx as xxa_pkxx, xxb.pkxx as xxb_pkxx +from yy + left join (SELECT * FROM yy where pkyy = 101) as yya ON yy.pkyy = yya.pkyy + left join xx xxa on yya.pkxx = xxa.pkxx + left join xx xxb on coalesce (xxa.pkxx, 1) = xxb.pkxx + ORDER BY yy_pkyy, yy_pkxx, yya_pkyy, xxa_pkxx, xxb_pkxx; + yy_pkyy | yy_pkxx | yya_pkyy | xxa_pkxx | xxb_pkxx +---------+---------+----------+----------+---------- + 101 | 1 | 101 | 1 | 1 + 201 | 2 | | | 1 + 301 | | | | 1 +(3 rows) + +-- +-- regression test for improper pushing of constants across outer-join clauses +-- (as seen in early 8.2.x releases) +-- +create temp table zt1 (f1 int primary key); +create temp table zt2 (f2 int primary key); +create temp table zt3 (f3 int primary key); +insert into zt1 values(53); +insert into zt2 values(53); +select * from + zt2 left join zt3 on (f2 = f3) + left join zt1 on (f3 = f1) +where f2 = 53 +ORDER BY f1, f2, f3; + f2 | f3 | f1 +----+----+---- + 53 | | +(1 row) + +create temp view zv1 as select *,'dummy'::text AS junk from zt1; +select * from + zt2 left join zt3 on (f2 = f3) + left join zv1 on (f3 = f1) +where f2 = 53 +ORDER BY f1, f2, f3; + f2 | f3 | f1 | junk +----+----+----+------ + 53 | | | +(1 row) + +-- +-- regression test for improper extraction of OR indexqual conditions +-- (as seen in early 8.3.x releases) +-- +select a.unique2, a.ten, b.tenthous, b.unique2, b.hundred +from tenk1 a left join tenk1 b on a.unique2 = b.tenthous +where a.unique1 = 42 and + ((b.unique2 is null and a.ten = 2) or b.hundred = 3); + unique2 | ten | tenthous | unique2 | hundred +---------+-----+----------+---------+--------- +(0 rows) + +-- +-- test proper positioning of one-time quals in EXISTS (8.4devel bug) +-- +prepare foo(bool) as + select count(*) from tenk1 a left join tenk1 b + on (a.unique2 = b.unique1 and exists + (select 1 from tenk1 c where c.thousand = b.unique2 and $1)); +execute foo(true); + count +------- + 10000 +(1 row) + +execute foo(false); + count +------- + 10000 +(1 row) + +-- +-- test for sane behavior with noncanonical merge clauses, per bug #4926 +-- +begin; +set enable_mergejoin = 1; +set enable_hashjoin = 0; +set enable_nestloop = 0; +create temp table a (i integer); +create temp table b (x integer, y integer); +select * from a left join b on i = x and i = y and x = i; + i | x | y +---+---+--- +(0 rows) + +rollback; +-- +-- test NULL behavior of whole-row 
Vars, per bug #5025 +-- +select t1.q2, count(t2.*) +from int8_tbl t1 left join int8_tbl t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join (select * from int8_tbl) t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join (select * from int8_tbl offset 0) t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join + (select q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 + on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +-- +-- test incorrect failure to NULL pulled-up subexpressions +-- +begin; +create temp table a ( + code char not null, + constraint a_pk primary key (code) +); +create temp table b ( + a char not null, + num integer not null, + constraint b_pk primary key (a, num) +); +create temp table c ( + name char not null, + a char, + constraint c_pk primary key (name) +); +insert into a (code) values ('p'); +insert into a (code) values ('q'); +insert into b (a, num) values ('p', 1); +insert into b (a, num) values ('p', 2); +insert into c (name, a) values ('A', 'p'); +insert into c (name, a) values ('B', 'q'); +insert into c (name, a) values ('C', null); +select c.name, ss.code, ss.b_cnt, ss.const +from c left join + (select a.code, coalesce(b_grp.cnt, 0) as b_cnt, -1 as const + from a left join + (select count(1) as cnt, b.a from b group by b.a) as b_grp + on a.code = b_grp.a + ) as ss + on (c.a = ss.code) +order by c.name; + name | code | b_cnt | const +------+------+-------+------- + A | p | 2 | -1 + B | q | 0 | -1 + C | | | +(3 rows) + +rollback; +-- +-- test incorrect handling of placeholders that only appear in targetlists, +-- per bug #6154 +-- +SELECT * FROM +( SELECT 1 as key1 ) sub1 +LEFT JOIN +( SELECT sub3.key3, sub4.value2, COALESCE(sub4.value2, 66) as value3 FROM + ( SELECT 1 as key3 ) sub3 + LEFT JOIN + ( SELECT sub5.key5, COALESCE(sub6.value1, 1) as value2 FROM + ( SELECT 1 as key5 ) sub5 + LEFT JOIN + ( SELECT 2 as key6, 42 as value1 ) sub6 + ON sub5.key5 = sub6.key6 + ) sub4 + ON sub4.key5 = sub3.key3 +) sub2 +ON sub1.key1 = sub2.key3; + key1 | key3 | value2 | value3 +------+------+--------+-------- + 1 | 1 | 1 | 1 +(1 row) + +-- test the path using join aliases, too +SELECT * FROM +( SELECT 1 as key1 ) sub1 +LEFT JOIN +( SELECT sub3.key3, value2, COALESCE(value2, 66) as value3 FROM + ( SELECT 1 as key3 ) sub3 + LEFT JOIN + ( SELECT sub5.key5, COALESCE(sub6.value1, 1) as value2 FROM + ( SELECT 1 as key5 ) sub5 + LEFT JOIN + ( SELECT 2 as key6, 42 as value1 ) sub6 + ON sub5.key5 = sub6.key6 + ) sub4 + ON sub4.key5 = sub3.key3 +) sub2 +ON sub1.key1 = sub2.key3; + key1 | key3 | value2 | value3 +------+------+--------+-------- + 1 | 1 | 1 | 1 +(1 row) + +-- +-- test case where a PlaceHolderVar is used as a nestloop parameter +-- +EXPLAIN (NUM_NODES OFF, NODES OFF, COSTS OFF) +SELECT qq, unique1 + FROM + ( SELECT COALESCE(q1, 0) AS qq FROM int8_tbl a ) AS ss1 + FULL OUTER JOIN + ( SELECT COALESCE(q2, -1) AS qq FROM int8_tbl b ) AS ss2 + 
USING (qq) + INNER JOIN tenk1 c ON qq = unique2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop + -> Hash Full Join + Hash Cond: (COALESCE(a.q1, '0'::bigint) = COALESCE(b.q2, '-1'::bigint)) + -> Seq Scan on int8_tbl a + -> Hash + -> Seq Scan on int8_tbl b + -> Index Scan using tenk1_unique2 on tenk1 c + Index Cond: (unique2 = COALESCE((COALESCE(a.q1, '0'::bigint)), (COALESCE(b.q2, '-1'::bigint)))) +(9 rows) + +SELECT qq, unique1 + FROM + ( SELECT COALESCE(q1, 0) AS qq FROM int8_tbl a ) AS ss1 + FULL OUTER JOIN + ( SELECT COALESCE(q2, -1) AS qq FROM int8_tbl b ) AS ss2 + USING (qq) + INNER JOIN tenk1 c ON qq = unique2; + qq | unique1 +-----+--------- + 123 | 4596 + 123 | 4596 + 456 | 7318 +(3 rows) + +-- +-- nested nestloops can require nested PlaceHolderVars +-- +create temp table nt1 ( + id int primary key, + a1 boolean, + a2 boolean +); +create temp table nt2 ( + id int primary key, + nt1_id int, + b1 boolean, + b2 boolean +); +create temp table nt3 ( + id int primary key, + nt2_id int, + c1 boolean +); +insert into nt1 values (1,true,true); +insert into nt1 values (2,true,false); +insert into nt1 values (3,false,false); +insert into nt2 values (1,1,true,true); +insert into nt2 values (2,2,true,false); +insert into nt2 values (3,3,false,false); +insert into nt3 values (1,1,true); +insert into nt3 values (2,2,false); +insert into nt3 values (3,3,true); +explain(num_nodes off, nodes off, costs off) +select nt3.id +from nt3 as nt3 + left join + (select nt2.*, (nt2.b1 and ss1.a3) AS b3 + from nt2 as nt2 + left join + (select nt1.*, (nt1.id is not null) as a3 from nt1) as ss1 + on ss1.id = nt2.nt1_id + ) as ss2 + on ss2.id = nt3.nt2_id +where nt3.id = 1 and ss2.b3; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: nt1_id + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: nt2_id + -> Index Scan using nt3_pkey on nt3 + Index Cond: (id = 1) + -> Index Scan using nt2_pkey on nt2 + Index Cond: (id = nt3.nt2_id) + -> Index Only Scan using nt1_pkey on nt1 + Index Cond: (id = nt2.nt1_id) + Filter: (nt2.b1 AND (id IS NOT NULL)) +(14 rows) + +select nt3.id +from nt3 as nt3 + left join + (select nt2.*, (nt2.b1 and ss1.a3) AS b3 + from nt2 as nt2 + left join + (select nt1.*, (nt1.id is not null) as a3 from nt1) as ss1 + on ss1.id = nt2.nt1_id + ) as ss2 + on ss2.id = nt3.nt2_id +where nt3.id = 1 and ss2.b3; + id +---- + 1 +(1 row) + +-- +-- test case where a PlaceHolderVar is propagated into a subquery +-- +explain (num_nodes off, nodes off, costs off) +select * from + int8_tbl t1 left join + (select q1 as x, 42 as y from int8_tbl t2) ss + on t1.q2 = ss.x +where + 1 = (select 1 from int8_tbl t3 where ss.y is not null limit 1) +order by 1,2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: t1.q1, t1.q2 + -> Hash Left Join + Hash Cond: (t1.q2 = t2.q1) + Filter: (1 = (SubPlan 1)) + -> Seq Scan on int8_tbl t1 + -> Hash + -> Seq Scan on int8_tbl t2 + SubPlan 1 + -> Limit + -> Remote Subquery Scan on all + -> Limit + -> Result + One-Time Filter: ((42) IS NOT NULL) + -> Seq Scan on int8_tbl t3 +(16 rows) + +select * from + int8_tbl t1 left join + (select q1 as x, 42 as y from int8_tbl t2) ss + on t1.q2 = ss.x +where + 1 = (select 1 
from int8_tbl t3 where ss.y is not null limit 1) +order by 1,2; + q1 | q2 | x | y +------------------+------------------+------------------+---- + 123 | 4567890123456789 | 4567890123456789 | 42 + 123 | 4567890123456789 | 4567890123456789 | 42 + 123 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 123 | 123 | 42 + 4567890123456789 | 123 | 123 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 +(8 rows) + +-- +-- test the corner cases FULL JOIN ON TRUE and FULL JOIN ON FALSE +-- +select * from int4_tbl a full join int4_tbl b on true order by 1,2; + f1 | f1 +-------------+------------- + -2147483647 | -2147483647 + -2147483647 | -123456 + -2147483647 | 0 + -2147483647 | 123456 + -2147483647 | 2147483647 + -123456 | -2147483647 + -123456 | -123456 + -123456 | 0 + -123456 | 123456 + -123456 | 2147483647 + 0 | -2147483647 + 0 | -123456 + 0 | 0 + 0 | 123456 + 0 | 2147483647 + 123456 | -2147483647 + 123456 | -123456 + 123456 | 0 + 123456 | 123456 + 123456 | 2147483647 + 2147483647 | -2147483647 + 2147483647 | -123456 + 2147483647 | 0 + 2147483647 | 123456 + 2147483647 | 2147483647 +(25 rows) + +select * from int4_tbl a full join int4_tbl b on false order by 1,2; + f1 | f1 +-------------+------------- + -2147483647 | + -123456 | + 0 | + 123456 | + 2147483647 | + | -2147483647 + | -123456 + | 0 + | 123456 + | 2147483647 +(10 rows) + +-- +-- test for ability to use a cartesian join when necessary +-- +explain (num_nodes off, nodes off, costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where q1 = thousand or q2 = thousand; + QUERY PLAN +------------------------------------------------------------------------------------ + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Materialize + -> Remote Subquery Scan on all + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((q1.q1 = thousand) OR (q2.q2 = thousand)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q1.q1 = thousand) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q2.q2 = thousand) + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl +(18 rows) + +explain (num_nodes off, nodes off, costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where thousand = (q1 + q2); + QUERY PLAN +-------------------------------------------------------------------------- + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Materialize + -> Remote Subquery Scan on all + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = (q1.q1 + q2.q2)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = (q1.q1 + q2.q2)) + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl +(15 rows) + +-- +-- test ability to generate a suitable plan for a star-schema query +-- +explain (costs off) +select * from + tenk1, int8_tbl a, int8_tbl b +where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; + QUERY PLAN +--------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Merge Join + Merge Cond: (tenk1.thousand = a.q1) + -> Sort + Sort Key: tenk1.thousand + 
-> Merge Join + Merge Cond: (tenk1.tenthous = b.q1) + -> Sort + Sort Key: tenk1.tenthous + -> Seq Scan on tenk1 + -> Sort + Sort Key: b.q1 + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Sort + Sort Key: a.q1 + -> Seq Scan on int8_tbl a + Filter: (q2 = 1) +(19 rows) + +-- +-- test a corner case in which we shouldn't apply the star-schema optimization +-- +explain (costs off, nodes off) +select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) +where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Join Filter: (t1.stringu1 > t2.stringu2) + -> Nested Loop + -> Nested Loop + Join Filter: ((1) = (1)) + -> Hash Join + Hash Cond: (i1.f1 = (0)) + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl i1 + -> Hash + -> Result + -> Result + -> Materialize + -> Remote Subquery Scan on all + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: ((unique2 = (11)) AND (unique2 < 42)) + -> Materialize + -> Remote Subquery Scan on all + -> Index Scan using tenk1_unique1 on tenk1 t2 + Index Cond: (unique1 = (3)) +(20 rows) + +select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) +where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + unique2 | stringu1 | unique1 | stringu2 +---------+----------+---------+---------- + 11 | WFAAAA | 3 | LKIAAA +(1 row) + +-- variant that isn't quite a star-schema case +select ss1.d1 from + tenk1 as t1 + inner join tenk1 as t2 + on t1.tenthous = t2.ten + inner join + int8_tbl as i8 + left join int4_tbl as i4 + inner join (select 64::information_schema.cardinal_number as d1 + from tenk1 t3, + lateral (select abs(t3.unique1) + random()) ss0(x) + where t3.fivethous < 0) as ss1 + on i4.f1 = ss1.d1 + on i8.q1 = i4.f1 + on t1.tenthous = ss1.d1 +where t1.unique1 < i4.f1; + d1 +---- +(0 rows) + +-- +-- test extraction of restriction OR clauses from join OR clause +-- (we used to only do this for indexable clauses) +-- +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4); + QUERY PLAN +------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) +(19 rows) + +explain (costs off) +select * from tenk1 a join 
tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.ten = 4); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.ten = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + Filter: ((unique1 = 2) OR (ten = 4)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) +(14 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 7) +(21 rows) + +-- +-- test placement of movable quals in a parameterized join tree +-- +explain (num_nodes off, nodes off, costs off) +select * from tenk1 t1 left join + (tenk1 t2 join tenk1 t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Index Scan using tenk1_unique1 on tenk1 t1 + Index Cond: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) + Join Filter: (t1.ten = t3.ten) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) +(22 rows) + +explain (num_nodes off, nodes off, costs off) +select * from tenk1 t1 left join + (tenk1 t2 join tenk1 t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Index Scan using tenk1_unique1 on tenk1 t1 + Index Cond: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Hash 
Join + Hash Cond: (t3.unique2 = t2.thousand) + Join Filter: ((t1.ten + t2.ten) = t3.ten) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) +(22 rows) + +explain (num_nodes off, nodes off, costs off) +select count(*) from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand + join int4_tbl on b.thousand = f1; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all + -> Partial Aggregate + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (a.unique2 = b.unique1) + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = int4_tbl.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) +(23 rows) + +select count(*) from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand + join int4_tbl on b.thousand = f1; + count +------- + 10 +(1 row) + +explain (num_nodes off, nodes off, costs off) +select b.unique1 from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on b.unique1 = 42 and c.thousand = a.thousand + join int4_tbl i1 on b.thousand = f1 + right join int4_tbl i2 on i2.f1 = b.tenthous + order by 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: b.unique1 + -> Hash Right Join + Hash Cond: (b.tenthous = i2.f1) + -> Remote Subquery Scan on all + Distribute results by H: tenthous + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: unique1 + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl i1 + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = i1.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = i1.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl i2 +(31 rows) + +select b.unique1 from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on b.unique1 = 42 and c.thousand = a.thousand + join int4_tbl i1 on b.thousand = f1 + right join int4_tbl i2 on i2.f1 = b.tenthous + order by 1; + unique1 +--------- + 0 + + + + +(5 rows) + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl left join tenk1 on (q2 = 
unique2) +) ss +where fault = 122 +order by fault; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Filter: ((COALESCE(tenk1.unique1, '-1'::integer) + int8_tbl.q1) = 122) + -> Remote Subquery Scan on all + Distribute results by H: q2 + -> Seq Scan on int8_tbl + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 + Index Cond: (int8_tbl.q2 = unique2) +(11 rows) + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl left join tenk1 on (q2 = unique2) +) ss +where fault = 122 +order by fault; + unique1 | q1 | fault +---------+-----+------- + | 123 | 122 +(1 row) + +-- +-- test handling of potential equivalence clauses above outer joins +-- +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) + -> Remote Subquery Scan on all + Distribute results by H: q1 + -> Seq Scan on int8_tbl a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: COALESCE(thousand, 123) + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (a.q1 = unique2) +(11 rows) + +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + q1 | unique2 | thousand | hundred +----+---------+----------+--------- +(0 rows) + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Right Join + Merge Cond: (b.unique2 = a.f1) + Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Sort + Sort Key: a.f1 + -> Seq Scan on int4_tbl a +(13 rows) + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + f1 | unique2 | case +----+---------+------ + 0 | 0 | 0 +(1 row) + +-- +-- another case with equivalence clauses above outer joins (bug #8591) +-- +explain (costs off) +select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) + from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) + where a.unique2 < 10 and coalesce(b.twothousand, a.twothousand) = 44; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.thousand = a.unique1) + Filter: (COALESCE(b.twothousand, a.twothousand) = 44) + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + Distribute results by H: thousand + -> Seq Scan on tenk1 b + -> Hash + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 < 10) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using tenk1_unique2 on tenk1 c + Index Cond: ((unique2 = COALESCE(b.twothousand, a.twothousand)) AND (unique2 = 44)) +(15 rows) + +select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) + from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) + where a.unique2 < 10 and coalesce(b.twothousand, a.twothousand) = 44; + unique1 | unique1 | unique1 | coalesce +---------+---------+---------+---------- +(0 rows) + +-- +-- check handling of join aliases when flattening multiple levels of subquery +-- +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1 i2) ss2 + using (join_key) + ) foo3 +using (join_key); + QUERY PLAN +-------------------------------------------------------------------------------- + Hash Right Join + Output: "*VALUES*".column1, i1.f1, (666) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Merge Right Join + Output: i1.f1, 666 + Merge Cond: (i2.unique2 = i1.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i2.unique2 + Distribute results by H: unique2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 + Output: i2.unique2 + -> Materialize + Output: i1.f1 + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Sort + Output: i1.f1 + Sort Key: i1.f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 + -> Hash + Output: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 +(28 rows) + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1 i2) ss2 + using (join_key) + ) foo3 +using (join_key); + foo1_id | foo3_id | bug_field +---------+---------+----------- + 0 | 0 | 666 + 1 | | +(2 rows) + +-- +-- test successful handling of nested outer joins with degenerate join quals +-- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + 
Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(32 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! +(2 rows) + +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(36 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! 
+(2 rows) + +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2 + where q1 = f1) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 + Output: i4b2.f1 + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(39 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2 + where q1 = f1) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! +(2 rows) + +explain (verbose, costs off) +select * from + text_tbl t1 + inner join int8_tbl i8 + on i8.q2 = 456 + right join text_tbl t2 + on t1.f1 = 'doh!' + left join int4_tbl i4 + on i8.q1 = i4.f1; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2, t2.f1, i4.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: t2.f1 + -> Materialize + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Nested Loop + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Nested Loop Left Join + Output: i8.q1, i8.q2, i4.f1 + Join Filter: (i8.q1 = i4.f1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 456) + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + Filter: (t1.f1 = 'doh!'::text) +(23 rows) + +select * from + text_tbl t1 + inner join int8_tbl i8 + on i8.q2 = 456 + right join text_tbl t2 + on t1.f1 = 'doh!' + left join int4_tbl i4 + on i8.q1 = i4.f1; + f1 | q1 | q2 | f1 | f1 +------+-----+-----+-------------------+---- + doh! | 123 | 456 | hi de ho neighbor | + doh! | 123 | 456 | doh! 
| +(2 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; +-- +-- test for appropriate join order in the presence of lateral references +-- +explain (verbose, costs off) +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss +where t1.f1 = ss.f1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + Join Filter: (t1.f1 = t2.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q1, i8.q2 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 +(24 rows) + +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss +where t1.f1 = ss.f1; + f1 | q1 | q2 | q1 | f1 +-------------------+------------------+-----+------------------+------------------- + hi de ho neighbor | 4567890123456789 | 123 | 4567890123456789 | hi de ho neighbor +(1 row) + +explain (verbose, costs off) +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, + lateral (select ss1.* from text_tbl t3 limit 1) as ss2 +where t1.f1 = ss2.f1; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + Join Filter: (t1.f1 = (t2.f1)) + -> Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q1, i8.q2 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: (i8.q1), t2.f1 + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(36 rows) + +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, + lateral (select ss1.* from text_tbl t3 limit 1) as ss2 +where t1.f1 = ss2.f1; + f1 | q1 | q2 | q1 | f1 | q1 | f1 +-------------------+------------------+-----+------------------+-------------------+------------------+------------------- + hi de ho neighbor | 4567890123456789 | 123 | 4567890123456789 | hi de ho neighbor | 4567890123456789 | hi de ho neighbor +(1 row) + +explain (verbose, costs off) +select 1 from + text_tbl as tt1 + inner join text_tbl as tt2 on (tt1.f1 = 'foo') + left join text_tbl as tt3 on (tt3.f1 = 'foo') + left join text_tbl as tt4 on (tt3.f1 
= tt4.f1), + lateral (select tt4.f1 as c0 from text_tbl as tt5 limit 1) as ss1 +where tt1.f1 = ss1.c0; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Output: 1 + -> Nested Loop Left Join + Output: tt1.f1, tt4.f1 + -> Nested Loop + Output: tt1.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: tt1.f1 + -> Seq Scan on public.text_tbl tt1 + Output: tt1.f1 + Filter: (tt1.f1 = 'foo'::text) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on public.text_tbl tt2 + -> Materialize + Output: tt4.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: tt4.f1 + -> Nested Loop Left Join + Output: tt4.f1 + Join Filter: (tt3.f1 = tt4.f1) + -> Seq Scan on public.text_tbl tt3 + Output: tt3.f1 + Filter: (tt3.f1 = 'foo'::text) + -> Seq Scan on public.text_tbl tt4 + Output: tt4.f1 + Filter: (tt4.f1 = 'foo'::text) + -> Materialize + Output: ss1.c0 + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: tt4.f1 + -> Limit + Output: (tt4.f1) + -> Seq Scan on public.text_tbl tt5 + Output: tt4.f1 +(40 rows) + +select 1 from + text_tbl as tt1 + inner join text_tbl as tt2 on (tt1.f1 = 'foo') + left join text_tbl as tt3 on (tt3.f1 = 'foo') + left join text_tbl as tt4 on (tt3.f1 = tt4.f1), + lateral (select tt4.f1 as c0 from text_tbl as tt5 limit 1) as ss1 +where tt1.f1 = ss1.c0; + ?column? +---------- +(0 rows) + +-- +-- check a case in which a PlaceHolderVar forces join order +-- +explain (verbose, costs off) +select ss2.* from + int4_tbl i41 + left join int8_tbl i8 + join (select i42.f1 as c1, i43.f1 as c2, 42 as c3 + from int4_tbl i42, int4_tbl i43) ss1 + on i8.q1 = ss1.c2 + on i41.f1 = ss1.c1, + lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 +where ss1.c2 = 0; + QUERY PLAN +------------------------------------------------------------------------------------------ + Nested Loop + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1) + Output: i41.f1, i42.f1, i8.q1, i8.q2, i43.f1, 42 + -> Hash Join + Output: i41.f1, i42.f1, i8.q1, i8.q2, i43.f1, 42 + Hash Cond: (i41.f1 = i42.f1) + -> Nested Loop + Output: i8.q1, i8.q2, i43.f1, i41.f1 + -> Nested Loop + Output: i8.q1, i8.q2, i43.f1 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q1 = 0) + -> Seq Scan on public.int4_tbl i43 + Output: i43.f1 + Filter: (i43.f1 = 0) + -> Seq Scan on public.int4_tbl i41 + Output: i41.f1 + -> Hash + Output: i42.f1 + -> Seq Scan on public.int4_tbl i42 + Output: i42.f1 + -> Materialize + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Seq Scan on public.text_tbl + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) +(33 rows) + +select ss2.* from + int4_tbl i41 + left join int8_tbl i8 + join (select i42.f1 as c1, i43.f1 as c2, 42 as c3 + from int4_tbl i42, int4_tbl i43) ss1 + on i8.q1 = ss1.c2 + on i41.f1 = ss1.c1, + lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 +where ss1.c2 = 0; + f1 | q1 | q2 | c1 | c2 | c3 +----+----+----+----+----+---- +(0 rows) + +-- +-- test successful handling of full join underneath left join (bug #14105) 
+-- +explain (costs off) +select * from + (select 1 as id) as xx + left join + (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) + on (xx.id = coalesce(yy.id)); + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop Left Join + Join Filter: ((1) = COALESCE((1))) + -> Result + -> Materialize + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a1 + -> Hash + -> Result +(10 rows) + +select * from + (select 1 as id) as xx + left join + (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) + on (xx.id = coalesce(yy.id)); + id | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | id +----+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---- + 1 | 1 | 2838 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | EFEAAA | OOOOxx | 1 +(1 row) + +-- +-- test ability to push constants through outer join clauses +-- +explain (num_nodes off, nodes off, costs off) + select * from int4_tbl a left join tenk1 b on f1 = unique2 where f1 = 0; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.f1 = b.unique2) + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl a + Filter: (f1 = 0) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 0) +(12 rows) + +explain (num_nodes off, nodes off, costs off) + select * from tenk1 a full join tenk1 b using(unique2) where unique2 = 42; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Full Join + Merge Cond: (a.unique2 = b.unique2) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 = 42) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 42) +(12 rows) + +-- +-- test that quals attached to an outer join have correct semantics, +-- specifically that they don't re-use expressions computed below the join; +-- we force a mergejoin so that coalesce(b.q1, 1) appears as a join input +-- +set enable_hashjoin to off; +set enable_nestloop to off; +explain (verbose, costs off) + select a.q2, b.q1 + from int8_tbl a left join int8_tbl b on a.q2 = coalesce(b.q1, 1) + where coalesce(b.q1, 1) > 0; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: a.q2, b.q1 + Node/s: datanode_1 + Remote query: SELECT a.q2, b.q1 FROM (int8_tbl a LEFT JOIN int8_tbl b ON ((a.q2 = COALESCE(b.q1, (1)::bigint)))) WHERE (COALESCE(b.q1, (1)::bigint) > 0) + -> Merge Left Join + Output: a.q2, b.q1 + Merge Cond: (a.q2 = (COALESCE(b.q1, '1'::bigint))) + Filter: (COALESCE(b.q1, '1'::bigint) > 0) + -> Sort + Output: a.q2 + Sort Key: a.q2 + -> Seq Scan on public.int8_tbl a + Output: a.q2 + -> Sort + Output: b.q1, (COALESCE(b.q1, '1'::bigint)) + Sort Key: (COALESCE(b.q1, '1'::bigint)) + -> Seq Scan on 
public.int8_tbl b + Output: b.q1, COALESCE(b.q1, '1'::bigint) +(18 rows) + +select a.q2, b.q1 + from int8_tbl a left join int8_tbl b on a.q2 = coalesce(b.q1, 1) + where coalesce(b.q1, 1) > 0; + q2 | q1 +-------------------+------------------ + -4567890123456789 | + 123 | 123 + 123 | 123 + 456 | + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 +(10 rows) + +reset enable_hashjoin; +reset enable_nestloop; +-- +-- test join removal +-- +begin; +CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); +CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); +CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a int, b int); +INSERT INTO a VALUES (0, 0), (1, NULL); +INSERT INTO b VALUES (0, 0), (1, NULL); +INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3), (2,2), (3,1); +-- all three cases should be optimizable into a simple seqscan +explain (verbose false, costs false, nodes false) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on a +(2 rows) + +explain (verbose false, costs false, nodes false) SELECT b.* FROM b LEFT JOIN c ON b.c_id = c.id; + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on b +(2 rows) + +explain (verbose false, costs false, nodes false) + SELECT a.* FROM a LEFT JOIN (b left join c on b.c_id = c.id) + ON (a.b_id = b.id); + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on a +(2 rows) + +-- check optimization of outer join within another special join +explain (verbose false, costs false, nodes false) +select id from a where id in ( + select b.id from b left join c on b.id = c.id +); + QUERY PLAN +---------------------------------- + Remote Subquery Scan on all + -> Hash Join + Hash Cond: (a.id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(6 rows) + +-- check that join removal works for a left join when joining a subquery +-- that is guaranteed to be unique by its GROUP BY clause +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- join removal is not possible when the GROUP BY contains a column that is +-- not in the join condition. (Note: as of 9.6, we notice that b.id is a +-- primary key and so drop b.c_id from the GROUP BY of the resulting plan; +-- but this happens too late for join removal in the outer plan level.) 
+explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Right Join + Merge Cond: (b.id = d.a) + -> Group + Group Key: b.id + -> Index Scan using b_pkey on b + -> Sort + Sort Key: d.a + -> Seq Scan on d +(9 rows) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Right Join + Merge Cond: (b.id = d.a) + -> Unique + -> Sort + Sort Key: b.id, b.c_id + -> Seq Scan on b + -> Sort + Sort Key: d.a + -> Seq Scan on d +(10 rows) + +-- check join removal works when uniqueness of the join condition is enforced +-- by a UNION +explain (costs off) +select d.* from d left join (select id from a union select id from b) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- check join removal with a cross-type comparison operator +-- commenting out queries on replicated tables +-- as they can go either on datanode_1 or datanode_2 +--explain (costs off) +--select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4 + --on i8.q1 = i4.f1; +rollback; +create temp table parent (k int primary key, pd int); +create temp table child (k int unique, cd int); +insert into parent values (1, 10), (2, 20), (3, 30); +insert into child values (1, 100), (4, 400); +-- this case is optimizable +select p.* from parent p left join child c on (p.k = c.k) order by 1,2; + k | pd +---+---- + 1 | 10 + 2 | 20 + 3 | 30 +(3 rows) + +explain (verbose false, costs false, nodes false) + select p.* from parent p left join child c on (p.k = c.k) order by 1,2; + QUERY PLAN +---------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: p.k, p.pd + -> Seq Scan on parent p +(4 rows) + +-- this case is not +select p.*, linked from parent p + left join (select c.*, true as linked from child c) as ss + on (p.k = ss.k) order by p.k; + k | pd | linked +---+----+-------- + 1 | 10 | t + 2 | 20 | + 3 | 30 | +(3 rows) + +explain (verbose false, costs false, nodes false) + select p.*, linked from parent p + left join (select c.*, true as linked from child c) as ss + on (p.k = ss.k) order by p.k; + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Left Join + Merge Cond: (p.k = c.k) + -> Index Scan using parent_pkey on parent p + -> Index Only Scan using child_k_key on child c +(5 rows) + +-- check for a 9.0rc1 bug: join removal breaks pseudoconstant qual handling +select p.* from + parent p left join child c on (p.k = c.k) + where p.k = 1 and p.k = 2; + k | pd +---+---- +(0 rows) + +explain (verbose false, costs false, nodes false) +select p.* from + parent p left join child c on (p.k = c.k) + where p.k = 1 and p.k = 2; + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + -> Result + One-Time Filter: false + -> Index Scan using parent_pkey on parent p + Index Cond: (k = 1) +(5 rows) + +select p.* from + (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k + where p.k = 1 and p.k = 2; + k | pd +---+---- +(0 rows) + +explain (verbose false, costs false, nodes 
false) +select p.* from + (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k + where p.k = 1 and p.k = 2; + QUERY PLAN +-------------------------------- + Remote Fast Query Execution + -> Result + One-Time Filter: false +(3 rows) + +-- bug 5255: this is not optimizable by join removal +begin; +CREATE TEMP TABLE a (id int PRIMARY KEY); +CREATE TEMP TABLE b (id int PRIMARY KEY, a_id int); +INSERT INTO a VALUES (0), (1); +INSERT INTO b VALUES (0, 0), (1, NULL); +SELECT * FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0); + id | a_id | id +----+------+---- + 1 | | +(1 row) + +SELECT b.* FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0); + id | a_id +----+------ + 1 | +(1 row) + +rollback; +-- another join removal bug: this is not optimizable, either +begin; +create temp table innertab (id int8 primary key, dat1 int8); +insert into innertab values(123, 42); +SELECT * FROM + (SELECT 1 AS x) ss1 + LEFT JOIN + (SELECT q1, q2, COALESCE(dat1, q1) AS y + FROM int8_tbl LEFT JOIN innertab ON q2 = id) ss2 + ON true order by 1, 2, 3, 4; + x | q1 | q2 | y +---+------------------+-------------------+------------------ + 1 | 123 | 456 | 123 + 1 | 123 | 4567890123456789 | 123 + 1 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 1 | 4567890123456789 | 123 | 42 + 1 | 4567890123456789 | 4567890123456789 | 4567890123456789 +(5 rows) + +rollback; +-- another join removal bug: we must clean up correctly when removing a PHV +begin; +create temp table uniquetbl (f1 text unique); +explain (costs off) +select t1.* from + uniquetbl as t1 + left join (select *, '***'::text as d1 from uniquetbl) t2 + on t1.f1 = t2.f1 + left join uniquetbl t3 + on t2.d1 = t3.f1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on uniquetbl t1 +(2 rows) + +explain (costs off) +select t0.* +from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 +where ss.stringu2 !~* ss.case1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + Join Filter: ((CASE t1.ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) = t0.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END + -> Nested Loop + -> Seq Scan on int4_tbl i4 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: (unique2 = i4.f1) + Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) + -> Materialize + -> Seq Scan on text_tbl t0 +(12 rows) + +select t0.* +from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 +where ss.stringu2 !~* ss.case1; + f1 +------ + doh! 
+(1 row) + +rollback; +-- bug #8444: we've historically allowed duplicate aliases within aliased JOINs +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = f1; -- error +ERROR: column reference "f1" is ambiguous +LINE 2: ..._tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = f1; + ^ +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = y.f1; -- error +ERROR: invalid reference to FROM-clause entry for table "y" +LINE 2: ...bl x join (int4_tbl x cross join int4_tbl y) j on q1 = y.f1; + ^ +HINT: There is an entry for table "y", but it cannot be referenced from this part of the query. +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok + q1 | q2 | f1 | ff +----+----+----+---- +(0 rows) + +-- +-- Test hints given on incorrect column references are useful +-- +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestion +ERROR: column t1.uunique1 does not exist +LINE 1: select t1.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1.unique1". +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +ERROR: column t2.uunique1 does not exist +LINE 1: select t2.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t2.unique1". +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once +ERROR: column "uunique1" does not exist +LINE 1: select uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1.unique1" or the column "t2.unique1". +-- +-- Take care to reference the correct RTE +-- +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +ERROR: column atts.relid does not exist +LINE 1: select atts.relid::regclass, s.* from pg_stats s join + ^ +-- +-- Test LATERAL +-- +select unique2, x.* +from tenk1 a, lateral (select * from int4_tbl b where f1 = a.unique1) x; + unique2 | f1 +---------+---- + 9998 | 0 +(1 row) + +explain (costs off) + select unique2, x.* + from tenk1 a, lateral (select * from int4_tbl b where f1 = a.unique1) x; + QUERY PLAN +------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl b + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.f1) +(5 rows) + +select unique2, x.* +from int4_tbl x, lateral (select unique2 from tenk1 where f1 = unique1) ss; + unique2 | f1 +---------+---- + 9998 | 0 +(1 row) + +explain (costs off) + select unique2, x.* + from int4_tbl x, lateral (select unique2 from tenk1 where f1 = unique1) ss; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl x + -> Index Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = x.f1) +(5 rows) + +explain (costs off) + select unique2, x.* + from int4_tbl x cross join lateral (select unique2 from tenk1 where f1 = unique1) ss; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl x + -> Index Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = x.f1) +(5 rows) + +select unique2, x.* +from int4_tbl x left 
join lateral (select unique1, unique2 from tenk1 where f1 = unique1) ss on true order by 1; + unique2 | f1 +---------+------------- + 9998 | 0 + | -123456 + | 2147483647 + | 123456 + | -2147483647 +(5 rows) + +--explain (costs off) + --select unique2, x.* + --from int4_tbl x left join lateral (select unique1, unique2 from tenk1 where f1 = unique1) ss on true; +-- check scoping of lateral versus parent references +-- the first of these should return int8_tbl.q2, the second int8_tbl.q1 +select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; + q1 | q2 | r +------------------+-------------------+------------------- + 123 | 456 | 456 + 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | -4567890123456789 +(5 rows) + +select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from int8_tbl; + q1 | q2 | r +------------------+-------------------+------------------ + 123 | 456 | 123 + 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 +(5 rows) + +-- lateral with function in FROM +select count(*) from tenk1 a, lateral generate_series(1,two) g; + count +------- + 5000 +(1 row) + +explain (costs off) + select count(*) from tenk1 a, lateral generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +explain (costs off) + select count(*) from tenk1 a cross join lateral generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +-- don't need the explicit LATERAL keyword for functions +explain (costs off) + select count(*) from tenk1 a, generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +-- lateral with UNION ALL subselect +explain (num_nodes off, nodes off, costs off) + select * from generate_series(100,200) g, + lateral (select * from int8_tbl a where g = q1 union all + select * from int8_tbl b where g = q2) ss; + QUERY PLAN +------------------------------------------------ + Nested Loop + -> Function Scan on generate_series g + -> Materialize + -> Remote Subquery Scan on all + -> Append + -> Seq Scan on int8_tbl a + Filter: (g.g = q1) + -> Seq Scan on int8_tbl b + Filter: (g.g = q2) +(9 rows) + +select * from generate_series(100,200) g, + lateral (select * from int8_tbl a where g = q1 union all + select * from int8_tbl b where g = q2) ss; + g | q1 | q2 +-----+------------------+------------------ + 123 | 123 | 456 + 123 | 123 | 4567890123456789 + 123 | 4567890123456789 | 123 +(3 rows) + +-- lateral with VALUES +explain (num_nodes off, nodes off, costs off) + select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; + QUERY PLAN +------------------------------------------------------------------------------ + Finalize Aggregate + -> Remote 
Subquery Scan on all + -> Partial Aggregate + -> Merge Join + Merge Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Index Only Scan using tenk1_unique1 on tenk1 a +(9 rows) + +select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; + count +------- + 10000 +(1 row) + +-- lateral with VALUES, no flattening possible +explain (num_nodes off, nodes off, costs off) + select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; + QUERY PLAN +------------------------------------------------------------------------ + Aggregate + -> Hash Join + Hash Cond: ("*VALUES*".column1 = b.unique2) + -> Nested Loop + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_unique1 on tenk1 a + -> Values Scan on "*VALUES*" + -> Hash + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_unique2 on tenk1 b +(10 rows) + +select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; + count +------- + 10000 +(1 row) + +-- lateral injecting a strange outer join condition +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, costs off) + select * from int8_tbl a, + int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) + on x.q2 = ss.z + order by a.q1, a.q2, x.q1, x.q2, ss.z; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + -> Sort + Sort Key: a.q1, a.q2, x.q1, x.q2, (a.q1) + -> Nested Loop + -> Seq Scan on int8_tbl a + -> Nested Loop Left Join + Join Filter: (x.q2 = (a.q1)) + -> Seq Scan on int8_tbl x + -> Materialize + -> Seq Scan on int4_tbl y +(10 rows) + +select * from int8_tbl a, + int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) + on x.q2 = ss.z + order by a.q1, a.q2, x.q1, x.q2, ss.z; + q1 | q2 | q1 | q2 | z +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | 123 | 456 | + 123 | 456 | 123 | 4567890123456789 | + 123 | 456 | 4567890123456789 | -4567890123456789 | + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 4567890123456789 | + 123 | 4567890123456789 | 123 | 456 | + 123 | 4567890123456789 | 123 | 4567890123456789 | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | + 4567890123456789 | -4567890123456789 | 123 | 456 | + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 
-4567890123456789 | 4567890123456789 | 123 | + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 123 | 4567890123456789 | 123 | + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 456 | + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 +(57 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +-- lateral reference to a join alias variable +select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, + lateral (select x) ss2(y) order by 1,2,3; + x | f1 | y +---+----+--- + 0 | 0 | 0 +(1 row) + +select * from (select f1 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, + lateral (values(x)) ss2(y) order by 1,2,3; + x | f1 | y +-------------+-------------+------------- + -2147483647 | -2147483647 | -2147483647 + -123456 | -123456 | -123456 + 0 | 0 | 0 + 123456 | 123456 | 123456 + 2147483647 | 2147483647 | 2147483647 +(5 rows) + +select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) j, + lateral (select x) ss2(y) order by 1,2,3; + x | f1 | y +---+----+--- + 0 | 0 | 0 +(1 row) + +-- lateral references requiring pullup +select * from (values(1)) x(lb), + lateral generate_series(lb,4) x4 order by 1,2; + lb | x4 +----+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 +(4 rows) + +select * from (select f1/1000000000 from 
int4_tbl) x(lb), + lateral generate_series(lb,4) x4 order by 1,2; + lb | x4 +----+---- + -2 | -2 + -2 | -1 + -2 | 0 + -2 | 1 + -2 | 2 + -2 | 3 + -2 | 4 + 0 | 0 + 0 | 0 + 0 | 0 + 0 | 1 + 0 | 1 + 0 | 1 + 0 | 2 + 0 | 2 + 0 | 2 + 0 | 3 + 0 | 3 + 0 | 3 + 0 | 4 + 0 | 4 + 0 | 4 + 2 | 2 + 2 | 3 + 2 | 4 +(25 rows) + +select * from (values(1)) x(lb), + lateral (values(lb)) y(lbcopy) order by 1,2; + lb | lbcopy +----+-------- + 1 | 1 +(1 row) + +select * from (values(1)) x(lb), + lateral (select lb from int4_tbl) y(lbcopy); + lb | lbcopy +----+-------- + 1 | 1 + 1 | 1 + 1 | 1 + 1 | 1 + 1 | 1 +(5 rows) + +select * from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (values(x.q1,y.q1,y.q2)) v(xq1,yq1,yq2); + q1 | q2 | q1 | q2 | xq1 | yq1 | yq2 +------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- + 123 | 456 | | | 123 | | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | +(10 rows) + +select * from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (select x.q1,y.q1,y.q2) v(xq1,yq1,yq2); + q1 | q2 | q1 | q2 | xq1 | yq1 | yq2 +------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- + 123 | 456 | | | 123 | | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | +(10 rows) + +select x.* from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (select x.q1,y.q1,y.q2) v(xq1,yq1,yq2); + q1 | q2 +------------------+------------------- + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 
4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 +(10 rows) + +select v.* from + (int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 union all select x.q2,y.q2) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +select v.* from + (int8_tbl x left join (select q1,(select coalesce(q2,0)) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 union all select x.q2,y.q2) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +create temp table dual(); +insert into dual default values; +analyze dual; +select v.* from + (int8_tbl x left join (select q1,(select coalesce(q2,0)) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 from dual union all select x.q2,y.q2 from dual) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +explain (verbose, num_nodes off, nodes off, costs off) +select * from + int8_tbl a left join + lateral (select *, a.q2 as x from int8_tbl b) ss on a.q2 = ss.q1; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: a.q1, a.q2, b.q1, b.q2, a.q2 + -> Nested Loop Left Join + Output: a.q1, a.q2, b.q1, b.q2, (a.q2) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2, a.q2 + Filter: (a.q2 = b.q1) +(9 rows) + +select * from + int8_tbl a left join + lateral (select *, a.q2 as x from int8_tbl b) ss on a.q2 = ss.q1; + q1 | q2 | q1 | q2 | x +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 
4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(10 rows) + +--explain (verbose, costs off) +--select * from + --int8_tbl a left join + --lateral (select *, coalesce(a.q2, 42) as x from int8_tbl b) ss on a.q2 = ss.q1; +select * from + int8_tbl a left join + lateral (select *, coalesce(a.q2, 42) as x from int8_tbl b) ss on a.q2 = ss.q1; + q1 | q2 | q1 | q2 | x +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(10 rows) + +-- lateral can result in join conditions appearing below their +-- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl i left join + lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; + QUERY PLAN +------------------------------------------------- + Remote Subquery Scan on all + Output: i.f1, j.f1 + -> Nested Loop Left Join + Output: i.f1, j.f1 + Join Filter: (i.f1 = j.f1) + -> Remote Subquery Scan on all + Output: i.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 + -> Materialize + Output: j.f1 + -> Seq Scan on public.int2_tbl j + Output: j.f1 +(14 rows) + +select * from int4_tbl i left join + lateral (select * from int2_tbl j where i.f1 = j.f1) k on true order by 1; + f1 | f1 +-------------+---- + -2147483647 | + -123456 | + 0 | 0 + 123456 | + 2147483647 | +(5 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl i left join + lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; +ERROR: syntax error at or near "explain" +LINE 2: explain (num_nodes off, nodes off, verbose, costs off) + ^ +select * from int4_tbl i left join + lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; + f1 | coalesce +-------------+---------- + -2147483647 | + -123456 | + 0 | (0) + 123456 | + 2147483647 | +(5 rows) + +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl a, + lateral ( + select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) + ) ss; + QUERY PLAN +------------------------------------------------------- + Remote Subquery Scan on all + Output: 
a.f1, f1, q1, q2 + -> Nested Loop + Output: a.f1, b.f1, c.q1, c.q2 + -> Seq Scan on public.int4_tbl a + Output: a.f1 + -> Nested Loop Left Join + Output: b.f1, c.q1, c.q2 + Join Filter: (b.f1 = c.q1) + -> Seq Scan on public.int4_tbl b + Output: b.f1 + -> Materialize + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + Filter: (a.f1 = c.q2) +(16 rows) + +select * from int4_tbl a, + lateral ( + select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) + ) ss order by 1,2,3,4; + f1 | f1 | q1 | q2 +-------------+-------------+----+---- + -2147483647 | -2147483647 | | + -2147483647 | -123456 | | + -2147483647 | 0 | | + -2147483647 | 123456 | | + -2147483647 | 2147483647 | | + -123456 | -2147483647 | | + -123456 | -123456 | | + -123456 | 0 | | + -123456 | 123456 | | + -123456 | 2147483647 | | + 0 | -2147483647 | | + 0 | -123456 | | + 0 | 0 | | + 0 | 123456 | | + 0 | 2147483647 | | + 123456 | -2147483647 | | + 123456 | -123456 | | + 123456 | 0 | | + 123456 | 123456 | | + 123456 | 2147483647 | | + 2147483647 | -2147483647 | | + 2147483647 | -123456 | | + 2147483647 | 0 | | + 2147483647 | 123456 | | + 2147483647 | 2147483647 | | +(25 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +-- lateral reference in a PlaceHolderVar evaluated at join level +explain (num_nodes off, nodes off, verbose, costs off) +select * from + int8_tbl a left join lateral + (select b.q1 as bq1, c.q1 as cq1, least(a.q1,b.q1,c.q1) from + int8_tbl b cross join int8_tbl c) ss + on a.q2 = ss.bq1; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: a.q1, a.q2, b.q1, c.q1, LEAST(a.q1, b.q1, c.q1) + -> Nested Loop Left Join + Output: a.q1, a.q2, b.q1, c.q1, (LEAST(a.q1, b.q1, c.q1)) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Nested Loop + Output: b.q1, c.q1, LEAST(a.q1, b.q1, c.q1) + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2 + Filter: (a.q2 = b.q1) + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 +(13 rows) + +select * from + int8_tbl a left join lateral + (select b.q1 as bq1, c.q1 as cq1, least(a.q1,b.q1,c.q1) from + int8_tbl b cross join int8_tbl c) ss + on a.q2 = ss.bq1; + q1 | q2 | bq1 | cq1 | least +------------------+-------------------+------------------+------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 
123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(42 rows) + +-- case requiring nested PlaceHolderVars +explain (num_nodes off, nodes off, verbose, costs off) +select * from + int8_tbl c left join ( + int8_tbl a left join (select q1, coalesce(q2,42) as x from int8_tbl b) ss1 + on a.q2 = ss1.q1 + cross join + lateral (select q1, coalesce(ss1.x,q2) as y from int8_tbl d) ss2 + ) on c.q2 = ss2.q1, + lateral (select ss2.y offset 0) ss3; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Output: c.q1, c.q2, a.q1, a.q2, b.q1, (COALESCE(b.q2, '42'::bigint)), d.q1, (COALESCE(COALESCE(b.q2, '42'::bigint), d.q2)), ((COALESCE(COALESCE(b.q2, '42'::bigint), d.q2))) + -> Remote Subquery Scan on all + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, COALESCE(b.q2, '42'::bigint), COALESCE(COALESCE(b.q2, '42'::bigint), d.q2) + -> Hash Right Join + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, (COALESCE(b.q2, '42'::bigint)), (COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2)) + Hash Cond: (d.q1 = c.q2) + -> Nested Loop + Output: a.q1, a.q2, b.q1, d.q1, (COALESCE(b.q2, '42'::bigint)), (COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2)) + -> Hash Left Join + Output: a.q1, a.q2, b.q1, (COALESCE(b.q2, '42'::bigint)) + Hash Cond: (a.q2 = b.q1) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Hash + Output: b.q1, (COALESCE(b.q2, '42'::bigint)) + -> Seq Scan on public.int8_tbl b + Output: b.q1, COALESCE(b.q2, '42'::bigint) + -> Seq Scan on public.int8_tbl d + Output: d.q1, COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2) + -> Hash + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + -> Result + Output: (COALESCE(COALESCE(b.q2, '42'::bigint), d.q2)) +(26 rows) + +-- case that breaks the old ph_may_need optimization +explain (num_nodes off, nodes off, verbose, costs off) +select c.*,a.*,ss1.q1,ss2.q1,ss3.* from + int8_tbl 
c left join ( + int8_tbl a left join + (select q1, coalesce(q2,f1) as x from int8_tbl b, int4_tbl b2 + where q1 < f1) ss1 + on a.q2 = ss1.q1 + cross join + lateral (select q1, coalesce(ss1.x,q2) as y from int8_tbl d) ss2 + ) on c.q2 = ss2.q1, + lateral (select * from int4_tbl i where ss2.y > f1) ss3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, i.f1 + -> Nested Loop + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, i.f1 + Join Filter: ((COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) > i.f1) + -> Hash Right Join + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, (COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) + Hash Cond: (d.q1 = c.q2) + -> Nested Loop + Output: a.q1, a.q2, b.q1, d.q1, (COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) + -> Hash Right Join + Output: a.q1, a.q2, b.q1, (COALESCE(b.q2, (b2.f1)::bigint)) + Hash Cond: (b.q1 = a.q2) + -> Nested Loop + Output: b.q1, COALESCE(b.q2, (b2.f1)::bigint) + Join Filter: (b.q1 < b2.f1) + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2 + -> Materialize + Output: b2.f1 + -> Seq Scan on public.int4_tbl b2 + Output: b2.f1 + -> Hash + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl d + Output: d.q1, COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2) + -> Hash + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + -> Materialize + Output: i.f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 +(36 rows) + +-- check processing of postponed quals (bug #9041) +explain (num_nodes off, nodes off, verbose, costs off) +select * from + (select 1 as x offset 0) x cross join (select 2 as y offset 0) y + left join lateral ( + select * from (select 3 as z offset 0) z where z.z = x.x + ) zz on zz.z = y.y; + QUERY PLAN +---------------------------------------------- + Nested Loop Left Join + Output: (1), (2), (3) + Join Filter: (((3) = (1)) AND ((3) = (2))) + -> Nested Loop + Output: (1), (2) + -> Result + Output: 1 + -> Result + Output: 2 + -> Result + Output: 3 +(11 rows) + +-- check we don't try to do a unique-ified semijoin with LATERAL +explain (verbose, costs off, nodes off) +select * from + (values (0,9998), (1,1000)) v(id,x), + lateral (select f1 from int4_tbl + where f1 = any (select unique1 from tenk1 + where unique2 = v.x offset 0)) ss; + QUERY PLAN +---------------------------------------------------------------------------------- + Nested Loop + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1, "*VALUES*".column2 + -> Materialize + Output: int4_tbl.f1 + -> Remote Subquery Scan on all + Output: int4_tbl.f1 + -> Nested Loop + Output: int4_tbl.f1 + Join Filter: (int4_tbl.f1 = tenk1.unique1) + -> HashAggregate + Output: tenk1.unique1 + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 + Output: tenk1.unique1 + Index Cond: (tenk1.unique2 = "*VALUES*".column2) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(19 rows) + +select * from + (values (0,9998), (1,1000)) v(id,x), + lateral (select f1 from int4_tbl + where f1 = any (select unique1 from tenk1 + where unique2 = v.x offset 0)) ss; + id | x | f1 +----+------+---- + 0 | 9998 | 0 +(1 row) + +-- check proper extParam/allParam handling (this isn't exactly a LATERAL issue, +-- but we can make the test case much more compact with LATERAL) +explain (verbose, 
costs off) +select * from (values (0), (1)) v(id), +lateral (select * from int8_tbl t1, + lateral (select * from + (select * from int8_tbl t2 + where q1 = any (select q2 from int8_tbl t3 + where q2 = (select greatest(t1.q1,t2.q2)) + and (select v.id=0)) offset 0) ss2) ss + where t1.q1 = ss.q2) ss0; + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop + Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Nested Loop + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result + Output: t3.q2 + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(33 rows) + +select * from (values (0), (1)) v(id), +lateral (select * from int8_tbl t1, + lateral (select * from + (select * from int8_tbl t2 + where q1 = any (select q2 from int8_tbl t3 + where q2 = (select greatest(t1.q1,t2.q2)) + and (select v.id=0)) offset 0) ss2) ss + where t1.q1 = ss.q2) ss0; + id | q1 | q2 | q1 | q2 +----+------------------+-------------------+------------------+------------------ + 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 +(3 rows) + +-- test some error cases where LATERAL should have been used but wasn't +select f1,g from int4_tbl a, (select f1 as g) ss; +ERROR: column "f1" does not exist +LINE 1: select f1,g from int4_tbl a, (select f1 as g) ss; + ^ +HINT: There is a column named "f1" in table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a, (select a.f1 as g) ss; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: select f1,g from int4_tbl a, (select a.f1 as g) ss; + ^ +HINT: There is an entry for table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a cross join (select f1 as g) ss; +ERROR: column "f1" does not exist +LINE 1: select f1,g from int4_tbl a cross join (select f1 as g) ss; + ^ +HINT: There is a column named "f1" in table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a cross join (select a.f1 as g) ss; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: select f1,g from int4_tbl a cross join (select a.f1 as g) ss... + ^ +HINT: There is an entry for table "a", but it cannot be referenced from this part of the query. +-- SQL:2008 says the left table is in scope but illegal to access here +select f1,g from int4_tbl a right join lateral generate_series(0, a.f1) g on true; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: ... int4_tbl a right join lateral generate_series(0, a.f1) g on... + ^ +DETAIL: The combining JOIN type must be INNER or LEFT for a LATERAL reference. 
+select f1,g from int4_tbl a full join lateral generate_series(0, a.f1) g on true; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: ...m int4_tbl a full join lateral generate_series(0, a.f1) g on... + ^ +DETAIL: The combining JOIN type must be INNER or LEFT for a LATERAL reference. +-- check we complain about ambiguous table references +select * from + int8_tbl x cross join (int4_tbl x cross join lateral (select x.f1) ss); +ERROR: table reference "x" is ambiguous +LINE 2: ...cross join (int4_tbl x cross join lateral (select x.f1) ss); + ^ +-- LATERAL can be used to put an aggregate into the FROM clause of its query +select 1 from tenk1 a, lateral (select max(a.unique1) from int4_tbl b) ss; +ERROR: aggregate functions are not allowed in FROM clause of their own query level +LINE 1: select 1 from tenk1 a, lateral (select max(a.unique1) from i... + ^ +-- check behavior of LATERAL in UPDATE/DELETE +create temp table xx1 as select f1 as x1, -f1 as x2 from int4_tbl; +-- error, can't do this: +update xx1 set x2 = f1 from (select * from int4_tbl where f1 = x1) ss; +ERROR: column "x1" does not exist +LINE 1: ... set x2 = f1 from (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is a column named "x1" in table "xx1", but it cannot be referenced from this part of the query. +update xx1 set x2 = f1 from (select * from int4_tbl where f1 = xx1.x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...t x2 = f1 from (select * from int4_tbl where f1 = xx1.x1) ss... + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +-- can't do it even with LATERAL: +update xx1 set x2 = f1 from lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...= f1 from lateral (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +-- we might in future allow something like this, but for now it's an error: +update xx1 set x2 = f1 from xx1, lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: table name "xx1" specified more than once +-- also errors: +delete from xx1 using (select * from int4_tbl where f1 = x1) ss; +ERROR: column "x1" does not exist +LINE 1: ...te from xx1 using (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is a column named "x1" in table "xx1", but it cannot be referenced from this part of the query. +delete from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss... + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +delete from xx1 using lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...xx1 using lateral (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. 
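The failing statements above all try to reach the update target xx1 from a FROM/USING item, which is exactly what the parser rejects here, even with LATERAL. A correlated subquery in the SET list expresses the same update without referencing xx1 in FROM; the following is only an illustrative sketch using the same xx1 and int4_tbl tables and is not part of the regression script:

-- set x2 from the matching int4_tbl row; rows without a match are left unchanged
update xx1
   set x2 = (select f1 from int4_tbl where f1 = xx1.x1)
 where exists (select 1 from int4_tbl where f1 = xx1.x1);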
+-- demonstrate problem with extrememly slow join +CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION; +INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000); +CREATE TABLE testh (a int, b int); +INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000); +set enable_mergejoin TO false; +set enable_hashjoin TO false; +EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: PARTIAL count(*) + -> Partial Aggregate + Output: PARTIAL count(*) + -> Nested Loop Anti Join + Join Filter: (testr.b = testh.b) + -> Remote Subquery Scan on all (datanode_1) + Output: testr.b + Distribute results by H: b + -> Seq Scan on public.testr + Output: testr.b + -> Materialize + Output: testh.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: testh.b + Distribute results by H: b + -> Seq Scan on public.testh + Output: testh.b +(20 rows) + +SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); + count +------- + 3000 +(1 row) + +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. +-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.b, LEAST(t1.a, a, t3.a), t1.a + Sort Key: t1.a + -> Sort + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Materialize + Output: a, (LEAST(t1.a, a, t3.a)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a, LEAST(t1.a, a, t3.a) + Distribute results by H: a + -> Nested Loop + Output: a, LEAST(t1.a, a, t3.a) + Join Filter: (a = t3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.b, t3.a + Distribute results by H: b + -> Seq Scan on public.ut1 t3 + Output: t3.b, t3.a + -> Materialize + Output: a + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a + Distribute results by H: a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(35 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; +-- +-- test that 
foreign key join estimation performs sanely for outer joins +-- +begin; +create table fkest (a int, b int, c int unique, primary key(a,b)); +create table fkest1 (a int, b int, primary key(a,b)); +insert into fkest select x/10, x%10, x from generate_series(1,2000) x; +insert into fkest1 select x/10, x%10 from generate_series(1,2000) x; +alter table fkest1 + add constraint fkest1_a_b_fkey foreign key (a,b) references fkest; +analyze fkest; +analyze fkest1; +explain (costs off) +select * +from fkest f + left join fkest1 f1 on f.a = f1.a and f.b = f1.b + left join fkest1 f2 on f.a = f2.a and f.b = f2.b + left join fkest1 f3 on f.a = f3.a and f.b = f3.b +where f.c = 1; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1) + Distribute results by H: a + -> Index Scan using fkest_c_key on fkest f + Index Cond: (c = 1) + -> Index Only Scan using fkest1_pkey on fkest1 f1 + Index Cond: ((a = f.a) AND (b = f.b)) + -> Index Only Scan using fkest1_pkey on fkest1 f2 + Index Cond: ((a = f.a) AND (b = f.b)) + -> Index Only Scan using fkest1_pkey on fkest1 f3 + Index Cond: ((a = f.a) AND (b = f.b)) +(14 rows) + +rollback; +-- +-- test planner's ability to mark joins as unique +-- +create table j1 (id int primary key); +create table j2 (id int primary key); +create table j3 (id int); +insert into j1 values(1),(2),(3); +insert into j2 values(1),(2),(3); +insert into j3 values(1),(1); +analyze j1; +analyze j2; +analyze j3; +-- ensure join is properly marked as unique +explain (verbose, costs off) +select * from j1 inner join j2 on j1.id = j2.id; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 JOIN j2 ON ((j1.id = j2.id))) + -> Nested Loop + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j2.id = j1.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j2.id = j1.id) +(14 rows) + +-- ensure join is not unique when not an equi-join +explain (verbose, costs off) +select * from j1 inner join j2 on j1.id > j2.id; + QUERY PLAN +----------------------------------------------------------------- + Nested Loop + Output: j1.id, j2.id + Join Filter: (j1.id > j2.id) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + -> Bitmap Index Scan on j1_pkey + -> Materialize + Output: j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j2.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + -> Bitmap Index Scan on j2_pkey +(15 rows) + +-- ensure non-unique rel is not chosen as inner +explain (verbose, costs off) +select * from j1 inner join j3 on j1.id = j3.id; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j3.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j3.id FROM (j1 JOIN j3 ON ((j1.id = j3.id))) + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = 
j3.id) +(14 rows) + +-- ensure left join is marked as unique +explain (verbose, costs off) +select * from j1 left join j2 on j1.id = j2.id; + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 LEFT JOIN j2 ON ((j1.id = j2.id))) + -> Nested Loop Left Join + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j1.id = j2.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j1.id = j2.id) +(14 rows) + +-- ensure right join is marked as unique +explain (verbose, costs off) +select * from j1 right join j2 on j1.id = j2.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, j2.id + -> Nested Loop Left Join + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j2 + Output: j2.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j2.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j2.id) +(12 rows) + +-- ensure full join is marked as unique +explain (verbose, costs off) +select * from j1 full join j2 on j1.id = j2.id; + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 FULL JOIN j2 ON ((j1.id = j2.id))) + -> Hash Full Join + Output: j1.id, j2.id + Inner Unique: true + Hash Cond: (j1.id = j2.id) + -> Seq Scan on public.j1 + Output: j1.id + -> Hash + Output: j2.id + -> Seq Scan on public.j2 + Output: j2.id +(14 rows) + +-- a clauseless (cross) join can't be unique +explain (verbose, costs off) +select * from j1 cross join j2; + QUERY PLAN +----------------------------------------------------------------- + Nested Loop + Output: j1.id, j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + -> Bitmap Index Scan on j1_pkey + -> Materialize + Output: j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j2.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + -> Bitmap Index Scan on j2_pkey +(14 rows) + +-- ensure a natural join is marked as unique +explain (verbose, costs off) +select * from j1 natural join j2; + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id FROM (j1 JOIN j2 USING (id)) + -> Nested Loop + Output: j1.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j2.id = j1.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j2.id = j1.id) +(14 rows) + +-- ensure a distinct clause allows the inner to become unique +explain (verbose, costs off) +select * from j1 +inner join (select distinct id from j3) j3 on j1.id = j3.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, id + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Unique + Output: j3.id + -> Sort + Output: j3.id + Sort Key: j3.id + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = 
j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j3.id) +(17 rows) + +-- ensure group by clause allows the inner to become unique +explain (verbose, costs off) +select * from j1 +inner join (select id from j3 group by id) j3 on j1.id = j3.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, id + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Group + Output: j3.id + Group Key: j3.id + -> Sort + Output: j3.id + Sort Key: j3.id + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j3.id) +(18 rows) + +drop table j1; +drop table j2; +drop table j3; +-- test more complex permutations of unique joins +create table j1 (id1 int, id2 int, primary key(id1,id2)); +create table j2 (id1 int, id2 int, primary key(id1,id2)); +create table j3 (id1 int, id2 int, primary key(id1,id2)); +insert into j1 values(1,1),(1,2); +insert into j2 values(1,1); +insert into j3 values(1,1); +analyze j1; +analyze j2; +analyze j3; +-- ensure there's no unique join when not all columns which are part of the +-- unique index are seen in the join clause +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Join Filter: (j1.id1 = j2.id1) + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Seq Scan on public.j1 + Output: j1.id1, j1.id2 +(11 rows) + +-- ensure proper unique detection with multiple join quals +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON (((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)))) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Inner Unique: true + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, j1.id2 + Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Bitmap Index Scan on j1_pkey + Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) +(14 rows) + +-- ensure we don't detect the join to be unique when quals are not part of the +-- join condition +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Inner Unique: true + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, 
j1.id2 + Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) + -> Bitmap Index Scan on j1_pkey + Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) +(14 rows) + +-- as above, but for left joins. +explain (verbose, costs off) +select * from j1 +left join j2 on j1.id1 = j2.id1 where j1.id2 = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 LEFT JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) + -> Nested Loop Left Join + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Join Filter: (j1.id1 = j2.id1) + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, j1.id2 + Recheck Cond: (j1.id2 = 1) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id2 = 1) + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 +(14 rows) + +-- validate logic in merge joins which skips mark and restore. +-- it should only do this if all quals which were used to detect the unique +-- are present as join quals, and not plain quals. +set enable_nestloop to 0; +set enable_hashjoin to 0; +set enable_sort to 0; +-- create an index that will be preferred over the PK to perform the join +create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1; +explain (costs off) select * from j1 j1 +inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 +where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Nested Loop + Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Bitmap Heap Scan on j1 + Recheck Cond: ((id1 % 1000) = 1) + -> Bitmap Index Scan on j1_id1_idx + -> Bitmap Heap Scan on j1 j2 + Recheck Cond: ((id1 % 1000) = 1) + -> Bitmap Index Scan on j1_id1_idx +(10 rows) + +select * from j1 j1 +inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 +where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; + id1 | id2 | id1 | id2 +-----+-----+-----+----- + 1 | 1 | 1 | 1 + 1 | 2 | 1 | 2 +(2 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_sort; +drop table j1; +drop table j2; +drop table j3; +-- check that semijoin inner is not seen as unique for a portion of the outerrel +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t3.tenthous + Distribute results by H: tenthous + -> Hash Join + Output: t1.unique1, t3.tenthous + Hash Cond: (t3.thousand = t1.unique1) + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.thousand, t3.tenthous + Distribute results by H: thousand + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 + Output: t3.thousand, t3.tenthous + -> Hash + Output: t1.unique1 + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + Output: t1.unique1 + Sort Key: t1.unique1 + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) + -> Materialize + Output: t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by H: hundred + Sort Key: t2.hundred + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = t3.tenthous) +(38 rows) + +-- ... unless it actually is unique +create table j3 as select unique1, tenthous from onek; +vacuum analyze j3; +create unique index on j3(unique1, tenthous); +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from j3 + where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, j3.tenthous + -> Nested Loop + Output: t1.unique1, j3.tenthous + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) + -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 + Output: j3.unique1, j3.tenthous + Index Cond: (j3.unique1 = t1.unique1) + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = j3.tenthous) +(17 rows) + +drop table j3; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out new file mode 100644 index 00000000..8a414251 --- /dev/null +++ b/src/test/regress/expected/partition_join_2.out @@ -0,0 +1,1819 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. 
+--SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(18 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote 
Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: (((50) = a) OR ((75) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 + Filter: ((a < 450) AND (b = 0)) + -> Append + -> Index Scan using iprt2_p2_b on prt2_p2 t2 + Index Cond: ((b = a) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 t2_1 + Recheck Cond: ((b = a) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((b = a) AND (b > 250)) +(17 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery 
Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(20 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: ((b = 0) OR (a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) + Filter: (b = 0) +(28 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM 
prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Anti Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(15 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: a + -> Parallel Nested Loop Left Join + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Parallel Append + -> Parallel Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = a) + -> Bitmap Heap Scan on prt2_p3 t3_2 + Recheck Cond: (b = a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = a) +(36 rows) + +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Left Join + Hash Cond: ((c)::text = (c)::text) + Filter: ((b + COALESCE(b, 0)) = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results 
by H: c + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Hash Join + Hash Cond: (t2.a = b) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(27 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, t2.b + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_e_p1 t2 + -> Seq Scan on prt2_e_p2 t2_1 + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_2 + Filter: (c = 0) +(18 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (b = a) + -> Hash Join + Hash Cond: (((t3.a + t3.b) / 2) = b) + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + 
-> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(26 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: ((((a + b) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Right Join + Hash Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(30 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Full Join + Hash Cond: (a = (((a + b) / 2))) + Filter: ((a = (50)) OR (b = (75)) OR (((a + b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(40 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | phv | b | phv | ?column? 
| phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: b + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = b) +(40 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(33 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN 
+----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((a + b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(35 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Merge Right Join + Merge Cond: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Merge Left Join + Merge Cond: (t1.a = b) + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(36 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Merge Right Join + Merge Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: prt2_p2.b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: prt1_p1.a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(24 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = (((b + a) / 2))) AND (((a + b) / 2) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(25 rows) + 
+SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt2_p1 t2 + -> Seq Scan on plt2_p2 t2_1 + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on plt1_e_p1 t3 + -> Seq Scan on plt1_e_p2 t3_1 + -> Seq Scan on plt1_e_p3 t3_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t1 + -> Seq Scan on plt1_p2 t1_1 + -> Seq Scan on plt1_p3 t1_2 +(29 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 
75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +----------------------------------------------------------------------- + Hash Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false +(18 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------ + Sort + Sort Key: a, b + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(15 rows) + +-- +-- tests for hash partitioned tables. 
+-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht2_p1 t2 + -> Seq Scan on pht2_p2 t2_1 + -> Seq Scan on pht2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on pht1_e_p1 t3 + -> Seq Scan on pht1_e_p2 t3_1 + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht1_p1 t1 + -> Seq Scan on pht1_p2 t1_1 + -> Seq Scan on pht1_p3 t1_2 +(29 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | 
A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(22 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote 
Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(28 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = 
t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: a + -> Parallel Nested Loop Left Join + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Parallel Append + -> Parallel Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(41 rows) + +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +---------------------------------------------------------------- + Hash Left Join + Hash Cond: ((b = a) AND (a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on 
prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(12 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash + -> Hash Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(20 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +----------------------------------------------------------------- + Nested Loop Left Join + Join Filter: (a < b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(13 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: ((((b + a) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(17 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.c = c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Hash Join + Hash Cond: 
(t2.c = (c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(19 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Full Join + Hash Cond: ((c)::text = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(15 rows) + diff --git a/src/test/regress/expected/subselect_2.out b/src/test/regress/expected/subselect_2.out new file mode 100644 index 00000000..b774cebe --- /dev/null +++ b/src/test/regress/expected/subselect_2.out @@ -0,0 +1,1164 @@ +-- +-- SUBSELECT +-- +SELECT 1 AS one WHERE 1 IN (SELECT 1); + one +----- + 1 +(1 row) + +SELECT 1 AS zero WHERE 1 NOT IN (SELECT 1); + zero +------ +(0 rows) + +SELECT 1 AS zero WHERE 1 IN (SELECT 2); + zero +------ +(0 rows) + +-- Check grammar's handling of extra parens in assorted contexts +SELECT * FROM (SELECT 1 AS x) ss; + x +--- + 1 +(1 row) + +SELECT * FROM ((SELECT 1 AS x)) ss; + x +--- + 1 +(1 row) + +(SELECT 2) UNION SELECT 2; + ?column? +---------- + 2 +(1 row) + +((SELECT 2)) UNION SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT ((SELECT 2) UNION SELECT 2); + ?column? +---------- + 2 +(1 row) + +SELECT (((SELECT 2)) UNION SELECT 2); + ?column? 
+---------- + 2 +(1 row) + +SELECT (SELECT ARRAY[1,2,3])[1]; + array +------- + 1 +(1 row) + +SELECT ((SELECT ARRAY[1,2,3]))[2]; + array +------- + 2 +(1 row) + +SELECT (((SELECT ARRAY[1,2,3])))[3]; + array +------- + 3 +(1 row) + +-- Set up some simple test tables +CREATE TABLE SUBSELECT_TBL ( + f1 integer, + f2 integer, + f3 float +); +INSERT INTO SUBSELECT_TBL VALUES (1, 2, 3); +INSERT INTO SUBSELECT_TBL VALUES (2, 3, 4); +INSERT INTO SUBSELECT_TBL VALUES (3, 4, 5); +INSERT INTO SUBSELECT_TBL VALUES (1, 1, 1); +INSERT INTO SUBSELECT_TBL VALUES (2, 2, 2); +INSERT INTO SUBSELECT_TBL VALUES (3, 3, 3); +INSERT INTO SUBSELECT_TBL VALUES (6, 7, 8); +INSERT INTO SUBSELECT_TBL VALUES (8, 9, NULL); +SELECT '' AS eight, * FROM SUBSELECT_TBL ORDER BY f1, f2, f3; + eight | f1 | f2 | f3 +-------+----+----+---- + | 1 | 1 | 1 + | 1 | 2 | 3 + | 2 | 2 | 2 + | 2 | 3 | 4 + | 3 | 3 | 3 + | 3 | 4 | 5 + | 6 | 7 | 8 + | 8 | 9 | +(8 rows) + +-- Uncorrelated subselects +SELECT '' AS two, f1 AS "Constant Select" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT 1) ORDER BY 2; + two | Constant Select +-----+----------------- + | 1 + | 1 +(2 rows) + +SELECT '' AS six, f1 AS "Uncorrelated Field" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL) + ORDER BY 2; + six | Uncorrelated Field +-----+-------------------- + | 1 + | 1 + | 2 + | 2 + | 3 + | 3 +(6 rows) + +SELECT '' AS six, f1 AS "Uncorrelated Field" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE + f2 IN (SELECT f1 FROM SUBSELECT_TBL)) + ORDER BY 2; + six | Uncorrelated Field +-----+-------------------- + | 1 + | 1 + | 2 + | 2 + | 3 + | 3 +(6 rows) + +SELECT '' AS three, f1, f2 + FROM SUBSELECT_TBL + WHERE (f1, f2) NOT IN (SELECT f2, CAST(f3 AS int4) FROM SUBSELECT_TBL + WHERE f3 IS NOT NULL) + ORDER BY f1, f2; + three | f1 | f2 +-------+----+---- + | 1 | 2 + | 6 | 7 + | 8 | 9 +(3 rows) + +-- Correlated subselects +SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1) + ORDER BY f1, f2; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 1 + | 1 | 2 + | 2 | 2 + | 2 | 3 + | 3 | 3 + | 3 | 4 +(6 rows) + +SELECT '' AS six, f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3) + ORDER BY 2, 3; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 1 + | 2 | 2 + | 2 | 4 + | 3 | 3 + | 3 | 5 +(5 rows) + +SELECT '' AS six, f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f3 IN (SELECT upper.f1 + f2 FROM SUBSELECT_TBL + WHERE f2 = CAST(f3 AS integer)) + ORDER BY 2, 3; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 3 + | 2 | 4 + | 3 | 5 + | 6 | 8 +(4 rows) + +SELECT '' AS five, f1 AS "Correlated Field" + FROM SUBSELECT_TBL + WHERE (f1, f2) IN (SELECT f2, CAST(f3 AS int4) FROM SUBSELECT_TBL + WHERE f3 IS NOT NULL) + ORDER BY 2; + five | Correlated Field +------+------------------ + | 1 + | 2 + | 2 + | 3 + | 3 +(5 rows) + +-- +-- Use some existing tables in the regression test +-- +SELECT '' AS eight, ss.f1 AS "Correlated Field", ss.f3 AS "Second Field" + FROM SUBSELECT_TBL ss + WHERE f1 NOT IN (SELECT f1+1 FROM INT4_TBL + WHERE f1 != ss.f1 AND f1 < 2147483647) + ORDER BY 2, 3; + eight | Correlated Field | Second Field +-------+------------------+-------------- + | 2 | 2 + | 2 | 4 + | 3 | 3 + | 3 | 5 + 
| 6 | 8 + | 8 | +(6 rows) + +select q1, float8(count(*)) / (select count(*) from int8_tbl) +from int8_tbl group by q1 order by q1; + q1 | ?column? +------------------+---------- + 123 | 0.4 + 4567890123456789 | 0.6 +(2 rows) + +-- Unspecified-type literals in output columns should resolve as text +SELECT *, pg_typeof(f1) FROM + (SELECT 'foo' AS f1 FROM generate_series(1,3)) ss ORDER BY 1; + f1 | pg_typeof +-----+----------- + foo | text + foo | text + foo | text +(3 rows) + +-- ... unless there's context to suggest differently +explain verbose select '42' union all select '43'; + QUERY PLAN +------------------------------------------------- + Append (cost=0.00..0.04 rows=2 width=32) + -> Result (cost=0.00..0.01 rows=1 width=32) + Output: '42'::text + -> Result (cost=0.00..0.01 rows=1 width=32) + Output: '43'::text +(5 rows) + +explain verbose select '42' union all select 43; + QUERY PLAN +------------------------------------------------ + Append (cost=0.00..0.04 rows=2 width=4) + -> Result (cost=0.00..0.01 rows=1 width=4) + Output: 42 + -> Result (cost=0.00..0.01 rows=1 width=4) + Output: 43 +(5 rows) + +-- check materialization of an initplan reference (bug #14524) +explain (verbose, costs off) +select 1 = all (select (select 1)); + QUERY PLAN +----------------------------------- + Result + Output: (SubPlan 2) + SubPlan 2 + -> Materialize + Output: ($0) + InitPlan 1 (returns $0) + -> Result + Output: 1 + -> Result + Output: $0 +(10 rows) + +select 1 = all (select (select 1)); + ?column? +---------- + t +(1 row) + +-- +-- Check EXISTS simplification with LIMIT +-- +explain (costs off) +select * from int4_tbl o where exists + (select 1 from int4_tbl i where i.f1=o.f1 limit null); + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + -> Hash Semi Join + Hash Cond: (o.f1 = i.f1) + -> Seq Scan on int4_tbl o + -> Hash + -> Seq Scan on int4_tbl i +(6 rows) + +explain (costs off, nodes off) +select * from int4_tbl o where not exists + (select 1 from int4_tbl i where i.f1=o.f1 limit 1); + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all + -> Hash Anti Join + Hash Cond: (o.f1 = i.f1) + -> Seq Scan on int4_tbl o + -> Hash + -> Seq Scan on int4_tbl i +(6 rows) + +explain (costs off, nodes off) +select * from int4_tbl o where exists + (select 1 from int4_tbl i where i.f1=o.f1 limit 0); + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + -> Seq Scan on int4_tbl o + Filter: (SubPlan 1) + SubPlan 1 + -> Limit + -> Remote Subquery Scan on all + -> Limit + -> Seq Scan on int4_tbl i + Filter: (f1 = o.f1) +(9 rows) + +-- +-- Test cases to catch unpleasant interactions between IN-join processing +-- and subquery pullup. +-- +select count(*) from + (select 1 from tenk1 a + where unique1 IN (select hundred from tenk1 b)) ss; + count +------- + 100 +(1 row) + +select count(distinct ss.ten) from + (select ten from tenk1 a + where unique1 IN (select hundred from tenk1 b)) ss; + count +------- + 10 +(1 row) + +select count(*) from + (select 1 from tenk1 a + where unique1 IN (select distinct hundred from tenk1 b)) ss; + count +------- + 100 +(1 row) + +select count(distinct ss.ten) from + (select ten from tenk1 a + where unique1 IN (select distinct hundred from tenk1 b)) ss; + count +------- + 10 +(1 row) + +-- +-- Test cases to check for overenthusiastic optimization of +-- "IN (SELECT DISTINCT ...)" and related cases. Per example from +-- Luca Pireddu and Michael Fuhr. 
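+-- (Aside: the distinction exercised below is that distinct-ing over the pair
+-- (id1, id2) does not make id2 alone duplicate-free (bar holds both (1,1) and
+-- (3,1)), so the IN still needs its own unique-ification step above subquery
+-- s, whereas DISTINCT ON (id2) or GROUP BY id2 already yields at most one row
+-- per id2.)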
+-- +CREATE TEMP TABLE foo (id integer); +CREATE TEMP TABLE bar (id1 integer, id2 integer); +INSERT INTO foo VALUES (1); +INSERT INTO bar VALUES (1, 1); +INSERT INTO bar VALUES (2, 2); +INSERT INTO bar VALUES (3, 1); +-- These cases require an extra level of distinct-ing above subquery s +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- These cases do not +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- +-- Test case to catch problems with multiply nested sub-SELECTs not getting +-- recalculated properly. Per bug report from Didier Moens. +-- +CREATE TABLE orderstest ( + approver_ref integer, + po_ref integer, + ordercanceled boolean +); +INSERT INTO orderstest VALUES (1, 1, false); +INSERT INTO orderstest VALUES (66, 5, false); +INSERT INTO orderstest VALUES (66, 6, false); +INSERT INTO orderstest VALUES (66, 7, false); +INSERT INTO orderstest VALUES (66, 1, true); +INSERT INTO orderstest VALUES (66, 8, false); +INSERT INTO orderstest VALUES (66, 1, false); +INSERT INTO orderstest VALUES (77, 1, false); +INSERT INTO orderstest VALUES (1, 1, false); +INSERT INTO orderstest VALUES (66, 1, false); +INSERT INTO orderstest VALUES (1, 1, false); +CREATE VIEW orders_view AS +SELECT *, +(SELECT CASE + WHEN ord.approver_ref=1 THEN '---' ELSE 'Approved' + END) AS "Approved", +(SELECT CASE + WHEN ord.ordercanceled + THEN 'Canceled' + ELSE + (SELECT CASE + WHEN ord.po_ref=1 + THEN + (SELECT CASE + WHEN ord.approver_ref=1 + THEN '---' + ELSE 'Approved' + END) + ELSE 'PO' + END) +END) AS "Status", +(CASE + WHEN ord.ordercanceled + THEN 'Canceled' + ELSE + (CASE + WHEN ord.po_ref=1 + THEN + (CASE + WHEN ord.approver_ref=1 + THEN '---' + ELSE 'Approved' + END) + ELSE 'PO' + END) +END) AS "Status_OK" +FROM orderstest ord; +SELECT * FROM orders_view +ORDER BY approver_ref, po_ref, ordercanceled; + approver_ref | po_ref | ordercanceled | Approved | Status | Status_OK +--------------+--------+---------------+----------+----------+----------- + 1 | 1 | f | --- | --- | --- + 1 | 1 | f | --- | --- | --- + 1 | 1 | f | --- | --- | --- + 66 | 1 | f | Approved | Approved | Approved + 66 | 1 | f | Approved | Approved | Approved + 66 | 1 | t | Approved | Canceled | Canceled + 66 | 5 | f | Approved | PO | PO + 66 | 6 | f | Approved | PO | PO + 66 | 7 | f | Approved | PO | PO + 66 | 8 | f | Approved | PO | PO + 77 | 1 | f | Approved | Approved | Approved +(11 rows) + +DROP TABLE orderstest cascade; +NOTICE: drop cascades to view orders_view +-- +-- Test cases to catch situations where rule rewriter fails to propagate +-- hasSubLinks flag correctly. Per example from Kyle Bateman. 
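+-- (Aside: the INSERT below routes through shipped_view, so the rule rewrites
+-- it into an INSERT on shipped while the VALUES list still carries a
+-- sub-SELECT on parts; the rewritten query has to stay flagged as containing
+-- sublinks for that sub-SELECT to be planned at all.)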
+-- +create temp table parts ( + partnum text, + cost float8 +); +create temp table shipped ( + ttype char(2), + ordnum int4, + partnum text, + value float8 +); +create temp view shipped_view as + select * from shipped where ttype = 'wt'; +create rule shipped_view_insert as on insert to shipped_view do instead + insert into shipped values('wt', new.ordnum, new.partnum, new.value); +insert into parts (partnum, cost) values (1, 1234.56); +insert into shipped_view (ordnum, partnum, value) + values (0, 1, (select cost from parts where partnum = '1')); +select * from shipped_view; + ttype | ordnum | partnum | value +-------+--------+---------+--------- + wt | 0 | 1 | 1234.56 +(1 row) + +create rule shipped_view_update as on update to shipped_view do instead + update shipped set partnum = new.partnum, value = new.value + where ttype = new.ttype and ordnum = new.ordnum; +update shipped_view set value = 11 + from int4_tbl a join int4_tbl b + on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) + where ordnum = a.f1; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +select * from shipped_view; + ttype | ordnum | partnum | value +-------+--------+---------+--------- + wt | 0 | 1 | 1234.56 +(1 row) + +select f1, ss1 as relabel from + (select *, (select sum(f1) from int4_tbl b where f1 >= a.f1) as ss1 + from int4_tbl a) ss + ORDER BY f1, relabel; + f1 | relabel +-------------+------------ + -2147483647 | 0 + -123456 | 2147483647 + 0 | 2147607103 + 123456 | 2147607103 + 2147483647 | 2147483647 +(5 rows) + +-- +-- Test cases involving PARAM_EXEC parameters and min/max index optimizations. +-- Per bug report from David Sanchez i Gregori. +-- +select * from ( + select max(unique1) from tenk1 as a + where exists (select 1 from tenk1 as b where b.thousand = a.unique2) +) ss; + max +------ + 9997 +(1 row) + +select * from ( + select min(unique1) from tenk1 as a + where not exists (select 1 from tenk1 as b where b.unique2 = 10000) +) ss; + min +----- + 0 +(1 row) + +-- +-- Test that an IN implemented using a UniquePath does unique-ification +-- with the right semantics, as per bug #4113. (Unfortunately we have +-- no simple way to ensure that this test case actually chooses that type +-- of plan, but it does in releases 7.4-8.3. Note that an ordering difference +-- here might mean that some other plan type is being used, rendering the test +-- pointless.) 
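+-- (Aside, not part of the captured output: one way to check which plan shape
+-- was actually chosen for the cross-type IN queries below is to run them
+-- under EXPLAIN, for example
+--   explain (costs off)
+--   select * from float_table
+--     where float_col in (select num_col from numeric_table);
+-- and look for a Unique or HashAggregate node doing the unique-ification.)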
+-- +create temp table numeric_table (num_col numeric); +insert into numeric_table values (1), (1.000000000000000000001), (2), (3); +create temp table float_table (float_col float8); +insert into float_table values (1), (2), (3); +select * from float_table + where float_col in (select num_col from numeric_table) + ORDER BY float_col; + float_col +----------- + 1 + 2 + 3 +(3 rows) + +select * from numeric_table + where num_col in (select float_col from float_table) + ORDER BY num_col; + num_col +------------------------- + 1 + 1.000000000000000000001 + 2 + 3 +(4 rows) + +-- +-- Test case for bug #4290: bogus calculation of subplan param sets +-- +create temp table ta (id int primary key, val int); +insert into ta values(1,1); +insert into ta values(2,2); +create temp table tb (id int primary key, aval int); +insert into tb values(1,1); +insert into tb values(2,1); +insert into tb values(3,2); +insert into tb values(4,2); +create temp table tc (id int primary key, aid int); +insert into tc values(1,1); +insert into tc values(2,2); +select + ( select min(tb.id) from tb + where tb.aval = (select ta.val from ta where ta.id = tc.aid) ) as min_tb_id +from tc +ORDER BY min_tb_id; + min_tb_id +----------- + 1 + 3 +(2 rows) + +-- +-- Test case for 8.3 "failed to locate grouping columns" bug +-- +create temp table t1 (f1 numeric(14,0), f2 varchar(30)); +select * from + (select distinct f1, f2, (select f2 from t1 x where x.f1 = up.f1) as fs + from t1 up) ss +group by f1,f2,fs; + f1 | f2 | fs +----+----+---- +(0 rows) + +-- +-- Test case for bug #5514 (mishandling of whole-row Vars in subselects) +-- +create temp table table_a(id integer); +insert into table_a values (42); +create temp view view_a as select * from table_a; +select view_a from view_a; + view_a +-------- + (42) +(1 row) + +select (select view_a) from view_a; + view_a +-------- + (42) +(1 row) + +select (select (select view_a)) from view_a; + view_a +-------- + (42) +(1 row) + +select (select (a.*)::text) from view_a a; + a +------ + (42) +(1 row) + +-- +-- Check that whole-row Vars reading the result of a subselect don't include +-- any junk columns therein +-- +select q from (select max(f1) from int4_tbl group by f1 order by f1) q; + q +--------------- + (-2147483647) + (-123456) + (0) + (123456) + (2147483647) +(5 rows) + +with q as (select max(f1) from int4_tbl group by f1 order by f1) + select q from q; + q +--------------- + (-2147483647) + (-123456) + (0) + (123456) + (2147483647) +(5 rows) + +-- +-- Test case for sublinks pushed down into subselects via join alias expansion +-- +select + (select sq1) as qq1 +from + (select exists(select 1 from int4_tbl where f1 = q2) as sq1, 42 as dummy + from int8_tbl) sq0 + join + int4_tbl i4 on dummy = i4.f1; + qq1 +----- +(0 rows) + +-- +-- Test case for subselect within UPDATE of INSERT...ON CONFLICT DO UPDATE +-- +create temp table upsert(key int4 primary key, val text); +insert into upsert values(1, 'val') on conflict (key) do update set val = 'not seen'; +insert into upsert values(1, 'val') on conflict (key) do update set val = 'seen with subselect ' || (select f1 from int4_tbl where f1 != 0 limit 1)::text; +select * from upsert; + key | val +-----+---------------------------- + 1 | seen with subselect 123456 +(1 row) + +with aa as (select 'int4_tbl' u from int4_tbl limit 1) +insert into upsert values (1, 'x'), (999, 'y') +on conflict (key) do update set val = (select u from aa) +returning *; + key | val +-----+---------- + 1 | int4_tbl + 999 | y +(2 rows) + +-- +-- Test case for 
cross-type partial matching in hashed subplan (bug #7597) +-- +create temp table outer_7597 (f1 int4, f2 int4); +insert into outer_7597 values (0, 0); +insert into outer_7597 values (1, 0); +insert into outer_7597 values (0, null); +insert into outer_7597 values (1, null); +create temp table inner_7597(c1 int8, c2 int8); +insert into inner_7597 values(0, null); +select * from outer_7597 where (f1, f2) not in (select * from inner_7597) order by 1; + f1 | f2 +----+---- + 1 | 0 + 1 | +(2 rows) + +-- +-- Test case for premature memory release during hashing of subplan output +-- +select '1'::text in (select '1'::name union all select '1'::name); + ?column? +---------- + t +(1 row) + +-- +-- Test case for planner bug with nested EXISTS handling +-- +select a.thousand from tenk1 a, tenk1 b +where a.thousand = b.thousand + and exists ( select 1 from tenk1 c where b.hundred = c.hundred + and not exists ( select 1 from tenk1 d + where a.thousand = d.thousand ) ); + thousand +---------- +(0 rows) + +-- +-- Check that nested sub-selects are not pulled up if they contain volatiles +-- +explain (verbose, costs off) + select x, x from + (select (select now()) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +--------------------------- + Values Scan on "*VALUES*" + Output: $0, $1 + InitPlan 1 (returns $0) + -> Result + Output: now() + InitPlan 2 (returns $1) + -> Result + Output: now() +(8 rows) + +explain (verbose, costs off) + select x, x from + (select (select random()) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------- + Subquery Scan on ss + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: $0 + InitPlan 1 (returns $0) + -> Result + Output: random() +(7 rows) + +explain (verbose, costs off) + select x, x from + (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------------------------------------------- + Values Scan on "*VALUES*" + Output: (SubPlan 1), (SubPlan 2) + SubPlan 1 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + SubPlan 2 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) +(10 rows) + +explain (verbose, costs off) + select x, x from + (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------------------------------------------------- + Subquery Scan on ss + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: (SubPlan 1) + SubPlan 1 + -> Result + Output: random() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) +(8 rows) + +-- +-- Check we behave sanely in corner case of empty SELECT list (bug #8648) +-- +create temp table nocolumns(); +select exists(select * from nocolumns); + exists +-------- + f +(1 row) + +-- +-- Check sane behavior with nested IN SubLinks +-- +explain (verbose, costs off) +select * from int4_tbl where + (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in + (select ten from tenk1 b); + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: int4_tbl.f1 + -> Hash Join + Output: int4_tbl.f1 + Inner Unique: true + Hash Cond: (CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END = b.ten) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 + -> Hash + Output: b.ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + Output: b.ten + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Index Only Scan using tenk1_unique1 on public.tenk1 a + Output: a.unique1 +(26 rows) + +select * from int4_tbl where + (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in + (select ten from tenk1 b); + f1 +---- + 0 +(1 row) + +-- +-- Check for incorrect optimization when IN subquery contains a SRF +-- +explain (verbose, costs off) +select * from int4_tbl o where (f1, f1) in + (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1); + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: o.f1 + -> Nested Loop Semi Join + Output: o.f1 + Join Filter: (o.f1 = "ANY_subquery".f1) + -> Seq Scan on public.int4_tbl o + Output: o.f1 + -> Materialize + Output: "ANY_subquery".f1, "ANY_subquery".g + -> Subquery Scan on "ANY_subquery" + Output: "ANY_subquery".f1, "ANY_subquery".g + Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + -> Result + Output: i.f1, ((generate_series(1, 2)) / 10) + -> ProjectSet + Output: generate_series(1, 2), i.f1 + -> HashAggregate + Output: i.f1 + Group Key: i.f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 +(21 rows) + +select * from int4_tbl o where (f1, f1) in + (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1); + f1 +---- + 0 +(1 row) + +-- +-- check for over-optimization of whole-row Var referencing an Append plan +-- +select (select q from + (select 1,2,3 where f1 > 0 + union all + select 4,5,6.0 where f1 <= 0 + ) q ) +from int4_tbl order by 1; + q +----------- + (1,2,3) + (1,2,3) + (4,5,6.0) + (4,5,6.0) + (4,5,6.0) +(5 rows) + +-- +-- Check that volatile quals aren't pushed down past a DISTINCT: +-- nextval() should not be called more than the nominal number of times +-- +create temp sequence ts1; +select * from + (select distinct ten from tenk1) ss + where ten < 10 + nextval('ts1') + order by 1; + ten +----- + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +select nextval('ts1'); + nextval +--------- + 11 +(1 row) + +SELECT setseed(0); + setseed +--------- + +(1 row) + +-- DROP TABLE IF EXISTS asd ; +CREATE TABLE IF NOT EXISTS asd AS +SELECT clientid::numeric(20), + (clientid / 20 )::integer::numeric(20) as userid, + cts + ((random()* 3600 *24 )||'sec')::interval as cts, + (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state, + 0 as dim, + ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name, + ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform + FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t +; +SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid, + B.state as state +FROM ( VALUES +('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') , +('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 
08:44:03') +) AS D (dates) +JOIN +( SELECT DISTINCT clientid FROM asd + WHERE userid=74 ) C ON True +INNER JOIN LATERAL ( + SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.* + FROM asd x + INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim , + MAX(p.cts) AS selected_cts + FROM asd p + where cts y; +end$$; +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); + QUERY PLAN +---------------------------------------------------------- + Subquery Scan on ss + Output: x, u + Filter: tattle(ss.x, 8) + -> ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result +(6 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 + x | u +---+---- + 9 | 1 + 9 | 2 + 9 | 3 + 9 | 11 + 9 | 12 + 9 | 13 +(6 rows) + +-- if we pretend it's stable, we get different results: +alter function tattle(x int, y int) stable; +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); + QUERY PLAN +---------------------------------------------------- + ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result + One-Time Filter: tattle(9, 8) +(4 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); +NOTICE: x = 9, y = 8 + x | u +---+---- + 9 | 1 + 9 | 2 + 9 | 3 + 9 | 11 + 9 | 12 + 9 | 13 +(6 rows) + +-- although even a stable qual should not be pushed down if it references SRF +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, u); + QUERY PLAN +---------------------------------------------------------- + Subquery Scan on ss + Output: x, u + Filter: tattle(ss.x, ss.u) + -> ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result +(6 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, u); +NOTICE: x = 9, y = 1 +NOTICE: x = 9, y = 2 +NOTICE: x = 9, y = 3 +NOTICE: x = 9, y = 11 +NOTICE: x = 9, y = 12 +NOTICE: x = 9, y = 13 + x | u +---+--- + 9 | 1 + 9 | 2 + 9 | 3 +(3 rows) + +drop function tattle(x int, y int); diff --git a/src/test/regress/expected/xc_groupby_3.out b/src/test/regress/expected/xc_groupby_3.out new file mode 100644 index 00000000..6344aa6b --- /dev/null +++ b/src/test/regress/expected/xc_groupby_3.out @@ -0,0 +1,7513 @@ +-- this file contains tests for GROUP BY with combinations of following +-- 1. enable_hashagg = on/off (to force the grouping by sorting) +-- 2. distributed or replicated tables across the datanodes +-- If a testcase is added to any of the combinations, please check if it's +-- applicable in other combinations as well. 
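+-- In the plans below, grouped aggregates are expected to split into a Partial
+-- step on the datanodes and a Finalize step above a redistribution on the
+-- grouping key ("Distribute results by H: <key>").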
+-- Since we want to test the plan reduction of GROUP and AGG nodes, disable fast +-- query shipping +set enable_fast_query_shipping to off; +-- Combination 1: enable_hashagg on and distributed tables +set enable_hashagg to on; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize HashAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(26 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + Sort Key: sum(y) + -> Sort + Output: (sum(y)), x + Sort Key: (sum(y)) + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: 
((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(26 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(22 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by val2; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Finalize HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? 
+---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(17 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + -> Finalize HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(13 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2, 3; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2, 3; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(18 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize GroupAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(28 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(24 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: (((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val)) + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: sum(xc_groupby_tab1.val) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(13 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: 
xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + Sort Key: sum(xc_groupby_def.a) + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + Sort Key: count(*) + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + Sort Key: count(*) + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(12 rows) + +explain (verbose true, costs 
false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.b IS NULL) +(17 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on 
public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + Sort Key: sum(xc_groupby_g.b) + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + Sort Key: sum(xc_groupby_g.c) + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_g.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN 
+--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + Sort Key: avg(xc_groupby_g.b) + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + Sort Key: avg(xc_groupby_g.c) + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 2, enable_hashagg on and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2, 3; + count | sum | avg | ?column? 
| val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)), (avg(xc_groupby_tab1.val)) + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- joins and group by +select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + count | sum | avg | ?column? 
| c1 | c2 +-------+-----+---------------------+------------------+----+---- + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 3 | | | | 3 | + 3 | | | | | 4 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count, sum, avg, "?column?", c1, c2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: q.sum + -> Sort + Output: q.sum + Sort Key: q.sum + -> Subquery Scan on q + Output: q.sum + -> HashAggregate + Output: sum((sum(xc_groupby_tab1.val))), ((xc_groupby_tab1.val2 % 2)) + Group Key: (xc_groupby_tab1.val2 % 2) + -> HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(15 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> 
HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + sum +----- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2) +(10 rows) + +select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(10 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + sum +----- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +--------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: xc_groupby_tab1.val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +-------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(7 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b 
order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: 
count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + -> Sort + Output: xc_groupby_def.b, (count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(c) from xc_groupby_g 
group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; 
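(Editorial sketch between Combination 2 and Combination 3.) The combinations in this expected output hinge on two planner behaviours: the enable_hashagg GUC chooses between HashAggregate and Sort + GroupAggregate, and tables distributed across datanodes (as the non-replicated tables above are by default) get a Partial/Finalize aggregate pair around a "Distribute results by H" redistribution step, while replicated tables can aggregate in a single pass. A minimal way to reproduce that toggle by hand is sketched below; agg_demo is an illustrative table name assumed for this sketch only and is not created by this patch's tests.

    -- illustrative only: agg_demo is not part of the regression schema;
    -- without a DISTRIBUTE BY clause it is assumed to be distributed
    -- across datanodes, like xc_groupby_tab1/xc_groupby_tab2 above
    create table agg_demo (val int, val2 int);
    insert into agg_demo values (1, 1), (2, 1), (3, 2);

    set enable_hashagg to on;
    explain (verbose true, costs false, nodes false)
        select sum(val) from agg_demo group by val2;
        -- expect Finalize HashAggregate over a Partial HashAggregate,
        -- with results redistributed by val2 between the two phases

    set enable_hashagg to off;
    explain (verbose true, costs false, nodes false)
        select sum(val) from agg_demo group by val2;
        -- expect Sort + Partial/Finalize GroupAggregate instead

    reset enable_hashagg;
    drop table agg_demo;

Comparing the two EXPLAIN outputs against the plans recorded in this file is a quick way to check whether a planner change affects the hash-aggregate path, the sorted-aggregate path, or both.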
+-- Combination 3 enable_hashagg off and distributed tables +set enable_hashagg to off; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, (PARTIAL count(*)), (PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val))), (PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val))) + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(32 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 17 + 8 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> Finalize GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, (PARTIAL sum(q1.y)) + Sort Key: q1.x + -> Remote Subquery Scan on all + Output: q1.x, PARTIAL sum(q1.y) + Distribute results by H: x + -> Partial GroupAggregate + Output: q1.x, PARTIAL sum(q1.y) + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> Finalize GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, (PARTIAL sum(xc_groupby_tab1.val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(36 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(19 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(19 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by val, val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 4 | 2 + 6 | 2 | 4 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(30 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? | val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(19 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. +create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(14 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes 
false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL sum(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b + Distribute results by H: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | 
count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> Finalize GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(20 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(b)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(19 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(c)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute 
results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(19 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(19 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(b)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(19 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(c)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(19 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 4 enable_hashagg off and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + count | sum | avg | ?column? 
| val2 +-------+-----+--------------------+------------------+------ + 3 | 6 | 2.0000000000000000 | 2 | 1 + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | 3 | + 3 | | | | | 4 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: 
xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(18 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(10 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(10 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? +---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. +create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a 
+ Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where a is not null group by a; + count +------- + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- 
+ One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b; + sum +----- + 4.2 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(10 rows) + +select sum(c) from xc_groupby_g group by b; + sum +----- + 6.4 + 5.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(10 rows) + +select avg(a) from xc_groupby_g group by b; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(10 rows) + +select avg(b) from xc_groupby_g group by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + 
Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(10 rows) + +select avg(c) from xc_groupby_g group by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(10 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +reset enable_fast_query_shipping; +-- Now repeat all the tests with FQS turned on +set enable_fast_query_shipping to on; +-- Combination 1: enable_hashagg on and distributed tables +set enable_hashagg to on; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial 
HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize HashAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(26 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + sum 
+----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + Sort Key: sum(y) + -> Sort + Output: (sum(y)), x + Sort Key: (sum(y)) + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(26 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(22 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Finalize HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + 
Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(17 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + -> Finalize HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(13 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Remote query: SELECT (val + val2), val, val2 FROM xc_groupby_tab1 GROUP BY val, val2 + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(8 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by val, val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Remote query: SELECT (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 FROM xc_groupby_tab1, xc_groupby_tab2 WHERE (xc_groupby_tab1.val = xc_groupby_tab2.val) GROUP BY xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(19 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize GroupAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(28 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(24 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(13 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: 
xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + Sort Key: sum(xc_groupby_def.a) + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + Sort Key: count(*) + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + Sort Key: count(*) + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(12 rows) + +explain (verbose true, costs 
false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.b IS NULL) +(17 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on 
public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + Sort Key: sum(xc_groupby_g.b) + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + Sort Key: sum(xc_groupby_g.c) + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_g.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN 
+--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + Sort Key: avg(xc_groupby_g.b) + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + Sort Key: avg(xc_groupby_g.c) + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 2, enable_hashagg on and replicated tables. 
+-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- joins and group by +select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + count | sum | avg | ?column? 
| c1 | c2 +-------+-----+---------------------+------------------+----+---- + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 3 | | | | 3 | + 3 | | | | | 4 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count, sum, avg, "?column?", c1, c2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: q.sum + -> Sort + Output: q.sum + Sort Key: q.sum + -> Subquery Scan on q + Output: q.sum + -> HashAggregate + Output: sum((sum(xc_groupby_tab1.val))), ((xc_groupby_tab1.val2 % 2)) + Group Key: (xc_groupby_tab1.val2 % 2) + -> HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(15 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> 
HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + sum +----- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2) +(10 rows) + +select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(10 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + sum +----- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1, 2; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1, 2; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: (sum(xc_groupby_tab1.val)), (avg(xc_groupby_tab1.val)) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +-------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(7 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +select avg(a) from xc_groupby_def group by a order by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 
1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def group by b order by b; + count +------- + 1 + 5 + 4 + 3 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Sort + Output: (count(*)), b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), 
a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + -> Sort + Output: xc_groupby_def.b, (count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + 
sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(b) from xc_groupby_g group by c order by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Sort + Output: (avg(b)), c + Sort Key: xc_groupby_g.c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(c) from xc_groupby_g group by c order by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Sort + Output: (avg(c)), c + Sort Key: xc_groupby_g.c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +-- Combination 3 enable_hashagg off 
and distributed tables +set enable_hashagg to off; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(23 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on 
xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, (PARTIAL count(*)), (PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val))), (PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val))) + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(32 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 17 + 8 +(2 rows) + +explain (verbose true, costs false, nodes false) 
select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> Finalize GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, (PARTIAL sum(q1.y)) + Sort Key: q1.x + -> Remote Subquery Scan on all + Output: q1.x, PARTIAL sum(q1.y) + Distribute results by H: x + -> Partial GroupAggregate + Output: q1.x, PARTIAL sum(q1.y) + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> Finalize GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, (PARTIAL sum(xc_groupby_tab1.val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(36 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(20 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(19 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? 
+---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(20 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(19 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(14 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Remote query: SELECT (val + val2), val, val2 FROM xc_groupby_tab1 GROUP BY val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(25 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Remote query: SELECT (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 FROM xc_groupby_tab1, xc_groupby_tab2 WHERE (xc_groupby_tab1.val = xc_groupby_tab2.val) GROUP BY xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order 
by 1; + ?column? +---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(31 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(30 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: (((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val)) + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(23 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 3; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 3; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(20 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(19 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(14 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL sum(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by 
H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b + Distribute results by H: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> Finalize GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: 
xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(20 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a order by 1; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(b)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(19 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(c)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(19 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(19 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select 
avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(b)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(19 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(c)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(19 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 4 enable_hashagg off and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 3 | 6 | 2.0000000000000000 | 2 | 1 + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | 3 | + 3 | | | | | 4 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(18 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN 
+------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(10 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(10 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? | val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN 
+----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select count(*) from xc_groupby_def group by b; + count +------- + 1 + 5 + 4 + 3 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where a is not null group by a; + count +------- + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a 
+ -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b; + sum +----- + 4.2 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(10 rows) + +select sum(c) from xc_groupby_g group by b; + sum +----- + 6.4 + 5.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(10 rows) + +select avg(a) from xc_groupby_g group by b; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(10 rows) + +select avg(b) from xc_groupby_g group by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(10 rows) + +select avg(c) from xc_groupby_g group by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(10 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +reset enable_fast_query_shipping; From 47edcde7bc775bd510e59cf411aeb44f754979c9 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 9 Mar 2021 16:19:34 +0800 Subject: [PATCH 309/578] Allow UNIQUE indexes on partitioned tables If we restrict unique constraints on partitioned tables so that they must always include the partition key, then our standard approach to unique indexes already works --- each unique key is forced to exist within a single partition, so enforcing the unique restriction in each index individually is enough to have it enforced globally. Therefore we can implement unique indexes on partitions by simply removing a few restrictions (and adding others.) 
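As a minimal illustration of the rule described above (object names here are purely hypothetical and do not appear in this patch), a unique index on a partitioned table is accepted only when it covers the partition key:

    -- partition key is (a)
    CREATE TABLE parted_uniq (a int, b text) PARTITION BY RANGE (a);
    CREATE TABLE parted_uniq_p1 PARTITION OF parted_uniq
        FOR VALUES FROM (0) TO (100);
    -- allowed: the index includes the partition key column
    CREATE UNIQUE INDEX parted_uniq_a_idx ON parted_uniq (a);
    -- expected to be rejected: partition key column "a" is not included
    CREATE UNIQUE INDEX parted_uniq_b_idx ON parted_uniq (b);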
Discussion: https://postgr.es/m/20171222212921.hi6hg6pem2w2t36z@alvherre.pgsql Discussion: https://postgr.es/m/20171229230607.3iib6b62fn3uaf47@alvherre.pgsql Reviewed-by: Simon Riggs, Jesper Pedersen, Peter Eisentraut, Jaime Casanova, Amit Langote --- src/backend/bootstrap/bootparse.y | 2 + src/backend/catalog/index.c | 50 ++- src/backend/catalog/pg_constraint.c | 77 +++++ src/backend/catalog/toasting.c | 4 +- src/backend/commands/indexcmds.c | 126 +++++++- src/backend/commands/tablecmds.c | 72 ++++- src/backend/parser/analyze.c | 7 + src/backend/parser/parse_utilcmd.c | 33 +- src/backend/tcop/utility.c | 2 + src/bin/pg_dump/t/002_pg_dump.pl | 65 ++++ src/include/catalog/index.h | 5 +- src/include/catalog/pg_constraint_fn.h | 87 +++--- src/include/commands/defrem.h | 1 + src/include/parser/parse_utilcmd.h | 3 +- src/test/regress/expected/alter_table.out | 8 - src/test/regress/expected/create_index.out | 6 + src/test/regress/expected/create_table.out | 12 - src/test/regress/expected/indexing.out | 293 +++++++++++++++++- src/test/regress/expected/insert_conflict.out | 2 +- .../regress/expected/insert_conflict_1.out | 2 +- src/test/regress/output/tablespace.source | 29 +- src/test/regress/sql/alter_table.sql | 2 - src/test/regress/sql/create_index.sql | 6 + src/test/regress/sql/create_table.sql | 8 - src/test/regress/sql/indexing.sql | 171 +++++++++- 25 files changed, 921 insertions(+), 152 deletions(-) diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 128b2e6c..e720c618 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -387,6 +387,7 @@ Boot_DeclareIndexStmt: stmt, $4, InvalidOid, + InvalidOid, false, false, false, @@ -433,6 +434,7 @@ Boot_DeclareUniqueIndexStmt: stmt, $5, InvalidOid, + InvalidOid, false, false, false, diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 81c91015..89c9a1ea 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -717,6 +717,8 @@ UpdateIndexRelation(Oid indexoid, * nonzero to specify a preselected OID. * parentIndexRelid: if creating an index partition, the OID of the * parent index; otherwise InvalidOid. + * parentConstraintId: if creating a constraint on a partition, the OID + * of the constraint in the parent; otherwise InvalidOid. * relFileNode: normally, pass InvalidOid to get new storage. May be * nonzero to attach an existing valid build. * indexInfo: same info executor uses to insert into the index @@ -748,6 +750,7 @@ UpdateIndexRelation(Oid indexoid, * (only if INDEX_CREATE_ADD_CONSTRAINT is set) * allow_system_table_mods: allow table to be a system catalog * is_internal: if true, post creation hook for new index + * constraintId: if not NULL, receives OID of created constraint * * Returns the OID of the created index. 
*/ @@ -756,6 +759,7 @@ index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, Oid parentIndexRelid, + Oid parentConstraintId, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -768,7 +772,8 @@ index_create(Relation heapRelation, bits16 flags, bits16 constr_flags, bool allow_system_table_mods, - bool is_internal) + bool is_internal, + Oid *constraintId) { Oid heapRelationId = RelationGetRelid(heapRelation); Relation pg_class; @@ -1015,6 +1020,7 @@ index_create(Relation heapRelation, if ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0) { char constraintType; + ObjectAddress localaddr; if (isprimary) constraintType = CONSTRAINT_PRIMARY; @@ -1028,14 +1034,17 @@ index_create(Relation heapRelation, constraintType = 0; /* keep compiler quiet */ } - index_constraint_create(heapRelation, + localaddr = index_constraint_create(heapRelation, indexRelationId, + parentConstraintId, indexInfo, indexRelationName, constraintType, constr_flags, allow_system_table_mods, is_internal); + if (constraintId) + *constraintId = localaddr.objectId; } else { @@ -1206,6 +1215,8 @@ index_create(Relation heapRelation, * * heapRelation: table owning the index (must be suitably locked by caller) * indexRelationId: OID of the index + * parentConstraintId: if constraint is on a partition, the OID of the + * constraint in the parent. * indexInfo: same info executor uses to insert into the index * constraintName: what it say (generally, should match name of index) * constraintType: one of CONSTRAINT_PRIMARY, CONSTRAINT_UNIQUE, or @@ -1223,6 +1234,7 @@ index_create(Relation heapRelation, ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, + Oid parentConstraintId, IndexInfo *indexInfo, const char *constraintName, char constraintType, @@ -1237,6 +1249,9 @@ index_constraint_create(Relation heapRelation, bool deferrable; bool initdeferred; bool mark_as_primary; + bool islocal; + bool noinherit; + int inhcount; deferrable = (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) != 0; initdeferred = (constr_flags & INDEX_CONSTR_CREATE_INIT_DEFERRED) != 0; @@ -1271,6 +1286,19 @@ index_constraint_create(Relation heapRelation, deleteDependencyRecordsForClass(RelationRelationId, indexRelationId, RelationRelationId, DEPENDENCY_AUTO); + if (OidIsValid(parentConstraintId)) + { + islocal = false; + inhcount = 1; + noinherit = false; + } + else + { + islocal = true; + inhcount = 0; + noinherit = true; + } + /* * Construct a pg_constraint entry. */ @@ -1298,9 +1326,9 @@ index_constraint_create(Relation heapRelation, NULL, /* no check constraint */ NULL, NULL, - true, /* islocal */ - 0, /* inhcount */ - true, /* noinherit */ + islocal, + inhcount, + noinherit, is_internal); /* @@ -1320,6 +1348,18 @@ index_constraint_create(Relation heapRelation, recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL); /* + * Also, if this is a constraint on a partition, mark it as depending + * on the constraint in the parent. + */ + if (OidIsValid(parentConstraintId)) + { + ObjectAddress parentConstr; + + ObjectAddressSet(parentConstr, ConstraintRelationId, parentConstraintId); + recordDependencyOn(&referenced, &parentConstr, DEPENDENCY_INTERNAL_AUTO); + } + + /* * If the constraint is deferrable, create the deferred uniqueness * checking trigger. (The trigger will be given an internal dependency on * the constraint by CreateTrigger.) 
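The per-partition constraint is tied to its parent's constraint through the DEPENDENCY_INTERNAL_AUTO record added above, so it can only go away together with the parent. A rough sketch of the intended behaviour, reusing the hypothetical objects from the earlier example (exact auto-generated constraint names and error messages may differ):

    ALTER TABLE parted_uniq ADD UNIQUE (a);
    -- each partition receives its own constraint, marked as inherited;
    -- dropping it directly on the partition is expected to be rejected:
    ALTER TABLE parted_uniq_p1 DROP CONSTRAINT parted_uniq_p1_a_key;
    -- (dropping the constraint on the parent instead would remove the
    -- per-partition constraints and their indexes as well)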
diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index fa6f8bc9..a1f21b1b 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -747,6 +747,43 @@ AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, heap_close(conRel, RowExclusiveLock); } +/* + * ConstraintSetParentConstraint + * Set a partition's constraint as child of its parent table's + * + * This updates the constraint's pg_constraint row to show it as inherited, and + * add a dependency to the parent so that it cannot be removed on its own. + */ +void +ConstraintSetParentConstraint(Oid childConstrId, Oid parentConstrId) +{ + Relation constrRel; + Form_pg_constraint constrForm; + HeapTuple tuple, + newtup; + ObjectAddress depender; + ObjectAddress referenced; + + constrRel = heap_open(ConstraintRelationId, RowExclusiveLock); + tuple = SearchSysCache1(CONSTROID, ObjectIdGetDatum(childConstrId)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for constraint %u", childConstrId); + newtup = heap_copytuple(tuple); + constrForm = (Form_pg_constraint) GETSTRUCT(newtup); + constrForm->conislocal = false; + constrForm->coninhcount++; + CatalogTupleUpdate(constrRel, &tuple->t_self, newtup); + ReleaseSysCache(tuple); + + ObjectAddressSet(referenced, ConstraintRelationId, parentConstrId); + ObjectAddressSet(depender, ConstraintRelationId, childConstrId); + + recordDependencyOn(&depender, &referenced, DEPENDENCY_INTERNAL_AUTO); + + heap_close(constrRel, RowExclusiveLock); +} + + /* * get_relation_constraint_oid * Find a constraint on the specified relation with the specified name. @@ -805,6 +842,46 @@ get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok) return conOid; } +/* + * Return the OID of the constraint associated with the given index in the + * given relation; or InvalidOid if no such index is catalogued. + */ +Oid +get_relation_idx_constraint_oid(Oid relationId, Oid indexId) +{ + Relation pg_constraint; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + Oid constraintId = InvalidOid; + + pg_constraint = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&key, + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relationId)); + scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, + true, NULL, 1, &key); + while ((tuple = systable_getnext(scan)) != NULL) + { + Form_pg_constraint constrForm; + + constrForm = (Form_pg_constraint) GETSTRUCT(tuple); + if (constrForm->conindid == indexId) + { + constraintId = HeapTupleGetOid(tuple); + break; + } + } + systable_endscan(scan); + + heap_close(pg_constraint, AccessShareLock); + return constraintId; +} + + /* * get_domain_constraint_oid * Find a constraint on the specified domain with the specified name. 
diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index a82b2037..95b0564c 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -392,13 +392,13 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, - InvalidOid, + InvalidOid, InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, - INDEX_CREATE_IS_PRIMARY, 0, true, true); + INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL); heap_close(toast_rel, NoLock); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 76701f4a..03245150 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -27,6 +27,7 @@ #include "catalog/indexing.h" #include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_constraint_fn.h" #include "catalog/pg_inherits.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" @@ -309,6 +310,8 @@ CheckIndexCompatible(Oid oldId, * nonzero to specify a preselected OID for the index. * 'parentIndexId': the OID of the parent index; InvalidOid if not the child * of a partitioned index. + * 'parentConstraintId': the OID of the parent constraint; InvalidOid if not + * the child of a constraint (only used when recursing) * 'is_alter_table': this is due to an ALTER rather than a CREATE operation. * 'check_rights': check for CREATE rights in namespace and tablespace. (This * should be true except when ALTER is deleting/recreating an index.) @@ -325,6 +328,7 @@ DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, Oid parentIndexId, + Oid parentConstraintId, bool is_alter_table, bool check_rights, bool check_not_in_use, @@ -339,6 +343,7 @@ DefineIndex(Oid relationId, Oid accessMethodId; Oid namespaceId; Oid tablespaceId; + Oid createdConstraintId = InvalidOid; List *indexColNames; Relation rel; Relation indexRelation; @@ -446,20 +451,11 @@ DefineIndex(Oid relationId, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot create index on partitioned table \"%s\" concurrently", RelationGetRelationName(rel)))); - if (stmt->unique) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create unique index on partitioned table \"%s\"", - RelationGetRelationName(rel)))); if (stmt->excludeOpNames) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot create exclusion constraints on partitioned table \"%s\"", RelationGetRelationName(rel)))); - if (stmt->primary || stmt->isconstraint) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create constraints on partitioned tables"))); } /* @@ -729,6 +725,85 @@ DefineIndex(Oid relationId, index_check_primary_key(rel, indexInfo, is_alter_table, stmt); /* + * If this table is partitioned and we're creating a unique index or a + * primary key, make sure that the indexed columns are part of the + * partition key. Otherwise it would be possible to violate uniqueness by + * putting values that ought to be unique in different partitions. + * + * We could lift this limitation if we had global indexes, but those have + * their own problems, so this is a useful feature combination. 
+ */ + if (partitioned && (stmt->unique || stmt->primary)) + { + PartitionKey key = rel->rd_partkey; + int i; + + /* + * A partitioned table can have unique indexes, as long as all the + * columns in the partition key appear in the unique key. A + * partition-local index can enforce global uniqueness iff the PK + * value completely determines the partition that a row is in. + * + * Thus, verify that all the columns in the partition key appear + * in the unique key definition. + */ + for (i = 0; i < key->partnatts; i++) + { + bool found = false; + int j; + const char *constraint_type; + + if (stmt->primary) + constraint_type = "PRIMARY KEY"; + else if (stmt->unique) + constraint_type = "UNIQUE"; + else if (stmt->excludeOpNames != NIL) + constraint_type = "EXCLUDE"; + else + { + elog(ERROR, "unknown constraint type"); + constraint_type = NULL; /* keep compiler quiet */ + } + + /* + * It may be possible to support UNIQUE constraints when partition + * keys are expressions, but is it worth it? Give up for now. + */ + if (key->partattrs[i] == 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported %s constraint with partition key definition", + constraint_type), + errdetail("%s constraints cannot be used when partition keys include expressions.", + constraint_type))); + + for (j = 0; j < indexInfo->ii_NumIndexAttrs; j++) + { + if (key->partattrs[i] == indexInfo->ii_KeyAttrNumbers[j]) + { + found = true; + break; + } + } + if (!found) + { + Form_pg_attribute att; + + att = TupleDescAttr(RelationGetDescr(rel), key->partattrs[i] - 1); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("insufficient columns in %s constraint definition", + constraint_type), + errdetail("%s constraint on table \"%s\" lacks column \"%s\" which is part of the partition key.", + constraint_type, RelationGetRelationName(rel), + NameStr(att->attname)))); + } + } + } + + + + /* * We disallow indexes on system columns other than OID. They would not * necessarily get updated correctly, and they don't seem useful anyway. */ @@ -825,12 +900,14 @@ DefineIndex(Oid relationId, indexRelationId = index_create(rel, indexRelationName, indexRelationId, parentIndexId, + parentConstraintId, stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, coloptions, reloptions, flags, constr_flags, - allowSystemTableMods, !check_rights); + allowSystemTableMods, !check_rights, + &createdConstraintId); ObjectAddressSet(address, RelationRelationId, indexRelationId); @@ -924,16 +1001,40 @@ DefineIndex(Oid relationId, opfamOids, attmap, maplen)) { + Oid cldConstrOid = InvalidOid; + /* - * Found a match. Attach index to parent and we're - * done, but keep lock till commit. + * Found a match. + * + * If this index is being created in the parent + * because of a constraint, then the child needs to + * have a constraint also, so look for one. If there + * is no such constraint, this index is no good, so + * keep looking. */ + if (createdConstraintId != InvalidOid) + { + cldConstrOid = + get_relation_idx_constraint_oid(childRelid, + cldidxid); + if (cldConstrOid == InvalidOid) + { + index_close(cldidx, lockmode); + continue; + } + } + + /* Attach index to parent and we're done. 
*/ IndexSetParentIndex(cldidx, indexRelationId); + if (createdConstraintId != InvalidOid) + ConstraintSetParentConstraint(cldConstrOid, + createdConstraintId); if (!IndexIsValid(cldidx->rd_index)) invalidate_parent = true; found = true; + /* keep lock till commit */ index_close(cldidx, NoLock); break; } @@ -964,6 +1065,7 @@ DefineIndex(Oid relationId, DefineIndex(childRelid, childStmt, InvalidOid, /* no predefined OID */ indexRelationId, /* this is our child */ + createdConstraintId, is_alter_table, check_rights, check_not_in_use, skip_build, quiet); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 989fb062..6f525512 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1282,17 +1282,20 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Relation idxRel = index_open(lfirst_oid(cell), AccessShareLock); AttrNumber *attmap; IndexStmt *idxstmt; + Oid constraintOid; attmap = convert_tuples_by_name_map(RelationGetDescr(rel), RelationGetDescr(parent), gettext_noop("could not convert row type")); idxstmt = generateClonedIndexStmt(NULL, RelationGetRelid(rel), idxRel, - attmap, RelationGetDescr(rel)->natts); + attmap, RelationGetDescr(rel)->natts, + &constraintOid); DefineIndex(RelationGetRelid(rel), idxstmt, InvalidOid, RelationGetRelid(idxRel), + constraintOid, false, false, false, false, false); index_close(idxRel, AccessShareLock); @@ -8611,6 +8614,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, stmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ true, /* is_alter_table */ check_rights, false, /* check_not_in_use - we did it already */ @@ -8659,6 +8663,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, partidxstmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ true, /* is_alter_table */ check_rights, /* check_rights */ false, /* check_not_in_use */ @@ -8727,6 +8732,15 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, Assert(OidIsValid(index_oid)); Assert(stmt->isconstraint); + /* + * Doing this on partitioned tables is not a simple feature to implement, + * so let's punt for now. 
+ */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ALTER TABLE / ADD CONSTRAINT USING INDEX is not supported on partitioned tables"))); + indexRel = index_open(index_oid, AccessShareLock); indexName = pstrdup(RelationGetRelationName(indexRel)); @@ -8774,6 +8788,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, address = index_constraint_create(rel, index_oid, + InvalidOid, indexInfo, constraintName, constraintType, @@ -17099,6 +17114,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) IndexInfo *info; AttrNumber *attmap; bool found = false; + Oid constraintOid; /* * Ignore indexes in the partitioned table other than partitioned @@ -17115,6 +17131,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) attmap = convert_tuples_by_name_map(RelationGetDescr(attachrel), RelationGetDescr(rel), gettext_noop("could not convert row type")); + constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* * Scan the list of existing indexes in the partition-to-be, and mark @@ -17123,6 +17140,8 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) */ for (i = 0; i < list_length(attachRelIdxs); i++) { + Oid cldConstrOid = InvalidOid; + /* does this index have a parent? if so, can't use it */ if (has_superclass(RelationGetRelid(attachrelIdxRels[i]))) continue; @@ -17135,8 +17154,26 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) attmap, RelationGetDescr(rel)->natts)) { + /* + * If this index is being created in the parent because of a + * constraint, then the child needs to have a constraint also, + * so look for one. If there is no such constraint, this + * index is no good, so keep looking. + */ + if (OidIsValid(constraintOid)) + { + cldConstrOid = + get_relation_idx_constraint_oid(RelationGetRelid(attachrel), + RelationGetRelid(attachrelIdxRels[i])); + /* no dice */ + if (!OidIsValid(cldConstrOid)) + continue; + } + /* bingo. */ IndexSetParentIndex(attachrelIdxRels[i], idx); + if (OidIsValid(constraintOid)) + ConstraintSetParentConstraint(cldConstrOid, constraintOid); found = true; break; } @@ -17149,12 +17186,15 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) if (!found) { IndexStmt *stmt; + Oid constraintOid; stmt = generateClonedIndexStmt(NULL, RelationGetRelid(attachrel), idxRel, attmap, - RelationGetDescr(rel)->natts); + RelationGetDescr(rel)->natts, + &constraintOid); DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, RelationGetRelid(idxRel), + constraintOid, true, false, false, false, false); } @@ -17422,6 +17462,8 @@ ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) bool found; int i; PartitionDesc partDesc; + Oid constraintOid, + cldConstrId = InvalidOid; /* * If this partition already has an index attached, refuse the operation. @@ -17477,8 +17519,34 @@ ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) RelationGetRelationName(parentIdx)), errdetail("The index definitions do not match."))); + /* + * If there is a constraint in the parent, make sure there is one + * in the child too. 
+ */ + constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(parentTbl), + RelationGetRelid(parentIdx)); + + if (OidIsValid(constraintOid)) + { + cldConstrId = get_relation_idx_constraint_oid(RelationGetRelid(partTbl), + partIdxId); + if (!OidIsValid(cldConstrId)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("The index \"%s\" belongs to a constraint in table \"%s\" but no constraint exists for index \"%s\".", + RelationGetRelationName(parentIdx), + RelationGetRelationName(parentTbl), + RelationGetRelationName(partIdx)))); + } + /* All good -- do it */ IndexSetParentIndex(partIdx, RelationGetRelid(parentIdx)); + if (OidIsValid(constraintOid)) + ConstraintSetParentConstraint(cldConstrId, constraintOid); + pfree(attmap); CommandCounterIncrement(); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 62db0557..cd374523 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1477,6 +1477,13 @@ transformOnConflictClause(ParseState *pstate, TargetEntry *te; int attno; + if (targetrel->rd_partdesc) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s cannot be applied to partitioned table \"%s\"", + "ON CONFLICT DO UPDATE", + RelationGetRelationName(targetrel)))); + /* * All INSERT expressions have been parsed, get ready for potentially * existing SET statements that need to be processed like an UPDATE. diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index cf1bc20a..5360c6a5 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1091,12 +1091,6 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) errmsg("primary key constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("primary key constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); /* FALL THRU */ case CONSTR_UNIQUE: @@ -1106,12 +1100,6 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) errmsg("unique constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("unique constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); if (constraint->keys == NIL) constraint->keys = list_make1(makeString(column->colname)); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); @@ -1208,12 +1196,6 @@ transformTableConstraint(CreateStmtContext *cxt, Constraint *constraint) errmsg("primary key constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("primary key constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); break; @@ -1224,12 +1206,6 @@ transformTableConstraint(CreateStmtContext *cxt, Constraint *constraint) errmsg("unique constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, 
constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("unique constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); break; @@ -1631,7 +1607,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla /* Build CREATE INDEX statement to recreate the parent_index */ index_stmt = generateClonedIndexStmt(cxt->relation, InvalidOid, parent_index, - attmap, tupleDesc->natts); + attmap, tupleDesc->natts, NULL); #ifdef __TBASE__ if(cxt->interval_child) @@ -1724,8 +1700,8 @@ transformOfType(CreateStmtContext *cxt, TypeName *ofTypename) */ IndexStmt * generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, - const AttrNumber *attmap, int attmap_length) -{// #lizard forgives + const AttrNumber *attmap, int attmap_length, Oid *constraintOid) +{ Oid source_relid = RelationGetRelid(source_idx); Form_pg_attribute *attrs = RelationGetDescr(source_idx)->attrs; HeapTuple ht_idxrel; @@ -1824,6 +1800,9 @@ generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, HeapTuple ht_constr; Form_pg_constraint conrec; + if (constraintOid) + *constraintOid = constraintId; + ht_constr = SearchSysCache1(CONSTROID, ObjectIdGetDatum(constraintId)); if (!HeapTupleIsValid(ht_constr)) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 1ef4a799..da28b820 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3748,6 +3748,7 @@ ProcessUtilitySlow(ParseState *pstate, stmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ @@ -3818,6 +3819,7 @@ ProcessUtilitySlow(ParseState *pstate, partidxstmt, InvalidOid, /* no predefined OID */ InvalidOid, + InvalidOid, /* no parent constraint */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 360d5954..79ff6e4e 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5196,6 +5196,40 @@ role => 1, section_pre_data => 1, }, }, + 'ALTER TABLE measurement PRIMARY KEY' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + create_order => 93, + create_sql => 'ALTER TABLE dump_test.measurement ADD PRIMARY KEY (city_id, logdate);', + regexp => qr/^ + \QALTER TABLE ONLY measurement\E \n^\s+ + \QADD CONSTRAINT measurement_pkey PRIMARY KEY (city_id, logdate);\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + only_dump_test_schema => 1, + pg_dumpall_dbprivs => 1, + schema_only => 1, + section_post_data => 1, + test_schema_plus_blobs => 1, + with_oids => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + role => 1, + section_pre_data => 1, }, }, + 'CREATE INDEX ... ON measurement_y2006_m2' => { all_runs => 1, catch_all => 'CREATE ... commands', @@ -5258,6 +5292,37 @@ section_pre_data => 1, test_schema_plus_blobs => 1, }, }, + 'ALTER INDEX ... ATTACH PARTITION (primary key)' => { + all_runs => 1, + catch_all => 'CREATE ... 
commands', + regexp => qr/^ + \QALTER INDEX dump_test.measurement_pkey ATTACH PARTITION dump_test_second_schema.measurement_y2006m2_pkey\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + 'CREATE VIEW test_view' => { all_runs => 1, catch_all => 'CREATE ... commands', diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3a7ed05f..c60ad12f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -115,6 +115,7 @@ extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, Oid parentIndexRelid, + Oid parentConstraintId, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -127,7 +128,8 @@ extern Oid index_create(Relation heapRelation, bits16 flags, bits16 constr_flags, bool allow_system_table_mods, - bool is_internal); + bool is_internal, + Oid *constraintId); #define INDEX_CONSTR_CREATE_MARK_AS_PRIMARY (1 << 0) #define INDEX_CONSTR_CREATE_DEFERRABLE (1 << 1) @@ -137,6 +139,7 @@ extern Oid index_create(Relation heapRelation, extern ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, + Oid parentConstraintId, IndexInfo *indexInfo, const char *constraintName, char constraintType, diff --git a/src/include/catalog/pg_constraint_fn.h b/src/include/catalog/pg_constraint_fn.h index 0086a640..544db00b 100644 --- a/src/include/catalog/pg_constraint_fn.h +++ b/src/include/catalog/pg_constraint_fn.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_constraint_fn.h - * prototypes for functions in catalog/pg_constraint.c + * prototypes for functions in catalog/pg_constraint.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,61 +22,64 @@ */ typedef enum ConstraintCategory { - CONSTRAINT_RELATION, - CONSTRAINT_DOMAIN, - CONSTRAINT_ASSERTION /* for future expansion */ + CONSTRAINT_RELATION, + CONSTRAINT_DOMAIN, + CONSTRAINT_ASSERTION /* for future expansion */ } ConstraintCategory; extern Oid CreateConstraintEntry(const char *constraintName, - Oid constraintNamespace, - char constraintType, - bool isDeferrable, - bool isDeferred, - bool isValidated, - Oid relId, - const int16 *constraintKey, - int constraintNKeys, - Oid domainId, - Oid indexRelId, - Oid foreignRelId, - const int16 *foreignKey, - const Oid *pfEqOp, - const Oid *ppEqOp, - const Oid *ffEqOp, - int foreignNKeys, - char foreignUpdateType, - char foreignDeleteType, - char foreignMatchType, - const Oid *exclOp, - Node *conExpr, - const char *conBin, - const char *conSrc, - bool conIsLocal, - int conInhCount, - bool conNoInherit, - bool is_internal); + Oid constraintNamespace, + char constraintType, + bool isDeferrable, + bool isDeferred, + bool isValidated, + Oid relId, + const int16 *constraintKey, + int constraintNKeys, + Oid domainId, + Oid indexRelId, + Oid foreignRelId, + const int16 *foreignKey, + const Oid *pfEqOp, + const Oid *ppEqOp, + const Oid *ffEqOp, + int foreignNKeys, + char foreignUpdateType, + char foreignDeleteType, + char 
foreignMatchType, + const Oid *exclOp, + Node *conExpr, + const char *conBin, + const char *conSrc, + bool conIsLocal, + int conInhCount, + bool conNoInherit, + bool is_internal); extern void RemoveConstraintById(Oid conId); extern void RenameConstraintById(Oid conId, const char *newname); extern void SetValidatedConstraintById(Oid conId); extern bool ConstraintNameIsUsed(ConstraintCategory conCat, Oid objId, - Oid objNamespace, const char *conname); + Oid objNamespace, const char *conname); extern char *ChooseConstraintName(const char *name1, const char *name2, - const char *label, Oid namespaceid, - List *others); + const char *label, Oid namespaceid, + List *others); extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, - Oid newNspId, bool isType, ObjectAddresses *objsMoved); -extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); -extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); + Oid newNspId, bool isType, ObjectAddresses *objsMoved); +extern void ConstraintSetParentConstraint(Oid childConstrId, + Oid parentConstrId); +extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); +extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); +extern Oid get_relation_idx_constraint_oid(Oid relationId, Oid indexId); extern Bitmapset *get_primary_key_attnos(Oid relid, bool deferrableOk, - Oid *constraintOid); + Oid *constraintOid); extern bool check_functional_grouping(Oid relid, - Index varno, Index varlevelsup, - List *grouping_columns, - List **constraintDeps); + Index varno, Index varlevelsup, + List *grouping_columns, + List **constraintDeps); -#endif /* PG_CONSTRAINT_FN_H */ +#endif /* PG_CONSTRAINT_FN_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 377f9f94..1d3959b2 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -26,6 +26,7 @@ extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, Oid parentIndexId, + Oid parentConstraintId, bool is_alter_table, bool check_rights, bool check_not_in_use, diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index e527a119..6cb25dbc 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -103,6 +103,7 @@ extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation PartitionBoundSpec *spec); extern IndexStmt *generateClonedIndexStmt(RangeVar *heapRel, Oid heapOid, Relation source_idx, - const AttrNumber *attmap, int attmap_length); + const AttrNumber *attmap, int attmap_length, + Oid *constraintOid); #endif /* PARSE_UTILCMD_H */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 737710bc..50f89f13 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3181,14 +3181,6 @@ CREATE TABLE partitioned ( a int, b int ) PARTITION BY RANGE (a, (a+b+1)); -ALTER TABLE partitioned ADD UNIQUE (a); -ERROR: unique constraints are not supported on partitioned tables -LINE 1: ALTER TABLE partitioned ADD UNIQUE (a); - ^ -ALTER TABLE partitioned ADD PRIMARY KEY (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: ALTER TABLE partitioned ADD PRIMARY KEY (a); - ^ ALTER TABLE partitioned ADD FOREIGN KEY (a) REFERENCES blah; ERROR: foreign key constraints are not supported on partitioned tables LINE 1: ALTER TABLE 
partitioned ADD FOREIGN KEY (a) REFERENCES blah; diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index dd727bc7..50c501c1 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2701,6 +2701,12 @@ DROP INDEX cwi_replaced_pkey; -- Should fail; a constraint depends on it ERROR: cannot drop index cwi_replaced_pkey because constraint cwi_replaced_pkey on table cwi_test requires it HINT: You can drop constraint cwi_replaced_pkey on table cwi_test instead. DROP TABLE cwi_test; +-- ADD CONSTRAINT USING INDEX is forbidden on partitioned tables +CREATE TABLE cwi_test(a int) PARTITION BY hash (a); +create unique index on cwi_test (a); +alter table cwi_test add primary key using index cwi_test_a_idx ; +ERROR: ALTER TABLE / ADD CONSTRAINT USING INDEX is not supported on partitioned tables +DROP TABLE cwi_test; -- -- Check handling of indexes on system columns -- diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 4f679633..26d364a5 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -279,12 +279,6 @@ CREATE TABLE partitioned ( ) PARTITION BY LIST (a1, a2); -- fail ERROR: cannot use "list" partition strategy with more than one column -- unsupported constraint type for partitioned tables -CREATE TABLE partitioned ( - a int PRIMARY KEY -) PARTITION BY RANGE (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 2: a int PRIMARY KEY - ^ CREATE TABLE pkrel ( a int PRIMARY KEY ); @@ -295,12 +289,6 @@ ERROR: foreign key constraints are not supported on partitioned tables LINE 2: a int REFERENCES pkrel(a) ^ DROP TABLE pkrel; -CREATE TABLE partitioned ( - a int UNIQUE -) PARTITION BY RANGE (a); -ERROR: unique constraints are not supported on partitioned tables -LINE 2: a int UNIQUE - ^ CREATE TABLE partitioned ( a int, EXCLUDE USING gist (a WITH &&) diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index f996a88d..4cd0596f 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -26,8 +26,6 @@ drop table idxpart; -- Some unsupported features create table idxpart (a int, b int, c text) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (10); -create unique index on idxpart (a); -ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); ERROR: PGXC does not support concurrent INDEX yet DETAIL: The feature is not currently supported @@ -759,6 +757,295 @@ select attrelid::regclass, attname, attnum from pg_attribute idxpart_col_keep_idx | col_keep | 1 (7 rows) +drop table idxpart; +-- +-- Constraint-related indexes +-- +-- Verify that it works to add primary key / unique to partitioned tables +create table idxpart (a int primary key, b int) partition by range (a); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | +Partition key: RANGE (a) +Indexes: + "idxpart_pkey" PRIMARY KEY, btree (a) +Number of partitions: 0 + +drop table idxpart; +-- but not if you fail to use the full partition key +create table idxpart (a int unique, b int) partition by range (a, b); +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks 
column "b" which is part of the partition key. +create table idxpart (a int, b int unique) partition by range (a, b); +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks column "a" which is part of the partition key. +create table idxpart (a int primary key, b int) partition by range (b, a); +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "b" which is part of the partition key. +create table idxpart (a int, b int primary key) partition by range (b, a); +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "a" which is part of the partition key. +-- OK if you use them in some other order +create table idxpart (a int, b int, c text, primary key (a, b, c)) partition by range (b, c, a); +drop table idxpart; +-- not other types of index-based constraints +create table idxpart (a int, exclude (a with = )) partition by range (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: create table idxpart (a int, exclude (a with = )) partition ... + ^ +-- no expressions in partition key for PK/UNIQUE +create table idxpart (a int primary key, b int) partition by range ((b + a)); +ERROR: unsupported PRIMARY KEY constraint with partition key definition +DETAIL: PRIMARY KEY constraints cannot be used when partition keys include expressions. +create table idxpart (a int unique, b int) partition by range ((b + a)); +ERROR: unsupported UNIQUE constraint with partition key definition +DETAIL: UNIQUE constraints cannot be used when partition keys include expressions. +-- use ALTER TABLE to add a primary key +create table idxpart (a int, b int, c text) partition by range (a, b); +alter table idxpart add primary key (a); -- not an incomplete one though +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "b" which is part of the partition key. +alter table idxpart add primary key (a, b); -- this works +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | not null | + c | text | | | +Partition key: RANGE (a, b) +Indexes: + "idxpart_pkey" PRIMARY KEY, btree (a, b) +Number of partitions: 0 + +create table idxpart1 partition of idxpart for values from (0, 0) to (1000, 1000); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | not null | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0, 0) TO (1000, 1000) +Indexes: + "idxpart1_pkey" PRIMARY KEY, btree (a, b) + +drop table idxpart; +-- use ALTER TABLE to add a unique constraint +create table idxpart (a int, b int) partition by range (a, b); +alter table idxpart add unique (a); -- not an incomplete one though +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks column "b" which is part of the partition key. 
+alter table idxpart add unique (b, a); -- this works +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition key: RANGE (a, b) +Indexes: + "idxpart_b_a_key" UNIQUE CONSTRAINT, btree (b, a) +Number of partitions: 0 + +drop table idxpart; +-- Exclusion constraints cannot be added +create table idxpart (a int, b int) partition by range (a); +alter table idxpart add exclude (a with =); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: alter table idxpart add exclude (a with =); + ^ +drop table idxpart; +-- When (sub)partitions are created, they also contain the constraint +create table idxpart (a int, b int, primary key (a, b)) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (1, 1) to (10, 10); +create table idxpart2 partition of idxpart for values from (10, 10) to (20, 20) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (10) to (15); +create table idxpart22 partition of idxpart2 for values from (15) to (20); +create table idxpart3 (a int not null, b int not null); +alter table idxpart attach partition idxpart3 for values from (20, 20) to (30, 30); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; + conname | contype | conrelid | conindid | conkey +----------------+---------+-----------+----------------+-------- + idxpart1_pkey | p | idxpart1 | idxpart1_pkey | {1,2} + idxpart21_pkey | p | idxpart21 | idxpart21_pkey | {1,2} + idxpart22_pkey | p | idxpart22 | idxpart22_pkey | {1,2} + idxpart2_pkey | p | idxpart2 | idxpart2_pkey | {1,2} + idxpart3_pkey | p | idxpart3 | idxpart3_pkey | {1,2} + idxpart_pkey | p | idxpart | idxpart_pkey | {1,2} +(6 rows) + +drop table idxpart; +-- Verify that multi-layer partitioning honors the requirement that all +-- columns in the partition key must appear in primary key +create table idxpart (a int, b int, primary key (a)) partition by range (a); +create table idxpart2 partition of idxpart +for values from (0) to (1000) partition by range (b); -- fail +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart2" lacks column "b" which is part of the partition key. +drop table idxpart; +-- Multi-layer partitioning works correctly in this case: +create table idxpart (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart2 partition of idxpart for values from (0) to (1000) partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (1000); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; + conname | contype | conrelid | conindid | conkey +----------------+---------+-----------+----------------+-------- + idxpart21_pkey | p | idxpart21 | idxpart21_pkey | {1,2} + idxpart2_pkey | p | idxpart2 | idxpart2_pkey | {1,2} + idxpart_pkey | p | idxpart | idxpart_pkey | {1,2} +(3 rows) + +drop table idxpart; +-- If a partitioned table has a unique/PK constraint, then it's not possible +-- to drop the corresponding constraint in the children; nor it's possible +-- to drop the indexes individually. Dropping the constraint in the parent +-- gets rid of the lot. 
+create table idxpart (i int) partition by hash (i); +create table idxpart0 partition of idxpart (i) for values with (modulus 2, remainder 0); +create table idxpart1 partition of idxpart (i) for values with (modulus 2, remainder 1); +alter table idxpart0 add primary key(i); +alter table idxpart add primary key(i); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | idxpart_pkey | t | idxpart0_pkey | f | 1 | t | t + idxpart1 | idxpart1_pkey | idxpart_pkey | t | idxpart1_pkey | f | 1 | f | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(3 rows) + +drop index idxpart0_pkey; -- fail +ERROR: cannot drop index idxpart0_pkey because index idxpart_pkey requires it +HINT: You can drop index idxpart_pkey instead. +drop index idxpart1_pkey; -- fail +ERROR: cannot drop index idxpart1_pkey because index idxpart_pkey requires it +HINT: You can drop index idxpart_pkey instead. +alter table idxpart0 drop constraint idxpart0_pkey; -- fail +ERROR: cannot drop inherited constraint "idxpart0_pkey" of relation "idxpart0" +alter table idxpart1 drop constraint idxpart1_pkey; -- fail +ERROR: cannot drop inherited constraint "idxpart1_pkey" of relation "idxpart1" +alter table idxpart drop constraint idxpart_pkey; -- ok +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+------------+-----------+------------+---------+------------+-------------+--------------+-------------- +(0 rows) + +drop table idxpart; +-- If a partitioned table has a constraint whose index is not valid, +-- attaching a missing partition makes it valid. 
+create table idxpart (a int) partition by range (a); +create table idxpart0 (like idxpart); +alter table idxpart0 add primary key (a); +alter table idxpart attach partition idxpart0 for values from (0) to (1000); +alter table only idxpart add primary key (a); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+-----------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | | t | idxpart0_pkey | t | 0 | t | t + idxpart | idxpart_pkey | | f | idxpart_pkey | t | 0 | t | t +(2 rows) + +alter index idxpart_pkey attach partition idxpart0_pkey; +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | idxpart_pkey | t | idxpart0_pkey | f | 1 | t | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(2 rows) + +drop table idxpart; +-- if a partition has a unique index without a constraint, does not attach +-- automatically; creates a new index instead. 
+create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart add primary key (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+----------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart1 | idxpart1_a_idx | | t | | | | | + idxpart1 | idxpart1_pkey | idxpart_pkey | t | idxpart1_pkey | f | 1 | f | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(3 rows) + +drop table idxpart; +-- Can't attach an index without a corresponding constraint +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +alter table only idxpart add primary key (a); +alter index idxpart_pkey attach partition idxpart1_a_idx; -- fail +ERROR: cannot attach index "idxpart1_a_idx" as a partition of index "idxpart_pkey" +DETAIL: The index "idxpart_pkey" belongs to a constraint in table "idxpart" but no constraint exists for index "idxpart1_a_idx". +drop table idxpart; +-- Test that unique constraints are working +create table idxpart (a int, b text, primary key (a, b)) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100000); +create table idxpart2 (like idxpart); +insert into idxpart2 (a, b) values (572814, 'inserted first'); +create unique index on idxpart (a); +alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); +insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; +ERROR: duplicate key value violates unique constraint "idxpart1_a_idx" +DETAIL: Key (a)=(65536) already exists. +insert into idxpart values (16, 'sixteen'); +insert into idxpart (b, a) values ('one', 142857), ('two', 285714); +insert into idxpart select a * 2, b || b from idxpart where a between 2^16 and 2^19; +ERROR: duplicate key value violates unique constraint "idxpart2_a_idx" +DETAIL: Key (a)=(285714) already exists. +insert into idxpart values (572814, 'five'); +ERROR: duplicate key value violates unique constraint "idxpart2_a_idx" +DETAIL: Key (a)=(572814) already exists. 
+insert into idxpart values (857142, 'six'); +select tableoid::regclass, * from idxpart order by a; + tableoid | a | b +----------+--------+---------------- + idxpart1 | 0 | zero + idxpart1 | 16 | sixteen + idxpart1 | 42 | life + idxpart1 | 65536 | sixteen + idxpart2 | 142857 | one + idxpart2 | 285714 | two + idxpart2 | 572814 | inserted first + idxpart2 | 857142 | six +(8 rows) + drop table idxpart; -- intentionally leave some objects around create table idxpart (a int) partition by range (a); @@ -771,3 +1058,5 @@ create index on idxpart22 (a); create index on only idxpart2 (a); alter index idxpart2_a_idx attach partition idxpart22_a_idx; create index on idxpart (a); +create table idxpart_another (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart_another_1 partition of idxpart_another for values from (0) to (100); diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index f10974de..dcd26834 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -822,7 +822,7 @@ insert into parted_conflict_test values (1, 'a') on conflict do nothing; insert into parted_conflict_test values (1, 'a') on conflict do nothing; -- however, on conflict do update is not supported yet insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; -ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification +ERROR: ON CONFLICT DO UPDATE cannot be applied to partitioned table "parted_conflict_test" -- but it works OK if we target the partition directly insert into parted_conflict_test_1 values (1) on conflict (b) do update set a = excluded.a; diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 40048bfb..5b13fc64 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -843,7 +843,7 @@ ERROR: no partition of relation "parted_conflict_test" found for row DETAIL: Partition key of the failing row contains (a) = (1). -- however, on conflict do update is not supported yet insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; -ERROR: Distributed column or partition column "a" can't be updated in current version +ERROR: ON CONFLICT DO UPDATE cannot be applied to partitioned table "parted_conflict_test" -- but it works OK if we target the partition directly insert into parted_conflict_test_1 values (1) on conflict (b) do update set a = excluded.a; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 1a5dc4d1..b78953ac 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -132,34 +132,19 @@ Tablespace: "regress_tblspace" -- partitioned rels cannot specify the primary key. These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... - ^ +ERROR: cannot specify default tablespace for partitioned relations CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... 
- ^ +ERROR: cannot specify default tablespace for partitioned relation SET default_tablespace TO 'pg_default'; CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... - ^ +ERROR: cannot specify default tablespace for partitioned relations CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... - ^ +ERROR: cannot specify default tablespace for partitioned relations -- but these work: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... - ^ SET default_tablespace TO ''; CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION ... - ^ DROP TABLE testschema.dflt, testschema.dflt2; -ERROR: table "dflt" does not exist -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -277,13 +262,7 @@ INSERT INTO testschema.test_default_tab_p VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab_p (val); CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... - ^ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; -ERROR: unique constraints are not supported on partitioned tables -LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... 
- ^ \d testschema.test_index1 Index "testschema.test_index1" Column | Type | Definition diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index c0e41d0f..daa8f09d 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1986,8 +1986,6 @@ CREATE TABLE partitioned ( a int, b int ) PARTITION BY RANGE (a, (a+b+1)); -ALTER TABLE partitioned ADD UNIQUE (a); -ALTER TABLE partitioned ADD PRIMARY KEY (a); ALTER TABLE partitioned ADD FOREIGN KEY (a) REFERENCES blah; ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index ac87a957..14492a24 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -833,6 +833,12 @@ DROP INDEX cwi_replaced_pkey; -- Should fail; a constraint depends on it DROP TABLE cwi_test; +-- ADD CONSTRAINT USING INDEX is forbidden on partitioned tables +CREATE TABLE cwi_test(a int) PARTITION BY hash (a); +create unique index on cwi_test (a); +alter table cwi_test add primary key using index cwi_test_a_idx ; +DROP TABLE cwi_test; + -- -- Check handling of indexes on system columns -- diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index d00a5935..82f1a87b 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -296,10 +296,6 @@ CREATE TABLE partitioned ( ) PARTITION BY LIST (a1, a2); -- fail -- unsupported constraint type for partitioned tables -CREATE TABLE partitioned ( - a int PRIMARY KEY -) PARTITION BY RANGE (a); - CREATE TABLE pkrel ( a int PRIMARY KEY ); @@ -308,10 +304,6 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (a); DROP TABLE pkrel; -CREATE TABLE partitioned ( - a int UNIQUE -) PARTITION BY RANGE (a); - CREATE TABLE partitioned ( a int, EXCLUDE USING gist (a WITH &&) diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 130ee7cc..8f5a33e9 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -15,7 +15,6 @@ drop table idxpart; -- Some unsupported features create table idxpart (a int, b int, c text) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (10); -create unique index on idxpart (a); create index concurrently on idxpart (a); drop table idxpart; @@ -378,6 +377,174 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; drop table idxpart; +-- +-- Constraint-related indexes +-- + +-- Verify that it works to add primary key / unique to partitioned tables +create table idxpart (a int primary key, b int) partition by range (a); +\d idxpart +drop table idxpart; + +-- but not if you fail to use the full partition key +create table idxpart (a int unique, b int) partition by range (a, b); +create table idxpart (a int, b int unique) partition by range (a, b); +create table idxpart (a int primary key, b int) partition by range (b, a); +create table idxpart (a int, b int primary key) partition by range (b, a); + +-- OK if you use them in some other order +create table idxpart (a int, b int, c text, primary key (a, b, c)) partition by range (b, c, a); +drop table idxpart; + +-- not other types of index-based constraints +create table idxpart (a int, exclude (a with = )) partition by range (a); + +-- no expressions in partition key for PK/UNIQUE +create table idxpart (a int primary key, b int) partition by range ((b + 
a)); +create table idxpart (a int unique, b int) partition by range ((b + a)); + +-- use ALTER TABLE to add a primary key +create table idxpart (a int, b int, c text) partition by range (a, b); +alter table idxpart add primary key (a); -- not an incomplete one though +alter table idxpart add primary key (a, b); -- this works +\d idxpart +create table idxpart1 partition of idxpart for values from (0, 0) to (1000, 1000); +\d idxpart1 +drop table idxpart; + +-- use ALTER TABLE to add a unique constraint +create table idxpart (a int, b int) partition by range (a, b); +alter table idxpart add unique (a); -- not an incomplete one though +alter table idxpart add unique (b, a); -- this works +\d idxpart +drop table idxpart; + +-- Exclusion constraints cannot be added +create table idxpart (a int, b int) partition by range (a); +alter table idxpart add exclude (a with =); +drop table idxpart; + +-- When (sub)partitions are created, they also contain the constraint +create table idxpart (a int, b int, primary key (a, b)) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (1, 1) to (10, 10); +create table idxpart2 partition of idxpart for values from (10, 10) to (20, 20) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (10) to (15); +create table idxpart22 partition of idxpart2 for values from (15) to (20); +create table idxpart3 (a int not null, b int not null); +alter table idxpart attach partition idxpart3 for values from (20, 20) to (30, 30); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; +drop table idxpart; + +-- Verify that multi-layer partitioning honors the requirement that all +-- columns in the partition key must appear in primary key +create table idxpart (a int, b int, primary key (a)) partition by range (a); +create table idxpart2 partition of idxpart +for values from (0) to (1000) partition by range (b); -- fail +drop table idxpart; + +-- Multi-layer partitioning works correctly in this case: +create table idxpart (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart2 partition of idxpart for values from (0) to (1000) partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (1000); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; +drop table idxpart; + +-- If a partitioned table has a unique/PK constraint, then it's not possible +-- to drop the corresponding constraint in the children; nor it's possible +-- to drop the indexes individually. Dropping the constraint in the parent +-- gets rid of the lot. 
+create table idxpart (i int) partition by hash (i); +create table idxpart0 partition of idxpart (i) for values with (modulus 2, remainder 0); +create table idxpart1 partition of idxpart (i) for values with (modulus 2, remainder 1); +alter table idxpart0 add primary key(i); +alter table idxpart add primary key(i); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop index idxpart0_pkey; -- fail +drop index idxpart1_pkey; -- fail +alter table idxpart0 drop constraint idxpart0_pkey; -- fail +alter table idxpart1 drop constraint idxpart1_pkey; -- fail +alter table idxpart drop constraint idxpart_pkey; -- ok +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- If a partitioned table has a constraint whose index is not valid, +-- attaching a missing partition makes it valid. +create table idxpart (a int) partition by range (a); +create table idxpart0 (like idxpart); +alter table idxpart0 add primary key (a); +alter table idxpart attach partition idxpart0 for values from (0) to (1000); +alter table only idxpart add primary key (a); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +alter index idxpart_pkey attach partition idxpart0_pkey; +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- if a partition has a unique index without a constraint, does not attach +-- automatically; creates a new index instead. 
+create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart add primary key (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- Can't attach an index without a corresponding constraint +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +alter table only idxpart add primary key (a); +alter index idxpart_pkey attach partition idxpart1_a_idx; -- fail +drop table idxpart; + +-- Test that unique constraints are working +create table idxpart (a int, b text, primary key (a, b)) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100000); +create table idxpart2 (like idxpart); +insert into idxpart2 (a, b) values (572814, 'inserted first'); +create unique index on idxpart (a); +alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); +insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; +insert into idxpart values (16, 'sixteen'); +insert into idxpart (b, a) values ('one', 142857), ('two', 285714); +insert into idxpart select a * 2, b || b from idxpart where a between 2^16 and 2^19; +insert into idxpart values (572814, 'five'); +insert into idxpart values (857142, 'six'); +select tableoid::regclass, * from idxpart order by a; +drop table idxpart; + -- intentionally leave some objects around create table idxpart (a int) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (100); @@ -389,3 +556,5 @@ create index on idxpart22 (a); create index on only idxpart2 (a); alter index idxpart2_a_idx attach partition idxpart22_a_idx; create index on idxpart (a); +create table idxpart_another (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart_another_1 partition of idxpart_another for values from (0) to (100); From e097352eb5e8ed997c83bd3029ff718226870d0d Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 18 Mar 2021 12:26:51 +0000 Subject: [PATCH 310/578] Solve the issue of hash index on coordinator. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085729301&url_cache_key=d4e1402777dc733479aac463ad1a9d24 (cherry picked from commit 3bb2732c) 74fa9796 Solve the issue of hash index on coordinator. 
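For illustration only (the table and index names below are hypothetical and not part of this change): building a hash index from the coordinator, e.g.

    CREATE TABLE t_hash (id int);
    CREATE INDEX ON t_hash USING hash (id);

previously fed the heap's size estimates into the index build even though the coordinator stores no rows; with this change the coordinator-side build uses zeroed estimates (relpages = reltuples = allvisfrac = 0) before initializing the hash metadata page.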
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085729301&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/access/hash/hash.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index b7e21348..82d6e924 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -120,7 +120,15 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); /* Estimate the number of rows currently present in the table */ + if (IS_PGXC_COORDINATOR) + { + /* Coordinator has no data */ + relpages = reltuples = allvisfrac = 0; + } + else + { estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); + } /* Initialize the hash index metadata page and initial buckets */ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); From 73937edc074f05e41ddb4e4ea78ae012005d085c Mon Sep 17 00:00:00 2001 From: gregsun Date: Tue, 22 Dec 2020 23:27:59 +0800 Subject: [PATCH 311/578] Patch from PostgreSQL - pg_dump keeps finding loop due to partition indexes loop dependency. http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084240131 commit 8cff4f5348d075e063100071013f00a900c32b0f Author: Tom Lane Date: Tue Aug 28 19:33:04 2018 -0400 Code review for pg_dump's handling of ALTER INDEX ATTACH PARTITION. Ensure the TOC entry is marked with the correct schema, so that its name is as unique as the index's is. Fix the dependencies: we want dependencies from this TOC entry to the two indexes it depends on, and we don't care (at least not for this purpose) what order the indexes are created in. Also, add dependencies on the indexes' underlying tables. Those might seem pointless given the index dependencies, but they are helpful to cue parallel restore to avoid running the ATTACH PARTITION in parallel with other DDL on the same tables. Discussion: https://postgr.es/m/10817.1535494963@sss.pgh.pa.us --- src/bin/pg_dump/common.c | 24 +++++++++++++++++++----- src/bin/pg_dump/pg_dump.c | 5 +++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index 0942fa5b..21fdcc52 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -403,17 +403,31 @@ flagInhIndexes(Archive *fout, TableInfo tblinfo[], int numTables) attachinfo[k].dobj.catId.oid = 0; AssignDumpId(&attachinfo[k].dobj); attachinfo[k].dobj.name = pg_strdup(index->dobj.name); + attachinfo[k].dobj.namespace = index->indextable->dobj.namespace; attachinfo[k].parentIdx = parentidx; attachinfo[k].partitionIdx = index; /* - * We want dependencies from parent to partition (so that the - * partition index is created first), and another one from - * attach object to parent (so that the partition index is - * attached once the parent index has been created). + * We must state the DO_INDEX_ATTACH object's dependencies + * explicitly, since it will not match anything in pg_depend. + * + * Give it dependencies on both the partition index and the parent + * index, so that it will not be executed till both of those + * exist. (There's no need to care what order those are created + * in.) + * + * In addition, give it dependencies on the indexes' underlying + * tables. This does nothing of great value so far as serial + * restore ordering goes, but it ensures that a parallel restore + * will not try to run the ATTACH concurrently with other + * operations on those tables. 
*/ - addObjectDependency(&parentidx->dobj, index->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, index->dobj.dumpId); addObjectDependency(&attachinfo[k].dobj, parentidx->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, + index->indextable->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, + parentidx->indextable->dobj.dumpId); k++; } diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index b11b02ae..89685c9f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -16591,7 +16591,7 @@ dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) { PQExpBuffer q = createPQExpBuffer(); - appendPQExpBuffer(q, "\nALTER INDEX %s ", + appendPQExpBuffer(q, "ALTER INDEX %s ", fmtQualifiedId(fout->remoteVersion, attachinfo->parentIdx->dobj.namespace->dobj.name, attachinfo->parentIdx->dobj.name)); @@ -16602,7 +16602,8 @@ dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) ArchiveEntry(fout, attachinfo->dobj.catId, attachinfo->dobj.dumpId, attachinfo->dobj.name, - NULL, NULL, + attachinfo->dobj.namespace->dobj.name, + NULL, "", false, "INDEX ATTACH", SECTION_POST_DATA, q->data, "", NULL, From 7c3781072d85468101d69278a93e95047619e833 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 28 Apr 2021 17:36:05 +0800 Subject: [PATCH 312/578] Optimize distinct agg. Do distinct on datanodes,then agg can run parallel. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086201101&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/executor/nodeAgg.c | 14 +-- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/plan/createplan.c | 7 +- src/backend/optimizer/plan/planner.c | 153 ++++++++++++++++++++++-- src/backend/optimizer/plan/setrefs.c | 1 - src/backend/optimizer/util/pathnode.c | 86 +++++++++++++ src/backend/optimizer/util/tlist.c | 25 ++++ src/backend/utils/misc/guc.c | 15 +++ src/include/nodes/plannodes.h | 1 + src/include/nodes/relation.h | 1 + src/include/optimizer/pathnode.h | 3 + src/include/optimizer/tlist.h | 30 +++-- src/test/regress/expected/sysviews.out | 3 +- 15 files changed, 309 insertions(+), 34 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 8b1695a6..0d0185c9 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -4624,6 +4624,7 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, int numDistinctCols; int naggs; int i; + Agg *agg = (Agg *)aggstate->ss.ps.plan; /* Begin filling in the pertrans data */ pertrans->aggref = aggref; @@ -4785,11 +4786,13 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, * have a list of SortGroupClause nodes; fish out the data in them and * stick them into arrays. We ignore ORDER BY for an ordered-set agg, * however; the agg's transfn and finalfn are responsible for that. + * Distributed distinct agg does not need distinct in second phase. * * Note that by construction, if there is a DISTINCT clause then the ORDER * BY clause is a prefix of it (see transformDistinctClause). 
*/ - if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind) + || agg->noDistinct) { sortlist = NIL; numSortCols = numDistinctCols = 0; @@ -4820,12 +4823,6 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, pertrans->sortslot = ExecInitExtraTupleSlot(estate); ExecSetSlotDescriptor(pertrans->sortslot, pertrans->sortdesc); - /* - * We don't implement DISTINCT or ORDER BY aggs in the HASHED case - * (yet) - */ - Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED); - /* If we have only one input, we need its len/byval info. */ if (numInputs == 1) { @@ -4869,7 +4866,8 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, Assert(i == numSortCols); } - if (aggref->aggdistinct) + /* Distributed distinct agg does not need distinct in second phase. */ + if (aggref->aggdistinct && !agg->noDistinct) { Assert(numArguments > 0); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 8bc360f1..ed04f1d4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1047,6 +1047,7 @@ _copyAgg(const Agg *from) #ifdef __TBASE__ COPY_SCALAR_FIELD(entrySize); COPY_SCALAR_FIELD(hybrid); + COPY_SCALAR_FIELD(noDistinct); #endif return newnode; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7df4571b..ac7ea190 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1330,6 +1330,7 @@ _outAgg(StringInfo str, const Agg *node) #ifdef __TBASE__ WRITE_UINT_FIELD(entrySize); WRITE_BOOL_FIELD(hybrid); + WRITE_BOOL_FIELD(noDistinct); #endif } @@ -3317,6 +3318,7 @@ _outAggPath(StringInfo str, const AggPath *node) #ifdef __TBASE__ WRITE_UINT_FIELD(entrySize); WRITE_BOOL_FIELD(hybrid); + WRITE_BOOL_FIELD(noDistinct); #endif } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 72e9a6fa..b13796e7 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3279,6 +3279,7 @@ _readAgg(void) #ifdef __TBASE__ READ_UINT_FIELD(entrySize); READ_BOOL_FIELD(hybrid); + READ_BOOL_FIELD(noDistinct); #endif READ_DONE(); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 706b3340..13c8ec79 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2331,6 +2331,8 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) } } } + + plan->noDistinct = best_path->noDistinct; #endif return plan; @@ -6513,7 +6515,9 @@ make_remotesubplan(PlannerInfo *root, { Agg *node = (Agg *)lefttree; - if (node->aggsplit == AGGSPLIT_INITIAL_SERIAL) + /* do not parallel if it's not safe */ + if (node->aggsplit == AGGSPLIT_INITIAL_SERIAL + && lefttree->parallel_safe) { switch(node->aggstrategy) { @@ -7999,6 +8003,7 @@ make_agg(List *tlist, List *qual, #ifdef __TBASE__ node->hybrid = false; node->entrySize = 0; + node->noDistinct = false; #endif plan->qual = qual; plan->targetlist = tlist; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 7ee9d475..fc87fcbc 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -79,6 +79,7 @@ create_upper_paths_hook_type create_upper_paths_hook = NULL; #ifdef __TBASE__ bool olap_optimizer = false; +bool enable_distinct_optimizer; #endif /* Expression kind codes for preprocess_expression */ @@ -208,7 +209,7 @@ static PathTarget *make_sort_input_target(PlannerInfo *root, PathTarget *final_target, bool *have_postponed_srfs); static bool 
grouping_distribution_match(PlannerInfo *root, Query *parse, - Path *path, List *clauses); + Path *path, List *clauses, List *targetList); static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path); static Path *adjust_path_distribution(PlannerInfo *root, Query *parse, @@ -239,6 +240,9 @@ static bool can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs); #ifdef __TBASE__ static Path *adjust_modifytable_subpath(PlannerInfo *root, Query *parse, Path *path); +static bool can_distinct_agg_optimize(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *pathtarget, + const AggClauseCosts *agg_costs); #endif /***************************************************************************** @@ -4170,6 +4174,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; + bool try_distributed_distinct_agg_optimize; PathTarget *partial_grouping_target = NULL; ListCell *lc; @@ -4241,6 +4246,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, { /* Not even parallel-safe. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } else if (!parse->hasAggs && parse->groupClause == NIL) { @@ -4249,26 +4255,44 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * some aggregates or a grouping clause. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } else if (parse->groupingSets) { /* We don't know how to do grouping sets in parallel. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } - else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) + else if (agg_costs->hasNonSerial) + { + /* Insufficient support for partial mode. */ + try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; + } + else if (agg_costs->hasNonPartial) { /* Insufficient support for partial mode. */ try_distributed_aggregation = false; + /* Ignore by distint agg optimize */ + try_distributed_distinct_agg_optimize = true; } else { /* Everything looks good. */ try_distributed_aggregation = true; + try_distributed_distinct_agg_optimize = true; } /* Whenever parallel aggregation is allowed, distributed should be too. */ Assert(!(try_parallel_aggregation && !try_distributed_aggregation)); + if (try_distributed_distinct_agg_optimize && + !can_distinct_agg_optimize(root, input_rel, grouped_rel, + target ,agg_costs)) + { + try_distributed_distinct_agg_optimize = false; + } + /* * Before generating paths for grouped_rel, we first generate any possible * partial paths; that way, later code can easily consider both parallel @@ -4383,7 +4407,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * we know the per-node groupings won't overlap. But here we need to be * more careful. 
*/ - if (try_distributed_aggregation) + if (try_distributed_aggregation || try_distributed_distinct_agg_optimize) { partial_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); @@ -4415,7 +4439,10 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_FINAL_DESERIAL, &agg_final_costs); } + } + if (try_distributed_aggregation) + { /* Build final XL grouping paths */ if (can_sort) { @@ -4762,6 +4789,57 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, } } + if (try_distributed_distinct_agg_optimize) + { + List *groupExprs = NIL; + Aggref *agg = get_optimize_distinct_agg(target); + + groupExprs = get_sortgrouplist_exprs(agg->aggdistinct, agg->args); + + dNumPartialGroups = estimate_num_groups(root, groupExprs, cheapest_path->rows, + NULL); + + foreach (lc, input_rel->pathlist) + { + Path *path = (Path *)lfirst(lc); + + /* check if we need redistribute */ + if (!grouping_distribution_match(root, parse, path, agg->aggdistinct, agg->args)) + { + path = create_redistribute_distinct_agg_path(root, parse, path, agg); + } + + path = (Path *)create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NULL, + &agg_partial_costs, + dNumPartialGroups); + /* partial is not parallel safe */ + path->parallel_safe = false; + + path = create_remotesubplan_path(root, path, NULL); + + path = (Path *)create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + NULL, + &agg_final_costs, + 1); + ((AggPath *)path)->noDistinct = true; + + add_path(grouped_rel, path); + } + } + /* Give a helpful error if we failed to find any implementation */ if (grouped_rel->pathlist == NIL) ereport(ERROR, @@ -5443,7 +5521,7 @@ create_distinct_paths(PlannerInfo *root, * FIXME This could probably benefit from pushing a UNIQUE * to the remote side, and only doing a merge locally. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, path, parse->distinctClause, parse->targetList)) path = create_remotesubplan_path(root, path, NULL); add_path(distinct_rel, (Path *) @@ -5474,7 +5552,7 @@ create_distinct_paths(PlannerInfo *root, -1.0); /* In case of grouping / distribution mismatch, inject remote scan. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, path, parse->distinctClause, parse->targetList)) path = create_remotesubplan_path(root, path, NULL); add_path(distinct_rel, (Path *) @@ -5520,7 +5598,7 @@ create_distinct_paths(PlannerInfo *root, Path *input_path = cheapest_input_path; /* If needed, inject RemoteSubplan redistributing the data. */ - if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause, parse->targetList)) input_path = create_remotesubplan_path(root, input_path, NULL); /* XXX Maybe we can make this a 2-phase aggregate too? 
*/ @@ -6784,7 +6862,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) */ static bool grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, - List *clauses) + List *clauses, List *targetList) { int i; bool matches_key = false; @@ -6792,7 +6870,7 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, int numGroupCols = list_length(clauses); AttrNumber *groupColIdx = extract_grouping_cols(clauses, - parse->targetList); + targetList); #ifdef __COLD_HOT__ if (has_cold_hot_table) @@ -6826,7 +6904,7 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, */ for (i = 0; i < numGroupCols; i++) { - TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, + TargetEntry *te = (TargetEntry *)list_nth(targetList, groupColIdx[i]-1); if (equal(te->expr, distribution->distributionExpr)) @@ -8248,6 +8326,61 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) return path; } +#ifdef __TBASE__ +/* + * can_distinct_agg_optimize + * Check if distinct app is workable. + */ +static bool +can_distinct_agg_optimize(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *pathtarget, + const AggClauseCosts *agg_costs) +{ + ListCell *lc = NULL; + Query *parse = NULL; + bool meet_distint_agg_clause = false; + + parse = root->parse; + + /* It's no use for 2phase agg on datanode */ + if (!grouped_rel->consider_parallel || input_rel->partial_pathlist == NIL || + !agg_costs->hasOnlyDistinct || agg_costs->hasNonSerial || agg_costs->hasOrder || + parse->groupClause || parse->groupingSets || parse->havingQual || + parse->distinctClause || has_cold_hot_table || !olap_optimizer || !enable_distinct_optimizer || + IS_PGXC_DATANODE) + { + return false; + } + + foreach (lc, pathtarget->exprs) + { + Aggref *aggref = (Aggref *)lfirst(lc); + + if (IsA(aggref, Aggref) && aggref->aggdistinct != NIL) + { + /* only one distinct agg is allowed */ + if(meet_distint_agg_clause) + return false; + + if (list_length(aggref->aggdistinct) != 1 || + list_length(aggref->args) != 1) + { + return false; + } + + /* currently we only support hash agg */ + if (!grouping_is_hashable(aggref->aggdistinct)) + { + return false; + } + meet_distint_agg_clause = true; + } + } + + return meet_distint_agg_clause; +} +#endif + static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path) { @@ -8269,7 +8402,7 @@ can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path) if (parse->groupingSets) return groupingsets_distribution_match(root, parse, path); - return grouping_distribution_match(root, parse, path, parse->groupClause); + return grouping_distribution_match(root, parse, path, parse->groupClause, parse->targetList); } static bool diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 805585b7..d1016beb 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1877,7 +1877,6 @@ convert_combining_aggrefs(Node *node, void *context) /* Assert we've not chosen to partial-ize any unsupported cases */ Assert(orig_agg->aggorder == NIL); - Assert(orig_agg->aggdistinct == NIL); /* * Since aggregate calls can't be nested, we needn't recurse into the diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4d2a1f32..35bf8b8a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3058,6 +3058,92 @@ get_num_connections(int numnodes, int nRemotePlans) return num_connections; 
} + +/* + * redistribute local grouping results among datanodes for + * distinct aggs like count(distinct a) or avg(distinct a)... + * + * Tips: we do not check the agg column's type, directly use that + * as hash column, but some data types are not supported as hash column now, + * maybe some errors. + */ +Path * +create_redistribute_distinct_agg_path(PlannerInfo *root, Query *parse, Path *path, Aggref *agg) +{ + PathTarget *pathtarget = path->pathtarget; + TargetEntry *te = NULL; + Bitmapset *nodes = NULL; + Oid group; + int i; + + te = get_sortgroupclause_tle((SortGroupClause *)linitial(agg->aggdistinct), + agg->args); + + if(te == NULL) + { + elog(ERROR, "Distinct aggref not found in pathtarget."); + } + + if (list_length(groupOids) > 1) + { + groupOids = NULL; + elog(ERROR, "Tables from different groups should not be invloved in one Query."); + } + + if (groupOids) + { + group = linitial_oid(groupOids); + } + else + { + group = InvalidOid; + } + + if (group == InvalidOid) + { + for (i = 0; i < NumDataNodes; i++) + nodes = bms_add_member(nodes, i); + + /* + * FIXING ME! check hash column's data type to satisfity hash locator func + */ + path = redistribute_path(root, + path, + NULL, + LOCATOR_TYPE_HASH, + (Node *)te->expr, + nodes, + NULL); + } + else + { + ListCell *cell; + List *nodelist = GetGroupNodeList(group); + + foreach (cell, nodelist) + { + int nodeid = lfirst_int(cell); + + nodes = bms_add_member(nodes, nodeid); + } + /* + * FIXING ME! check hash column's data type to satisfity hash locator func + */ + path = redistribute_path(root, + path, + NULL, + LOCATOR_TYPE_SHARD, + (Node *)te->expr, + nodes, + NULL); + } + + path->pathkeys = NULL; + path->pathtarget = pathtarget; + + return path; +} + /* * redistribute local grouping results among datanodes, then * get the final grouping results. seems more efficient... diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index 6fc75bd5..496fd970 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -468,6 +468,31 @@ get_sortgrouplist_exprs(List *sgClauses, List *targetList) return result; } +/* + * get_distinct_agg_sortgroupclause + * Given a pathtarget , acquire distinct clause + * for aggref with distinct. + * Notice: only one distinct agg clause with one col + * is allowed. 
+ */ +Aggref * +get_optimize_distinct_agg(PathTarget *pathtarget) +{ + ListCell *lc = NULL; + + foreach (lc, pathtarget->exprs) + { + Aggref *aggref = (Aggref *)lfirst(lc); + + if (IsA(aggref, Aggref) && aggref->aggdistinct != NIL) + { + Assert(list_length(aggref->aggdistinct) == 1); + return aggref; + } + } + + return NULL; +} /***************************************************************************** * Functions to extract data from a list of SortGroupClauses diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e7ba54b0..488fd88f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -201,6 +201,7 @@ extern bool PlpgsqlDebugPrint; /* used for get total size of session */ static int32 g_TotalMemorySize = 0; extern bool enable_parallel_ddl; +extern bool enable_distinct_optimizer; #endif static int GUC_check_errcode_value; @@ -2787,6 +2788,20 @@ static struct config_bool ConfigureNamesBool[] = }, #endif + { + {"enable_distinct_optimizer", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("push down distinct to datanodes."), + NULL + }, + &enable_distinct_optimizer + , +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4b3c49d2..c20a6741 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -855,6 +855,7 @@ typedef struct Agg #ifdef __TBASE__ uint32 entrySize; bool hybrid; + bool noDistinct; /* no need of distinct related initialization */ #endif } Agg; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index e49bc1a0..4b752d16 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1710,6 +1710,7 @@ typedef struct AggPath #ifdef __TBASE__ uint32 entrySize; bool hybrid; + bool noDistinct; /* no need of distinct related initialization */ #endif } AggPath; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 4097e568..a3afb1a4 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -365,6 +365,9 @@ extern RelOptInfo *build_child_join_rel(PlannerInfo *root, #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, Query *parse, Path *path); +extern Path *create_redistribute_distinct_agg_path(PlannerInfo *root, + Query *parse, Path *path, + Aggref *agg); extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); extern int replication_level; diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h index 5b9d94b0..076bfbab 100644 --- a/src/include/optimizer/tlist.h +++ b/src/include/optimizer/tlist.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tlist.h - * prototypes for tlist.c. + * prototypes for tlist.c. 
* * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -24,7 +24,7 @@ extern List *add_to_flat_tlist(List *tlist, List *exprs); extern List *get_tlist_exprs(List *tlist, bool includeJunk); -extern int count_nonjunk_tlist_entries(List *tlist); +extern int count_nonjunk_tlist_entries(List *tlist); extern bool tlist_same_exprs(List *tlist1, List *tlist2); @@ -34,18 +34,22 @@ extern bool tlist_same_collations(List *tlist, List *colCollations, bool junkOK) extern void apply_tlist_labeling(List *dest_tlist, List *src_tlist); extern TargetEntry *get_sortgroupref_tle(Index sortref, - List *targetList); + List *targetList); extern TargetEntry *get_sortgroupclause_tle(SortGroupClause *sgClause, - List *targetList); + List *targetList); extern Node *get_sortgroupclause_expr(SortGroupClause *sgClause, - List *targetList); + List *targetList); extern List *get_sortgrouplist_exprs(List *sgClauses, - List *targetList); + List *targetList); + +#ifdef __TBASE__ +extern Aggref *get_optimize_distinct_agg(PathTarget *pathtarget); +#endif extern SortGroupClause *get_sortgroupref_clause(Index sortref, - List *clauses); + List *clauses); extern SortGroupClause *get_sortgroupref_clause_noerr(Index sortref, - List *clauses); + List *clauses); extern Oid *extract_grouping_ops(List *groupClause); extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist); @@ -57,16 +61,16 @@ extern List *make_tlist_from_pathtarget(PathTarget *target); extern PathTarget *copy_pathtarget(PathTarget *src); extern PathTarget *create_empty_pathtarget(void); extern void add_column_to_pathtarget(PathTarget *target, - Expr *expr, Index sortgroupref); + Expr *expr, Index sortgroupref); extern void add_new_column_to_pathtarget(PathTarget *target, Expr *expr); extern void add_new_columns_to_pathtarget(PathTarget *target, List *exprs); extern void apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target); extern void split_pathtarget_at_srfs(PlannerInfo *root, - PathTarget *target, PathTarget *input_target, - List **targets, List **targets_contain_srfs); + PathTarget *target, PathTarget *input_target, + List **targets, List **targets_contain_srfs); /* Convenience macro to get a PathTarget with valid cost/width fields */ #define create_pathtarget(root, tlist) \ - set_pathtarget_cost_width(root, make_pathtarget_from_tlist(tlist)) + set_pathtarget_cost_width(root, make_pathtarget_from_tlist(tlist)) -#endif /* TLIST_H */ +#endif /* TLIST_H */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 65fd3d80..ca0d242d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -91,6 +91,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_crypt_parellel_debug | off enable_data_mask | on enable_datanode_row_triggers | off + enable_distinct_optimizer | on enable_distri_debug | off enable_distri_debug_print | off enable_distri_visibility_print | off @@ -135,7 +136,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(62 rows) +(63 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From b2b74898e764807bba6cea78a611dace2d4de3ed Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 29 Jun 2021 15:28:15 +0800 Subject: [PATCH 313/578] 1. 
Look deep into subxid array, set max as local_xid on secondary DN 2. Copy local_subxids to parallel workers 3. Force parallel workers to send cid 4. regress tpad: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696089290349 --- src/backend/access/transam/varsup.c | 47 +++++++++- src/backend/access/transam/xact.c | 30 +++++++ src/backend/pgxc/pool/pgxcnode.c | 3 +- src/backend/storage/ipc/procarray.c | 19 +++- src/include/access/transam.h | 2 + src/include/storage/procarray.h | 92 ++++++++++---------- src/test/regress/expected/transactions_2.out | 66 ++++++++++++++ src/test/regress/sql/transactions.sql | 51 +++++++++++ 8 files changed, 256 insertions(+), 54 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 51749e7c..6fdadcc2 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -91,6 +91,11 @@ GetForceXidFromGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static TransactionId local_xid = InvalidTransactionId; +static TransactionId local_subxids[PGPROC_MAX_CACHED_SUBXIDS] = {}; +static int local_nsub; +/* exported information about parallel workers, see xact.c */ +extern int nParallelCurrentXids; +extern TransactionId *ParallelCurrentXids; /* * Set next transaction id to use */ @@ -123,10 +128,10 @@ StoreGlobalXid(const char *globalXid) else if(IsConnFromDatanode()) { - local_xid = GetLocalTransactionId(globalXid); + local_xid = GetLocalTransactionId(globalXid, local_subxids, &local_nsub); if(enable_distri_print) { - elog (LOG, " global xid %s to local xid %d", globalXid, local_xid); + elog (LOG, " global xid %s to local xid %d, %d subxids", globalXid, local_xid, local_nsub); } } @@ -158,21 +163,55 @@ void SetLocalTransactionId(TransactionId xid) } local_xid = xid; + /* if xid is invalid, also need to reset subxid array */ + if (!TransactionIdIsValid(xid)) + { + local_nsub = 0; + } } -TransactionId GetNextTransactionId(void) +TransactionId +GetNextTransactionId(void) { return local_xid; } +int +GetNumSubTransactions(void) +{ + return local_nsub; +} + +TransactionId * +GetSubTransactions(void) +{ + return local_subxids; +} + bool TransactIdIsCurentGlobalTransacId(TransactionId xid) { + int i; + if(enable_distri_print) { elog(LOG, "is current transaction xid %u local xid %d", xid, local_xid); } - return TransactionIdIsValid(local_xid) && TransactionIdEquals(xid, local_xid); + + if (!TransactionIdIsValid(local_xid)) + return false; + + if (TransactionIdEquals(xid, local_xid)) + return true; + + /* check subxids */ + for (i = 0; i < local_nsub; i++) + { + if (TransactionIdEquals(local_subxids[i], xid)) + return true; + } + + return false; } #ifdef __TWO_PHASE_TRANS__ diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 9632a415..91cd002a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6512,7 +6512,14 @@ EstimateTransactionStateSpace(void) * command counter, XID count */ #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + int nsub = GetNumSubTransactions(); nxids++; /* local xid */ + if (nsub > 0) + { + nxids = add_size(nxids, nsub); /* local subxids */ + } + else /* only do for loop below */ + { #endif for (s = CurrentTransactionState; s != NULL; s = s->parent) @@ -6521,6 +6528,9 @@ EstimateTransactionStateSpace(void) nxids = add_size(nxids, 1); nxids = add_size(nxids, s->nChildXids); } +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + } +#endif nxids = add_size(nxids, nParallelCurrentXids); @@ -6562,6 +6572,7 
@@ SerializeTransactionState(Size maxsize, char *start_address) Size c = 0; TransactionId *workspace; TransactionId *result = (TransactionId *) start_address; + int nsub = 0; result[c++] = (TransactionId) XactIsoLevel; result[c++] = (TransactionId) XactDeferrable; @@ -6588,10 +6599,17 @@ SerializeTransactionState(Size maxsize, char *start_address) return; } + nsub = GetNumSubTransactions(); /* * OK, we need to generate a sorted list of XIDs that our workers should * view as current. First, figure out how many there are. */ + if (nsub > 0) + { + nxids = add_size(nxids, nsub); + } + else + { for (s = CurrentTransactionState; s != NULL; s = s->parent) { if (TransactionIdIsValid(s->transactionId)) @@ -6599,9 +6617,20 @@ SerializeTransactionState(Size maxsize, char *start_address) nxids = add_size(nxids, s->nChildXids); } Assert((c + 1 + nxids) * sizeof(TransactionId) <= maxsize); + } /* Copy them to our scratch space. */ workspace = palloc(nxids * sizeof(TransactionId)); + + if (nsub > 0) + { + TransactionId *subxids = GetSubTransactions(); + memcpy(&workspace[i], subxids, + nsub * sizeof(TransactionId)); + i += nsub; + } + else + { for (s = CurrentTransactionState; s != NULL; s = s->parent) { if (TransactionIdIsValid(s->transactionId)) @@ -6611,6 +6640,7 @@ SerializeTransactionState(Size maxsize, char *start_address) i += s->nChildXids; } Assert(i == nxids); + } /* Sort them. */ qsort(workspace, nxids, sizeof(TransactionId), xidComparator); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5811f647..b69e928d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3088,7 +3088,8 @@ pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) int i32; /* No need to send command ID if its sending flag is not enabled */ - if (!IsSendCommandId()) + /* XXX: parallel worker always send cid */ + if (!IsSendCommandId() && !IsParallelWorker()) { return 0; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 64481951..35636976 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1963,7 +1963,7 @@ GetMaxSnapshotSubxidCount(void) } #ifdef __TBASE__ -TransactionId GetLocalTransactionId(const char *globalXid) +TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub) { ProcArrayStruct *arrayP = procArray; @@ -1979,6 +1979,8 @@ TransactionId GetLocalTransactionId(const char *globalXid) int pgprocno = pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId result = InvalidTransactionId; + int nxid; LWLockAcquire(&proc->globalxidLock, LW_SHARED); if (!proc->hasGlobalXid || strcmp(globalXid, proc->globalXid) != 0) @@ -1992,10 +1994,21 @@ TransactionId GetLocalTransactionId(const char *globalXid) continue; } + result = pgxact->xid; + + /* look for max xid in subtrans */ + *nsub = pgxact->nxids; + for (nxid = 0; nxid < pgxact->nxids; nxid++) + { + TransactionId subxid = proc->subxids.xids[nxid]; + subxids[nxid] = subxid; + } + LWLockRelease(&proc->globalxidLock); LWLockRelease(ProcArrayLock); - elog(DEBUG8, "found xid %d for global xid %s", pgxact->xid, globalXid); - return pgxact->xid; + elog(DEBUG8, "found xid %d for global xid %s", result, globalXid); + + return result; } LWLockRelease(ProcArrayLock); diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 0a56dda3..d94c4d26 100644 --- a/src/include/access/transam.h +++ 
b/src/include/access/transam.h @@ -270,6 +270,8 @@ extern TransactionId GetNewTransactionId(bool isSubXact); extern bool TransactIdIsCurentGlobalTransacId(TransactionId xid); extern TransactionId GetNextTransactionId(void); extern void ExtendLogs(TransactionId xid); +extern int GetNumSubTransactions(void); +extern TransactionId *GetSubTransactions(void); #endif extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 24f45705..d6607bcf 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * procarray.h - * POSTGRES process array definitions. + * POSTGRES process array definitions. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -26,8 +26,8 @@ extern int GlobalSnapshotSource; typedef enum GlobalSnapshotSourceType { - GLOBAL_SNAPSHOT_SOURCE_GTM, - GLOBAL_SNAPSHOT_SOURCE_COORDINATOR + GLOBAL_SNAPSHOT_SOURCE_GTM, + GLOBAL_SNAPSHOT_SOURCE_COORDINATOR } GlobalSnapshotSourceType; #endif @@ -38,33 +38,33 @@ typedef enum GlobalSnapshotSourceType * to avoid forcing to include proc.h when including procarray.h. So if you modify * PROC_XXX flags, you need to modify these flags. */ -#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy - * vacuum */ -#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running - * analyze */ -#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical - * decoding outside xact */ - -#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, - * catalog_xmin */ +#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy + * vacuum */ +#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running + * analyze */ +#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical + * decoding outside xact */ + +#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, + * catalog_xmin */ /* * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching * PGXACT->vacuumFlags. Other flags are used for different purposes and * have no corresponding PROC flag equivalent. 
*/ -#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_ANALYZE_FLAG | \ - PROCARRAY_LOGICAL_DECODING_FLAG) +#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ + PROCARRAY_ANALYZE_FLAG | \ + PROCARRAY_LOGICAL_DECODING_FLAG) /* Use the following flags as an input "flags" to GetOldestXmin function */ /* Consider all backends except for logical decoding ones which manage xmin separately */ -#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG +#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG /* Ignore vacuum backends */ -#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG +#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG /* Ignore analyze backends */ -#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG +#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG /* Ignore both vacuum and analyze backends */ -#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG +#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); @@ -77,17 +77,17 @@ extern void ProcArrayClearTransaction(PGPROC *proc); #ifdef PGXC /* PGXC_DATANODE */ typedef enum { - SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */ - SNAPSHOT_LOCAL, /* Coordinator has instructed Datanode to build up snapshot from the local procarray */ - SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */ - SNAPSHOT_DIRECT /* Datanode obtained directly from GTM */ + SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */ + SNAPSHOT_LOCAL, /* Coordinator has instructed Datanode to build up snapshot from the local procarray */ + SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */ + SNAPSHOT_DIRECT /* Datanode obtained directly from GTM */ } SnapshotSource; extern void SetGlobalTimestamp(GlobalTimestamp gts, SnapshotSource source); #if 0 extern void SetGlobalSnapshotData(TransactionId xmin, TransactionId xmax, int xcnt, - TransactionId *xip, - SnapshotSource source); + TransactionId *xip, + SnapshotSource source); #endif extern void UnsetGlobalSnapshotData(void); extern void ReloadConnInfoOnBackends(bool refresh_only); @@ -95,23 +95,23 @@ extern void ReloadConnInfoOnBackends(bool refresh_only); extern void ProcArrayInitRecovery(TransactionId initializedUptoXID); extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); extern void ProcArrayApplyXidAssignment(TransactionId topxid, - int nsubxids, TransactionId *subxids); + int nsubxids, TransactionId *subxids); extern void RecordKnownAssignedTransactionIds(TransactionId xid); extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid, - int nsubxids, TransactionId *subxids, - TransactionId max_xid); + int nsubxids, TransactionId *subxids, + TransactionId max_xid); extern void ExpireAllKnownAssignedTransactionIds(void); extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid); -extern int GetMaxSnapshotXidCount(void); -extern int GetMaxSnapshotSubxidCount(void); +extern int GetMaxSnapshotXidCount(void); +extern int GetMaxSnapshotSubxidCount(void); #define GetSnapshotData(snapshot, latest) GetSnapshotData_shard(snapshot, latest, true) extern Snapshot GetSnapshotData_shard(Snapshot snapshot, bool latest, bool need_shardmap); extern bool 
ProcArrayInstallImportedXmin(TransactionId xmin, - VirtualTransactionId *sourcevxid); + VirtualTransactionId *sourcevxid); extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); extern void ProcArrayCheckXminConsistency(TransactionId global_xmin); extern void SetLatestCompletedXid(TransactionId latestCompletedXid); @@ -123,13 +123,13 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsPrepared(TransactionId xid, Snapshot snapshot, GlobalTimestamp *prepare_ts); #endif #ifdef __TBASE__ -extern TransactionId GetLocalTransactionId(const char *globalXid); +extern TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub); #endif extern char *GetGlobalTransactionId(const TransactionId pid); extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestXmin(Relation rel, int flags); extern TransactionId GetOldestXminInternal(Relation rel, int flags, - bool computeLocal, TransactionId lastGlobalXmin); + bool computeLocal, TransactionId lastGlobalXmin); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); @@ -138,38 +138,38 @@ extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids extern PGPROC *BackendPidGetProc(int pid); extern PGPROC *BackendPidGetProcWithLock(int pid); -extern int BackendXidGetPid(TransactionId xid); +extern int BackendXidGetPid(TransactionId xid); extern bool IsBackendPid(int pid); extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, - bool excludeXmin0, bool allDbs, int excludeVacuum, - int *nvxids); + bool excludeXmin0, bool allDbs, int excludeVacuum, + int *nvxids); extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); extern bool MinimumActiveBackends(int min); -extern int CountDBBackends(Oid databaseid); -extern int CountDBConnections(Oid databaseid); +extern int CountDBBackends(Oid databaseid); +extern int CountDBConnections(Oid databaseid); extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending); -extern int CountUserBackends(Oid roleid); +extern int CountUserBackends(Oid roleid); extern bool CountOtherDBBackends(Oid databaseId, - int *nbackends, int *nprepared); + int *nbackends, int *nprepared); extern void XidCacheRemoveRunningXids(TransactionId xid, - int nxids, const TransactionId *xids, - TransactionId latestXid); + int nxids, const TransactionId *xids, + TransactionId latestXid); #ifdef XCP extern void GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid); -extern int GetFirstBackendId(int *numBackends, int *backends); +extern int GetFirstBackendId(int *numBackends, int *backends); #endif /* XCP */ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, - TransactionId catalog_xmin, bool already_locked); + TransactionId catalog_xmin, bool already_locked); extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, - TransactionId *catalog_xmin); + TransactionId *catalog_xmin); #ifdef __TBASE__ extern RunningTransactions GetCurrentRunningTransaction(void); extern GlobalTimestamp GetLatestCommitTS(void); #endif -#endif /* PROCARRAY_H */ +#endif /* PROCARRAY_H */ diff --git a/src/test/regress/expected/transactions_2.out b/src/test/regress/expected/transactions_2.out index e121bf87..30a34e63 100644 --- a/src/test/regress/expected/transactions_2.out 
+++ b/src/test/regress/expected/transactions_2.out @@ -676,6 +676,72 @@ ERROR: portal "ctt" cannot be run COMMIT; DROP FUNCTION create_temp_tab(); DROP FUNCTION invert(x float8); +-- Test for distributed subtrans in secondary DNs +begin; +savepoint s; +-- create tables in subtransaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +begin; +-- create tables in parent transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +savepoint s; +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +-- create tables out of transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +begin; +savepoint s; +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +-- test for subtrans in parallel worker +begin; +savepoint s; +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set min_parallel_rows_size=0; +set max_parallel_workers_per_gather=2; +insert into t3_trans select * from t1_trans; +select count(*) from t3_trans join t1_trans using (f2); + count +------- + 2 +(1 row) + +abort; +drop table t1_trans, t2_trans, t3_trans; -- Test for successful cleanup of an aborted transaction at session exit. -- THIS MUST BE THE LAST TEST IN THIS FILE. 
begin; diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql index e8c1b3c9..80f235e0 100644 --- a/src/test/regress/sql/transactions.sql +++ b/src/test/regress/sql/transactions.sql @@ -459,6 +459,57 @@ COMMIT; DROP FUNCTION create_temp_tab(); DROP FUNCTION invert(x float8); +-- Test for distributed subtrans in secondary DNs +begin; +savepoint s; +-- create tables in subtransaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +begin; +-- create tables in parent transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +savepoint s; +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +-- create tables out of transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +begin; +savepoint s; +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +-- test for subtrans in parallel worker +begin; +savepoint s; +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set min_parallel_rows_size=0; +set max_parallel_workers_per_gather=2; +insert into t3_trans select * from t1_trans; +select count(*) from t3_trans join t1_trans using (f2); +abort; + +drop table t1_trans, t2_trans, t3_trans; + -- Test for successful cleanup of an aborted transaction at session exit. -- THIS MUST BE THE LAST TEST IN THIS FILE. 
From a85cb1e840a9135ec20f8dfd7ea4513b680e83ec Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 27 Jul 2021 10:21:53 +0800 Subject: [PATCH 314/578] fix launched parallel workers > expected http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086983875 (merge request !534) --- src/backend/optimizer/plan/createplan.c | 199 +++++++++++++++++++++++- src/backend/optimizer/plan/planner.c | 1 + src/include/optimizer/planmain.h | 1 + 3 files changed, 200 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 13c8ec79..7a00bef7 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -107,6 +107,7 @@ bool mergejoin = false; bool child_of_gather = false; bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; +int min_workers_of_hashjon_gather = PG_INT32_MAX; #endif #ifdef __COLD_HOT__ bool has_cold_hot_table = false; @@ -345,6 +346,7 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); +static bool contain_hashjon_walker(Plan *node); #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -6472,7 +6474,7 @@ make_remotesubplan(PlannerInfo *root, heap_parallel_workers = Min(heap_parallel_workers, max_parallel_workers_per_gather); - gather->num_workers = Max(heap_parallel_workers, nWorkers); + gather->num_workers = Min(Max(heap_parallel_workers, nWorkers), min_workers_of_hashjon_gather); } else { @@ -6711,6 +6713,8 @@ make_remotesubplan(PlannerInfo *root, parallel_workers = heap_parallel_workers; parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); + /* launched parallel workers must less than hashjoin's parallel workers under it */ + parallel_workers = Min(parallel_workers, min_workers_of_hashjon_gather); gather_plan = make_gather(copyObject(gather_left->targetlist), NIL, @@ -6747,6 +6751,7 @@ make_remotesubplan(PlannerInfo *root, } } } + min_workers_of_hashjon_gather = PG_INT32_MAX; #endif if (resultDistribution) @@ -8263,6 +8268,15 @@ make_gather(List *qptlist, node->single_copy = single_copy; node->invisible = false; +#ifdef __TBASE__ + /* + * if there has hashjoin in the lower layer, write down the smallest workers + */ + if (min_workers_of_hashjon_gather > nworkers && contain_hashjon_walker(subplan)) + { + min_workers_of_hashjon_gather = nworkers; + } +#endif return node; } @@ -8928,6 +8942,189 @@ contain_remote_subplan_walker(Node *node, void *context, bool include_cte) return false; } +/* + * check if contain hashjon in the plan + */ +static bool +contain_hashjon_walker(Plan *node) +{ + Plan *plan = node; + + if (!plan) + { + return false; + } + + if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery) || IsA(plan, Gather)) + { + return false; + } + + if (IsA(node, HashJoin)) + { + return true; + } + + if (IsA(node, SubqueryScan)) + { + SubqueryScan *subquery = (SubqueryScan *)node; + plan = subquery->subplan; + } + + if (IsA(plan, Append)) + { + ListCell *lc; + Append *append = (Append *)plan; + + foreach(lc, append->appendplans) + { + Plan *appendplan = (Plan *)lfirst(lc); + + if (appendplan && contain_hashjon_walker(appendplan)) + { + return true; + } + } + + return false; + } + else if (IsA(plan, MergeAppend)) + { + ListCell *lc; + MergeAppend *mergeappend = (MergeAppend *)plan; + + foreach(lc, mergeappend->mergeplans) + { + Plan 
*mergeappendplan = (Plan *)lfirst(lc); + + if (mergeappendplan && contain_hashjon_walker(mergeappendplan)) + { + return true; + } + } + + return false; + } + + if (outerPlan(plan)) + { + if (contain_hashjon_walker(outerPlan(plan))) + { + return true; + } + } + + if (innerPlan(plan)) + { + if (contain_hashjon_walker(innerPlan(plan))) + { + return true; + } + } + + return false; +} + + +static Plan* +materialize_top_remote_subplan(Plan *node) +{ + Node *plan = (Node *)node; + + if (!plan) + { + return NULL; + } + + if (IsA(node, Material)) + { + return node; + } + + if (IsA(node, RemoteSubplan)) + { + Plan *matplan = (Plan *) make_material(node); + + /* + * We assume the materialize will not spill to disk, and therefore + * charge just cpu_operator_cost per tuple. (Keep this estimate in + * sync with cost_mergejoin.) + */ + copy_plan_costsize(matplan, node); + matplan->total_cost += cpu_operator_cost * matplan->plan_rows; + + return matplan; + } + + if (IsA(node, SubqueryScan)) + { + SubqueryScan *subquery = (SubqueryScan *)node; + plan = (Node *)subquery->subplan; + } + + if (IsA(plan, Append)) + { + ListCell *lc; + Append *append = (Append *)plan; + + foreach(lc, append->appendplans) + { + Plan *appendplan = (Plan *)lfirst(lc); + + if (appendplan) + { + Plan *tmpplan = materialize_top_remote_subplan(appendplan); + if (tmpplan && tmpplan != lfirst(lc)) + { + lfirst(lc) = tmpplan; + } + } + } + + return node; + } + else if (IsA(plan, MergeAppend)) + { + ListCell *lc; + MergeAppend *mergeappend = (MergeAppend *)plan; + + foreach(lc, mergeappend->mergeplans) + { + Plan *mergeappendplan = (Plan *)lfirst(lc); + + if (mergeappendplan) + { + Plan *tmpplan = materialize_top_remote_subplan(mergeappendplan); + if (tmpplan && tmpplan != lfirst(lc)) + { + lfirst(lc) = tmpplan; + } + } + } + + return node; + } + + if (outerPlan(plan)) + { + Plan *tmpplan = materialize_top_remote_subplan(outerPlan(plan)); + if (tmpplan && tmpplan != outerPlan(plan)) + { + outerPlan(plan) = tmpplan; + } + } + + if (innerPlan(plan)) + { + Plan *tmpplan = materialize_top_remote_subplan(innerPlan(plan)); + if (tmpplan && tmpplan != innerPlan(plan)) + { + innerPlan(plan) = tmpplan; + } + } + return node; +} + static void create_remotequery_for_rel(PlannerInfo *root, ModifyTable *mt, RangeTblEntry *res_rel, Index resultRelationIndex, int relcount, CmdType cmdtyp, RelationAccessType accessType, int partindex, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index fc87fcbc..62a92b5f 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -324,6 +324,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) glob->dependsOnRole = false; #ifdef __TBASE__ groupOids = NULL; + min_workers_of_hashjon_gather = PG_INT32_MAX; #endif #ifdef __COLD_HOT__ has_cold_hot_table = false; diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 8139e134..4e32be80 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -99,6 +99,7 @@ extern int remote_subplan_depth; extern List *groupOids; extern bool enable_distributed_unique_plan; extern bool has_cold_hot_table; +extern int min_workers_of_hashjon_gather; #define INSERT_TRIGGER "tt_dn_in_" #define UPDATE_TRIGGER "tt_dn_up_" From fee71a4c5a65220e0b19d1aff2372740942d87cd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 18 Aug 2021 16:19:57 +0800 Subject: [PATCH 315/578] fix latch already owned in parallel mode 
http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696090962107# http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131086984137 (merge request !604) --- src/backend/optimizer/plan/createplan.c | 67 ++++++++++++++++++++----- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 7a00bef7..9444951b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -346,7 +346,7 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); -static bool contain_hashjon_walker(Plan *node); +static bool contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel); #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -1961,6 +1961,7 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) Plan *subplan; List *tlist; bool reset = false; + bool contain_nonparallel_hashjoin = false; /* if child_of_gather is false, set child_of_gather true, and reset the value before return */ if (!child_of_gather) @@ -1977,9 +1978,12 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) tlist = build_path_tlist(root, &best_path->path); + /* if contain nonparallel hashjoin, set num_workers to 1 */ + contain_nonparallel_hashjoin = contain_node_walker(subplan, T_HashJoin, true); + gather_plan = make_gather(tlist, NIL, - best_path->num_workers, + (contain_nonparallel_hashjoin) ? 1 : best_path->num_workers, best_path->single_copy, subplan); @@ -6458,10 +6462,19 @@ make_remotesubplan(PlannerInfo *root, Gather *gather = (Gather *)lefttree; int nWorkers = gather->num_workers; Plan *leftplan = lefttree->lefttree; + /* if contain nonparallel hashjoin, set num_workers to 1 */ + bool contain_nonparallel_hashjoin = contain_node_walker(leftplan, T_HashJoin, true); + if (contain_nonparallel_hashjoin) + { + gather->num_workers = 1; + } + else + { /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. */ double rows = GetPlanRows(leftplan) * nodes; int heap_parallel_threshold = 0; int heap_parallel_workers = 1; + bool contain_gather = contain_node_walker(leftplan, T_Gather, false); heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) @@ -6473,8 +6486,10 @@ make_remotesubplan(PlannerInfo *root, } heap_parallel_workers = Min(heap_parallel_workers, max_parallel_workers_per_gather); - - gather->num_workers = Min(Max(heap_parallel_workers, nWorkers), min_workers_of_hashjon_gather); + heap_parallel_workers = Max(heap_parallel_workers, nWorkers); + /* if contain gather, need compare the workers with min_workers_of_hashjon_gather */ + gather->num_workers = (contain_gather) ? Min(heap_parallel_workers, min_workers_of_hashjon_gather) : heap_parallel_workers; + } } else { @@ -6485,9 +6500,16 @@ make_remotesubplan(PlannerInfo *root, double inner_rows = lefttree->righttree ? lefttree->righttree->plan_rows : 0; double rows = outer_rows > inner_rows ? 
outer_rows : inner_rows; + bool contain_nonparallel_hashjoin = contain_node_walker(lefttree, T_HashJoin, true); bool need_parallel = true; int parallel_workers = 0; + /* if contain nonparallel hashjoin, don't add gather plan */ + if (contain_nonparallel_hashjoin) + { + need_parallel = false; + } + /* only add gather to remote_subplan at top */ if (need_parallel && distributionType == LOCATOR_TYPE_NONE) { @@ -8272,7 +8294,7 @@ make_gather(List *qptlist, /* * if there has hashjoin in the lower layer, write down the smallest workers */ - if (min_workers_of_hashjon_gather > nworkers && contain_hashjon_walker(subplan)) + if (min_workers_of_hashjon_gather > nworkers && contain_node_walker(subplan, T_HashJoin, false)) { min_workers_of_hashjon_gather = nworkers; } @@ -8943,10 +8965,12 @@ contain_remote_subplan_walker(Node *node, void *context, bool include_cte) } /* - * check if contain hashjon in the plan + * check if contain the type node in the plan, only support + * T_HashJoin and T_Gather now + * search_nonparallel only work if type is T_HashJoin */ static bool -contain_hashjon_walker(Plan *node) +contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel) { Plan *plan = node; @@ -8955,15 +8979,34 @@ contain_hashjon_walker(Plan *node) return false; } - if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery) || IsA(plan, Gather)) + if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery)) { return false; } + if (type == T_HashJoin) + { if (IsA(node, HashJoin)) { + if (search_nonparallel) + { + /* return if contain non parallel hashjoin */ + HashJoin *join_plan = (HashJoin *) node; + return !join_plan->join.plan.parallel_aware; + } + else + { return true; } + } + } + else if (type == T_Gather) + { + if (IsA(node, Gather)) + { + return true; + } + } if (IsA(node, SubqueryScan)) { @@ -8980,7 +9023,7 @@ contain_hashjon_walker(Plan *node) { Plan *appendplan = (Plan *)lfirst(lc); - if (appendplan && contain_hashjon_walker(appendplan)) + if (appendplan && contain_node_walker(appendplan, type, search_nonparallel)) { return true; } @@ -8997,7 +9040,7 @@ contain_hashjon_walker(Plan *node) { Plan *mergeappendplan = (Plan *)lfirst(lc); - if (mergeappendplan && contain_hashjon_walker(mergeappendplan)) + if (mergeappendplan && contain_node_walker(mergeappendplan, type, search_nonparallel)) { return true; } @@ -9008,7 +9051,7 @@ contain_hashjon_walker(Plan *node) if (outerPlan(plan)) { - if (contain_hashjon_walker(outerPlan(plan))) + if (contain_node_walker(outerPlan(plan), type, search_nonparallel)) { return true; } @@ -9016,7 +9059,7 @@ contain_hashjon_walker(Plan *node) if (innerPlan(plan)) { - if (contain_hashjon_walker(innerPlan(plan))) + if (contain_node_walker(innerPlan(plan), type, search_nonparallel)) { return true; } From 1fe35058c9c28b94b142ecf4476de7e7c02d4a96 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 26 Jul 2021 15:48:54 +0800 Subject: [PATCH 316/578] fix nestloop error failed to found slot for consumer http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088591001 --- src/backend/optimizer/plan/createplan.c | 7 ++++-- src/test/regress/expected/gist_1.out | 4 ++-- src/test/regress/expected/join_3.out | 24 +++++++++---------- src/test/regress/expected/partition_prune.out | 8 +++---- src/test/regress/expected/rowsecurity_1.out | 12 ++++------ src/test/regress/expected/subselect.out | 4 +--- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 9444951b..42ffb26e 100644 --- 
a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -346,8 +346,8 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); +static Plan *materialize_top_remote_subplan(Plan *node); static bool contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel); - #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -4806,9 +4806,11 @@ create_nestloop_plan(PlannerInfo *root, */ #ifdef __TBASE__ if (!IsA(inner_plan, Material) && contain_remote_subplan_walker((Node*)inner_plan, NULL, true)) + { + inner_plan = materialize_top_remote_subplan(inner_plan); + } #else if (IsA(inner_plan, RemoteSubplan)) -#endif { Plan *matplan = (Plan *) make_material(inner_plan); @@ -4822,6 +4824,7 @@ create_nestloop_plan(PlannerInfo *root, inner_plan = matplan; } +#endif #endif join_plan = make_nestloop(tlist, diff --git a/src/test/regress/expected/gist_1.out b/src/test/regress/expected/gist_1.out index 0653fb98..9c135e9a 100644 --- a/src/test/regress/expected/gist_1.out +++ b/src/test/regress/expected/gist_1.out @@ -131,8 +131,8 @@ cross join lateral Nested Loop -> Remote Subquery Scan on all (datanode_1) -> Values Scan on "*VALUES*" - -> Materialize - -> Limit + -> Limit + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Limit -> Index Only Scan using gist_tbl_point_index on gist_tbl diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 4b1d3032..7a70d26a 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3844,10 +3844,10 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1 -> Limit Output: (i8.q1), t2.f1 + -> Materialize + Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i8.q1, t2.f1 -> Limit @@ -3876,7 +3876,7 @@ select * from lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; QUERY PLAN ------------------------------------------------------------------------------------ +----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) @@ -3889,22 +3889,22 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Nested Loop Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Limit Output: (i8.q1), t2.f1 + -> Materialize + Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i8.q1, t2.f1 -> Limit Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Materialize - Output: ((i8.q1)), (t2.f1) -> Limit Output: ((i8.q1)), (t2.f1) + -> Materialize + Output: ((i8.q1)), (t2.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: (i8.q1), t2.f1 -> Limit @@ -3962,13 +3962,13 @@ where tt1.f1 = ss1.c0; -> Seq Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Materialize - Output: ss1.c0 -> Subquery Scan on ss1 Output: ss1.c0 Filter: (ss1.c0 = 'foo'::text) -> Limit Output: (tt4.f1) + -> Materialize + Output: (tt4.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: tt4.f1 -> Limit @@ -4026,10 +4026,10 @@ where ss1.c2 = 0; Output: i42.f1 
-> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Materialize - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Limit Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Materialize + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) -> Limit @@ -4065,9 +4065,9 @@ select * from Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Materialize -> Hash Full Join Hash Cond: (a1.unique1 = (1)) + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 a1 -> Hash diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 61bbdf23..2b2b13b9 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1410,8 +1410,8 @@ explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 Filter: (a = 1) -> Seq Scan on mc2p_default t1_2 Filter: (a = 1) - -> Materialize - -> Finalize Aggregate + -> Finalize Aggregate + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate -> Append @@ -1449,8 +1449,8 @@ explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 Filter: (a = 1) -> Seq Scan on mc2p_default t1_2 Filter: (a = 1) - -> Materialize - -> Finalize Aggregate + -> Finalize Aggregate + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Partial Aggregate -> Append diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 7ea346ae..1e0441a4 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2074,9 +2074,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET ROLE regress_rls_group1; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2126,9 +2125,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET SESSION AUTHORIZATION regress_rls_carol; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2178,9 +2176,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET ROLE regress_rls_group2; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2230,9 +2227,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) -- -- Views should follow policy for view owner. 
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index f38e79c4..8f30b1c9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2043,11 +2043,9 @@ select * from x; Output: x_1.a -> CTE Scan on z Output: z.a - -> Materialize - Output: z1.a -> CTE Scan on z z1 Output: z1.a -(20 rows) +(18 rows) with recursive x(a) as ((values ('a'), ('b')) From b860205c6ed46e8e218c78687c5eb1e80f6e05fd Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 15 Dec 2021 16:41:16 +0800 Subject: [PATCH 317/578] fix the number of pg_proc.h columns does not match the error --- contrib/Makefile | 2 +- src/include/catalog/pg_proc.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/Makefile b/contrib/Makefile index 22110f25..43e984e3 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -47,7 +47,7 @@ SUBDIRS = \ spi \ tablefunc \ tbase_gts_tools \ - tbase_memory_tools \ + tbase_memory_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 76881d68..324dac39 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1303,7 +1303,7 @@ DATA(insert OID = 1080 ( hashbpchar PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 1081 ( format_type PGNSP PGUID 12 1 0 0 0 f f f f f f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ format_type _null_ _null_ _null_ )); DESCR("format a type oid and atttypmod to canonical SQL"); -DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); +DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 1084 ( date_in PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 1082 "2275" _null_ _null_ _null_ _null_ _null_ date_in _null_ _null_ _null_ )); DESCR("I/O"); @@ -3261,7 +3261,7 @@ DATA(insert OID = 2039 ( timestamp_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 2041 ( overlaps PGNSP PGUID 12 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1114" _null_ _null_ _null_ _null_ _null_ overlaps_timestamp _null_ _null_ _null_ )); DESCR("intervals overlap?"); -DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); +DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 2042 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1186 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, ($1 + $2)) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); DESCR("intervals overlap?"); @@ -4782,7 +4782,7 @@ DATA(insert OID = 3515 ( hashenum PGNSP PGUID 12 1 0 0 0 f f f f t f i s DESCR("hash"); DATA(insert OID = 3524 ( enum_smaller PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_smaller _null_ _null_ _null_ )); DESCR("smaller of two"); -DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); 
+DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 3525 ( enum_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_larger _null_ _null_ _null_ )); DESCR("larger of two"); From a170d578c402280d19c276dfa01f3c251655b994 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 22 Oct 2021 15:55:47 +0800 Subject: [PATCH 318/578] fix self-development partition number : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093310765&url_cache_key=868a8470a54906f0346329b940daf1c8 (merge request !837) Squash merge branch '5.06_jenny_partitions_number' into 'Tbase_v5.06' fix self-development partition number : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093310765&url_cache_key=868a8470a54906f0346329b940daf1c8 --- src/backend/utils/adt/ruleutils.c | 28 ++++++++++++++++++++++++++++ src/bin/psql/describe.c | 4 ++-- src/include/catalog/pg_proc.h | 3 +++ src/include/utils/ruleutils.h | 2 ++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 8ce8cefe..8fa13f29 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -12079,6 +12079,34 @@ RelationGetAllPartitions(Relation rel) } int +GetAllPartitionIntervalCount(Oid parent_oid) +{ + int count = 0; + List *children = NULL; + Relation rel = heap_open(parent_oid, NoLock); + + children = RelationGetAllPartitions(rel); + + if(children) + { + count = children->length; + list_free(children); + } + + heap_close(rel, NoLock); + + return count; +} + +Datum +partitions_number(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + int ret = GetAllPartitionIntervalCount(parent_oid); + PG_RETURN_INT32(ret); +} + +int RelationGetChildIndex(Relation rel, Oid childoid) { int nparts = 0; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index ff427084..85318723 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3219,7 +3219,7 @@ describeOneTableDetails(const char *schemaname, if (verbose && pset.sversion >= 90500 && tableinfo.relkind == 'r' && tableinfo.relpartkind == 'p') { printfPQExpBuffer(&buf, - "SELECT 'RANGE(' || a.attname || ')', p.partnparts," + "SELECT 'RANGE(' || a.attname || ')', partitions_number(c.oid)," "p.partdatatype, p.partstartvalue_ts :: date, p.partstartvalue_int," "CASE WHEN p.partinterval_type=5 THEN p.partinterval_int || ' MONTH' " "WHEN p.partinterval_type=4 THEN p.partinterval_int || ' DAY' " @@ -3239,7 +3239,7 @@ describeOneTableDetails(const char *schemaname, { char * partdatatype; const char *part_by = _("Partition By"); - const char *nparts = _("# Of Partitions"); + const char *nparts = _("Partitions number"); const char *start_with = _("Start With"); const char *interv = _("Interval Of Partition"); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 324dac39..8f79ca30 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5720,6 +5720,9 @@ DESCR("get top-most partition root parent"); DATA(insert OID = 4690 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); DESCR("view ancestors of the partition"); +/* get partition interval children count */ +DATA(insert OID = 
4691 ( partitions_number PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ partitions_number _null_ _null_ _null_ )); +DESCR("get partition interval children count "); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index 585bc16e..5dc0e217 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -102,6 +102,8 @@ extern int RelationGetPartitionIdxByValue(Relation rel, Datum value); extern List *RelationGetAllPartitions(Relation rel); +extern int GetAllPartitionIntervalCount(Oid parent_oid); + extern int RelationGetChildIndex(Relation rel, Oid childoid); extern Oid RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx); From 99635e22edda78fbfd88ea814a92b2c7e4d6fffa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 21 Dec 2021 21:25:34 +0800 Subject: [PATCH 319/578] fix regress errors --- contrib/pg_clean/pg_clean.c | 3198 ----------------- src/test/regress/expected/create_table.out | 8 +- src/test/regress/expected/inherit_3.out | 20 +- src/test/regress/expected/join_3.out | 181 +- src/test/regress/expected/limit.out | 2 +- .../regress/expected/partition_join_2.out | 377 +- src/test/regress/expected/rowsecurity_1.out | 8 +- src/test/regress/expected/sanity_check.out | 2 + .../regress/expected/select_parallel_4.out | 110 +- src/test/regress/expected/stats_ext_2.out | 20 +- src/test/regress/expected/subselect.out | 4 +- src/test/regress/expected/sysviews.out | 4 +- 12 files changed, 399 insertions(+), 3535 deletions(-) delete mode 100644 contrib/pg_clean/pg_clean.c diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c deleted file mode 100644 index 08a189f9..00000000 --- a/contrib/pg_clean/pg_clean.c +++ /dev/null @@ -1,3198 +0,0 @@ -#include "postgres.h" -#include "fmgr.h" -#include "funcapi.h" -#include "miscadmin.h" - -#include -#include -#include -#include -#include - -#include "storage/procarray.h" -#include "storage/lwlock.h" -#include "storage/proc.h" -#include "utils/varlena.h" -#include "utils/lsyscache.h" -#include "utils/palloc.h" -#include "utils/builtins.h" - -#include "executor/tuptable.h" -#include "pgxc/execRemote.h" -#include "pgxc/pgxcnode.h" -#include "access/tupdesc.h" -#include "access/htup_details.h" -#include "lib/stringinfo.h" - -#include "access/gtm.h" -#include "datatype/timestamp.h" -#include "access/xact.h" -#include "pgxc/pgxcnode.h" -#include "pgxc/poolmgr.h" -#include "utils/timestamp.h" -#include "catalog/pg_control.h" -#include "commands/dbcommands.h" - -#include "utils/memutils.h" -#include "nodes/memnodes.h" - -#ifdef XCP -#include "catalog/pg_type.h" -#include "catalog/pgxc_node.h" -#include "executor/executor.h" -#include "nodes/makefuncs.h" -#include "utils/snapmgr.h" -#endif -#ifdef PGXC -#include "pgxc/nodemgr.h" -#include "pgxc/pgxc.h" -#endif - -#include "storage/fd.h" -#include "pgstat.h" -#include "access/xact.h" -#include 
"access/twophase.h" -#include "access/hash.h" - -/*hash_create hash_search*/ -#include "utils/hsearch.h" - -#define TWOPHASE_RECORD_DIR "pg_2pc" -int transaction_threshold = 200000; -#define MAXIMUM_CLEAR_FILE 10000 -#define MAXIMUM_OUTPUT_FILE 1000 -#define XIDPREFIX "_$XC$" -#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ -#endif -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; - - -PG_MODULE_MAGIC; - -#define MAX_GID 50 -#define MAX_DBNAME 64 -#define GET_START_XID "startxid:" -#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" -#define GET_START_NODE "startnode:" -#define GET_NODE "nodes:" -#define GET_XID "\nxid:" -#define GET_READONLY "readonly" -#define GIDSIZE (200 + 24) -#define MAX_TWOPC_TXN 1000 -#define STRING_BUFF_LEN 1024 - -#define MAX_CMD_LENGTH 120 - -#define XIDFOUND 1 -#define XIDNOTFOUND -1 -#define XIDEXECFAIL -2 - -#define FILEFOUND 1 -#define FILEUNKOWN -1 -#define FILENOTFOUND -2 - -#define INIT(x)\ -do{\ - x = NULL;\ - x##_count = 0;\ - x##_size = 0;\ -}while(0); - -#define RPALLOC(x)\ -do{\ - if (x##_size < x##_count+1)\ - {\ - int temp_size = (x##_size > 0) ? x##_size : 1;\ - if (NULL == x)\ - {\ - x = palloc0(2*temp_size*sizeof(*x));\ - }\ - else\ - {\ - x = repalloc(x, 2*temp_size*sizeof(*x));\ - }\ - x##_size = 2*temp_size;\ - }\ -}while(0); - -#define PALLOC(x, y)\ -do{\ - RPALLOC(x);\ - x[x##_count] = y;\ - x##_count++;\ -}while(0); - -#define RFREE(x)\ -do{\ - if (x##_size > 0)\ - {\ - pfree(x);\ - }\ - x = NULL;\ - x##_count = 0;\ - x##_size = 0;\ -}while(0); - -#define ENUM_TOCHAR_CASE(x) case x: return(#x); - -/*data structures*/ -typedef enum TXN_STATUS -{ - TXN_STATUS_INITIAL = 0, /* Initial */ - TXN_STATUS_PREPARED, - TXN_STATUS_COMMITTED, - TXN_STATUS_ABORTED, - TXN_STATUS_INPROGRESS, - TXN_STATUS_FAILED, /* Error detected while interacting with the node */ - TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ -} TXN_STATUS; - - -typedef enum -{ - UNDO = 0, - ABORT, - COMMIT -} OPERATION; - -typedef enum -{ - TWOPHASE_FILE_EXISTS = 0, - TWOPHASE_FILE_NOT_EXISTS, - TWOPHASE_FILE_OLD, - TWOPHASE_FILE_ERROR -}TWOPHASE_FILE_STATUS; - -typedef struct txn_info -{ - char gid[MAX_GID]; - uint32 *xid; /* xid used in prepare */ - TimestampTz *prepare_timestamp; - char *owner; - char *participants; - Oid origcoord; /* Original coordinator who initiated the txn */ - bool after_first_phase; - uint32 startxid; /* xid in Original coordinator */ - bool isorigcoord_part; /* Is original coordinator a - participant? */ - int num_dnparts; /* Number of participant datanodes */ - int num_coordparts; /* Number of participant coordinators */ - int *dnparts; /* Whether a node was participant in the txn */ - int *coordparts; - TXN_STATUS *txn_stat; /* Array for each nodes */ - char *msg; /* Notice message for this txn. 
*/ - GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ - - TXN_STATUS global_txn_stat; - OPERATION op; - bool op_issuccess; - bool is_readonly; - bool belong_abnormal_node; -}txn_info; - -typedef struct database_info -{ - struct database_info *next; - char *database_name; - - HTAB *all_txn_info; -#if 0 - txn_info *head_txn_info; - txn_info *last_txn_info; -#endif -} database_info; - -typedef struct -{ - int index; - txn_info **txn; - int txn_count; - int txn_size; - MemoryContext mycontext; -} print_txn_info; - -typedef struct -{ - int index; - int count; - char **gid; - int gid_count; - int gid_size; - char **database; - int database_count; - int database_size; - char **global_status; - int global_status_count; - int global_status_size; - char **status; - int status_count; - int status_size; - MemoryContext mycontext; -} print_status; - -typedef struct -{ - char ***slot; /*slot[i][j] stores value of row i, colum j*/ - int slot_count; /*number of rows*/ - int slot_size; - int attnum; -}TupleTableSlots; - -/*global variable*/ -static Oid *cn_node_list = NULL; -static Oid *dn_node_list = NULL; -static bool *cn_health_map = NULL; -static bool *dn_health_map = NULL; -static int cn_nodes_num = 0; -static int dn_nodes_num = 0; -static int pgxc_clean_node_count = 0; -static Oid my_nodeoid; -static -database_info *head_database_info = NULL; -static -database_info *last_database_info = NULL; -bool execute = false; -int total_twopc_txn = 0; - -TimestampTz current_time; -GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; -char *abnormal_nodename = NULL; -Oid abnormal_nodeoid = InvalidOid; -bool clear_2pc_belong_node = false; - - -/*function list*/ - /*plugin entry function*/ - -static bool check_node_health(Oid node_oid); -static Datum - execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); -void DestroyTxnHash(void); -static void ResetGlobalVariables(void); - -static Oid - getMyNodeoid(void); -static void - getDatabaseList(void); -static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); -static void DropTupleTableSlots(TupleTableSlots * -Slots); -static void - getTxnInfoOnNodesAll(void); -void getTxnInfoOnNode(Oid node); -void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, - TimestampTz prepared_time, TXN_STATUS status); -TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); -static txn_info * - find_txn(char *gid); -txn_info* - make_txn_info(char * dbname, char * gid, char * owner); -database_info* - find_database_info(char *database_name); -database_info* - add_database_info(char *database_name); -int find_node_index(Oid node_oid); -Oid find_node_oid(int node_idx); -void getTxnInfoOnOtherNodesAll(void); -void getTxnInfoOnOtherNodesForDatabase(database_info *database); -void getTxnInfoOnOtherNodes(txn_info *txn); -int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); -int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); - -char *get2PCInfo(const char *tid); - -void getTxnStatus(txn_info * txn, int node_idx); -void recover2PCForDatabaseAll(void); -void recover2PCForDatabase(database_info * db_info); -#if 0 -static bool - setMaintenanceMode(bool status); -#endif -bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); -bool check_2pc_belong_node(txn_info * txn); -bool check_node_participate(txn_info * txn, int node_idx); - -void recover2PC(txn_info * 
txn); -TXN_STATUS - check_txn_global_status(txn_info *txn); -bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); -bool clean_2PC_files(txn_info *txn); -void Init_print_txn_info(print_txn_info *print_txn); -void Init_print_stats_all(print_status *pstatus); -void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); -static const char * - txn_status_to_string(TXN_STATUS status); -static const char * - txn_op_to_string(OPERATION op); -static void - CheckFirstPhase(txn_info *txn); -static void - get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); -static void - get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); - -Datum pg_clean_execute(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_execute); -Datum pg_clean_execute(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_txn_info *print_txn = NULL; - txn_info *temp_txn; - char txn_gid[100]; - char txn_status[100]; - char txn_op[100]; - char txn_op_issuccess[100]; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - TupleDesc tupdesc; - MemoryContext mycontext; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); - print_txn = (print_txn_info *) funcctx->user_fctx; - - - MemoryContextSwitchTo(oldcontext); - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - execute = true; - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) - { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; - } - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*add my database info*/ - add_database_info(get_database_name(MyDatabaseId)); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - recover2PCForDatabaseAll(); - - Init_print_txn_info(print_txn); - - print_txn->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - print_txn = (print_txn_info *) funcctx->user_fctx; - - if (print_txn->index < print_txn->txn_count) - { - temp_txn = print_txn->txn[print_txn->index]; - strncpy(txn_gid, temp_txn->gid, 100); - strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); - strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); - if (temp_txn->op_issuccess) - strncpy(txn_op_issuccess, "success", 100); - else - strncpy(txn_op_issuccess, "fail", 100); - - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(txn_gid)); - values[1] = PointerGetDatum(cstring_to_text(txn_status)); - values[2] = PointerGetDatum(cstring_to_text(txn_op)); - values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - print_txn->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - - //MemoryContextDelete(print_txn->mycontext); - DestroyTxnHash(); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - -/* - * clear 2pc after oss detect abnormal node and restart it , - * only clear 2pc belong the abnormal node and before the abnormal time - */ -Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); -Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_txn_info *print_txn = NULL; - txn_info *temp_txn; - char txn_gid[100]; - char txn_status[100]; - char txn_op[100]; - char txn_op_issuccess[100]; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - TupleDesc tupdesc; - MemoryContext mycontext; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); - print_txn = (print_txn_info *) funcctx->user_fctx; - - - MemoryContextSwitchTo(oldcontext); - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - execute = true; - clear_2pc_belong_node = true; - - abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); - if (InvalidOid == abnormal_nodeoid) - { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); - } - abnormal_time = PG_GETARG_INT64(1); - current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time) - { - elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); - } - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*add my database info*/ - add_database_info(get_database_name(MyDatabaseId)); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - recover2PCForDatabaseAll(); - - Init_print_txn_info(print_txn); - - print_txn->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - print_txn = (print_txn_info *) funcctx->user_fctx; - - if (print_txn->index < print_txn->txn_count) - { - temp_txn = print_txn->txn[print_txn->index]; - strncpy(txn_gid, temp_txn->gid, 100); - strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); - strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); - if (temp_txn->op_issuccess) - strncpy(txn_op_issuccess, "success", 100); - else - strncpy(txn_op_issuccess, "fail", 100); - - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(txn_gid)); - values[1] = PointerGetDatum(cstring_to_text(txn_status)); - values[2] = PointerGetDatum(cstring_to_text(txn_op)); - values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - print_txn->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - DestroyTxnHash(); - pfree(abnormal_nodename); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - - -Datum pg_clean_check_txn(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_check_txn); -Datum pg_clean_check_txn(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_status *pstatus = NULL; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - execute = false; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - MemoryContext mycontext; - TupleDesc tupdesc; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); - pstatus = (print_status *) funcctx->user_fctx; - pstatus->index = pstatus->count = 0; - pstatus->gid = NULL; - pstatus->global_status = pstatus->status = (char **)NULL; - pstatus->database = NULL; - pstatus->mycontext = NULL; - - - MemoryContextSwitchTo(oldcontext); - - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) - { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; - } - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*get all database info*/ - getDatabaseList(); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - Init_print_stats_all(pstatus); - - pstatus->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - pstatus = (print_status *) funcctx->user_fctx; - - if (pstatus->index < pstatus->count) - { - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); - values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); - values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); - values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - pstatus->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - /* - MemoryContextDelete(pstatus->mycontext); - DropDatabaseInfo(); - */ - DestroyTxnHash(); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - -void DestroyTxnHash(void) -{ - database_info *dbinfo = head_database_info; - while (dbinfo) - { - hash_destroy(dbinfo->all_txn_info); - dbinfo = dbinfo->next; - } -} - -static void ResetGlobalVariables(void) -{ - cn_node_list = NULL; - dn_node_list = NULL; - cn_health_map = NULL; - dn_health_map = NULL; - cn_nodes_num = 0; - dn_nodes_num = 0; - pgxc_clean_node_count = 0; - execute = false; - total_twopc_txn = 0; - - head_database_info = last_database_info = NULL; - - current_time = 0; - abnormal_time = InvalidGlobalTimestamp; - abnormal_nodename = NULL; - abnormal_nodeoid = InvalidOid; - clear_2pc_belong_node = false; - -} - -static Oid getMyNodeoid(void) -{ - return get_pgxc_nodeoid(PGXCNodeName); -} - -/* - * execute_query_on_single_node -- execute query on certain node and get results - * input: node oid, execute query, number of attribute in results, results - * return: (Datum) 0 - */ -static Datum -execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node -{ - int ii; - bool issuccess = false; - - /*check health of node*/ - bool ishealthy = check_node_health(node); - -#ifdef XCP - EState *estate; - MemoryContext oldcontext; - RemoteQuery *plan; - RemoteQueryState *pstate; - TupleTableSlot *result = NULL; - Var *dummy; - char ntype = PGXC_NODE_NONE; - - /* - * Make up RemoteQuery plan node - */ - plan = makeNode(RemoteQuery); - plan->combine_type = COMBINE_TYPE_NONE; - plan->exec_nodes = makeNode(ExecNodes); - plan->exec_type = EXEC_ON_NONE; - - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, - PGXCNodeGetNodeId(node, &ntype)); - if (ntype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unknown node Oid: %u", node))); - else if (ntype == PGXC_NODE_COORDINATOR) - { - plan->exec_type = EXEC_ON_COORDS; - } - else - { - plan->exec_type = EXEC_ON_DATANODES; - } - - plan->sql_statement = (char *)query; - plan->force_autocommit 
= false; - /* - * We only need the target entry to determine result data type. - * So create dummy even if real expression is a function. - */ - for (ii = 1; ii <= attnum; ii++) - { - dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, - makeTargetEntry((Expr *) dummy, ii, NULL, false)); - } - /* prepare to execute */ - estate = CreateExecutorState(); - oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - estate->es_snapshot = GetActiveSnapshot(); - pstate = ExecInitRemoteQuery(plan, estate, 0); - MemoryContextSwitchTo(oldcontext); - - /*execute query on node when node is healthy*/ - INIT(tuples->slot); - tuples->attnum = 0; - if (ishealthy) - { - int i_tuple = 0; - int i_attnum = 0; - issuccess = true; - result = ExecRemoteQuery((PlanState *) pstate); - tuples->attnum = attnum; - while (result != NULL && !TupIsNull(result)) - { - slot_getallattrs(result); - RPALLOC(tuples->slot); - tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); - - for (i_attnum = 0; i_attnum < attnum; i_attnum++) - { - /*if (result->tts_values[i_attnum] != (Datum)0)*/ - if (result->tts_isnull[i_attnum] == false) - { - tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); - } - else - { - tuples->slot[i_tuple][i_attnum] = NULL; - } - } - tuples->slot_count++; - - result = ExecRemoteQuery((PlanState *) pstate); - i_tuple++; - } - } - ExecEndRemoteQuery(pstate); -#endif - return issuccess == true ? (Datum) 1 : (Datum) 0; -} - -static bool check_node_health(Oid node_oid) -{ - int i; - bool ishealthy = false; - - PoolPingNodeRecheck(node_oid); - PgxcNodeGetHealthMap(cn_node_list, dn_node_list, - &cn_nodes_num, &dn_nodes_num, - cn_health_map, dn_health_map); - if (get_pgxc_nodetype(node_oid) == 'C') - { - for (i = 0; i < cn_nodes_num; i++) - { - if (cn_node_list[i] == node_oid) - { - ishealthy = cn_health_map[i]; - } - } - } - else - { - for (i = 0; i < dn_nodes_num; i++) - { - if (dn_node_list[i] == node_oid) - { - ishealthy = dn_health_map[i]; - } - } - } - return ishealthy; -} - -static void getDatabaseList(void) -{ - int i; - TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_database;"; - /*add datname into tail of head_database_info*/ - if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) - { - for (i = 0; i < result_db.slot_count; i++) - { - if (TTSgetvalue(&result_db, i, 0)) - { - add_database_info(TTSgetvalue(&result_db, i, 0)); - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); - } - DropTupleTableSlots(&result_db); -} - -/* - * TTSgetvalue -- get attribute from TupleTableSlots - * input: result, index of tuple, index of field - * return: attribute result - */ -static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) -{ - return result->slot[tup_num][field_num]; -} - -static void DropTupleTableSlots(TupleTableSlots * -Slots) -{ - int i; - int j; - for (i = 0; i < Slots->slot_count; i++) - { - if (Slots->slot[i]) - { - for (j = 0; j < Slots->attnum; j++) - { - if (Slots->slot[i][j]) - { - pfree(Slots->slot[i][j]); - } - } - pfree(Slots->slot[i]); - } - } - RFREE(Slots->slot); - Slots->attnum = 0; - return; -} - -static void getTxnInfoOnNodesAll(void) -{ - int i; - current_time = GetCurrentTimestamp(); - /*upload 2PC transaction from CN*/ - for (i = 0; i < cn_nodes_num; i++) - { - if (total_twopc_txn >= MAX_TWOPC_TXN) - return; - 
getTxnInfoOnNode(cn_node_list[i]); - } - - /*upload 2PC transaction from DN*/ - for (i = 0; i < dn_nodes_num; i++) - { - if (total_twopc_txn >= MAX_TWOPC_TXN) - return; - getTxnInfoOnNode(dn_node_list[i]); - } -} - -void getTxnInfoOnNode(Oid node) -{ - int i; - TupleTableSlots result_txn; - Datum execute_res; - char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts;"; - const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts where database = '%s';"; - snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); - - if (execute) - execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); - else - execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); - - if (execute_res == (Datum) 1) - { - for (i = 0; i < result_txn.slot_count; i++) - { - uint32 xid; - char* gid; - char* owner; - char* datname; - TimestampTz prepared_time; - - /*read results from each tuple*/ - xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); - gid = TTSgetvalue(&result_txn, i, 1); - owner = TTSgetvalue(&result_txn, i, 2); - datname = TTSgetvalue(&result_txn, i, 3); - prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, - CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), - ObjectIdGetDatum(InvalidOid), - Int32GetDatum(-1))); - - /*add txn to database*/ - add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); - if (total_twopc_txn >= MAX_TWOPC_TXN) - { - break; - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); - } - DropTupleTableSlots(&result_txn); -} - -void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, - char * owner, TimestampTz prepared_time, TXN_STATUS status) -{ - txn_info *txn = NULL; - int nodeidx; - - if ((txn = find_txn(gid)) == NULL) - { - txn = make_txn_info(dbname, gid, owner); - total_twopc_txn++; - if (txn == NULL) - { - /*no more memory*/ - elog(ERROR, "there is no more memory for palloc a 2PC transaction"); - } - } - nodeidx = find_node_index(node_oid); - txn->txn_stat[nodeidx] = status; - txn->xid[nodeidx] = xid; - txn->prepare_timestamp[nodeidx] = prepared_time; - if (nodeidx < cn_nodes_num) - { - txn->coordparts[nodeidx] = 1; - txn->num_coordparts++; - } - else - { - txn->dnparts[nodeidx-cn_nodes_num] = 1; - txn->num_dnparts++; - } - return; -} - -TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) -{ - /*get all the participates and initiate to each transactions*/ - TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; - TupleTableSlots result; - char *partnodes = NULL; - char *startnode = NULL; - char *file_content = NULL; - uint32 startxid = 0; - char *str_startxid = NULL; - char *str_timestamp = NULL; - char *temp = NULL; - Oid temp_nodeoid; - char temp_nodetype; - int temp_nodeidx; - char stmt[1024]; - static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); - - if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) - { - if (result.slot_count && TTSgetvalue(&result, 0, 0)) -#if 0 - TTSgetvalue(&result, 0, 0) && - TTSgetvalue(&result, 0, 1) && - TTSgetvalue(&result, 0, 2)) -#endif - { - file_content = TTSgetvalue(&result, 0, 
0); - - if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) - { - txn->is_readonly = true; - txn->global_txn_stat = TXN_STATUS_COMMITTED; - DropTupleTableSlots(&result); - return TWOPHASE_FILE_EXISTS; - } - startnode = strstr(file_content, GET_START_NODE); - str_startxid = strstr(file_content, GET_START_XID); - partnodes = strstr(file_content, GET_NODE); - temp = strstr(file_content, GET_COMMIT_TIMESTAMP); - - /* get the last global_commit_timestamp */ - while (temp) - { - str_timestamp = temp; - temp += strlen(GET_COMMIT_TIMESTAMP); - temp = strstr(temp, GET_COMMIT_TIMESTAMP); - } - - if (startnode) - { - startnode += strlen(GET_START_NODE); - startnode = strtok(startnode, "\n"); - txn->origcoord = get_pgxc_nodeoid(startnode); - } - - if (str_startxid) - { - str_startxid += strlen(GET_START_XID); - str_startxid = strtok(str_startxid, "\n"); - startxid = strtoul(str_startxid, NULL, 10); - txn->startxid = startxid; - } - - if (partnodes) - { - partnodes += strlen(GET_NODE); - partnodes = strtok(partnodes, "\n"); - txn->participants = (char *) palloc0(strlen(partnodes) + 1); - strncpy(txn->participants, partnodes, strlen(partnodes) + 1); - } - - if (NULL == startnode || NULL == str_startxid) - { - res = TWOPHASE_FILE_OLD; - DropTupleTableSlots(&result); - return res; - } - - if (NULL == partnodes) - { - res = TWOPHASE_FILE_ERROR; - DropTupleTableSlots(&result); - return res; - } - - if (str_timestamp) - { - str_timestamp += strlen(GET_COMMIT_TIMESTAMP); - str_timestamp = strtok(str_timestamp, "\n"); - txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); - } - - elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", - txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); - /* in explicit transaction startnode participate the transaction */ - if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) - { - txn->isorigcoord_part = true; - } - else - { - txn->isorigcoord_part = false; - } - - res = TWOPHASE_FILE_EXISTS; - txn->num_coordparts = 0; - txn->num_dnparts = 0; - temp = strtok(partnodes,", "); - while(temp) - { - /*check node type*/ - temp_nodeoid = get_pgxc_nodeoid(temp); - if (temp_nodeoid == InvalidOid) - { - res = TWOPHASE_FILE_ERROR; - break; - } - temp_nodetype = get_pgxc_nodetype(temp_nodeoid); - temp_nodeidx = find_node_index(temp_nodeoid); - - switch (temp_nodetype) - { - case 'C': - txn->coordparts[temp_nodeidx] = 1; - txn->num_coordparts++; - break; - case 'D': - txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; - txn->num_dnparts++; - break; - default: - elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); - break; - } - temp = strtok(NULL,", "); - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); - res = TWOPHASE_FILE_ERROR; - } - DropTupleTableSlots(&result); - return res; -} - -static txn_info *find_txn(char *gid) -{ - bool found; - database_info *cur_db; - txn_info *txn; - - for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) - { -#if 0 - for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - if (0 == strcmp(cur_txn->gid, gid)) - return cur_txn; - } -#endif - txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); - if (found) - return txn; - } - return NULL; -} - -txn_info* make_txn_info(char* dbname, char* gid, char* owner) -{ - bool found; - txn_info *txn_insert_pos = NULL; - database_info 
*dbinfo; - txn_info *txn; - - dbinfo = add_database_info(dbname); - txn = (txn_info *)palloc0(sizeof(txn_info)); - if (txn == NULL) - return NULL; - //txn->next = NULL; - - //txn->gid = (char *)palloc0(strlen(gid)+1); - strncpy(txn->gid, gid, strlen(gid)+1); - txn->owner = (char *)palloc0(strlen(owner)+1); - strncpy(txn->owner, owner, strlen(owner)+1); - - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); - - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL - || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) - { - pfree(txn); - return(NULL); - } - - txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, - (void *)txn->gid, HASH_ENTER, &found); - if (!found) - memcpy(txn_insert_pos, txn, sizeof(txn_info)); - -#if 0 - if (dbinfo->head_txn_info == NULL) - { - dbinfo->head_txn_info = dbinfo->last_txn_info = txn; - } - else - { - dbinfo->last_txn_info->next = txn; - dbinfo->last_txn_info = txn; - } -#endif - - return txn_insert_pos; -} - -database_info *find_database_info(char *database_name) -{ - database_info *cur_database_info = head_database_info; - - for (;cur_database_info; cur_database_info = cur_database_info->next) - { - if(cur_database_info->database_name && - database_name && - strcmp(cur_database_info->database_name, database_name) == 0) - return(cur_database_info); - } - return(NULL); -} - -database_info *add_database_info(char *database_name) -{ - database_info *rv; - HASHCTL txn_ctl; - char tabname[STRING_BUFF_LEN]; - - if ((rv = find_database_info(database_name)) != NULL) - return rv; /* Already in the list */ - rv = (database_info *)palloc0(sizeof(database_info)); - if (rv == NULL) - return NULL; - rv->next = NULL; - rv->database_name = (char *)palloc0(strlen(database_name) + 1); - strncpy(rv->database_name, database_name, strlen(database_name) + 1); - if (rv->database_name == NULL) - { - pfree(rv); - return NULL; - } -#if 0 - rv->head_txn_info = NULL; - rv->last_txn_info = NULL; -#endif - - snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); - txn_ctl.keysize = MAX_GID; - txn_ctl.entrysize = sizeof(txn_info); - rv->all_txn_info = hash_create(tabname, 64, - &txn_ctl, HASH_ELEM); - if (head_database_info == NULL) - { - head_database_info = last_database_info = rv; - return rv; - } - else - { - last_database_info->next = rv; - last_database_info = rv; - return rv; - } -} - -int find_node_index(Oid node_oid) -{ - int res = -1; - int i; - if (get_pgxc_nodetype(node_oid) == 'C') - { - for (i = 0; i < cn_nodes_num; i++) - { - if (node_oid == cn_node_list[i]) - { - res = i; - break; - } - } - } - else - { - for (i = 0; i < dn_nodes_num; i++) - { - if (node_oid == dn_node_list[i]) - { - res = i+cn_nodes_num; - break; - } - } - } - return res; -} - -Oid find_node_oid(int node_idx) -{ - return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : - dn_node_list[node_idx-cn_nodes_num]; -} - -void getTxnInfoOnOtherNodesAll(void) -{ - database_info *cur_database; - - for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) - { - getTxnInfoOnOtherNodesForDatabase(cur_database); - } -} - -void getTxnInfoOnOtherNodesForDatabase(database_info *database) -{ - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn = database->all_txn_info; - hash_seq_init(&status, txn); - - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - getTxnInfoOnOtherNodes(cur_txn); - } -#if 0 - for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - getTxnInfoOnOtherNodes(cur_txn); - } -#endif -} - -void getTxnInfoOnOtherNodes(txn_info *txn) -{ - int ii; - int ret; - char node_type; - TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; - Oid node_oid; - uint32 transactionid = 0; - char gid[MAX_GID]; - char *ptr = NULL; - - if (IsXidImplicit(txn->gid)) - { - strncpy(gid, txn->gid, strlen(txn->gid)+1); - ptr = strtok(gid, ":"); - ptr = strtok(NULL, ":"); - node_oid = get_pgxc_nodeoid(ptr); - status = GetTransactionPartNodes(txn, node_oid); - } - else - { - for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) - { - if (ii < cn_nodes_num) - { - status = GetTransactionPartNodes(txn, cn_node_list[ii]); - if (TWOPHASE_FILE_EXISTS == status || - TWOPHASE_FILE_OLD == status || - TWOPHASE_FILE_ERROR == status) - { - node_oid = cn_node_list[ii]; - break; - } - } - else - { - status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); - if (TWOPHASE_FILE_EXISTS == status || - TWOPHASE_FILE_OLD == status || - TWOPHASE_FILE_ERROR == status) - { - node_oid = dn_node_list[ii - cn_nodes_num]; - break; - } - } - } - - /* since there may be explicit readonly twophase transactions */ - if (txn->is_readonly) - { - return; - } - if (TWOPHASE_FILE_EXISTS == status && - InvalidGlobalTimestamp == txn->global_commit_timestamp && - node_oid != txn->origcoord) - { - status = GetTransactionPartNodes(txn, txn->origcoord); - } - - } - - if (TWOPHASE_FILE_EXISTS != status) - { - /* - * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, - * - */ - txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
- TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; - return; - } - - - /* judge the range of global status */ - CheckFirstPhase(txn); - - for (ii = 0; ii < pgxc_clean_node_count; ii++) - { - if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) - { - /*check node ii is 'C' or 'D'*/ - node_oid = find_node_oid(ii); - if (node_oid == txn->origcoord) - continue; - node_type = get_pgxc_nodetype(node_oid); - if (node_type == 'C' && txn->coordparts[ii] != 1) - continue; - if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) - continue; - /*check coordparts or dnparts*/ - if (txn->xid[ii] == 0) - { - ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); - if (ret == XIDFOUND) - { - txn->xid[ii] = transactionid; - if (txn->xid[ii] > 0) - getTxnStatus(txn, ii); - } - else if (ret == XIDNOTFOUND) - { - if (txn->after_first_phase) - txn->txn_stat[ii] = TXN_STATUS_COMMITTED; - } - else - txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; - - } - } - } -} - -/*get xid by gid on node_oid*/ -int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) -{ - int ret = XIDFOUND; - TupleTableSlots result; - uint32 xid = 0; - static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; - char stmt[100]; - snprintf(stmt, 100, STMT_FORM, gid); - /*if exist get xid by gid on node_oid*/ - if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) - { - if (result.slot_count) - { - if (TTSgetvalue(&result, 0, 0)) - { - xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); - *transactionid = xid; - if (xid == 0) - ret = XIDNOTFOUND; - } - else - ret = XIDNOTFOUND; - } - else - ret = XIDNOTFOUND; - } - else - ret = XIDEXECFAIL; - DropTupleTableSlots(&result); - return ret; -} - -int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) -{ - int ret = FILEFOUND; - TupleTableSlots result; - static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; - char stmt[100]; - snprintf(stmt, 100, STMT_FORM, gid); - /*if exist get xid by gid on node_oid*/ - if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) - { - if (result.slot_count) - { - if (!TTSgetvalue(&result, 0, 0)) - { - ret = FILENOTFOUND; - } - else - { - ret = FILEFOUND; - } - } - else - ret = FILENOTFOUND; - } - else - ret = FILEUNKOWN; - DropTupleTableSlots(&result); - return ret; -} - - -void getTxnStatus(txn_info *txn, int node_idx) -{ - Oid node_oid; - char stmt[1024]; - char *att1; - TupleTableSlots result; - - static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); - - node_oid = find_node_oid(node_idx); - if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) - { - att1 = TTSgetvalue(&result, 0, 0); - - if (att1) - { - if (strcmp(att1, "true") == 0) - { - txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; - } - else - txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; - } - else - { - txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; - } - } - else - txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; - DropTupleTableSlots(&result); -} - -char *get2PCInfo(const char *tid) -{ - char *result = NULL; - char *info = NULL; - int size = 0; - File fd = -1; - int ret = -1; - struct stat filestate; - char path[MAXPGPATH]; - - info = get_2pc_info_from_cache(tid); - if (NULL != info) - { - size = strlen(info); - result = (char *)palloc0(size + 1); - memcpy(result, info, size); - return result; - } - - elog(LOG, "try to get 2pc info from disk, tid: %s", tid); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - 
if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - size = filestate.st_size; - - if (0 == size) - { - return NULL; - } - - result = (char *)palloc0(size + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - pfree(result); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); - if(ret != size) - { - pfree(result); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - return result; - } - - return NULL; -} - -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - t_result = cstring_to_text(result); - pfree(result); - return PointerGetDatum(t_result); - } - PG_RETURN_NULL(); -} - - -Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); -Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *nodename = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - nodename = strstr(result, GET_NODE); - if (NULL != nodename) - { - nodename += strlen(GET_NODE); - nodename = strtok(nodename, "\n"); - t_result = cstring_to_text(nodename); - pfree(result); - return PointerGetDatum(t_result); - } - } - - PG_RETURN_NULL(); -} - -Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); -Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *nodename = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - nodename = strstr(result, GET_START_NODE); - if (NULL != nodename) - { - nodename += strlen(GET_START_NODE); - nodename = strtok(nodename, "\n"); - t_result = cstring_to_text(nodename); - pfree(result); - return PointerGetDatum(t_result); - - } - } - PG_RETURN_NULL(); -} - -Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); -Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *startxid = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - startxid = strstr(result, GET_START_XID); - if (NULL != startxid) - { - startxid += strlen(GET_START_XID); - startxid = strtok(startxid, "\n"); - t_result = cstring_to_text(startxid); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} - - -Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); -Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *commit_timestamp = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); - if (NULL != commit_timestamp) - { - commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); - commit_timestamp = 
strtok(commit_timestamp, "\n"); - t_result = cstring_to_text(commit_timestamp); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} - - - -Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); -Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *str_xid = NULL; - GlobalTransactionId xid; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - str_xid = strstr(result, GET_XID); - if (NULL != str_xid) - { - str_xid += strlen(GET_XID); - str_xid = strtok(str_xid, "\n"); - xid = strtoul(str_xid, NULL, 10); - pfree(result); - PG_RETURN_UINT32(xid); - } - } - PG_RETURN_NULL(); -} - -Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); -Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) -{ - char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - remove_2pc_records(tid, true); - pfree(tid); - PG_RETURN_BOOL(true); -} - -Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); -Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) -{ - MemoryContext oldcontext; - MemoryContext mycontext; - - int i = 0; - int count = 0; - TupleTableSlots *result; - TupleTableSlots clear_result; - const char *query = "select pgxc_get_record_list()::text"; - const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; - char clear_query[100]; - char *twopcfiles = NULL; - char *ptr = NULL; - bool res = true; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - mycontext = AllocSetContextCreate(CurrentMemoryContext, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - ResetGlobalVariables(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while((ptr = readdir(dir)) != NULL) - { - if (count > 999) - break; - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - count++; - } - - closedir(dir); - } -#endif - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); - - /*collect the 2pc file in nodes*/ - for (i = 0; i < cn_nodes_num; i++) - { - (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); - } - - for (i = 0; i < dn_nodes_num; i++) - { - (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); - } - /*get all database info*/ - getDatabaseList(); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while (i < count) - { - if (!find_txn(path[i])) - { - unlink(path[i]); - WriteClean2pcXlogRec(path[i]); - } - i++; - } - - closedir(dir); - } -#endif - /*delete all rest 2pc file in each nodes*/ - for (i = 0; i < cn_nodes_num; i++) - { - if (0 == result[i].slot_count) - { - continue; - } - if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) - continue; - ptr = strtok(twopcfiles, ","); - while(ptr) - { - if (count >= MAXIMUM_CLEAR_FILE) - break; - if (!find_txn(ptr)) - 
{ - snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) - res = false; - DropTupleTableSlots(&clear_result); - count++; - } - ptr = strtok(NULL, ","); - } - } - - for (i = 0; i < dn_nodes_num; i++) - { - if (0 == result[cn_nodes_num+i].slot_count) - { - continue; - } - if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) - continue; - ptr = strtok(twopcfiles, ","); - while(ptr) - { - if (count >= MAXIMUM_CLEAR_FILE) - break; - if (!find_txn(ptr)) - { - snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) - res = false; - DropTupleTableSlots(&clear_result); - count++; - } - ptr = strtok(NULL, ","); - } - } - - for (i = 0; i < pgxc_clean_node_count; i++) - DropTupleTableSlots(result+i); - - DestroyTxnHash(); - ResetGlobalVariables(); - - MemoryContextSwitchTo(oldcontext); - MemoryContextDelete(mycontext); - - - PG_RETURN_BOOL(res); -} - -Datum pgxc_get_record_list(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_record_list); -Datum pgxc_get_record_list(PG_FUNCTION_ARGS) -{ - int count = 0; - DIR *dir = NULL; - struct dirent *ptr = NULL; - char *recordList = NULL; - text *t_recordList = NULL; - - /* get from hash table */ - recordList = get_2pc_list_from_cache(&count); - if (count >= MAXIMUM_OUTPUT_FILE) - { - Assert(NULL != recordList); - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } - - /* get from disk */ - if(!(dir = opendir(TWOPHASE_RECORD_DIR))) - { - if(NULL == recordList) - { - PG_RETURN_NULL(); - } - - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } - - while((ptr = readdir(dir)) != NULL) - { - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - if (count >= MAXIMUM_OUTPUT_FILE) - { - break; - } - - if(!recordList) - { - recordList = (char *)palloc0(strlen(ptr->d_name) + 1); - sprintf(recordList, "%s", ptr->d_name); - } - else - { - recordList = (char *) repalloc(recordList, - strlen(ptr->d_name) + strlen(recordList) + 2); - sprintf(recordList, "%s,%s", recordList, ptr->d_name); - } - count++; - } - - closedir(dir); - - if(!recordList) - { - PG_RETURN_NULL(); - } - else - { - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } -} - -Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_commit_on_node); -Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) -{ - /* nodename, gid */ - char *nodename; - Oid nodeoid; - char *gid; - txn_info *txn; - char command[MAX_CMD_LENGTH]; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - PGXCNodeHandle *conn = NULL; - - /*clear Global*/ - ResetGlobalVariables(); - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); - if (InvalidOid == nodeoid) - { - elog(ERROR, "Invalid nodename '%s'", nodename); - } - - txn = (txn_info 
*)palloc0(sizeof(txn_info)); - if (txn == NULL) - { - PG_RETURN_BOOL(false); - } - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - - strncpy(txn->gid, gid, strlen(gid)+1); - getTxnInfoOnOtherNodes(txn); - snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); - - - if (InvalidGlobalTimestamp == txn->global_commit_timestamp) - { - if (!txn->is_readonly) - { - elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); - } - else - { - txn->global_commit_timestamp = GetGlobalTimestampGTM(); - } - } - - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); - get_node_handles(&pgxc_handles, nodeoid); - - conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? - pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; - if (!send_query_clean_transaction(conn, txn, command)) - { - elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , nodename); - } - else - { - connections[conn_count++] = conn; - } - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*clear Global*/ - ResetGlobalVariables(); - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - - PG_RETURN_BOOL(true); -} - -Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_abort_on_node); -Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) -{ - /* nodename, gid */ - char *nodename; - Oid nodeoid; - char *gid; - txn_info *txn; - char command[MAX_CMD_LENGTH]; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - PGXCNodeHandle *conn = NULL; - - /*clear Global*/ - ResetGlobalVariables(); - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); - if (InvalidOid == nodeoid) - { - elog(ERROR, "Invalid nodename '%s'", nodename); - } - - txn = (txn_info *)palloc0(sizeof(txn_info)); - if (txn == NULL) - { - PG_RETURN_BOOL(false); - } - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num 
* sizeof(int)); - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - - strncpy(txn->gid, gid, strlen(gid)+1); - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); - getTxnInfoOnOtherNodes(txn); - snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); -#if 0 - if (!setMaintenanceMode(true)) - { - elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); - } -#endif - - get_node_handles(&pgxc_handles, nodeoid); - - conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? - pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; - if (!send_query_clean_transaction(conn, txn, command)) - { - elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , nodename); - } - else - { - connections[conn_count++] = conn; - } - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*clear Global*/ - ResetGlobalVariables(); - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - - PG_RETURN_BOOL(true); -} - - - -void recover2PCForDatabaseAll(void) -{ - database_info *cur_db = head_database_info; - while (cur_db) - { - recover2PCForDatabase(cur_db); - cur_db = cur_db->next; - } - //clean_old_2PC_files(); -} - -void recover2PCForDatabase(database_info * db_info) -{ - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn = db_info->all_txn_info; - - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - recover2PC(cur_txn); - } -} - -bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) -{ -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_SEND_QUERY >= twophase_exception_case) - { - twophase_in = IN_PG_CLEAN; - } -#endif - if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && - TXN_STATUS_COMMITTED == txn->global_txn_stat && - !txn->is_readonly) - return false; - - if (pgxc_node_send_clean(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - if (txn->is_readonly && pgxc_node_send_readonly(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - - if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); - return false; - } - - /* - * only transaction finished in commit prepared/rollback prepared phase send timestamp - * partial prepared transaction has no need to send other information - */ - if (InvalidGlobalTimestamp != txn->global_commit_timestamp && - pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send global committs for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - if (!txn->is_readonly) - { - if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send start node for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - - if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send start xid for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - - if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send participants for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - } - - if (pgxc_node_send_query(conn, finish_cmd)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send query for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - return true; -} - -bool check_2pc_belong_node(txn_info * txn) -{ - int node_index = 0; - char node_type; - node_index = find_node_index(abnormal_nodeoid); - if (abnormal_nodeoid == txn->origcoord) - { - txn->belong_abnormal_node = true; - return true; - } - node_type = get_pgxc_nodetype(abnormal_nodeoid); - if (node_type == 'C' && txn->coordparts[node_index] == 1) - { - txn->belong_abnormal_node = true; - return true; - } - if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) - { - txn->belong_abnormal_node = true; - return true; - } - txn->belong_abnormal_node = false; - return false; -} - -bool check_node_participate(txn_info * txn, int node_idx) -{ - char node_type = get_pgxc_nodetype(abnormal_nodeoid); - if (PGXC_NODE_COORDINATOR == node_type) - { - return txn->coordparts[node_idx] == 1 ? true : false; - } else if (PGXC_NODE_DATANODE == node_type) - { - return txn->dnparts[node_idx] == 1 ? 
true : false; - } - return false; -} - -void recover2PC(txn_info * txn) -{ - TXN_STATUS txn_stat; - txn_stat = check_txn_global_status(txn); - txn->global_txn_stat = txn_stat; - -#ifdef DEBUG_EXECABORT - txn_stat = TXN_STATUS_ABORTED; -#endif - - switch (txn_stat) - { - case TXN_STATUS_FAILED: - elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_UNKNOWN: - elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_PREPARED: - elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_COMMITTED: - if (InvalidOid == txn->origcoord || txn->is_readonly) - { - txn->op = UNDO; - txn->op_issuccess = true; - } - else - { - txn->op = COMMIT; - /* check whether all nodes can commit prepared */ - if (!clean_2PC_iscommit(txn, true, true)) - { - txn->op_issuccess = false; - elog(LOG, "check commit 2PC transaction %s failed", txn->gid); - return; - } - /* send commit prepared to all nodes */ - if (!clean_2PC_iscommit(txn, true, false)) - { - txn->op_issuccess = false; - elog(LOG, "commit 2PC transaction %s failed", txn->gid); - return; - } - txn->op_issuccess = true; - clean_2PC_files(txn); - } - break; - - case TXN_STATUS_ABORTED: - txn->op = ABORT; - /* check whether all nodes can rollback prepared */ - if (!clean_2PC_iscommit(txn, false, true)) - { - txn->op_issuccess = false; - elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); - return; - } - /* send rollback prepared to all nodes */ - if (!clean_2PC_iscommit(txn, false, false)) - { - txn->op_issuccess = false; - elog(LOG, "rollback 2PC transaction %s failed", txn->gid); - return; - } - txn->op_issuccess = true; - clean_2PC_files(txn); - break; - - case TXN_STATUS_INPROGRESS: - elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - default: - elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); - break; - } - return; -} - -TXN_STATUS check_txn_global_status(txn_info *txn) -{ -#define TXN_PREPARED 0x0001 -#define TXN_COMMITTED 0x0002 -#define TXN_ABORTED 0x0004 -#define TXN_UNKNOWN 0x0008 -#define TXN_INITIAL 0x0010 -#define TXN_INPROGRESS 0X0020 - int ii; - int check_flag = 0; - int node_idx = 0; - TimestampTz prepared_time = 0; - TimestampTz time_gap = clean_time_interval; - - if (!IsXidImplicit(txn->gid) && txn->is_readonly) - { - return TXN_STATUS_COMMITTED; - } - if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) - { - check_flag |= TXN_UNKNOWN; - } - if (txn->global_txn_stat == TXN_STATUS_ABORTED) - { - check_flag |= TXN_ABORTED; - } - - /*check dn participates*/ - for (ii = 0; ii < dn_nodes_num; ii++) - { - if (txn->dnparts[ii] == 1) - { - if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) - check_flag |= TXN_INITIAL; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) - check_flag |= TXN_UNKNOWN; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) - { - check_flag |= TXN_PREPARED; - prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
- txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; - } - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) - check_flag |= TXN_INPROGRESS; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) - check_flag |= TXN_COMMITTED; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) - check_flag |= TXN_ABORTED; - else - return TXN_STATUS_FAILED; - } - } - /*check cn participates*/ - for (ii = 0; ii < cn_nodes_num; ii++) - { - if (txn->coordparts[ii] == 1) - { - if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) - check_flag |= TXN_ABORTED; - else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) - check_flag |= TXN_UNKNOWN; - else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) - { - check_flag |= TXN_PREPARED; - prepared_time = txn->prepare_timestamp[ii] > prepared_time ? - txn->prepare_timestamp[ii] : prepared_time; - } - else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) - check_flag |= TXN_INPROGRESS; - else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) - check_flag |= TXN_COMMITTED; - else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) - check_flag |= TXN_ABORTED; - else - return TXN_STATUS_FAILED; - } - } - - /* - * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not - * if not, check the commit timestamp explicit trans within the time_gap or not - */ -#if 0 - if ((check_flag & TXN_INPROGRESS) || - (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || - (!IsXidImplicit(txn->gid) && - ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || - (txn->after_first_phase && - (InvalidGlobalTimestamp != commit_time && - current_time - commit_time <= time_gap))))) - { - /* transaction inprogress */ - return TXN_STATUS_INPROGRESS; - } -#endif - if (clear_2pc_belong_node) - { - node_idx = find_node_index(abnormal_nodeoid); - if (!check_2pc_belong_node(txn) || - !check_node_participate(txn, node_idx) || - abnormal_time < txn->prepare_timestamp[node_idx]) - { - return TXN_STATUS_INPROGRESS; - } - } - else - { - if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) - { - /* transaction inprogress */ - return TXN_STATUS_INPROGRESS; - } - } - - - if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) - { - return TXN_STATUS_PREPARED; - } - - if (check_flag & TXN_UNKNOWN) - return TXN_STATUS_UNKNOWN; - - if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) - /* Mix of committed and aborted. This should not happen. */ - return TXN_STATUS_UNKNOWN; - - if ((check_flag & TXN_PREPARED) == 0) - /* Should be at least one "prepared statement" in nodes */ - return TXN_STATUS_FAILED; - - if (check_flag & TXN_COMMITTED) - /* Some 2PC transactions are committed. Need to commit others. */ - return TXN_STATUS_COMMITTED; - /* All the transactions remain prepared. No need to recover. 
*/ - return TXN_STATUS_ABORTED; -} - -bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) -{ - int ii; - static const char *STMT_FORM = "%s prepared '%s';"; - static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; - char command[MAX_CMD_LENGTH]; - int node_idx; - Oid node_oid; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - - if (is_commit) - { - if (is_check) - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); - } - else - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); - } - } - else - { - if (is_check) - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); - } - else - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); - } - } - if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) - { - elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); - } - - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); - if (connections == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory for connections"))); - } - get_transaction_handles(&pgxc_handles, txn); - //pgxc_handles = get_handles(nodelist, coordlist, false, true); -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count = 0; - } -#endif - for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) - { - node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; - node_idx = find_node_index(node_oid); - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) - { - continue; - } - /*send global timestamp to dn_node_list[ii]*/ - if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->datanode_handles[ii]; -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count++; - if (1 == exception_count && - PG_CLEAN_ELOG_ERROR == twophase_exception_case) - { - elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); - } - } -#endif - } - } - - for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) - { - node_oid = pgxc_handles->coord_handles[ii]->nodeoid; - node_idx = find_node_index(node_oid); - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) - { - continue; - } - /*send global timestamp to dn_node_list[ii]*/ - if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->coord_handles[ii]; -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count++; - if (1 == exception_count && - PG_CLEAN_ELOG_ERROR == twophase_exception_case) - { - elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); - } - } -#endif - } - - } - - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if 
(pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - if (enable_distri_print) - { - for (ii = 0; ii < conn_count; ii++) - { - if (DN_CONNECTION_STATE_IDLE != connections[ii]->state) - { - elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); - } - } - } - conn_count = 0; - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - - /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ - if (txn->origcoord != InvalidOid) - { - node_idx = find_node_index(txn->origcoord); - if (txn->coordparts[node_idx] == 1) - { - /*send global timestamp to dn_node_list[ii]*/ - - if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) - { - get_node_handles(&pgxc_handles, txn->origcoord); - if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->coord_handles[0]; - } - } - } - } - - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*free hash record from gtm*/ - FinishGIDGTM(txn->gid); - - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - return true; -} - -bool clean_2PC_files(txn_info * txn) -{ - int ii; - TupleTableSlots result; - bool issuccess = true; - static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; - char query[MAX_CMD_LENGTH]; - - snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); - - for (ii = 0; ii < dn_nodes_num; ii++) - { - if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) - { - if (TTSgetvalue(&result, 0, 0) == false) - { - elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", - txn->gid, get_pgxc_nodename(txn->dnparts[ii])); - issuccess = false; - } - } - else - { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); - issuccess = false; - } - DropTupleTableSlots(&result); - if (!issuccess) - return false; - } - - for (ii = 0; ii < cn_nodes_num; ii++) - { - if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) - { - if (TTSgetvalue(&result, 0, 0) == false) - { - elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", - txn->gid, get_pgxc_nodename(txn->coordparts[ii])); - issuccess = false; - } - } - else - { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); - issuccess = false; - } - DropTupleTableSlots(&result); - if (!issuccess) - return false; - } - 
return true; -} - -void Init_print_txn_info(print_txn_info * print_txn) -{ - database_info *cur_database = head_database_info; - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn; - - print_txn->index = 0; - INIT(print_txn->txn); - - for (; cur_database; cur_database = cur_database->next) - { - txn = cur_database->all_txn_info; - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - if (clear_2pc_belong_node && !cur_txn->belong_abnormal_node) - { - continue; - } - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - PALLOC(print_txn->txn, cur_txn); - } - -#if 0 - cur_txn = cur_database->head_txn_info; - for (; cur_txn; cur_txn = cur_txn->next) - { - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - PALLOC(print_txn->txn, cur_txn); - } -#endif - } -} - -void Init_print_stats_all(print_status *pstatus) -{ - database_info *cur_database; - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn; - - pstatus->index = 0; - pstatus->count = 0; - INIT(pstatus->gid); - INIT(pstatus->global_status); - INIT(pstatus->status); - INIT(pstatus->database); - - for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) - { - txn = cur_database->all_txn_info; - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - cur_txn->global_txn_stat = check_txn_global_status(cur_txn); - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - Init_print_stats(cur_txn, cur_database->database_name, pstatus); - } -#if 0 - for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - cur_txn->global_txn_stat = check_txn_global_status(cur_txn); - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - Init_print_stats(cur_txn, cur_database->database_name, pstatus); - } -#endif - } -} - -void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) -{ - int ii; - StringInfoData query; - initStringInfo(&query); - - RPALLOC(pstatus->gid); - RPALLOC(pstatus->global_status); - RPALLOC(pstatus->status); - RPALLOC(pstatus->database); - - pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - - strncpy(pstatus->gid[pstatus->count], txn->gid, 100); - strncpy(pstatus->database[pstatus->count], database, 100); - strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); - - for (ii = 0; ii < pgxc_clean_node_count; ii++) - { - appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), - txn_status_to_string(txn->txn_stat[ii])); - if (ii < pgxc_clean_node_count - 1) - { - appendStringInfoChar(&query, '\n'); - } - } - - pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); - strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); - pstatus->gid_count++; - pstatus->database_count++; - pstatus->global_status_count++; - pstatus->status_count++; - pstatus->count++; -} - -static const char *txn_status_to_string(TXN_STATUS status) -{ - switch (status) - { - ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) - ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) - ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) - ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) - ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) - ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) - ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) - } - return NULL; -} - -static const char 
*txn_op_to_string(OPERATION op) -{ - switch (op) - { - ENUM_TOCHAR_CASE(UNDO) - ENUM_TOCHAR_CASE(ABORT) - ENUM_TOCHAR_CASE(COMMIT) - } - return NULL; -} - - -static void -CheckFirstPhase(txn_info *txn) -{ -// int ret; - Oid orignode = txn->origcoord; - uint32 startxid = txn->startxid; -// uint32 transactionid; - int nodeidx; - - /* - * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. - */ - if (InvalidOid == orignode) - { - return; - } - nodeidx = find_node_index(orignode); - if (0 == txn->xid[nodeidx]) - { - txn->xid[nodeidx] = startxid; - } - /* start node participate */ - if (txn->isorigcoord_part) - { - if (0 == txn->coordparts[nodeidx]) - { - txn->coordparts[nodeidx] = 1; - txn->num_coordparts++; - } - if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) - { - /*select * from pgxc_is_committed...*/ - getTxnStatus(txn, nodeidx); - } - if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) - { - txn->after_first_phase = true; - } - } - /* start node node participate */ - else - { -#if 0 - ret = Get2PCFile(orignode, txn->gid, &transactionid); - if (ret == FILENOTFOUND) - txn->after_first_phase = false; - else if (ret == FILEUNKOWN) - txn->global_txn_stat = TXN_STATUS_UNKNOWN; - else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) - txn->after_first_phase = true; -#endif - if (txn->global_commit_timestamp != InvalidGlobalTimestamp) - { - txn->after_first_phase = true; - } else { - txn->after_first_phase = false; - } - } -} - -void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) -{ - int dn_index = 0; - int cn_index = 0; - int nodeIndex; - char nodetype; - List *coordlist = NIL; - List *nodelist = NIL; - - while (dn_index < dn_nodes_num) - { - - /* Get node type and index */ - nodetype = PGXC_NODE_NONE; - if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) - { - dn_index++; - continue; - } - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); - if (nodetype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("PGXC Node %s: object not defined", - get_pgxc_nodename(dn_node_list[dn_index])))); - - /* Check if node is requested is the self-node or not */ - if (nodetype == PGXC_NODE_DATANODE) - { - nodelist = lappend_int(nodelist, nodeIndex); - } - dn_index++; - - } - - while (cn_index < cn_nodes_num) - { - /* Get node type and index */ - nodetype = PGXC_NODE_NONE; - if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) - { - cn_index++; - continue; - } - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); - if (nodetype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("PGXC Node %s: object not defined", - get_pgxc_nodename(cn_node_list[cn_index])))); - - /* Check if node is requested is the self-node or not */ - if (nodetype == PGXC_NODE_COORDINATOR) - { - coordlist = lappend_int(coordlist, nodeIndex); - } - cn_index++; - } - *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); -} - -void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) -{ - char nodetype = PGXC_NODE_NONE; - int nodeIndex; - List *coordlist = NIL; - List *nodelist = NIL; - - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); - if (nodetype == PGXC_NODE_COORDINATOR) - { - coordlist = lappend_int(coordlist, nodeIndex); - } - 
else - { - nodelist = lappend_int(nodelist, nodeIndex); - } - *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); -} - diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 26d364a5..2cf920b9 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -333,7 +333,7 @@ ERROR: cannot use constant expression as partition key DROP FUNCTION const_func(); -- only accept valid partitioning strategy CREATE TABLE partitioned ( - a int + a int ) PARTITION BY MAGIC (a); ERROR: unrecognized partitioning strategy "magic" -- specified column must be present in the table @@ -427,10 +427,10 @@ Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") Number of partitions: 0 \d+ partitioned2 - Table "public.partitioned2" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description + Table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description --------+---------+-----------+----------+---------+----------+--------------+------------- - a | integer | | | | plain | | + a | integer | | | | plain | | b | text | | | | extended | | Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 251ee257..f845d5fc 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -876,15 +876,13 @@ update mlparted_tab mlp set c = 'xxx' from (select a from some_tab union all select a+1 from some_tab) ss (a) where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; - relname | a | b | c ----------------------+---+---+--- + relname | a | b | c +---------------------+---+---+----- mlparted_tab_part1 | 1 | a | mlparted_tab_part2a | 2 | a | - mlparted_tab_part2b | 2 | b | - mlparted_tab_part3 | 3 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx (4 rows) drop table mlparted_tab; @@ -1022,18 +1020,18 @@ select NULL::derived::base; -- remove redundant conversions. 
explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------------------------- Remote Fast Query Execution Output: ((ROW(more_derived.i, more_derived.b)::more_derived)::derived)::base Node/s: datanode_1, datanode_2 Remote query: SELECT ((ROW(i, b)::more_derived)::derived)::base AS "row" FROM more_derived -> Seq Scan on public.more_derived - Output: (ROW(i, b)::more_derived)::base + Output: (ROW(i, b)::more_derived)::base (6 rows) explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; - QUERY PLAN + QUERY PLAN ------------------------------------------- Result Output: (ROW(1, 2)::more_derived)::base @@ -2199,7 +2197,7 @@ create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20 create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def - QUERY PLAN + QUERY PLAN --------------------------------------- Remote Fast Query Execution Node/s: datanode_2 @@ -2237,7 +2235,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scan (9 rows) explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions - QUERY PLAN + QUERY PLAN --------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 7a70d26a..5b7dfb96 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3844,8 +3844,8 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit - Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 -> Materialize Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3875,7 +3875,7 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) @@ -3889,10 +3889,10 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Nested Loop - Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) - -> Limit - Output: (i8.q1), t2.f1 + -> Nested Loop + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + -> Limit + Output: (i8.q1), t2.f1 -> Materialize Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3901,16 +3901,16 @@ where t1.f1 = ss2.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) -> Materialize Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 - Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: (i8.q1), t2.f1 + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 (34 rows) select * from @@ -3962,11 +3962,11 @@ where tt1.f1 = ss1.c0; -> Seq 
Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Subquery Scan on ss1 - Output: ss1.c0 - Filter: (ss1.c0 = 'foo'::text) - -> Limit - Output: (tt4.f1) + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) -> Materialize Output: (tt4.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -4026,8 +4026,8 @@ where ss1.c2 = 0; Output: i42.f1 -> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Limit - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Materialize Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -4065,13 +4065,13 @@ select * from Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Hash Full Join - Hash Cond: (a1.unique1 = (1)) + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 a1 - -> Hash - -> Result + -> Hash + -> Result (10 rows) select * from @@ -4614,8 +4614,8 @@ select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; ------------------+-------------------+------------------- 123 | 456 | 456 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 4567890123456789 | -4567890123456789 | -4567890123456789 (5 rows) @@ -4624,8 +4624,8 @@ select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from i ------------------+-------------------+------------------ 123 | 456 | 123 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 4567890123456789 | -4567890123456789 | 4567890123456789 (5 rows) @@ -4929,13 +4929,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4946,13 +4946,13 @@ select * from 
------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4965,11 +4965,11 @@ select x.* from 123 | 4567890123456789 123 | 4567890123456789 123 | 4567890123456789 - 4567890123456789 | 123 - 4567890123456789 | 123 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 4567890123456789 | -4567890123456789 (10 rows) @@ -5086,14 +5086,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5107,14 +5107,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 
4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5284,16 +5284,6 @@ select * from 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 @@ -5309,6 +5299,16 @@ select * from 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (42 rows) @@ -5523,8 +5523,8 @@ lateral (select * from int8_tbl t1, where t1.q1 = ss.q2) ss0; id | q1 | q2 | q1 | q2 ----+------------------+-------------------+------------------+------------------ - 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 (3 rows) @@ -5647,6 +5647,73 @@ SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = 3000 (1 row) +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. 
+-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.b, LEAST(t1.a, a, t3.a), t1.a + Sort Key: t1.a + -> Sort + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Materialize + Output: a, (LEAST(t1.a, a, t3.a)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a, LEAST(t1.a, a, t3.a) + Distribute results by H: a + -> Nested Loop + Output: a, LEAST(t1.a, a, t3.a) + Join Filter: (a = t3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.b, t3.a + Distribute results by H: b + -> Seq Scan on public.ut1 t3 + Output: t3.b, t3.a + -> Materialize + Output: a + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a + Distribute results by H: a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(35 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- diff --git a/src/test/regress/expected/limit.out b/src/test/regress/expected/limit.out index 68ff5e10..61a3f53e 100644 --- a/src/test/regress/expected/limit.out +++ b/src/test/regress/expected/limit.out @@ -523,7 +523,7 @@ select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: thousand, PARTIAL sum(tenthous) Distribute results by H: thousand - -> Partial GroupAggregate + -> Partial HashAggregate Output: thousand, PARTIAL sum(tenthous) Group Key: tenk1.thousand -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index 8a414251..2ae2b8a2 100644 --- a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -187,25 +187,25 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) -- Join with pruned partitions from joining relations EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) 
-> Sort Sort Key: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: ((a < 450) AND (b = 0)) - -> Seq Scan on prt1_p2 t1_1 - Filter: ((a < 450) AND (b = 0)) + -> Hash Join + Hash Cond: (t2.b = a) -> Append - -> Index Scan using iprt2_p2_b on prt2_p2 t2 - Index Cond: ((b = a) AND (b > 250)) - -> Bitmap Heap Scan on prt2_p3 t2_1 - Recheck Cond: ((b = a) AND (b > 250)) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: ((b = a) AND (b > 250)) + -> Seq Scan on prt2_p2 t2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 t2_1 + Filter: (b > 250) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 + Filter: ((a < 450) AND (b = 0)) (17 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; @@ -304,10 +304,11 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: t1.a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) + -> Merge Join + Merge Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_2) + -> Sort + Sort Key: b -> HashAggregate Group Key: b -> Remote Subquery Scan on all (datanode_2) @@ -321,15 +322,14 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) Filter: (a = 0) -> Seq Scan on prt2_p3 t2_2 Filter: (a = 0) + -> Sort + Sort Key: t1.a -> Append - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = b) + -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = b) + -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = b) + -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) (28 rows) @@ -378,24 +378,28 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL QUERY PLAN -------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather Merge - Workers Planned: 1 - -> Sort - Sort Key: a - -> Parallel Nested Loop Left Join - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Parallel Append - -> Parallel Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop + -> Hash Join + Hash Cond: (t3.b = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Append -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 @@ -404,16 +408,7 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL Index Cond: (a = a) -> Index Only 
Scan using iprt1_p3_a on prt1_p3 t2_2 Index Cond: (a = a) - -> Append - -> Index Scan using iprt2_p1_b on prt2_p1 t3 - Index Cond: (b = a) - -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 - Index Cond: (b = a) - -> Bitmap Heap Scan on prt2_p3 t3_2 - Recheck Cond: (b = a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = a) -(36 rows) +(31 rows) SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss @@ -438,8 +433,8 @@ EXPLAIN (COSTS OFF) SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a @@ -456,17 +451,17 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Hash Join - Hash Cond: (t2.a = b) - -> Append - -> Seq Scan on prt1_p1 t2 - -> Seq Scan on prt1_p2 t2_1 - -> Seq Scan on prt1_p3 t2_2 + Hash Cond: (b = t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t3 - -> Seq Scan on prt2_p2 t3_1 - -> Seq Scan on prt2_p3 t3_2 + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 (27 rows) SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL @@ -748,8 +743,8 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join Merge Cond: (a = b) @@ -774,23 +769,22 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER Distribute results by H: b -> HashAggregate Group Key: b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = b) -> Append - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 - Index Cond: (((a + b) / 2) = b) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 - Index Cond: (((a + b) / 2) = b) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 - Index Cond: (((a + b) / 2) = b) -(40 rows) + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) +(39 rows) SELECT t1.* FROM prt1 t1 WHERE 
t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1171,26 +1165,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Hash Join - Hash Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t3 - -> Seq Scan on prt1_p2 t3_1 - -> Seq Scan on prt1_p3 t3_2 + Hash Cond: (b = a) + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false -> Hash - -> Hash Left Join - Hash Cond: (b = a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t2 - -> Seq Scan on prt2_p2 t2_1 - -> Seq Scan on prt2_p3 t2_2 - -> Hash - -> Result - One-Time Filter: false + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 (18 rows) EXPLAIN (COSTS OFF) @@ -1507,50 +1501,48 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; - QUERY PLAN -------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather Merge - Workers Planned: 1 - -> Sort - Sort Key: a - -> Parallel Nested Loop Left Join - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Parallel Append - -> Parallel Seq Scan on prt1_l_p1 t1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p3_p1 t1_3 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Join - Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_l_p1 t3 - -> Seq Scan on prt2_l_p2_p1 t3_1 - -> Seq Scan on prt2_l_p2_p2 t3_2 - -> Seq Scan on prt2_l_p3_p1 t3_3 - -> Seq Scan on prt2_l_p3_p2 t3_4 - -> Hash - -> Append - -> Seq Scan on prt1_l_p1 t2 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p2_p1 t2_1 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p2_p2 t2_2 
- Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p3_p1 t2_3 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p3_p2 t2_4 - Filter: ((a = a) AND ((c)::text = (c)::text)) -(41 rows) + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(39 rows) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss @@ -1622,43 +1614,44 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; ----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t1.a = t2.a) + Hash Cond: (t2.a = t1.a) -> Append - -> Seq Scan on prt1_p1 t1 - -> Seq Scan on prt1_p2 t1_1 - -> Seq Scan on prt1_p3 t1_2 + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 -> Hash -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 (12 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t2.a = t1.a) - -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 - -> Hash - -> Hash Join - Hash Cond: (t1.a = b) + Hash Cond: (t1.a = b) + -> Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash -> Append -> Seq Scan on prt1_p1 t1 -> Seq Scan on prt1_p2 t1_1 -> Seq Scan on prt1_p3 t1_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t3 - -> Seq Scan on prt2_p2 t3_1 - -> Seq Scan on prt2_p3 t3_2 -(20 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(21 rows) -- partition-wise join can not be applied if there are no equi-join conditions -- between partition keys @@ -1710,41 +1703,41 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2. 
-- partition-wise join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Left Join - Hash Cond: (t1.a = b) - -> Append - -> Seq Scan on prt1_m_p1 t1 - -> Seq Scan on prt1_m_p2 t1_1 - -> Seq Scan on prt1_m_p3 t1_2 + -> Hash Right Join + Hash Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_m_p1 t2 - -> Seq Scan on prt2_m_p2 t2_1 - -> Seq Scan on prt2_m_p3 t2_2 + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 (13 rows) -- equi-join between non-key columns does not qualify for partition-wise join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Left Join - Hash Cond: (t1.c = c) - -> Append - -> Seq Scan on prt1_m_p1 t1 - -> Seq Scan on prt1_m_p2 t1_1 - -> Seq Scan on prt1_m_p3 t1_2 + -> Hash Right Join + Hash Cond: (c = t1.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_m_p1 t2 - -> Seq Scan on prt2_m_p2 t2_1 - -> Seq Scan on prt2_m_p3 t2_2 + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 (13 rows) -- partition-wise join can not be applied between tables with different diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 1e0441a4..770c320f 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2074,7 +2074,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET ROLE regress_rls_group1; @@ -2125,7 +2125,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET SESSION AUTHORIZATION regress_rls_carol; @@ -2176,7 +2176,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET ROLE regress_rls_group2; @@ -2227,7 +2227,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) -- diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 20bce908..8bea6498 100644 --- a/src/test/regress/expected/sanity_check.out +++ 
b/src/test/regress/expected/sanity_check.out @@ -39,6 +39,8 @@ date_tbl|f default_tbl|f defaultexpr_tbl|f dept|f +donothingbrtrig_test1|f +donothingbrtrig_test2|f dupindexcols|t e_star|f emp|f diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 93228c2e..f3b81ec8 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -81,34 +81,34 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; - QUERY PLAN ------------------------------------------------------------ - Finalize GroupAggregate - Group Key: stringu1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: stringu1 - -> Partial HashAggregate - Group Key: stringu1 - -> Gather - Workers Planned: 4 + QUERY PLAN +----------------------------------------------------------------- + Sort + Sort Key: stringu1 + -> Finalize HashAggregate + Group Key: stringu1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: stringu1 -> Parallel Seq Scan on tenk1 (10 rows) explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: (count(stringu1)) -> Finalize HashAggregate Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Partial HashAggregate - Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) - -> Gather - Workers Planned: 4 + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Parallel Seq Scan on tenk1 (10 rows) @@ -378,8 +378,10 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; (1 row) -- provoke error in worker +SAVEPOINT settings; select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +ROLLBACK TO SAVEPOINT settings; -- test interaction with set-returning functions SAVEPOINT settings; -- multiple subqueries under a single Gather node @@ -389,16 +391,17 @@ EXPLAIN (COSTS OFF) SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 UNION ALL SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; - QUERY PLAN ----------------------------------------------------- - Gather - Workers Planned: 4 - -> Parallel Append - -> Parallel Seq Scan on tenk1 - Filter: (fivethous = (tenthous + 1)) - -> Parallel Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = (tenthous + 1)) -(7 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(8 rows) ROLLBACK TO SAVEPOINT settings; -- can't use multiple subqueries under a single Gather node due to initPlans @@ -409,34 +412,33 @@ UNION ALL SELECT unique1 
FROM tenk1 WHERE fivethous = (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) ORDER BY 1; - QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: tenk1.unique1 - -> Append - -> Gather - Workers Planned: 4 - Params Evaluated: $1 - InitPlan 1 (returns $1) - -> Limit - -> Gather - Workers Planned: 4 - -> Parallel Seq Scan on tenk1 tenk1_2 - Filter: (fivethous = 1) - -> Parallel Seq Scan on tenk1 + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: tenk1.unique1 + -> Append + -> Seq Scan on tenk1 + Filter: (fivethous = $0) + InitPlan 1 (returns $0) + -> Limit + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Seq Scan on tenk1 tenk1_1 Filter: (fivethous = $1) - -> Gather - Workers Planned: 4 - Params Evaluated: $3 - InitPlan 2 (returns $3) - -> Limit - -> Gather - Workers Planned: 4 - -> Parallel Seq Scan on tenk1 tenk1_3 - Filter: (fivethous = 1) - -> Parallel Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = $3) -(25 rows) + InitPlan 2 (returns $1) + -> Limit + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) +(24 rows) -- test interaction with SRFs SELECT * FROM information_schema.foreign_data_wrapper_options diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index e058f176..16b06053 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -660,7 +660,7 @@ EXPLAIN QUERY PLAN ------------------------------------------------------------------------------------------------- Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=0) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -680,9 +680,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) + -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -698,9 +698,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.53..177.54 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) 
(cost=177.51..177.53 rows=1 width=0) + -> Partial Aggregate (cost=77.51..77.52 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -722,9 +722,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) + -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 8f30b1c9..c68869c3 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2043,8 +2043,8 @@ select * from x; Output: x_1.a -> CTE Scan on z Output: z.a - -> CTE Scan on z z1 - Output: z1.a + -> CTE Scan on z z1 + Output: z1.a (18 rows) with recursive x(a) as diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ca0d242d..c3f67c12 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -91,7 +91,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_crypt_parellel_debug | off enable_data_mask | on enable_datanode_row_triggers | off - enable_distinct_optimizer | on + enable_distinct_optimizer | on enable_distri_debug | off enable_distri_debug_print | off enable_distri_visibility_print | off @@ -136,7 +136,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(63 rows) +(64 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 2766f3223d1b6e2ac104837026671a5b617a298b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 23 Dec 2021 19:14:34 +0800 Subject: [PATCH 320/578] add contrib pg_clean pg_unlock tbase_subscription --- contrib/Makefile | 3 + contrib/pg_clean/Makefile | 18 + contrib/pg_clean/pg_clean--1.0.sql | 106 + .../pg_clean/pg_clean--unpackaged--1.0.sql | 19 + contrib/pg_clean/pg_clean.c | 3311 +++++++++++++++++ contrib/pg_clean/pg_clean.control | 5 + contrib/pg_clean/test.sh | 171 + contrib/pg_unlock/Makefile | 18 + contrib/pg_unlock/pg_unlock--1.0.sql | 56 + .../pg_unlock/pg_unlock--unpackaged--1.0.sql | 10 + contrib/pg_unlock/pg_unlock.c | 2349 ++++++++++++ contrib/pg_unlock/pg_unlock.control | 5 + contrib/tbase_subscription/Makefile | 19 + .../tbase_subscription--1.0.sql | 36 + .../tbase_subscription--unpackaged--1.0.sql | 4 + .../tbase_subscription/tbase_subscription.c | 26 + .../tbase_subscription.control | 5 + 17 files changed, 6161 insertions(+) create mode 100644 contrib/pg_clean/Makefile create mode 100644 contrib/pg_clean/pg_clean--1.0.sql create mode 100644 contrib/pg_clean/pg_clean--unpackaged--1.0.sql create mode 100644 contrib/pg_clean/pg_clean.c create mode 100644 contrib/pg_clean/pg_clean.control create mode 100644 contrib/pg_clean/test.sh create mode 100644 contrib/pg_unlock/Makefile create mode 100644 contrib/pg_unlock/pg_unlock--1.0.sql create mode 100644 contrib/pg_unlock/pg_unlock--unpackaged--1.0.sql create mode 100644 contrib/pg_unlock/pg_unlock.c create mode 100644 contrib/pg_unlock/pg_unlock.control create mode 100644 contrib/tbase_subscription/Makefile create mode 100644 contrib/tbase_subscription/tbase_subscription--1.0.sql create mode 100644 contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql create mode 100644 contrib/tbase_subscription/tbase_subscription.c create mode 100644 contrib/tbase_subscription/tbase_subscription.control diff --git a/contrib/Makefile b/contrib/Makefile index 43e984e3..14745884 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -39,6 +39,8 @@ SUBDIRS = \ pgrowlocks \ pgstattuple \ pgxc_clean \ + pg_clean \ + pg_unlock \ pgxc_ctl \ pgxc_monitor \ pg_visibility \ @@ -55,6 +57,7 @@ SUBDIRS = \ unaccent \ vacuumlo \ stormstats \ + tbase_subscription \ tbase_pooler_stat \ pg_stat_cluster_activity diff --git a/contrib/pg_clean/Makefile b/contrib/pg_clean/Makefile new file mode 100644 index 00000000..9913e074 --- /dev/null +++ b/contrib/pg_clean/Makefile @@ -0,0 +1,18 @@ +# contrib/pg_clean/Makefile + +MODULE_big = pg_clean +OBJS = pg_clean.o + +EXTENSION = pg_clean +DATA = pg_clean--1.0.sql pg_clean--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_clean +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif \ No newline at end of file diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql new file mode 100644 index 00000000..e5bbc9ca --- /dev/null +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -0,0 +1,106 @@ +/* contrib/pg_clean/pg_clean--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_clean" to load this file. \quit + +-- Register functions. 
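For orientation, the functions registered below fall into two groups. The first group are the cleanup entry points: pg_clean_check_txn(time_interval) only reports in-doubt two-phase transactions together with their per-node status, pg_clean_execute(time_interval) also resolves them, and pg_clean_execute_on_node(abnormal_nodename, abnormal_time) restricts resolution to transactions that involve one node and predate the given timestamp. A minimal, illustrative coordinator session might look like the sketch below; the interval is the 120-second default from the signatures that follow, and the node name and timestamp are placeholders, not values from this patch.

CREATE EXTENSION pg_clean;
-- Report in-doubt 2PC transactions older than 120 seconds, with their
-- status on every node (gid, database, global status, per-node status).
SELECT * FROM pg_clean_check_txn(120);
-- Resolve them: each row reports the gid, its global status, the
-- operation chosen (commit or abort), and whether that operation succeeded.
SELECT * FROM pg_clean_execute(120);
-- Resolve only transactions involving one node that predate a given
-- TimestampTz value (both arguments here are placeholders).
SELECT * FROM pg_clean_execute_on_node('datanode_1', 694224000000000);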
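The remaining pgxc_* functions registered below are lower-level helpers over the per-node 2PC records: the pgxc_get_2pc_* family reads individual fields of a recorded transaction (participant nodes, start node, start xid, commit timestamp, xid, record file), pgxc_get_record_list, pgxc_remove_2pc_records and pgxc_clear_2pc_records manage the records themselves, and pgxc_commit_on_node/pgxc_abort_on_node finish a single prepared transaction on one node. A sketch of manual inspection and resolution, with a placeholder gid and node name:

-- List the 2PC records kept on this node.
SELECT pgxc_get_record_list();
-- Inspect one recorded transaction (the gid here is a placeholder).
SELECT pgxc_get_2pc_startnode('gid_to_inspect');
SELECT pgxc_get_2pc_nodes('gid_to_inspect');
SELECT pgxc_get_2pc_commit_timestamp('gid_to_inspect');
-- Manually finish it on a single node, then drop its record.
SELECT pgxc_commit_on_node('datanode_1', 'gid_to_inspect');
SELECT pgxc_remove_2pc_records('gid_to_inspect');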
+CREATE FUNCTION pg_clean_execute(IN time_interval integer DEFAULT 120, + OUT gid text, + OUT global_transaction_status text, + OUT operation text, + OUT operation_status text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION pg_clean_execute_on_node(IN abnormal_nodename text, IN abnormal_time bigint, + OUT gid text, + OUT global_transaction_status text, + OUT operation text, + OUT operation_status text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + + +CREATE FUNCTION pg_clean_check_txn(IN time_interval integer DEFAULT 120, + OUT gid text, + OUT database text, + OUT global_transaction_status text, + OUT transaction_status_on_allnodes text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION pgxc_get_2pc_nodes(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_startnode(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_startxid(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_xid(gid text) +RETURNS integer +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_file(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_remove_2pc_records(gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_clear_2pc_records() +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_record_list() +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_commit_on_node(nodename text, gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_abort_on_node(nodename text, gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +GRANT ALL ON FUNCTION pg_clean_execute(time_interval integer) TO PUBLIC; +GRANT ALL ON FUNCTION pg_clean_execute_on_node(abnormal_nodename text, abnormal_time bigint) TO PUBLIC; +GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_remove_2pc_records(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_clear_2pc_records() TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_record_list() TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_commit_on_node(nodename text, gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_abort_on_node(nodename text, gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql new file mode 100644 index 00000000..a6a67659 --- /dev/null +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -0,0 +1,19 @@ +/* contrib/pg_clean/pg_clean--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_clean" to load this file. 
\quit + +ALTER EXTENSION pg_clean ADD function pg_clean_execute(time_interval integer); +ALTER EXTENSION pg_clean ADD function pg_clean_execute_on_node(abnormal_nodename text, abnormal_time bigint); +ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_remove_2pc_records(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_clear_2pc_records(); +ALTER EXTENSION pg_clean ADD function pgxc_get_record_list(); +ALTER EXTENSION pg_clean ADD function pgxc_commit_on_node(nodename text, gid text); +ALTER EXTENSION pg_clean ADD function pgxc_abort_on_node(nodename text, gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c new file mode 100644 index 00000000..4dc898ff --- /dev/null +++ b/contrib/pg_clean/pg_clean.c @@ -0,0 +1,3311 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" + +#include +#include +#include +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" + +#include "access/gtm.h" +#include "datatype/timestamp.h" +#include "access/xact.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "utils/timestamp.h" +#include "catalog/pg_control.h" +#include "commands/dbcommands.h" + +#include "utils/memutils.h" +#include "nodes/memnodes.h" + +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +#include "storage/fd.h" +#include "pgstat.h" +#include "access/xact.h" +#include "access/twophase.h" +#include "access/hash.h" + +/*hash_create hash_search*/ +#include "utils/hsearch.h" + +#define TWOPHASE_RECORD_DIR "pg_2pc" +int transaction_threshold = 200000; +#define MAXIMUM_CLEAR_FILE 10000 +#define MAXIMUM_OUTPUT_FILE 1000 +#define XIDPREFIX "_$XC$" +#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ +#endif +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; + + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define GET_START_XID "startxid:" +#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" +#define GET_START_NODE "startnode:" +#define GET_NODE "nodes:" +#define GET_XID "\nxid:" +#define GET_READONLY "readonly" +#define GIDSIZE (200 + 24) +#define MAX_TWOPC_TXN 1000 + +#define XIDFOUND 1 +#define XIDNOTFOUND -1 +#define XIDEXECFAIL -2 + +#define 
FILEFOUND 1 +#define FILEUNKOWN -1 +#define FILENOTFOUND -2 + +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define ENUM_TOCHAR_CASE(x) case x: return(#x); + +/*data structures*/ +typedef enum TXN_STATUS +{ + TXN_STATUS_INITIAL = 0, /* Initial */ + TXN_STATUS_PREPARED, + TXN_STATUS_COMMITTED, + TXN_STATUS_ABORTED, + TXN_STATUS_INPROGRESS, + TXN_STATUS_FAILED, /* Error detected while interacting with the node */ + TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ +} TXN_STATUS; + + +typedef enum +{ + UNDO = 0, + ABORT, + COMMIT +} OPERATION; + +typedef enum +{ + TWOPHASE_FILE_EXISTS = 0, + TWOPHASE_FILE_NOT_EXISTS, + TWOPHASE_FILE_OLD, + TWOPHASE_FILE_ERROR +}TWOPHASE_FILE_STATUS; + +typedef struct txn_info +{ + char gid[MAX_GID]; + uint32 *xid; /* xid used in prepare */ + TimestampTz *prepare_timestamp; + char *owner; + char *participants; + Oid origcoord; /* Original coordinator who initiated the txn */ + bool after_first_phase; + uint32 startxid; /* xid in Original coordinator */ + bool isorigcoord_part; /* Is original coordinator a + participant? */ + int num_dnparts; /* Number of participant datanodes */ + int num_coordparts; /* Number of participant coordinators */ + int *dnparts; /* Whether a node was participant in the txn */ + int *coordparts; + TXN_STATUS *txn_stat; /* Array for each nodes */ + char *msg; /* Notice message for this txn. 
*/ + GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + + TXN_STATUS global_txn_stat; + OPERATION op; + bool op_issuccess; + bool is_readonly; + bool belong_abnormal_node; +}txn_info; + +typedef struct database_info +{ + struct database_info *next; + char *database_name; + + HTAB *all_txn_info; +#if 0 + txn_info *head_txn_info; + txn_info *last_txn_info; +#endif +} database_info; + +typedef struct +{ + int index; + txn_info **txn; + int txn_count; + int txn_size; + MemoryContext mycontext; +} print_txn_info; + +typedef struct +{ + int index; + int count; + char **gid; + int gid_count; + int gid_size; + char **database; + int database_count; + int database_size; + char **global_status; + int global_status_count; + int global_status_size; + char **status; + int status_count; + int status_size; + MemoryContext mycontext; +} print_status; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +/*global variable*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num = 0; +static int dn_nodes_num = 0; +static int pgxc_clean_node_count = 0; +static Oid my_nodeoid; +static +database_info *head_database_info = NULL; +static +database_info *last_database_info = NULL; +bool execute = false; +int total_twopc_txn = 0; + +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +char *abnormal_nodename = NULL; +Oid abnormal_nodeoid = InvalidOid; +bool clear_2pc_belong_node = false; + + +/*function list*/ + /*plugin entry function*/ + +static bool check_node_health(Oid node_oid); +static Datum + execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void DestroyTxnHash(void); +static void ResetGlobalVariables(void); + +static Oid + getMyNodeoid(void); +static void + getDatabaseList(void); +static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * +Slots); +static void + getTxnInfoOnNodesAll(void); +void getTxnInfoOnNode(Oid node); +void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, + TimestampTz prepared_time, TXN_STATUS status); +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); +static txn_info * + find_txn(char *gid); +txn_info* + make_txn_info(char * dbname, char * gid, char * owner); +database_info* + find_database_info(char *database_name); +database_info* + add_database_info(char *database_name); +int find_node_index(Oid node_oid); +Oid find_node_oid(int node_idx); +void getTxnInfoOnOtherNodesAll(void); +void getTxnInfoOnOtherNodesForDatabase(database_info *database); +void getTxnInfoOnOtherNodes(txn_info *txn); +int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); + +void getTxnStatus(txn_info * txn, int node_idx); +void recover2PCForDatabaseAll(void); +void recover2PCForDatabase(database_info * db_info); +#if 0 +static bool + setMaintenanceMode(bool status); +#endif +bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); +bool check_2pc_belong_node(txn_info * txn); +bool check_node_participate(txn_info * txn, int node_idx); + +void recover2PC(txn_info * txn); +TXN_STATUS + 
check_txn_global_status(txn_info *txn); +bool clean_2PC_iscommit(txn_info *txn, bool iscommit); +bool clean_2PC_files(txn_info *txn); +void Init_print_txn_info(print_txn_info *print_txn); +void Init_print_stats_all(print_status *pstatus); +void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); +static const char * + txn_status_to_string(TXN_STATUS status); +static const char * + txn_op_to_string(OPERATION op); +static void + CheckFirstPhase(txn_info *txn); +static void + get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); +static void + get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); + +Datum pg_clean_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute); +Datum pg_clean_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + + //MemoryContextDelete(print_txn->mycontext); + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +/* + * clear 2pc after oss detect abnormal node and restart it , + * only clear 2pc belong the abnormal node and before the abnormal time + */ +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clear_2pc_belong_node = true; + + abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); + if (InvalidOid == abnormal_nodeoid) + { + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + } + abnormal_time = PG_GETARG_INT64(1); + current_time = GetCurrentTimestamp(); + if (abnormal_time >= current_time) + { + elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DestroyTxnHash(); + pfree(abnormal_nodename); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + + +Datum pg_clean_check_txn(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_check_txn); +Datum pg_clean_check_txn(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_status *pstatus = NULL; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + execute = false; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + MemoryContext mycontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); + pstatus = (print_status *) funcctx->user_fctx; + pstatus->index = pstatus->count = 0; + pstatus->gid = NULL; + pstatus->global_status = pstatus->status = (char **)NULL; + pstatus->database = NULL; + pstatus->mycontext = NULL; + + + MemoryContextSwitchTo(oldcontext); + + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + Init_print_stats_all(pstatus); + + pstatus->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + pstatus = (print_status *) funcctx->user_fctx; + + if (pstatus->index < pstatus->count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); + values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); + values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); + values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + pstatus->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + /* + MemoryContextDelete(pstatus->mycontext); + DropDatabaseInfo(); + */ + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +void DestroyTxnHash(void) +{ + database_info *dbinfo = head_database_info; + while (dbinfo) + { + hash_destroy(dbinfo->all_txn_info); + dbinfo = dbinfo->next; + } +} + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + pgxc_clean_node_count = 0; + execute = false; + total_twopc_txn = 0; + + head_database_info = last_database_info = NULL; + + current_time = 0; + abnormal_time = InvalidGlobalTimestamp; + abnormal_nodename = NULL; + abnormal_nodeoid = InvalidOid; + clear_2pc_belong_node = false; + +} + +static Oid getMyNodeoid(void) +{ + return get_pgxc_nodeoid(PGXCNodeName); +} + +/* + * execute_query_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + int ii; + bool issuccess = false; + + /*check health of node*/ + bool ishealthy = check_node_health(node); + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype = PGXC_NODE_NONE; + + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char *)query; + plan->force_autocommit 
= false; + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + int i_tuple = 0; + int i_attnum = 0; + issuccess = true; + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + /*if (result->tts_values[i_attnum] != (Datum)0)*/ + if (result->tts_isnull[i_attnum] == false) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#endif + return issuccess == true ? (Datum) 1 : (Datum) 0; +} + +static bool check_node_health(Oid node_oid) +{ + int i; + bool ishealthy = false; + + PoolPingNodeRecheck(node_oid); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, + &cn_nodes_num, &dn_nodes_num, + cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node_oid) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node_oid) + { + ishealthy = dn_health_map[i]; + } + } + } + return ishealthy; +} + +static void getDatabaseList(void) +{ + int i; + TupleTableSlots result_db; + const char *query_db = "select datname::text from pg_database;"; + /*add datname into tail of head_database_info*/ + if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) + { + for (i = 0; i < result_db.slot_count; i++) + { + if (TTSgetvalue(&result_db, i, 0)) + { + add_database_info(TTSgetvalue(&result_db, i, 0)); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); + } + DropTupleTableSlots(&result_db); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * +Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +static void getTxnInfoOnNodesAll(void) +{ + int i; + current_time = GetCurrentTimestamp(); + /*upload 2PC transaction from CN*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + 
getTxnInfoOnNode(cn_node_list[i]); + } + + /*upload 2PC transaction from DN*/ + for (i = 0; i < dn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + getTxnInfoOnNode(dn_node_list[i]); + } +} + +void getTxnInfoOnNode(Oid node) +{ + int i; + TupleTableSlots result_txn; + Datum execute_res; + char query_execute[1024]; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; + snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); + + if (execute) + execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); + else + execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); + + if (execute_res == (Datum) 1) + { + for (i = 0; i < result_txn.slot_count; i++) + { + uint32 xid; + char* gid; + char* owner; + char* datname; + TimestampTz prepared_time; + + /*read results from each tuple*/ + xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); + gid = TTSgetvalue(&result_txn, i, 1); + owner = TTSgetvalue(&result_txn, i, 2); + datname = TTSgetvalue(&result_txn, i, 3); + prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + + /*add txn to database*/ + add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); + if (total_twopc_txn >= MAX_TWOPC_TXN) + { + break; + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); + } + DropTupleTableSlots(&result_txn); +} + +void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, + char * owner, TimestampTz prepared_time, TXN_STATUS status) +{ + txn_info *txn = NULL; + int nodeidx; + + if ((txn = find_txn(gid)) == NULL) + { + txn = make_txn_info(dbname, gid, owner); + total_twopc_txn++; + if (txn == NULL) + { + /*no more memory*/ + elog(ERROR, "there is no more memory for palloc a 2PC transaction"); + } + } + nodeidx = find_node_index(node_oid); + txn->txn_stat[nodeidx] = status; + txn->xid[nodeidx] = xid; + txn->prepare_timestamp[nodeidx] = prepared_time; + if (nodeidx < cn_nodes_num) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + else + { + txn->dnparts[nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + } + return; +} + +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) +{ + /*get all the participates and initiate to each transactions*/ + TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; + TupleTableSlots result; + char *partnodes = NULL; + char *startnode = NULL; + char *file_content = NULL; + uint32 startxid = 0; + char *str_startxid = NULL; + char *str_timestamp = NULL; + char *temp = NULL; + Oid temp_nodeoid; + char temp_nodetype; + int temp_nodeidx; + char stmt[1024]; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + + if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) + { + if (result.slot_count && TTSgetvalue(&result, 0, 0)) +#if 0 + TTSgetvalue(&result, 0, 0) && + TTSgetvalue(&result, 0, 1) && + TTSgetvalue(&result, 0, 2)) +#endif + { + file_content = TTSgetvalue(&result, 0, 
0); + + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) + { + txn->is_readonly = true; + txn->global_txn_stat = TXN_STATUS_COMMITTED; + DropTupleTableSlots(&result); + return TWOPHASE_FILE_EXISTS; + } + startnode = strstr(file_content, GET_START_NODE); + str_startxid = strstr(file_content, GET_START_XID); + partnodes = strstr(file_content, GET_NODE); + temp = strstr(file_content, GET_COMMIT_TIMESTAMP); + + /* get the last global_commit_timestamp */ + while (temp) + { + str_timestamp = temp; + temp += strlen(GET_COMMIT_TIMESTAMP); + temp = strstr(temp, GET_COMMIT_TIMESTAMP); + } + + if (startnode) + { + startnode += strlen(GET_START_NODE); + startnode = strtok(startnode, "\n"); + txn->origcoord = get_pgxc_nodeoid(startnode); + } + + if (str_startxid) + { + str_startxid += strlen(GET_START_XID); + str_startxid = strtok(str_startxid, "\n"); + startxid = strtoul(str_startxid, NULL, 10); + txn->startxid = startxid; + } + + if (partnodes) + { + partnodes += strlen(GET_NODE); + partnodes = strtok(partnodes, "\n"); + txn->participants = (char *) palloc0(strlen(partnodes) + 1); + strncpy(txn->participants, partnodes, strlen(partnodes) + 1); + } + + if (NULL == startnode || NULL == str_startxid) + { + res = TWOPHASE_FILE_OLD; + DropTupleTableSlots(&result); + return res; + } + + if (NULL == partnodes) + { + res = TWOPHASE_FILE_ERROR; + DropTupleTableSlots(&result); + return res; + } + + if (str_timestamp) + { + str_timestamp += strlen(GET_COMMIT_TIMESTAMP); + str_timestamp = strtok(str_timestamp, "\n"); + txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); + } + + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); + /* in explicit transaction startnode participate the transaction */ + if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) + { + txn->isorigcoord_part = true; + } + else + { + txn->isorigcoord_part = false; + } + + res = TWOPHASE_FILE_EXISTS; + txn->num_coordparts = 0; + txn->num_dnparts = 0; + temp = strtok(partnodes,", "); + while(temp) + { + /*check node type*/ + temp_nodeoid = get_pgxc_nodeoid(temp); + if (temp_nodeoid == InvalidOid) + { + res = TWOPHASE_FILE_ERROR; + break; + } + temp_nodetype = get_pgxc_nodetype(temp_nodeoid); + temp_nodeidx = find_node_index(temp_nodeoid); + + switch (temp_nodetype) + { + case 'C': + txn->coordparts[temp_nodeidx] = 1; + txn->num_coordparts++; + break; + case 'D': + txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + break; + default: + elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); + break; + } + temp = strtok(NULL,", "); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); + res = TWOPHASE_FILE_ERROR; + } + DropTupleTableSlots(&result); + return res; +} + +static txn_info *find_txn(char *gid) +{ + bool found; + database_info *cur_db; + txn_info *txn; + + for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) + { +#if 0 + for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + if (0 == strcmp(cur_txn->gid, gid)) + return cur_txn; + } +#endif + txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); + if (found) + return txn; + } + return NULL; +} + +txn_info* make_txn_info(char* dbname, char* gid, char* owner) +{ + bool found; + txn_info *txn_insert_pos = NULL; + database_info 
*dbinfo; + txn_info *txn; + + dbinfo = add_database_info(dbname); + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + return NULL; + //txn->next = NULL; + + //txn->gid = (char *)palloc0(strlen(gid)+1); + strncpy(txn->gid, gid, strlen(gid)+1); + txn->owner = (char *)palloc0(strlen(owner)+1); + strncpy(txn->owner, owner, strlen(owner)+1); + + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL + || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) + { + pfree(txn); + return(NULL); + } + + txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, + (void *)txn->gid, HASH_ENTER, &found); + if (!found) + memcpy(txn_insert_pos, txn, sizeof(txn_info)); + +#if 0 + if (dbinfo->head_txn_info == NULL) + { + dbinfo->head_txn_info = dbinfo->last_txn_info = txn; + } + else + { + dbinfo->last_txn_info->next = txn; + dbinfo->last_txn_info = txn; + } +#endif + + return txn_insert_pos; +} + +database_info *find_database_info(char *database_name) +{ + database_info *cur_database_info = head_database_info; + + for (;cur_database_info; cur_database_info = cur_database_info->next) + { + if(cur_database_info->database_name && + database_name && + strcmp(cur_database_info->database_name, database_name) == 0) + return(cur_database_info); + } + return(NULL); +} + +database_info *add_database_info(char *database_name) +{ + database_info *rv; + HASHCTL txn_ctl; + char tabname[MAX_GID]; + + if ((rv = find_database_info(database_name)) != NULL) + return rv; /* Already in the list */ + rv = (database_info *)palloc0(sizeof(database_info)); + if (rv == NULL) + return NULL; + rv->next = NULL; + rv->database_name = (char *)palloc0(strlen(database_name) + 1); + strncpy(rv->database_name, database_name, strlen(database_name) + 1); + if (rv->database_name == NULL) + { + pfree(rv); + return NULL; + } +#if 0 + rv->head_txn_info = NULL; + rv->last_txn_info = NULL; +#endif + + snprintf(tabname, 64, "%s txn info", rv->database_name); + txn_ctl.keysize = MAX_GID; + txn_ctl.entrysize = sizeof(txn_info); + rv->all_txn_info = hash_create(tabname, 64, + &txn_ctl, HASH_ELEM); + if (head_database_info == NULL) + { + head_database_info = last_database_info = rv; + return rv; + } + else + { + last_database_info->next = rv; + last_database_info = rv; + return rv; + } +} + +int find_node_index(Oid node_oid) +{ + int res; + int i; + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (node_oid == cn_node_list[i]) + { + res = i; + break; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (node_oid == dn_node_list[i]) + { + res = i+cn_nodes_num; + break; + } + } + } + return res; +} + +Oid find_node_oid(int node_idx) +{ + return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : + dn_node_list[node_idx-cn_nodes_num]; +} + +void getTxnInfoOnOtherNodesAll(void) +{ + database_info *cur_database; + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + getTxnInfoOnOtherNodesForDatabase(cur_database); + } +} + +void getTxnInfoOnOtherNodesForDatabase(database_info *database) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = database->all_txn_info; + hash_seq_init(&status, txn); + + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#if 0 + for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#endif +} + +void getTxnInfoOnOtherNodes(txn_info *txn) +{ + int ii; + int ret; + char node_type; + TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; + Oid node_oid; + uint32 transactionid = 0; + char gid[MAX_GID]; + char *ptr = NULL; + + if (IsXidImplicit(txn->gid)) + { + strncpy(gid, txn->gid, strlen(txn->gid)+1); + ptr = strtok(gid, ":"); + ptr = strtok(NULL, ":"); + node_oid = get_pgxc_nodeoid(ptr); + status = GetTransactionPartNodes(txn, node_oid); + } + else + { + for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) + { + if (ii < cn_nodes_num) + { + status = GetTransactionPartNodes(txn, cn_node_list[ii]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = cn_node_list[ii]; + break; + } + } + else + { + status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = dn_node_list[ii - cn_nodes_num]; + break; + } + } + } + + /* since there may be explicit readonly twophase transactions */ + if (txn->is_readonly) + { + return; + } + if (TWOPHASE_FILE_EXISTS == status && + InvalidGlobalTimestamp == txn->global_commit_timestamp && + node_oid != txn->origcoord) + { + status = GetTransactionPartNodes(txn, txn->origcoord); + } + + } + + if (TWOPHASE_FILE_EXISTS != status) + { + /* + * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, + * + */ + txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
+ TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; + return; + } + + + /* judge the range of global status */ + CheckFirstPhase(txn); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + { + /*check node ii is 'C' or 'D'*/ + node_oid = find_node_oid(ii); + if (node_oid == txn->origcoord) + continue; + node_type = get_pgxc_nodetype(node_oid); + if (node_type == 'C' && txn->coordparts[ii] != 1) + continue; + if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) + continue; + /*check coordparts or dnparts*/ + if (txn->xid[ii] == 0) + { + ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); + if (ret == XIDFOUND) + { + txn->xid[ii] = transactionid; + if (txn->xid[ii] > 0) + getTxnStatus(txn, ii); + } + else if (ret == XIDNOTFOUND) + { + if (txn->after_first_phase) + txn->txn_stat[ii] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; + + } + } + } +} + +/*get xid by gid on node_oid*/ +int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) +{ + int ret = XIDFOUND; + TupleTableSlots result; + uint32 xid = 0; + static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (TTSgetvalue(&result, 0, 0)) + { + xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); + *transactionid = xid; + if (xid == 0) + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDEXECFAIL; + DropTupleTableSlots(&result); + return ret; +} + +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) +{ + int ret = FILEFOUND; + TupleTableSlots result; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (!TTSgetvalue(&result, 0, 0)) + { + ret = FILENOTFOUND; + } + else + { + ret = FILEFOUND; + } + } + else + ret = FILENOTFOUND; + } + else + ret = FILEUNKOWN; + DropTupleTableSlots(&result); + return ret; +} + + +void getTxnStatus(txn_info *txn, int node_idx) +{ + Oid node_oid; + char stmt[1024]; + char *att1; + TupleTableSlots result; + + static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + + node_oid = find_node_oid(node_idx); + if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) + { + att1 = TTSgetvalue(&result, 0, 0); + + if (att1) + { + if (strcmp(att1, "true") == 0) + { + txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; + } + else + { + txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; + } + } + else + txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; + DropTupleTableSlots(&result); +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + 
(errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + if (0 == fileSize) + { + PG_RETURN_NULL(); + } + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + t_result = cstring_to_text(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *nodename; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + nodename = strstr(result, GET_NODE); + if (nodename) + { + nodename += strlen(GET_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *nodename; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + nodename = strstr(result, GET_START_NODE); + if (nodename) + { + nodename += strlen(GET_START_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); +Datum 
pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *startxid; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + startxid = strstr(result, GET_START_XID); + if (startxid) + { + startxid += strlen(GET_START_XID); + startxid = strtok(startxid, "\n"); + t_result = cstring_to_text(startxid); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *commit_timestamp; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); + if (commit_timestamp) + { + commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); + commit_timestamp = strtok(commit_timestamp, "\n"); + t_result = cstring_to_text(commit_timestamp); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + + + +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + GlobalTransactionId xid; + char *result; + char *str_xid; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + + ret = 
FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + + str_xid = strstr(result, GET_XID); + if (str_xid) + { + str_xid += strlen(GET_XID); + str_xid = strtok(str_xid, "\n"); + xid = strtoul(str_xid, NULL, 10); + PG_RETURN_UINT32(xid); + } + + } + PG_RETURN_NULL(); +} + +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) +{ +#define SLEEP_COUNT 1000 + char *tid = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + remove_2pc_records(tid, true); + + pfree(tid); + + PG_RETURN_BOOL(true); +} + +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) +{ + MemoryContext oldcontext; + MemoryContext mycontext; + + int i = 0; + int count = 0; + TupleTableSlots *result; + TupleTableSlots clear_result; + const char *query = "select pgxc_get_record_list()::text"; + const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; + char clear_query[100]; + char *twopcfiles = NULL; + char *ptr = NULL; + bool res = true; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + mycontext = AllocSetContextCreate(CurrentMemoryContext, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); + + /*collect the 2pc file in nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + } + + for (i = 0; i < dn_nodes_num; i++) + { + execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + } + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } + + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (0 == result[i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < 
dn_nodes_num; i++) + { + if (0 == result[cn_nodes_num+i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < pgxc_clean_node_count; i++) + DropTupleTableSlots(result+i); + + DestroyTxnHash(); + ResetGlobalVariables(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mycontext); + + + PG_RETURN_BOOL(res); +} + +Datum pgxc_get_record_list(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_record_list); +Datum pgxc_get_record_list(PG_FUNCTION_ARGS) +{ + int count = 0; + DIR *dir = NULL; + struct dirent *ptr = NULL; + char *recordList = NULL; + text *t_recordList = NULL; + + if(!(dir = opendir(TWOPHASE_RECORD_DIR))) + { + PG_RETURN_NULL(); + } + + while((ptr = readdir(dir)) != NULL) + { + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + if (count >= MAXIMUM_OUTPUT_FILE) + break; + + if(!recordList) + { + recordList = (char *)palloc0(strlen(ptr->d_name) + 1); + sprintf(recordList, "%s", ptr->d_name); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(ptr->d_name) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, ptr->d_name); + } + count++; + } + + closedir(dir); + + if(!recordList) + { + PG_RETURN_NULL(); + } + else + { + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } +} + +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_commit_on_node); +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[100]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + getTxnInfoOnOtherNodes(txn); + snprintf(command, 100, "commit prepared '%s'", txn->gid); + + + if (InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + if 
(!txn->is_readonly) + { + elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); + } + else + { + txn->global_commit_timestamp = GetGlobalTimestampGTM(); + } + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_abort_on_node); +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[100]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + getTxnInfoOnOtherNodes(txn); + snprintf(command, 100, "rollback prepared '%s'", txn->gid); +#if 0 + if (!setMaintenanceMode(true)) + { + elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); + } +#endif + + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? 
+ pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + + + +void recover2PCForDatabaseAll(void) +{ + database_info *cur_db = head_database_info; + while (cur_db) + { + recover2PCForDatabase(cur_db); + cur_db = cur_db->next; + } + //clean_old_2PC_files(); +} + +void recover2PCForDatabase(database_info * db_info) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = db_info->all_txn_info; + + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + recover2PC(cur_txn); + } +} + +bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) +{ +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_SEND_QUERY >= twophase_exception_case) + { + twophase_in = IN_PG_CLEAN; + } +#endif + if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && + TXN_STATUS_COMMITTED == txn->global_txn_stat && + !txn->is_readonly) + return false; + + if (pgxc_node_send_clean(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + if (txn->is_readonly && pgxc_node_send_readonly(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + /* + * only transaction finished in commit prepared/rollback prepared phase send timestamp + * partial prepared transaction has no need to send other information + */ + if (InvalidGlobalTimestamp != txn->global_commit_timestamp && + pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send global committs for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + } + if (!txn->is_readonly) + { + if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start node for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start xid for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send participants for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send query for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + return true; +} + +bool check_2pc_belong_node(txn_info * txn) +{ + int node_index = 0; + char node_type; + node_index = find_node_index(abnormal_nodeoid); + if (abnormal_nodeoid == txn->origcoord) + { + txn->belong_abnormal_node = true; + return true; + } + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'C' && txn->coordparts[node_index] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + txn->belong_abnormal_node = false; + return false; +} + +bool check_node_participate(txn_info * txn, int node_idx) +{ + char node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (PGXC_NODE_COORDINATOR == node_type) + { + return txn->coordparts[node_idx] == 1 ? true : false; + } else if (PGXC_NODE_DATANODE == node_type) + { + return txn->dnparts[node_idx] == 1 ? 
true : false; + } + return false; +} + +void recover2PC(txn_info * txn) +{ + TXN_STATUS txn_stat; + txn_stat = check_txn_global_status(txn); + txn->global_txn_stat = txn_stat; + +#ifdef DEBUG_EXECABORT + txn_stat = TXN_STATUS_ABORTED; +#endif + + switch (txn_stat) + { + case TXN_STATUS_FAILED: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_UNKNOWN: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_PREPARED: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_COMMITTED: + if (InvalidOid == txn->origcoord || txn->is_readonly) + { + txn->op = UNDO; + txn->op_issuccess = true; + } + else + { + txn->op = COMMIT; + if (!clean_2PC_iscommit(txn, true)) + { + txn->op_issuccess = false; + elog(LOG, "commit 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + } + break; + + case TXN_STATUS_ABORTED: + txn->op = ABORT; + if (!clean_2PC_iscommit(txn, false)) + { + txn->op_issuccess = false; + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + break; + + case TXN_STATUS_INPROGRESS: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + default: + elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); + break; + } + return; +} + +TXN_STATUS check_txn_global_status(txn_info *txn) +{ +#define TXN_PREPARED 0x0001 +#define TXN_COMMITTED 0x0002 +#define TXN_ABORTED 0x0004 +#define TXN_UNKNOWN 0x0008 +#define TXN_INITIAL 0x0010 +#define TXN_INPROGRESS 0X0020 + int ii; + int check_flag = 0; + int node_idx = 0; + TimestampTz prepared_time = 0; + TimestampTz time_gap = clean_time_interval; + + if (!IsXidImplicit(txn->gid) && txn->is_readonly) + { + return TXN_STATUS_COMMITTED; + } + if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) + { + check_flag |= TXN_UNKNOWN; + } + if (txn->global_txn_stat == TXN_STATUS_ABORTED) + { + check_flag |= TXN_ABORTED; + } + + /*check dn participates*/ + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (txn->dnparts[ii] == 1) + { + if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) + check_flag |= TXN_INITIAL; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
+ txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; + } + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + /*check cn participates*/ + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (txn->coordparts[ii] == 1) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + check_flag |= TXN_ABORTED; + else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii] > prepared_time ? + txn->prepare_timestamp[ii] : prepared_time; + } + else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + + /* + * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not + * if not, check the commit timestamp explicit trans within the time_gap or not + */ +#if 0 + if ((check_flag & TXN_INPROGRESS) || + (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || + (!IsXidImplicit(txn->gid) && + ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || + (txn->after_first_phase && + (InvalidGlobalTimestamp != commit_time && + current_time - commit_time <= time_gap))))) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } +#endif + if (clear_2pc_belong_node) + { + node_idx = find_node_index(abnormal_nodeoid); + if (!check_2pc_belong_node(txn) || + !check_node_participate(txn, node_idx) || + abnormal_time < txn->prepare_timestamp[node_idx]) + { + return TXN_STATUS_INPROGRESS; + } + } + else + { + if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } + } + + + if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) + { + return TXN_STATUS_PREPARED; + } + + if (check_flag & TXN_UNKNOWN) + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) + /* Mix of committed and aborted. This should not happen. */ + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_PREPARED) == 0) + /* Should be at least one "prepared statement" in nodes */ + return TXN_STATUS_FAILED; + + if (check_flag & TXN_COMMITTED) + /* Some 2PC transactions are committed. Need to commit others. */ + return TXN_STATUS_COMMITTED; + /* All the transactions remain prepared. No need to recover. 
*/ + return TXN_STATUS_ABORTED; +} + +bool clean_2PC_iscommit(txn_info *txn, bool iscommit) +{ + int ii; + static const char *STMT_FORM = "%s prepared '%s';"; + char command[100]; + int node_idx; + Oid node_oid; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + + if (iscommit) + snprintf(command, 100, STMT_FORM, "commit", txn->gid); + else + snprintf(command, 100, STMT_FORM, "rollback", txn->gid); + if (iscommit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); + if (connections == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory for connections"))); + } + get_transaction_handles(&pgxc_handles, txn); + +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count = 0; + } +#endif + for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) + { + node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->datanode_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + } + + for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) + { + node_oid = pgxc_handles->coord_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + if (enable_distri_print) + { + for (ii = 0; ii < conn_count; ii++) + { + if 
(DN_CONNECTION_STATE_IDLE != connections[ii]->state) + { + elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); + } + } + } + conn_count = 0; + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + + /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ + if (txn->origcoord != InvalidOid) + { + node_idx = find_node_index(txn->origcoord); + if (txn->coordparts[node_idx] == 1) + { + /*send global timestamp to dn_node_list[ii]*/ + + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) + { + get_node_handles(&pgxc_handles, txn->origcoord); + if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[0]; + } + } + } + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*free hash record from gtm*/ + FinishGIDGTM(txn->gid); + + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + return true; +} + +bool clean_2PC_files(txn_info * txn) +{ + int ii; + TupleTableSlots result; + bool issuccess = true; + static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; + char query[100]; + + snprintf(query, 100, STMT_FORM, txn->gid); + + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->dnparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->coordparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + return true; +} + +void Init_print_txn_info(print_txn_info * print_txn) +{ + database_info *cur_database = head_database_info; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + print_txn->index = 0; + INIT(print_txn->txn); + + for (; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + if 
(clear_2pc_belong_node && !cur_txn->belong_abnormal_node) + { + continue; + } + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } + +#if 0 + cur_txn = cur_database->head_txn_info; + for (; cur_txn; cur_txn = cur_txn->next) + { + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } +#endif + } +} + +void Init_print_stats_all(print_status *pstatus) +{ + database_info *cur_database; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + pstatus->index = 0; + pstatus->count = 0; + INIT(pstatus->gid); + INIT(pstatus->global_status); + INIT(pstatus->status); + INIT(pstatus->database); + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#if 0 + for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#endif + } +} + +void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) +{ + int ii; + StringInfoData query; + initStringInfo(&query); + + RPALLOC(pstatus->gid); + RPALLOC(pstatus->global_status); + RPALLOC(pstatus->status); + RPALLOC(pstatus->database); + + pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + + strncpy(pstatus->gid[pstatus->count], txn->gid, 100); + strncpy(pstatus->database[pstatus->count], database, 100); + strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), + txn_status_to_string(txn->txn_stat[ii])); + if (ii < pgxc_clean_node_count - 1) + { + appendStringInfoChar(&query, '\n'); + } + } + + pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); + strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); + pstatus->gid_count++; + pstatus->database_count++; + pstatus->global_status_count++; + pstatus->status_count++; + pstatus->count++; +} + +static const char *txn_status_to_string(TXN_STATUS status) +{ + switch (status) + { + ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) + ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) + ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) + ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) + ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) + ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) + ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) + } + return NULL; +} + +static const char *txn_op_to_string(OPERATION op) +{ + switch (op) + { + ENUM_TOCHAR_CASE(UNDO) + ENUM_TOCHAR_CASE(ABORT) + ENUM_TOCHAR_CASE(COMMIT) + } + return NULL; +} + + +static void +CheckFirstPhase(txn_info *txn) +{ +// int ret; + Oid orignode = txn->origcoord; + uint32 startxid = txn->startxid; +// uint32 transactionid; + int nodeidx; + + /* + * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. 
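+	 * In that case the prepare phase did not succeed, so there is no start node to check and the function returns without doing anything.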
+ */ + if (InvalidOid == orignode) + { + return; + } + nodeidx = find_node_index(orignode); + if (0 == txn->xid[nodeidx]) + { + txn->xid[nodeidx] = startxid; + } + /* start node participate */ + if (txn->isorigcoord_part) + { + if (0 == txn->coordparts[nodeidx]) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) + { + /*select * from pgxc_is_committed...*/ + getTxnStatus(txn, nodeidx); + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } + } + /* start node node participate */ + else + { +#if 0 + ret = Get2PCFile(orignode, txn->gid, &transactionid); + if (ret == FILENOTFOUND) + txn->after_first_phase = false; + else if (ret == FILEUNKOWN) + txn->global_txn_stat = TXN_STATUS_UNKNOWN; + else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) + txn->after_first_phase = true; +#endif + if (txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } else { + txn->after_first_phase = false; + } + } +} + +void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) +{ + int dn_index = 0; + int cn_index = 0; + int nodeIndex; + char nodetype; + List *coordlist = NIL; + List *nodelist = NIL; + + while (dn_index < dn_nodes_num) + { + + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) + { + dn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(dn_node_list[dn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_DATANODE) + { + nodelist = lappend_int(nodelist, nodeIndex); + } + dn_index++; + + } + + while (cn_index < cn_nodes_num) + { + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) + { + cn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(cn_node_list[cn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + cn_index++; + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + +void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) +{ + char nodetype = PGXC_NODE_NONE; + int nodeIndex; + List *coordlist = NIL; + List *nodelist = NIL; + + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + else + { + nodelist = lappend_int(nodelist, nodeIndex); + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + diff --git a/contrib/pg_clean/pg_clean.control b/contrib/pg_clean/pg_clean.control new file mode 100644 index 00000000..00524ac0 --- /dev/null +++ b/contrib/pg_clean/pg_clean.control @@ -0,0 +1,5 @@ +# 2PC transaction recovering extention +comment = 'tools for clean all the remaining 2PC 
transactions' +default_version = '1.0' +module_pathname = '$libdir/pg_clean' +relocatable = true \ No newline at end of file diff --git a/contrib/pg_clean/test.sh b/contrib/pg_clean/test.sh new file mode 100644 index 00000000..2f56a302 --- /dev/null +++ b/contrib/pg_clean/test.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# This script sets up test environment for pgxc_clean. +# Please note that all the prepared transactions are +# partially committed or aborted. +# +# You should configure PGPORT and PGHOST to connect, as +# well as node names for your test environment. +# +# Before you run this script, XC should be up and ready. +# Also, this may try to drop test databases. You may need +# to run CLEAN CONNECTION satement for each coordinator in +# advance. +# + +export PGPORT=52898 +export PGHOST=localhost +sourcedb=postgres + +{ +psql -e postgres < +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define MAX_RELNAME 64 +#define MAX_MODE 30 +#define MAX_DEADLOCK 10000 + +/*macros about space allocation and release*/ +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? 
x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +/*data structures*/ + /*about lock*/ +typedef enum +{ + Lockmode_ASL = 0, /*AccessShareLock*/ + Lockmode_RSL, /*RowShareLock*/ + Lockmode_REL, /*RowExclusiveLock*/ + Lockmode_SUEL, /*ShareUpdateExclusiveLock*/ + Lockmode_SL, /*ShareLock*/ + Lockmode_SREL, /*ShareRowExclusiveLock*/ + Lockmode_EL, /*ExclusiveLock*/ + Lockmode_AEL /*AccessExclusiveLock*/ +} MODE; + +typedef enum +{ + Locktype_Relation = 0, + Locktype_Page, + Locktype_Tuple, + Locktype_Transactionid, + Locktype_Object, + Locktype_Userlock, + Locktype_Advisory +} LOCKTYPE; + +typedef struct +{ + LOCKTYPE m_locktype; + char m_dbname[MAX_DBNAME]; + char m_relname[MAX_RELNAME]; + uint32 m_page; + uint16 m_tuple; + MODE m_mode; + bool m_granted; + uint32 m_transactionid; + Oid m_node; + uint32 m_pid; + char * m_query; +} lockinfo; + + /*about deadlock*/ +typedef struct +{ + int* txns; + int txns_count; + int txns_size; + bool killed; +} deadlock; + + /*about transactions*/ +typedef struct +{ + int pre; + int post; +}Edge; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +typedef struct +{ + char gid[MAX_GID]; /*globla transactionid*/ + uint32 *pid; /*Local pid on each node*/ + int pid_count; + int pid_size; + Oid *node; /*a global transaction corresponding to multiple nodes*/ + int node_count; + int node_size; + Oid initiator; /*node initiating the transaction*/ + lockinfo *hold; /*hold lock list of the transaction*/ + int hold_count; + int hold_size; + lockinfo *wait; /*wait lock list of the transaction*/ + int wait_count; + int wait_size; + bool searched; /*transaction travesal status during deadlock detection*/ + bool alive; /*whether the transaction is killed*/ + int* deadlock; /*belonging deadlocks*/ + int deadlock_count; /*deadlock count of the transaction*/ + int deadlock_size; + Edge* out; + int out_count; + int out_size; + int wait_txn; + char* query; +}transaction; + +typedef struct +{ + int* stack; /*stack during depth-first search*/ + int stack_count; + int stack_size; + int* stackpre; /*stores parents of transactions in stack*/ + int stackpre_count; + int stackpre_size; + int* path; /*extended path in depth-first search*/ + int path_count; + int path_size; + int* txn_exist; /*stores index of trasaction[i] in path, + txn_exist[txnid] = i; (path[i] = txnid or txn_exist[txnid] = -1;)*/ +} deeplist; + + /*about output results*/ +typedef struct +{ + int index; + char **edge; + int edge_count; + int edge_size; + + char **nodes; + int nodes_count; + int nodes_size; + + char **querys; + int querys_count; + int querys_size; +} PrintEdge; + +typedef struct +{ + int index; + char **deadlock; + char **nodename; + char **query; + int deadlock_count; + int *per_size; +} PrintDeadlock; + +typedef struct +{ + int index; + char **txn; + int txn_count; + int txn_size; + + char **cancel_query; + int cancel_query_count; + int cancel_query_size; + + char **nodename; + int nodename_count; + int nodename_size; +} PrintRollbackTxn; + +typedef struct +{ + int index; + PrintRollbackTxn *Ptxns; + int Ptxns_count; + 
int Ptxns_size; +} PrintAllRollbackTxns; + + +/*function list*/ +static void ResetGlobalVariables(void); + + /*plugin entry function*/ +Datum pg_unlock_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_execute); + +Datum pg_unlock_check_deadlock(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_check_deadlock); + +Datum pg_unlock_check_dependency(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_check_dependency); + +Datum pg_unlock_killbypid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_killbypid); + +Datum pg_findgxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_findgxid); + + /*get all the transaction info*/ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * Slots); +static Datum execute_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void GetAllTransInfo(void); +void LoadTransaction(Oid node); +void InitTransaction(int txn_index); +void add_pid_node(int txn_index, uint32 pid, Oid node); +LOCKTYPE + find_locktype(char *locktype); +MODE find_mode(char *mode); + + /*build transaction dependency gragh*/ +void InitAllEdge(void); +void InitEdge(int pre, int post); +bool is_conflict_withtxn(lockinfo *wait, int post_txn); +bool is_conflict_withlock(lockinfo *wait, lockinfo *hold); +bool check_include(lockinfo *wait, lockinfo *hold); +void DropTransaction(int i); +void DropAlltransactions(void); +void DropEdge(int id); + + /*find all deadlocks*/ +void InitDeadlock(void); +void DropDeadlock(deadlock *loop); +void DropAlldeadlocks(void); +void DetectDeadlock(void); +int traverse(deeplist* list); +void path_deadlock(deeplist * list, int start); +void InitDeeplist(deeplist* list); +void DropDeeplist(deeplist* list); +void ClearDeeplist(deeplist* list); + + /*recover all deadlocks*/ +void RecoverDeadlock(void); +void CountDeadlocks(void); +void CountWaitTxn(void); +void SortByDeadlock(int *sort_txnid); +void quiksort(int *sort_txnid, int low, int high); +void KillDeadlockByTxn(int txnid); +bool DeadlockExists(int id); + + /*output results*/ +void InitPrintEdge(PrintEdge *Pedge); +void DropPrintEdge(PrintEdge *Pedge); +void InitPrintDeadlock(PrintDeadlock *Pdeadlock); +void DropPrintDeadlock(PrintDeadlock *Pdeadlock); +void InitPrinttxn(PrintRollbackTxn *Ptxn); +void DropPrinttxn(PrintRollbackTxn *Ptxn); +char *GetGxid(Oid node, uint32 pid); +int check_node_pid(char *nodename, uint32 pid); +bool check_exist_gid(char *gid); +void KillTxn(int txnid); + +/*global variables*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static Oid *sdn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num; +static int dn_nodes_num; +static int sdn_nodes_num; + +static transaction * + pgxc_transaction = NULL; /*stores all transactions*/ +static int pgxc_transaction_count = 0; /*transaction count*/ +static int pgxc_transaction_size = 0; /*records capacity of pgxc_transaction*/ +static int **pgxc_edge = NULL; +static deadlock * + pgxc_deadlock = NULL; +static int pgxc_deadlock_count = 0; +static int pgxc_deadlock_size = 0; + +static int m_matrix[8][8] = /*conflict info among lock modes*/ +{ + {0, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1}, + {0, 0, 1, 0, 1, 1, 1, 1}, + {0, 0, 0, 1, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 1, 1, 1, 1, 1, 1}, + {0, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1} +}; + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + 
sdn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + sdn_nodes_num = 0; + + pgxc_transaction = NULL; /*stores all transactions*/ + pgxc_transaction_count = 0; /*transaction count*/ + pgxc_transaction_size = 0; /*records capacity of pgxc_transaction*/ + pgxc_edge = NULL; + + pgxc_deadlock = NULL; + pgxc_deadlock_count = 0; + pgxc_deadlock_size = 0; + +} + + +/* + * pg_unlock_execute -- detect and recover deadlocks + * input: no + * output: info of rollback transactions + */ +Datum +pg_unlock_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 5 + FuncCallContext *funcctx; + PrintAllRollbackTxns *Partxns; + char **rec; + char **nodename; + char **query; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "executetime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "txnindex", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "rollbacktxn(ip:port)", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "cancel_query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintAllRollbackTxns)); + Partxns = (PrintAllRollbackTxns *)funcctx->user_fctx; + INIT(Partxns->Ptxns); + Partxns->index = 0; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + do + { + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + if (pgxc_transaction_count == 0) + { + elog(DEBUG1, "pg_unlock: there is no transaction"); + break; + } + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*detect deadlocks*/ + DetectDeadlock(); + if (pgxc_deadlock_count == 0) + { + /*program ends until there is no deadlock*/ + elog(DEBUG1, "pg_unlock: there is no deadlock"); + break; + } + /*recover deadlocks through killing one transaction*/ + RecoverDeadlock(); + + /*record output info*/ + RPALLOC(Partxns->Ptxns); + InitPrinttxn(&(Partxns->Ptxns[Partxns->Ptxns_count])); + if (Partxns->Ptxns[Partxns->Ptxns_count].txn_count > 0) + { + Partxns->Ptxns_count++; + } + DropAlldeadlocks(); + DropAlltransactions(); + }while(true); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Partxns = (PrintAllRollbackTxns *) funcctx->user_fctx; + + if (Partxns->index < Partxns->Ptxns_count) + { + PrintRollbackTxn *temp = &(Partxns->Ptxns[Partxns->index]); + rec = Partxns->Ptxns[Partxns->index].txn; + nodename = Partxns->Ptxns[Partxns->index].nodename; + query = Partxns->Ptxns[Partxns->index].cancel_query; + + while (temp->index < temp->txn_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (temp->index == 0) + { + values[0] = Int32GetDatum(Partxns->index); + } + values[1] = 
Int32GetDatum(temp->index); + values[2] = PointerGetDatum(cstring_to_text(rec[temp->index])); + values[3] = PointerGetDatum(cstring_to_text(nodename[temp->index])); + values[4] = PointerGetDatum(cstring_to_text(query[temp->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + temp->index++; + if (temp->index < temp->txn_count) + { + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + } + Partxns->index++; + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + RFREE(Partxns->Ptxns); + Partxns->index = 0; + DropAlldeadlocks(); + DropAlltransactions(); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } +} + +/* + * pg_unlock_check_deadlock -- detect deadlocks without recover + * input: no + * output: info of deadlocks + */ +Datum pg_unlock_check_deadlock(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + PrintDeadlock *Pdeadlock; + char **rec; + char **nodes; + char **querys; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "deadlockid", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "deadlocks", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintDeadlock)); + Pdeadlock = (PrintDeadlock*)funcctx->user_fctx; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*detect deadlocks*/ + DetectDeadlock(); + + /*record output info*/ + InitPrintDeadlock(Pdeadlock); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Pdeadlock = (PrintDeadlock *) funcctx->user_fctx; + rec = Pdeadlock->deadlock; + nodes = Pdeadlock->nodename; + querys = Pdeadlock->query; + + if (Pdeadlock->index < Pdeadlock->deadlock_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(Pdeadlock->index); + values[1] = PointerGetDatum(cstring_to_text(rec[Pdeadlock->index])); + values[2] = PointerGetDatum(cstring_to_text(nodes[Pdeadlock->index])); + values[3] = PointerGetDatum(cstring_to_text(querys[Pdeadlock->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + Pdeadlock->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DropPrintDeadlock(Pdeadlock); + DropAlldeadlocks(); + DropAlltransactions(); + 
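/* final SRF call: release the node lists and health maps allocated during the first call */ +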
pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } +} + +/* + * pg_unlock_check_dependency -- only detect transaction dependency + * input: no + * output: info of transaction dependency + */ +Datum pg_unlock_check_dependency(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + PrintEdge *Pedge; + char **rec; + char **nodes; + char **querys; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "dependencyid", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dependency", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintEdge)); + Pedge = (PrintEdge*)funcctx->user_fctx; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*record output info*/ + InitPrintEdge(Pedge); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Pedge = (PrintEdge *) funcctx->user_fctx; + rec = Pedge->edge; + nodes = Pedge->nodes; + querys = Pedge->querys; + + if (Pedge->index < Pedge->edge_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(Pedge->index); + values[1] = PointerGetDatum(cstring_to_text(rec[Pedge->index])); + values[2] = PointerGetDatum(cstring_to_text(nodes[Pedge->index])); + values[3] = PointerGetDatum(cstring_to_text(querys[Pedge->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + Pedge->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DropPrintEdge(Pedge); + DropAlltransactions(); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } + +} + +/* + * pg_unlock_killbypid -- kill certain transaction by user + * input: nodename, pid + * output: execute result success of error info + */ +Datum pg_unlock_killbypid(PG_FUNCTION_ARGS) +{ + char *Kstatus; + char *nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + uint32 kpid = PG_GETARG_UINT32(1); + int size = sizeof(char) * 100; + char gid[MAX_GID]; + text *t_status = NULL; + int txnindex; + + Kstatus = (char *)palloc0(size); + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on 
coordinator"); + } + + do + { + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*find global transaction according to nodename and pid*/ + txnindex = check_node_pid(nodename, kpid); + if (txnindex < 0) + { + snprintf(Kstatus, size, "Fail:error not exists node:%s or pid:%u on node %s", nodename, kpid, nodename); + break; + } + if (get_pgxc_nodetype(get_pgxc_nodeoid(nodename)) != 'C') + { + snprintf(Kstatus, size, "Fail:error node:%s is not coordinator", nodename); + break; + } + memcpy(gid, pgxc_transaction[txnindex].gid, sizeof(gid)); + + /*kill the transaction*/ + KillTxn(txnindex); + DropAlltransactions(); + + /*check whether this transaction is existed*/ + LoadTransaction(get_pgxc_nodeoid(nodename)); + if(!check_exist_gid(gid)) + { + snprintf(Kstatus, size, "Success: pid:%u on node %s is killed", kpid, nodename); + break; + } + else + { + snprintf(Kstatus, size, "Fail:error pid:%u on node %s is not killed", kpid, nodename); + break; + } + }while(0); + DropAlltransactions(); + pfree(nodename); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + t_status = cstring_to_text(Kstatus); + pfree(Kstatus); + return PointerGetDatum(t_status); +} + + +/* + * execute_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + + int i; + int ii; + Datum datum = (Datum) 0; + bool isnull = false; + int i_tuple; + int i_attnum; + /*check health of node*/ + bool ishealthy; + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype; +#endif + + + /*get heathy status of query node*/ + PoolPingNodeRecheck(node); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, &cn_nodes_num, &dn_nodes_num, cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node) + { + ishealthy = dn_health_map[i]; + } + } + } + +#ifdef XCP + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + ntype = PGXC_NODE_NONE; + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + /* + * We only need the target 
entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); //TEXTOID?? + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + i_tuple = 0; + i_attnum = 0; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + if (result->tts_values[i_attnum] != (Datum)0) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#else + /* + * Connect to SPI manager + */ + if ((ret = SPI_connect()) < 0) + /* internal error */ + elog(ERROR, "SPI connect failure - returned %d", ret); + + initStringInfo(&buf); + + /* Get pg_***_size function results from all Datanodes */ + nodename = get_pgxc_nodename(node); + + ret = SPI_execute_direct(query, nodename); + spi_tupdesc = SPI_tuptable->tupdesc; + + if (ret != SPI_OK_SELECT) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to execute query '%s' on node '%s'", + query, nodename))); + } + + /* + * The query must always return one row having one column: + */ + Assert(SPI_processed == 1 && spi_tupdesc->natts == 1); + + datum = SPI_getbinval(SPI_tuptable->vals[0], spi_tupdesc, 1, &isnull); + + /* For single node, don't assume the type of datum. It can be bool also. 
*/ + SPI_finish(); +#endif + return (Datum) 0; + if (isnull +#ifdef _MLS_ + && (NULL != result)) +#endif + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Expected datum but got null instead " + "while executing query '%s'", + query))); + PG_RETURN_DATUM(datum); +} + +/* + * GetAllTransInfo -- get all transactions from all nodes and stores them in pgxc_transaction + * input: no + * return: no + */ +void GetAllTransInfo(void) +{ + int i; + for (i = 0; i < cn_nodes_num; i++) + { + LoadTransaction(cn_node_list[i]); + } + for (i = 0; i < dn_nodes_num; i++) + { + LoadTransaction(dn_node_list[i]); + } +} + +/* + * LoadTransaction -- get transactions from certain node and stores them in pgxc_transaction + * input: node oid + * return: no + */ +void LoadTransaction(Oid node) +{ + const char *query_stmt = "select a1.pid::text, a1.locktype::text, a2.datname::text, a2.relname::text, " + "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text " + "from (select locktype::text, database, relation, page::text, " + "tuple::text, mode::text, granted::text, pid::text, transactionid::text " + "from pg_locks where (locktype = 'relation' or locktype = 'page' or locktype = 'tuple' or locktype = 'transactionid')" + " and (pid is not null))a1 " + "left join " + "(select distinct pg_database.datname::text, pg_class.relname::text, " + "pg_locks.database, pg_locks.relation " + "from pg_database, pg_class, pg_locks, pg_namespace " + "where pg_database.oid = pg_locks.database and pg_class.oid = pg_locks.relation " + "and pg_namespace.oid = pg_class.relnamespace and pg_namespace.nspname " + "not in ('pg_catalog','information_schema'))a2 " + "on a1.database = a2.database and a1.relation = a2.relation " + "left join " + "(select pid::text, query::text from pg_stat_activity)a3 on a1.pid = a3.pid and a3.pid != '%d' " + "where (a1.locktype = 'transactionid' and a1.transactionid is not null)" + " or (a1.locktype != 'transactionid' and a2.datname is not null and a2.relname is not null) order by a1.pid;"; + + char query_txnid[2048]; + + /*stores tuples in result_txnid*/ + TupleTableSlots result_txnid; + int i; + int i_txn; + int ntuples; + uint32 pid; + char *temp = NULL; + char *rel_name = NULL; + char *db_name = NULL; + char *ptr = NULL; + char *gid = NULL; + int nodeid = 0; + lockinfo templock; + + sprintf(query_txnid, query_stmt, MyProcPid); + execute_on_single_node(node, query_txnid, 10, &result_txnid); + if (result_txnid.slot == NULL) + { + elog(DEBUG1, "pg_unlock: there is no transaction on node %s", get_pgxc_nodename(node)); + return; + } + + ntuples = result_txnid.slot_count; + for (i = 0; i < ntuples; i++) + { + pid = strtoul(TTSgetvalue(&result_txnid, i, 0), NULL, 10); + /*get global xid of pid on node*/ + gid = GetGxid(node, pid); + /*select for update apply for transactionid without global xid*/ + if (gid == NULL) + { + continue; + } + + /*check whether the gid is already existed*/ + for (i_txn = 0; i_txn < pgxc_transaction_count; i_txn++) + { + if (strcmp(gid, pgxc_transaction[i_txn].gid) == 0) + { + break; + } + } + + /*insert this new transaction when gid is not find in pgxc_transaction*/ + if (i_txn >= pgxc_transaction_count) + { + RPALLOC(pgxc_transaction); + InitTransaction(pgxc_transaction_count); + memcpy(pgxc_transaction[pgxc_transaction_count].gid, gid, sizeof(char) * MAX_GID); + pgxc_transaction_count++; + i_txn = pgxc_transaction_count-1; + } + add_pid_node(i_txn, pid, node); + ptr = strtok(gid, ":"); + nodeid = atoi(ptr); + 
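/* The gid is assumed to begin with the numeric id of the initiating coordinator: e.g. a gid of the form "3:12589" (hypothetical value) would map to coordinator node id 3. */ +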
pgxc_transaction[i_txn].initiator = get_nodeoid_from_nodeid(nodeid, PGXC_NODE_COORDINATOR); + //pgxc_transaction[i_txn].initiator = get_pgxc_nodeoid(ptr); + pfree(gid); + + /*read lockinfo from result_txnid*/ + templock.m_pid = pid; + templock.m_node = node; + templock.m_locktype = find_locktype(TTSgetvalue(&result_txnid, i, 1)); + + /*we only consider the first four locktypes*/ + if (templock.m_locktype > Locktype_Transactionid) + { + continue; + } + + db_name = TTSgetvalue(&result_txnid, i, 2); + if (db_name) + { + memcpy(templock.m_dbname, db_name, strlen(db_name)+1); + } + else + { + MemSet(templock.m_dbname, 0, sizeof(templock.m_dbname)); + } + rel_name = TTSgetvalue(&result_txnid, i, 3); + if (rel_name) + { + memcpy(templock.m_relname, rel_name, strlen(rel_name)+1); + } + else + { + MemSet(templock.m_relname, 0, sizeof(templock.m_relname)); + } + if (TTSgetvalue(&result_txnid, i, 4) != NULL) + { + templock.m_page = strtoul(TTSgetvalue(&result_txnid, i, 4), NULL, 10); + } + else + { + templock.m_page = 0; + } + if (TTSgetvalue(&result_txnid, i, 5) != NULL) + { + templock.m_tuple = strtoul(TTSgetvalue(&result_txnid, i, 5), NULL, 10); + } + else + { + templock.m_tuple = 0; + } + templock.m_mode = find_mode(TTSgetvalue(&result_txnid, i, 6)); + if (TTSgetvalue(&result_txnid, i, 8) != NULL) + { + templock.m_transactionid = strtoul(TTSgetvalue(&result_txnid, i, 8), NULL, 10); + } + else + { + templock.m_transactionid = 0; + } + temp = TTSgetvalue(&result_txnid, i, 7); + + if (TTSgetvalue(&result_txnid, i, 9)) + { + if (strlen(TTSgetvalue(&result_txnid, i, 9)) <= 1024) + { + templock.m_query = (char *)pstrdup(TTSgetvalue(&result_txnid, i, 9)); + } + else + { + templock.m_query = (char *)palloc0(1025); + strncpy(templock.m_query, TTSgetvalue(&result_txnid, i, 9), 1024); + } + } + else + { + templock.m_query = NULL; + } + /*put templock into transaction hold list or wait list due to granted*/ + if (strcmp(temp, "true") == 0) + { + templock.m_granted = true; + PALLOC(pgxc_transaction[i_txn].hold, templock); + } + else + { + templock.m_granted = false; + PALLOC(pgxc_transaction[i_txn].wait, templock); + } + if (pgxc_transaction[i_txn].initiator == node) + { + if (templock.m_query) + { + pgxc_transaction[i_txn].query = pstrdup(templock.m_query); + } + else + { + pgxc_transaction[i_txn].query = pstrdup("unknown"); + } + } + + } + DropTupleTableSlots(&result_txnid); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +void InitTransaction(int txn_index) +{ + transaction *temp; + temp = pgxc_transaction; + if (temp == NULL) + { + elog(LOG, "pg_unlock: error pgxc_transaction is null"); + exit(1); + } + INIT(temp[txn_index].pid); + INIT(temp[txn_index].node); + INIT(temp[txn_index].hold); + INIT(temp[txn_index].wait); + INIT(temp[txn_index].out); + temp[txn_index].searched = false; + temp[txn_index].alive = true; + INIT(temp[txn_index].deadlock); + temp[txn_index].wait_txn = 0; + temp[txn_index].query = NULL; +} + +/* + * add_pid_node -- 
add pid and node to certain transaction + * input: index of transaction, pid, node oid + * return: void + */ +void add_pid_node(int txn_index, uint32 pid, Oid node) +{ + transaction *temp; + temp = pgxc_transaction; + + PALLOC(temp[txn_index].pid, pid); + PALLOC(temp[txn_index].node, node); +} + +LOCKTYPE find_locktype(char * locktype) +{ + LOCKTYPE j; + if (strcmp(locktype, "relation") == 0) + { + j = Locktype_Relation; + } + else if (strcmp(locktype, "page") == 0) + { + j = Locktype_Page; + } + else if (strcmp(locktype, "tuple") == 0) + { + j = Locktype_Tuple; + } + else if (strcmp(locktype, "transactionid") == 0) + { + j = Locktype_Transactionid; + } + else if (strcmp(locktype, "object") == 0) + { + j = Locktype_Object; + } + else if (strcmp(locktype, "userlock") == 0) + { + j = Locktype_Userlock; + } + else if (strcmp(locktype, "advisory") == 0) + { + j = Locktype_Advisory; + } + else + { + elog(LOG, "pg_unlock: unknown locktype: %s", locktype); + exit (1); + } + return j; +} + +MODE find_mode(char *mode) +{ + MODE i; + if (strcmp(mode, "AccessShareLock") == 0) + { + i = Lockmode_ASL; + } + else if (strcmp(mode, "RowShareLock") == 0) + { + i = Lockmode_RSL; + } + else if (strcmp(mode, "RowExclusiveLock") == 0) + { + i = Lockmode_REL; + } + else if (strcmp(mode, "ShareUpdateExclusiveLock") == 0) + { + i = Lockmode_SUEL; + } + else if (strcmp(mode, "ShareLock") == 0) + { + i = Lockmode_SL; + } + else if (strcmp(mode, "ShareRowExclusiveLock") == 0) + { + i = Lockmode_SREL; + } + else if (strcmp(mode, "ExclusiveLock") == 0) + { + i = Lockmode_EL; + } + else if (strcmp(mode, "AccessExclusiveLock") == 0) + { + i = Lockmode_AEL; + } + else + { + elog(LOG, "pg_unlock: unkown lock mode %s", mode); + exit (1); + } + return i; +} + +/* + * InitAllEdge -- build all transaction dependency graph and stores in pgxc_transaction, pgxc_edge + * input: no + * return: no + */ +void InitAllEdge(void) +{ + int i; + int j; + pgxc_edge = (int **)palloc(pgxc_transaction_count * sizeof(int *)); + for (i = 0; i < pgxc_transaction_count; i++) + { + pgxc_edge[i] = (int *)palloc(pgxc_transaction_count * sizeof(int)); + for (j = 0; j < pgxc_transaction_count; j++) + { + pgxc_edge[i][j] = 0; + } + } + + /*search for all edges*/ + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction_count; j++) + { + if (i == j) + { + continue; + } + InitEdge(i, j); + } + } +} + +/* + * InitEdge -- build dependency between two transactions and stores it in pgxc_transaction, pgxc_edge + * input: pre transaction index, post transaction index + * return: no + */ +void InitEdge(int pre, int post) +{ + int i; + int out_count; + Edge *out = NULL; + int pre_end = pgxc_transaction[pre].wait_count; + lockinfo *pre_wait = pgxc_transaction[pre].wait; + + for (i = 0; i < pre_end; i++) + { + /*if lock pre_wait[i] conflict with pgxc_transaction[post]*/ + if (is_conflict_withtxn(pre_wait+i, post)) + { + RPALLOC(pgxc_transaction[pre].out); + out = pgxc_transaction[pre].out; + out_count = pgxc_transaction[pre].out_count; + out[out_count].pre = pre; + out[out_count].post = post; + pgxc_transaction[pre].out_count++; + pgxc_edge[pre][post] = 1; + break; + } + } +} + +/* + * is_conflict_withtxn -- build dependency between two transactions and stores it in pgxc_transaction, pgxc_edge + * input: pre transaction index, post transaction index + * return: conflict or not + */ +bool is_conflict_withtxn(lockinfo *wait, int post_txn) +{ + bool conflict = false; + lockinfo *hold = pgxc_transaction[post_txn].hold; + int hold_count = 
pgxc_transaction[post_txn].hold_count; + int i; + for (i = 0; i < hold_count; i++) + { + if (is_conflict_withlock(wait, hold + i)) + { + conflict = true; + break; + } + } + return conflict; +} + +/* + * is_conflict_withlock -- build dependency between two locks + * input: pre lockinfo, post lockinfo + * return: conflict or not + */ +bool is_conflict_withlock(lockinfo *wait, lockinfo *hold) +{ + bool conflict = false; + bool sameobject = true; + + /*locks of same granted will not conflict*/ + if (wait->m_node != hold->m_node || wait->m_granted == hold->m_granted) + { + return conflict; + } + + /*locks of different locktype will not conflict*/ + if ((wait->m_locktype < Locktype_Transactionid) ^ (hold->m_locktype < Locktype_Transactionid)) + { + sameobject = false; + } + + /*check locktype among relation, page and tuple*/ + else if(wait->m_locktype < Locktype_Transactionid && hold->m_locktype < Locktype_Transactionid) + { + if ((strcmp(wait->m_dbname, hold->m_dbname) == 0) && !check_include(wait, hold)) + { + sameobject = false; + } + } + + /*check between transactionid*/ + else if(wait->m_locktype == Locktype_Transactionid && hold->m_locktype == Locktype_Transactionid) + { + if (wait->m_node != hold->m_node || wait->m_transactionid != hold->m_transactionid) + { + sameobject = false; + } + } + + /*check locktype among relation, page and tuple*/ + if (sameobject == true) + { + conflict = (m_matrix[(int)wait->m_mode][(int)hold->m_mode] == 1); + } + return conflict; +} + +bool check_include(lockinfo *wait, lockinfo *hold) +{ + bool include = false; + LOCKTYPE i = wait->m_locktype; + LOCKTYPE j = hold->m_locktype; + int min; + int max; + + if ((i >= Locktype_Transactionid) || (j >= Locktype_Transactionid)) + { + return include; + } + min = i <= j ? i : j; + max = i <= j ? 
j : i; + switch (min) + { + case Locktype_Relation: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + include = true; + } + break; + case Locktype_Page: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + /*locks in same relation and page or + relation lock and page lock of the same relation*/ + if ((i != j) || (wait->m_page == hold->m_page)) + { + include = true; + } + } + break; + case Locktype_Tuple: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + if (max == Locktype_Relation) + { + include = true; + break; + } + if (wait->m_page == hold->m_page) + { + if (max == Locktype_Page) + { + include = true; + break; + } + if (wait->m_tuple == hold->m_tuple) + { + if (max == Locktype_Tuple) + { + include = true; + break; + } + } + } + } + break; + default: + elog(LOG, "pg_unlock: could not match locktype %d to relation, page or tuple", min); + break; + } + return include; +} + +void InitDeadlock(void) +{ + RPALLOC(pgxc_deadlock); + INIT(pgxc_deadlock[pgxc_deadlock_count].txns); + RPALLOC(pgxc_deadlock[pgxc_deadlock_count].txns); + pgxc_deadlock[pgxc_deadlock_count].killed = false; + return; +} + +void DropDeadlock(deadlock *loop) +{ + RFREE(loop->txns); + loop->killed = false; + return; +} + +void DropAlldeadlocks(void) +{ + int i; + for (i = pgxc_deadlock_count - 1; i >= 0; i--) + { + DropDeadlock(pgxc_deadlock+i); + } + RFREE(pgxc_deadlock); +} + +/* + * DetectDeadlock -- detect deadlock according to transaction dependency and store them in pgxc_deadlock + * input: no + * return: no + */ +void DetectDeadlock(void) +{ + int i; + deeplist dfs; + int loop_start; + + InitDeeplist(&dfs); + for (i = 0; i < pgxc_transaction_count; i++) + { + if (pgxc_deadlock_count > MAX_DEADLOCK) + { + break; + } + + /*we can find all the deadlocks that conclude the transaction through tranvers it*/ + if (pgxc_transaction[i].searched == true) + { + continue; + } + else + { + /*push i into stack*/ + PALLOC(dfs.stack, i); + PALLOC(dfs.stackpre, -1); + } + while (dfs.stack_count != 0 ) + { + if (pgxc_deadlock_count > MAX_DEADLOCK) + { + break; + } + /*loop_start indicate whether deadlock exists*/ + loop_start = traverse(&dfs); + if (loop_start > -1) + { + path_deadlock(&dfs, loop_start); + } + } + ClearDeeplist(&dfs); + } + DropDeeplist(&dfs); +} + +/* + * traverse -- traverse according to transaction dependency and store them in list->path + * input: deeplist + * return: index of deadlock start transaction in path + */ +int traverse(deeplist* list) +{ + int res = -1; + + /*pop the last element in stack*/ + int i; + int post; + int start = list->stack[list->stack_count - 1]; + int startpre = list->stackpre[list->stackpre_count - 1]; + + list->stack_count--; + list->stackpre_count--; + pgxc_transaction[start].searched = true; + + /*delete element in path, if the pop element in stack is not its post*/ + if (list->path_count > 0) + { + while(list->path[list->path_count-1] != startpre) + { + list->path_count--; + list->txn_exist[list->path[list->path_count]] = -1; + } + } + + /*push the pop element into path*/ + PALLOC(list->path, start); + list->txn_exist[start] = list->path_count-1; + + /*find all the outedge of the above pop element*/ + for (i = 0; i < pgxc_transaction[start].out_count; i++) + { + post = pgxc_transaction[start].out[i].post; + + /*if the transaction post does not exit in path*/ + if (list->txn_exist[post] < 0) + { + PALLOC(list->stack, post); + PALLOC(list->stackpre, start); + } + /*or return the index of path according to the transaction*/ + else + { + res = 
list->txn_exist[post]; + } + } + return res; +} + +/* + * path_deadlock -- add element in path to pgxc_deadlock + * input: deeplist, index of deadlock start element in path + * return: no + */ +void path_deadlock(deeplist *list, int start) +{ + deadlock *loop = NULL; + int i; + int ii; + int ij; + int total_count = list->path_count - start; + bool isexist = false; + int ii_txns_count; + int ij_txns_count; + + InitDeadlock(); + loop = pgxc_deadlock+pgxc_deadlock_count; + + for (i = start; i < list->path_count; i++) + { + PALLOC(loop->txns, list->path[i]); + } + /*first check whether the deadlock is exits*/ + for (i = 0; i < pgxc_deadlock_count; i++) + { + if (pgxc_deadlock[i].txns_count == total_count) + { + isexist = true; + ii_txns_count = pgxc_deadlock[i].txns_count; + ij_txns_count = loop->txns_count * 2 - 1; + for (ii = 0, ij = 0; ii < ii_txns_count && ij < ij_txns_count;) + { + if (pgxc_deadlock[i].txns[ii] != loop->txns[ij % loop->txns_count]) + { + if (ii == 0 && ij < loop->txns_count) + { + ij++; + } + else + { + /*deadlock not exist*/ + isexist = false; + break; + } + } + else + { + ii++; + ij++; + } + } + if (isexist == true) + { + break; + } + /*deadlock in list[start~path_count-1] is already exist*/ + } + } + + if (isexist == false) + { + pgxc_deadlock_count++; + } + else + { + RFREE(loop->txns); + } + /*if not existed then insert into pgxc_deadlock*/ + return; +} + +void InitDeeplist(deeplist* list) +{ + int i; + INIT(list->stack); + INIT(list->stackpre); + INIT(list->path); + list->txn_exist = (int *)palloc(pgxc_transaction_count * sizeof(int)); + for (i = 0; i < pgxc_transaction_count; i++) + { + list->txn_exist[i] = -1; + } + return; +} + +void ClearDeeplist(deeplist * list) +{ + int i = 0; + list->stack_count = 0; + list->stackpre_count = 0; + list->path_count = 0; + for (i = 0; i < pgxc_transaction_count; i++) + { + list->txn_exist[i] = -1; + } + return; +} + +void DropDeeplist(deeplist * list) +{ + RFREE(list->stack); + RFREE(list->stackpre); + RFREE(list->path); + pfree(list->txn_exist); + list->txn_exist = NULL; + return; +} + +/* + * RecoverDeadlock -- kill at most one transaction in each deadlock + * input: no + * return: no + */ +void RecoverDeadlock(void) +{ + int* sort_txnid = NULL; + if (pgxc_deadlock_count == 0) + { + return; + } + + sort_txnid = (int *)palloc(pgxc_transaction_count * sizeof(int)); + /*Count deadlocks belong to each transactions*/ + CountDeadlocks(); + CountWaitTxn(); + + /*sort transaction index by deadlock count*/ + SortByDeadlock(sort_txnid); + /*first kill transaction with the most deadlocks*/ + KillDeadlockByTxn(sort_txnid[0]); + pfree(sort_txnid); + return; +} + +void CountDeadlocks(void) +{ + int i; + int j; + + for (i = 0; i < pgxc_deadlock_count; i++) + { + for (j = 0; j < pgxc_deadlock[i].txns_count; j++) + { + PALLOC(pgxc_transaction[pgxc_deadlock[i].txns[j]].deadlock, i); + } + } + return; +} + +void SortByDeadlock(int *sort_txnid) +{ + int i; + for (i = 0; i < pgxc_transaction_count; i++) + { + sort_txnid[i] = i; + } + quiksort(sort_txnid, 0, pgxc_transaction_count-1); +} + +void quiksort(int *sort_txnid, int low, int high) +{ + int i = low; + int j = high; + int temp = sort_txnid[i]; + + if( low > high) + { + return ; + } + while(i < j) + { + while(((pgxc_transaction[sort_txnid[j]].deadlock_count + < pgxc_transaction[temp].deadlock_count) + || ((pgxc_transaction[sort_txnid[j]].deadlock_count + == pgxc_transaction[temp].deadlock_count) + && (pgxc_transaction[sort_txnid[j]].wait_txn + <= pgxc_transaction[temp].wait_txn))) + && (i < 
j)) + { + j--; + } + sort_txnid[i] = sort_txnid[j]; + while(((pgxc_transaction[sort_txnid[i]].deadlock_count + > pgxc_transaction[temp].deadlock_count) + || ((pgxc_transaction[sort_txnid[j]].deadlock_count + == pgxc_transaction[temp].deadlock_count) + && (pgxc_transaction[sort_txnid[j]].wait_txn + >= pgxc_transaction[temp].wait_txn))) + && (i < j)) + { + i++; + } + sort_txnid[j]= sort_txnid[i]; + } + sort_txnid[i] = temp; + quiksort(sort_txnid,low,i-1); + quiksort(sort_txnid,j+1,high); +} + + +/* + * KillDeadlockByTxn -- kill certain transaction + * input: transaction index + * return: no + */ +void KillDeadlockByTxn(int txnid) +{ + int i; + transaction *txn = pgxc_transaction; + Oid* node = pgxc_transaction[txnid].node; + uint32* pid = pgxc_transaction[txnid].pid; + char query[500]; + TupleTableSlots result; + + if (DeadlockExists(txnid) == false) + { + return; + } + + txn[txnid].alive = false; + for (i = 0; i < txn[txnid].deadlock_count; i++) + { + pgxc_deadlock[txn[txnid].deadlock[i]].killed = true; + } + + for (i = 0; i < pgxc_transaction[txnid].node_count; i++) + { + snprintf(query, 500,"select pg_cancel_backend(%u);", pid[i]); + execute_on_single_node(node[i], query, 0, &result); + DropTupleTableSlots(&result); + } + return; +} + +bool DeadlockExists(int id) +{ + bool res = false; + transaction *txn = pgxc_transaction; + int i; + for (i = 0; i < txn[id].deadlock_count; i++) + { + if (pgxc_deadlock[txn[id].deadlock[i]].killed == false) + { + res = true; + } + } + return res; +} + +void DropTransaction(int i) +{ + transaction *txn = pgxc_transaction; + + txn[i].gid[0] = '\0'; + txn[i].searched = false; + txn[i].alive = true; + txn[i].wait_txn = 0; + + RFREE(txn[i].pid); + RFREE(txn[i].node); + if (txn[i].hold_size && txn[i].hold->m_query) + { + pfree(txn[i].hold->m_query); + txn[i].hold->m_query = NULL; + } + RFREE(txn[i].hold); + if (txn[i].wait_size && txn[i].wait->m_query) + { + pfree(txn[i].wait->m_query); + txn[i].wait->m_query = NULL; + } + RFREE(txn[i].wait); + RFREE(txn[i].deadlock); + RFREE(txn[i].out); + if (txn[i].query) + { + pfree(txn[i].query); + txn[i].query = NULL; + } +} + +void DropAlltransactions(void) +{ + int i; + + for (i = 0; i < pgxc_transaction_count; i++) + { + DropTransaction(i); + } + + if (pgxc_edge != NULL) + { + for (i = 0; i < pgxc_transaction_count; i++) + { + pfree(pgxc_edge[i]); + } + if (pgxc_transaction_count) + { + pfree(pgxc_edge); + } + pgxc_edge = NULL; + } + + RFREE(pgxc_transaction); +} + +void InitPrintEdge(PrintEdge *Pedge) +{ + int i; + int j; + int index1; + int index2; + int len = 0; + + Pedge->index = 0; + INIT(Pedge->edge); + INIT(Pedge->nodes); + INIT(Pedge->querys); + RPALLOC(Pedge->edge); + RPALLOC(Pedge->nodes); + RPALLOC(Pedge->querys); + + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction[i].out_count; j++) + { + RPALLOC(Pedge->edge); + Pedge->edge[Pedge->edge_count] = (char *) palloc(2*MAX_GID*sizeof(char) + 10); + + index1 = pgxc_transaction[i].out[j].pre; + index2 = pgxc_transaction[i].out[j].post; + snprintf(Pedge->edge[Pedge->edge_count], 2*MAX_GID*sizeof(char) + 10, "%s --> %s", + pgxc_transaction[index1].gid, pgxc_transaction[index2].gid); + + RPALLOC(Pedge->nodes); + Pedge->nodes[Pedge->nodes_count] = (char *) palloc(2*NAMEDATALEN*sizeof(char) + 10); + snprintf(Pedge->nodes[Pedge->nodes_count], 2*NAMEDATALEN*sizeof(char) + 10, "%s --> %s", + get_pgxc_nodename(pgxc_transaction[index1].initiator), + get_pgxc_nodename(pgxc_transaction[index2].initiator)); + + RPALLOC(Pedge->querys); + len = 
0; + if (pgxc_transaction[index1].query) + { + len += strlen(pgxc_transaction[index1].query); + } + if (pgxc_transaction[index2].query) + { + len += strlen(pgxc_transaction[index2].query); + } + Pedge->querys[Pedge->querys_count] = (char *) palloc(len+ 10); + snprintf(Pedge->querys[Pedge->querys_count], len+10, "%s --> %s", + pgxc_transaction[index1].query, pgxc_transaction[index2].query); + + Pedge->edge_count++; + Pedge->nodes_count++; + Pedge->querys_count++; + } + } +} + +void DropPrintEdge(PrintEdge *Pedge) +{ + int i; + if (NULL == Pedge) + { + return; + } + for (i = 0; i < Pedge->edge_count; i++) + { + pfree(Pedge->edge[i]); + } + RFREE(Pedge->edge); + + for (i = 0; i < Pedge->nodes_count; i++) + { + pfree(Pedge->nodes[i]); + } + RFREE(Pedge->nodes); + + for (i = 0; i < Pedge->querys_count; i++) + { + pfree(Pedge->querys[i]); + } + RFREE(Pedge->querys); + Pedge->index = 0; + Pedge = NULL; +} + +void InitPrintDeadlock(PrintDeadlock *Pdeadlock) +{ + int i; + int j; + StringInfoData query; + StringInfoData nodename; + StringInfoData deadlock_query; + + Pdeadlock->index = 0; + Pdeadlock->deadlock = NULL; + Pdeadlock->deadlock_count = pgxc_deadlock_count; + Pdeadlock->per_size = (int *)palloc(pgxc_deadlock_count * sizeof(int)); + Pdeadlock->deadlock = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + Pdeadlock->nodename = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + Pdeadlock->query = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + + for (i = 0; i < pgxc_deadlock_count; i++) + { + Pdeadlock->per_size[i] = pgxc_deadlock[i].txns_count*(MAX_GID+10)*sizeof(char); + Pdeadlock->deadlock[i] = (char *) palloc(Pdeadlock->per_size[i]); + Pdeadlock->nodename[i] = (char *) palloc(pgxc_deadlock[i].txns_count * NAMEDATALEN); + + initStringInfo(&query); + initStringInfo(&nodename); + initStringInfo(&deadlock_query); + + for (j = 0; j < pgxc_deadlock[i].txns_count; j++) + { + appendStringInfo(&query, "%-15s(%-15s:%-12d)", pgxc_transaction[pgxc_deadlock[i].txns[j]].gid, + get_pgxc_nodehost(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator), + get_pgxc_nodeport(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator)); + appendStringInfo(&nodename, "%s", get_pgxc_nodename(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator)); + appendStringInfo(&deadlock_query, "%s", pgxc_transaction[pgxc_deadlock[i].txns[j]].query); + if (j < pgxc_deadlock[i].txns_count-1) + { + appendStringInfoChar(&query, '\n'); + appendStringInfoChar(&nodename, '\n'); + appendStringInfoChar(&deadlock_query, '\n'); + } + } + snprintf(Pdeadlock->deadlock[i], Pdeadlock->per_size[i], "%s", query.data); + snprintf(Pdeadlock->nodename[i], pgxc_deadlock[i].txns_count * NAMEDATALEN, "%s", nodename.data); + + Pdeadlock->query[i] = (char *) palloc(deadlock_query.len + 1); + snprintf(Pdeadlock->query[i], deadlock_query.len + 1, "%s", deadlock_query.data); + } +} + +void DropPrintDeadlock(PrintDeadlock *Pdeadlock) +{ + int i; + for (i = 0; i < Pdeadlock->deadlock_count; i++) + { + pfree(Pdeadlock->deadlock[i]); + pfree(Pdeadlock->nodename[i]); + pfree(Pdeadlock->query[i]); + } + pfree(Pdeadlock->deadlock); + pfree(Pdeadlock->nodename); + pfree(Pdeadlock->query); + pfree(Pdeadlock->per_size); + Pdeadlock->deadlock = NULL; + Pdeadlock->nodename = NULL; + Pdeadlock->query = NULL; + Pdeadlock->per_size = NULL; + Pdeadlock->index = 0; + Pdeadlock->deadlock_count = 0; +} + +void InitPrinttxn(PrintRollbackTxn *Ptxn) +{ + int i; + int len; + + Ptxn->index = 0; + INIT(Ptxn->txn); + INIT(Ptxn->nodename); + 
INIT(Ptxn->cancel_query); + + for (i = 0; i < pgxc_transaction_count; i++) + { + if (pgxc_transaction[i].alive == false) + { + RPALLOC(Ptxn->txn); + Ptxn->txn[Ptxn->txn_count] = (char *) palloc((MAX_GID+10) * sizeof(char)); + sprintf(Ptxn->txn[Ptxn->txn_count], "%-15s(%-15s:%-15d)", pgxc_transaction[i].gid, + get_pgxc_nodehost(pgxc_transaction[i].initiator), + get_pgxc_nodeport(pgxc_transaction[i].initiator)); + RPALLOC(Ptxn->nodename); + Ptxn->nodename[Ptxn->nodename_count] = (char *) palloc(NAMEDATALEN); + sprintf(Ptxn->nodename[Ptxn->nodename_count], "%s", get_pgxc_nodename(pgxc_transaction[i].initiator)); + + RPALLOC(Ptxn->cancel_query); + len = 0; + if (pgxc_transaction[i].query) + { + len += strlen(pgxc_transaction[i].query); + Ptxn->cancel_query[Ptxn->cancel_query_count] = (char *) palloc0(len + 1); + sprintf(Ptxn->cancel_query[Ptxn->cancel_query_count], "%s", pgxc_transaction[i].query); + } + else + { + Ptxn->cancel_query[Ptxn->cancel_query_count] = (char *) palloc0(10); + sprintf(Ptxn->cancel_query[Ptxn->cancel_query_count], "unknown"); + } + + Ptxn->txn_count++; + Ptxn->nodename_count++; + Ptxn->cancel_query_count++; + } + } +} + +void DropPrinttxn(PrintRollbackTxn *Ptxn) +{ + int i; + for (i = 0; i < Ptxn->txn_count; i++) + { + pfree(Ptxn->txn[i]); + } + + for (i = 0; i < Ptxn->cancel_query_count; i++) + { + pfree(Ptxn->cancel_query[i]); + } + + for (i = 0; i < Ptxn->nodename_count; i++) + { + pfree(Ptxn->nodename[i]); + } + RFREE(Ptxn->txn); + RFREE(Ptxn->cancel_query); + RFREE(Ptxn->nodename); + Ptxn->index = 0; + Ptxn = NULL; +} + +/* + * GetGxid -- get global xid of certain pid on certain node + * input: node oid, pid + * return: global xid + */ +char *GetGxid(Oid node, uint32 pid) +{ + char *res = NULL; + char *temp = NULL; + TupleTableSlots result; + char query[100]; + + snprintf(query, 100, "select pg_findgxid(%u)", pid); + execute_on_single_node(node, query, 1, &result); + if (result.slot == NULL) + { + elog(LOG, "pg_unlock: could not obtain global transactionid from pid %u on node %s", pid, get_pgxc_nodename(node)); + return res; + } + temp = TTSgetvalue(&result, 0, 0); + if (temp != NULL) + { + res = (char *)palloc(20 * sizeof(char)); + memcpy(res, temp, 20 * sizeof(char)); + } + DropTupleTableSlots(&result); + return res; +} + +/* + * pg_findgxid -- get global xid of certain pid + * input: pid + * return: global xid + */ +Datum pg_findgxid(PG_FUNCTION_ARGS) +{ + uint32 pid = PG_GETARG_UINT32(0); + char *globalXid = GetGlobalTransactionId(pid); + text *t_gxid = NULL; + if (globalXid != NULL) + { + t_gxid = cstring_to_text(globalXid); + return PointerGetDatum(t_gxid); + } + PG_RETURN_NULL(); +} + +/* + * check_node_pid -- check whether certain pid on certain node exists + * input: nodename, pid + * return: exist or not + */ +int check_node_pid(char *nodename, uint32 pid) +{ + int res = -1; + int i; + int j; + for (i = 0; i < pgxc_transaction_count; i++) + { + if (strcmp(get_pgxc_nodename(pgxc_transaction[i].initiator) , nodename) == 0) + { + for (j = 0; j < pgxc_transaction[i].pid_count; j++) + { + if (pid == pgxc_transaction[i].pid[j]) + { + res = i; + } + } + } + } + return res; +} + +/* + * KillTxn -- kill certain transaction + * input: transaction index + * return: no + */ +void KillTxn(int txnid) +{ + int i; + TupleTableSlots result; + char query[500]; + Oid* node = pgxc_transaction[txnid].node; + uint32* pid = pgxc_transaction[txnid].pid; + + for (i = 0; i < pgxc_transaction[txnid].node_count; i++) + { + snprintf(query, 500,"select pg_cancel_backend(%u);", 
pid[i]); + execute_on_single_node(node[i], query, 0, &result); + DropTupleTableSlots(&result); + } + return; +} + +/* + * check_exist_gid -- check whether certain transaction exists + * input: transaction global xid + * return: exist or not + */ +bool check_exist_gid(char *gid) +{ + bool res = false; + int i; + for (i = 0; i < pgxc_transaction_count; i++) + { + if (strcmp(pgxc_transaction[i].gid, gid) == 0) + { + res = true; + } + } + return res; +} + +void CountWaitTxn(void) +{ + int i; + int j; + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction_count; j++) + { + if (pgxc_edge[i][j] == 1) + { + pgxc_transaction[j].wait_txn++; + } + } + } +} diff --git a/contrib/pg_unlock/pg_unlock.control b/contrib/pg_unlock/pg_unlock.control new file mode 100644 index 00000000..033558c4 --- /dev/null +++ b/contrib/pg_unlock/pg_unlock.control @@ -0,0 +1,5 @@ +# deadlock detect extention +comment = 'tools for detect and unlock all the deadlocks' +default_version = '1.0' +module_pathname = '$libdir/pg_unlock' +relocatable = true \ No newline at end of file diff --git a/contrib/tbase_subscription/Makefile b/contrib/tbase_subscription/Makefile new file mode 100644 index 00000000..59cb31c6 --- /dev/null +++ b/contrib/tbase_subscription/Makefile @@ -0,0 +1,19 @@ +# contrib/tbase_subscription/Makefile + +MODULE_big = tbase_subscription +OBJS = tbase_subscription.o + +EXTENSION = tbase_subscription +DATA = tbase_subscription--1.0.sql \ + tbase_subscription--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/tbase_subscription +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_subscription/tbase_subscription--1.0.sql b/contrib/tbase_subscription/tbase_subscription--1.0.sql new file mode 100644 index 00000000..02391235 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription--1.0.sql @@ -0,0 +1,36 @@ +/* contrib/tbase_subscription/tbase_subscription--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_subscription" to load this file. \quit + +CREATE TABLE tbase_subscription +( + sub_name name, -- Name of TBase subscription created on coordinator + sub_ignore_pk_conflict bool, -- ignore primary key conflict occurs when apply + sub_manual_hot_date text, -- GUC parameter, manual_hot_date + sub_temp_hot_date text, -- GUC parameter, temp_hot_date + sub_temp_cold_date text, -- GUC parameter, temp_cold_date + sub_parallel_number int4, -- Split TBase subscription into multiple parallel tbase-sub-subscriptions + sub_is_all_actived bool -- Whether all parallel tbase-sub-subscriptions are actived. + -- If there are some parallel tbase-sub-subscriptions, + -- other tbase-sub-subscriptions can be activated only after + -- the first tbase-sub-subscription has completed the data COPY. + -- And other tbase-sub-subscriptions can only be activated by + -- the first tbase-sub-subscription. 
+) WITH OIDS; + +CREATE TABLE tbase_subscription_parallel +( + sub_parent oid, -- Oid of parent tbase subsription stored in tbase_subscription above + sub_child oid, -- A TBase subscription may be split into multiple parallel tbase-sub-subscriptions, + -- and each tbase-sub-subscription is recorded in pg_subscription with a given oid + sub_index int4, -- Index of this tbase-sub-subscription in all parallel tbase-sub-subscriptions + sub_active_state bool, -- Whether the current tbase-sub-subscription is activated by the first tbase-sub-subscription, + -- valid only when sub_index > 0 + sub_active_lsn pg_lsn -- The LSN value that was set when the current tbase-sub-subscription was activated by the first + -- tbase-sub-subscription, valid only when sub_index > 0 +) WITH OIDS; + +-- Don't want this to be available to non-superusers. +REVOKE ALL ON TABLE tbase_subscription FROM PUBLIC; +REVOKE ALL ON TABLE tbase_subscription_parallel FROM PUBLIC; diff --git a/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql b/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql new file mode 100644 index 00000000..9b576874 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql @@ -0,0 +1,4 @@ +/* contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_subscription" to load this file. \quit diff --git a/contrib/tbase_subscription/tbase_subscription.c b/contrib/tbase_subscription/tbase_subscription.c new file mode 100644 index 00000000..33312ad2 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription.c @@ -0,0 +1,26 @@ +#include "postgres.h" + +#include +#include + +#include "access/hash.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/analyze.h" +#include "parser/parsetree.h" +#include "parser/scanner.h" +#include "parser/scansup.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/spin.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "access/xact.h" +#include "access/transam.h" +#include "utils/timestamp.h" + +PG_MODULE_MAGIC; + diff --git a/contrib/tbase_subscription/tbase_subscription.control b/contrib/tbase_subscription/tbase_subscription.control new file mode 100644 index 00000000..5a7ae862 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription.control @@ -0,0 +1,5 @@ +# tbase_subscription extension +comment = 'support for hot and cold subscriptions and two-way subscriptions' +default_version = '1.0' +module_pathname = '$libdir/tbase_subscription' +relocatable = true From 6343ea17701a471d25fa75959785bc1ebef33f1b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 1 Jan 2022 17:51:54 +0800 Subject: [PATCH 321/578] create branch v2.3.0 --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 9b7f49d3..2636c448 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.0.0_release" +#define TBASE_VERSION_STR "TBase_V2.3.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From 3747e3c0df82b4d1d45018976e8e1231f32d7f01 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 1 Jan 2022 17:55:29 +0800 Subject: [PATCH 322/578] master branch --- 
src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 2636c448..b11ef63b 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.3.0_release" +#define TBASE_VERSION_STR "TBase_master" Datum pgsql_version(PG_FUNCTION_ARGS) From 20e3f09bc0e8fe8507890b9b5d539506bff4729a Mon Sep 17 00:00:00 2001 From: JennyJennyChen <48546628+JennyJennyChen@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:48:25 +0800 Subject: [PATCH 323/578] fix parallel select hang (#119) https://github.com/Tencent/TBase/issues/108 Co-authored-by: bethding --- src/backend/pgxc/squeue/squeue.c | 47 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index a4deed0d..1c6fc478 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -430,6 +430,7 @@ typedef struct ParallelSendDataQueue size_t send_data_len; size_t write_data_len; bool long_tuple; + bool wait_free_space; DataPumpSndStatus status; /* status of the data sending */ bool stuck; bool last_send; @@ -633,7 +634,7 @@ static void *ParallelSenderThreadMain(void *arg); static void ParallelSenderSendData(ParallelSendThreadControl *threadControl, bool last_send); static bool SendNodeData(ParallelSendNodeControl *node, bool last_send); static char *GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple); -static uint32 NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple); +static uint32 NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple, bool *wait_free_space); static void IncNodeDataOff(ParallelSendDataQueue *buf, uint32 uiLen); static int RawSendNodeData(ParallelSendNodeControl *node, int32 sock, char * data, int32 len, int32 * reason); static int32 SetNodeSocket(void *sndctl, int32 nodeindex, int32 nodeId, int32 socket); @@ -647,7 +648,7 @@ static void SendNodeDataRemote(SharedQueue squeue, ParallelWorkerControl *contro TupleTableSlot *slot, Tuplestorestate **tuplestore, MemoryContext tmpcxt); static bool ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, char *data, size_t len, int32 consumerIdx); static uint32 BufferFreeSpace(ParallelSendDataQueue *buf); -static void SetBufferBorder(ParallelSendDataQueue *buf, bool long_tuple); +static void SetBufferBorderAndWaitFlag(ParallelSendDataQueue *buf, bool long_tuple, bool wait_free_space); static void PutNodeData(ParallelSendDataQueue *buf, char *data, uint32 len); static char *GetBufferWriteOff(ParallelSendDataQueue *buf, uint32 *uiLen); static void IncBufferWriteOff(ParallelSendDataQueue *buf, uint32 uiLen); @@ -6649,6 +6650,7 @@ InitParallelSendSharedData(SharedQueue sq, ParallelSendControl *senderControl, i buffer->send_data_len = 0; buffer->write_data_len = 0; buffer->long_tuple = 0; + buffer->wait_free_space = false; buffer->status = DataPumpSndStatus_no_socket; buffer->stuck = false; buffer->last_send = false; @@ -6956,6 +6958,7 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) {// #lizard forgives bool should_send = false; bool long_tuple = false; + bool wait_free_space = false; int i = 0; uint32 len = 0; int32 ret = 0; @@ -6992,10 +6995,13 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) if (!should_send) { - data_size = NodeDataSize(buffer, &long_tuple); + data_size = 
NodeDataSize(buffer, &long_tuple, &wait_free_space); - /* too small to send */ - if (data_size < g_SndBatchSize * 1024) + /* + * If wait_free_space is true, sender thread should send data to free buffer space, + * else wait until data_size reach to batch threshold. + */ + if (!wait_free_space && data_size < g_SndBatchSize * 1024) { node->current_buffer = (node->current_buffer + 1) % node->numParallelWorkers; @@ -7072,7 +7078,7 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) } /* get left data length */ - len = NodeDataSize(buffer, &long_tuple); + len = NodeDataSize(buffer, &long_tuple, &wait_free_space); if (len == 0) { @@ -7114,7 +7120,8 @@ GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple) char *data; if (buf) { - if (0 == NodeDataSize(buf, long_tuple)) + bool wait_flag = false; + if (0 == NodeDataSize(buf, long_tuple, &wait_flag)) { *uiLen = 0; return NULL; @@ -7166,7 +7173,7 @@ GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple) /* Return total data size in buffer */ static uint32 -NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple) +NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple, bool *wait_free_space) { uint32 border = 0; uint32 tail = 0; @@ -7177,6 +7184,7 @@ NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple) tail = buf->bufTail; border = buf->bufBorder; *long_tuple = buf->long_tuple; + *wait_free_space = buf->wait_free_space; spinlock_unlock(&(buf->bufLock)); if (INVALID_BORDER == border) @@ -7882,9 +7890,10 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* no space left, */ if (BufferFreeSpace(buf) < (uint32)tuple_len) { + /* Set flag to notice sender thread send data without waiting batch size threshold */ if (!long_tuple) { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); pg_usleep(50L); return false; } @@ -7906,7 +7915,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* Data */ PutNodeData(buf, data, len); - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } else { @@ -7923,7 +7932,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* put message 'D' */ while (data_len < header_len) { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); pg_usleep(100L); data_len = BufferFreeSpace(buf); @@ -7962,7 +7971,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, } else { - SetBufferBorder(buf, true); + SetBufferBorderAndWaitFlag(buf, true, true); pg_usleep(100L); if (buf->status == DataPumpSndStatus_error) @@ -7972,7 +7981,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, } } - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } buf->ntuples++; @@ -8011,15 +8020,15 @@ BufferFreeSpace(ParallelSendDataQueue *buf) } static void -SetBufferBorder(ParallelSendDataQueue *buf, bool long_tuple) +SetBufferBorderAndWaitFlag(ParallelSendDataQueue *buf, bool long_tuple, bool wait_free_space) { spinlock_lock(&(buf->bufLock)); buf->bufBorder = buf->bufHead; buf->long_tuple = long_tuple; + buf->wait_free_space = wait_free_space; spinlock_unlock(&(buf->bufLock)); } - /* Send data into buffer */ static void PutNodeData(ParallelSendDataQueue *buf, char *data, uint32 len) @@ -8568,7 +8577,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * } else { - SetBufferBorder(buf, true); + 
SetBufferBorderAndWaitFlag(buf, true, true); pg_usleep(50L); if (buf->status == DataPumpSndStatus_error) @@ -8596,11 +8605,11 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * } } - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } else { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } @@ -8628,7 +8637,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * #if 1 /* Not enough space, wakeup sender. */ //ParallelSendWakeupSender(control, buf, nodeindex); - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); //pg_usleep(50L); #endif return false; From caf14f34f10bb2995d5aac6b67071ce9bb973f8b Mon Sep 17 00:00:00 2001 From: JennyJennyChen <48546628+JennyJennyChen@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:53:07 +0800 Subject: [PATCH 324/578] fix active snapshot null when set_global_snapshot = false (#120) * fix parallel select hang https://github.com/Tencent/TBase/issues/108 * fix active snapshot null when set_global_snapshot = false https://github.com/Tencent/TBase/issues/106 Co-authored-by: bethding --- src/backend/executor/execParallel.c | 9 +++++++++ src/backend/pgxc/pool/execRemote.c | 3 +-- src/include/pgxc/execRemote.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 920bc32e..72cc61df 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -752,6 +752,15 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) } #endif +#ifdef __TBASE__ + /* set snapshot as needed */ + if (!g_set_global_snapshot && !ActiveSnapshotSet()) + { + SetSnapshot(estate); + } +#endif + + /* Everyone's had a chance to ask for space, so now create the DSM. */ InitializeParallelDSM(pcxt); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d105295a..c781abcf 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -150,7 +150,6 @@ pgxc_node_remote_prefinish(char *prepareGID, char *nodestring); static void pgxc_abort_connections(PGXCNodeAllHandles *all_handles); static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle); static void pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle); -static bool SetSnapshot(EState *state); static int pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type); #endif @@ -12943,7 +12942,7 @@ SubTranscation_PreAbort_Remote(void) } -static bool +bool SetSnapshot(EState *state) { bool result = false; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 60910919..5b6d46c6 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -438,6 +438,7 @@ extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); +extern bool SetSnapshot(EState *state); #endif #ifdef __SUBSCRIPTION__ From b65d457372da7fb94f50844af835962c9091cf56 Mon Sep 17 00:00:00 2001 From: dafoerx Date: Tue, 15 Mar 2022 15:20:38 +0800 Subject: [PATCH 325/578] [BUGFIX] 1.Skip invalid relid in group information check 2.Determine whether tables of different groups are allowed to insert. 
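In short, as read from the new is_table_allowed_insert() check below: an INSERT ... SELECT into a shard table is accepted only when the source table shares the target's cold group, is either in the same group or replicated, and its datanode list overlaps the target's; FROM items that do not resolve to a relid (such as CTE names) are now skipped instead of being opened. A minimal sketch of the allowed case, assuming a cluster where the replication table's nodes cover the shard group (table names here are illustrative, not part of this patch):

    create table src_rep(f1 int, f2 int) distribute by replication;
    create table dst_shard(f1 int, f2 int) distribute by shard(f1);
    insert into src_rep values (1, 1), (2, 2);
    -- allowed: src_rep is replicated and its nodes overlap dst_shard's group
    insert into dst_shard select * from src_rep;
    -- a source shard table whose group shares no datanodes with dst_shard still fails:
    -- ERROR: shard table could not be inserted from any other tables in different group
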
--- src/backend/parser/analyze.c | 80 +++++++++++++++++++------- src/test/regress/expected/insert_1.out | 48 ++++++++++++++++ src/test/regress/sql/insert.sql | 28 +++++++++ 3 files changed, 136 insertions(+), 20 deletions(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index fb5e27f1..8d01fc39 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -575,6 +575,44 @@ transformDeleteStmt(ParseState *pstate, DeleteStmt *stmt) return qry; } +/* + * Determine whether tables of different groups are allowed to insert. + */ +static bool +is_table_allowed_insert(RelationLocInfo *from, RelationLocInfo *to) +{ + List *from_nodelist = from->rl_nodeList; + List *to_nodelist = to->rl_nodeList; + List *diff = NULL; + bool result = false; + + /* necessary check, will never happened. */ + if (from == NULL || to == NULL) + { + elog(ERROR, "is_reptable_allow_insert, invalid params %s:%s", + from ? " " : "from is null", + to ? " " : "to is null"); + } + + /* step1: From table must be replication table. */ + if ( +#ifdef __COLD_HOT__ + (from->coldGroupId != to->coldGroupId) || +#endif + ((from->groupId != to->groupId) && (!IsRelationReplicated(from)))) + { + return false; + } + + /* step2: Data distribution nodes have intersections */ + diff = list_difference_int(to_nodelist, from_nodelist); + + /* stemp3: Insertions are allowed if there is an intersection of data distribution nodes. */ + result = (list_length(diff) != list_length(to_nodelist)); + list_free(diff); + return result; +} + /* * transformInsertStmt - * transform an Insert Statement @@ -704,35 +742,37 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) Query *selectQuery; #ifdef __TBASE__ - /* prevent insert into cold_hot table select ... */ - if (pstate->p_target_relation) - { + /* prevent insert into cold_hot table select ... 
*/ + if (pstate->p_target_relation) + { RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; RelationLocInfo *from_rel_loc_info; if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) - { + { foreach(lc, selectStmt->fromClause) - { + { Node *node = lfirst(lc); if (IsA(node, RangeVar)) { - Relation rel = heap_openrv((RangeVar *) node, AccessShareLock); - - from_rel_loc_info = rel->rd_locator_info; - if (from_rel_loc_info == NULL || /* from system table */ -#ifdef __COLD_HOT__ - from_rel_loc_info->coldGroupId != target_rel_loc_info->coldGroupId || -#endif - from_rel_loc_info->groupId != target_rel_loc_info->groupId) - { - elog(ERROR, "shard table could not be inserted from any other tables in different group"); + Oid relid = RangeVarGetRelid((RangeVar *) node, NoLock, true); + + if (InvalidOid != relid) + { + Relation rel = heap_open(relid, AccessShareLock); + + from_rel_loc_info = rel->rd_locator_info; + if (!is_table_allowed_insert(from_rel_loc_info, target_rel_loc_info)) + { + elog(ERROR, + "shard table could not be inserted from any other tables in different group"); + } + + heap_close(rel, AccessShareLock); + } } - - heap_close(rel, AccessShareLock); - } - } - } + } + } } #endif diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 592137e9..40dd14ab 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -728,3 +728,51 @@ insert into returningwrtest values (2, 'foo') returning returningwrtest; (1 row) drop table returningwrtest; +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +NOTICE: table "t2" does not exist, skipping +drop table if exists t2_rep; +NOTICE: table "t2_rep" does not exist, skipping +drop table if exists t2_new; +NOTICE: table "t2_new" does not exist, skipping +create table t2(f1 int,f2 int); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; + count +------- + 2 +(1 row) + +select count(*) from t2; + count +------- + 2 +(1 row) + +create table t2_new as select * from t2_rep; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+select count(*) from t2_new; + count +------- + 2 +(1 row) + +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 75d801b9..b121eac0 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -407,3 +407,31 @@ alter table returningwrtest2 drop c; alter table returningwrtest attach partition returningwrtest2 for values in (2); insert into returningwrtest values (2, 'foo') returning returningwrtest; drop table returningwrtest; + +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +create table t2(f1 int,f2 int) distribute by shard(f1); +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; + +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +drop table if exists t2_rep; +drop table if exists t2_new; +create table t2(f1 int,f2 int); +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; +select count(*) from t2; +create table t2_new as select * from t2_rep; +select count(*) from t2_new; +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; \ No newline at end of file From 19ba10741cf8383b6ad18bfc3092c45d7254ebf7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 25 Feb 2021 11:40:03 +0800 Subject: [PATCH 326/578] for gtm unix domain socket http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131085360117(merge request !178) --- src/backend/access/transam/gtm.c | 41 ++ src/backend/utils/misc/guc.c | 17 +- src/gtm/client/fe-connect.c | 21 +- src/gtm/client/ip.c | 131 +++++ src/gtm/gtm_ctl/gtm_ctl.c | 17 + src/gtm/libpq/ip.c | 132 +++++ src/gtm/libpq/pqcomm.c | 203 +++++++- src/gtm/main/gtm_opt.c | 736 +++++++++++++++------------- src/gtm/main/main.c | 39 +- src/gtm/proxy/proxy_main.c | 10 +- src/include/gtm/gtm_c.h | 2 +- src/include/gtm/gtm_opt.h | 253 +++++----- src/include/gtm/libpq.h | 24 +- src/include/gtm/pqcomm.h | 29 +- src/include/postmaster/postmaster.h | 29 +- 15 files changed, 1163 insertions(+), 521 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5f4e8218..969311e5 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -54,6 +54,7 @@ char *NewGtmHost = NULL; int NewGtmPort = -1; bool g_GTM_skip_catalog = false; +char *gtm_unix_socket_directory = DEFAULT_PGSOCKET_DIR; #endif char *GtmHost = NULL; int GtmPort = 0; @@ -1191,6 +1192,7 @@ InitGTM(void) #ifdef __TBASE__ int try_cnt = 0; const int max_try_cnt = 1; + bool same_host = false; /* * Only re-set gtm info in two cases: @@ -1209,6 +1211,13 @@ InitGTM(void) errmsg("GtmHost and GtmPort are not set"))); return; } + +#ifdef HAVE_UNIX_SOCKETS + if (GtmHost && (strcmp(PGXCNodeHost, GtmHost) == 0) && gtm_unix_socket_file_exists()) + { + same_host = true; + } +#endif #endif try_connect_gtm: @@ -1222,11 +1231,24 @@ InitGTM(void) else if (IS_PGXC_DATANODE) remote_type = GTM_NODE_DATANODE; +#ifdef __TBASE__ + if (same_host) + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", + gtm_unix_socket_directory, GtmPort, 
PGXCNodeName, remote_type, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + } + else +#endif + { /* Use 60s as connection timeout */ snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", GtmHost, GtmPort, PGXCNodeName, remote_type, tcp_keepalives_idle > 0 ? tcp_keepalives_idle : GtmConnectTimeout); + } /* Log activity of GTM connections */ if(GTMDebugPrint) @@ -1234,11 +1256,24 @@ InitGTM(void) } else { +#ifdef __TBASE__ + if (same_host) + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", + gtm_unix_socket_directory, GtmPort, PGXCNodeName, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + } + else +#endif + { /* Use 60s as connection timeout */ snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", GtmHost, GtmPort, PGXCNodeName, tcp_keepalives_idle > 0 ? tcp_keepalives_idle : GtmConnectTimeout); + } /* Log activity of GTM connections */ if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) @@ -1268,6 +1303,12 @@ InitGTM(void) } CloseGTM(); try_cnt++; + + /* if connect with unix domain socket failed */ + if (same_host) + { + same_host = false; + } goto try_connect_gtm; } else diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 488fd88f..a0db968c 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -5368,7 +5368,22 @@ static struct config_string ConfigureNamesString[] = #endif NULL, NULL, NULL }, - +#ifdef __TBASE__ + { + {"gtm_unix_socket_directory", PGC_POSTMASTER, CONN_AUTH_SETTINGS, + gettext_noop("Sets the directory where GTM Unix-domain sockets used."), + NULL, + GUC_SUPERUSER_ONLY + }, + >m_unix_socket_directory, +#ifdef HAVE_UNIX_SOCKETS + DEFAULT_PGSOCKET_DIR, +#else + "", +#endif + NULL, NULL, NULL + }, +#endif { {"listen_addresses", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the host name or IP address(es) to listen to."), diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 97e6e0c5..5f1f167f 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -260,7 +260,7 @@ static int connectGTMStart(GTM_Conn *conn) {// #lizard forgives int portnum = 0; - char portstr[128]; + char portstr[MAXGTMPATH]; struct addrinfo *addrs = NULL; struct addrinfo hint; const char *node; @@ -299,10 +299,27 @@ connectGTMStart(GTM_Conn *conn) /* Using pghost, so we have to look-up the hostname */ node = conn->pghost; hint.ai_family = AF_UNSPEC; +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + if (is_absolute_path(conn->pghost)) + { + node = NULL; + hint.ai_family = AF_UNIX; + UNIXSOCK_PATH(portstr, portnum, conn->pghost); + if (strlen(portstr) >= UNIXSOCK_PATH_BUFLEN) + { + appendGTMPQExpBuffer(&conn->errorMessage, + libpq_gettext("Unix-domain socket path \"%s\" is too long (maximum %d bytes)\n"), + portstr, + (int) (UNIXSOCK_PATH_BUFLEN - 1)); + goto connect_errReturn; + } + } +#endif +#endif } else { - /* Without Unix sockets, default to localhost instead */ node = "localhost"; hint.ai_family = AF_UNSPEC; } diff --git a/src/gtm/client/ip.c b/src/gtm/client/ip.c index 73c6ec72..8327a204 100644 --- a/src/gtm/client/ip.c +++ b/src/gtm/client/ip.c @@ -46,6 +46,16 @@ static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, const struct sockaddr_in6 * netmask); #endif +#ifdef HAVE_UNIX_SOCKETS +static int getaddrinfo_unix(const char *path, + const struct addrinfo *hintsp, + 
struct addrinfo **result); + +static int getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); +#endif /* * gtm_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets @@ -59,6 +69,11 @@ gtm_getaddrinfo_all(const char *hostname, const char *servname, /* not all versions of getaddrinfo() zero *result on failure */ *result = NULL; +#ifdef HAVE_UNIX_SOCKETS + if (hintp->ai_family == AF_UNIX) + return getaddrinfo_unix(servname, hintp, result); +#endif + /* NULL has special meaning to getaddrinfo(). */ rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname, servname, hintp, result); @@ -103,6 +118,14 @@ gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen, { int rc; +#ifdef HAVE_UNIX_SOCKETS + if (addr && addr->ss_family == AF_UNIX) + rc = getnameinfo_unix((const struct sockaddr_un *) addr, salen, + node, nodelen, + service, servicelen, + flags); + else +#endif rc = getnameinfo((const struct sockaddr *) addr, salen, node, nodelen, service, servicelen, @@ -322,3 +345,111 @@ gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr) } #endif /* HAVE_IPV6 */ + + +#ifdef HAVE_UNIX_SOCKETS + +/* ------- + * getaddrinfo_unix - get unix socket info using IPv6-compatible API + * + * Bugs: only one addrinfo is set even though hintsp is NULL or + * ai_socktype is 0 + * AI_CANONNAME is not supported. + * ------- + */ +static int +getaddrinfo_unix(const char *path, const struct addrinfo *hintsp, + struct addrinfo **result) +{ + struct addrinfo hints; + struct addrinfo *aip; + struct sockaddr_un *unp; + + *result = NULL; + + MemSet(&hints, 0, sizeof(hints)); + + if (strlen(path) >= sizeof(unp->sun_path)) + return EAI_FAIL; + + if (hintsp == NULL) + { + hints.ai_family = AF_UNIX; + hints.ai_socktype = SOCK_STREAM; + } + else + memcpy(&hints, hintsp, sizeof(hints)); + + if (hints.ai_socktype == 0) + hints.ai_socktype = SOCK_STREAM; + + if (hints.ai_family != AF_UNIX) + { + /* shouldn't have been called */ + return EAI_FAIL; + } + + aip = calloc(1, sizeof(struct addrinfo)); + if (aip == NULL) + return EAI_MEMORY; + + unp = calloc(1, sizeof(struct sockaddr_un)); + if (unp == NULL) + { + free(aip); + return EAI_MEMORY; + } + + aip->ai_family = AF_UNIX; + aip->ai_socktype = hints.ai_socktype; + aip->ai_protocol = hints.ai_protocol; + aip->ai_next = NULL; + aip->ai_canonname = NULL; + *result = aip; + + unp->sun_family = AF_UNIX; + aip->ai_addr = (struct sockaddr *) unp; + aip->ai_addrlen = sizeof(struct sockaddr_un); + + strcpy(unp->sun_path, path); + +#ifdef HAVE_STRUCT_SOCKADDR_STORAGE_SS_LEN + unp->sun_len = sizeof(struct sockaddr_un); +#endif + + return 0; +} + +/* + * Convert an address to a hostname. + */ +static int +getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags) +{ + int ret = -1; + + /* Invalid arguments. 
*/ + if (sa == NULL || sa->sun_family != AF_UNIX || + (node == NULL && service == NULL)) + return EAI_FAIL; + + if (node) + { + ret = snprintf(node, nodelen, "%s", "[local]"); + if (ret == -1 || ret > nodelen) + return EAI_MEMORY; + } + + if (service) + { + ret = snprintf(service, servicelen, "%s", sa->sun_path); + if (ret == -1 || ret > servicelen) + return EAI_MEMORY; + } + + return 0; +} +#endif /* HAVE_UNIX_SOCKETS */ \ No newline at end of file diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 3d1cd2f4..bf804b98 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -118,6 +118,8 @@ static char gtmopts_file[MAXPGPATH]; static char pid_file[MAXPGPATH]; static char conf_file[MAXPGPATH]; +void CreateLockFile(const char *filename, const char *refName); +void DeleteLockFile(const char *filename); /* * Write errors to stderr (or by gtm_equal means when stderr is * not available). @@ -1776,3 +1778,18 @@ pg_realloc(void *ptr, size_t size) write_stderr("out of memory\n"); return tmp; } + +/* + * for compile + */ +void +CreateLockFile(const char *filename, const char *refName) +{ + return; +} + +void +DeleteLockFile(const char *filename) +{ + return; +} \ No newline at end of file diff --git a/src/gtm/libpq/ip.c b/src/gtm/libpq/ip.c index 5cf8f041..6e94a5fc 100644 --- a/src/gtm/libpq/ip.c +++ b/src/gtm/libpq/ip.c @@ -46,6 +46,17 @@ static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, const struct sockaddr_in6 * netmask); #endif +#ifdef HAVE_UNIX_SOCKETS +static int getaddrinfo_unix(const char *path, + const struct addrinfo *hintsp, + struct addrinfo **result); + +static int getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); +#endif + /* * pg_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets @@ -59,6 +70,11 @@ pg_getaddrinfo_all(const char *hostname, const char *servname, /* not all versions of getaddrinfo() zero *result on failure */ *result = NULL; +#ifdef HAVE_UNIX_SOCKETS + if (hintp->ai_family == AF_UNIX) + return getaddrinfo_unix(servname, hintp, result); +#endif + /* NULL has special meaning to getaddrinfo(). */ rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname, servname, hintp, result); @@ -103,6 +119,14 @@ pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen, { int rc; +#ifdef HAVE_UNIX_SOCKETS + if (addr && addr->ss_family == AF_UNIX) + rc = getnameinfo_unix((const struct sockaddr_un *) addr, salen, + node, nodelen, + service, servicelen, + flags); + else +#endif rc = getnameinfo((const struct sockaddr *) addr, salen, node, nodelen, service, servicelen, @@ -322,3 +346,111 @@ pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr) } #endif /* HAVE_IPV6 */ + + +#ifdef HAVE_UNIX_SOCKETS + +/* ------- + * getaddrinfo_unix - get unix socket info using IPv6-compatible API + * + * Bugs: only one addrinfo is set even though hintsp is NULL or + * ai_socktype is 0 + * AI_CANONNAME is not supported. 
+ * ------- + */ +static int +getaddrinfo_unix(const char *path, const struct addrinfo *hintsp, + struct addrinfo **result) +{ + struct addrinfo hints; + struct addrinfo *aip; + struct sockaddr_un *unp; + + *result = NULL; + + MemSet(&hints, 0, sizeof(hints)); + + if (strlen(path) >= sizeof(unp->sun_path)) + return EAI_FAIL; + + if (hintsp == NULL) + { + hints.ai_family = AF_UNIX; + hints.ai_socktype = SOCK_STREAM; + } + else + memcpy(&hints, hintsp, sizeof(hints)); + + if (hints.ai_socktype == 0) + hints.ai_socktype = SOCK_STREAM; + + if (hints.ai_family != AF_UNIX) + { + /* shouldn't have been called */ + return EAI_FAIL; + } + + aip = calloc(1, sizeof(struct addrinfo)); + if (aip == NULL) + return EAI_MEMORY; + + unp = calloc(1, sizeof(struct sockaddr_un)); + if (unp == NULL) + { + free(aip); + return EAI_MEMORY; + } + + aip->ai_family = AF_UNIX; + aip->ai_socktype = hints.ai_socktype; + aip->ai_protocol = hints.ai_protocol; + aip->ai_next = NULL; + aip->ai_canonname = NULL; + *result = aip; + + unp->sun_family = AF_UNIX; + aip->ai_addr = (struct sockaddr *) unp; + aip->ai_addrlen = sizeof(struct sockaddr_un); + + strcpy(unp->sun_path, path); + +#ifdef HAVE_STRUCT_SOCKADDR_STORAGE_SS_LEN + unp->sun_len = sizeof(struct sockaddr_un); +#endif + + return 0; +} + +/* + * Convert an address to a hostname. + */ +static int +getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags) +{ + int ret = -1; + + /* Invalid arguments. */ + if (sa == NULL || sa->sun_family != AF_UNIX || + (node == NULL && service == NULL)) + return EAI_FAIL; + + if (node) + { + ret = snprintf(node, nodelen, "%s", "[local]"); + if (ret == -1 || ret > nodelen) + return EAI_MEMORY; + } + + if (service) + { + ret = snprintf(service, servicelen, "%s", sa->sun_path); + if (ret == -1 || ret > servicelen) + return EAI_MEMORY; + } + + return 0; +} +#endif /* HAVE_UNIX_SOCKETS */ diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c index 3ddb5c91..90a8c93f 100644 --- a/src/gtm/libpq/pqcomm.c +++ b/src/gtm/libpq/pqcomm.c @@ -92,7 +92,6 @@ #include "gtm/libpq-be.h" #include "gtm/elog.h" -#define MAXGTMPATH 256 /* Where the Unix socket file is */ static char sock_path[MAXGTMPATH]; @@ -110,6 +109,20 @@ extern int tcp_keepalives_count; static int internal_putbytes(Port *myport, const char *s, size_t len); static int internal_flush(Port *myport); +#ifdef HAVE_UNIX_SOCKETS +static int Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath); +static int Setup_AF_UNIX(char *sock_path); +#endif /* HAVE_UNIX_SOCKETS */ + +extern void CreateLockFile(const char *filename, const char *refName); +extern void DeleteLockFile(const char *filename); +extern void RemoveSocketFile(void); +/* + * Configuration options + */ +int unix_socket_permissions = 0777; +char *unix_socket_group = ""; + /* * Streams -- wrapper around Unix socket system calls * @@ -126,9 +139,8 @@ static int internal_flush(Port *myport); * * RETURNS: STATUS_OK or STATUS_ERROR */ - int -StreamServerPort(int family, char *hostName, unsigned short portNumber, +StreamServerPort(int family, char *hostName, unsigned short portNumber, char *unixSocketDir, int ListenSocket[], int MaxListen) {// #lizard forgives int fd, @@ -143,6 +155,8 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, struct addrinfo hint; int listen_index = 0; int added = 0; + const char *addrDesc; + char addrBuf[NI_MAXHOST]; #if !defined(WIN32) || defined(IPV6_V6ONLY) int one = 1; @@ -154,6 +168,28 @@ 
StreamServerPort(int family, char *hostName, unsigned short portNumber, hint.ai_flags = AI_PASSIVE; hint.ai_socktype = SOCK_STREAM; +#ifdef HAVE_UNIX_SOCKETS + if (family == AF_UNIX) + { + /* + * Create unixSocketPath from portNumber and unixSocketDir and lock + * that file path + */ + UNIXSOCK_PATH(sock_path, portNumber, unixSocketDir); + if (strlen(sock_path) >= UNIXSOCK_PATH_BUFLEN) + { + ereport(LOG, + (errmsg("Unix-domain socket path \"%s\" is too long (maximum %d bytes)", + sock_path, + (int) (UNIXSOCK_PATH_BUFLEN - 1)))); + return STATUS_ERROR; + } + if (Lock_AF_UNIX(unixSocketDir, sock_path) != STATUS_OK) + return STATUS_ERROR; + service = sock_path; + } + else +#endif /* HAVE_UNIX_SOCKETS */ { snprintf(portNumberStr, sizeof(portNumberStr), "%d", portNumber); service = portNumberStr; @@ -210,6 +246,11 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, case AF_INET6: familyDesc = "IPv6"; break; +#endif +#ifdef HAVE_UNIX_SOCKETS + case AF_UNIX: + familyDesc = "Unix"; + break; #endif default: snprintf(familyDescBuf, sizeof(familyDescBuf), @@ -219,7 +260,22 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, break; } - if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) + /* set up text form of address for log messages */ +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + addrDesc = sock_path; + else +#endif + { + pg_getnameinfo_all((const struct sockaddr_storage *) addr->ai_addr, + addr->ai_addrlen, + addrBuf, sizeof(addrBuf), + NULL, 0, + NI_NUMERICHOST); + addrDesc = addrBuf; + } + + if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) { ereport(LOG, (EACCES, @@ -296,6 +352,16 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, continue; } +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + { + if (Setup_AF_UNIX(service) != STATUS_OK) + { + close(fd); + break; + } + } +#endif #define GTM_MAX_CONNECTIONS 4096 /* @@ -314,6 +380,19 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, close(fd); continue; } + +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + ereport(LOG, + (errmsg("listening on Unix socket \"%s\"", + addrDesc))); + else +#endif + ereport(LOG, + /* translator: first %s is IPv4 or IPv6 */ + (errmsg("listening on %s address \"%s\", port %d", + familyDesc, addrDesc, (int) portNumber))); + ListenSocket[listen_index] = fd; added++; } @@ -327,6 +406,122 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, } +/* + * Create a lockfile for the specified Unix socket file. + */ +static void +CreateSocketLockFile(const char *socketfile, const char *socketDir) +{ + char lockfile[MAXPGPATH]; + + snprintf(lockfile, sizeof(lockfile), "%s.lock", socketfile); + CreateLockFile(lockfile, socketDir); +} + +/* + * Remove a lockfile and Unix socket file. + */ +void +RemoveSocketFile(void) +{ + char lockfile[MAXPGPATH]; + + snprintf(lockfile, sizeof(lockfile), "%s.lock", sock_path); + DeleteLockFile(lockfile); + + (void) unlink(sock_path); +} + +#ifdef HAVE_UNIX_SOCKETS + +/* + * Lock_AF_UNIX -- configure unix socket file path + */ +static int +Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath) +{ + /* + * Grab an interlock file associated with the socket file. + * + * Note: there are two reasons for using a socket lock file, rather than + * trying to interlock directly on the socket itself. First, it's a lot + * more portable, and second, it lets us remove any pre-existing socket + * file without race conditions. 
+ */ + CreateSocketLockFile(unixSocketPath, unixSocketDir); + + /* + * Once we have the interlock, we can safely delete any pre-existing + * socket file to avoid failure at bind() time. + */ + (void) unlink(unixSocketPath); + + return STATUS_OK; +} + + +/* + * Setup_AF_UNIX -- configure unix socket permissions + */ +static int +Setup_AF_UNIX(char *sock_path) +{ + /* + * Fix socket ownership/permission if requested. Note we must do this + * before we listen() to avoid a window where unwanted connections could + * get accepted. + */ + Assert(unix_socket_group); + if (unix_socket_group[0] != '\0') + { +#ifdef WIN32 + elog(WARNING, "configuration item unix_socket_group is not supported on this platform"); +#else + char *endptr; + unsigned long val; + gid_t gid; + + val = strtoul(unix_socket_group, &endptr, 10); + if (*endptr == '\0') + { /* numeric group id */ + gid = val; + } + else + { /* convert group name to id */ + struct group *gr; + + gr = getgrnam(unix_socket_group); + if (!gr) + { + ereport(LOG, + (errmsg("group \"%s\" does not exist", + unix_socket_group))); + return STATUS_ERROR; + } + gid = gr->gr_gid; + } + if (chown(sock_path, -1, gid) == -1) + { + ereport(LOG, + (errmsg("could not set group of file \"%s\": %m", + sock_path))); + return STATUS_ERROR; + } +#endif + } + + if (chmod(sock_path, unix_socket_permissions) == -1) + { + ereport(LOG, + (errmsg("could not set permissions of file \"%s\": %m", + sock_path))); + return STATUS_ERROR; + } + return STATUS_OK; +} +#endif /* HAVE_UNIX_SOCKETS */ + + /* * StreamConnection -- create a new connection with client using * server port. Set port->sock to the FD of the new connection. diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c index f8c1c2b9..f1a9418d 100644 --- a/src/gtm/main/gtm_opt.c +++ b/src/gtm/main/gtm_opt.c @@ -12,7 +12,7 @@ * Written by Peter Eisentraut . * * IDENTIFICATION - * src/backend/utils/misc/guc.c + * src/backend/utils/misc/guc.c * *-------------------------------------------------------------------- */ @@ -55,10 +55,10 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_count; extern int tcp_keepalives_interval; extern char *GTMDataDir; -extern int scale_factor_threads; -extern int worker_thread_number; +extern int scale_factor_threads; +extern int worker_thread_number; #ifdef __TBASE__ -extern bool enable_gtm_sequence_debug; +extern bool enable_gtm_sequence_debug; extern int wal_writer_delay; extern int checkpoint_interval; extern char *archive_command; @@ -77,7 +77,9 @@ extern int GTMStartupGTSDelta; extern int GTMGTSFreezeLimit; #endif - +extern char* unix_socket_directory; +extern char* unix_socket_group; +extern int unix_socket_permissions; /* * We have different sets for client and server message level options because @@ -90,8 +92,8 @@ Gtm_Startup_Mode_Options(); /* * GTM option variables that are exported from this module */ -char *data_directory; -char *GTMConfigFileName; +char *data_directory; +char *GTMConfigFileName; /* * Displayable names for context types (enum GtmContext) @@ -117,23 +119,23 @@ Config_Type_Names(); * TO ADD AN OPTION: * * 1. Declare a global variable of type bool, int, double, or char* - * and make use of it. + * and make use of it. * * 2. Decide at what times it's safe to set the option. See guc.h for - * details. + * details. * * 3. Decide on a name, a default value, upper and lower bounds (if - * applicable), etc. + * applicable), etc. * * 4. Add a record below. * * 5. Add it to src/backend/utils/misc/postgresql.conf.sample, if - * appropriate. + * appropriate. 
* * 6. Don't forget to document the option (at least in config.sgml). * * 7. If it's a new GTMOPT_LIST option you must edit pg_dumpall.c to ensure - * it is not single quoted at dump time. + * it is not single quoted at dump time. */ /* @@ -146,205 +148,205 @@ Config_Type_Names(); struct config_bool ConfigureNamesBool[] = { - { - {GTM_OPTNAME_SYNCHRONOUS_BACKUP, GTMC_STARTUP, - gettext_noop("Specifies if backup to GTM-Standby is taken in synchronous manner."), - gettext_noop("Default value is off."), - 0 - }, - &Backup_synchronously, + { + {GTM_OPTNAME_SYNCHRONOUS_BACKUP, GTMC_STARTUP, + gettext_noop("Specifies if backup to GTM-Standby is taken in synchronous manner."), + gettext_noop("Default value is off."), + 0 + }, + &Backup_synchronously, false, NULL, NULL, false, NULL - }, -#ifdef __TBASE__ - { - {GTM_OPTNAME_SYNCHRONOUS_COMMIT, GTMC_SIGHUP, - gettext_noop("enable GTM synchronous commit."), - gettext_noop("Standby must be connected when set."), - 0 - }, - &enable_sync_commit, + }, +#ifdef __TBASE__ + { + {GTM_OPTNAME_SYNCHRONOUS_COMMIT, GTMC_SIGHUP, + gettext_noop("enable GTM synchronous commit."), + gettext_noop("Standby must be connected when set."), + 0 + }, + &enable_sync_commit, false, NULL, NULL, false, NULL - }, - { + }, + { {GTM_OPTNAME_ENABLE_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM debug print."), - gettext_noop("Default value is off."), - 0 - }, - &enable_gtm_debug, + gettext_noop("enable GTM debug print."), + gettext_noop("Default value is off."), + 0 + }, + &enable_gtm_debug, false, NULL, NULL, false, NULL - }, + }, #ifdef __XLOG__ { {GTM_OPTNAME_ARCHIVE_MODE, GTMC_STARTUP, - gettext_noop("enable archive."), - gettext_noop("Default value is off."), - 0 - }, - &archive_mode, + gettext_noop("enable archive."), + gettext_noop("Default value is off."), + 0 + }, + &archive_mode, false, NULL, NULL, false, NULL - }, - { + }, + { {GTM_OPTNAME_ENABLE_XLOG_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM xlog debug print."), - gettext_noop("Default value is off."), - 0 - }, - &enalbe_gtm_xlog_debug, + gettext_noop("enable GTM xlog debug print."), + gettext_noop("Default value is off."), + 0 + }, + &enalbe_gtm_xlog_debug, false, NULL, NULL, false, NULL - }, + }, #endif - /* Set it as a GUC only if we are running regression. */ - { + /* Set it as a GUC only if we are running regression. 
*/ + { {GTM_OPTNAME_ENABLE_SEQ_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM sequence debug."), - gettext_noop("Default value is off."), - 0 - }, - &enable_gtm_sequence_debug, + gettext_noop("enable GTM sequence debug."), + gettext_noop("Default value is off."), + 0 + }, + &enable_gtm_sequence_debug, #ifdef _PG_REGRESS_ true, NULL, NULL, false, NULL #else false, NULL, NULL, false, NULL #endif - }, + }, #endif - { + { {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, - gettext_noop("Nodes connected with gtm will be readonly."), - gettext_noop("Default value is off."), - 0 - }, - >MClusterReadOnly, + gettext_noop("Nodes connected with gtm will be readonly."), + gettext_noop("Default value is off."), + 0 + }, + >MClusterReadOnly, false, NULL, NULL, false, NULL - }, + }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, false, NULL, NULL, false, NULL - } + } }; struct config_int ConfigureNamesInt[] = { - { - {GTM_OPTNAME_PORT, GTMC_STARTUP, - gettext_noop("Listen Port of GTM or GTM standby server."), - NULL, - 0 - }, - >MPortNumber, + { + {GTM_OPTNAME_PORT, GTMC_STARTUP, + gettext_noop("Listen Port of GTM or GTM standby server."), + NULL, + 0 + }, + >MPortNumber, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { + 0, NULL + }, + { {GTM_OPTNAME_ACTIVE_PORT, GTMC_STARTUP, - gettext_noop("GTM server port number when it works as GTM-Standby."), - NULL, - 0 - }, - &active_port, + gettext_noop("GTM server port number when it works as GTM-Standby."), + NULL, + 0 + }, + &active_port, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_IDLE, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_idle\" option for the connection to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - GTMOPT_UNIT_TIME - }, - &tcp_keepalives_idle, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_IDLE, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_idle\" option for the connection to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + GTMOPT_UNIT_TIME + }, + &tcp_keepalives_idle, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_INTERVAL, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_interval\" option fo the connetion to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - GTMOPT_UNIT_TIME - }, - &tcp_keepalives_interval, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_INTERVAL, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_interval\" option fo the connetion to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + GTMOPT_UNIT_TIME + }, + &tcp_keepalives_interval, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_COUNT, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_count\" option to the connection to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - 0 - }, - &tcp_keepalives_count, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_COUNT, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_count\" option to the connection to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + 0 + }, + &tcp_keepalives_count, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_SCALE_FACTOR_THREADS, GTMC_STARTUP, - gettext_noop("The scale factor of the number of worker thread, zero means disabled."), - NULL, - 0 - }, - &scale_factor_threads, + 0, NULL + }, + { + { + GTM_OPTNAME_SCALE_FACTOR_THREADS, GTMC_STARTUP, + gettext_noop("The scale 
factor of the number of worker thread, zero means disabled."), + NULL, + 0 + }, + &scale_factor_threads, 1, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_WORKER_THREADS_NUMBER, GTMC_STARTUP, - gettext_noop("The number of worker thread, zero means disabled."), - NULL, - 0 - }, - &worker_thread_number, + 0, NULL + }, + { + { + GTM_OPTNAME_WORKER_THREADS_NUMBER, GTMC_STARTUP, + gettext_noop("The number of worker thread, zero means disabled."), + NULL, + 0 + }, + &worker_thread_number, 2, 0, INT_MAX, NULL, NULL, - 0, NULL - }, + 0, NULL + }, #ifdef __XLOG__ - { - { + { + { GTM_OPTNAME_WAL_WRITER_DELAY, GTMC_STARTUP, - gettext_noop("Wal_writer will flush xlog every wal_writer_delay ms."), - NULL, - 0 - }, - &wal_writer_delay, + gettext_noop("Wal_writer will flush xlog every wal_writer_delay ms."), + NULL, + 0 + }, + &wal_writer_delay, 100, 10, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { + 0, NULL + }, + { + { GTM_OPTNAME_CHECKPOINT_INTERVAL, GTMC_STARTUP, - gettext_noop("Checkpointer will do checkpoint every checkpoint_interval minute."), - NULL, - 0 - }, - &checkpoint_interval, + gettext_noop("Checkpointer will do checkpoint every checkpoint_interval minute."), + NULL, + 0 + }, + &checkpoint_interval, 30, 1, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_MAX_RESERVED_WAL_NUMBER, GTMC_STARTUP, - gettext_noop("Max number of reserved xlog segments."), - NULL, - 0 - }, - &max_reserved_wal_number, + 0, NULL + }, + { + { + GTM_OPTNAME_MAX_RESERVED_WAL_NUMBER, GTMC_STARTUP, + gettext_noop("Max number of reserved xlog segments."), + NULL, + 0 + }, + &max_reserved_wal_number, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, - gettext_noop("Max number of wal senders."), - NULL, - 0 - }, - &max_wal_sender, + 0, NULL + }, + { + { + GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, + gettext_noop("Max number of wal senders."), + NULL, + 0 + }, + &max_wal_sender, 3, 0, 100, NULL, NULL, - 0, NULL - }, + 0, NULL + }, { { GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, @@ -357,204 +359,222 @@ struct config_int ConfigureNamesInt[] = 0, NULL }, #endif - { - { - GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, - gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), - NULL, - 0 - }, - >MGTSFreezeLimit, + { + { + GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, + gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), + NULL, + 0 + }, + >MGTSFreezeLimit, 365 * 100, 0, INT_MAX, NULL, NULL, - 0, NULL - }, + 0, NULL + }, + { + { + GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, + gettext_noop("Add -d seconds to GTS when started"), + NULL, + 0 + }, + >MStartupGTSDelta, + 300 , 0, INT_MAX, NULL, NULL, + 0, NULL + }, + { { - GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, - gettext_noop("Add -d seconds to GTS when started"), + GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, + gettext_noop("Sets the access permissions of the Unix-domain socket." + "Unix-domain sockets use the usual Unix file system " + "permission set. The parameter value is expected " + "to be a numeric mode specification in the form " + "accepted by the chmod and umask system calls. 
" + "(To use the customary octal format the number must " + "start with a 0 (zero).)"), NULL, 0 }, - >MStartupGTSDelta, - 300 , 0, INT_MAX, NULL, NULL, + &unix_socket_permissions, + 0777, 0000, 0777, NULL, NULL, 0, NULL }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0, 0, 0, NULL, NULL, 0, NULL - } + } }; struct config_real ConfigureNamesReal[] = { - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0.0, 0.0, 0.0, NULL, NULL, 0.0, NULL - } + } }; struct config_string ConfigureNamesString[] = { - { - {GTM_OPTNAME_DATA_DIR, GTMC_STARTUP, - gettext_noop("Work directory."), - NULL, - 0 - }, - >MDataDir, - NULL, + { + {GTM_OPTNAME_DATA_DIR, GTMC_STARTUP, + gettext_noop("Work directory."), + NULL, + 0 + }, + >MDataDir, + NULL, NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { + { {GTM_OPTNAME_CONFIG_FILE, GTMC_STARTUP, - gettext_noop("Configuration file name."), - NULL, - 0 - }, - >MConfigFileName, - CONFIG_FILENAME, + gettext_noop("Configuration file name."), + NULL, + 0 + }, + >MConfigFileName, + CONFIG_FILENAME, NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { - {GTM_OPTNAME_NODENAME, GTMC_STARTUP, - gettext_noop("Name of this GTM/GTM-Standby."), - NULL, - 0 - }, - &NodeName, - "gtm", + { + {GTM_OPTNAME_NODENAME, GTMC_STARTUP, + gettext_noop("Name of this GTM/GTM-Standby."), + NULL, + 0 + }, + &NodeName, + "gtm", NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { - {GTM_OPTNAME_LISTEN_ADDRESSES, GTMC_STARTUP, - gettext_noop("Listen address."), - NULL, - 0 - }, - &ListenAddresses, - "*", + { + {GTM_OPTNAME_LISTEN_ADDRESSES, GTMC_STARTUP, + gettext_noop("Listen address."), + NULL, + 0 + }, + &ListenAddresses, + "*", NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_ACTIVE_HOST, GTMC_STARTUP, - gettext_noop("Address of target GTM ACT."), - gettext_noop("This parameter is effective only when it runs as GTM-Standby"), - 0 - }, - &active_addr, - NULL, + gettext_noop("Address of target GTM ACT."), + gettext_noop("This parameter is effective only when it runs as GTM-Standby"), + 0 + }, + &active_addr, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_LOG_FILE, GTMC_STARTUP, - gettext_noop("Log file name."), - NULL, - 0 - }, - >MLogFile, - "gtm.log", + gettext_noop("Log file name."), + NULL, + 0 + }, + >MLogFile, + "gtm.log", NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_ERROR_REPORTER, GTMC_STARTUP, - gettext_noop("Command to report various errors."), - NULL, - 0 - }, - &error_reporter, - NULL, + gettext_noop("Command to report various errors."), + NULL, + 0 + }, + &error_reporter, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_STATUS_READER, GTMC_STARTUP, - gettext_noop("Command to get status of global XC node status."), - gettext_noop("Runs when configuration file is read by SIGHUP"), - 0 - }, - &status_reader, - NULL, + gettext_noop("Command to get status of global XC node status."), + gettext_noop("Runs when configuration file is read by SIGHUP"), + 0 + }, + &status_reader, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, #ifdef __XLOG__ { {GTM_OPTNAME_ARCHIVE_COMMAND, GTMC_STARTUP, - gettext_noop("Archive use this command to backup xlog."), - NULL, - 0 - }, - &archive_command, - NULL, + gettext_noop("Archive use this command to backup xlog."), + NULL, + 0 + }, + &archive_command, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, { - {GTM_OPTNAME_SYNCHRONOUS_STANDBY_NAMES, 
GTMC_SIGHUP, - gettext_noop("to indicate which are synchronous slaves."), - NULL, - 0 - }, - &synchronous_standby_names, + {GTM_OPTNAME_SYNCHRONOUS_STANDBY_NAMES, GTMC_SIGHUP, + gettext_noop("to indicate which are synchronous slaves."), + NULL, + 0 + }, + &synchronous_standby_names, "", NULL, NULL, - NULL, NULL - }, - - { - {GTM_OPTNAME_APPLICATION_NAME, GTMC_STARTUP, - gettext_noop("application name used in sync replication indication"), - NULL, - 0 - }, - &application_name, - "", + NULL, NULL + }, + + { + {GTM_OPTNAME_APPLICATION_NAME, GTMC_STARTUP, + gettext_noop("application name used in sync replication indication"), + NULL, + 0 + }, + &application_name, + "", NULL, NULL, - NULL, NULL - }, - { - {GTM_OPTNAME_RECOVERY_COMMAND, GTMC_STARTUP, - gettext_noop("Point in time recovery,recovery command"), - NULL, - 0 - }, - &recovery_command, - NULL, + NULL, NULL + }, + { + {GTM_OPTNAME_RECOVERY_COMMAND, GTMC_STARTUP, + gettext_noop("Point in time recovery,recovery command"), + NULL, + 0 + }, + &recovery_command, + NULL, NULL, NULL, - NULL, NULL - }, - { - {GTM_OPTNAME_RECOVERY_TARGET_GLOBALTIMESTAMP, GTMC_STARTUP, - gettext_noop("Point in time recovery,recovery timestamp"), - NULL, - 0 - }, - &recovery_target_timestamp, - NULL, + NULL, NULL + }, + { + {GTM_OPTNAME_RECOVERY_TARGET_GLOBALTIMESTAMP, GTMC_STARTUP, + gettext_noop("Point in time recovery,recovery timestamp"), + NULL, + 0 + }, + &recovery_target_timestamp, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, #endif - { + { {GTM_OPTNAME_STARTUP_GTS_SET, GTMC_STARTUP, gettext_noop("Force start GTM with this GTS"), NULL, @@ -564,47 +584,75 @@ struct config_string ConfigureNamesString[] = NULL, NULL, NULL, NULL, NULL + }, + + { + {GTM_OPTNAME_UNIX_SOCKET_DIRECTORY, GTMC_STARTUP, + gettext_noop("Sets the directory where Unix-domain sockets will be created."), + NULL, + 0 + }, + &unix_socket_directory, +#ifdef HAVE_UNIX_SOCKETS + DEFAULT_PGSOCKET_DIR, +#else + "", +#endif + NULL, NULL, + NULL, NULL }, - /* End-of-list marker */ { + {GTM_OPTNAME_UNIX_SOCKET_GROUP, GTMC_STARTUP, + gettext_noop("Sets the owning group of the Unix-domain socket."), + NULL, + 0 + }, + &unix_socket_group, + "", + NULL, NULL, + NULL, NULL + }, + + /* End-of-list marker */ + { {NULL, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL, NULL - } + } }; struct config_enum ConfigureNamesEnum[] = { - { + { {GTM_OPTNAME_LOG_MIN_MESSAGES, GTMC_STARTUP, - gettext_noop("Minimum message level to write to the log file."), - NULL, - 0 - }, - &log_min_messages, - WARNING, - server_message_level_options, + gettext_noop("Minimum message level to write to the log file."), + NULL, + 0 + }, + &log_min_messages, + WARNING, + server_message_level_options, NULL,NULL, - WARNING, NULL - }, + WARNING, NULL + }, - { + { {GTM_OPTNAME_STARTUP, GTMC_STARTUP, - gettext_noop("Specifies startup mode, act or standby."), - NULL, - 0 - }, - >M_StandbyMode, - GTM_ACT_MODE, - gtm_startup_mode_options, + gettext_noop("Specifies startup mode, act or standby."), + NULL, + 0 + }, + >M_StandbyMode, + GTM_ACT_MODE, + gtm_startup_mode_options, NULL,NULL, - GTM_ACT_MODE, NULL - }, + GTM_ACT_MODE, NULL + }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0, NULL,NULL,NULL, 0, NULL - } + } }; /******** end of options list ********/ @@ -615,12 +663,12 @@ struct config_enum ConfigureNamesEnum[] = struct config_generic **gtm_opt_variables; /* Current number of variables contained in the vector */ -int num_gtm_opt_variables; +int num_gtm_opt_variables; /* Vector 
capacity */ -int size_gtm_opt_variables; +int size_gtm_opt_variables; -bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */ +bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */ -int GTMOptUpdateCount = 0; /* Indicates when specific option is updated */ +int GTMOptUpdateCount = 0; /* Indicates when specific option is updated */ diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index ded1a044..81fa43c6 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -116,6 +116,7 @@ bool GTMClusterReadOnly; char *GTMStartupGTSSet; int GTMGTSFreezeLimit; int GTMStartupGTSDelta; +char *unix_socket_directory = NULL; #endif GTM_MutexLock control_lock; @@ -229,11 +230,12 @@ static void GTM_RegisterPGXCNode(Port *myport, char *PGXCNodeName); static bool CreateOptsFile(int argc, char *argv[]); static void CreateDataDirLockFile(void); -static void CreateLockFile(const char *filename, const char *refName); +void CreateLockFile(const char *filename, const char *refName); static void SetDataDir(void); static void ChangeToDataDir(void); static void checkDataDir(void); -static void DeleteLockFile(const char *filename); +void DeleteLockFile(const char *filename); +extern void RemoveSocketFile(void); static void PromoteToActive(void); #ifndef __XLOG__ static void ProcessSyncStandbyCommand(Port *myport, GTM_MessageType mtype, StringInfo message); @@ -614,7 +616,7 @@ main(int argc, char *argv[]) #endif bool force_xid = false; - int process_thread_num; + int process_thread_num = 0; bool do_basebackup = false; /* * Local variable to hold command line options. @@ -938,10 +940,12 @@ main(int argc, char *argv[]) if (strcmp(ListenAddresses, "*") == 0) status = StreamServerPort(AF_UNSPEC, NULL, (unsigned short) GTMPortNumber, + NULL, ListenSocket, MAXLISTEN); else status = StreamServerPort(AF_UNSPEC, ListenAddresses, (unsigned short) GTMPortNumber, + NULL, ListenSocket, MAXLISTEN); if (status != STATUS_OK) @@ -950,6 +954,25 @@ main(int argc, char *argv[]) ListenAddresses))); } +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + if (unix_socket_directory) + { + status = StreamServerPort(AF_UNIX, NULL, + (unsigned short) GTMPortNumber, + unix_socket_directory, + ListenSocket, MAXLISTEN); + + if (status != STATUS_OK) + { + ereport(FATAL, + (errmsg("could not create Unix-domain socket in directory \"%s\"", + unix_socket_directory))); + } + } +#endif +#endif + /* * check that we have some socket to listen on */ @@ -1658,6 +1681,12 @@ ServerLoop(void) #ifdef __XLOG__ /* Delete pid file */ DeleteLockFile(GTM_PID_FILE); +#endif + +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + RemoveSocketFile(); +#endif #endif elog(LOG, "GTM exits"); exit(1); @@ -4599,7 +4628,7 @@ CreateDataDirLockFile() * amPostmaster is used to determine how to encode the output PID. * isDDLock and refName are used to determine what error message to produce. 
*/ -static void +void CreateLockFile(const char *filename, const char *refName) {// #lizard forgives int fd; @@ -4806,7 +4835,7 @@ CreateOptsFile(int argc, char *argv[]) } /* delete pid file */ -static void +void DeleteLockFile(const char *filename) { if (unlink(filename) < 0) diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 7ea36366..5508c06d 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -232,11 +232,11 @@ static void GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, static bool CreateOptsFile(int argc, char *argv[]); static void CreateDataDirLockFile(void); -static void CreateLockFile(const char *filename, const char *refName); +void CreateLockFile(const char *filename, const char *refName); static void SetDataDir(void); static void ChangeToDataDir(void); static void checkDataDir(void); -static void DeleteLockFile(const char *filename); +void DeleteLockFile(const char *filename); static void RegisterProxy(bool is_reconnect); static void UnregisterProxy(void); static GTM_Conn *ConnectGTM(void); @@ -877,10 +877,12 @@ main(int argc, char *argv[]) if (strcmp(ListenAddresses, "*") == 0) status = StreamServerPort(AF_UNSPEC, NULL, (unsigned short) GTMProxyPortNumber, + NULL, ListenSocket, MAXLISTEN); else status = StreamServerPort(AF_UNSPEC, ListenAddresses, (unsigned short) GTMProxyPortNumber, + NULL, ListenSocket, MAXLISTEN); if (status == STATUS_OK) @@ -2800,7 +2802,7 @@ CreateDataDirLockFile() * amPostmaster is used to determine how to encode the output PID. * isDDLock and refName are used to determine what error message to produce. */ -static void +void CreateLockFile(const char *filename, const char *refName) {// #lizard forgives int fd; @@ -2999,7 +3001,7 @@ CreateOptsFile(int argc, char *argv[]) } /* delete pid file */ -static void +void DeleteLockFile(const char *filename) { if (unlink(filename) < 0) diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b5a18302..b5257f1a 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -393,7 +393,7 @@ typedef enum #define GTM_SYNC_CYCLE (5 * GTM_GTS_ONE_SECOND) #define GTM_SYNC_TIME_LIMIT (60 * GTM_GTS_ONE_SECOND) #define GTM_LOG_COLLECT_CYCLE (5 * GTM_GTS_ONE_SECOND) - +#define MAXGTMPATH 256 #pragma pack() #endif diff --git a/src/include/gtm/gtm_opt.h b/src/include/gtm/gtm_opt.h index 9dc07802..8bd33409 100644 --- a/src/include/gtm/gtm_opt.h +++ b/src/include/gtm/gtm_opt.h @@ -37,7 +37,7 @@ * configuration file, or by client request in the connection startup * packet (e.g., from libpq's PGOPTIONS variable). Furthermore, an * already-started backend will ignore changes to such an option in the - * configuration file. The idea is that these options are fixed for a + * configuration file. The idea is that these options are fixed for a * given backend once it's started, but they can vary across backends. * * SUSET options can be set at postmaster startup, with the SIGHUP @@ -47,10 +47,10 @@ */ typedef enum { - GTMC_DEFAULT, - GTMC_STARTUP, - GTMC_SIGHUP, - GTMC_USERSET + GTMC_DEFAULT, + GTMC_STARTUP, + GTMC_SIGHUP, + GTMC_USERSET } GtmOptContext; /* @@ -69,7 +69,7 @@ typedef enum * * GTMC_S_TEST is used when testing values to be stored as per-database or * per-user defaults ("doit" will always be false, so this never gets stored - * as the actual source of any value). This is an interactive case, but + * as the actual source of any value). 
This is an interactive case, but * it needs its own source value because some assign hooks need to make * different validity checks in this case. * @@ -77,19 +77,19 @@ typedef enum */ typedef enum { - GTMC_S_DEFAULT, /* hard-wired default ("boot_val") */ - GTMC_S_DYNAMIC_DEFAULT, /* default computed during initialization */ - GTMC_S_ENV_VAR, /* postmaster environment variable *//* Not used in GTM */ - GTMC_S_FILE, /* gtm.conf or gtm_proxy.conf */ - GTMC_S_ARGV, /* postmaster command line */ - GTMC_S_DATABASE, /* per-database setting *//* Not used in GTM */ - GTMC_S_USER, /* per-user setting *//* Not used in GTM */ - GTMC_S_DATABASE_USER, /* per-user-and-database setting *//* Not used in GTM */ - GTMC_S_CLIENT, /* from client connection request *//* Not used in GTM */ - GTMC_S_OVERRIDE, /* special case to forcibly set default *//* Not used in GTM */ - GTMC_S_INTERACTIVE, /* dividing line for error reporting *//* Not used in GTM */ - GTMC_S_TEST, /* test per-database or per-user setting *//* Not used in GTM */ - GTMC_S_SESSION /* SET command *//* Not used in GTM */ + GTMC_S_DEFAULT, /* hard-wired default ("boot_val") */ + GTMC_S_DYNAMIC_DEFAULT, /* default computed during initialization */ + GTMC_S_ENV_VAR, /* postmaster environment variable *//* Not used in GTM */ + GTMC_S_FILE, /* gtm.conf or gtm_proxy.conf */ + GTMC_S_ARGV, /* postmaster command line */ + GTMC_S_DATABASE, /* per-database setting *//* Not used in GTM */ + GTMC_S_USER, /* per-user setting *//* Not used in GTM */ + GTMC_S_DATABASE_USER, /* per-user-and-database setting *//* Not used in GTM */ + GTMC_S_CLIENT, /* from client connection request *//* Not used in GTM */ + GTMC_S_OVERRIDE, /* special case to forcibly set default *//* Not used in GTM */ + GTMC_S_INTERACTIVE, /* dividing line for error reporting *//* Not used in GTM */ + GTMC_S_TEST, /* test per-database or per-user setting *//* Not used in GTM */ + GTMC_S_SESSION /* SET command *//* Not used in GTM */ } GtmOptSource; /* @@ -98,19 +98,19 @@ typedef enum */ typedef struct ConfigVariable { - char *name; - char *value; - char *filename; - int sourceline; - struct ConfigVariable *next; + char *name; + char *value; + char *filename; + int sourceline; + struct ConfigVariable *next; } ConfigVariable; extern bool ParseConfigFile(const char *config_file, const char *calling_file, - int depth, int elevel, - ConfigVariable **head_p, ConfigVariable **tail_p); + int depth, int elevel, + ConfigVariable **head_p, ConfigVariable **tail_p); extern bool ParseConfigFp(FILE *fp, const char *config_file, - int depth, int elevel, - ConfigVariable **head_p, ConfigVariable **tail_p); + int depth, int elevel, + ConfigVariable **head_p, ConfigVariable **tail_p); extern void FreeConfigVariables(ConfigVariable *list); /* @@ -120,9 +120,9 @@ extern void FreeConfigVariables(ConfigVariable *list); */ struct config_enum_entry { - const char *name; - int val; - bool hidden; + const char *name; + int val; + bool hidden; }; /* @@ -152,10 +152,10 @@ typedef const char *(*GtmOptShowHook) (void); */ typedef enum { - /* Types of set_config_option actions */ - GTMOPT_ACTION_SET, /* regular SET command */ - GTMOPT_ACTION_LOCAL, /* SET LOCAL command */ - GTMOPT_ACTION_SAVE /* function SET option */ + /* Types of set_config_option actions */ + GTMOPT_ACTION_SET, /* regular SET command */ + GTMOPT_ACTION_LOCAL, /* SET LOCAL command */ + GTMOPT_ACTION_SAVE /* function SET option */ } GtmOptAction; #define GTMOPT_QUALIFIER_SEPARATOR '.' 
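/*
 * A minimal usage sketch for the GTM Unix-domain socket options this patch
 * introduces (unix_socket_directory, unix_socket_group,
 * unix_socket_permissions; their GTM_OPTNAME_* names are defined further
 * down in this header).  Assuming the usual name = value gtm.conf syntax,
 * a GTM instance could be configured along these lines -- the values are
 * illustrative only; the built-in defaults are DEFAULT_PGSOCKET_DIR, an
 * empty group and 0777:
 *
 *     unix_socket_directory = '/tmp'
 *     unix_socket_group = ''
 *     unix_socket_permissions = 0700
 *
 * With a directory set, main.c additionally listens via
 * StreamServerPort(AF_UNIX, ...), and the socket file is created as
 * .s.GTM.<port> under that directory (see the UNIXSOCK_PATH change in
 * pqcomm.h below); RemoveSocketFile() cleans it up on shutdown.
 */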
@@ -163,34 +163,34 @@ typedef enum /* * bit values in "flags" of a GUC variable */ -#define GTMOPT_LIST_INPUT 0x0001 /* input can be list format */ -#define GTMOPT_LIST_QUOTE 0x0002 /* double-quote list elements */ -#define GTMOPT_NO_SHOW_ALL 0x0004 /* exclude from SHOW ALL */ -#define GTMOPT_NO_RESET_ALL 0x0008 /* exclude from RESET ALL */ -#define GTMOPT_REPORT 0x0010 /* auto-report changes to client */ -#define GTMOPT_NOT_IN_SAMPLE 0x0020 /* not in postgresql.conf.sample */ -#define GTMOPT_DISALLOW_IN_FILE 0x0040 /* can't set in postgresql.conf */ -#define GTMOPT_CUSTOM_PLACEHOLDER 0x0080 /* placeholder for custom variable */ -#define GTMOPT_SUPERUSER_ONLY 0x0100 /* show only to superusers */ -#define GTMOPT_IS_NAME 0x0200 /* limit string to NAMEDATALEN-1 */ - -#define GTMOPT_UNIT_KB 0x0400 /* value is in kilobytes */ -#define GTMOPT_UNIT_BLOCKS 0x0800 /* value is in blocks */ -#define GTMOPT_UNIT_XBLOCKS 0x0C00 /* value is in xlog blocks */ -#define GTMOPT_UNIT_MEMORY 0x0C00 /* mask for KB, BLOCKS, XBLOCKS */ - -#define GTMOPT_UNIT_MS 0x1000 /* value is in milliseconds */ -#define GTMOPT_UNIT_S 0x2000 /* value is in seconds */ -#define GTMOPT_UNIT_MIN 0x4000 /* value is in minutes */ -#define GTMOPT_UNIT_TIME 0x7000 /* mask for MS, S, MIN */ - -#define GTMOPT_NOT_WHILE_SEC_REST 0x8000 /* can't set if security restricted */ +#define GTMOPT_LIST_INPUT 0x0001 /* input can be list format */ +#define GTMOPT_LIST_QUOTE 0x0002 /* double-quote list elements */ +#define GTMOPT_NO_SHOW_ALL 0x0004 /* exclude from SHOW ALL */ +#define GTMOPT_NO_RESET_ALL 0x0008 /* exclude from RESET ALL */ +#define GTMOPT_REPORT 0x0010 /* auto-report changes to client */ +#define GTMOPT_NOT_IN_SAMPLE 0x0020 /* not in postgresql.conf.sample */ +#define GTMOPT_DISALLOW_IN_FILE 0x0040 /* can't set in postgresql.conf */ +#define GTMOPT_CUSTOM_PLACEHOLDER 0x0080 /* placeholder for custom variable */ +#define GTMOPT_SUPERUSER_ONLY 0x0100 /* show only to superusers */ +#define GTMOPT_IS_NAME 0x0200 /* limit string to NAMEDATALEN-1 */ + +#define GTMOPT_UNIT_KB 0x0400 /* value is in kilobytes */ +#define GTMOPT_UNIT_BLOCKS 0x0800 /* value is in blocks */ +#define GTMOPT_UNIT_XBLOCKS 0x0C00 /* value is in xlog blocks */ +#define GTMOPT_UNIT_MEMORY 0x0C00 /* mask for KB, BLOCKS, XBLOCKS */ + +#define GTMOPT_UNIT_MS 0x1000 /* value is in milliseconds */ +#define GTMOPT_UNIT_S 0x2000 /* value is in seconds */ +#define GTMOPT_UNIT_MIN 0x4000 /* value is in minutes */ +#define GTMOPT_UNIT_TIME 0x7000 /* mask for MS, S, MIN */ + +#define GTMOPT_NOT_WHILE_SEC_REST 0x8000 /* can't set if security restricted */ /* * Functions exported by gtm_opt.c */ extern void SetConfigOption(const char *name, const char *value, - GtmOptContext context, GtmOptSource source); + GtmOptContext context, GtmOptSource source); extern void EmitWarningsOnPlaceholders(const char *className); @@ -200,17 +200,17 @@ extern bool ProcessConfigFile(GtmOptContext context); extern void InitializeGTMOptions(void); extern bool SelectConfigFiles(const char *userDoption, const char *progname); extern void ResetAllOptions(void); -extern int NewGTMNestLevel(void); +extern int NewGTMNestLevel(void); extern bool parse_int(const char *value, int *result, int flags, - const char **hintmsg); + const char **hintmsg); extern bool parse_real(const char *value, double *result); extern bool set_config_option(const char *name, const char *value, - GtmOptContext context, GtmOptSource source, - bool changeVal); + GtmOptContext context, GtmOptSource source, + bool changeVal); extern 
char *GetConfigOptionByName(const char *name, const char **varname); extern void GetConfigOptionByNum(int varnum, const char **values, bool *noshow); -extern int GetNumConfigOptions(void); +extern int GetNumConfigOptions(void); extern void ParseLongOption(const char *string, char **name, char **value); #ifndef PG_KRB_SRVTAB @@ -223,9 +223,9 @@ extern void ParseLongOption(const char *string, char **name, char **value); /* upper limit for GUC variables measured in kilobytes of memory */ /* note that various places assume the byte size fits in a "long" variable */ #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4 -#define MAX_KILOBYTES INT_MAX +#define MAX_KILOBYTES INT_MAX #else -#define MAX_KILOBYTES (INT_MAX / 1024) +#define MAX_KILOBYTES (INT_MAX / 1024) #endif #ifdef TRACE_SORT @@ -245,20 +245,20 @@ extern volatile sig_atomic_t ConfigReloadPending; */ #define Server_Message_Level_Options()\ static const struct config_enum_entry server_message_level_options[] = {\ - {"debug", DEBUG2, true},\ - {"debug5", DEBUG5, false},\ - {"debug4", DEBUG4, false},\ - {"debug3", DEBUG3, false},\ - {"debug2", DEBUG2, false},\ - {"debug1", DEBUG1, false},\ - {"info", INFO, false},\ - {"notice", NOTICE, false},\ - {"warning", WARNING, false},\ - {"error", ERROR, false},\ - {"log", LOG, false},\ - {"fatal", FATAL, false},\ - {"panic", PANIC, false},\ - {NULL, 0, false}\ + {"debug", DEBUG2, true},\ + {"debug5", DEBUG5, false},\ + {"debug4", DEBUG4, false},\ + {"debug3", DEBUG3, false},\ + {"debug2", DEBUG2, false},\ + {"debug1", DEBUG1, false},\ + {"info", INFO, false},\ + {"notice", NOTICE, false},\ + {"warning", WARNING, false},\ + {"error", ERROR, false},\ + {"log", LOG, false},\ + {"fatal", FATAL, false},\ + {"panic", PANIC, false},\ + {NULL, 0, false}\ } /* @@ -266,9 +266,9 @@ static const struct config_enum_entry server_message_level_options[] = {\ */ #define Gtm_Startup_Mode_Options()\ static const struct config_enum_entry gtm_startup_mode_options[] = {\ - {"act", GTM_ACT_MODE, false},\ - {"standby", GTM_STANDBY_MODE, false},\ - {NULL, 0, false}\ + {"act", GTM_ACT_MODE, false},\ + {"standby", GTM_STANDBY_MODE, false},\ + {NULL, 0, false}\ } /* @@ -279,8 +279,8 @@ static const struct config_enum_entry gtm_startup_mode_options[] = {\ #define gtmOptContext_Names()\ const char *const GtmOptContext_Names[] =\ {\ - /* GTMC_STGARTUP */ "startup",\ - /* GTMC_SIGHUP */ "sighup"\ + /* GTMC_STGARTUP */ "startup",\ + /* GTMC_SIGHUP */ "sighup"\ } /* @@ -291,19 +291,19 @@ const char *const GtmOptContext_Names[] =\ #define gtmOptSource_Names()\ const char *const GtmOptSource_Names[] =\ {\ - /* GTMC_S_DEFAULT */ "default",\ - /* GTMC_S_DYNAMIC_DEFAULT */ "default",\ - /* GTMC_S_ENV_VAR */ "environment variable",\ - /* GTMC_S_FILE */ "configuration file",\ - /* GTMC_S_ARGV */ "command line",\ - /* GTMC_S_DATABASE */ "database",\ - /* GTMC_S_USER */ "user",\ - /* GTMC_S_DATABASE_USER */ "database user",\ - /* GTMC_S_CLIENT */ "client",\ - /* GTMC_S_OVERRIDE */ "override",\ - /* GTMC_S_INTERACTIVE */ "interactive",\ - /* GTMC_S_TEST */ "test",\ - /* GTMC_S_SESSION */ "session"\ + /* GTMC_S_DEFAULT */ "default",\ + /* GTMC_S_DYNAMIC_DEFAULT */ "default",\ + /* GTMC_S_ENV_VAR */ "environment variable",\ + /* GTMC_S_FILE */ "configuration file",\ + /* GTMC_S_ARGV */ "command line",\ + /* GTMC_S_DATABASE */ "database",\ + /* GTMC_S_USER */ "user",\ + /* GTMC_S_DATABASE_USER */ "database user",\ + /* GTMC_S_CLIENT */ "client",\ + /* GTMC_S_OVERRIDE */ "override",\ + /* GTMC_S_INTERACTIVE */ "interactive",\ + /* GTMC_S_TEST */ 
"test",\ + /* GTMC_S_SESSION */ "session"\ } /* @@ -314,11 +314,11 @@ const char *const GtmOptSource_Names[] =\ #define Config_Type_Names()\ const char *const config_type_names[] =\ {\ - /* GTMC_BOOL */ "bool",\ - /* GTMC_INT */ "integer",\ - /* GTMC_REAL */ "real",\ - /* GTMC_STRING */ "string",\ - /* GTMC_ENUM */ "enum"\ + /* GTMC_BOOL */ "bool",\ + /* GTMC_INT */ "integer",\ + /* GTMC_REAL */ "real",\ + /* GTMC_STRING */ "string",\ + /* GTMC_ENUM */ "enum"\ } @@ -328,37 +328,37 @@ const char *const config_type_names[] =\ * This will be used both in *.conf and command line option override. */ -#define GTM_OPTNAME_ACTIVE_HOST "active_host" -#define GTM_OPTNAME_ACTIVE_PORT "active_port" -#define GTM_OPTNAME_CONFIG_FILE "config_file" -#define GTM_OPTNAME_DATA_DIR "data_dir" -#define GTM_OPTNAME_ERROR_REPORTER "error_reporter" +#define GTM_OPTNAME_ACTIVE_HOST "active_host" +#define GTM_OPTNAME_ACTIVE_PORT "active_port" +#define GTM_OPTNAME_CONFIG_FILE "config_file" +#define GTM_OPTNAME_DATA_DIR "data_dir" +#define GTM_OPTNAME_ERROR_REPORTER "error_reporter" #define GTM_OPTNAME_CONNECT_RETRY_INTERVAL "gtm_connect_retry_interval" -#define GTM_OPTNAME_GTM_HOST "gtm_host" -#define GTM_OPTNAME_GTM_PORT "gtm_port" -#define GTM_OPTNAME_KEEPALIVES_IDLE "keepalives_idle" -#define GTM_OPTNAME_KEEPALIVES_INTERVAL "keepalives_interval" -#define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" -#define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" -#define GTM_OPTNAME_LOG_FILE "log_file" -#define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" -#define GTM_OPTNAME_NODENAME "nodename" -#define GTM_OPTNAME_PORT "port" -#define GTM_OPTNAME_STARTUP "startup" -#define GTM_OPTNAME_STATUS_READER "status_reader" -#define GTM_OPTNAME_SYNCHRONOUS_BACKUP "synchronous_backup" -#define GTM_OPTNAME_WORKER_THREADS "worker_threads" +#define GTM_OPTNAME_GTM_HOST "gtm_host" +#define GTM_OPTNAME_GTM_PORT "gtm_port" +#define GTM_OPTNAME_KEEPALIVES_IDLE "keepalives_idle" +#define GTM_OPTNAME_KEEPALIVES_INTERVAL "keepalives_interval" +#define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" +#define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" +#define GTM_OPTNAME_LOG_FILE "log_file" +#define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" +#define GTM_OPTNAME_NODENAME "nodename" +#define GTM_OPTNAME_PORT "port" +#define GTM_OPTNAME_STARTUP "startup" +#define GTM_OPTNAME_STATUS_READER "status_reader" +#define GTM_OPTNAME_SYNCHRONOUS_BACKUP "synchronous_backup" +#define GTM_OPTNAME_WORKER_THREADS "worker_threads" #define GTM_OPTNAME_ENABLE_DEBUG "enable_gtm_debug" #define GTM_OPTNAME_ENABLE_SEQ_DEBUG "enable_gtm_sequence_debug" -#define GTM_OPTNAME_SCALE_FACTOR_THREADS "scale_factor_threads" -#define GTM_OPTNAME_WORKER_THREADS_NUMBER "worker_thread_number" +#define GTM_OPTNAME_SCALE_FACTOR_THREADS "scale_factor_threads" +#define GTM_OPTNAME_WORKER_THREADS_NUMBER "worker_thread_number" #define GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT "gtm_freeze_time_limit" #define GTM_OPTNAME_STARTUP_GTS_DELTA "gtm_startup_gts_delta" #define GTM_OPTNAME_STARTUP_GTS_SET "gtm_startup_gts_set" #define GTM_OPTNAME_CLUSTER_READ_ONLY "gtm_cluster_read_only" #ifdef __XLOG__ -#define GTM_OPTNAME_SYNCHRONOUS_COMMIT "synchronous_commit" +#define GTM_OPTNAME_SYNCHRONOUS_COMMIT "synchronous_commit" #define GTM_OPTNAME_WAL_WRITER_DELAY "wal_writer_delay" #define GTM_OPTNAME_CHECKPOINT_INTERVAL "checkpoint_interval" #define GTM_OPTNAME_ARCHIVE_COMMAND "archive_command" @@ -372,8 +372,9 @@ const char *const config_type_names[] =\ #define 
GTM_OPTNAME_RECOVERY_COMMAND "recovery_command" #endif - - +#define GTM_OPTNAME_UNIX_SOCKET_DIRECTORY "unix_socket_directory" +#define GTM_OPTNAME_UNIX_SOCKET_GROUP "unix_socket_group" +#define GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS "unix_socket_permissions" #endif /* GTM_OPT_H */ diff --git a/src/include/gtm/libpq.h b/src/include/gtm/libpq.h index 2a60d8c9..8c2ac2b0 100644 --- a/src/include/gtm/libpq.h +++ b/src/include/gtm/libpq.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * libpq.h - * POSTGRES LIBPQ buffer structure definitions. + * POSTGRES LIBPQ buffer structure definitions. * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group @@ -29,19 +29,19 @@ * prototypes for functions in pqcomm.c */ extern int StreamServerPort(int family, char *hostName, - unsigned short portNumber, int ListenSocket[], - int MaxListen); -extern int StreamConnection(int server_fd, Port *port); + unsigned short portNumber, char *unixSocketDir, int ListenSocket[], + int MaxListen); +extern int StreamConnection(int server_fd, Port *port); extern void StreamClose(int sock); extern void TouchSocketFile(void); extern void pq_comm_reset(void); -extern int pq_getbytes(Port *myport, char *s, size_t len); -extern int pq_getstring(Port *myport, StringInfo s); -extern int pq_getmessage(Port *myport, StringInfo s, int maxlen); -extern int pq_getbyte(Port *myport); -extern int pq_peekbyte(Port *myport); -extern int pq_putbytes(Port *myport, const char *s, size_t len); -extern int pq_flush(Port *myport); -extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len); +extern int pq_getbytes(Port *myport, char *s, size_t len); +extern int pq_getstring(Port *myport, StringInfo s); +extern int pq_getmessage(Port *myport, StringInfo s, int maxlen); +extern int pq_getbyte(Port *myport); +extern int pq_peekbyte(Port *myport); +extern int pq_putbytes(Port *myport, const char *s, size_t len); +extern int pq_flush(Port *myport); +extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len); #endif /* LIBPQ_H */ diff --git a/src/include/gtm/pqcomm.h b/src/include/gtm/pqcomm.h index d98934db..e39a72a2 100644 --- a/src/include/gtm/pqcomm.h +++ b/src/include/gtm/pqcomm.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pqcomm.h - * Definitions common to frontends and backends. + * Definitions common to frontends and backends. * * NOTE: for historical reasons, this does not correspond to pqcomm.c. * pqcomm.c's routines are declared in libpq.h. @@ -26,24 +26,37 @@ typedef struct { - struct sockaddr_storage addr; - size_t salen; + struct sockaddr_storage addr; + size_t salen; } SockAddr; /* Configure the UNIX socket location for the well known port. */ #define UNIXSOCK_PATH(path, port, sockdir) \ - snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", \ - ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ - DEFAULT_PGSOCKET_DIR, \ - (port)) + snprintf(path, sizeof(path), "%s/.s.GTM.%d", \ + ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ + DEFAULT_PGSOCKET_DIR, \ + (port)) /* * In protocol 3.0 and later, the startup packet length is not fixed, but - * we set an arbitrary limit on it anyway. This is just to prevent simple + * we set an arbitrary limit on it anyway. This is just to prevent simple * denial-of-service attacks via sending enough data to run the server * out of memory. 
*/ #define MAX_STARTUP_PACKET_LENGTH 10000 + +/* + * The maximum workable length of a socket path is what will fit into + * struct sockaddr_un. This is usually only 100 or so bytes :-(. + * + * For consistency, always pass a MAXPGPATH-sized buffer to UNIXSOCK_PATH(), + * then complain if the resulting string is >= UNIXSOCK_PATH_BUFLEN bytes. + * (Because the standard API for getaddrinfo doesn't allow it to complain in + * a useful way when the socket pathname is too long, we have to test for + * this explicitly, instead of just letting the subroutine return an error.) + */ +#define UNIXSOCK_PATH_BUFLEN sizeof(((struct sockaddr_un *) NULL)->sun_path) + #endif /* PQCOMM_H */ diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index f32e7db9..0c27047e 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * postmaster.h - * Exports from postmaster/postmaster.c. + * Exports from postmaster/postmaster.c. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -15,15 +15,15 @@ /* GUC options */ extern bool EnableSSL; -extern int ReservedBackends; -extern int PostPortNumber; -extern int Unix_socket_permissions; +extern int ReservedBackends; +extern int PostPortNumber; +extern int Unix_socket_permissions; extern char *Unix_socket_group; extern char *Unix_socket_directories; extern char *ListenAddresses; extern bool ClientAuthInProgress; -extern int PreAuthDelay; -extern int AuthenticationTimeout; +extern int PreAuthDelay; +extern int AuthenticationTimeout; extern bool Log_connections; extern bool log_hostname; extern bool enable_bonjour; @@ -37,6 +37,7 @@ extern char *g_BouncerConf; extern bool enable_null_string; extern bool g_concurrently_index; extern bool g_set_global_snapshot; +extern char *gtm_unix_socket_directory; #endif #ifdef __COLD_HOT__ @@ -66,15 +67,15 @@ enum #ifdef WIN32 extern HANDLE PostmasterHandle; #else -extern int postmaster_alive_fds[2]; +extern int postmaster_alive_fds[2]; /* * Constants that represent which of postmaster_alive_fds is held by * postmaster, and which is used in children to check for postmaster death. */ -#define POSTMASTER_FD_WATCH 0 /* used in children to check for - * postmaster death */ -#define POSTMASTER_FD_OWN 1 /* kept open by postmaster only */ +#define POSTMASTER_FD_WATCH 0 /* used in children to check for + * postmaster death */ +#define POSTMASTER_FD_OWN 1 /* kept open by postmaster only */ #endif extern const char *progname; @@ -82,9 +83,9 @@ extern const char *progname; extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); extern void ClosePostmasterPorts(bool am_syslogger); -extern int MaxLivePostmasterChildren(void); +extern int MaxLivePostmasterChildren(void); -extern int GetNumShmemAttachedBgworkers(void); +extern int GetNumShmemAttachedBgworkers(void); extern bool PostmasterMarkPIDForWorkerNotify(int); #ifdef EXEC_BACKEND @@ -105,10 +106,10 @@ extern void ShmemBackendArrayAllocation(void); * compute 4*MaxBackends without any overflow check. This is rechecked in the * relevant GUC check hooks and in RegisterBackgroundWorker(). 
*/ -#define MAX_BACKENDS 0x3FFFF +#define MAX_BACKENDS 0x3FFFF #ifdef __TBASE__ extern void PostmasterEnableLogTimeout(void); extern void PostmasterDisableTimeout(void); extern bool PostmasterIsPrimaryAndNormal(void); #endif -#endif /* _POSTMASTER_H */ +#endif /* _POSTMASTER_H */ From 55318242da135554e697df6231e60e25c30cbf2c Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 20 Jan 2021 16:21:54 +0800 Subject: [PATCH 327/578] Bugfix: gtm switch cause prepared statement can not use, ID84618929 --- src/backend/access/transam/gtm.c | 85 ++++++++- src/backend/access/transam/xact.c | 11 ++ src/backend/utils/misc/guc.c | 21 +++ src/include/access/gtm.h | 3 + src/include/access/xact.h | 287 +++++++++++++++--------------- 5 files changed, 257 insertions(+), 150 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 969311e5..daf77a90 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -56,6 +56,10 @@ int NewGtmPort = -1; bool g_GTM_skip_catalog = false; char *gtm_unix_socket_directory = DEFAULT_PGSOCKET_DIR; #endif + +int reconnect_gtm_retry_times = 3; +int reconnect_gtm_retry_interval = 500; + char *GtmHost = NULL; int GtmPort = 0; static int GtmConnectTimeout = 60; @@ -82,6 +86,7 @@ List *g_DropSeqList = NULL; List *g_AlterSeqList = NULL; #define GTM_SEQ_POSTFIX "_$TBASE$_" static void CheckConnection(void); +static void ResetGTMConnection(void); static int GetGTMStoreStatus(GTMStorageStatus *header); static int GetGTMStoreSequence(GTM_StoredSeqInfo **store_seq); static int GetGTMStoreTransaction(GTM_StoredTransactionInfo **store_txn); @@ -1107,6 +1112,10 @@ GetMasterGtmInfo(void) /* If NewGtmHost and NewGtmPort, just use it. */ if (NewGtmHost && NewGtmPort != 0) { + elog(LOG, + "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", + NewGtmHost, NewGtmPort); + GtmHost = strdup(NewGtmHost); GtmPort = NewGtmPort; @@ -1114,9 +1123,6 @@ GetMasterGtmInfo(void) NewGtmHost = NULL; NewGtmPort = 0; - elog(LOG, - "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", - NewGtmHost, NewGtmPort); return; } @@ -1184,6 +1190,53 @@ CheckConnection(void) } } +static void +ResetGTMConnection(void) +{ + Relation rel; + HeapScanDesc scan; + HeapTuple gtmtup; + Form_pgxc_node nodeForm; + bool found = false; + + CloseGTM(); + ResetGtmInfo(); + + /* + * We must be sure there is no error report, because we may be + * in AbortTransaction now. + * 1.If we are not in a inprogress or commit transaction, we should not open relation. + * 2.If we do not get lock, it is ok to try it next time. 
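+ * On success, the primary GTM's host and port found in pgxc_node are cached
+ * in GtmHost/GtmPort, and InitGTM() below reconnects using them.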
+ */ + if ( (IsTransactionState() || IsTransactionCommit()) && + ConditionalLockRelationOid(PgxcNodeRelationId, AccessShareLock)) + { + rel = relation_open(PgxcNodeRelationId, NoLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + /* Only one record will match */ + while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) + { + nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); + if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) + { + GtmHost = strdup(NameStr(nodeForm->node_host)); + GtmPort = nodeForm->node_port; + found = true; + break; + } + } + heap_endscan(scan); + relation_close(rel, AccessShareLock); + + if (!found) + { + elog(LOG, "can not get master gtm info from pgxc_node"); + } + } + + InitGTM(); +} + void InitGTM(void) {// #lizard forgives @@ -1382,6 +1435,7 @@ GetGlobalTimestampGTM(void) GTM_Timestamp latest_gts = InvalidGlobalTimestamp; struct rusage start_r; struct timeval start_t; + int retry_cnt = 0; if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); @@ -1400,21 +1454,38 @@ GetGlobalTimestampGTM(void) /* If something went wrong (timeout), try and reset GTM connection * and retry. This is safe at the beginning of a transaction. */ - if (!GlobalTimestampIsValid(gts_result.gts)) + while (!GlobalTimestampIsValid(gts_result.gts) && + retry_cnt < reconnect_gtm_retry_times) { if(GTMDebugPrint) { elog(LOG, "get global timestamp reconnect"); } - CloseGTM(); - InitGTM(); + + ResetGTMConnection(); + retry_cnt++; + + elog(DEBUG5, "reset gtm connection %d times", retry_cnt); + if (conn) { gts_result = get_global_timestamp(conn); + if (GlobalTimestampIsValid(gts_result.gts)) + { + elog(DEBUG5, "retry get global timestamp gts " INT64_FORMAT, + gts_result.gts); + break; + } } else if(GTMDebugPrint) { - elog(LOG, "get global timestamp conn is null after retry"); + elog(LOG, "get global timestamp conn is null after retry %d times", + retry_cnt); + } + + if (retry_cnt < reconnect_gtm_retry_times) + { + pg_usleep(reconnect_gtm_retry_interval * 1000); } } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 91cd002a..0f80f39c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -540,6 +540,17 @@ IsTransactionState(void) return (s->state == TRANS_INPROGRESS); } +/* + * IsTransactionCommit + * + * This returns true if transaction state is TRANS_COMMIT + */ +bool +IsTransactionCommit(void) +{ + return (CurrentTransactionState->state == TRANS_COMMIT); +} + /* * IsAbortedTransactionBlockState * diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a0db968c..6e3a43dc 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4757,6 +4757,27 @@ static struct config_int ConfigureNamesInt[] = }, #endif + { + {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("reconnect gtm retry times"), + NULL + }, + &reconnect_gtm_retry_times, + 3, 0, 100, + NULL, NULL, NULL + }, + + { + {"reconnect_gtm_retry_interval", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("reconnect gtm retry interval"), + NULL, + GUC_UNIT_MS + }, + &reconnect_gtm_retry_interval, + 500, 0, 60000, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index aa11962c..5da0eb6a 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -107,6 +107,9 @@ extern 
char *NewGtmHost; extern int NewGtmPort; #endif +extern int reconnect_gtm_retry_times; +extern int reconnect_gtm_retry_interval; + extern bool IsGTMConnected(void); extern void InitGTM(void); extern void CloseGTM(void); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index d63c63ed..a06c14d4 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * xact.h - * postgres transaction system definitions + * postgres transaction system definitions * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -33,12 +33,12 @@ /* * Xact isolation levels */ -#define XACT_READ_UNCOMMITTED 0 -#define XACT_READ_COMMITTED 1 -#define XACT_REPEATABLE_READ 2 -#define XACT_SERIALIZABLE 3 +#define XACT_READ_UNCOMMITTED 0 +#define XACT_READ_COMMITTED 1 +#define XACT_REPEATABLE_READ 2 +#define XACT_SERIALIZABLE 3 -extern int DefaultXactIsoLevel; +extern int DefaultXactIsoLevel; extern PGDLLIMPORT int XactIsoLevel; /* @@ -68,19 +68,19 @@ extern bool XactDeferrable; typedef enum { - SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */ - SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */ - SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote - * write */ - SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */ - SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local flush and remote apply */ -} SyncCommitLevel; + SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */ + SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */ + SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote + * write */ + SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */ + SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local flush and remote apply */ +} SyncCommitLevel; /* Define the default setting for synchronous_commit */ -#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH +#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH /* Synchronous commit level */ -extern int synchronous_commit; +extern int synchronous_commit; /* * Miscellaneous flag bits to record events which occur on the top level @@ -89,48 +89,48 @@ extern int synchronous_commit; * globally accessible, so can be set from anywhere in the code which requires * recording flags. */ -extern int MyXactFlags; +extern int MyXactFlags; /* * XACT_FLAGS_ACCESSEDTEMPREL - set when a temporary relation is accessed. We * don't allow PREPARE TRANSACTION in that case. */ -#define XACT_FLAGS_ACCESSEDTEMPREL (1U << 0) +#define XACT_FLAGS_ACCESSEDTEMPREL (1U << 0) /* * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK - records whether the top level xact * logged any Access Exclusive Locks. 
*/ -#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1) +#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1) /* - * start- and end-of-transaction callbacks for dynamically loaded modules + * start- and end-of-transaction callbacks for dynamically loaded modules */ typedef enum { - XACT_EVENT_COMMIT, - XACT_EVENT_PARALLEL_COMMIT, - XACT_EVENT_ABORT, - XACT_EVENT_PARALLEL_ABORT, - XACT_EVENT_PREPARE, - XACT_EVENT_PRE_COMMIT, - XACT_EVENT_PARALLEL_PRE_COMMIT, - XACT_EVENT_PRE_PREPARE + XACT_EVENT_COMMIT, + XACT_EVENT_PARALLEL_COMMIT, + XACT_EVENT_ABORT, + XACT_EVENT_PARALLEL_ABORT, + XACT_EVENT_PREPARE, + XACT_EVENT_PRE_COMMIT, + XACT_EVENT_PARALLEL_PRE_COMMIT, + XACT_EVENT_PRE_PREPARE } XactEvent; typedef void (*XactCallback) (XactEvent event, void *arg); typedef enum { - SUBXACT_EVENT_START_SUB, - SUBXACT_EVENT_COMMIT_SUB, - SUBXACT_EVENT_ABORT_SUB, - SUBXACT_EVENT_PRE_COMMIT_SUB + SUBXACT_EVENT_START_SUB, + SUBXACT_EVENT_COMMIT_SUB, + SUBXACT_EVENT_ABORT_SUB, + SUBXACT_EVENT_PRE_COMMIT_SUB } SubXactEvent; typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg); + SubTransactionId parentSubid, void *arg); #ifdef PGXC /* @@ -138,16 +138,16 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, */ typedef enum { - GTM_EVENT_COMMIT, - GTM_EVENT_ABORT, - GTM_EVENT_PREPARE + GTM_EVENT_COMMIT, + GTM_EVENT_ABORT, + GTM_EVENT_PREPARE } GTMEvent; typedef void (*GTMCallback) (GTMEvent event, void *arg); #endif /* ---------------- - * transaction-related XLOG entries + * transaction-related XLOG entries * ---------------- */ @@ -155,24 +155,24 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * XLOG allows to store some information in high 4 bits of log record xl_info * field. We use 3 for the opcode, and one about an optional flag variable. */ -#define XLOG_XACT_COMMIT 0x00 -#define XLOG_XACT_PREPARE 0x10 -#define XLOG_XACT_ABORT 0x20 -#define XLOG_XACT_COMMIT_PREPARED 0x30 -#define XLOG_XACT_ABORT_PREPARED 0x40 -#define XLOG_XACT_ASSIGNMENT 0x50 +#define XLOG_XACT_COMMIT 0x00 +#define XLOG_XACT_PREPARE 0x10 +#define XLOG_XACT_ABORT 0x20 +#define XLOG_XACT_COMMIT_PREPARED 0x30 +#define XLOG_XACT_ABORT_PREPARED 0x40 +#define XLOG_XACT_ASSIGNMENT 0x50 #ifdef __TBASE__ /* free opcode 0x60 */ -#define XLOG_XACT_ACQUIRE_GTS 0x60 +#define XLOG_XACT_ACQUIRE_GTS 0x60 #endif /* free opcode 0x70 */ /* mask for filtering opcodes out of xl_info */ -#define XLOG_XACT_OPMASK 0x70 +#define XLOG_XACT_OPMASK 0x70 /* does this record have a 'xinfo' field or not */ -#define XLOG_XACT_HAS_INFO 0x80 +#define XLOG_XACT_HAS_INFO 0x80 /* record 2plc file for readonly explicit transaction */ #define XLOG_XACT_RECORD_READONLY 0x90 @@ -180,13 +180,13 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * The following flags, stored in xinfo, determine which information is * contained in commit/abort records. 
*/ -#define XACT_XINFO_HAS_DBINFO (1U << 0) -#define XACT_XINFO_HAS_SUBXACTS (1U << 1) -#define XACT_XINFO_HAS_RELFILENODES (1U << 2) -#define XACT_XINFO_HAS_INVALS (1U << 3) -#define XACT_XINFO_HAS_TWOPHASE (1U << 4) -#define XACT_XINFO_HAS_ORIGIN (1U << 5) -#define XACT_XINFO_HAS_AE_LOCKS (1U << 6) +#define XACT_XINFO_HAS_DBINFO (1U << 0) +#define XACT_XINFO_HAS_SUBXACTS (1U << 1) +#define XACT_XINFO_HAS_RELFILENODES (1U << 2) +#define XACT_XINFO_HAS_INVALS (1U << 3) +#define XACT_XINFO_HAS_TWOPHASE (1U << 4) +#define XACT_XINFO_HAS_ORIGIN (1U << 5) +#define XACT_XINFO_HAS_AE_LOCKS (1U << 6) /* * Also stored in xinfo, these indicating a variety of additional actions that @@ -196,24 +196,24 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * EOXact... routines which run at the end of the original transaction * completion. */ -#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29) -#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30) -#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31) +#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29) +#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30) +#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31) /* Access macros for above flags */ #define XactCompletionApplyFeedback(xinfo) \ - ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0) + ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0) #define XactCompletionRelcacheInitFileInval(xinfo) \ - ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0) + ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0) #define XactCompletionForceSyncCommit(xinfo) \ - ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) + ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) typedef struct xl_xact_assignment { - TransactionId xtop; /* assigned XID's top-level XID */ - int nsubxacts; /* number of subtransaction XIDs */ - TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ + TransactionId xtop; /* assigned XID's top-level XID */ + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ } xl_xact_assignment; #define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub) @@ -237,78 +237,78 @@ typedef struct xl_xact_assignment typedef struct xl_xact_xinfo { - /* - * Even though we right now only require 1 byte of space in xinfo we use - * four so following records don't have to care about alignment. Commit - * records can be large, so copying large portions isn't attractive. - */ - uint32 xinfo; + /* + * Even though we right now only require 1 byte of space in xinfo we use + * four so following records don't have to care about alignment. Commit + * records can be large, so copying large portions isn't attractive. 
+ */ + uint32 xinfo; } xl_xact_xinfo; typedef struct xl_xact_dbinfo { - Oid dbId; /* MyDatabaseId */ - Oid tsId; /* MyDatabaseTableSpace */ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ } xl_xact_dbinfo; typedef struct xl_xact_subxacts { - int nsubxacts; /* number of subtransaction XIDs */ - TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER]; + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_subxacts; #define MinSizeOfXactSubxacts offsetof(xl_xact_subxacts, subxacts) typedef struct xl_xact_relfilenodes { - int nrels; /* number of subtransaction XIDs */ - RelFileNode xnodes[FLEXIBLE_ARRAY_MEMBER]; + int nrels; /* number of subtransaction XIDs */ + RelFileNode xnodes[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_relfilenodes; #define MinSizeOfXactRelfilenodes offsetof(xl_xact_relfilenodes, xnodes) typedef struct xl_xact_invals { - int nmsgs; /* number of shared inval msgs */ - SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; + int nmsgs; /* number of shared inval msgs */ + SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_invals; #define MinSizeOfXactInvals offsetof(xl_xact_invals, msgs) typedef struct xl_xact_twophase { - TransactionId xid; + TransactionId xid; } xl_xact_twophase; typedef struct xl_xact_origin { - XLogRecPtr origin_lsn; - TimestampTz origin_timestamp; + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; } xl_xact_origin; typedef struct xl_xact_commit { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; /* time of commit */ - - /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ - /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ - /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ - /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */ - /* xl_xact_invals follows if XINFO_HAS_INVALS */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ - /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; /* time of commit */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ + /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ + /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */ + /* xl_xact_invals follows if XINFO_HAS_INVALS */ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ } xl_xact_commit; #define MinSizeOfXactCommit (offsetof(xl_xact_commit, xact_time) + sizeof(TimestampTz)) typedef struct xl_xact_abort { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; /* time of abort */ - - /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ - /* No db_info required */ - /* xl_xact_subxacts follows if HAS_SUBXACT */ - /* xl_xact_relfilenodes follows if HAS_RELFILENODES */ - /* No invalidation messages needed. */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; /* time of abort */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* No db_info required */ + /* xl_xact_subxacts follows if HAS_SUBXACT */ + /* xl_xact_relfilenodes follows if HAS_RELFILENODES */ + /* No invalidation messages needed. 
*/ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ } xl_xact_abort; #define MinSizeOfXactAbort sizeof(xl_xact_abort) @@ -319,48 +319,48 @@ typedef struct xl_xact_abort */ typedef struct xl_xact_parsed_commit { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; - uint32 xinfo; + uint32 xinfo; - Oid dbId; /* MyDatabaseId */ - Oid tsId; /* MyDatabaseTableSpace */ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ - int nsubxacts; - TransactionId *subxacts; + int nsubxacts; + TransactionId *subxacts; - int nrels; - RelFileNode *xnodes; + int nrels; + RelFileNode *xnodes; - int nmsgs; - SharedInvalidationMessage *msgs; + int nmsgs; + SharedInvalidationMessage *msgs; - TransactionId twophase_xid; /* only for 2PC */ + TransactionId twophase_xid; /* only for 2PC */ - XLogRecPtr origin_lsn; - TimestampTz origin_timestamp; + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; } xl_xact_parsed_commit; typedef struct xl_xact_parsed_abort { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; - uint32 xinfo; + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; + uint32 xinfo; - int nsubxacts; - TransactionId *subxacts; + int nsubxacts; + TransactionId *subxacts; - int nrels; - RelFileNode *xnodes; + int nrels; + RelFileNode *xnodes; - TransactionId twophase_xid; /* only for 2PC */ + TransactionId twophase_xid; /* only for 2PC */ } xl_xact_parsed_abort; #ifdef __TBASE__ typedef struct xl_xact_acquire_gts { - TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz global_timestamp; /* logical global timestamp */ }xl_xact_acquire_gts; #endif @@ -483,12 +483,12 @@ typedef enum REMOTE_ABORT /* from pgxc_node_remote_abort */ }CurrentOperation; /* record twophase trans operation before receive responses */ -typedef struct ConnTransState /* record twophase trasaction state of each connection*/ +typedef struct ConnTransState /* record twophase trasaction state of each connection*/ { bool is_participant; ConnState conn_state; /* record state of each connection in twophase trans */ - TwoPhaseTransState state; /* state of twophase trans in each connection */ - int handle_idx; /* index of dn_handles or cn_handles */ + TwoPhaseTransState state; /* state of twophase trans in each connection */ + int handle_idx; /* index of dn_handles or cn_handles */ }ConnTransState; typedef struct AllConnNodeInfo @@ -503,14 +503,14 @@ typedef struct LocalTwoPhaseState bool is_start_node; bool is_readonly; /* since explicit transaction can be readonly, need to record readonly in 2pc file */ bool is_after_prepare; /* record whether the transaction pass the whole prepare phase */ - char *gid; /* gid of twophase transaction*/ - TwoPhaseTransState state; /* global twophase state */ - ConnTransState *coord_state; /* each coord participants state */ + char *gid; /* gid of twophase transaction*/ + TwoPhaseTransState state; /* global twophase state */ + ConnTransState *coord_state; /* each coord participants state */ int coord_index; /* index of coord_state */ - ConnTransState *datanode_state; + ConnTransState *datanode_state; int datanode_index; /* index of datanode_state */ bool isprinted; /* is printed in AbortTransaction */ - char start_node_name[NAMEDATALEN]; /* twophase trans startnode */ + char start_node_name[NAMEDATALEN]; /* twophase trans startnode */ TransactionId start_xid; char 
*participants; PGXCNodeAllHandles *handles; /* handles in each phase in twophase trans */ @@ -522,10 +522,11 @@ extern LocalTwoPhaseState g_twophase_state; #endif /* ---------------- - * extern definitions + * extern definitions * ---------------- */ extern bool IsTransactionState(void); +extern bool IsTransactionCommit(void); extern bool IsAbortedTransactionBlockState(void); extern TransactionId GetTopTransactionId(void); extern TransactionId GetTopTransactionIdIfAny(void); @@ -606,7 +607,7 @@ extern void SetCurrentStatementStartTimestamp(void); extern TimestampTz GetCurrentGTMStartTimestamp(void); extern void SetCurrentGTMDeltaTimestamp(TimestampTz timestamp); #endif -extern int GetCurrentTransactionNestLevel(void); +extern int GetCurrentTransactionNestLevel(void); extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); extern void CommandCounterIncrement(void); extern void ForceSyncCommit(void); @@ -671,22 +672,22 @@ extern bool IsPGXCNodeXactDatanodeDirect(void); extern void TransactionRecordXidWait(TransactionId xid); #endif -extern int xactGetCommittedChildren(TransactionId **ptr); +extern int xactGetCommittedChildren(TransactionId **ptr); extern XLogRecPtr XactLogCommitRecord(TimestampTz global_timestamp, - TimestampTz commit_time, - int nsubxacts, TransactionId *subxacts, - int nrels, RelFileNode *rels, - int nmsgs, SharedInvalidationMessage *msgs, - bool relcacheInval, bool forceSync, - int xactflags, - TransactionId twophase_xid); + TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, bool forceSync, + int xactflags, + TransactionId twophase_xid); extern XLogRecPtr XactLogAbortRecord(TimestampTz global_timestamp, - TimestampTz abort_time, - int nsubxacts, TransactionId *subxacts, - int nrels, RelFileNode *rels, - int xactflags, TransactionId twophase_xid); + TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int xactflags, TransactionId twophase_xid); extern void xact_redo(XLogReaderState *record); /* xactdesc.c */ @@ -701,4 +702,4 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); -#endif /* XACT_H */ +#endif /* XACT_H */ From 513207d06e4f48bf85d374fc7ba98697790f9607 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 24 Feb 2021 22:37:43 +0800 Subject: [PATCH 328/578] Fix coredump due to node number changes. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131084977493&url_cache_key=d4e1402777dc733479aac463ad1a9d24 When a transcation is running, we hold pooler_reload until transaction finishes and rebuild global memory that related with node number after reload. 
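In outline: HandlePoolerReload() no longer rebuilds the node handles from the signal path; it only calls RequestInvalidateRemoteHandles() to set HandlesInvalidatePending, and the main loop in postgres.c picks the flag up via CheckInvalidateRemoteHandles() before reading the next client command. The rebuild itself is guarded so it cannot run mid-transaction. A minimal sketch of that guard (simplified from the DoInvalidateRemoteHandles() hunk in this patch; error handling and the refresh flag are omitted):

    static bool
    DoInvalidateRemoteHandles(void)
    {
        /* Defer the rebuild while anything transactional is in flight. */
        if (InterruptHoldoffCount != 0 ||
            !IsTransactionIdle() ||
            GetGlobalCommitTimestamp() != InvalidGlobalTimestamp)
            return false;               /* HandlesInvalidatePending stays set */

        HOLD_INTERRUPTS();

        /* Rebuilding node handles must run inside a transaction block. */
        StartTransactionCommand();
        InitMultinodeExecutor(true);
        CommitTransactionCommand();

        /* Drop pooled connections so fresh node info is fetched next time. */
        PoolManagerDisconnect();

        HandlesInvalidatePending = false;
        RESUME_INTERRUPTS();
        return false;
    }

Because the node count may have changed by the time the rebuild runs, InitMultinodeExecutor() now also rebuilds the datanode_queries prepared-statement hash (its entry size depends on NumDataNodes) and frees the per-transaction node statistics array; see RebuildDatanodeQueryHashTable() and clean_stat_transaction() below.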
--- src/backend/access/transam/xact.c | 3 - src/backend/commands/prepare.c | 48 +++++++++++++++ src/backend/pgxc/nodemgr/nodemgr.c | 9 +++ src/backend/pgxc/pool/execRemote.c | 15 +++++ src/backend/pgxc/pool/pgxcnode.c | 53 +++++++++++++---- src/backend/pgxc/pool/poolutils.c | 38 +----------- src/backend/tcop/postgres.c | 4 +- src/backend/tcop/utility.c | 10 ++++ src/backend/utils/hash/dynahash.c | 9 +++ src/include/commands/prepare.h | 1 + src/include/pgxc/execRemote.h | 1 + src/include/pgxc/nodemgr.h | 1 + src/include/pgxc/pgxcnode.h | 3 + src/include/utils/hsearch.h | 95 +++++++++++++++--------------- 14 files changed, 187 insertions(+), 103 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 0f80f39c..f43288dc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -8168,9 +8168,6 @@ IsTransactionIdle(void) } } - elog(WARNING,"reload is be processing in transaction. trans state: %d", CurrentTransactionState->state); - elog(WARNING,"reload is be processing in transaction. trans block state: %d", CurrentTransactionState->blockState); - return false; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 7cf29f5f..1dc55563 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -479,6 +479,54 @@ InitQueryHashTable(void) #endif } +/* + * Rebuild query hash table. + */ +void +RebuildDatanodeQueryHashTable(void) +{ + HASHCTL hash_ctl; + HASH_SEQ_STATUS seq; + DatanodeStatement *entry; + DatanodeStatement *entry_tmp; + Size original_entry_size; + HTAB *datanode_queries_tmp = NULL; + + if (!IS_PGXC_COORDINATOR || !datanode_queries) + { + return; + } + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = sizeof(DatanodeStatement) + NumDataNodes * sizeof(int); + + original_entry_size = hash_get_entry_size(datanode_queries); + + /* node number not changed, no need to rebuild */ + if (original_entry_size == hash_ctl.entrysize) + { + return ; + } + + datanode_queries_tmp = hash_create("Datanode Queries", + 64, + &hash_ctl, + HASH_ELEM); + /* walk over cache */ + hash_seq_init(&seq, datanode_queries); + while ((entry = hash_seq_search(&seq)) != NULL) + { + /* Now we can copy the hash table entry */ + entry_tmp = (DatanodeStatement *) hash_search(datanode_queries_tmp, entry->stmt_name, + HASH_ENTER, NULL); + memcpy(entry_tmp, entry, original_entry_size); + } + + hash_destroy(datanode_queries); + datanode_queries = datanode_queries_tmp; +} + #ifdef PGXC /* * Assign the statement name for all the RemoteQueries in the plan tree, so diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 570aadee..830b1b8d 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -573,6 +573,15 @@ count_coords_datanodes(Relation rel, int *num_coord, int *num_dns) *num_dns = dnCount; } +/* + * Whether node changes happened + */ +bool +PrimaryNodeNumberChanged(void) +{ + return (*shmemNumCoords + *shmemNumDataNodes != NumCoords + NumDataNodes); +} + /* * PgxcNodeListAndCount * diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c781abcf..c11ab9d7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -189,6 +189,21 @@ stat_statement() current_tran_statements++; } +/* + * clean memory related to stat transaction + */ +void +clean_stat_transaction(void) +{ + if(!nodes_per_transaction) + { + return ; 
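The datanode_queries rebuild above is the usual dynahash re-creation idiom; restated as a stand-alone sketch (the helper name and the Min() cap on the copy are illustrative additions, the patch itself copies original_entry_size):

    static void
    rebuild_datanode_queries_sketch(void)
    {
        HASHCTL            ctl;
        HASH_SEQ_STATUS    seq;
        DatanodeStatement *old_entry;
        DatanodeStatement *new_entry;
        HTAB              *new_tab;
        Size               old_size = hash_get_entry_size(datanode_queries);

        MemSet(&ctl, 0, sizeof(ctl));
        ctl.keysize = NAMEDATALEN;
        ctl.entrysize = sizeof(DatanodeStatement) + NumDataNodes * sizeof(int);

        if (old_size == ctl.entrysize)
            return;                     /* node count unchanged, keep the table */

        /* dynahash entries cannot grow in place, so build a new table ... */
        new_tab = hash_create("Datanode Queries", 64, &ctl, HASH_ELEM);

        /* ... and copy every entry over by its statement name */
        hash_seq_init(&seq, datanode_queries);
        while ((old_entry = hash_seq_search(&seq)) != NULL)
        {
            new_entry = hash_search(new_tab, old_entry->stmt_name,
                                    HASH_ENTER, NULL);
            memcpy(new_entry, old_entry, Min(old_size, ctl.entrysize));
        }

        hash_destroy(datanode_queries);
        datanode_queries = new_tab;
    }

Capping the copy at the smaller of the two entry sizes matters if the number of datanodes can shrink: copying the original, larger entry size into the smaller new entry would overrun it.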
+ } + + free(nodes_per_transaction); + nodes_per_transaction = NULL; +} + /* * To collect statistics: count a transaction */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index b69e928d..353f6b7f 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -229,7 +229,6 @@ InitMultinodeExecutor(bool is_force) PGXCNodeHandlesLookupEnt *node_handle_ent = NULL; #endif - /* Free all the existing information first */ if (is_force) pgxc_node_all_free(); @@ -245,6 +244,10 @@ InitMultinodeExecutor(bool is_force) /* Get classified list of node Oids */ PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids, &NumCoords, &NumDataNodes, &NumSlaveDataNodes, true); + /* Process node number related memory */ + RebuildDatanodeQueryHashTable(); + clean_stat_transaction(); + #ifdef XCP /* * Coordinator and datanode handles should be available during all the @@ -3629,12 +3632,6 @@ get_any_handle(List *datanodelist) errmsg("Invalid NULL node list"))); } - if (HandlesInvalidatePending) - if (DoInvalidateRemoteHandles()) - ereport(ERROR, - (errcode(ERRCODE_QUERY_CANCELED), - errmsg("canceling transaction due to cluster configuration reset by administrator command"))); - if (HandlesRefreshPending) if (DoRefreshRemoteHandles()) ereport(ERROR, @@ -3772,12 +3769,6 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool /* index of the result array */ int i = 0; - if (HandlesInvalidatePending) - if (DoInvalidateRemoteHandles()) - ereport(ERROR, - (errcode(ERRCODE_QUERY_CANCELED), - errmsg("canceling transaction due to cluster configuration reset by administrator command"))); - if (HandlesRefreshPending) if (DoRefreshRemoteHandles()) ereport(ERROR, @@ -5011,6 +5002,21 @@ PoolerMessagesPending(void) return false; } +/* + * Check HandleInvalidatePending flag + */ +void +CheckInvalidateRemoteHandles(void) +{ + if (!HandlesInvalidatePending) + return ; + + if (DoInvalidateRemoteHandles()) + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); +} + /* * For all handles, mark as they are not in use and discard pending input/output */ @@ -5019,9 +5025,30 @@ DoInvalidateRemoteHandles(void) { bool result = false; + /* + * Not reload until transaction is complete. + * That contain two condition. + * 1. transaction status is idle. + * 2. GlobalCommitTimestamp has to be invalid + * which makes sure we are not in 2pc commit phase. + */ + if (InterruptHoldoffCount || !IsTransactionIdle() || GetGlobalCommitTimestamp() != InvalidGlobalTimestamp) + { + return result; + } + HOLD_INTERRUPTS(); + /* + * Reinitialize session, it updates the shared memory table. + * Initialize XL executor. This must be done inside a transaction block. 
+ */ + StartTransactionCommand(); InitMultinodeExecutor(true); + CommitTransactionCommand(); + + /* Disconnect from the pooler to get new connection infos next time */ + PoolManagerDisconnect(); HandlesInvalidatePending = false; HandlesRefreshPending = false; diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index b03a4e2b..0b684619 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -26,6 +26,7 @@ #include "pgxc/nodemgr.h" #include "pgxc/poolutils.h" #include "pgxc/pgxcnode.h" +#include "pgxc/execRemote.h" #include "access/gtm.h" #include "access/xact.h" #include "catalog/pgxc_node.h" @@ -415,45 +416,8 @@ HandlePoolerReload(void) if (proc_exit_inprogress) return; - if (InterruptHoldoffCount != 0) - return; - -#ifdef __TBASE__ - if (PoolerReloadHoldoffCount) - { - PoolerReloadPending = true; - return; - } - - if (false == IsTransactionIdle()) - { - return; - } - - PoolerReloadPending = false; -#endif - - HOLD_INTERRUPTS(); - - /* - * Reinitialize session, it updates the shared memory table. - * Initialize XL executor. This must be done inside a transaction block. - */ - StartTransactionCommand(); - InitMultinodeExecutor(true); - CommitTransactionCommand(); - - /* Request query cancel, when convenient */ - InterruptPending = true; - QueryCancelPending = true; - - /* Disconnect from the pooler to get new connection infos next time */ - PoolManagerDisconnect(); - /* Prevent using of cached connections to remote nodes */ RequestInvalidateRemoteHandles(); - - RESUME_INTERRUPTS(); } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7020932d..48a49f73 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5264,9 +5264,7 @@ PostgresMain(int argc, char *argv[], #endif #ifdef __TBASE__ - RESUME_POOLER_RELOAD(); - CHECK_FOR_POOLER_RELOAD(); - HOLD_POOLER_RELOAD(); + CheckInvalidateRemoteHandles(); #endif initStringInfo(&input_message); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index da28b820..37139a63 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1845,7 +1845,17 @@ standard_ProcessUtility(PlannedStmt *pstmt, { /* Is the statement a prohibited one? 
*/ if (!IsStmtAllowedInLockedMode(parsetree, queryString)) + { + /* node number changes with ddl is not allowed */ + if (HandlesInvalidatePending && PrimaryNodeNumberChanged()) + { + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); + } pgxc_lock_for_utility_stmt(parsetree); + + } } check_xact_readonly(parsetree); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index ee65603b..8e62e871 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -1347,6 +1347,15 @@ hash_get_num_entries(HTAB *hashp) return sum; } +/* + * hash_get_entry_size -- get the entry size of a hashtable + */ +Size +hash_get_entry_size(HTAB *hashp) +{ + return hashp->hctl->entrysize; +} + /* * hash_seq_init/_search/_term * Sequentially search through hash table and return diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index 53fbdede..a5d6383e 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -146,6 +146,7 @@ extern void PrepareRemoteDMLStatement(bool upsert, char *stmt, char *select_stmt, char *update_stmt); extern void DropRemoteDMLStatement(char *stmt, char *update_stmt); +extern void RebuildDatanodeQueryHashTable(void); #endif #endif /* PREPARE_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 5b6d46c6..98d51719 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -457,6 +457,7 @@ extern void ClearLocalTwoPhaseState(void); extern char *GetTransStateString(TwoPhaseTransState state); extern char *GetConnStateString(ConnState state); extern void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants); +extern void clean_stat_transaction(void); #endif #endif diff --git a/src/include/pgxc/nodemgr.h b/src/include/pgxc/nodemgr.h index 0f31d8cc..ad39f5c6 100644 --- a/src/include/pgxc/nodemgr.h +++ b/src/include/pgxc/nodemgr.h @@ -67,6 +67,7 @@ extern void PgxcNodeRemove(DropNodeStmt *stmt); extern void PgxcNodeDnListHealth(List *nodeList, bool *dnhealth); extern bool PgxcNodeUpdateHealth(Oid node, bool status); +extern bool PrimaryNodeNumberChanged(void); /* GUC parameter */ extern bool enable_multi_cluster; extern bool enable_multi_cluster_print; diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f15515c3..6643dc36 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -143,6 +143,8 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern volatile bool HandlesInvalidatePending; + extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); @@ -295,6 +297,7 @@ inline bool is_ddl_leader_cn(char *leader_cn); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); +void CheckInvalidateRemoteHandles(void); #endif #ifdef __AUDIT__ diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 43bbd08c..651b3b59 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * hsearch.h - * exported definitions for utils/hash/dynahash.c; see notes therein + * exported definitions for utils/hash/dynahash.c; see notes therein * 
* * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -27,7 +27,7 @@ typedef uint32 (*HashValueFunc) (const void *key, Size keysize); * as key comparison functions.) */ typedef int (*HashCompareFunc) (const void *key1, const void *key2, - Size keysize); + Size keysize); /* * Key copying functions must have this signature. The return value is not @@ -50,8 +50,8 @@ typedef void *(*HashAllocFunc) (Size request); */ typedef struct HASHELEMENT { - struct HASHELEMENT *link; /* link to next entry in same bucket */ - uint32 hashvalue; /* hash function result for this entry */ + struct HASHELEMENT *link; /* link to next entry in same bucket */ + uint32 hashvalue; /* hash function result for this entry */ } HASHELEMENT; /* Hash table header struct is an opaque type known only within dynahash.c */ @@ -64,74 +64,75 @@ typedef struct HTAB HTAB; /* Only those fields indicated by hash_flags need be set */ typedef struct HASHCTL { - long num_partitions; /* # partitions (must be power of 2) */ - long ssize; /* segment size */ - long dsize; /* (initial) directory size */ - long max_dsize; /* limit to dsize if dir size is limited */ - long ffactor; /* fill factor */ - Size keysize; /* hash key length in bytes */ - Size entrysize; /* total user element size in bytes */ - HashValueFunc hash; /* hash function */ - HashCompareFunc match; /* key comparison function */ - HashCopyFunc keycopy; /* key copying function */ - HashAllocFunc alloc; /* memory allocator */ - MemoryContext hcxt; /* memory context to use for allocations */ - HASHHDR *hctl; /* location of header in shared mem */ + long num_partitions; /* # partitions (must be power of 2) */ + long ssize; /* segment size */ + long dsize; /* (initial) directory size */ + long max_dsize; /* limit to dsize if dir size is limited */ + long ffactor; /* fill factor */ + Size keysize; /* hash key length in bytes */ + Size entrysize; /* total user element size in bytes */ + HashValueFunc hash; /* hash function */ + HashCompareFunc match; /* key comparison function */ + HashCopyFunc keycopy; /* key copying function */ + HashAllocFunc alloc; /* memory allocator */ + MemoryContext hcxt; /* memory context to use for allocations */ + HASHHDR *hctl; /* location of header in shared mem */ } HASHCTL; /* Flags to indicate which parameters are supplied */ -#define HASH_PARTITION 0x0001 /* Hashtable is used w/partitioned locking */ -#define HASH_SEGMENT 0x0002 /* Set segment size */ -#define HASH_DIRSIZE 0x0004 /* Set directory size (initial and max) */ -#define HASH_FFACTOR 0x0008 /* Set fill factor */ -#define HASH_ELEM 0x0010 /* Set keysize and entrysize */ -#define HASH_BLOBS 0x0020 /* Select support functions for binary keys */ -#define HASH_FUNCTION 0x0040 /* Set user defined hash function */ -#define HASH_COMPARE 0x0080 /* Set user defined comparison function */ -#define HASH_KEYCOPY 0x0100 /* Set user defined key-copying function */ -#define HASH_ALLOC 0x0200 /* Set memory allocator */ -#define HASH_CONTEXT 0x0400 /* Set memory allocation context */ -#define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */ -#define HASH_ATTACH 0x1000 /* Do not initialize hctl */ -#define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */ +#define HASH_PARTITION 0x0001 /* Hashtable is used w/partitioned locking */ +#define HASH_SEGMENT 0x0002 /* Set segment size */ +#define HASH_DIRSIZE 0x0004 /* Set directory size (initial and max) */ +#define HASH_FFACTOR 0x0008 /* Set fill factor */ +#define HASH_ELEM 0x0010 /* Set keysize and entrysize */ 
+#define HASH_BLOBS 0x0020 /* Select support functions for binary keys */ +#define HASH_FUNCTION 0x0040 /* Set user defined hash function */ +#define HASH_COMPARE 0x0080 /* Set user defined comparison function */ +#define HASH_KEYCOPY 0x0100 /* Set user defined key-copying function */ +#define HASH_ALLOC 0x0200 /* Set memory allocator */ +#define HASH_CONTEXT 0x0400 /* Set memory allocation context */ +#define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */ +#define HASH_ATTACH 0x1000 /* Do not initialize hctl */ +#define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */ /* max_dsize value to indicate expansible directory */ -#define NO_MAX_DSIZE (-1) +#define NO_MAX_DSIZE (-1) /* hash_search operations */ typedef enum { - HASH_FIND, - HASH_ENTER, - HASH_REMOVE, - HASH_ENTER_NULL + HASH_FIND, + HASH_ENTER, + HASH_REMOVE, + HASH_ENTER_NULL } HASHACTION; /* hash_seq status (should be considered an opaque type by callers) */ typedef struct { - HTAB *hashp; - uint32 curBucket; /* index of current bucket */ - HASHELEMENT *curEntry; /* current entry in bucket */ + HTAB *hashp; + uint32 curBucket; /* index of current bucket */ + HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; /* * prototypes for functions in dynahash.c */ extern HTAB *hash_create(const char *tabname, long nelem, - HASHCTL *info, int flags); + HASHCTL *info, int flags); extern void hash_destroy(HTAB *hashp); extern void hash_stats(const char *where, HTAB *hashp); extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, - bool *foundPtr); + bool *foundPtr); extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr); extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, - uint32 hashvalue, HASHACTION action, - bool *foundPtr); + uint32 hashvalue, HASHACTION action, + bool *foundPtr); extern bool hash_update_hash_key(HTAB *hashp, void *existingEntry, - const void *newKeyPtr); + const void *newKeyPtr); extern long hash_get_num_entries(HTAB *hashp); +extern Size hash_get_entry_size(HTAB *hashp); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void *hash_seq_search(HASH_SEQ_STATUS *status); extern void hash_seq_term(HASH_SEQ_STATUS *status); @@ -153,8 +154,8 @@ extern uint32 string_hash(const void *key, Size keysize); extern uint32 tag_hash(const void *key, Size keysize); extern uint32 uint32_hash(const void *key, Size keysize); extern uint32 bitmap_hash(const void *key, Size keysize); -extern int bitmap_match(const void *key1, const void *key2, Size keysize); +extern int bitmap_match(const void *key1, const void *key2, Size keysize); -#define oid_hash uint32_hash /* Remove me eventually */ +#define oid_hash uint32_hash /* Remove me eventually */ -#endif /* HSEARCH_H */ +#endif /* HSEARCH_H */ From 518b65fa4d82542a4247d46ebe1ff8488614b7fa Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 25 Feb 2021 19:17:46 +0800 Subject: [PATCH 329/578] optimize node start slow because crypt table too much --- src/backend/utils/cache/relcryptmap.c | 9 +++++---- src/backend/utils/misc/guc.c | 9 +++++++++ src/backend/utils/misc/mls.c | 4 +++- src/include/utils/relcrypt.h | 1 + 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index c7fb8b63..79eef334 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -131,8 +131,8 @@ #define REL_CRYPT_MAP_FILENAME "pg_rel_crypt.map" #define REL_CRYPT_MAP_FILEMAGIC 0x952702 /* 
version ID value */ -#define REL_CRYPT_HASHTABLE_MAX_SIZE (1 << 20) -#define REL_CRYPT_HASHTABLE_INIT_SIZE (1 << 11) +#define REL_CRYPT_HASHTABLE_MAX_SIZE ((g_rel_crypt_hash_size > (1 << 20))? g_rel_crypt_hash_size : (1 << 20)) +#define REL_CRYPT_HASHTABLE_INIT_SIZE g_rel_crypt_hash_size #define REL_CRYPT_HASHTABLE_NUM_PARTITIONS 128 @@ -1267,9 +1267,9 @@ static int rel_crypt_hash_key_cmp (const void *key1, const void *key2, Size keys { const RelFileNode *tagPtr1 = key1, *tagPtr2 = key2; - if (tagPtr1->spcNode == tagPtr2->spcNode + if (tagPtr1->relNode == tagPtr2->relNode && tagPtr1->dbNode == tagPtr2->dbNode - && tagPtr1->relNode == tagPtr2->relNode ) + && tagPtr1->spcNode == tagPtr2->spcNode ) return 0; return 1; @@ -1294,6 +1294,7 @@ void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, hashcode, HASH_ENTER, &found); + if (false == found) { relcrypt->algo_id = algo_id; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 6e3a43dc..ffc6dbd4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4488,6 +4488,15 @@ static struct config_int ConfigureNamesInt[] = 32, 4, 64, NULL, NULL, NULL }, + { + {"rel_crypt_hash_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Number of rel crypt hash table size, it will influence node start time."), + NULL + }, + &g_rel_crypt_hash_size, + 2048, 2048, INT_MAX, + NULL, NULL, NULL + }, #endif { {"pooler_port", PGC_POSTMASTER, DATA_NODES, diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index e55f36dd..5ff3cf8e 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -142,7 +142,7 @@ bool g_enable_data_mask = false; bool g_enable_transparent_crypt = false; bool g_enable_crypt_debug = false; #endif - +int g_rel_crypt_hash_size = 2048; #define MLS_QUERY_STRING_PRUNE_DELIMETER '(' @@ -1619,7 +1619,9 @@ void MlsShmemInit(void) MlsInitFileAccess(); crypt_key_info_load_mapfile(); + elog(LOG, "start rel crypt load mapfile"); rel_crypt_load_mapfile(); + elog(LOG, "end rel crypt load mapfile"); /* after vfd access, rollback all init actions */ MlsCleanFileAccess(); diff --git a/src/include/utils/relcrypt.h b/src/include/utils/relcrypt.h index 399e703b..fc071bad 100644 --- a/src/include/utils/relcrypt.h +++ b/src/include/utils/relcrypt.h @@ -70,6 +70,7 @@ typedef int16 AlgoId; extern bool g_enable_cls; extern bool g_enable_transparent_crypt; extern bool g_enable_crypt_debug; +extern int g_rel_crypt_hash_size; extern int g_checkpoint_crypt_worker; extern int g_checkpoint_crypt_queue_length; From 819d218caaa7d2602a6d65b0de7a481cf5d3def6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 2 Mar 2021 20:28:45 +0800 Subject: [PATCH 330/578] Perfects database dropping procedure. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085665555&url_cache_key=d4e1402777dc733479aac463ad1a9d24 (cherry picked from commit ba4dddc4) 58b03906 Fix comment f51d80ab Fix compile warning. 12d8c778 Perfects database dropping procedure. 
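The drop now happens in two steps. On the coordinator that receives DROP DATABASE, the statement is first re-broadcast as DROP DATABASE PREPARE, a new utility command that runs dropdb_prepare(): it takes the pg_database lock and re-checks every drop constraint (ownership, istemplate, active logical replication slots, other connected backends, subscriptions) on each node without removing anything. Only when that succeeds everywhere does the real dropdb() run. A rough sketch of the coordinator-side dispatch, condensed from the utility.c hunks below (the two snprintf branches of the patch are folded into one; behaviour is the same):

    /* ProcessUtilityPre(): broadcast the validation step first */
    if (IS_PGXC_LOCAL_COORDINATOR && !stmt->prepare)
    {
        snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE %s%s;",
                 stmt->missing_ok ? "IF EXISTS " : "",
                 quote_identifier(stmt->dbname));
        ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, false,
                               EXEC_ON_ALL_NODES, false, false);
    }

    /* standard_ProcessUtility(): the PREPARE form only checks and locks */
    if (stmt->prepare)
        dropdb_prepare(stmt->dbname, stmt->missing_ok);
    else
        dropdb(stmt->dbname, stmt->missing_ok);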
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085665555&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/commands/dbcommands.c | 125 ++++++++++++++++++++++++++++++ src/backend/nodes/copyfuncs.c | 1 + src/backend/parser/gram.y | 18 +++++ src/backend/tcop/utility.c | 40 +++++++++- src/include/commands/dbcommands.h | 13 ++-- src/include/nodes/parsenodes.h | 1 + 6 files changed, 188 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6e209fff..ae8c18c1 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -815,6 +815,131 @@ createdb_failure_callback(int code, Datum arg) remove_dbtablespaces(fparms->dest_dboid); } +/* + * DROP DATABASE PREPARE + * + * Lock the database and check the constraint in advance. + */ +void +dropdb_prepare(const char *dbname, bool missing_ok) +{ + Oid db_id; + bool db_istemplate; + Relation pgdbrel; + int notherbackends; + int npreparedxacts; + int nslots, + nslots_active; + int nsubscriptions; + + /* + * Look up the target database's OID, and get exclusive lock on it. We + * need this to ensure that no new backend starts up in the target + * database while we are deleting it (see postinit.c), and that no one is + * using it as a CREATE DATABASE template or trying to delete it for + * themselves. + */ + pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); + + if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + { + if (!missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database \"%s\" does not exist", dbname))); + } + else + { + /* Close pg_database, release the lock, since we changed nothing */ + heap_close(pgdbrel, RowExclusiveLock); + ereport(NOTICE, + (errmsg("database \"%s\" does not exist, skipping", + dbname))); + return; + } + } + + /* + * Permission checks + */ + if (!pg_database_ownercheck(db_id, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_DATABASE, + dbname); + + /* DROP hook for the database being removed */ + InvokeObjectDropHook(DatabaseRelationId, db_id, 0); + + /* + * Disallow dropping a DB that is marked istemplate. This is just to + * prevent people from accidentally dropping template0 or template1; they + * can do so if they're really determined ... + */ + if (db_istemplate) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot drop a template database"))); + + /* Obviously can't drop my own database */ + if (db_id == MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("cannot drop the currently open database"))); + + /* + * Check whether there are active logical slots that refer to the + * to-be-dropped database. The database lock we are holding prevents the + * creation of new slots using the database or existing slots becoming + * active. + */ + (void) ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active); + if (nslots_active) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is used by an active logical replication slot", + dbname), + errdetail_plural("There is %d active slot", + "There are %d active slots", + nslots_active, nslots_active))); + } + + /* + * Check for other backends in the target database. (Because we hold the + * database lock, no new ones can start after this.) + * + * As in CREATE DATABASE, check this after other error conditions. 
+ */ + if (CountOtherDBBackends(db_id, ¬herbackends, &npreparedxacts)) + { +#ifndef _PG_REGRESS_ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being accessed by other users", + dbname), + errdetail_busy_db(notherbackends, npreparedxacts))); +#else + elog(ERROR, "database \"%s\" is being accessed by other users", dbname); +#endif + } + + /* + * Check if there are subscriptions defined in the target database. + * + * We can't drop them automatically because they might be holding + * resources in other databases/instances. + */ + if ((nsubscriptions = CountDBSubscriptions(db_id)) > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by logical replication subscription", + dbname), + errdetail_plural("There is %d subscription.", + "There are %d subscriptions.", + nsubscriptions, nsubscriptions))); + heap_close(pgdbrel, RowExclusiveLock); +} /* * DROP DATABASE diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index ed04f1d4..6e6e562f 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4128,6 +4128,7 @@ _copyDropdbStmt(const DropdbStmt *from) COPY_STRING_FIELD(dbname); COPY_SCALAR_FIELD(missing_ok); + COPY_SCALAR_FIELD(prepare); return newnode; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 4e74a77d..7f8b9e22 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10632,6 +10632,7 @@ DropdbStmt: DROP DATABASE database_name DropdbStmt *n = makeNode(DropdbStmt); n->dbname = $3; n->missing_ok = FALSE; + n->prepare = FALSE; $$ = (Node *)n; } | DROP DATABASE IF_P EXISTS database_name @@ -10639,6 +10640,23 @@ DropdbStmt: DROP DATABASE database_name DropdbStmt *n = makeNode(DropdbStmt); n->dbname = $5; n->missing_ok = TRUE; + n->prepare = FALSE; + $$ = (Node *)n; + } + | DROP DATABASE PREPARE database_name + { + DropdbStmt *n = makeNode(DropdbStmt); + n->dbname = $4; + n->missing_ok = FALSE; + n->prepare = TRUE; + $$ = (Node *)n; + } + | DROP DATABASE PREPARE IF_P EXISTS database_name + { + DropdbStmt *n = makeNode(DropdbStmt); + n->dbname = $6; + n->missing_ok = TRUE; + n->prepare = TRUE; $$ = (Node *)n; } ; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 37139a63..1527823f 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -643,18 +643,35 @@ ProcessUtilityPre(PlannedStmt *pstmt, break; case T_DropdbStmt: - /* Clean connections before dropping a database on local node */ if (IS_PGXC_LOCAL_COORDINATOR) { DropdbStmt *stmt = (DropdbStmt *) parsetree; - char query[256]; + char query[STRINGLENGTH]; + /* Clean connections before dropping a database on local node */ DropDBCleanConnection(stmt->dbname); /* Clean also remote Coordinators */ - sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false, false); + + if (!stmt->prepare) + { + /* Lock database and check the constraints before we actually dropping */ + if (stmt->missing_ok) + { + snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE IF EXISTS %s;", + quote_identifier(stmt->dbname)); + } + else + { + snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE %s;", + quote_identifier(stmt->dbname)); + } + ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, false); + } } break; @@ -2107,12 
+2124,20 @@ standard_ProcessUtility(PlannedStmt *pstmt, { DropdbStmt *stmt = (DropdbStmt *) parsetree; + if (!stmt->prepare) + { /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) + { PreventTransactionChain(isTopLevel, "DROP DATABASE"); - + } dropdb(stmt->dbname, stmt->missing_ok); } + else + { + dropdb_prepare(stmt->dbname, stmt->missing_ok); + } + } break; /* Query-level asynchronous notification */ @@ -5290,7 +5315,14 @@ CreateCommandTag(Node *parsetree) break; case T_DropdbStmt: + if (((DropdbStmt *) parsetree)->prepare) + { + tag = "DROP DATABASE PREPARE"; + } + else + { tag = "DROP DATABASE"; + } break; case T_NotifyStmt: diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index 98aa3a6c..cb5844ff 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * dbcommands.h - * Database management commands (create/drop database). + * Database management commands (create/drop database). * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -19,14 +19,15 @@ #include "lib/stringinfo.h" #include "nodes/parsenodes.h" -extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); +extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); extern void dropdb(const char *dbname, bool missing_ok); +extern void dropdb_prepare(const char *dbname, bool missing_ok); extern ObjectAddress RenameDatabase(const char *oldname, const char *newname); -extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); -extern Oid AlterDatabaseSet(AlterDatabaseSetStmt *stmt); +extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); +extern Oid AlterDatabaseSet(AlterDatabaseSetStmt *stmt); extern ObjectAddress AlterDatabaseOwner(const char *dbname, Oid newOwnerId); -extern Oid get_database_oid(const char *dbname, bool missingok); +extern Oid get_database_oid(const char *dbname, bool missingok); extern char *get_database_name(Oid dbid); extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype); @@ -35,4 +36,4 @@ extern void check_encoding_locale_matches(int encoding, const char *collate, con extern bool IsSetTableSpace(AlterDatabaseStmt *stmt); #endif -#endif /* DBCOMMANDS_H */ +#endif /* DBCOMMANDS_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index df2746c9..2c54a436 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3239,6 +3239,7 @@ typedef struct DropdbStmt NodeTag type; char *dbname; /* database to drop */ bool missing_ok; /* skip error if db is missing? 
*/ + bool prepare; /* database drop preparation step */ } DropdbStmt; /* ---------------------- From 2293c35431e0b84989fa285a1359d0e6ac48f05a Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 5 Mar 2021 20:28:08 +0800 Subject: [PATCH 331/578] Bugfix: 2pc is rollbacked by pg_clean_execute when waiting for wal sync, ID85257415 (merge request !194) --- contrib/pg_clean/pg_clean.c | 67 +++++++++++++++++++------ doc/src/sgml/ref/commit_prepared.sgml | 11 +++- doc/src/sgml/ref/rollback_prepared.sgml | 11 +++- src/backend/access/transam/twophase.c | 30 +++++++++-- src/backend/parser/gram.y | 14 ++++++ src/backend/tcop/utility.c | 32 ++++++++++++ src/include/access/twophase.h | 2 + src/include/nodes/parsenodes.h | 4 +- 8 files changed, 150 insertions(+), 21 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 4dc898ff..e31394c4 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -84,6 +84,8 @@ PG_MODULE_MAGIC; #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 +#define MAX_CMD_LENGTH 120 + #define XIDFOUND 1 #define XIDNOTFOUND -1 #define XIDEXECFAIL -2 @@ -314,7 +316,7 @@ bool check_node_participate(txn_info * txn, int node_idx); void recover2PC(txn_info * txn); TXN_STATUS check_txn_global_status(txn_info *txn); -bool clean_2PC_iscommit(txn_info *txn, bool iscommit); +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); bool clean_2PC_files(txn_info *txn); void Init_print_txn_info(print_txn_info *print_txn); void Init_print_stats_all(print_status *pstatus); @@ -2228,7 +2230,7 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) Oid nodeoid; char *gid; txn_info *txn; - char command[100]; + char command[MAX_CMD_LENGTH]; PGXCNodeHandle **connections = NULL; int conn_count = 0; ResponseCombiner combiner; @@ -2268,7 +2270,7 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) strncpy(txn->gid, gid, strlen(gid)+1); getTxnInfoOnOtherNodes(txn); - snprintf(command, 100, "commit prepared '%s'", txn->gid); + snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); if (InvalidGlobalTimestamp == txn->global_commit_timestamp) @@ -2334,7 +2336,7 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) Oid nodeoid; char *gid; txn_info *txn; - char command[100]; + char command[MAX_CMD_LENGTH]; PGXCNodeHandle **connections = NULL; int conn_count = 0; ResponseCombiner combiner; @@ -2375,7 +2377,7 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) strncpy(txn->gid, gid, strlen(gid)+1); connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); getTxnInfoOnOtherNodes(txn); - snprintf(command, 100, "rollback prepared '%s'", txn->gid); + snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); #if 0 if (!setMaintenanceMode(true)) { @@ -2617,7 +2619,15 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - if (!clean_2PC_iscommit(txn, true)) + /* check whether all nodes can commit prepared */ + if (!clean_2PC_iscommit(txn, true, true)) + { + txn->op_issuccess = false; + elog(LOG, "check commit 2PC transaction %s failed", txn->gid); + return; + } + /* send commit prepared to all nodes */ + if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; elog(LOG, "commit 2PC transaction %s failed", txn->gid); @@ -2630,7 +2640,15 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - if (!clean_2PC_iscommit(txn, false)) + /* check whether all nodes can rollback prepared */ + if (!clean_2PC_iscommit(txn, false, true)) + { + txn->op_issuccess = false; + elog(LOG, "check rollback 2PC transaction %s 
failed", txn->gid); + return; + } + /* send rollback prepared to all nodes */ + if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; elog(LOG, "rollback 2PC transaction %s failed", txn->gid); @@ -2791,11 +2809,12 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_ABORTED; } -bool clean_2PC_iscommit(txn_info *txn, bool iscommit) +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { int ii; static const char *STMT_FORM = "%s prepared '%s';"; - char command[100]; + static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; + char command[MAX_CMD_LENGTH]; int node_idx; Oid node_oid; PGXCNodeHandle **connections = NULL; @@ -2803,11 +2822,29 @@ bool clean_2PC_iscommit(txn_info *txn, bool iscommit) ResponseCombiner combiner; PGXCNodeAllHandles *pgxc_handles = NULL; - if (iscommit) - snprintf(command, 100, STMT_FORM, "commit", txn->gid); + if (is_commit) + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); + } else - snprintf(command, 100, STMT_FORM, "rollback", txn->gid); - if (iscommit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); + } + } + else + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); + } + } + if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) { elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); } @@ -2986,9 +3023,9 @@ bool clean_2PC_files(txn_info * txn) TupleTableSlots result; bool issuccess = true; static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; - char query[100]; + char query[MAX_CMD_LENGTH]; - snprintf(query, 100, STMT_FORM, txn->gid); + snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); for (ii = 0; ii < dn_nodes_num; ii++) { diff --git a/doc/src/sgml/ref/commit_prepared.sgml b/doc/src/sgml/ref/commit_prepared.sgml index 58438f99..cad0a868 100644 --- a/doc/src/sgml/ref/commit_prepared.sgml +++ b/doc/src/sgml/ref/commit_prepared.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -COMMIT PREPARED transaction_id +COMMIT PREPARED transaction_id [ FOR CHECK ONLY ] @@ -47,6 +47,15 @@ COMMIT PREPARED transaction_id + + + FOR CHECK ONLY + + + Check whether a prepared transaction can be committed. + + + diff --git a/doc/src/sgml/ref/rollback_prepared.sgml b/doc/src/sgml/ref/rollback_prepared.sgml index 141c77b1..c1835358 100644 --- a/doc/src/sgml/ref/rollback_prepared.sgml +++ b/doc/src/sgml/ref/rollback_prepared.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -ROLLBACK PREPARED transaction_id +ROLLBACK PREPARED transaction_id [ FOR CHECK ONLY ] @@ -47,6 +47,15 @@ ROLLBACK PREPARED transaction_id + + + FOR CHECK ONLY + + + Check whether a prepared transaction can be rollbacked. + + + diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 28197d77..33ce87ad 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -149,7 +149,7 @@ static GlobalTransaction LookupGXact(const char *gid, Oid user); static GlobalTransaction -LockGXact(const char *gid, Oid user); +LockGXact(const char *gid, Oid user, bool is_check); @@ -763,7 +763,7 @@ LookupGXact(const char *gid, Oid user) * Locate the prepared transaction and mark it busy for COMMIT or PREPARE. 
*/ static GlobalTransaction -LockGXact(const char *gid, Oid user) +LockGXact(const char *gid, Oid user, bool is_check) {// #lizard forgives int i; @@ -812,9 +812,12 @@ LockGXact(const char *gid, Oid user) errmsg("prepared transaction belongs to another database"), errhint("Connect to the database where the transaction was prepared to finish it."))); + if (!is_check) + { /* OK for me to lock it */ gxact->locking_backend = MyBackendId; MyLockedGxact = gxact; + } LWLockRelease(TwoPhaseStateLock); @@ -1710,6 +1713,27 @@ StandbyTransactionIdIsPrepared(TransactionId xid) return result; } +/* + * CheckPreparedTransactionLock: Check whether the prepared transaction + * can be rollbacked + */ +void +CheckPreparedTransactionLock(const char *gid) +{ + GlobalTransaction gxact = LockGXact(gid, GetUserId(), true); + if (enable_distri_print) + { + if (gxact == NULL) + { + elog(LOG, "prepared gid %s gxact is NULL.", gid); + } + else + { + elog(LOG, "prepared gid %s gxact xid %d.", gid, gxact->xid); + } + } +} + /* * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED */ @@ -1821,7 +1845,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * Validate the GID, and lock the GXACT to ensure that two backends do not * try to commit the same GID at once. */ - gxact = LockGXact(gid, GetUserId()); + gxact = LockGXact(gid, GetUserId(), false); #ifdef PGXC /* * LockGXact returns NULL if this node does not contain given two-phase diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7f8b9e22..2f34a131 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10370,6 +10370,20 @@ TransactionStmt: n->gid = $3; $$ = (Node *)n; } + | COMMIT PREPARED Sconst FOR CHECK ONLY + { + TransactionStmt *n = makeNode(TransactionStmt); + n->kind = TRANS_STMT_COMMIT_PREPARED_CHECK; + n->gid = $3; + $$ = (Node *)n; + } + | ROLLBACK PREPARED Sconst FOR CHECK ONLY + { + TransactionStmt *n = makeNode(TransactionStmt); + n->kind = TRANS_STMT_ROLLBACK_PREPARED_CHECK; + n->gid = $3; + $$ = (Node *)n; + } ; opt_transaction: WORK {} diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 1527823f..29409d44 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -602,6 +602,18 @@ ProcessUtilityPre(PlannedStmt *pstmt, } break; + case TRANS_STMT_COMMIT_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED CHECK"); + PreventCommandDuringRecovery("COMMIT PREPARED CHECK"); + elog(LOG, "COMMIT PREPARED %s FOR CHECK ONLY", stmt->gid); + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED CHECK"); + PreventCommandDuringRecovery("ROLLBACK PREPARED CHECK"); + elog(LOG, "ROLLBACK PREPARED %s FOR CHECK ONLY", stmt->gid); + break; + case TRANS_STMT_ROLLBACK: break; @@ -1974,6 +1986,18 @@ standard_ProcessUtility(PlannedStmt *pstmt, FinishPreparedTransaction(stmt->gid, false); break; + case TRANS_STMT_COMMIT_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED CHECK"); + PreventCommandDuringRecovery("COMMIT PREPARED CHECK"); + CheckPreparedTransactionLock(stmt->gid); + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED CHECK"); + PreventCommandDuringRecovery("ROLLBACK PREPARED CHECK"); + CheckPreparedTransactionLock(stmt->gid); + break; + case TRANS_STMT_ROLLBACK: UserAbortTransactionBlock(); break; @@ -4946,6 +4970,14 @@ CreateCommandTag(Node *parsetree) tag = "ROLLBACK PREPARED"; break; + case 
TRANS_STMT_COMMIT_PREPARED_CHECK: + tag = "COMMIT PREPARED CHECK"; + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + tag = "ROLLBACK PREPARED CHECK"; + break; + default: tag = "???"; break; diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 4f12b674..8a4831b0 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -122,6 +122,8 @@ extern void CheckPointTwoPhase(XLogRecPtr redo_horizon); extern void FinishPreparedTransaction(const char *gid, bool isCommit); +extern void CheckPreparedTransactionLock(const char *gid); + extern void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn); extern void PrepareRedoRemove(TransactionId xid, bool giveWarning); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 2c54a436..57111155 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3110,7 +3110,9 @@ typedef enum TransactionStmtKind TRANS_STMT_ROLLBACK_PREPARED, TRANS_STMT_BEGIN_SUBTXN, TRANS_STMT_ROLLBACK_SUBTXN, - TRANS_STMT_COMMIT_SUBTXN + TRANS_STMT_COMMIT_SUBTXN, + TRANS_STMT_COMMIT_PREPARED_CHECK, + TRANS_STMT_ROLLBACK_PREPARED_CHECK } TransactionStmtKind; typedef struct TransactionStmt From d94f0215a9ed34d3227a04e5c765e1445e0c183d Mon Sep 17 00:00:00 2001 From: bethding Date: Mon, 8 Mar 2021 12:32:37 +0800 Subject: [PATCH 332/578] snyc dynamic shared memory from pg --- src/backend/utils/mmgr/dsa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index f7f11c06..9a6d036d 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2255,6 +2255,7 @@ check_for_freed_segments(dsa_area *area) LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); check_for_freed_segments_locked(area); LWLockRelease(DSA_AREA_LOCK(area)); + area->freed_segment_counter = freed_segment_counter; } } From 76b08dbff66e362b1180e5cff19a23aea56dc79a Mon Sep 17 00:00:00 2001 From: whalesong Date: Sat, 20 Feb 2021 11:56:08 +0800 Subject: [PATCH 333/578] Bugfix: dn core in core FinishPreparedTransaction, ID85239005 (merge request !198) --- src/backend/access/transam/twophase.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 33ce87ad..ebe93238 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1911,9 +1911,25 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * to disk if for some reason they have lived for a long time. 
*/ if (gxact->ondisk) + { buf = ReadTwoPhaseFile(xid, true); + if (NULL == buf) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("read two-phase file failed, gid: %s", gid))); + } + } else + { XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + if (NULL == buf) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("read two-phase data from xlog failed, gid: %s", gid))); + } + } /* From 331adf0463e14a65d9f77354e4d0d4b7001f5684 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 8 Mar 2021 16:29:14 +0800 Subject: [PATCH 334/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 (merge request !199) --- src/gtm/main/gtm_standby.c | 959 +++++++++++++++++----------------- src/gtm/main/main.c | 94 +++- src/include/gtm/gtm_standby.h | 7 + 3 files changed, 558 insertions(+), 502 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index 06fa54b1..e84cca5d 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * gtm_standby.c - * Functionalities of GTM Standby + * Functionalities of GTM Standby * * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * src/gtm/common/gtm_standby.c + * src/gtm/common/gtm_standby.c * *------------------------------------------------------------------------- */ @@ -47,7 +47,7 @@ static char standbyNodeName[NI_MAXHOST]; static int standbyPortNumber; static char *standbyDataDir; extern char *NodeName; -extern int GTMPortNumber; +extern int GTMPortNumber; #ifndef __XLOG__ static GTM_Conn *gtm_standby_connect_to_standby_int(int *report_needed); @@ -64,214 +64,215 @@ extern int GTM_Standby_Connetion_Timeout; int gtm_standby_start_startup(void) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); - if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) - { - int save_errno = errno; - if(GTM_ActiveConn) - elog(ERROR, "can not connect to GTM: %s %m", GTMPQerrorMessage(GTM_ActiveConn)); - else - elog(ERROR, "connection is null: %m"); - - errno = save_errno; - if(GTM_ActiveConn) - GTMPQfinish(GTM_ActiveConn); - } - - elog(LOG, "Connection established to the GTM active."); - - return 1; + GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) + { + int save_errno = errno; + if(GTM_ActiveConn) + elog(LOG, "can not connect to GTM: %s %m", GTMPQerrorMessage(GTM_ActiveConn)); + else + elog(LOG, "connection is null: %m"); + + errno = save_errno; + if(GTM_ActiveConn) + GTMPQfinish(GTM_ActiveConn); + return 0; + } + + elog(LOG, "Connection established to the GTM active."); + + return 1; } int gtm_standby_finish_startup(void) { - elog(DEBUG1, "Closing a startup connection..."); + elog(DEBUG1, "Closing a startup connection..."); - GTMPQfinish(GTM_ActiveConn); - GTM_ActiveConn = NULL; + GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; - elog(DEBUG1, "A startup connection closed."); - return 1; + elog(DEBUG1, "A startup connection closed."); + return 1; } int gtm_standby_restore_next_gxid(void) { - GlobalTransactionId next_gxid = InvalidGlobalTransactionId; -#ifdef __TBASE__ - next_gxid = get_next_gxid(GTM_ActiveConn); - GTM_RestoreStoreInfo(next_gxid, true); + GlobalTransactionId next_gxid = InvalidGlobalTransactionId; +#ifdef __TBASE__ + next_gxid = get_next_gxid(GTM_ActiveConn); + 
GTM_RestoreStoreInfo(next_gxid, true); #else - next_gxid = get_next_gxid(GTM_ActiveConn); - GTM_RestoreTxnInfo(NULL, next_gxid, NULL, true); + next_gxid = get_next_gxid(GTM_ActiveConn); + GTM_RestoreTxnInfo(NULL, next_gxid, NULL, true); #endif - elog(DEBUG1, "Restoring the next GXID done."); - return 1; + elog(DEBUG1, "Restoring the next GXID done."); + return 1; } int gtm_standby_restore_sequence(void) { #ifndef __TBASE__ - GTM_SeqInfo *seq_list; - int num_seq; - int i; - - /* - * Restore sequence data. - */ - num_seq = get_sequence_list(GTM_ActiveConn, &seq_list); - - for (i = 0; i < num_seq; i++) - { - GTM_SeqRestore(seq_list[i].gs_key, - seq_list[i].gs_increment_by, - seq_list[i].gs_min_value, - seq_list[i].gs_max_value, - seq_list[i].gs_init_value, - seq_list[i].gs_value, - seq_list[i].gs_state, - seq_list[i].gs_cycle, - seq_list[i].gs_called); - } - - elog(DEBUG1, "Restoring sequences done."); + GTM_SeqInfo *seq_list; + int num_seq; + int i; + + /* + * Restore sequence data. + */ + num_seq = get_sequence_list(GTM_ActiveConn, &seq_list); + + for (i = 0; i < num_seq; i++) + { + GTM_SeqRestore(seq_list[i].gs_key, + seq_list[i].gs_increment_by, + seq_list[i].gs_min_value, + seq_list[i].gs_max_value, + seq_list[i].gs_init_value, + seq_list[i].gs_value, + seq_list[i].gs_state, + seq_list[i].gs_cycle, + seq_list[i].gs_called); + } + + elog(DEBUG1, "Restoring sequences done."); #endif - return 1; + return 1; } int gtm_standby_restore_gxid(void) { #ifndef __TBASE__ - int num_txn; - GTM_Transactions txn; - int i; - /* - * Restore gxid data. - */ - num_txn = get_txn_gxid_list(GTM_ActiveConn, &txn); - - GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); - GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); - - GTMTransactions.gt_txn_count = txn.gt_txn_count; - GTMTransactions.gt_gtm_state = txn.gt_gtm_state; - GTMTransactions.gt_nextXid = txn.gt_nextXid; - GTMTransactions.gt_oldestXid = txn.gt_oldestXid; - GTMTransactions.gt_xidVacLimit = txn.gt_xidVacLimit; - GTMTransactions.gt_xidWarnLimit = txn.gt_xidWarnLimit; - GTMTransactions.gt_xidStopLimit = txn.gt_xidStopLimit; - GTMTransactions.gt_xidWrapLimit = txn.gt_xidWrapLimit; - GTMTransactions.gt_latestCompletedXid = txn.gt_latestCompletedXid; - GTMTransactions.gt_recent_global_xmin = txn.gt_recent_global_xmin; - GTMTransactions.gt_lastslot = txn.gt_lastslot; - - for (i = 0; i < num_txn; i++) - { - int handle = txn.gt_transactions_array[i].gti_handle; - - GTMTransactions.gt_transactions_array[handle].gti_handle = txn.gt_transactions_array[i].gti_handle; - - GTMTransactions.gt_transactions_array[handle].gti_client_id = txn.gt_transactions_array[i].gti_client_id; - GTMTransactions.gt_transactions_array[handle].gti_in_use = txn.gt_transactions_array[i].gti_in_use; - GTMTransactions.gt_transactions_array[handle].gti_gxid = txn.gt_transactions_array[i].gti_gxid; - GTMTransactions.gt_transactions_array[handle].gti_state = txn.gt_transactions_array[i].gti_state; - GTMTransactions.gt_transactions_array[handle].gti_xmin = txn.gt_transactions_array[i].gti_xmin; - GTMTransactions.gt_transactions_array[handle].gti_isolevel = txn.gt_transactions_array[i].gti_isolevel; - GTMTransactions.gt_transactions_array[handle].gti_readonly = txn.gt_transactions_array[i].gti_readonly; - GTMTransactions.gt_transactions_array[handle].gti_proxy_client_id = txn.gt_transactions_array[i].gti_proxy_client_id; - - if (txn.gt_transactions_array[i].nodestring == NULL ) - GTMTransactions.gt_transactions_array[handle].nodestring = NULL; - else 
- GTMTransactions.gt_transactions_array[handle].nodestring = txn.gt_transactions_array[i].nodestring; - - /* GID */ - if (txn.gt_transactions_array[i].gti_gid == NULL ) - GTMTransactions.gt_transactions_array[handle].gti_gid = NULL; - else - GTMTransactions.gt_transactions_array[handle].gti_gid = txn.gt_transactions_array[i].gti_gid; - - /* copy GTM_SnapshotData */ - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmin = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xmin; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmax = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xmax; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xcnt = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xcnt; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xip = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xip; - /* end of copying GTM_SnapshotData */ - - GTMTransactions.gt_transactions_array[handle].gti_snapshot_set = - txn.gt_transactions_array[i].gti_snapshot_set; - GTMTransactions.gt_transactions_array[handle].gti_vacuum = - txn.gt_transactions_array[i].gti_vacuum; - - /* - * Is this correct? Is GTM_TXN_COMMITTED transaction categorized as "open"? - */ - if (GTMTransactions.gt_transactions_array[handle].gti_state != GTM_TXN_ABORTED) - { - GTMTransactions.gt_open_transactions = - gtm_lappend(GTMTransactions.gt_open_transactions, - >MTransactions.gt_transactions_array[handle]); - } - } - - dump_transactions_elog(>MTransactions, num_txn); - - GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); - GTM_RWLockRelease(>MTransactions.gt_XidGenLock); - - elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn); + int num_txn; + GTM_Transactions txn; + int i; + /* + * Restore gxid data. 
+ */ + num_txn = get_txn_gxid_list(GTM_ActiveConn, &txn); + + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + + GTMTransactions.gt_txn_count = txn.gt_txn_count; + GTMTransactions.gt_gtm_state = txn.gt_gtm_state; + GTMTransactions.gt_nextXid = txn.gt_nextXid; + GTMTransactions.gt_oldestXid = txn.gt_oldestXid; + GTMTransactions.gt_xidVacLimit = txn.gt_xidVacLimit; + GTMTransactions.gt_xidWarnLimit = txn.gt_xidWarnLimit; + GTMTransactions.gt_xidStopLimit = txn.gt_xidStopLimit; + GTMTransactions.gt_xidWrapLimit = txn.gt_xidWrapLimit; + GTMTransactions.gt_latestCompletedXid = txn.gt_latestCompletedXid; + GTMTransactions.gt_recent_global_xmin = txn.gt_recent_global_xmin; + GTMTransactions.gt_lastslot = txn.gt_lastslot; + + for (i = 0; i < num_txn; i++) + { + int handle = txn.gt_transactions_array[i].gti_handle; + + GTMTransactions.gt_transactions_array[handle].gti_handle = txn.gt_transactions_array[i].gti_handle; + + GTMTransactions.gt_transactions_array[handle].gti_client_id = txn.gt_transactions_array[i].gti_client_id; + GTMTransactions.gt_transactions_array[handle].gti_in_use = txn.gt_transactions_array[i].gti_in_use; + GTMTransactions.gt_transactions_array[handle].gti_gxid = txn.gt_transactions_array[i].gti_gxid; + GTMTransactions.gt_transactions_array[handle].gti_state = txn.gt_transactions_array[i].gti_state; + GTMTransactions.gt_transactions_array[handle].gti_xmin = txn.gt_transactions_array[i].gti_xmin; + GTMTransactions.gt_transactions_array[handle].gti_isolevel = txn.gt_transactions_array[i].gti_isolevel; + GTMTransactions.gt_transactions_array[handle].gti_readonly = txn.gt_transactions_array[i].gti_readonly; + GTMTransactions.gt_transactions_array[handle].gti_proxy_client_id = txn.gt_transactions_array[i].gti_proxy_client_id; + + if (txn.gt_transactions_array[i].nodestring == NULL ) + GTMTransactions.gt_transactions_array[handle].nodestring = NULL; + else + GTMTransactions.gt_transactions_array[handle].nodestring = txn.gt_transactions_array[i].nodestring; + + /* GID */ + if (txn.gt_transactions_array[i].gti_gid == NULL ) + GTMTransactions.gt_transactions_array[handle].gti_gid = NULL; + else + GTMTransactions.gt_transactions_array[handle].gti_gid = txn.gt_transactions_array[i].gti_gid; + + /* copy GTM_SnapshotData */ + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmin = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xmin; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmax = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xmax; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xcnt = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xcnt; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xip = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xip; + /* end of copying GTM_SnapshotData */ + + GTMTransactions.gt_transactions_array[handle].gti_snapshot_set = + txn.gt_transactions_array[i].gti_snapshot_set; + GTMTransactions.gt_transactions_array[handle].gti_vacuum = + txn.gt_transactions_array[i].gti_vacuum; + + /* + * Is this correct? Is GTM_TXN_COMMITTED transaction categorized as "open"? 
+ */ + if (GTMTransactions.gt_transactions_array[handle].gti_state != GTM_TXN_ABORTED) + { + GTMTransactions.gt_open_transactions = + gtm_lappend(GTMTransactions.gt_open_transactions, + >MTransactions.gt_transactions_array[handle]); + } + } + + dump_transactions_elog(>MTransactions, num_txn); + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + + elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn); #endif - return 1; + return 1; } int gtm_standby_restore_node(void) { - GTM_PGXCNodeInfo *data; - int rc, i; - int num_node; - - elog(LOG, "Copying node information from the GTM active..."); - - data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo) * 128); - memset(data, 0, sizeof(GTM_PGXCNodeInfo) * 128); - - rc = get_node_list(GTM_ActiveConn, data, 128); - if (rc < 0) - { - elog(DEBUG3, "get_node_list() failed."); - rc = 0; - goto finished; - } - - num_node = rc; - - for (i = 0; i < num_node; i++) - { - elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", - data[i].type, data[i].nodename, data[i].datafolder); - if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port, - data[i].proxyname, data[i].status, - data[i].ipaddress, data[i].datafolder, true, - -1 /* dummy socket */, false) != 0) - { - rc = 0; - goto finished; - } - } - - elog(LOG, "Copying node information from GTM active done."); - -finished: - free(data); - return rc; + GTM_PGXCNodeInfo *data; + int rc, i; + int num_node; + + elog(LOG, "Copying node information from the GTM active..."); + + data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo) * 128); + memset(data, 0, sizeof(GTM_PGXCNodeInfo) * 128); + + rc = get_node_list(GTM_ActiveConn, data, 128); + if (rc < 0) + { + elog(DEBUG3, "get_node_list() failed."); + rc = 0; + goto finished; + } + + num_node = rc; + + for (i = 0; i < num_node; i++) + { + elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", + data[i].type, data[i].nodename, data[i].datafolder); + if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port, + data[i].proxyname, data[i].status, + data[i].ipaddress, data[i].datafolder, true, + -1 /* dummy socket */, false) != 0) + { + rc = 0; + goto finished; + } + } + + elog(LOG, "Copying node information from GTM active done."); + +finished: + free(data); + return rc; } /* @@ -285,54 +286,54 @@ gtm_standby_restore_node(void) int gtm_standby_register_self(const char *node_name, int port, const char *datadir) { - int rc; + int rc; #ifdef __XLOG__ static char *s_node_name = NULL; - static int s_port = 0; - static char *s_datadir = NULL; - - static bool init = false; - - if(init == false) - { - s_node_name = strdup(node_name); - s_datadir = strdup(datadir); - s_port = port; - - init = true; - } - else - { - node_name = s_node_name; - port = s_port; - datadir = s_datadir; - } + static int s_port = 0; + static char *s_datadir = NULL; + + static bool init = false; + + if(init == false) + { + s_node_name = strdup(node_name); + s_datadir = strdup(datadir); + s_port = port; + + init = true; + } + else + { + node_name = s_node_name; + port = s_port; + datadir = s_datadir; + } #endif - elog(DEBUG8, "Registering standby-GTM status..."); + elog(DEBUG8, "Registering standby-GTM status..."); - node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc); + node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc); - memset(standbyNodeName, 0, NI_MAXHOST); - strncpy(standbyNodeName, node_name, NI_MAXHOST - 
1); - standbyPortNumber = port; - standbyDataDir= (char *)datadir; - elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", - standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); - rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, - standbyNodeName, standbyDataDir, - NODE_DISCONNECTED); - if (rc < 0) - { - elog(DEBUG1, "Failed to register a standby-GTM status."); + memset(standbyNodeName, 0, NI_MAXHOST); + strncpy(standbyNodeName, node_name, NI_MAXHOST - 1); + standbyPortNumber = port; + standbyDataDir= (char *)datadir; + elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", + standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); + rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, + standbyNodeName, standbyDataDir, + NODE_DISCONNECTED); + if (rc < 0) + { + elog(DEBUG1, "Failed to register a standby-GTM status."); - return 0; - } + return 0; + } - elog(DEBUG1, "Registering standby-GTM done."); - - return 1; + elog(DEBUG1, "Registering standby-GTM done."); + + return 1; } /* @@ -343,31 +344,31 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir) int gtm_standby_activate_self(void) { - int rc; - - elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"..."); - - rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName); - if (rc < 0) - { - elog(DEBUG1, "Failed to unregister old standby-GTM status."); - return 0; - } - elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", - standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); - rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, - standbyNodeName, standbyDataDir, - NODE_CONNECTED); - - if (rc < 0) - { - elog(DEBUG1, "Failed to register a new standby-GTM status."); - return 0; - } - - elog(DEBUG1, "Updating the standby-GTM status done."); - - return 1; + int rc; + + elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"..."); + + rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName); + if (rc < 0) + { + elog(DEBUG1, "Failed to unregister old standby-GTM status."); + return 0; + } + elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", + standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); + rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, + standbyNodeName, standbyDataDir, + NODE_CONNECTED); + + if (rc < 0) + { + elog(DEBUG1, "Failed to register a new standby-GTM status."); + return 0; + } + + elog(DEBUG1, "Updating the standby-GTM status done."); + + return 1; } @@ -380,35 +381,35 @@ gtm_standby_activate_self(void) GTM_PGXCNodeInfo * find_standby_node_info(void) { - GTM_PGXCNodeInfo *node[1024]; - size_t n; - int i; - - n = pgxcnode_find_by_type(GTM_NODE_GTM, node, 1024); - - for (i = 0 ; i < n ; i++) - { - elog(DEBUG8, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", - node[i]->nodename, - node[i]->type, - node[i]->ipaddress, - node[i]->port, - node[i]->status); - - /* - * Must not try and connect to ourself. That will lead to a deadlock - * - * !!TODO Ideally we should not be registered on the GTM, but when a - * failover happens, the standby may carry forward the node - * registration information previously sent by the original master as a - * backup. 
This needs to be studied further - */ - if (strcmp(node[i]->nodename, NodeName) && - node[i]->status == NODE_CONNECTED) - return node[i]; - } - - return NULL; + GTM_PGXCNodeInfo *node[1024]; + size_t n; + int i; + + n = pgxcnode_find_by_type(GTM_NODE_GTM, node, 1024); + + for (i = 0 ; i < n ; i++) + { + elog(DEBUG8, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", + node[i]->nodename, + node[i]->type, + node[i]->ipaddress, + node[i]->port, + node[i]->status); + + /* + * Must not try and connect to ourself. That will lead to a deadlock + * + * !!TODO Ideally we should not be registered on the GTM, but when a + * failover happens, the standby may carry forward the node + * registration information previously sent by the original master as a + * backup. This needs to be studied further + */ + if (strcmp(node[i]->nodename, NodeName) && + node[i]->status == NODE_CONNECTED) + return node[i]; + } + + return NULL; } @@ -423,100 +424,100 @@ find_standby_node_info(void) GTM_Conn * gtm_standby_connect_to_standby(void) { - GTM_Conn *conn; - int report; + GTM_Conn *conn; + int report; - conn = gtm_standby_connect_to_standby_int(&report); + conn = gtm_standby_connect_to_standby_int(&report); - return conn; + return conn; } #endif GTM_Conn * gtm_connect_to_standby(GTM_PGXCNodeInfo *n,int timeout) { - GTM_Conn *standby = NULL; - char conn_string[1024]; - - elog(DEBUG8, "GTM standby is active. Going to connect."); - - snprintf(conn_string, sizeof(conn_string), - "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", - n->ipaddress, n->port, NodeName, GTM_NODE_GTM, timeout); - - standby = PQconnectGTM(conn_string); - if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) - { - int save_errno = errno; - if(standby) - { - elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); - } - else - { - elog(LOG, "connection is null: %m"); - } - - errno = save_errno; - if(standby) - { - GTMPQfinish(standby); - } - return NULL; - } - - return standby; + GTM_Conn *standby = NULL; + char conn_string[1024]; + + elog(DEBUG8, "GTM standby is active. Going to connect."); + + snprintf(conn_string, sizeof(conn_string), + "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + n->ipaddress, n->port, NodeName, GTM_NODE_GTM, timeout); + + standby = PQconnectGTM(conn_string); + if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) + { + int save_errno = errno; + if(standby) + { + elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); + } + else + { + elog(LOG, "connection is null: %m"); + } + + errno = save_errno; + if(standby) + { + GTMPQfinish(standby); + } + return NULL; + } + + return standby; } #ifndef __XLOG__ static GTM_Conn * gtm_standby_connect_to_standby_int(int *report_needed) { - GTM_Conn *standby = NULL; - GTM_PGXCNodeInfo *n; - char conn_string[1024]; - - *report_needed = 0; - - n = find_standby_node_info(); - if (!n) - { - elog(LOG, "Any GTM standby node not found in registered node(s)."); - return NULL; - } - - elog(DEBUG8, "GTM standby is active. 
Going to connect."); - *report_needed = 1; - - - snprintf(conn_string, sizeof(conn_string), - "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", - n->ipaddress, n->port, NodeName, GTM_NODE_GTM, GTM_Standby_Connetion_Timeout); - - standby = PQconnectGTM(conn_string); - if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) - { - int save_errno = errno; - if(standby) - { - elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); - } - else - { - elog(LOG, "connection is null: %m"); - } - - errno = save_errno; - if(standby) - { - GTMPQfinish(standby); - } - return NULL; - } - - elog(DEBUG8, "Connection established with GTM standby. - %p conn %s socket %d", n, conn_string, standby->sock); - - return standby; + GTM_Conn *standby = NULL; + GTM_PGXCNodeInfo *n; + char conn_string[1024]; + + *report_needed = 0; + + n = find_standby_node_info(); + if (!n) + { + elog(LOG, "Any GTM standby node not found in registered node(s)."); + return NULL; + } + + elog(DEBUG8, "GTM standby is active. Going to connect."); + *report_needed = 1; + + + snprintf(conn_string, sizeof(conn_string), + "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + n->ipaddress, n->port, NodeName, GTM_NODE_GTM, GTM_Standby_Connetion_Timeout); + + standby = PQconnectGTM(conn_string); + if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) + { + int save_errno = errno; + if(standby) + { + elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); + } + else + { + elog(LOG, "connection is null: %m"); + } + + errno = save_errno; + if(standby) + { + GTMPQfinish(standby); + } + return NULL; + } + + elog(DEBUG8, "Connection established with GTM standby. - %p conn %s socket %d", n, conn_string, standby->sock); + + return standby; } #endif @@ -524,10 +525,10 @@ gtm_standby_connect_to_standby_int(int *report_needed) void gtm_standby_disconnect_from_standby(GTM_Conn *conn) { - if (Recovery_IsStandby()) - return; + if (Recovery_IsStandby()) + return; - GTMPQfinish(conn); + GTMPQfinish(conn); } @@ -535,28 +536,28 @@ gtm_standby_disconnect_from_standby(GTM_Conn *conn) GTM_Conn * gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max) { - GTM_Conn *newconn = NULL; - int report; - int i; + GTM_Conn *newconn = NULL; + int report; + int i; - if (Recovery_IsStandby()) - return NULL; + if (Recovery_IsStandby()) + return NULL; - if (old_conn != NULL) - gtm_standby_disconnect_from_standby(old_conn); + if (old_conn != NULL) + gtm_standby_disconnect_from_standby(old_conn); - for (i = 0; i < retry_max; i++) - { - elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); + for (i = 0; i < retry_max; i++) + { + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); - newconn = gtm_standby_connect_to_standby_int(&report); - if (newconn != NULL) - break; + newconn = gtm_standby_connect_to_standby_int(&report); + if (newconn != NULL) + break; - elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); - } + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); + } - return newconn; + return newconn; } #endif @@ -567,96 +568,96 @@ gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max) bool gtm_standby_check_communication_error(Port *myport, int *retry_count, GTM_Conn *oldconn) { - - /* - * This function may be called without result from standby. 
- */ - if (GetMyConnection(myport)->standby->result - && GetMyConnection(myport)->standby->result->gr_status == GTM_RESULT_COMM_ERROR) - { - if (*retry_count == 0) - { - (*retry_count)++; - - GetMyConnection(myport)->standby = - gtm_standby_reconnect_to_standby(GetMyConnection(myport)->standby, - GTM_STANDBY_RETRY_MAX); - - if (GetMyConnection(myport)->standby) - return true; - } - - elog(DEBUG1, "communication error with standby."); - } - return false; + + /* + * This function may be called without result from standby. + */ + if (GetMyConnection(myport)->standby->result + && GetMyConnection(myport)->standby->result->gr_status == GTM_RESULT_COMM_ERROR) + { + if (*retry_count == 0) + { + (*retry_count)++; + + GetMyConnection(myport)->standby = + gtm_standby_reconnect_to_standby(GetMyConnection(myport)->standby, + GTM_STANDBY_RETRY_MAX); + + if (GetMyConnection(myport)->standby) + return true; + } + + elog(DEBUG1, "communication error with standby."); + } + return false; } #endif int gtm_standby_begin_backup(int64 identifier, int64 lsn, GlobalTimestamp gts) { - int rc = set_begin_backup(GTM_ActiveConn, identifier, lsn, gts); - return (rc ? 0 : 1); + int rc = set_begin_backup(GTM_ActiveConn, identifier, lsn, gts); + return (rc ? 0 : 1); } int gtm_standby_end_backup(void) { - int rc = set_end_backup(GTM_ActiveConn, false); - - return (rc ? 0 : 1); + int rc = set_end_backup(GTM_ActiveConn, false); + + return (rc ? 0 : 1); } int gtm_standby_start_replication(const char *application_name) { - char ip_port[NI_MAXHOST]; + char ip_port[NI_MAXHOST]; int rc = 0; - int i = 0; - int len = 0; + int i = 0; + int len = 0; - if(strlen(application_name) == 0) - { - node_get_local_addr(GTM_ActiveConn, ip_port, NI_MAXHOST, &rc); + if(strlen(application_name) == 0) + { + node_get_local_addr(GTM_ActiveConn, ip_port, NI_MAXHOST, &rc); - len = strlen(ip_port); + len = strlen(ip_port); - snprintf(ip_port + len,NI_MAXHOST - len,":%d",GTMPortNumber); + snprintf(ip_port + len,NI_MAXHOST - len,":%d",GTMPortNumber); - for(i = 0; i < len ; i++) - { - if(ip_port[i] == '_') - ip_port[i] = '.'; - } + for(i = 0; i < len ; i++) + { + if(ip_port[i] == '_') + ip_port[i] = '.'; + } - return set_begin_replication(GTM_ActiveConn,ip_port,NodeName); - } + return set_begin_replication(GTM_ActiveConn,ip_port,NodeName); + } return set_begin_replication(GTM_ActiveConn,application_name,NodeName); } -extern char *NodeName; /* Defined in main.c */ +extern char *NodeName; /* Defined in main.c */ void gtm_standby_finishActiveConn(void) { - - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); - if (GTM_ActiveConn == NULL) - { - elog(DEBUG3, "Error in connection"); - return; - } - elog(DEBUG1, "Connection established to the GTM active."); - - /* Unregister self from Active-GTM */ - node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName); - /* Disconnect form Active */ - GTMPQfinish(GTM_ActiveConn); - + + GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + if (GTM_ActiveConn == NULL) + { + elog(DEBUG3, "Error in connection"); + return; + } + elog(DEBUG1, "Connection established to the GTM active."); + + /* Unregister self from Active-GTM */ + node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName); + /* Disconnect form Active */ + GTMPQfinish(GTM_ActiveConn); + #ifdef __TBASE__ - GTM_ActiveConn = NULL; + GTM_ActiveConn = NULL; #endif } @@ -664,17 +665,17 @@ gtm_standby_finishActiveConn(void) static GTM_Conn * gtm_standby_connectToActiveGTM(void) { - char connect_string[1024]; - int active_port = Recovery_StandbyGetActivePort(); - char 
*active_address = Recovery_StandbyGetActiveAddress(); + char connect_string[1024]; + int active_port = Recovery_StandbyGetActivePort(); + char *active_address = Recovery_StandbyGetActiveAddress(); - /* Need to connect to Active-GTM again here */ - elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); + /* Need to connect to Active-GTM again here */ + elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); - sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", - active_address, active_port, NodeName, GTM_NODE_GTM); + sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", + active_address, active_port, NodeName, GTM_NODE_GTM); - return PQconnectGTM(connect_string); + return PQconnectGTM(connect_string); } #ifdef __TBASE__ /* @@ -682,59 +683,59 @@ gtm_standby_connectToActiveGTM(void) */ int32 GTM_StoreStandbyInitFromMaster(char *data_dir) { - int32 ret = 0; - size_t size = 0; - char *data = NULL; - - if (NULL == data_dir) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster invalid null parameter"); - return GTM_STORE_ERROR; - } - - if (enable_gtm_sequence_debug) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster begin"); - } - - size = (uint32)get_storage_file(GTM_ActiveConn, &data,&XLogCtl->apply,&XLogCtl->thisTimeLineID); - if (-1 == size) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster get_storage_file failed"); - return GTM_STORE_ERROR; - } - - ret = GTM_StoreStandbyInit(data_dir, data, (uint32)size); - if (ret) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster GTM_StoreStandbyInit failed"); - return GTM_STORE_ERROR; - } + int32 ret = 0; + size_t size = 0; + char *data = NULL; + + if (NULL == data_dir) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster invalid null parameter"); + return GTM_STORE_ERROR; + } + + if (enable_gtm_sequence_debug) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster begin"); + } + + size = (uint32)get_storage_file(GTM_ActiveConn, &data,&XLogCtl->apply,&XLogCtl->thisTimeLineID); + if (-1 == size) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster get_storage_file failed"); + return GTM_STORE_ERROR; + } + + ret = GTM_StoreStandbyInit(data_dir, data, (uint32)size); + if (ret) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster GTM_StoreStandbyInit failed"); + return GTM_STORE_ERROR; + } /* we transfer data from the beginning of xlog */ - XLogCtl->LogwrtResult.Write = XLogCtl->LogwrtResult.Flush = XLogCtl->apply - (XLogCtl->apply % GTM_XLOG_SEG_SIZE); - NewXLogFile(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); + XLogCtl->LogwrtResult.Write = XLogCtl->LogwrtResult.Flush = XLogCtl->apply - (XLogCtl->apply % GTM_XLOG_SEG_SIZE); + NewXLogFile(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); ControlData->checkPoint = XLogCtl->apply; ControlData->prevCheckPoint = InvalidXLogRecPtr; - ControlData->thisTimeLineID = XLogCtl->thisTimeLineID; - ControlData->gts = g_GTM_Store_Header->m_next_gts; - ControlData->time = time(NULL); + ControlData->thisTimeLineID = XLogCtl->thisTimeLineID; + ControlData->gts = g_GTM_Store_Header->m_next_gts; + ControlData->time = time(NULL); - ControlDataSync(false); + ControlDataSync(false); AddBackupLabel(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); - elog(LOG,"Get start replication at %X/%X,timeLine: %d", - (uint32_t)(XLogCtl->LogwrtResult.Flush>>32), - (uint32_t)(XLogCtl->LogwrtResult.Flush), - XLogCtl->thisTimeLineID); + elog(LOG,"Get start replication at %X/%X,timeLine: %d", + (uint32_t)(XLogCtl->LogwrtResult.Flush>>32), + (uint32_t)(XLogCtl->LogwrtResult.Flush), + 
XLogCtl->thisTimeLineID); - if (enable_gtm_sequence_debug) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster done"); - } - return GTM_STORE_OK; + if (enable_gtm_sequence_debug) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster done"); + } + return GTM_STORE_OK; } static void diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 81fa43c6..e15bcbf6 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -181,6 +181,8 @@ bool isGTM = true; GTM_ThreadID TopMostThreadID; +enum GTM_PromoteStatus promote_status = GTM_PRPMOTE_INIT; +s_lock_t promote_status_lck; /* The socket(s) we're listening to. */ #define MAXLISTEN 64 @@ -1000,6 +1002,38 @@ main(int argc, char *argv[]) Recovery_StandbySetConnInfo(active_addr, active_port); } + SpinLockInit(&promote_status_lck); + promote_status = GTM_PRPMOTE_INIT; + + pqsignal(SIGHUP, GTM_SigleHandler); + pqsignal(SIGKILL, GTM_SigleHandler); + pqsignal(SIGQUIT, GTM_SigleHandler); + pqsignal(SIGTERM, GTM_SigleHandler); + pqsignal(SIGINT, GTM_SigleHandler); + pqsignal(SIGUSR1, GTM_SigleHandler); + pqsignal(SIGPIPE, SIG_IGN); + + pqinitmask(); + + /* + * Establish a connection between the active and standby. + */ + while (Recovery_IsStandby()) + { + if (gtm_standby_start_startup()) + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + break; + } + + elog(LOG, "Failed to establish a connection to active-GTM."); + usleep(GTM_GTS_ONE_SECOND); + } + + SpinLockAcquire(&promote_status_lck); + promote_status = GTM_PRPMOTE_CONNED; + SpinLockRelease(&promote_status_lck); + #ifdef __XLOG__ if(access(RECOVERY_CONF_NAME,F_OK) == 0) @@ -1053,20 +1087,6 @@ main(int argc, char *argv[]) #endif - /* - * Establish a connection between the active and standby. - */ - if (Recovery_IsStandby()) - { - - if (!gtm_standby_start_startup()) - { - elog(ERROR, "Failed to establish a connection to active-GTM."); - exit(1); - } - elog(LOG, "Standby GTM Startup connection established with active-GTM."); - } - #ifdef __TBASE__ elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); #else @@ -1279,15 +1299,21 @@ main(int argc, char *argv[]) if (!CreateOptsFile(argc, argv)) exit(1); - pqsignal(SIGHUP, GTM_SigleHandler); - pqsignal(SIGKILL, GTM_SigleHandler); - pqsignal(SIGQUIT, GTM_SigleHandler); - pqsignal(SIGTERM, GTM_SigleHandler); - pqsignal(SIGINT, GTM_SigleHandler); - pqsignal(SIGUSR1, GTM_SigleHandler); - pqsignal(SIGPIPE, SIG_IGN); + SpinLockAcquire(&promote_status_lck); + /* + * GTM_PRPMOTE_IN_STARTUP is setting in PromoteToActive, + * do CurrentTimeLineID++ here. + */ + if (promote_status == GTM_PRPMOTE_IN_STARTUP) + { + SetCurrentTimeLineID(GetCurrentTimeLineID() + 1); + } - pqinitmask(); + /* + * set promote_status to GTM_PRPMOTE_NORMAL finally + */ + promote_status = GTM_PRPMOTE_NORMAL; + SpinLockRelease(&promote_status_lck); /* * Now, activating a standby GTM... @@ -2942,7 +2968,10 @@ GTM_ThreadWalReceiver(void *argp) sleep(1); - gtm_standby_start_startup(); + if (!gtm_standby_start_startup()) + { + elog(ERROR, "Failed to establish a connection to active-GTM."); + } if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK || gtm_standby_register_self(NULL,0,NULL) == 0 || @@ -4868,10 +4897,29 @@ PromoteToActive(void) */ // GTM_SetInitialAndNextClientIdentifierAtPromote(); + SpinLockAcquire(&promote_status_lck); + if (promote_status != GTM_PRPMOTE_INIT && promote_status != GTM_PRPMOTE_NORMAL) + { + elog(LOG, "Promote signal received. 
But not allow to promote, promote status %d", promote_status); + SpinLockRelease(&promote_status_lck); + return; + } + /* * Do promoting things here. + * if promote_status is GTM_PRPMOTE_INIT, should use the CurrentTimeLineID to Recovery or OpenXLogFile, + * keep it's value here */ + if (promote_status == GTM_PRPMOTE_NORMAL) + { SetCurrentTimeLineID(GetCurrentTimeLineID() + 1); + } + else + { + promote_status = GTM_PRPMOTE_IN_STARTUP; + } + SpinLockRelease(&promote_status_lck); + Recovery_StandbySetStandby(false); StartupThreadAfterPromote(); CreateDataDirLockFile(); diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h index cdc056ef..01a037b7 100644 --- a/src/include/gtm/gtm_standby.h +++ b/src/include/gtm/gtm_standby.h @@ -67,5 +67,12 @@ extern int gtm_standby_start_replication(const char *application_name); #define GTM_ACT_MODE 0 #define GTM_STANDBY_MODE 1 +enum GTM_PromoteStatus +{ + GTM_PRPMOTE_INIT = 0, + GTM_PRPMOTE_IN_STARTUP = 1, + GTM_PRPMOTE_CONNED = 2, + GTM_PRPMOTE_NORMAL = 3, +}; #endif /* GTM_STANDBY_H */ From 1548ccf168042ac2fb4c61d66afcb8695c838e82 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 9 Mar 2021 10:09:46 +0800 Subject: [PATCH 335/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 --- src/gtm/main/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index e15bcbf6..63e0951a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -1031,7 +1031,10 @@ main(int argc, char *argv[]) } SpinLockAcquire(&promote_status_lck); + if (promote_status == GTM_PRPMOTE_INIT) + { promote_status = GTM_PRPMOTE_CONNED; + } SpinLockRelease(&promote_status_lck); #ifdef __XLOG__ From 4d36953e9bb717cb1c33fdd1dd3201f8e20f9819 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 9 Mar 2021 10:15:52 +0800 Subject: [PATCH 336/578] remove unsed para --- src/backend/utils/mmgr/dsa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 9a6d036d..f7f11c06 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2255,7 +2255,6 @@ check_for_freed_segments(dsa_area *area) LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); check_for_freed_segments_locked(area); LWLockRelease(DSA_AREA_LOCK(area)); - area->freed_segment_counter = freed_segment_counter; } } From c756b84d17ff336a6955288b835587688a0b0f81 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 15 Mar 2021 16:14:19 +0800 Subject: [PATCH 337/578] fix gtm http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 (merge request !213) --- src/gtm/main/gtm_standby.c | 1 + src/gtm/main/gtm_txn.c | 42 +++++ src/gtm/main/gtm_xlog.c | 5 +- src/gtm/main/main.c | 354 +++++++++++++++++++++++++++++++------ src/include/gtm/gtm_txn.h | 271 ++++++++++++++-------------- 5 files changed, 479 insertions(+), 194 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index e84cca5d..f441c889 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -76,6 +76,7 @@ gtm_standby_start_startup(void) errno = save_errno; if(GTM_ActiveConn) GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; return 0; } diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 1a8f83f7..bc91470f 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -1563,6 +1563,48 @@ ProcessCheckGTMCommand(Port *myport, StringInfo message) pq_endmessage(myport, &buf); pq_flush(myport); } + + +/* + * Check gtm slave status by acquiring gts. 
+ */ +void +ProcessStandbyPreCheckGTMCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + int is_master = 0; + GTM_Timestamp master_timestamp = InvalidGTS; + int standby_count = 0; + XLogRecPtr flush_ptr; + + /* read timeout message */ + pq_getmsgint(message,sizeof(int)); + pq_getmsgend(message); + + if (myport->remote_type != GTM_NODE_GTM_CTL) + { + /* standby node only handle GTS request from gtm_ctl*/ + elog(ERROR, "check gtm command is supposed to be fired only by gtm or gtm_ctl!!"); + } + + /* get static gts from ControlData */ + GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); + master_timestamp = ControlData->gts; + GTM_RWLockRelease(&ControlDataLock); + + flush_ptr = GetCurrentXLogwrtResult().Flush; + is_master = Recovery_IsStandby(); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_CHECK_GTM_STATUS_RESULT, 4); + pq_sendbytes(&buf, (char *) &is_master, sizeof(is_master)); + pq_sendbytes(&buf, (char *) &master_timestamp, sizeof(GTM_Timestamp)); + pq_sendint64(&buf, flush_ptr); + pq_sendint(&buf, standby_count, sizeof(int)); + + pq_endmessage(myport, &buf); + pq_flush(myport); +} #endif /* * Process MSG_TXN_BEGIN_GETGXID message diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index 492fa491..ec3d7c66 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -133,6 +133,7 @@ extern int GTMStartupGTSDelta; static bool g_recovery_finish; static bool *g_GTMStoreDirtyMap; static GTM_MutexLock g_CheckPointLock; +extern enum GTM_PromoteStatus promote_status; XLogCtlData *XLogCtl; XLogSyncStandby *XLogSync; @@ -2344,7 +2345,7 @@ RedoRangeOverwrite(XLogCmdRangerOverWrite *cmd) if(enalbe_gtm_xlog_debug) PrintRedoRangeOverwrite(cmd); - if(Recovery_IsStandby() && recovery_pitr_mode == false) + if(Recovery_IsStandby() && recovery_pitr_mode == false && promote_status == GTM_PRPMOTE_NORMAL) { memcpy(g_GTMStoreMapAddr + cmd->offset,cmd->data,cmd->bytes); @@ -2386,7 +2387,7 @@ RedoCheckPoint(XLogCmdCheckPoint *cmd,XLogRecPtr pos) SetCurrentTimeLineID(cmd->timeline); - if(Recovery_IsStandby() && recovery_pitr_mode == false) + if(Recovery_IsStandby() && recovery_pitr_mode == false && promote_status == GTM_PRPMOTE_NORMAL) DoCheckPoint(false); return sizeof(XLogCmdCheckPoint); diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 63e0951a..113cee78 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -140,6 +140,7 @@ GTM_ThreadInfo *g_timer_thread = NULL; GTM_ThreadInfo *g_logcollector_thread = NULL; void *GTM_ThreadLogCollector(void *argp); extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); +GTM_ThreadInfo *g_standby_pre_server_thread = NULL; #ifdef __XLOG__ GTM_ThreadInfo *g_basebackup_thread = NULL; @@ -245,6 +246,8 @@ static void ProcessSyncStandbyCommand(Port *myport, GTM_MessageType mtype, Strin static void ProcessBarrierCommand(Port *myport, GTM_MessageType mtype, StringInfo message); static int GTMInitConnection(GTM_ConnectionInfo *conninfo); +static void SetNonBlockConnection(GTM_ConnectionInfo *conninfo); +static void gtm_standby_pre_server_loop(const char *data_dir); #ifdef __XLOG__ static void thread_replication_clean(GTM_StandbyReplication *replication); @@ -604,6 +607,26 @@ static bool CheckClockSource(void) #endif +static void GTM_XLogRecoveryIfNeed(const char *data_dir) +{ + Assert(ControlData != NULL); + + switch(ControlData->state) + { + case DB_SHUTDOWNED_IN_RECOVERY: + case DB_SHUTDOWNING: + case DB_STARTUP: + case DB_IN_CRASH_RECOVERY: + case DB_IN_ARCHIVE_RECOVERY: + case DB_IN_PRODUCTION: + 
elog(LOG, "Detect GTM server crash."); + GTM_XLogRecovery(ControlData->checkPoint,data_dir); + break; + case DB_SHUTDOWNED: + break; + } +} + int main(int argc, char *argv[]) {// #lizard forgives @@ -659,6 +682,8 @@ main(int argc, char *argv[]) int util_thread_cnt = 0; isStartUp = true; + SpinLockInit(&promote_status_lck); + promote_status = GTM_PRPMOTE_INIT; /* * At first, initialize options. Also moved something from BaseInit() here. @@ -1002,41 +1027,6 @@ main(int argc, char *argv[]) Recovery_StandbySetConnInfo(active_addr, active_port); } - SpinLockInit(&promote_status_lck); - promote_status = GTM_PRPMOTE_INIT; - - pqsignal(SIGHUP, GTM_SigleHandler); - pqsignal(SIGKILL, GTM_SigleHandler); - pqsignal(SIGQUIT, GTM_SigleHandler); - pqsignal(SIGTERM, GTM_SigleHandler); - pqsignal(SIGINT, GTM_SigleHandler); - pqsignal(SIGUSR1, GTM_SigleHandler); - pqsignal(SIGPIPE, SIG_IGN); - - pqinitmask(); - - /* - * Establish a connection between the active and standby. - */ - while (Recovery_IsStandby()) - { - if (gtm_standby_start_startup()) - { - elog(LOG, "Standby GTM Startup connection established with active-GTM."); - break; - } - - elog(LOG, "Failed to establish a connection to active-GTM."); - usleep(GTM_GTS_ONE_SECOND); - } - - SpinLockAcquire(&promote_status_lck); - if (promote_status == GTM_PRPMOTE_INIT) - { - promote_status = GTM_PRPMOTE_CONNED; - } - SpinLockRelease(&promote_status_lck); - #ifdef __XLOG__ if(access(RECOVERY_CONF_NAME,F_OK) == 0) @@ -1061,24 +1051,61 @@ main(int argc, char *argv[]) if(Recovery_IsStandby() == false) { - Assert(ControlData != NULL); + GTM_XLogRecoveryIfNeed(data_dir); + } + +#endif + +#ifdef __TBASE__ + elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); +#else + elog(LOG, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile); +#endif + + g_max_lock_number = 6000; + pqsignal(SIGHUP, GTM_SigleHandler); + pqsignal(SIGKILL, GTM_SigleHandler); + pqsignal(SIGQUIT, GTM_SigleHandler); + pqsignal(SIGTERM, GTM_SigleHandler); + pqsignal(SIGINT, GTM_SigleHandler); + pqsignal(SIGUSR1, GTM_SigleHandler); + pqsignal(SIGPIPE, SIG_IGN); - switch(ControlData->state) + pqinitmask(); + + /* + * Establish a connection between the active and standby. + */ + if (Recovery_IsStandby()) + { + if (!gtm_standby_start_startup()) { - case DB_SHUTDOWNED_IN_RECOVERY: - case DB_SHUTDOWNING: - case DB_STARTUP: - case DB_IN_CRASH_RECOVERY: - case DB_IN_ARCHIVE_RECOVERY: - case DB_IN_PRODUCTION: - elog(LOG, "Detect GTM server crash."); - GTM_XLogRecovery(ControlData->checkPoint,data_dir); - break; - case DB_SHUTDOWNED: - break; +#ifdef __TBASE__ + elog(LOG, "Failed to establish a connection to active-GTM."); + + /* + * if failed to establish a connection to active-GTM, just + * retry, but support the check status command. 
+ */ + gtm_standby_pre_server_loop(data_dir); +#else + elog(ERROR, "Failed to establish a connection to active-GTM."); +#endif } + else + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + } + } + + SpinLockAcquire(&promote_status_lck); + if (promote_status == GTM_PRPMOTE_INIT) + { + promote_status = GTM_PRPMOTE_CONNED; } + SpinLockRelease(&promote_status_lck); +#ifdef __XLOG__ GTM_XLogFileInit(data_dir); GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); @@ -1090,12 +1117,6 @@ main(int argc, char *argv[]) #endif -#ifdef __TBASE__ - elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); -#else - elog(LOG, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile); -#endif - /* * Read the last GXID and start from there */ @@ -1109,6 +1130,8 @@ main(int argc, char *argv[]) GlobalTimestamp gts = 0; int max_retry_times = 10; + system("rm -rf gtm_xlog/*"); + bret = GTM_StoreGetSysInfo(&identifier, &lsn, >s); if (!bret) { @@ -1434,8 +1457,6 @@ main(int argc, char *argv[]) process_thread_num = g_max_thread_number < process_thread_num ? g_max_thread_number : process_thread_num; } - g_max_lock_number = 6000; - /* Create GTM threads handling requests */ g_timekeeper_thread = GTM_ThreadCreate(GTM_ThreadTimeKeeper, g_max_lock_number); if (NULL == g_timekeeper_thread) @@ -1521,7 +1542,11 @@ main(int argc, char *argv[]) util_thread_cnt++; } - for(i = 0; i < process_thread_num; i++) + /* + * maybe one GTM_ThreadMain create as g_standby_pre_server_thread before + */ + i = (g_standby_pre_server_thread == NULL) ? 0 : 1; + for(; i < process_thread_num; i++) { elog(DEBUG8, "Create thread %d.\n", i); if (NULL == GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number)) @@ -1807,6 +1832,206 @@ ServerLoop(void) } } +/* + * add connection into g_standby_pre_server_thread + */ +static int +gtm_add_connection_standby_pre_server(Port *port) +{ + GTM_ConnectionInfo *conninfo = NULL; + struct epoll_event event; + + if (!g_standby_pre_server_thread->thr_epoll_ok) + { + elog(LOG, "g_standby_pre_server_thread epoll not ready."); + return STATUS_ERROR; + } + + conninfo = (GTM_ConnectionInfo *)palloc0(sizeof (GTM_ConnectionInfo)); + conninfo->con_port = port; + conninfo->con_init = false; + port->conn = conninfo; + + /* Set conn to non-blocking mode for epoll wait */ + SetNonBlockConnection(conninfo); + + conninfo->con_thrinfo = g_standby_pre_server_thread; + event.data.ptr = conninfo; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (g_standby_pre_server_thread->thr_efd, EPOLL_CTL_ADD, conninfo->con_port->sock, &event)) + { + elog(LOG, "failed to add socket to epoll"); + return STATUS_ERROR; + } + + return STATUS_OK; +} + +/* + * handle loop before establish a connection to active-GTM + */ +static void +gtm_standby_pre_server_loop(const char *data_dir) +{ + fd_set readmask; + int nSockets; + sigjmp_buf local_sigjmp_buf; + + /* + * recovery here first + */ + GTM_XLogRecoveryIfNeed(data_dir); + + /* + * start GTM_ThreadMain to support get gtm status command + */ + g_standby_pre_server_thread = GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number); + if (NULL == g_standby_pre_server_thread) + { + elog(LOG, "Failed to create standby_pre_server_thread thread"); + exit(1); + } + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + RWLockCleanUp(); + /* Report the error to the server log */ + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * 
next time. + */ + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + nSockets = initMasks(&readmask); + while (Recovery_IsStandby()) + { + fd_set rmask; + int selres; + + /* + * Wait for a connection request to arrive. + * + * We wait at most one minute, to ensure that the other background + * tasks handled below get done even when no requests are arriving. + */ + memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set)); + + PG_SETMASK(&UnBlockSig); + + /* if timekeeper thread exit, main thread should prepare to exit. */ + if (GTMAbortPending) + { + /* + * XXX We should do a clean shutdown here. For the time being, just + * write the next GXID to be issued in the control file and exit + * gracefully + */ + + elog(LOG, "GTM shutting down."); + + /* + * Tell GTM that we are shutting down so that no new GXIDs are + * issued this point onwards + */ + GTM_SetShuttingDown(); + + GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); + ControlData->state = DB_SHUTDOWNED; + ControlDataSync(false); + GTM_RWLockRelease(&ControlDataLock); + + /* Delete pid file */ + DeleteLockFile(GTM_PID_FILE); +#ifdef HAVE_UNIX_SOCKETS + RemoveSocketFile(); +#endif + elog(LOG, "GTM exits"); + exit(1); + } + + { + /* must set timeout each time; some OSes change it! */ + struct timeval timeout; + + timeout.tv_sec = 5; + timeout.tv_usec = 0; + + selres = select(nSockets, &rmask, NULL, NULL, &timeout); + } + + /* + * Block all signals until we wait again. (This makes it safe for our + * signal handlers to do nontrivial work.) + */ + PG_SETMASK(&BlockSig); + + /* Now check the select() result */ + if (selres < 0) + { + if (errno != EINTR && errno != EWOULDBLOCK) + { + ereport(LOG, + (EACCES, + errmsg("select() failed in main thread: %m"))); + exit(1); + } + } + + /* + * New connection pending on any of our sockets? If so, fork a child + * process to deal with it. + */ + if (selres > 0) + { + int i; + + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] == -1) + { + break; + } + + if (FD_ISSET(ListenSocket[i], &rmask)) + { + Port *port; + + port = ConnCreate(ListenSocket[i]); + if (port) + { + if (gtm_add_connection_standby_pre_server(port) != STATUS_OK) + { + StreamClose(port->sock); + ConnFree(port); + } + } + } + } + } + + /* + * retry establish a connection between the active and standby, + * controlling frequency with select timeout + */ + if (gtm_standby_start_startup()) + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + break; + } + elog(LOG, "Failed to establish a connection to active-GTM."); + } + + /* + * clear exception stack here + */ + PG_exception_stack = NULL; +} /* * Initialise the masks for select() for the ports we are listening on. @@ -2967,7 +3192,10 @@ GTM_ThreadWalReceiver(void *argp) goto promote; if(GTM_ActiveConn) + { GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; + } sleep(1); @@ -3619,7 +3847,18 @@ ProcessCommand(Port *myport, StringInfo input_message) * compile option. 
*/ elog(DEBUG1, "mtype = %s (%d).", gtm_util_message_name(mtype), (int)mtype); + #ifdef __TBASE__ + if (promote_status != GTM_PRPMOTE_NORMAL) + { + if (mtype != MSG_CHECK_GTM_STATUS) + { + elog(ERROR, "standby gtm only support get gtm status command before establish a connection to active-GTM or promote, mtype = %s (%d).", gtm_util_message_name(mtype), (int)mtype); + } + + return ProcessStandbyPreCheckGTMCommand(myport, input_message); + } + start_time = getSystemTime(); /* * Get Timestamp does not need to sync with standby @@ -4054,6 +4293,7 @@ GTMAddConnection(Port *port, GTM_Conn *standby) if(-1 == epoll_ctl (thrinfo->thr_efd, EPOLL_CTL_ADD, conninfo->con_port->sock, &event)) { elog(LOG, "failed to add socket to epoll"); + return STATUS_ERROR; } break; } diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h index b84c97db..0dc754ba 100644 --- a/src/include/gtm/gtm_txn.h +++ b/src/include/gtm/gtm_txn.h @@ -41,11 +41,11 @@ extern void GlobalTransactionIdAbort(GlobalTransactionId transactionId); /* in transam/varsup.c */ extern GlobalTransactionId GTM_GetGlobalTransactionId(GTM_TransactionHandle handle); extern bool GTM_GetGlobalTransactionIdMulti( - GTM_TransactionHandle handle[], - int txn_count, - GlobalTransactionId gxids[], - GTM_TransactionHandle new_handle[], - int *new_txn_count); + GTM_TransactionHandle handle[], + int txn_count, + GlobalTransactionId gxids[], + GTM_TransactionHandle new_handle[], + int *new_txn_count); extern GlobalTransactionId ReadNewGlobalTransactionId(void); extern GlobalTransactionId GTM_GetLatestCompletedXID(void); extern void SetGlobalTransactionIdLimit(GlobalTransactionId oldest_datfrozenxid); @@ -72,118 +72,118 @@ extern void GTM_WriteRestorePointXid(FILE *f); typedef enum GTM_States { - GTM_STARTING, - GTM_RUNNING, - GTM_SHUTTING_DOWN + GTM_STARTING, + GTM_RUNNING, + GTM_SHUTTING_DOWN } GTM_States; /* Global transaction states at the GTM */ typedef enum GTM_TransactionStates { - GTM_TXN_INIT, - GTM_TXN_STARTING, - GTM_TXN_IN_PROGRESS, - GTM_TXN_PREPARE_IN_PROGRESS, - GTM_TXN_PREPARED, - GTM_TXN_COMMIT_IN_PROGRESS, - GTM_TXN_COMMITTED, - GTM_TXN_ABORT_IN_PROGRESS, - GTM_TXN_ABORTED, - GTM_TXN_IMPLICATE_PREPARED + GTM_TXN_INIT, + GTM_TXN_STARTING, + GTM_TXN_IN_PROGRESS, + GTM_TXN_PREPARE_IN_PROGRESS, + GTM_TXN_PREPARED, + GTM_TXN_COMMIT_IN_PROGRESS, + GTM_TXN_COMMITTED, + GTM_TXN_ABORT_IN_PROGRESS, + GTM_TXN_ABORTED, + GTM_TXN_IMPLICATE_PREPARED } GTM_TransactionStates; typedef struct GTM_TransactionInfo { - GTM_TransactionHandle gti_handle; - uint32 gti_client_id; - char gti_global_session_id[GTM_MAX_SESSION_ID_LEN]; - bool gti_in_use; - GlobalTransactionId gti_gxid; - GTM_TransactionStates gti_state; - GlobalTransactionId gti_xmin; - GTM_IsolationLevel gti_isolevel; - bool gti_readonly; - GTMProxy_ConnID gti_proxy_client_id; - char *nodestring; /* List of nodes prepared */ - char *gti_gid; - - GTM_SnapshotData gti_current_snapshot; - bool gti_snapshot_set; - - GTM_RWLock gti_lock; - bool gti_vacuum; - gtm_List *gti_created_seqs; - gtm_List *gti_dropped_seqs; - gtm_List *gti_altered_seqs; + GTM_TransactionHandle gti_handle; + uint32 gti_client_id; + char gti_global_session_id[GTM_MAX_SESSION_ID_LEN]; + bool gti_in_use; + GlobalTransactionId gti_gxid; + GTM_TransactionStates gti_state; + GlobalTransactionId gti_xmin; + GTM_IsolationLevel gti_isolevel; + bool gti_readonly; + GTMProxy_ConnID gti_proxy_client_id; + char *nodestring; /* List of nodes prepared */ + char *gti_gid; + + GTM_SnapshotData gti_current_snapshot; + bool gti_snapshot_set; 
+ + GTM_RWLock gti_lock; + bool gti_vacuum; + gtm_List *gti_created_seqs; + gtm_List *gti_dropped_seqs; + gtm_List *gti_altered_seqs; } GTM_TransactionInfo; -#define GTM_MAX_2PC_NODES 16 +#define GTM_MAX_2PC_NODES 16 /* By default a GID length is limited to 256 bits in PostgreSQL */ -#define GTM_MAX_GID_LEN 256 -#define GTM_MAX_NODESTRING_LEN 1024 -#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS) -#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE) +#define GTM_MAX_GID_LEN 256 +#define GTM_MAX_NODESTRING_LEN 1024 +#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS) +#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE) #define GTM_MAX_THREADS 512 #define CACHE_LINE_SIZE 64 typedef union rw_lock { - int lock; - char padding[CACHE_LINE_SIZE]; + int lock; + char padding[CACHE_LINE_SIZE]; } RW_lock; typedef struct GTM_Transactions { - uint32 gt_txn_count; - GTM_States gt_gtm_state; - - GTM_RWLock gt_XidGenLock; - - /* - * These fields are protected by XidGenLock - */ - GlobalTransactionId gt_nextXid; /* next XID to assign */ - GlobalTransactionId gt_backedUpXid; /* backed up, restoration point */ - - GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */ - GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */ - GlobalTransactionId gt_xidWarnLimit; /* start complaining here */ - GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */ - GlobalTransactionId gt_xidWrapLimit; /* where the world ends */ - - /* - * These fields are protected by TransArrayLock. - */ - GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or - * aborted */ - - GlobalTransactionId gt_recent_global_xmin; - - int32 gt_lastslot; - GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS]; - gtm_List *gt_open_transactions; - - GTM_RWLock gt_TransArrayLock; - pg_atomic_uint32 gt_global_xid; - - GlobalTimestamp gt_last_cycle; - GlobalTimestamp gt_global_timestamp; - /* For debug purpose */ - GlobalTimestamp gt_last_issue_timestamp; - GlobalTimestamp gt_last_raw_timestamp; - GlobalTimestamp gt_last_last_cycle; - GlobalTimestamp gt_last_global_timestamp; - GlobalTimestamp gt_last_tv_sec; - GlobalTimestamp gt_last_tv_nsec; - pg_atomic_uint64 gt_access_ts_seq; - pg_atomic_uint64 gt_last_access_ts_seq; - - RW_lock gt_in_locking[GTM_MAX_THREADS]; + uint32 gt_txn_count; + GTM_States gt_gtm_state; + + GTM_RWLock gt_XidGenLock; + + /* + * These fields are protected by XidGenLock + */ + GlobalTransactionId gt_nextXid; /* next XID to assign */ + GlobalTransactionId gt_backedUpXid; /* backed up, restoration point */ + + GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */ + GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */ + GlobalTransactionId gt_xidWarnLimit; /* start complaining here */ + GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */ + GlobalTransactionId gt_xidWrapLimit; /* where the world ends */ + + /* + * These fields are protected by TransArrayLock. 
+ */ + GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or + * aborted */ + + GlobalTransactionId gt_recent_global_xmin; + + int32 gt_lastslot; + GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS]; + gtm_List *gt_open_transactions; + + GTM_RWLock gt_TransArrayLock; + pg_atomic_uint32 gt_global_xid; + + GlobalTimestamp gt_last_cycle; + GlobalTimestamp gt_global_timestamp; + /* For debug purpose */ + GlobalTimestamp gt_last_issue_timestamp; + GlobalTimestamp gt_last_raw_timestamp; + GlobalTimestamp gt_last_last_cycle; + GlobalTimestamp gt_last_global_timestamp; + GlobalTimestamp gt_last_tv_sec; + GlobalTimestamp gt_last_tv_nsec; + pg_atomic_uint64 gt_access_ts_seq; + pg_atomic_uint64 gt_last_access_ts_seq; + + RW_lock gt_in_locking[GTM_MAX_THREADS]; } GTM_Transactions; -extern GTM_Transactions GTMTransactions; +extern GTM_Transactions GTMTransactions; /* NOTE: This macro should be used with READ lock held on gt_TransArrayLock! */ -#define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions)) +#define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions)) /* * Two hash tables will be maintained to quickly find the @@ -198,53 +198,53 @@ bool GTM_IsGXIDInProgress(GlobalTransactionId gxid); /* Transaction Control */ void GTM_InitTxnManager(void); GTM_TransactionHandle GTM_BeginTransaction(GTM_IsolationLevel isolevel, - bool readonly, - const char *global_sessionid); + bool readonly, + const char *global_sessionid); int GTM_BeginTransactionMulti(GTM_IsolationLevel isolevel[], - bool readonly[], - const char *global_sessionid[], - GTMProxy_ConnID connid[], - int txn_count, - GTM_TransactionHandle txns[]); + bool readonly[], + const char *global_sessionid[], + GTMProxy_ConnID connid[], + int txn_count, + GTM_TransactionHandle txns[]); int GTM_RollbackTransaction(GTM_TransactionHandle txn); int GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]); int GTM_RollbackTransactionGXID(GlobalTransactionId gxid); int GTM_CommitTransaction(GTM_TransactionHandle txn, - int waited_xid_count, GlobalTransactionId *waited_xids); + int waited_xid_count, GlobalTransactionId *waited_xids); int GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, - int waited_xid_count, GlobalTransactionId *waited_xids, - int status[]); + int waited_xid_count, GlobalTransactionId *waited_xids, + int status[]); int GTM_CommitTransactionGXID(GlobalTransactionId gxid); int GTM_PrepareTransaction(GTM_TransactionHandle txn); int GTM_StartPreparedTransaction(GTM_TransactionHandle txn, - char *gid, - char *nodestring); + char *gid, + char *nodestring); int -GTM_LogTransaction( GlobalTransactionId gxid, - const char *gid, - const char *nodestring, - int node_count, - int isGlobal, - int isCommit, - GlobalTimestamp prepare_ts, - GlobalTimestamp commit_ts); +GTM_LogTransaction( GlobalTransactionId gxid, + const char *gid, + const char *nodestring, + int node_count, + int isGlobal, + int isCommit, + GlobalTimestamp prepare_ts, + GlobalTimestamp commit_ts); int GTM_LogScan(GlobalTransactionId gxid, - const char *nodestring, - GlobalTimestamp start_ts, - GlobalTimestamp local_start_ts, - GlobalTimestamp local_complete_ts, - int scan_type, - const char *rel_name, - int64 scan_number); + const char *nodestring, + GlobalTimestamp start_ts, + GlobalTimestamp local_start_ts, + GlobalTimestamp local_complete_ts, + int scan_type, + const char *rel_name, + int64 scan_number); int 
GTM_StartPreparedTransactionGXID(GlobalTransactionId gxid, - char *gid, - char *nodestring); + char *gid, + char *nodestring); int GTM_GetGIDData(GTM_TransactionHandle prepared_txn, - GlobalTransactionId *prepared_gxid, - char **nodestring); + GlobalTransactionId *prepared_gxid, + char **nodestring); uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt); GTM_TransactionStates GTM_GetStatus(GTM_TransactionHandle txn); GTM_TransactionStates GTM_GetStatusGXID(GlobalTransactionId gxid); @@ -254,9 +254,9 @@ uint32 GTMGetFirstClientIdentifier(void); uint32 GTMGetLastClientIdentifier(void); GTM_Snapshot GTM_GetSnapshotData(GTM_TransactionInfo *my_txninfo, - GTM_Snapshot snapshot); + GTM_Snapshot snapshot); GTM_Snapshot GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], - int txn_count, int *status); + int txn_count, int *status); void GTM_FreeCachedTransInfo(void); void ProcessBeginTransactionCommand(Port *myport, StringInfo message); @@ -267,11 +267,11 @@ ProcessBkupGlobalTimestamp(Port *myport, StringInfo message); void ProcessBkupBeginTransactionCommand(Port *myport, StringInfo message); void GTM_BkupBeginTransactionMulti(GTM_IsolationLevel *isolevel, - bool *readonly, - const char **global_sessionid, - uint32 *client_id, - GTMProxy_ConnID *connid, - int txn_count); + bool *readonly, + const char **global_sessionid, + uint32 *client_id, + GTMProxy_ConnID *connid, + int txn_count); void ProcessBeginTransactionCommandMulti(Port *myport, StringInfo message); void ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message); @@ -299,11 +299,11 @@ void GTM_WriteRestorePointVersion(FILE *f); void GTM_RestoreStart(FILE *ctlf, struct GTM_RestoreContext *context); void GTM_SaveTxnInfo(FILE *ctlf); void GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid, - struct GTM_RestoreContext *context, bool force_xid); + struct GTM_RestoreContext *context, bool force_xid); void GTM_BkupBeginTransaction(GTM_IsolationLevel isolevel, - bool readonly, - const char *global_sessionid, - uint32 client_id); + bool readonly, + const char *global_sessionid, + uint32 client_id); void ProcessBkupBeginTransactionGetGXIDCommand(Port *myport, StringInfo message); void ProcessBkupBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message); @@ -326,6 +326,7 @@ extern void ProcessFinishGIDTransactionCommand(Port *myport, StringInfo message) void ProcessGetGTSCommand(Port *myport, StringInfo message); void ProcessGetGTSCommandMulti(Port *myport, StringInfo message); void ProcessCheckGTMCommand(Port *myport, StringInfo message); +void ProcessStandbyPreCheckGTMCommand(Port *myport, StringInfo message); #endif #endif From a3c89d525a0f7cb7ad77fec16b800dae1c20a21b Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 16 Mar 2021 16:27:34 +0800 Subject: [PATCH 338/578] Converity and memory problem fix. 
--- contrib/pg_clean/pg_clean.c | 11 +- contrib/pgcrypto/pgp-mpi-internal.c | 397 ++++++++-------- contrib/pgxc_ctl/coord_cmd.c | 2 +- contrib/pgxc_ctl/datanode_cmd.c | 2 +- contrib/pgxc_ctl/variables.c | 572 ++++++++++++------------ src/backend/access/common/printtup.c | 1 + src/backend/access/heap/heapam.c | 3 +- src/backend/access/spgist/spgscan.c | 4 + src/backend/access/transam/gtm.c | 15 +- src/backend/access/transam/twophase.c | 4 +- src/backend/access/transam/xlog.c | 4 +- src/backend/audit/audit_fga.c | 3 +- src/backend/catalog/objectaddress.c | 1 + src/backend/catalog/pgxc_class.c | 6 - src/backend/commands/sequence.c | 4 +- src/backend/commands/statscmds.c | 5 + src/backend/contrib/pgcrypto/internal.c | 2 +- src/backend/executor/execUtils.c | 2 + src/backend/libpq/hba.c | 21 + src/backend/libpq/pqcomm.c | 7 +- src/backend/nodes/bitmapset.c | 2 +- src/backend/optimizer/path/clausesel.c | 2 +- src/backend/optimizer/plan/subselect.c | 2 +- src/backend/optimizer/util/pathnode.c | 12 +- src/backend/parser/analyze.c | 2 +- src/backend/parser/parse_utilcmd.c | 3 + src/backend/pgxc/copy/copyops.c | 3 - src/backend/pgxc/locator/locator.c | 2 + src/backend/pgxc/nodemgr/nodemgr.c | 1 + src/backend/pgxc/plan/planner.c | 4 + src/backend/pgxc/pool/execRemote.c | 44 +- src/backend/pgxc/pool/pgxcnode.c | 14 +- src/backend/pgxc/pool/poolcomm.c | 23 +- src/backend/pgxc/pool/poolmgr.c | 67 ++- src/backend/pgxc/squeue/squeue.c | 4 +- src/backend/replication/slotfuncs.c | 4 +- src/backend/storage/file/fd.c | 2 +- src/backend/tcop/postgres.c | 4 +- src/backend/tcop/pquery.c | 2 +- src/backend/tcop/utility.c | 2 +- src/backend/utils/adt/datetime.c | 4 +- src/backend/utils/adt/jsonb_util.c | 6 +- src/backend/utils/adt/network_gist.c | 2 +- src/backend/utils/adt/oid.c | 2 +- src/backend/utils/adt/timestamp.c | 2 +- src/backend/utils/cache/relcryptmap.c | 2 +- src/backend/utils/misc/datamask.c | 4 +- src/backend/utils/misc/guc.c | 3 + src/backend/utils/misc/mls.c | 8 +- src/backend/utils/misc/relcrypt.c | 4 +- src/backend/utils/mmgr/dsa.c | 1 + src/backend/utils/sort/tuplesort.c | 6 + src/bin/initgtm/initgtm.c | 2 +- src/bin/pg_dump/pg_backup_tar.c | 2 +- src/gtm/client/fe-connect.c | 14 +- src/gtm/client/gtm_client.c | 9 +- src/gtm/common/gtm_opt_handler.c | 13 + src/gtm/gtm_ctl/gtm_ctl.c | 5 +- src/gtm/main/gtm_store.c | 14 +- src/gtm/main/gtm_xlog.c | 7 +- src/gtm/main/main.c | 19 +- src/gtm/proxy/proxy_main.c | 5 +- src/gtm/recovery/register_common.c | 7 +- src/gtm/xlog_test/xlog_reader.c | 10 +- src/include/access/xlog.h | 1 + src/include/audit/audit_fga.h | 2 +- src/include/pgxc/pgxcnode.h | 2 +- src/interfaces/ecpg/ecpglib/execute.c | 5 +- src/interfaces/ecpg/preproc/ecpg.c | 2 +- src/interfaces/libpq/fe-auth.c | 2 - src/interfaces/libpq/fe-connect.c | 4 +- src/interfaces/libpq/fe-protocol2.c | 2 - src/pl/plpgsql/src/pl_exec.c | 1 + src/timezone/localtime.c | 2 - 74 files changed, 796 insertions(+), 643 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index e31394c4..8d1514f4 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -83,6 +83,7 @@ PG_MODULE_MAGIC; #define GET_READONLY "readonly" #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 +#define STRING_BUFF_LEN 1024 #define MAX_CMD_LENGTH 120 @@ -1302,7 +1303,7 @@ database_info *add_database_info(char *database_name) { database_info *rv; HASHCTL txn_ctl; - char tabname[MAX_GID]; + char tabname[STRING_BUFF_LEN]; if ((rv = find_database_info(database_name)) != NULL) return rv; /* 
Already in the list */ @@ -1322,7 +1323,7 @@ database_info *add_database_info(char *database_name) rv->last_txn_info = NULL; #endif - snprintf(tabname, 64, "%s txn info", rv->database_name); + snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); txn_ctl.keysize = MAX_GID; txn_ctl.entrysize = sizeof(txn_info); rv->all_txn_info = hash_create(tabname, 64, @@ -1342,7 +1343,7 @@ database_info *add_database_info(char *database_name) int find_node_index(Oid node_oid) { - int res; + int res = -1; int i; if (get_pgxc_nodetype(node_oid) == 'C') { @@ -2078,12 +2079,12 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) /*collect the 2pc file in nodes*/ for (i = 0; i < cn_nodes_num; i++) { - execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); } for (i = 0; i < dn_nodes_num; i++) { - execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } /*get all database info*/ getDatabaseList(); diff --git a/contrib/pgcrypto/pgp-mpi-internal.c b/contrib/pgcrypto/pgp-mpi-internal.c index 9420d678..e7c8637e 100644 --- a/contrib/pgcrypto/pgp-mpi-internal.c +++ b/contrib/pgcrypto/pgp-mpi-internal.c @@ -1,6 +1,6 @@ /* * pgp-mpi-internal.c - * OpenPGP MPI functions. + * OpenPGP MPI functions. * * Copyright (c) 2005 Marko Kreen * All rights reserved. @@ -9,10 +9,10 @@ * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -38,19 +38,22 @@ static mpz_t * mp_new() { - mpz_t *mp = mp_int_alloc(); + mpz_t *mp = mp_int_alloc(); - mp_int_init_size(mp, 256); - return mp; + if (mp_int_init_size(mp, 256)) + { + return NULL; + } + return mp; } static void mp_clear_free(mpz_t *a) { - if (!a) - return; - /* fixme: no clear? */ - mp_int_free(a); + if (!a) + return; + /* fixme: no clear? 
*/ + mp_int_free(a); } @@ -58,86 +61,86 @@ static int mp_px_rand(uint32 bits, mpz_t *res) { #ifdef HAVE_STRONG_RANDOM - unsigned bytes = (bits + 7) / 8; - int last_bits = bits & 7; - uint8 *buf; - - buf = px_alloc(bytes); - if (!pg_strong_random((char *) buf, bytes)) - { - px_free(buf); - return PXE_NO_RANDOM; - } - - /* clear unnecessary bits and set last bit to one */ - if (last_bits) - { - buf[0] >>= 8 - last_bits; - buf[0] |= 1 << (last_bits - 1); - } - else - buf[0] |= 1 << 7; - - mp_int_read_unsigned(res, buf, bytes); - - px_free(buf); - - return 0; + unsigned bytes = (bits + 7) / 8; + int last_bits = bits & 7; + uint8 *buf; + + buf = px_alloc(bytes); + if (!pg_strong_random((char *) buf, bytes)) + { + px_free(buf); + return PXE_NO_RANDOM; + } + + /* clear unnecessary bits and set last bit to one */ + if (last_bits) + { + buf[0] >>= 8 - last_bits; + buf[0] |= 1 << (last_bits - 1); + } + else + buf[0] |= 1 << 7; + + mp_int_read_unsigned(res, buf, bytes); + + px_free(buf); + + return 0; #else - return PXE_NO_RANDOM; + return PXE_NO_RANDOM; #endif } static void mp_modmul(mpz_t *a, mpz_t *b, mpz_t *p, mpz_t *res) { - mpz_t *tmp = mp_new(); + mpz_t *tmp = mp_new(); - mp_int_mul(a, b, tmp); - mp_int_mod(tmp, p, res); - mp_clear_free(tmp); + mp_int_mul(a, b, tmp); + mp_int_mod(tmp, p, res); + mp_clear_free(tmp); } static mpz_t * mpi_to_bn(PGP_MPI *n) { - mpz_t *bn = mp_new(); - - mp_int_read_unsigned(bn, n->data, n->bytes); - - if (!bn) - return NULL; - if (mp_int_count_bits(bn) != n->bits) - { - px_debug("mpi_to_bn: bignum conversion failed: mpi=%d, bn=%d", - n->bits, mp_int_count_bits(bn)); - mp_clear_free(bn); - return NULL; - } - return bn; + mpz_t *bn = mp_new(); + + mp_int_read_unsigned(bn, n->data, n->bytes); + + if (!bn) + return NULL; + if (mp_int_count_bits(bn) != n->bits) + { + px_debug("mpi_to_bn: bignum conversion failed: mpi=%d, bn=%d", + n->bits, mp_int_count_bits(bn)); + mp_clear_free(bn); + return NULL; + } + return bn; } static PGP_MPI * bn_to_mpi(mpz_t *bn) { - int res; - PGP_MPI *n; - int bytes; - - res = pgp_mpi_alloc(mp_int_count_bits(bn), &n); - if (res < 0) - return NULL; - - bytes = (mp_int_count_bits(bn) + 7) / 8; - if (bytes != n->bytes) - { - px_debug("bn_to_mpi: bignum conversion failed: bn=%d, mpi=%d", - bytes, n->bytes); - pgp_mpi_free(n); - return NULL; - } - mp_int_to_unsigned(bn, n->data, n->bytes); - return n; + int res; + PGP_MPI *n; + int bytes; + + res = pgp_mpi_alloc(mp_int_count_bits(bn), &n); + if (res < 0) + return NULL; + + bytes = (mp_int_count_bits(bn) + 7) / 8; + if (bytes != n->bytes) + { + px_debug("bn_to_mpi: bignum conversion failed: bn=%d, mpi=%d", + bytes, n->bytes); + pgp_mpi_free(n); + return NULL; + } + mp_int_to_unsigned(bn, n->data, n->bytes); + return n; } /* @@ -158,152 +161,152 @@ bn_to_mpi(mpz_t *bn) static int decide_k_bits(int p_bits) { - if (p_bits <= 5120) - return p_bits / 10 + 160; - else - return (p_bits / 8 + 200) * 3 / 2; + if (p_bits <= 5120) + return p_bits / 10 + 160; + else + return (p_bits / 8 + 200) * 3 / 2; } int pgp_elgamal_encrypt(PGP_PubKey *pk, PGP_MPI *_m, - PGP_MPI **c1_p, PGP_MPI **c2_p) + PGP_MPI **c1_p, PGP_MPI **c2_p) { - int res = PXE_PGP_MATH_FAILED; - int k_bits; - mpz_t *m = mpi_to_bn(_m); - mpz_t *p = mpi_to_bn(pk->pub.elg.p); - mpz_t *g = mpi_to_bn(pk->pub.elg.g); - mpz_t *y = mpi_to_bn(pk->pub.elg.y); - mpz_t *k = mp_new(); - mpz_t *yk = mp_new(); - mpz_t *c1 = mp_new(); - mpz_t *c2 = mp_new(); - - if (!m || !p || !g || !y || !k || !yk || !c1 || !c2) - goto err; - - /* - * generate k - */ - k_bits 
= decide_k_bits(mp_int_count_bits(p)); - res = mp_px_rand(k_bits, k); - if (res < 0) - return res; - - /* - * c1 = g^k c2 = m * y^k - */ - mp_int_exptmod(g, k, p, c1); - mp_int_exptmod(y, k, p, yk); - mp_modmul(m, yk, p, c2); - - /* result */ - *c1_p = bn_to_mpi(c1); - *c2_p = bn_to_mpi(c2); - if (*c1_p && *c2_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + int k_bits; + mpz_t *m = mpi_to_bn(_m); + mpz_t *p = mpi_to_bn(pk->pub.elg.p); + mpz_t *g = mpi_to_bn(pk->pub.elg.g); + mpz_t *y = mpi_to_bn(pk->pub.elg.y); + mpz_t *k = mp_new(); + mpz_t *yk = mp_new(); + mpz_t *c1 = mp_new(); + mpz_t *c2 = mp_new(); + + if (!m || !p || !g || !y || !k || !yk || !c1 || !c2) + goto err; + + /* + * generate k + */ + k_bits = decide_k_bits(mp_int_count_bits(p)); + res = mp_px_rand(k_bits, k); + if (res < 0) + return res; + + /* + * c1 = g^k c2 = m * y^k + */ + mp_int_exptmod(g, k, p, c1); + mp_int_exptmod(y, k, p, yk); + mp_modmul(m, yk, p, c2); + + /* result */ + *c1_p = bn_to_mpi(c1); + *c2_p = bn_to_mpi(c2); + if (*c1_p && *c2_p) + res = 0; err: - mp_clear_free(c2); - mp_clear_free(c1); - mp_clear_free(yk); - mp_clear_free(k); - mp_clear_free(y); - mp_clear_free(g); - mp_clear_free(p); - mp_clear_free(m); - return res; + mp_clear_free(c2); + mp_clear_free(c1); + mp_clear_free(yk); + mp_clear_free(k); + mp_clear_free(y); + mp_clear_free(g); + mp_clear_free(p); + mp_clear_free(m); + return res; } int pgp_elgamal_decrypt(PGP_PubKey *pk, PGP_MPI *_c1, PGP_MPI *_c2, - PGP_MPI **msg_p) + PGP_MPI **msg_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *c1 = mpi_to_bn(_c1); - mpz_t *c2 = mpi_to_bn(_c2); - mpz_t *p = mpi_to_bn(pk->pub.elg.p); - mpz_t *x = mpi_to_bn(pk->sec.elg.x); - mpz_t *c1x = mp_new(); - mpz_t *div = mp_new(); - mpz_t *m = mp_new(); - - if (!c1 || !c2 || !p || !x || !c1x || !div || !m) - goto err; - - /* - * m = c2 / (c1^x) - */ - mp_int_exptmod(c1, x, p, c1x); - mp_int_invmod(c1x, p, div); - mp_modmul(c2, div, p, m); - - /* result */ - *msg_p = bn_to_mpi(m); - if (*msg_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *c1 = mpi_to_bn(_c1); + mpz_t *c2 = mpi_to_bn(_c2); + mpz_t *p = mpi_to_bn(pk->pub.elg.p); + mpz_t *x = mpi_to_bn(pk->sec.elg.x); + mpz_t *c1x = mp_new(); + mpz_t *div = mp_new(); + mpz_t *m = mp_new(); + + if (!c1 || !c2 || !p || !x || !c1x || !div || !m) + goto err; + + /* + * m = c2 / (c1^x) + */ + mp_int_exptmod(c1, x, p, c1x); + mp_int_invmod(c1x, p, div); + mp_modmul(c2, div, p, m); + + /* result */ + *msg_p = bn_to_mpi(m); + if (*msg_p) + res = 0; err: - mp_clear_free(m); - mp_clear_free(div); - mp_clear_free(c1x); - mp_clear_free(x); - mp_clear_free(p); - mp_clear_free(c2); - mp_clear_free(c1); - return res; + mp_clear_free(m); + mp_clear_free(div); + mp_clear_free(c1x); + mp_clear_free(x); + mp_clear_free(p); + mp_clear_free(c2); + mp_clear_free(c1); + return res; } int pgp_rsa_encrypt(PGP_PubKey *pk, PGP_MPI *_m, PGP_MPI **c_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *m = mpi_to_bn(_m); - mpz_t *e = mpi_to_bn(pk->pub.rsa.e); - mpz_t *n = mpi_to_bn(pk->pub.rsa.n); - mpz_t *c = mp_new(); - - if (!m || !e || !n || !c) - goto err; - - /* - * c = m ^ e - */ - mp_int_exptmod(m, e, n, c); - - *c_p = bn_to_mpi(c); - if (*c_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *m = mpi_to_bn(_m); + mpz_t *e = mpi_to_bn(pk->pub.rsa.e); + mpz_t *n = mpi_to_bn(pk->pub.rsa.n); + mpz_t *c = mp_new(); + + if (!m || !e || !n || !c) + goto err; + + /* + * c = m ^ e + */ + mp_int_exptmod(m, e, n, c); + + *c_p = bn_to_mpi(c); + if (*c_p) + res = 0; err: - mp_clear_free(c); - 
mp_clear_free(n); - mp_clear_free(e); - mp_clear_free(m); - return res; + mp_clear_free(c); + mp_clear_free(n); + mp_clear_free(e); + mp_clear_free(m); + return res; } int pgp_rsa_decrypt(PGP_PubKey *pk, PGP_MPI *_c, PGP_MPI **m_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *c = mpi_to_bn(_c); - mpz_t *d = mpi_to_bn(pk->sec.rsa.d); - mpz_t *n = mpi_to_bn(pk->pub.rsa.n); - mpz_t *m = mp_new(); - - if (!m || !d || !n || !c) - goto err; - - /* - * m = c ^ d - */ - mp_int_exptmod(c, d, n, m); - - *m_p = bn_to_mpi(m); - if (*m_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *c = mpi_to_bn(_c); + mpz_t *d = mpi_to_bn(pk->sec.rsa.d); + mpz_t *n = mpi_to_bn(pk->pub.rsa.n); + mpz_t *m = mp_new(); + + if (!m || !d || !n || !c) + goto err; + + /* + * m = c ^ d + */ + mp_int_exptmod(c, d, n, m); + + *m_p = bn_to_mpi(m); + if (*m_p) + res = 0; err: - mp_clear_free(m); - mp_clear_free(n); - mp_clear_free(d); - mp_clear_free(c); - return res; + mp_clear_free(m); + mp_clear_free(n); + mp_clear_free(d); + mp_clear_free(c); + return res; } diff --git a/contrib/pgxc_ctl/coord_cmd.c b/contrib/pgxc_ctl/coord_cmd.c index 0df40e89..f45eb417 100644 --- a/contrib/pgxc_ctl/coord_cmd.c +++ b/contrib/pgxc_ctl/coord_cmd.c @@ -1715,7 +1715,7 @@ int add_coordinatorSlave(char *name, char *host, int port, int pooler_port, char "# archive_command = 'rsync %%p %s@%s:%s/%%f'\n" "max_wal_senders = %d\n" "# End of Addition\n", - timeStampString(date, MAXPATH), + timeStampString(date, MAXTOKEN+1), sval(VAR_pgxcUser), host, archDir, getDefaultWalSender(TRUE)); pclose(f); diff --git a/contrib/pgxc_ctl/datanode_cmd.c b/contrib/pgxc_ctl/datanode_cmd.c index 8b6326aa..a0d6636e 100644 --- a/contrib/pgxc_ctl/datanode_cmd.c +++ b/contrib/pgxc_ctl/datanode_cmd.c @@ -1436,7 +1436,7 @@ int add_datanodeSlave(char *name, char *host, int port, int pooler, char *dir, fprintf(f, "#================================================\n" "# Additional entry by adding the slave, %s\n", - timeStampString(date, MAXPATH)); + timeStampString(date, MAXTOKEN+1)); for (kk = 0; aval(VAR_datanodePgHbaEntries)[kk]; kk++) { diff --git a/contrib/pgxc_ctl/variables.c b/contrib/pgxc_ctl/variables.c index 9ed61ddb..f8bd8917 100644 --- a/contrib/pgxc_ctl/variables.c +++ b/contrib/pgxc_ctl/variables.c @@ -24,367 +24,367 @@ static void clear_var(pgxc_ctl_var *var); */ static int hash_val(char *name) { - unsigned char *name_u = (unsigned char *)name; - unsigned char v; - - for(v = 0; *name_u; name_u++) - v += *name_u; - return (v%NUM_HASH_BUCKET); + unsigned char *name_u = (unsigned char *)name; + unsigned char v; + + for(v = 0; *name_u; name_u++) + v += *name_u; + return (v%NUM_HASH_BUCKET); } #define LIMIT_TO_DOUBLE 128 #define INCR_OVER_DOUBLE 10 static int next_size(int sz) { - if (sz <= 0) - return 1; - if (sz <= LIMIT_TO_DOUBLE) - return sz*2; - else - return sz + INCR_OVER_DOUBLE; + if (sz <= 0) + return 1; + if (sz <= LIMIT_TO_DOUBLE) + return sz*2; + else + return sz + INCR_OVER_DOUBLE; } void init_var_hash() { - int i; + int i; - for (i = 0; i < NUM_HASH_BUCKET; i++) - { - var_hash[i].el_size = 1; - var_hash[i].el_used = 0; - var_hash[i].el = (pgxc_ctl_var **)Malloc(sizeof(pgxc_ctl_var *)); - var_hash[i].el[0] = NULL; - } + for (i = 0; i < NUM_HASH_BUCKET; i++) + { + var_hash[i].el_size = 1; + var_hash[i].el_used = 0; + var_hash[i].el = (pgxc_ctl_var **)Malloc(sizeof(pgxc_ctl_var *)); + var_hash[i].el[0] = NULL; + } } static void remove_from_hash(pgxc_ctl_var *var) { - int hash_v = hash_val(var->varname); - int ii, jj; + int hash_v = 
hash_val(var->varname); + int ii, jj; - for(ii = 0; var_hash[hash_v].el[ii]; ii++) - { - if (var_hash[hash_v].el[ii] != var) - continue; - else - { - for(jj = ii; var_hash[hash_v].el[jj]; jj++) - var_hash[hash_v].el[jj] = var_hash[hash_v].el[jj + 1]; - var_hash[hash_v].el_used--; - return; - } - } - return; + for(ii = 0; var_hash[hash_v].el[ii]; ii++) + { + if (var_hash[hash_v].el[ii] != var) + continue; + else + { + for(jj = ii; var_hash[hash_v].el[jj]; jj++) + var_hash[hash_v].el[jj] = var_hash[hash_v].el[jj + 1]; + var_hash[hash_v].el_used--; + return; + } + } + return; } void add_var_hash(pgxc_ctl_var *var) { - int hash_v = hash_val(var->varname); - if (var_hash[hash_v].el_used + 1 >= var_hash[hash_v].el_size) - { - var_hash[hash_v].el_size = next_size(var_hash[hash_v].el_size); - var_hash[hash_v].el = (pgxc_ctl_var **)Realloc(var_hash[hash_v].el, sizeof(pgxc_ctl_var *) * var_hash[hash_v].el_size); - } - var_hash[hash_v].el[var_hash[hash_v].el_used++] = var; - var_hash[hash_v].el[var_hash[hash_v].el_used] = NULL; + int hash_v = hash_val(var->varname); + if (var_hash[hash_v].el_used + 1 >= var_hash[hash_v].el_size) + { + var_hash[hash_v].el_size = next_size(var_hash[hash_v].el_size); + var_hash[hash_v].el = (pgxc_ctl_var **)Realloc(var_hash[hash_v].el, sizeof(pgxc_ctl_var *) * var_hash[hash_v].el_size); + } + var_hash[hash_v].el[var_hash[hash_v].el_used++] = var; + var_hash[hash_v].el[var_hash[hash_v].el_used] = NULL; } pgxc_ctl_var *new_var(char *name) { - pgxc_ctl_var *newv; - - if (find_var(name)) - { - elog(ERROR, "ERROR: Variable %s already defined. Check your configuration.\n", name); - return NULL; - } - - newv = (pgxc_ctl_var *)Malloc(sizeof(pgxc_ctl_var)); - if (var_head == NULL) - { - var_head = var_tail = newv; - newv->prev = NULL; - } - else - { - newv->prev = var_tail; - var_tail->next = newv; - var_tail = newv; - } - newv->next = NULL; - newv->varname = Strdup(name); - newv->val_size = 1; - newv->val_used = 0; - newv->val = (char **)Malloc(sizeof(char *)); - newv->val[0] = NULL; - add_var_hash(newv); - return(newv); + pgxc_ctl_var *newv; + + if (find_var(name)) + { + elog(ERROR, "ERROR: Variable %s already defined. 
Check your configuration.\n", name); + return NULL; + } + + newv = (pgxc_ctl_var *)Malloc(sizeof(pgxc_ctl_var)); + if (var_head == NULL) + { + var_head = var_tail = newv; + newv->prev = NULL; + } + else + { + newv->prev = var_tail; + var_tail->next = newv; + var_tail = newv; + } + newv->next = NULL; + newv->varname = Strdup(name); + newv->val_size = 1; + newv->val_used = 0; + newv->val = (char **)Malloc(sizeof(char *)); + newv->val[0] = NULL; + add_var_hash(newv); + return(newv); } void remove_var(pgxc_ctl_var *var) { - if ((var_head == var_tail) && (var_head == var)) - var_head = var_tail = NULL; - else if (var_head == var) - { - var_head = var_head->next; - var_head->prev = NULL; - } - else if (var_tail == var) - { - var_tail->next = NULL; - var_tail = var_tail->prev; - } - else - { - var->prev->next = var->next; - var->next->prev = var->prev; - } - clear_var(var); + if ((var_head == var_tail) && (var_head == var)) + var_head = var_tail = NULL; + else if (var_head == var) + { + var_head = var_head->next; + var_head->prev = NULL; + } + else if (var_tail == var) + { + var_tail->next = NULL; + var_tail = var_tail->prev; + } + else + { + var->prev->next = var->next; + var->next->prev = var->prev; + } + clear_var(var); } static void clear_var(pgxc_ctl_var *var) { - int ii; + int ii; - remove_from_hash(var); - for (ii = 0; var->val[ii]; ii++) - free(var->val[ii]); - free(var->varname); - free(var); - -} + remove_from_hash(var); + for (ii = 0; var->val[ii]; ii++) + free(var->val[ii]); + free(var->varname); + free(var); + +} void add_val(pgxc_ctl_var *var, char *val) { - if (var->val_size <= var->val_used+1) - { - var->val_size = next_size(var->val_size); - var->val = (char **)Realloc(var->val, sizeof(char *)*var->val_size); - } - var->val[var->val_used++] = Strdup(val); - var->val[var->val_used] = NULL; + if (var->val_size <= var->val_used+1) + { + var->val_size = next_size(var->val_size); + var->val = (char **)Realloc(var->val, sizeof(char *)*var->val_size); + } + var->val[var->val_used++] = Strdup(val); + var->val[var->val_used] = NULL; } void add_val_name(char *name, char *val) { - pgxc_ctl_var *var; - if (!(var = find_var(name))) - return; - add_val(var, name); - return; + pgxc_ctl_var *var; + if (!(var = find_var(name))) + return; + add_val(var, name); + return; } pgxc_ctl_var *find_var(char *name) { - pgxc_var_hash *hash = &var_hash[hash_val(name)]; - int i; + pgxc_var_hash *hash = &var_hash[hash_val(name)]; + int i; - for (i = 0; i < hash->el_used; i++) - { - if (strcmp(hash->el[i]->varname, name) == 0) - return hash->el[i]; - } - return NULL; + for (i = 0; i < hash->el_used; i++) + { + if (strcmp(hash->el[i]->varname, name) == 0) + return hash->el[i]; + } + return NULL; } char *sval(char *name) { - pgxc_ctl_var *var = find_var(name); - if (!var) - return NULL; - return var->val[0]; + pgxc_ctl_var *var = find_var(name); + if (!var) + return NULL; + return var->val[0]; } char **aval(char *name) { - pgxc_ctl_var *var = find_var(name); - if (!var) - return NULL; - return var->val; + pgxc_ctl_var *var = find_var(name); + if (!var) + return NULL; + return var->val; } void reset_value(pgxc_ctl_var *var) { - int i; - for (i = 0; var->val[i]; i++) - { - Free (var->val[i]); - var->val[i] = NULL; - } - var->val_used = 0; + int i; + for (i = 0; var->val[i]; i++) + { + Free (var->val[i]); + var->val[i] = NULL; + } + var->val_used = 0; } void assign_val(char *destName, char *srcName) { - pgxc_ctl_var *dest = find_var(destName); - pgxc_ctl_var *src = find_var(srcName); - int ii; + pgxc_ctl_var *dest = 
find_var(destName); + pgxc_ctl_var *src = find_var(srcName); + int ii; - reset_value(dest); - for (ii = 0; ii < src->val_used; ii++) - add_val(dest, src->val[ii]); + reset_value(dest); + for (ii = 0; ii < src->val_used; ii++) + add_val(dest, src->val[ii]); } void assign_sval(char *destName, char *val) { - pgxc_ctl_var *dest = find_var(destName); + pgxc_ctl_var *dest = find_var(destName); - reset_value(dest); - add_val(dest, val); + reset_value(dest); + add_val(dest, val); } void reset_var(char *name) { - confirm_var(name); - reset_value(find_var(name)); + confirm_var(name); + reset_value(find_var(name)); } void reset_var_val(char *name, char *val) { - reset_var(name); - add_val(find_var(name), val); + reset_var(name); + add_val(find_var(name), val); } pgxc_ctl_var *confirm_var(char *name) { - pgxc_ctl_var *rc; - if ((rc = find_var(name))) - return rc; - return new_var(name); + pgxc_ctl_var *rc; + if ((rc = find_var(name))) + return rc; + return new_var(name); } void print_vars(void) { - pgxc_ctl_var *cur; + pgxc_ctl_var *cur; - lockLogFile(); - for(cur = var_head; cur; cur=cur->next) - print_var(cur->varname); - unlockLogFile(); + lockLogFile(); + for(cur = var_head; cur; cur=cur->next) + print_var(cur->varname); + unlockLogFile(); } void print_var(char *vname) { - pgxc_ctl_var *var; - char outBuf[MAXLINE + 1]; - - outBuf[0] = 0; - if ((var = find_var(vname)) == NULL) - { - elog(ERROR, "ERROR: Variable %s not found.\n", vname); - return; - } - else - { - char **curv; - char editbuf[MAXPATH]; - - snprintf(editbuf, MAXPATH, "%s (", vname); - strncat(outBuf, editbuf, MAXLINE); - for (curv=var->val; *curv; curv++) - { - snprintf(editbuf, MAXPATH, " \"%s\" ", *curv); - strncat(outBuf, editbuf, MAXLINE); - } - strncat(outBuf, ")", MAXLINE); - elog(NOTICE, "%s\n", outBuf); - } - + pgxc_ctl_var *var; + char outBuf[MAXLINE + 1]; + + outBuf[0] = 0; + if ((var = find_var(vname)) == NULL) + { + elog(ERROR, "ERROR: Variable %s not found.\n", vname); + return; + } + else + { + char **curv; + char editbuf[MAXPATH]; + + snprintf(editbuf, MAXPATH, "%s (", vname); + strncat(outBuf, editbuf, MAXLINE); + for (curv=var->val; *curv; curv++) + { + snprintf(editbuf, MAXPATH, " \"%s\" ", *curv); + strncat(outBuf, editbuf, MAXLINE); + } + strncat(outBuf, ")", MAXLINE); + elog(NOTICE, "%s\n", outBuf); + } + } void log_var(char *varname) { - if (logFile) - print_var(varname); + if (logFile) + print_var(varname); } int arraySizeName(char *name) { - pgxc_ctl_var *var; + pgxc_ctl_var *var; - if ((var = find_var(name)) == NULL) - return -1; - return(arraySize(var)); + if ((var = find_var(name)) == NULL) + return -1; + return(arraySize(var)); } int arraySize(pgxc_ctl_var *var) { - return var->val_used; + return var->val_used; } char **add_member(char **array, char *val) { - char **rv; - int ii; + char **rv; + int ii; - for (ii = 0; array[ii]; ii++); - rv = Realloc(array, sizeof(char *) * (ii + 2)); - rv[ii] = Strdup(val); - rv[ii+1] = NULL; - return(rv); + for (ii = 0; array[ii]; ii++); + rv = Realloc(array, sizeof(char *) * (ii + 2)); + rv[ii] = Strdup(val); + rv[ii+1] = NULL; + return(rv); } void clean_array(char **array) { - int ii; - if (array) - { - for(ii = 0; array[ii]; ii++) - Free(array[ii]); - Free(array); - } + int ii; + if (array) + { + for(ii = 0; array[ii]; ii++) + Free(array[ii]); + Free(array); + } } void var_assign(char **dest, char *src) { - Free(*dest); - *dest = src; + Free(*dest); + *dest = src; } char *listValue(char *name) { - pgxc_ctl_var *dest; - int ii; - char *buf; + pgxc_ctl_var *dest; + int 
ii; + char *buf; - if ((dest = find_var(name)) == NULL) - return Strdup(""); - buf = Malloc(MAXLINE+1); - buf[0]=0; - for(ii = 0; ii < dest->val_used; ii++) - { - strncat(buf, dest->val[ii], MAXLINE); - strncat(buf, " ", MAXLINE); - } - return buf; + if ((dest = find_var(name)) == NULL) + return Strdup(""); + buf = Malloc(MAXLINE+1); + buf[0]=0; + for(ii = 0; ii < dest->val_used; ii++) + { + strncat(buf, dest->val[ii], MAXLINE); + strncat(buf, " ", MAXLINE); + } + return buf; } int ifExists(char *name, char *value) { - pgxc_ctl_var *var = find_var(name); - int ii; + pgxc_ctl_var *var = find_var(name); + int ii; - if (!var) - return FALSE; - for (ii = 0; ii < var->val_used; ii++) - if (strcmp((var->val)[ii], value) == 0) - return TRUE; - return FALSE; + if (!var) + return FALSE; + for (ii = 0; ii < var->val_used; ii++) + if (strcmp((var->val)[ii], value) == 0) + return TRUE; + return FALSE; } - + int IfExists(char *name, char *value) { - pgxc_ctl_var *var = find_var(name); - int ii; + pgxc_ctl_var *var = find_var(name); + int ii; - if (!var) - return FALSE; - for (ii = 0; ii < var->val_used; ii++) - if (strcasecmp((var->val)[ii], value) == 0) - return TRUE; - return FALSE; + if (!var) + return FALSE; + for (ii = 0; ii < var->val_used; ii++) + if (strcasecmp((var->val)[ii], value) == 0) + return TRUE; + return FALSE; } /* @@ -393,39 +393,39 @@ int IfExists(char *name, char *value) */ int extendVar(char *name, int newSize, char *def_value) { - pgxc_ctl_var *target; - char **old_val; - int old_size; - int ii; - - if ((target = find_var(name)) == NULL) - return -1; - if (def_value == NULL) - def_value = "none"; - - /* - * If the allocated array is not already big enough to store newSize + 1 - * elements, we must extend it newSize + 1 - */ - if (target->val_size <= newSize) - { - old_val = target->val; - old_size = target->val_size; - target->val = Malloc0(sizeof(char *) * (newSize + 1)); - memcpy(target->val, old_val, sizeof(char *) * old_size); - target->val_size = newSize + 1; - Free(old_val); - } - - for (ii = target->val_used; ii < newSize; ii++) - (target->val)[ii] = Strdup(def_value); - - /* Store NULL in the last element to mark the end-of-array */ - (target->val)[newSize] = NULL; - if (target->val_used < newSize) - target->val_used = newSize; - - return 0; + pgxc_ctl_var *target; + char **old_val; + int old_size; + int ii; + + if ((target = find_var(name)) == NULL) + return -1; + if (def_value == NULL) + def_value = "none"; + + /* + * If the allocated array is not already big enough to store newSize + 1 + * elements, we must extend it newSize + 1 + */ + if (target->val_size <= newSize) + { + old_val = target->val; + old_size = target->val_size; + target->val = Malloc0(sizeof(char *) * (newSize + 1)); + memcpy(target->val, old_val, sizeof(char *) * old_size); + target->val_size = newSize + 1; + Free(old_val); + } + + for (ii = target->val_used; ii < newSize; ii++) + (target->val)[ii] = Strdup(def_value); + + /* Store NULL in the last element to mark the end-of-array */ + (target->val)[newSize] = NULL; + if (target->val_used < newSize) + target->val_used = newSize; + + return 0; } @@ -434,40 +434,40 @@ int extendVar(char *name, int newSize, char *def_value) * Returns *val if success, NULL if failed */ void assign_arrayEl_internal(char *name, int idx, char *val, char *pad, - int extend) + int extend) { - pgxc_ctl_var *var = confirm_var(name); + pgxc_ctl_var *var = confirm_var(name); - if (pad == NULL) - pad = "none"; - /* - * Pad if needed - */ - if (extend) - extendVar(name, idx+1, pad); 
- Free(var->val[idx]); - var->val[idx] = Strdup(val); + if (pad == NULL) + pad = "none"; + /* + * Pad if needed + */ + if (extend) + (void) extendVar(name, idx+1, pad); + Free(var->val[idx]); + var->val[idx] = Strdup(val); } void assign_arrayEl(char *name, int idx, char *val, char *pad) { - return assign_arrayEl_internal(name, idx, val, pad, TRUE); + return assign_arrayEl_internal(name, idx, val, pad, TRUE); } void replace_arrayEl(char *name, int idx, char *val, char *pad) { - return assign_arrayEl_internal(name, idx, val, pad, FALSE); + return assign_arrayEl_internal(name, idx, val, pad, FALSE); } int doesExist(char *name, int idx) { - pgxc_ctl_var *var; + pgxc_ctl_var *var; - if (name == NULL) - return 0; - if ((var = find_var(name)) == NULL) - return 0; - if (var->val_used <= idx) - return 0; - return 1; + if (name == NULL) + return 0; + if ((var = find_var(name)) == NULL) + return 0; + if (var->val_used <= idx) + return 0; + return 1; } diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index dfd64707..c72fb1af 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -449,6 +449,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) { int len = strlen(outputstr); #ifdef __TBASE__ + int len = strlen(outputstr); if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { Oid tupType; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b0129032..db17aec9 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3827,6 +3827,7 @@ heap_delete(Relation relation, ItemPointer tid, if (RelationNeedsWAL(relation)) { xl_heap_delete xlrec; + xl_heap_header xlhdr; XLogRecPtr recptr; /* For logical decode we need combocids to properly decode the catalog */ @@ -3860,8 +3861,6 @@ heap_delete(Relation relation, ItemPointer tid, #ifdef __STORAGE_SCALABLE__ HeapTuple tup; #endif - xl_heap_header xlhdr; - xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 9d0dfda6..29519f67 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -590,6 +590,10 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, so->recheck[so->nPtrs] = recheck; if (so->want_itup) { + if (so->indexTupDesc->natts != 1) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("wrong natts in indexTupDesc."))); /* * Reconstruct index data. We have to copy the datum out of the temp * context anyway, so we may as well create the tuple here. 
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index daf77a90..1e6c908b 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1430,12 +1430,17 @@ CloseGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ GTM_Timestamp GetGlobalTimestampGTM(void) -{// #lizard forgives - Get_GTS_Result gts_result = {InvalidGlobalTimestamp,false}; - GTM_Timestamp latest_gts = InvalidGlobalTimestamp; - struct rusage start_r; - struct timeval start_t; +{ + struct rusage start_r; + struct timeval start_t; int retry_cnt = 0; + Get_GTS_Result gts_result = {InvalidGlobalTimestamp,false}; + GTM_Timestamp latest_gts = InvalidGlobalTimestamp; + + if (!g_set_global_snapshot) + { + return LocalCommitTimestamp; + } if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index ebe93238..37a041f6 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3386,7 +3386,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -3398,7 +3398,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print) { - read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); + (void) read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); elog(LOG, "before append 2pc file: %s, file_content: %s", tid, file_content); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 405e9bac..18ed9b22 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10199,11 +10199,11 @@ xlog_redo(XLogReaderState *record) { startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; + pos = pos + sizeof(TransactionId) ; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; + pos = pos + sizeof(TransactionId) ; if (IsXidImplicit(gid)) { memcpy(&commit_timestamp, pos, sizeof(GlobalTimestamp)); diff --git a/src/backend/audit/audit_fga.c b/src/backend/audit/audit_fga.c index 06bdbd6e..2327ac00 100644 --- a/src/backend/audit/audit_fga.c +++ b/src/backend/audit/audit_fga.c @@ -431,7 +431,7 @@ exec_policy_funct_on_other_node(char *query_string) { cn_node_list = (Oid *) palloc0(cn_nodes_num * sizeof(Oid)); - PGXCGetCoordOidOthers(&cn_node_list); + PGXCGetCoordOidOthers(cn_node_list); pgxc_execute_on_nodes(cn_nodes_num, cn_node_list, query_string); } } @@ -1822,6 +1822,7 @@ process_fga_trigger(bool timeout) else { elog(LOG, "AUDIT_FGA: cannot connect to db"); + PQfinish(conn); } } } diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 1a45c53b..d085bae1 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -3570,6 +3570,7 @@ getObjectDescription(const ObjectAddress *object) { appendStringInfoString(&buffer, _("distributed ")); getRelationDescription(&buffer, object->objectId); + break; 
} /* diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c index 8384bccd..ccbc45c3 100644 --- a/src/backend/catalog/pgxc_class.c +++ b/src/backend/catalog/pgxc_class.c @@ -744,12 +744,6 @@ ModifyPgxcClass(PgxcClassModifyType type, PgxcClassModifyData *data) pfree(nodelist); pfree(newtup); } - else - { - heap_endscan(scan); - heap_close(rel,AccessExclusiveLock); - elog(ERROR, "unknow PgxcClassModifyType %d.", type); - } } tup = heap_getnext(scan, ForwardScanDirection); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 3d522795..3cb9c044 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -168,7 +168,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) List *owned_by; CreateStmt *stmt = makeNode(CreateStmt); Oid seqoid; - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; Relation rel; HeapTuple tuple; TupleDesc tupDesc; @@ -575,7 +575,7 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) bool cycle; bool is_restart; #endif - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; Relation rel; HeapTuple seqtuple; HeapTuple newdatatuple; diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 8fefe73b..63ca4812 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -67,7 +67,12 @@ CreateStatistics(CreateStatsStmt *stmt) Oid relid; ObjectAddress parentobject, myself; +#ifdef __TBASE__ + Datum types[3]; /* one for each possible type of statistic */ +#else Datum types[2]; /* one for each possible type of statistic */ +#endif + int ntypes; ArrayType *stxkind; bool build_ndistinct; diff --git a/src/backend/contrib/pgcrypto/internal.c b/src/backend/contrib/pgcrypto/internal.c index ce369693..63d1df30 100644 --- a/src/backend/contrib/pgcrypto/internal.c +++ b/src/backend/contrib/pgcrypto/internal.c @@ -705,7 +705,7 @@ system_reseed(void) check_time = t; /* roll dice */ - px_get_random_bytes(buf, 1); + (void) px_get_random_bytes(buf, 1); skip = buf[0] >= SYSTEM_RESEED_CHANCE; } /* clear 1 byte */ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 79c629be..c6401651 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1083,6 +1083,7 @@ GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull) tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + tmptup.t_xc_node_id = InvalidOid; tmptup.t_data = tuple; result = heap_getattr(&tmptup, @@ -1131,6 +1132,7 @@ GetAttributeByNum(HeapTupleHeader tuple, tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + tmptup.t_xc_node_id = InvalidOid; tmptup.t_data = tuple; result = heap_getattr(&tmptup, diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 97f886f5..35ad2dc1 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -2542,38 +2542,59 @@ gethba_options(HbaLine *hba) CStringGetTextDatum(psprintf("ldapbinddn=%s", hba->ldapbinddn)); if (hba->ldapbindpasswd) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapbindpasswd=%s", hba->ldapbindpasswd)); + } if (hba->ldapsearchattribute) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapsearchattribute=%s", hba->ldapsearchattribute)); + } if (hba->ldapscope) + 
{ + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapscope=%d", hba->ldapscope)); } + } if (hba->auth_method == uaRADIUS) { if (hba->radiusservers_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusservers=%s", hba->radiusservers_s)); + } if (hba->radiussecrets_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiussecrets=%s", hba->radiussecrets_s)); + } if (hba->radiusidentifiers_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusidentifiers=%s", hba->radiusidentifiers_s)); + } if (hba->radiusports_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusports=%s", hba->radiusports_s)); } + } Assert(noptions <= MAX_HBA_OPTIONS); diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index ca926c8c..da7c9a50 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2024,7 +2024,12 @@ SetSockKeepAlive(int sock) struct tcp_info info; int len = sizeof(info); /* check sock */ - getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) + { + elog(LOG, "getsockopt(TCP_INFO) failed"); + return; + } + if (info.tcpi_state != TCP_ESTABLISHED) { return; diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index f4b56e9f..61b30a35 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -1246,7 +1246,7 @@ bms_any_member(Bitmapset *a) int member; int random = abs(rand()) % bms_num_members(a); for (member = 0; member < random; member++) - bms_first_member(a); + (void) bms_first_member(a); return bms_first_member(a); } #endif diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 86fe951b..794a8d81 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -938,7 +938,7 @@ clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path) if (is_opclause(clause)) { OpExpr *opclause = (OpExpr *) clause; - char *oprname; + char *oprname = NULL; Oid opno = opclause->opno; HeapTuple opTuple; Form_pg_operator operform; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index c1583491..13150602 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -3580,7 +3580,7 @@ check_or_exist_qual_pullupable(PlannerInfo *root, Node *node) } else if (or_clause(node)) { - return pull_vars_of_level((Node *)lfirst(l), 1) == NIL; + return pull_vars_of_level(node, 1) == NIL; } else { diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 35bf8b8a..cc19120e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3868,7 +3868,6 @@ create_index_path(PlannerInfo *root, List *indexquals, *indexqualcols; #ifdef __COLD_HOT__ - bool or_clause = false; List *quals = NULL; RangeTblEntry *rte = planner_rt_fetch(rel->relid, root); RelationLocInfo *rel_loc_info = GetRelationLocInfo(rte->relid); @@ -3939,7 +3938,7 @@ create_index_path(PlannerInfo *root, } } - if (IS_PGXC_COORDINATOR && !or_clause) + if (IS_PGXC_COORDINATOR) { int count = 0; Distribution *distribution = ((Path *)pathnode)->distribution; @@ -4013,15 +4012,6 @@ create_index_path(PlannerInfo *root, } } } -/* - else if 
(IS_PGXC_COORDINATOR && or_clause && root->parse->commandType == CMD_SELECT) - { - if (rel_loc_info && AttributeNumberIsValid(rel_loc_info->secAttrNum)) - { - add_groups_to_list(false, rte->relid, rel_loc_info, NULL, NULL, NULL); - } - } -*/ #ifdef __COLD_HOT__ if (IS_PGXC_COORDINATOR) { diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index efabf774..d0d5c909 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -326,7 +326,7 @@ transformOptionalSelectInto(ParseState *pstate, Node *parseTree) stmt = stmt->larg; Assert(stmt && IsA(stmt, SelectStmt) &&stmt->larg == NULL); - if ((stmt != NULL) && (stmt->intoClause)) + if (stmt && stmt->intoClause) { CreateTableAsStmt *ctas = makeNode(CreateTableAsStmt); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 5360c6a5..d041fdfb 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3314,6 +3314,9 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; cxt.ofType = false; + cxt.interval_child = false; + cxt.interval_child_idx = -1; + cxt.interval_parentId = InvalidOid; /* * The only subtypes that currently require parse transformation handling diff --git a/src/backend/pgxc/copy/copyops.c b/src/backend/pgxc/copy/copyops.c index aba4aa05..0e9e1b90 100644 --- a/src/backend/pgxc/copy/copyops.c +++ b/src/backend/pgxc/copy/copyops.c @@ -123,9 +123,6 @@ attribute_out_text(StringInfo buf, char *string) c = 'v'; break; default: - /* If it's the delimiter, must backslash it */ - if (c == delimc) - break; /* All ASCII control chars are length 1 */ ptr++; continue; /* fall to end of loop */ diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 9ce0b468..20abfd91 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -1767,6 +1767,7 @@ static int locate_shard_insert(Locator *self, Datum value, bool isnull, } else { + Assert(global_index >= 0); local_index = self->nodeindexMap[global_index]; } ((void **) self->results)[0] = ((void **) self->nodeMap)[local_index]; @@ -1893,6 +1894,7 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, hashvalue = compute_hash(self->dataType, value, LOCATOR_TYPE_SHARD); global_index = GetNodeIndexByHashValue(self->groupid, hashvalue); + Assert(global_index >= 0); switch (self->listType) { diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 830b1b8d..65fbcccd 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -36,6 +36,7 @@ #include "catalog/pgxc_shard_map.h" #include "utils/fmgroids.h" #include "catalog/pgxc_class.h" +#include "access/xact.h" #endif #ifdef __TBASE__ diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 2692e307..e2d7158f 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1067,7 +1067,9 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, ListCell *lc; bool can_use_pk_for_rep_change = false; int16 *indexed_col_numbers = NULL; +#if 0 int index_col_count = 0; +#endif /* Make sure we are dealing with DMLs */ if (cmdtype != CMD_UPDATE && @@ -1328,6 +1330,7 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, rqplan->rq_param_types[rqplan->rq_num_params++] = INT4OID; } } +#if 0 else { /* @@ -1360,6 +1363,7 @@ 
pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, pkattno, resultRelationIndex, INT4OID, false); } } +#endif query_to_deparse->jointree->quals = (Node *)make_andclause( (List *)query_to_deparse->jointree->quals); } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c11ab9d7..8f0e5ab9 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -252,9 +252,11 @@ stat_log() for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++) elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)", i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions); - } + elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)", MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions); + } + if (nodes_per_transaction) { int i; @@ -9233,19 +9235,12 @@ ExecRemoteQuery(PlanState *pstate) #ifdef __TBASE__ if (enable_statistic) { - double __tmp__ = ((double)combiner->recv_tuples); - if (__tmp__ != 0) - { - elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:%lf.", - combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time, - ((double)combiner->recv_total_time) / __tmp__); - } - else - { - elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:--", - combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time - ); - } + elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, " + "recv_total_time:%ld, avg_time:%lf.", + combiner->recv_node_count,combiner->recv_tuples, + combiner->recv_total_time, + combiner->recv_tuples ? ((double)combiner->recv_total_time)/ + ((double)combiner->recv_tuples) : -1); } #endif return NULL; @@ -11184,19 +11179,12 @@ ExecRemoteSubplan(PlanState *pstate) #ifdef __TBASE__ if (enable_statistic) { - double __tmp__= (double)combiner->recv_tuples; - if(__tmp__) - { - elog(LOG, "FetchTuple: worker:%d, recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:%lf.", - ParallelWorkerNumber, combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time, - ((double)combiner->recv_total_time) / __tmp__); - } - else - { - elog(LOG, "FetchTuple: worker:%d, recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:--.", - ParallelWorkerNumber, combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time - ); - } + elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, " + "recv_total_time:%ld, avg_time:%lf.", + combiner->recv_node_count,combiner->recv_tuples, + combiner->recv_total_time, + combiner->recv_tuples ? 
((double)combiner->recv_total_time)/ + ((double)combiner->recv_tuples) : -1); } #endif return NULL; @@ -12560,7 +12548,7 @@ is_node_prepared(RemoteQueryState *rstate, int node) { int32 wordindex = 0; int32 wordoffset = 0; - if (node > MAX_NODES_NUMBER) + if (node >= MAX_NODES_NUMBER) { elog(ERROR, "invalid nodeid:%d is bigger than maximum node number of the cluster", node); } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 353f6b7f..c7c630fa 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1274,9 +1274,6 @@ pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error) return EOF; } - if (someread) - return 1; /* got a zero read after successful tries */ - return 0; } @@ -4926,7 +4923,12 @@ PGXCNodeGetTransactionParamStr(void) void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { - pgxc_node_send_query(handle, set_query); + if (pgxc_node_send_query(handle, set_query) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send query %s",set_query))); + } /* * Now read responses until ReadyForQuery. * XXX We may need to handle possible errors here. @@ -5787,7 +5789,7 @@ PGXCNodeTypeString(char node_type) #endif #ifdef __AUDIT_FGA__ -void PGXCGetCoordOidOthers(Oid **nodelist) +void PGXCGetCoordOidOthers(Oid *nodelist) { Oid node_oid; int i; @@ -5798,7 +5800,7 @@ void PGXCGetCoordOidOthers(Oid **nodelist) node_oid = co_handles[i].nodeoid; if (co_handles[PGXCNodeId - 1].nodeoid != node_oid) { - (*nodelist)[j] = node_oid; + nodelist[j] = node_oid; j++; } } diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index 8a70dffb..c0168b00 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -83,7 +83,10 @@ pool_listen(unsigned short port, const char *unixSocketName) /* bind the name to the descriptor */ if (bind(fd, (struct sockaddr *) & unix_addr, len) < 0) + { + close(fd); return -1; + } /* * Select appropriate accept-queue length limit. 
PG_SOMAXCONN is only @@ -96,7 +99,10 @@ pool_listen(unsigned short port, const char *unixSocketName) /* tell kernel we're a server */ if (listen(fd, maxconn) < 0) + { + close(fd); return -1; + } @@ -165,7 +171,10 @@ pool_connect(unsigned short port, const char *unixSocketName) strlen(unix_addr.sun_path) + 1; if (connect(fd, (struct sockaddr *) & unix_addr, len) < 0) + { + close(fd); return -1; + } return fd; #else @@ -833,14 +842,9 @@ pool_recvfds(PoolPort *port, int *fds, int count) } else if (r == 0) { - if(recved_size == size) - break; - else - { error_no = errno; goto receive_error; } - } recved_size += r; if(recved_size == size) @@ -1159,8 +1163,8 @@ pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *err failure: if (buf) { - buf = NULL; free(buf); + buf = NULL; } if (PoolConnectDebugPrint) @@ -1329,9 +1333,6 @@ pool_recvres(PoolPort *port, bool need_log) } else if (r == 0) { - if(recved_size == size) - break; - else goto failure; } @@ -1449,9 +1450,6 @@ pool_recvpids(PoolPort *port, int **pids) } else if (r == 0) { - if(size == recved_size) - break; - else goto failure; } @@ -1476,6 +1474,7 @@ pool_recvpids(PoolPort *port, int **pids) if (n32 == 0) { elog(WARNING, "No transaction to abort"); + free(buf); return 0; } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 63d2c3e9..b9c3bd8c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1115,7 +1115,13 @@ char *session_options(void) continue; } - SplitIdentifierString(strdup(value), ',', &value_list); + if (SplitIdentifierString(strdup(value), ',', &value_list) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("parse session options failed."))); + } + foreach(l, value_list) { char *value = (char *) lfirst(l); @@ -10903,6 +10909,65 @@ handle_session_command(PoolAgent * agent, StringInfo s) } } +static bool +remove_all_agent_references(Oid nodeoid) +{ + int i, j, index; + bool res = true; + + /* + * Identify if it's a coordinator or datanode first + * and get its index + */ + for (i = 0; i < agentCount; i++) + { + bool found = false; + PoolAgent *agent; + + index = agentIndexes[i]; + agent = poolAgents[index]; + + for (j = 0; j < agent->num_dn_connections; j++) + { + if (agent->dn_conn_oids[j] == nodeoid) + { + found = true; + break; + } + } + if (found) + { + PGXCNodePoolSlot *slot = agent->dn_connections[j]; + if (slot) + release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); + agent->dn_connections[j] = NULL; + } + else + { + for (j = 0; j < agent->num_coord_connections; j++) + { + if (agent->coord_conn_oids[j] == nodeoid) + { + found = true; + break; + } + } + if (found) + { + PGXCNodePoolSlot *slot = agent->coord_connections[j]; + if (slot) + release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); + agent->coord_connections[j] = NULL; + } + else + { + elog(LOG, "Node not found! 
(%u)", nodeoid); + res = false; + } + } + } + return res; +} /* * refresh_database_pools diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 1c6fc478..a145edc5 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -5781,7 +5781,7 @@ ExecFastSendDatarow(TupleTableSlot *slot, void *sndctl, int32 nodeindex, MemoryC uint32 remaining_length = 0; MemoryContext savecxt = NULL; - ReserveSpace(node->buffer, tuple_len, &data_offset); + (void) ReserveSpace(node->buffer, tuple_len, &data_offset); remaining_length = tuple_len; /* MsgType */ FillReserveSpace(node->buffer, data_offset, "D", 1); @@ -8167,7 +8167,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * uint32 remaining_length = 0; MemoryContext savecxt = NULL; - ReserveBufferSpace(buf, tuple_len, &data_offset); + (void) ReserveBufferSpace(buf, tuple_len, &data_offset); remaining_length = tuple_len; /* MsgType */ FillReserveBufferSpace(buf, data_offset, "D", 1); diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 3646fd85..96991186 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -352,7 +352,7 @@ RenameSlot(const char *oldname, const char *newname) Oid get_replication_slot_slotid(const char *slotname, bool missing_ok) { - Oid oid; + Oid oid = InvalidOid; int i = 0; for (i = 0; i < max_replication_slots; i++) @@ -382,7 +382,7 @@ get_replication_slot_slotid(const char *slotname, bool missing_ok) Oid get_replication_slot_dbid(const char *slotname, bool missing_ok) { - Oid oid; + Oid oid = InvalidOid; int i = 0; for (i = 0; i < max_replication_slots; i++) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 18da64a4..67ae7984 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1584,7 +1584,7 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) * just did the same thing. If it doesn't work then we'll bomb out on * the second create attempt, instead. */ - mkdir(tempdirpath, S_IRWXU); + (void) mkdir(tempdirpath, S_IRWXU); file = PathNameOpenFile(tempfilepath, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 48a49f73..1ea3d3ea 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -217,7 +217,7 @@ bool explain_stmt = false; #endif #ifdef __AUDIT_FGA__ -char *g_commandTag = NULL; +const char *g_commandTag = NULL; #endif @@ -1303,7 +1303,7 @@ exec_simple_query(const char *query_string) commandTag = CreateCommandTag(parsetree->stmt); #ifdef __AUDIT_FGA__ - g_commandTag = pnstrdup(commandTag, strlen(commandTag)); + g_commandTag = commandTag; #endif #ifdef __TBASE__ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 29c53160..1a2cb2cc 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1615,7 +1615,7 @@ PortalRun(Portal portal, long count, bool isTopLevel, bool run_once, * saveResourceOwner points to subtransaction's resourceOwner, but ROLLBACK SUBTXN * has already released the resource, so we need to switch to current transaction owner. 
*/ - else if (IS_PGXC_DATANODE && (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0)) + else if (IS_PGXC_DATANODE && portal->commandTag && (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0)) { CurrentResourceOwner = GetCurrentTransactionResourceOwner(); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 29409d44..cf727d6f 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3134,7 +3134,7 @@ ProcessUtilitySlow(ParseState *pstate, bool isCompleteQuery = (context <= PROCESS_UTILITY_QUERY); bool needCleanup; bool commandCollected = false; - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; ObjectAddress secondaryObject = InvalidObjectAddress; /* All event trigger calls are done only when isCompleteQuery is true */ diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 02c3455c..6661ab49 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -435,7 +435,7 @@ GetCurrentDateTime(struct pg_tm *tm) int tz; fsec_t fsec; - timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, &fsec, + (void) timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, &fsec, NULL, NULL); /* Note: don't pass NULL tzp to timestamp2tm; affects behavior */ } @@ -451,7 +451,7 @@ GetCurrentTimeUsec(struct pg_tm *tm, fsec_t *fsec, int *tzp) { int tz; - timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, fsec, + (void) timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, fsec, NULL, NULL); /* Note: don't pass NULL tzp to timestamp2tm; affects behavior */ if (tzp != NULL) diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 91078189..a3877dff 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -582,15 +582,15 @@ pushJsonbValueScalar(JsonbParseState **pstate, JsonbIteratorToken seq, (*pstate)->size); break; case WJB_KEY: - Assert(scalarVal->type == jbvString); + Assert(scalarVal && scalarVal->type == jbvString); appendKey(*pstate, scalarVal); break; case WJB_VALUE: - Assert(IsAJsonbScalar(scalarVal)); + Assert(scalarVal && IsAJsonbScalar(scalarVal)); appendValue(*pstate, scalarVal); break; case WJB_ELEM: - Assert(IsAJsonbScalar(scalarVal)); + Assert(scalarVal && IsAJsonbScalar(scalarVal)); appendElement(*pstate, scalarVal); break; case WJB_END_OBJECT: diff --git a/src/backend/utils/adt/network_gist.c b/src/backend/utils/adt/network_gist.c index e7a4a5e3..4edfdec3 100644 --- a/src/backend/utils/adt/network_gist.c +++ b/src/backend/utils/adt/network_gist.c @@ -472,7 +472,7 @@ build_inet_union_key(int family, int minbits, int commonbits, unsigned char *addr) { GistInetKey *result; - + Assert(commonbits >= 0); /* Make sure any unused bits are zeroed. 
*/ result = (GistInetKey *) palloc0(sizeof(GistInetKey)); diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index a4f41db7..8b28d653 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -551,7 +551,7 @@ oidvector_append(oidvector *oldoids, Oid newOid) result->ndim = 1; SET_VARSIZE(result, OidVectorSize(oldlen + 1)); - if ((oldoids) && (oldoids->dim1 > 0)) + if (oldoids && oldoids->dim1 > 0) memcpy(result->values, oldoids->values, oldlen * sizeof(Oid)); result->values[result->dim1-1] = newOid; diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 70e1125e..aaaf55cf 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -2036,7 +2036,7 @@ SetEpochTimestamp(void) GetEpochTime(tm); /* we don't bother to test for failure ... */ - tm2timestamp(tm, 0, NULL, &dt); + (void) tm2timestamp(tm, 0, NULL, &dt); return dt; } /* SetEpochTimestamp() */ diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index 79eef334..fd16e5fb 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -283,7 +283,7 @@ void cyprt_key_info_hash_init(void) CRYPT_KEY_INFO_HASHTABLE_INIT_SIZE, CRYPT_KEY_INFO_HASHTABLE_MAX_SIZE, &info, - HASH_ELEM | HASH_PARTITION | HASH_COMPARE); + HASH_ELEM | HASH_PARTITION | HASH_COMPARE | HASH_BLOBS); g_crypt_key_info_lock = (CryptKeyInfoLock) ShmemInitStruct("crypt key info lock shmem", MAXALIGN64(sizeof(CryptKeyInfoLockData)), &found); diff --git a/src/backend/utils/misc/datamask.c b/src/backend/utils/misc/datamask.c index edf4db7c..a0101b84 100644 --- a/src/backend/utils/misc/datamask.c +++ b/src/backend/utils/misc/datamask.c @@ -625,10 +625,10 @@ bool datamask_scan_key_contain_mask(ScanState *node) ScanKey ScanKeys; int NumScanKeys; - if(!IsA(node, IndexScanState) && !IsA(node, IndexOnlyScanState)) + if(node == NULL) return false; - if(node == NULL) + if(!IsA(node, IndexScanState) && !IsA(node, IndexOnlyScanState)) return false; if (node->ss_currentRelation && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ffc6dbd4..d1bf813f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -6985,6 +6985,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) strerror(errno)); if (errno == ENOENT) write_stderr("Run initdb or pg_basebackup to initialize a PostgreSQL data directory.\n"); + free(configdir); return false; } @@ -7054,6 +7055,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "or by the -D invocation option, or by the " "PGDATA environment variable.\n", progname, ConfigFileName); + free(configdir); return false; } @@ -7102,6 +7104,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "or by the -D invocation option, or by the " "PGDATA environment variable.\n", progname, ConfigFileName); + free(configdir); return false; } SetConfigOption("hba_file", fname, PGC_POSTMASTER, PGC_S_OVERRIDE); diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 5ff3cf8e..4d7ea96e 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -1317,10 +1317,10 @@ void InsertTrsprtCryptPolicyMapTuple(Relation pg_transp_crypt_map_desc, rd_rel = pg_transp_crypt_map_desc->rd_rel; - memset(&NameStr(schemaname), 0, NAMEDATALEN); - memcpy(&NameStr(schemaname), GetSchemaNameByOid(relnamespace), NAMEDATALEN); - memset(&NameStr(spcname), 0, NAMEDATALEN); - memcpy(&NameStr(spcname), 
get_tablespace_name(spaceoid), NAMEDATALEN); + memset(NameStr(schemaname), 0, NAMEDATALEN); + strncpy(NameStr(schemaname), GetSchemaNameByOid(relnamespace), NAMEDATALEN); + memset(NameStr(spcname), 0, NAMEDATALEN); + strncpy(NameStr(spcname), get_tablespace_name(spaceoid), NAMEDATALEN); /* diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index b3d25625..fa69b54d 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -2260,7 +2260,7 @@ text * decrypt_procedure(AlgoId algo_id, text * text_src, int context_length) sm4_crypt_ecb(&(cryptkey->sm4_ctx_decrypt), 0, ctx_len, (unsigned char*)VARDATA_ANY(text_src), (unsigned char*)VARDATA_ANY(text_src)); - text_ret = text_src; + text_ret = NULL; } } @@ -2273,7 +2273,7 @@ text * decrypt_procedure(AlgoId algo_id, text * text_src, int context_length) PointerGetDatum(text_src), PointerGetDatum(privatekey), PointerGetDatum(text_src)); - text_ret = text_src; + text_ret = NULL; } else { diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index f7f11c06..ce9fc58a 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -1975,6 +1975,7 @@ get_best_segment(dsa_area *area, Size npages) Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); check_for_freed_segments_locked(area); + Assert(npages > 0); /* * Start searching from the first bin that *might* have enough contiguous diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index ad5d9988..11e27a41 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3810,6 +3810,8 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + ltup.t_tableOid = InvalidOid; + rtup.t_xc_node_id = InvalidOid; tupDesc = state->tupDesc; if (sortKey->abbrev_converter) @@ -3864,6 +3866,9 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + htup.t_tableOid = InvalidOid; + htup.t_xc_node_id = InvalidOid; + original = heap_getattr(&htup, state->sortKeys[0].ssup_attno, state->tupDesc, @@ -4017,6 +4022,7 @@ readtup_datanode(Tuplesortstate *state, SortTuple *stup, /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + htup.t_tableOid = InvalidOid; stup->datum1 = heap_getattr(&htup, state->sortKeys[0].ssup_attno, state->tupDesc, diff --git a/src/bin/initgtm/initgtm.c b/src/bin/initgtm/initgtm.c index 77b9baeb..204b6b0b 100644 --- a/src/bin/initgtm/initgtm.c +++ b/src/bin/initgtm/initgtm.c @@ -544,7 +544,7 @@ setup_config(void) } writefile(path, conflines); - chmod(path, S_IRUSR | S_IWUSR); + (void) chmod(path, S_IRUSR | S_IWUSR); free(conflines); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index bab64a56..387370b4 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -867,7 +867,7 @@ _CloseArchive(ArchiveHandle *AH) */ th = tarOpen(AH, "restore.sql", 'w'); - tarPrintf(AH, th, "--\n" + (void) tarPrintf(AH, th, "--\n" "-- NOTE:\n" "--\n" "-- File paths need to be edited. 
Search for $$PATH$$ and\n" diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 5f1f167f..c5a0842e 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -635,7 +635,7 @@ GTMPQconnectPoll(GTM_Conn *conn) case CONNECTION_STARTED: { int optval; - size_t optlen = sizeof(optval); + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); /* * Write ready, since we've made it here, so the connection @@ -1052,9 +1052,9 @@ closeGTM_Conn(GTM_Conn *conn) * Force length word for backends may try to read that in a generic * code */ - gtmpqPutMsgStart('X', true, conn); - gtmpqPutMsgEnd(conn); - gtmpqFlush(conn); + (void) gtmpqPutMsgStart('X', true, conn); + (void) gtmpqPutMsgEnd(conn); + (void) gtmpqFlush(conn); } /* @@ -1463,7 +1463,11 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, struct tcp_info info; int len = sizeof(info); /* check sock */ - getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) + { + return false; + } + if (info.tcpi_state != TCP_ESTABLISHED) { /* No need to set */ diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index c0ef6ab1..8ff8b131 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -2828,9 +2828,12 @@ begin_transaction_multi(GTM_Conn *conn, int txn_count, GTM_IsolationLevel *txn_i for (i = 0; i < txn_count; i++) { - gtmpqPutInt(txn_isolation_level[i], sizeof(int), conn); - gtmpqPutc(txn_read_only[i], conn); - gtmpqPutInt(txn_connid[i], sizeof(int), conn); + if (gtmpqPutInt(txn_isolation_level[i], sizeof(int), conn) || + gtmpqPutc(txn_read_only[i], conn) || + gtmpqPutInt(txn_connid[i], sizeof(int), conn)) + { + goto send_failed; + } } /* Finish the message. 
*/ diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c index a3fea6b4..bb0344a5 100644 --- a/src/gtm/common/gtm_opt_handler.c +++ b/src/gtm/common/gtm_opt_handler.c @@ -1440,6 +1440,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "You must specify the --config-file or -D invocation " "option or set the PGDATA environment variable.\n", progname); + free(configdir); return false; } @@ -2699,7 +2700,10 @@ set_config_sourcefile(const char *name, char *sourcefile, int sourceline) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else elog(ERROR, "unrecognized configuration parameter \"%s\"", name); } @@ -2752,7 +2756,10 @@ GetConfigOption(const char *name, bool restrict_superuser) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, @@ -2801,7 +2808,10 @@ GetConfigOptionResetString(const char *name) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, @@ -2874,7 +2884,10 @@ GetConfigOptionByName(const char *name, const char **varname) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index bf804b98..4989ed41 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -567,7 +567,7 @@ kill_zombie() } } - fclose(fp); + pclose(fp); } static void @@ -1097,8 +1097,11 @@ do_status(void) printf("\"-D\" \"%s\"",gtm_data); optlines = readfile(gtmopts_file); if (optlines != NULL) + { for (; *optlines != NULL; optlines++) fputs(*optlines, stdout); + free(optlines); + } } diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 8208b72a..43a1c9ab 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3448,7 +3448,13 @@ int32 GTM_StoreDropAllSeqInDatabase(GTM_SequenceKey seq_database_key) { elog(LOG, "GTM_StoreDropAllSeqInDatabase drop %s",seq_list[i].gs_key.gsk_key); } - GTM_StoreDropSeq(seq_list[i].gti_store_handle); + + if (GTM_StoreDropSeq(seq_list[i].gti_store_handle)) + { + ereport(LOG, + (ERANGE, + errmsg("GTM_StoreDropSeq fail"))); + } } if (enable_gtm_sequence_debug) @@ -3973,7 +3979,7 @@ ProcessCheckStorageTransactionCommand(Port *myport, StringInfo message) if (error) { memcpy(&txn_list[txn_count].txn, txn_info, sizeof(GTM_StoredTransactionInfo)); - if (error || GTMStorageStatus_CRC_error) + if (error & GTMStorageStatus_CRC_error) { if (need_fix) { @@ -3986,12 +3992,12 @@ ProcessCheckStorageTransactionCommand(Port *myport, StringInfo message) } } - if (error || GTMStorageStatus_freelist_error) + if (error & GTMStorageStatus_freelist_error) { txn_list[txn_count].status |= GTMStorageStatus_freelist_unchanged; } - if (error || GTMStorageStatus_hashtab_error) + if (error & GTMStorageStatus_hashtab_error) { txn_list[txn_count].status |= GTMStorageStatus_hashtab_unchanged; } diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index ec3d7c66..d38aea68 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -836,7 +836,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo for(;;) { - bytes = read(fd,buff->buff + buff->total_length,GTM_XLOG_SEG_SIZE); + bytes = read(fd,buff->buff + buff->total_length,GTM_XLOG_SEG_SIZE - 
buff->total_length); if(bytes < 0) { @@ -1227,7 +1227,7 @@ GTM_XLogCtlDataInit(void) g_checkpointDirtySize = (uint32 *)palloc(sizeof(uint32) * g_GTMStoreSize); g_checkpointDirtyStart = (uint32 *)palloc(sizeof(uint32) * g_GTMStoreSize); - if(enalbe_gtm_xlog_debug || enalbe_gtm_xlog_debug) + if(enalbe_gtm_xlog_debug || enable_gtm_debug) { elog(LOG,"Read ControlData CurrBytePos to %"PRIu64" PrevBytePos to %"PRIu64"",Insert->CurrBytePos,Insert->PrevBytePos); elog(LOG,"Read ControlData EndOfXLog to %X/%X PreEndOfXLog to %X/%X", @@ -3437,9 +3437,6 @@ XLogWrite(XLogRecPtr req) do { - if(nleft == 0) - break; - errno = 0; written = write(XLogCtl->xlog_fd, XLogCtl->writerBuff + start_pos, nleft); if (written <= 0) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 113cee78..7324ad8b 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -563,7 +563,10 @@ static int CheckTscFeatures(char *cmd) if (file == NULL) return false; - fscanf(file, "%d", &count); + if (fscanf(file, "%d", &count) == EOF) + { + count = 0; + } pclose(file); return count; @@ -2190,6 +2193,7 @@ GTM_ThreadTimeKeeper(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -2451,7 +2455,7 @@ GTM_ThreadCheckPointer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; - + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) { @@ -2567,6 +2571,7 @@ GTM_ThreadWalSender(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -2859,6 +2864,8 @@ GTM_ThreadArchiver(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); + ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3081,6 +3088,7 @@ GTM_ThreadWalRedoer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3149,6 +3157,8 @@ GTM_ThreadWalReceiver(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); + ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3348,6 +3358,7 @@ GTM_ThreadMain(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3611,6 +3622,7 @@ GTM_ThreadBasebackup(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3877,7 +3889,7 @@ ProcessCommand(Port *myport, StringInfo input_message) mtype != MSG_CHECK_GTM_STATUS ); - if(GetMyThreadInfo->handle_standby) + if(my_threadinfo->handle_standby) { #ifndef __XLOG__ /* Handle standby connecion staff. 
*/ @@ -4259,6 +4271,7 @@ GTMAddConnection(Port *port, GTM_Conn *standby) if (NULL == GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number)) { elog(WARNING, "Failed to create gtm thread."); + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); break; } GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 5508c06d..279471d3 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -1522,7 +1522,10 @@ GTMProxy_ThreadMain(void *argp) * Make sure everything is on wire now */ Enable_Longjmp(); - gtmpqFlush(thrinfo->thr_gtm_conn); + if (gtmpqFlush(thrinfo->thr_gtm_conn)) + { + elog(ERROR, "Error sending flush message"); + } Disable_Longjmp(); /* diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c index a10c60e9..9c79bb33 100644 --- a/src/gtm/recovery/register_common.c +++ b/src/gtm/recovery/register_common.c @@ -598,12 +598,17 @@ Recovery_RecordRegisterInfo(GTM_PGXCNodeInfo *nodeinfo, bool is_register) int ctlfd; int len; + if (nodeinfo == NULL) + { + return; + } + GTM_RWLockAcquire(&RegisterFileLock, GTM_LOCKMODE_WRITE); ctlfd = open(GTMPGXCNodeFile, O_WRONLY | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR); - if (ctlfd == -1 || nodeinfo == NULL) + if (ctlfd == -1) { GTM_RWLockRelease(&RegisterFileLock); return; diff --git a/src/gtm/xlog_test/xlog_reader.c b/src/gtm/xlog_test/xlog_reader.c index 400846f7..3e0d6520 100644 --- a/src/gtm/xlog_test/xlog_reader.c +++ b/src/gtm/xlog_test/xlog_reader.c @@ -915,7 +915,8 @@ void bind_service_threads(void) } /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeKeeper(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1028,7 +1029,8 @@ GTM_ThreadTimeKeeper(void *argp) /* time keeper thread will not handle any signal, any signal will cause the thread exit. 
*/ -void * +void +* GTM_ThreadTimeBackup(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1047,6 +1049,7 @@ GTM_ThreadTimeBackup(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -1174,6 +1177,7 @@ GTM_ThreadCheckPointer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -1253,7 +1257,7 @@ GTM_ThreadXLogWriter(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; - + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a0db442b..018cecd9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -244,6 +244,7 @@ extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); +extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/audit/audit_fga.h b/src/include/audit/audit_fga.h index d0697439..c60c3549 100644 --- a/src/include/audit/audit_fga.h +++ b/src/include/audit/audit_fga.h @@ -70,7 +70,7 @@ #define AUDIT_TRIGGER_FEEDBACK_LEN 256 extern bool enable_fga; -extern char *g_commandTag; +extern const char *g_commandTag; /* simple list of strings */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 6643dc36..be08deab 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -305,7 +305,7 @@ extern const char * PGXCNodeTypeString(char node_type); #endif #ifdef __AUDIT_FGA__ -extern void PGXCGetCoordOidOthers(Oid **nodelist); +extern void PGXCGetCoordOidOthers(Oid *nodelist); extern void PGXCGetAllDnOid(Oid *nodelist); #endif diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index e5fb8ba0..ced7beb8 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -299,7 +299,10 @@ ecpg_is_type_an_array(int type, const struct statement *stmt, const struct varia else return (ECPG_ARRAY_ERROR); - ecpg_type_infocache_push(&(stmt->connection->cache_head), type, isarray, stmt->lineno); + if (!ecpg_type_infocache_push(&(stmt->connection->cache_head), type, isarray, stmt->lineno)) + { + return (ECPG_ARRAY_ERROR); + } ecpg_log("ecpg_is_type_an_array on line %d: type (%d); C (%d); array (%s)\n", stmt->lineno, type, var->type, ECPG_IS_ARRAY(isarray) ? 
"yes" : "no"); return isarray; } diff --git a/src/interfaces/ecpg/preproc/ecpg.c b/src/interfaces/ecpg/preproc/ecpg.c index e83faf84..353a7d64 100644 --- a/src/interfaces/ecpg/preproc/ecpg.c +++ b/src/interfaces/ecpg/preproc/ecpg.c @@ -469,7 +469,7 @@ main(int argc, char *const argv[]) */ if (ret_value != 0) { - if (strcmp(output_filename, "-") != 0 && unlink(output_filename) != 0) + if (output_filename && strcmp(output_filename, "-") != 0 && unlink(output_filename) != 0) fprintf(stderr, _("could not remove output file \"%s\"\n"), output_filename); } } diff --git a/src/interfaces/libpq/fe-auth.c b/src/interfaces/libpq/fe-auth.c index 44659d51..e0dbc56a 100644 --- a/src/interfaces/libpq/fe-auth.c +++ b/src/interfaces/libpq/fe-auth.c @@ -606,8 +606,6 @@ pg_SASL_init(PGconn *conn, int payloadlen) oom_error: termPQExpBuffer(&mechanism_buf); - if (initialresponse) - free(initialresponse); printfPQExpBuffer(&conn->errorMessage, libpq_gettext("out of memory\n")); return STATUS_ERROR; diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index e8be4dae..8c1ec04b 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -3631,8 +3631,8 @@ sendTerminateConn(PGconn *conn) * Try to send "close connection" message to backend. Ignore any * error. */ - pqPutMsgStart('X', false, conn); - pqPutMsgEnd(conn); + (void) pqPutMsgStart('X', false, conn); + (void) pqPutMsgEnd(conn); (void) pqFlush(conn); } } diff --git a/src/interfaces/libpq/fe-protocol2.c b/src/interfaces/libpq/fe-protocol2.c index 8ab4b5e5..d0d63160 100644 --- a/src/interfaces/libpq/fe-protocol2.c +++ b/src/interfaces/libpq/fe-protocol2.c @@ -1064,8 +1064,6 @@ pqGetErrorNotice2(PGconn *conn, bool isError) return 0; failure: - if (res) - PQclear(res); termPQExpBuffer(&workBuf); return EOF; } diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 309956d5..78331d9d 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -6197,6 +6197,7 @@ get_tuple_from_datum(Datum value) ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; tmptup.t_data = td; + tmptup.t_xc_node_id = InvalidOid; /* Build a copy and return it */ return heap_copytuple(&tmptup); diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c index 9adc4eab..262ca97a 100644 --- a/src/timezone/localtime.c +++ b/src/timezone/localtime.c @@ -230,8 +230,6 @@ tzloadbody(char const *name, char *canonname, struct state *sp, bool doextend, if (!name) { name = TZDEFAULT; - if (!name) - return EINVAL; } if (name[0] == ':') From b3c94a4c40b835dfbd1232c97ed6cc1e7c640e63 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 16 Mar 2021 21:15:55 +0800 Subject: [PATCH 339/578] Fix ce. 
--- src/backend/access/common/printtup.c | 1 - src/backend/parser/parse_utilcmd.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index c72fb1af..dfd64707 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -449,7 +449,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self) { int len = strlen(outputstr); #ifdef __TBASE__ - int len = strlen(outputstr); if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { Oid tupType; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index d041fdfb..25550493 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3313,7 +3313,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #endif cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; - cxt.ofType = false; cxt.interval_child = false; cxt.interval_child_idx = -1; cxt.interval_parentId = InvalidOid; From 63a9f3086f047b6be67d63d2a305dc5a2bb6d210 Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 13 Mar 2021 11:11:41 +0800 Subject: [PATCH 340/578] check interrupts before recv retry --- src/backend/libpq/be-secure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index f0006171..ea947f5b 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -257,8 +257,10 @@ secure_read(Port *port, void *ptr, size_t len) /* * We'll retry the read. Most likely it will return immediately * because there's still no data available, and we'll wait for the - * socket to become ready again. + * socket to become ready again. But we should check interrupts + * before retry incase of conflict interrupt. 
*/ + CHECK_FOR_INTERRUPTS(); } goto retry; } From 5095f437e8c0fd71c6f91f06523896e5b10db0e1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 18 Mar 2021 17:24:49 +0800 Subject: [PATCH 341/578] fix bug in coverity code --- src/backend/libpq/pqcomm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index da7c9a50..fb35a142 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2026,7 +2026,6 @@ SetSockKeepAlive(int sock) /* check sock */ if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) { - elog(LOG, "getsockopt(TCP_INFO) failed"); return; } From 57dcc31186ce36c7b2c4e2bf80844d76679759a8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 18 Mar 2021 19:36:09 +0800 Subject: [PATCH 342/578] fix complie warning --- src/gtm/main/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 7324ad8b..ec2b2143 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -610,7 +610,7 @@ static bool CheckClockSource(void) #endif -static void GTM_XLogRecoveryIfNeed(const char *data_dir) +static void GTM_XLogRecoveryIfNeed(char *data_dir) { Assert(ControlData != NULL); From 90b348d4ced58f040c318efb18cc5c0df0302eca Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 19 Mar 2021 10:54:16 +0800 Subject: [PATCH 343/578] fix ID85357491 for range partition table with big step --- src/backend/utils/adt/ruleutils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 8fa13f29..eb2e5420 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -11717,7 +11717,7 @@ find_partidx_by_int(int64 start, int step, int partitions, int gap = -1; int align = -1; - if(value < start || value >= start + step*partitions) + if(value < start || value >= start + ((int64)step)*partitions) { return PARTITION_ROUTER_RESULT_NULL; } From 0c110531600bc86c2afdc73455085aac970e5dbe Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 19 Mar 2021 16:31:14 +0800 Subject: [PATCH 344/578] Fix ce --- src/gtm/gtm_ctl/gtm_ctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 4989ed41..2b34ad0d 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -1100,7 +1100,6 @@ do_status(void) { for (; *optlines != NULL; optlines++) fputs(*optlines, stdout); - free(optlines); } } From d09218dca2bc9644d2b1acd5e761702549ce93dc Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 22 Mar 2021 15:29:45 +0800 Subject: [PATCH 345/578] jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) Squash merge branch 'jacky/bugfix/coredump_Tbase_v5.05.3' into 'Tbase_v5.05.3' * rollback to former. * rollback * delete nouse code. * delete nouse code. * refactor * disable nodePool->created * delete nouse code. * m_version * Revert 'debug info.' * Revert 'debug info' * comment * add log info. * add log info. * debug info. * debug info * debug info. * modified accoording to suggestion * modified according to suggestion. * delete nouse code. * time * delete nouse code. * fixed warm invalid slot. 
* fix Conflicts: src/backend/pgxc/pool/poolmgr.c --- src/backend/pgxc/pool/poolmgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b9c3bd8c..f33b95f0 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -4725,6 +4725,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { +<<<<<<< HEAD elog(LOG, POOL_MGR_PREFIX"release_connection connection to " "database:%s user:%s " @@ -4733,6 +4734,9 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, dbPool->database, dbPool->user_name, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); +======= + elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); +>>>>>>> jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) } destroy_slot(nodeidx, node, slot); return; From 59ce0b55fdf1b23f292d68acfdda130ff8ca8dea Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 30 Jun 2022 11:01:56 +0800 Subject: [PATCH 346/578] Revert "jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206)" This reverts commit 16a767922044eef82de654e833469729dc9f019b. --- src/backend/pgxc/pool/poolmgr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index f33b95f0..b9c3bd8c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -4725,7 +4725,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { -<<<<<<< HEAD elog(LOG, POOL_MGR_PREFIX"release_connection connection to " "database:%s user:%s " @@ -4734,9 +4733,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, dbPool->database, dbPool->user_name, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); -======= - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); ->>>>>>> jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) } destroy_slot(nodeidx, node, slot); return; From 467a3da532eb1462a914025cd3acab576b991137 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 1 Apr 2021 15:44:40 +0800 Subject: [PATCH 347/578] Skip early ExecFinishNode if limit have been pushed down Upper level limit node could skip early ExecFinishNode to save 2~3ms of meaningless communication, since we've already push down the limit. 
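Sketch of the mechanism for reference (illustrative only, not part of the patch; the field name skipEarlyFinish and both call sites are taken from the diff below, everything else is simplified):

    /*
     * Planner side (grouping_planner): when the LIMIT has already been
     * planted below the RemoteSubplan, mark the upper Limit node so it
     * can skip the early-finish round trip to the datanodes.
     */
    path = (Path *) create_limit_path(root, final_rel, path,
                                      parse->limitOffset, parse->limitCount,
                                      offset_est, count_est,
                                      pushDown /* skipEarlyFinish */);

    /*
     * Executor side (ExecLimit): only broadcast the early finish when the
     * LIMIT was not pushed down; otherwise the datanodes stop producing
     * rows on their own and the extra round trip is wasted.
     */
    if (!((Limit *) node->ps.plan)->skipEarlyFinish)
        ExecFinishNode(pstate);
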
--- src/backend/executor/execProcnode.c | 3 +++ src/backend/executor/nodeLimit.c | 1 + src/backend/nodes/copyfuncs.c | 4 +++- src/backend/nodes/outfuncs.c | 6 ++++++ src/backend/nodes/readfuncs.c | 4 +++- src/backend/optimizer/plan/createplan.c | 15 +++++++++++---- src/backend/optimizer/plan/planner.c | 16 ++++++++++++++-- src/backend/optimizer/util/pathnode.c | 4 +++- src/include/nodes/plannodes.h | 3 +++ src/include/nodes/relation.h | 3 +++ src/include/optimizer/pathnode.h | 3 ++- src/include/optimizer/planmain.h | 2 +- 12 files changed, 53 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index cec4400d..d9ae1aa9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -1075,6 +1075,9 @@ ExecFinishNode(PlanState *node) return; } break; + case T_LimitState: + elog(LOG, "[LIMITSTATE]"); + break; default: break; } diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c index 10e90910..9849ff29 100644 --- a/src/backend/executor/nodeLimit.c +++ b/src/backend/executor/nodeLimit.c @@ -142,6 +142,7 @@ ExecLimit(PlanState *pstate) elog(LOG, "ExecLimit: pid %d nodeLimit finishing", MyProcPid); } + if (!((Limit *)node->ps.plan)->skipEarlyFinish) ExecFinishNode(pstate); if (g_DataPumpDebug) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 6e6e562f..9ccd69bd 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1197,7 +1197,9 @@ _copyLimit(const Limit *from) */ COPY_NODE_FIELD(limitOffset); COPY_NODE_FIELD(limitCount); - +#ifdef __TBASE__ + COPY_SCALAR_FIELD(skipEarlyFinish); +#endif return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index ac7ea190..8266ad33 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1679,6 +1679,9 @@ _outLimit(StringInfo str, const Limit *node) WRITE_NODE_FIELD(limitOffset); WRITE_NODE_FIELD(limitCount); +#ifdef __TBASE__ + WRITE_BOOL_FIELD(skipEarlyFinish); +#endif } #ifdef XCP @@ -3454,6 +3457,9 @@ _outLimitPath(StringInfo str, const LimitPath *node) WRITE_NODE_FIELD(subpath); WRITE_NODE_FIELD(limitOffset); WRITE_NODE_FIELD(limitCount); +#ifdef __TBASE__ + WRITE_BOOL_FIELD(skipEarlyFinish); +#endif } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index b13796e7..db2b9441 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3656,7 +3656,9 @@ _readLimit(void) READ_NODE_FIELD(limitOffset); READ_NODE_FIELD(limitCount); - +#ifdef __TBASE__ + READ_BOOL_FIELD(skipEarlyFinish); +#endif READ_DONE(); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 42ffb26e..367b6766 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2576,7 +2576,8 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) plan = (Plan *) make_limit(plan, subparse->limitOffset, subparse->limitCount, - 0, 1); + 0, 1, + false); /* Must apply correct cost/width data to Limit node */ plan->startup_cost = mminfo->path->startup_cost; @@ -2608,7 +2609,8 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) plan = (Plan *) make_limit(plan, subparse->limitOffset, subparse->limitCount, - 0, 1); + 0, 1, + false); plan->startup_cost = mminfo->path->startup_cost; plan->total_cost = mminfo->pathcost; @@ -3066,7 +3068,8 @@ create_limit_plan(PlannerInfo *root, LimitPath 
*best_path, int flags, plan = make_limit(subplan, best_path->limitOffset, best_path->limitCount, - offset_est, count_est); + offset_est, count_est, + best_path->skipEarlyFinish); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -8386,7 +8389,7 @@ make_lockrows(Plan *lefttree, List *rowMarks, int epqParam) */ Limit * make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est) + int64 offset_est, int64 count_est, bool skipEarlyFinish) { Limit *node = makeNode(Limit); Plan *plan = &node->plan; @@ -8399,6 +8402,10 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, node->limitOffset = limitOffset; node->limitCount = limitCount; +#ifdef __TBASE__ + node->skipEarlyFinish = skipEarlyFinish; +#endif + return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 62a92b5f..de1c8ab4 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2361,6 +2361,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, */ if (limit_needed(parse)) { + bool pushDown = false; + /* If needed, add a LimitPath on top of a RemoteSubplan. */ if (path->distribution) { @@ -2392,7 +2394,16 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = (Path *) create_limit_path(root, final_rel, path, NULL, limitCount, /* LIMIT + OFFSET */ - 0, offset_est + count_est); + 0, offset_est + count_est, + false); +#ifdef __TBASE__ + /* + * Upper level limit node could skip early ExecFinishNode to save + * 2~3ms of meaningless communication, since we've already push + * down the limit. + */ + pushDown = true; +#endif } path = create_remotesubplan_path(root, path, NULL); @@ -2401,7 +2412,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = (Path *) create_limit_path(root, final_rel, path, parse->limitOffset, parse->limitCount, - offset_est, count_est); + offset_est, count_est, + pushDown); } /* diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index cc19120e..032253ed 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6843,7 +6843,8 @@ LimitPath * create_limit_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est) + int64 offset_est, int64 count_est, + bool skipEarlyFinish) {// #lizard forgives LimitPath *pathnode = makeNode(LimitPath); @@ -6864,6 +6865,7 @@ create_limit_path(PlannerInfo *root, RelOptInfo *rel, pathnode->subpath = subpath; pathnode->limitOffset = limitOffset; pathnode->limitCount = limitCount; + pathnode->skipEarlyFinish = skipEarlyFinish; pathnode->path.distribution = copyObject(subpath->distribution); diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index c20a6741..03457b86 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -988,6 +988,9 @@ typedef struct Limit Plan plan; Node *limitOffset; /* OFFSET parameter, or NULL if none */ Node *limitCount; /* COUNT parameter, or NULL if none */ +#ifdef __TBASE__ + bool skipEarlyFinish; /* Early ExecFinishNode ? 
*/ +#endif } Limit; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 4b752d16..689a392f 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1851,6 +1851,9 @@ typedef struct LimitPath Path *subpath; /* path representing input source */ Node *limitOffset; /* OFFSET parameter, or NULL if none */ Node *limitCount; /* COUNT parameter, or NULL if none */ +#ifdef __TBASE__ + bool skipEarlyFinish; /* Early ExecFinishNode ? */ +#endif } LimitPath; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index a3afb1a4..505cb463 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -308,7 +308,8 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, extern LimitPath *create_limit_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est); + int64 offset_est, int64 count_est, + bool earlyFinish); extern Path *reparameterize_path(PlannerInfo *root, Path *path, Relids required_outer, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 4e32be80..dc4ca53e 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -140,7 +140,7 @@ extern Agg *make_agg(List *tlist, List *qual, List *groupingSets, List *chain, double dNumGroups, Plan *lefttree); extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est); + int64 offset_est, int64 count_est, bool skipEarlyFinish); extern RemoteSubplan *make_remotesubplan(PlannerInfo *root, Plan *lefttree, Distribution *resultDistribution, From 317d14b57e445319c250a7141786e011a8b0a773 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 31 Mar 2021 14:56:53 +0800 Subject: [PATCH 348/578] Remove hasAggs/having/sort/limit restrictions to support more subquery FQS We will walk through all these expressions to check exec_nodes in RTE_SUBQUERY, so these query sub cluases are safe to FQS. --- src/backend/optimizer/util/pgxcship.c | 7 +------ src/test/regress/expected/xc_FQS_2.out | 20 ++++++++++++++++++++ src/test/regress/sql/xc_FQS.sql | 2 ++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7bff63dd..c07af9a4 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1828,16 +1828,11 @@ pgxc_is_simple_subquery(Query *query) * Can't pushdown a subquery involving grouping, aggregation, SRFs, * sorting, limiting, or WITH. 
*/ - if (query->hasAggs || - query->hasWindowFuncs || + if (query->hasWindowFuncs || query->hasTargetSRFs || query->groupClause || query->groupingSets || - query->havingQual || - query->sortClause || query->distinctClause || - query->limitOffset || - query->limitCount || query->hasForUpdate || query->cteList) return false; diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 9b35d802..7f9570b4 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1704,6 +1704,26 @@ select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from sub 1 | sz | 2 (3 rows) +explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Subquery Scan on "__Alias_22__" (cost=21.02..21.04 rows=1 width=40) + Filter: ("__Alias_22__".c = 2) + -> Limit (cost=21.02..21.02 rows=1 width=40) + -> Sort (cost=21.02..21.03 rows=4 width=40) + Sort Key: subquery_fqs.c + -> Seq Scan on subquery_fqs (cost=0.00..21.00 rows=4 width=40) + Filter: (id = 1) +(9 rows) + +select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + id | a | c +----+----+--- + 1 | gd | 2 +(1 row) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index 14721a76..bc99b709 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -290,6 +290,8 @@ explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; drop table tab1_rr; drop table tab1_hash; From 7ce1beece3461b1373ad692c552cc2fc9aa9f1b8 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 1 Apr 2021 16:53:40 +0800 Subject: [PATCH 349/578] Stable function can be pushed down. 
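For reference, the volatility rule this change relies on, as an illustrative sketch (not part of the patch; equivalent to the one-line change to pgxc_is_func_shippable() in the diff below):

    /*
     * A function may be evaluated on the datanodes unless it is volatile.
     * Immutable and stable functions return the same result for the same
     * arguments within a single statement, so shipping them is safe;
     * volatile functions (e.g. random()) must stay on the coordinator.
     */
    switch (func_volatile(funcid))
    {
        case PROVOLATILE_IMMUTABLE:
        case PROVOLATILE_STABLE:
            result = true;
            break;
        case PROVOLATILE_VOLATILE:
        default:
            result = false;
            break;
    }
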
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086322927&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/optimizer/util/pgxcship.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index c07af9a4..ec2f0504 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -2221,7 +2221,7 @@ pgxc_is_func_shippable(Oid funcid) default: { - result = (func_volatile(funcid) == PROVOLATILE_IMMUTABLE); + result = (func_volatile(funcid) != PROVOLATILE_VOLATILE); break; } } From 0cfc0c39aab376c593d50eec77677fd7e86d60c6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 2 Apr 2021 10:44:38 +0800 Subject: [PATCH 350/578] fix coverity code --- src/gtm/client/fe-connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index c5a0842e..080af583 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -1465,7 +1465,8 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, /* check sock */ if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) { - return false; + /* No need to set */ + return true; } if (info.tcpi_state != TCP_ESTABLISHED) From c7c05327ffb7236baab7420e465544e28a10b398 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 2 Apr 2021 19:03:48 +0800 Subject: [PATCH 351/578] for gtm log rotation http://tapd.oa.com/pgxz/prong/stories/view/1010092131863477653 --- src/gtm/common/Makefile | 2 +- src/gtm/common/elog.c | 87 +++- src/gtm/common/gtm_opt_handler.c | 4 + src/gtm/common/syslogger.c | 676 +++++++++++++++++++++++++++++++ src/gtm/main/gtm_opt.c | 133 ++++-- src/gtm/main/main.c | 340 +++++++++++++++- src/include/gtm/elog.h | 4 + src/include/gtm/gtm.h | 211 +++++----- src/include/gtm/gtm_c.h | 2 + src/include/gtm/gtm_opt.h | 4 + src/include/gtm/gtm_time.h | 31 +- src/include/gtm/syslogger.h | 98 +++++ 12 files changed, 1420 insertions(+), 172 deletions(-) create mode 100644 src/gtm/common/syslogger.c create mode 100644 src/include/gtm/syslogger.h diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile index 43d80dad..47052e32 100644 --- a/src/gtm/common/Makefile +++ b/src/gtm/common/Makefile @@ -23,7 +23,7 @@ LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq LIBS=-lpthread -lrt OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \ - gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o + gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o syslogger.o all:all-lib diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 833b9e25..a6c6c66d 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -28,6 +28,7 @@ #include "gtm/gtm_ext.h" #include "gtm/libpq.h" #include "gtm/pqformat.h" +#include "gtm/syslogger.h" #undef _ #define _(x) x @@ -42,6 +43,8 @@ char *GTMLogFile = NULL; /* GUC parameters */ int Log_destination = LOG_DESTINATION_STDERR; +int exit_flag = GTM_DEFAULT_EXIT_FLAG; + /* Macro for checking errordata_stack_depth is reasonable */ #define CHECK_STACK_DEPTH() \ do { \ @@ -321,15 +324,15 @@ errfinish(int dummy,...) 
} /* Emit the message to the right places */ - { GTM_ThreadInfo *thrinfo = GetMyThreadInfo; if(thrinfo->thr_conn) { EmitErrorReport(thrinfo->thr_conn->con_port); } - - } + else + { EmitErrorReport(NULL); + } /* Now free up subsidiary data attached to stack entry, and release it */ if (edata->message) @@ -745,6 +748,76 @@ pg_re_throw(void) } +/* + * Send data to the syslogger using the chunked protocol + * + * Note: when there are multiple backends writing into the syslogger pipe, + * it's critical that each write go into the pipe indivisibly, and not + * get interleaved with data from other processes. Fortunately, the POSIX + * spec requires that writes to pipes be atomic so long as they are not + * more than PIPE_BUF bytes long. So we divide long messages into chunks + * that are no more than that length, and send one chunk per write() call. + * The collector process knows how to reassemble the chunks. + * + * Because of the atomic write requirement, there are only two possible + * results from write() here: -1 for failure, or the requested number of + * bytes. There is not really anything we can do about a failure; retry would + * probably be an infinite loop, and we can't even report the error usefully. + * (There is noplace else we could send it!) So we might as well just ignore + * the result from write(). However, on some platforms you get a compiler + * warning from ignoring write()'s result, so do a little dance with casting + * rc to void to shut up the compiler. + */ +static void +write_pipe_chunks(char *data, int len, int dest) +{ + PipeProtoChunk p; + int fd = fileno(stderr); + int rc; + + Assert(len > 0); + + p.proto.nuls[0] = p.proto.nuls[1] = '\0'; + p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) MyThreadID : 0; + + /* write all but the last chunk */ + while (len > PIPE_MAX_PAYLOAD) + { + p.proto.is_last = (dest == LOG_DESTINATION_CSVLOG ? 'F' : 'f'); + p.proto.len = PIPE_MAX_PAYLOAD; + memcpy(p.proto.data, data, PIPE_MAX_PAYLOAD); + + rc = write(fd, &p, PIPE_HEADER_SIZE + PIPE_MAX_PAYLOAD); + +#ifdef __TBASE__ + /* if we are interruppted, just return */ + if (EINTR == errno && rc < 0) + { + return; + } +#endif + (void) rc; + data += PIPE_MAX_PAYLOAD; + len -= PIPE_MAX_PAYLOAD; + } + + /* write the last chunk */ + p.proto.is_last = (dest == LOG_DESTINATION_CSVLOG ? 
'T' : 't'); + p.proto.len = len; + memcpy(p.proto.data, data, len); + + rc = write(fd, &p, PIPE_HEADER_SIZE + len); +#ifdef __TBASE__ + /* if we are interruppted, just return */ + if (EINTR == errno && rc < 0) + { + return; + } +#endif + (void) rc; +} + + /* * Initialization of error output file */ @@ -857,9 +930,11 @@ send_message_to_server_log(ErrorData *edata) edata->filename, edata->lineno); } - /* Write to stderr, if enabled */ - if (Log_destination & LOG_DESTINATION_STDERR) - write(fileno(stderr), buf.data, buf.len); + /* If in the syslogger thread, try to write messages direct to file */ + if (GetMyThreadInfo->am_syslogger) + write_syslogger_file(buf.data, buf.len, LOG_DESTINATION_STDERR); + else + write_pipe_chunks(buf.data, buf.len, LOG_DESTINATION_STDERR); if (errlog_collection_func && (buf.len > 0) && ('\0' != buf.data[0])) (*errlog_collection_func) (edata, &buf); diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c index bb0344a5..569db18d 100644 --- a/src/gtm/common/gtm_opt_handler.c +++ b/src/gtm/common/gtm_opt_handler.c @@ -47,6 +47,7 @@ static int gtm_opt_var_compare(const void *a, const void *b); static void InitializeOneGTMOption(struct config_generic * gconf); static void ReportGTMOption(struct config_generic * record); static char *_ShowOption(struct config_generic * record, bool use_units); +extern void GTM_SendNotifyByte(); /* * Variables to bel fed by specific option definition: gtm_opt.c and gtm_proxy_opt.c @@ -350,6 +351,9 @@ ProcessConfigFile(GtmOptContext context) FreeConfigVariables(head); if (cvc) free(cvc); + + /* notify the syslogger */ + GTM_SendNotifyByte(); return true; cleanup_list: diff --git a/src/gtm/common/syslogger.c b/src/gtm/common/syslogger.c new file mode 100644 index 00000000..844d6218 --- /dev/null +++ b/src/gtm/common/syslogger.c @@ -0,0 +1,676 @@ +/*------------------------------------------------------------------------- + * + * syslogger.c + * + * The system logger (syslogger) catches all + * stderr output from the gtm thread by redirecting to a pipe, and + * writes it to a set of logfiles. It's possible to have size and + * age limits for the logfile configured in gtm.conf. If these limits + * are reached or passed, the current logfile is closed and a new one + * is created (rotated) The logfiles are stored in a subdirectory gtm_log. + * + * Copyright (c) 2021-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/common/syslogger.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/stringinfo.h" +#include "gtm/gtm_list.h" +#include "gtm/palloc.h" +#include "gtm/syslogger.h" +#include "gtm/gtm_time.h" +#include "gtm/elog.h" + +/* + * GUC parameters. Logging_collector cannot be changed after postmaster + * start, but the rest can change at SIGHUP. 
+ */ +bool Logging_collector = false; +int Log_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int Log_RotationSize = 10 * 1024; +char *Log_directory = NULL; +char *Log_filename = "gtm-%Y-%m-%d_%H%M%S.log"; +bool Log_truncate_on_rotation = false; +int Log_file_mode = S_IRUSR | S_IWUSR; + +/* + * Private state + */ +pg_time_t next_rotation_time; +bool rotation_disabled = false; +FILE *gtmlogFile = NULL; +pg_time_t first_syslogger_file_time = 0; +static char *last_file_name = NULL; +bool rotation_requested = false; + +/* + * Buffers for saving partial messages from different backends. + * + * Keep NBUFFER_LISTS lists of these, with the entry for a given source pid + * being in the list numbered (pid % NBUFFER_LISTS), so as to cut down on + * the number of entries we have to examine for any one incoming message. + * There must never be more than one entry for the same source pid. + * + * An inactive buffer is not removed from its list, just held for re-use. + * An inactive buffer has pid == 0 and undefined contents of data. + */ +typedef struct +{ + int32 pid; /* PID of source process */ + StringInfoData data; /* accumulated data, as a StringInfo */ +} save_buffer; + +#define NBUFFER_LISTS 256 +static gtm_List *buffer_lists[NBUFFER_LISTS]; + + +int syslogPipe[2] = {-1, -1}; +int signalPipe[2] = {-1, -1}; + + +void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static FILE *logfile_open(const char *filename, const char *mode, + bool allow_errors); + +void logfile_rotate(bool time_based_rotation, int size_rotation_for); +static char *logfile_getname(pg_time_t timestamp, const char *suffix); + + +/* -------------------------------- + * pipe protocol handling + * -------------------------------- + */ + +/* + * Process data received through the syslogger pipe. + * + * This routine interprets the log pipe protocol which sends log messages as + * (hopefully atomic) chunks - such chunks are detected and reassembled here. + * + * The protocol has a header that starts with two nul bytes, then has a 16 bit + * length, the pid of the sending process, and a flag to indicate if it is + * the last chunk in a message. Incomplete chunks are saved until we read some + * more, and non-final chunks are accumulated until we get the final chunk. + * + * All of this is to avoid 2 problems: + * . partial messages being written to logfiles (messes rotation), and + * . messages from different backends being interleaved (messages garbled). + * + * Any non-protocol messages are written out directly. These should only come + * from non-PostgreSQL sources, however (e.g. third party libraries writing to + * stderr). + * + * logbuffer is the data input buffer, and *bytes_in_logbuffer is the number + * of bytes present. On exit, any not-yet-eaten data is left-justified in + * logbuffer, and *bytes_in_logbuffer is updated. + */ +void +process_pipe_input(char *logbuffer, int *bytes_in_logbuffer, bool* pipe_eof_seen) +{ + char *cursor = logbuffer; + int count = *bytes_in_logbuffer; + int dest = LOG_DESTINATION_STDERR; + + /* While we have enough for a header, process data... */ + while (count >= (int) (offsetof(PipeProtoHeader, data) + 1)) + { + PipeProtoHeader p; + int chunklen; + + /* Do we have a valid header? 
*/ + memcpy(&p, cursor, offsetof(PipeProtoHeader, data)); + if (p.nuls[0] == '\0' && p.nuls[1] == '\0' && + p.len > 0 && p.len <= PIPE_MAX_PAYLOAD && + (p.is_last == 't' || p.is_last == 'f' || + p.is_last == 'T' || p.is_last == 'F')) + { + gtm_List *buffer_list; + gtm_ListCell *cell; + save_buffer *existing_slot = NULL, + *free_slot = NULL; + StringInfo str; + + chunklen = PIPE_HEADER_SIZE + p.len; + + if (p.pid == 0) + { + *pipe_eof_seen = true; + } + + /* Fall out of loop if we don't have the whole chunk yet */ + if (count < chunklen) + break; + + dest = (p.is_last == 'T' || p.is_last == 'F') ? + LOG_DESTINATION_CSVLOG : LOG_DESTINATION_STDERR; + + /* Locate any existing buffer for this source pid */ + buffer_list = buffer_lists[p.pid % NBUFFER_LISTS]; + gtm_foreach(cell, buffer_list) + { + save_buffer *buf = (save_buffer *) gtm_lfirst(cell); + + if (buf->pid == p.pid) + { + existing_slot = buf; + break; + } + if (buf->pid == 0 && free_slot == NULL) + free_slot = buf; + } + + if (p.is_last == 'f' || p.is_last == 'F') + { + /* + * Save a complete non-final chunk in a per-pid buffer + */ + if (existing_slot != NULL) + { + /* Add chunk to data from preceding chunks */ + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + else + { + /* First chunk of message, save in a new buffer */ + if (free_slot == NULL) + { + /* + * Need a free slot, but there isn't one in the list, + * so create a new one and extend the list with it. + */ + free_slot = palloc(sizeof(save_buffer)); + buffer_list = gtm_lappend(buffer_list, free_slot); + buffer_lists[p.pid % NBUFFER_LISTS] = buffer_list; + } + free_slot->pid = p.pid; + str = &(free_slot->data); + initStringInfo(str); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + } + else + { + /* + * Final chunk --- add it to anything saved for that pid, and + * either way write the whole thing out. + */ + if (existing_slot != NULL) + { + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + write_syslogger_file(str->data, str->len, dest); + /* Mark the buffer unused, and reclaim string storage */ + existing_slot->pid = 0; + pfree(str->data); + } + else + { + /* The whole message was one chunk, evidently. */ + write_syslogger_file(cursor + PIPE_HEADER_SIZE, p.len, + dest); + } + } + + /* Finished processing this chunk */ + cursor += chunklen; + count -= chunklen; + } + else + { + /* Process non-protocol data */ + + /* + * Look for the start of a protocol header. If found, dump data + * up to there and repeat the loop. Otherwise, dump it all and + * fall out of the loop. (Note: we want to dump it all if at all + * possible, so as to avoid dividing non-protocol messages across + * logfiles. We expect that in many scenarios, a non-protocol + * message will arrive all in one read(), and we want to respect + * the read() boundary if possible.) 
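 *
 * For instance, if a third-party library wrote the five bytes "oops\n"
 * to stderr and a protocol chunk follows them in the buffer, the scan
 * below stops at the first zero byte (the start of that chunk's header),
 * those five bytes are written as-is to the current stderr logfile, and
 * header parsing resumes at the chunk boundary.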
+ */ + for (chunklen = 1; chunklen < count; chunklen++) + { + if (cursor[chunklen] == '\0') + break; + } + /* fall back on the stderr log as the destination */ + write_syslogger_file(cursor, chunklen, LOG_DESTINATION_STDERR); + cursor += chunklen; + count -= chunklen; + } + } + + /* We don't have a full chunk, so left-align what remains in the buffer */ + if (count > 0 && cursor != logbuffer) + memmove(logbuffer, cursor, count); + *bytes_in_logbuffer = count; +} + +/* + * Force out any buffered data + * + * This is currently used only at syslogger shutdown, but could perhaps be + * useful at other times, so it is careful to leave things in a clean state. + */ +void +flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + int i; + + /* Dump any incomplete protocol messages */ + for (i = 0; i < NBUFFER_LISTS; i++) + { + gtm_List *list = buffer_lists[i]; + gtm_ListCell *cell; + + gtm_foreach(cell, list) + { + save_buffer *buf = (save_buffer *) gtm_lfirst(cell); + + if (buf->pid != 0) + { + StringInfo str = &(buf->data); + + write_syslogger_file(str->data, str->len, LOG_DESTINATION_STDERR); + /* Mark the buffer unused, and reclaim string storage */ + buf->pid = 0; + pfree(str->data); + } + } + } + + /* + * Force out any remaining pipe data as-is; we don't bother trying to + * remove any protocol headers that may exist in it. + */ + if (*bytes_in_logbuffer > 0) + write_syslogger_file(logbuffer, *bytes_in_logbuffer, LOG_DESTINATION_STDERR); + *bytes_in_logbuffer = 0; +} + + +/* -------------------------------- + * logfile routines + * -------------------------------- + */ + +/* + * Write text to the currently open logfile + * + * This is exported so that elog.c can call it when am_syslogger is true. + * This allows the syslogger process to record elog messages of its own, + * even though its stderr does not point at the syslog pipe. + */ +void +write_syslogger_file(const char *buffer, int count, int destination) +{ + int rc; + + if (destination != LOG_DESTINATION_STDERR) + { + return; + } + + if (gtmlogFile == NULL) + { + write(fileno(stderr), buffer, count); + return; + } + + rc = fwrite(buffer, 1, count, gtmlogFile); + + /* can't use ereport here because of possible recursion */ + if (rc != count) + write_stderr("could not write to log file: %s\n", strerror(errno)); +} + +/* + * Open a new logfile with proper permissions and buffering options. + * + * If allow_errors is true, we just log any open failure and return NULL + * (with errno still correct for the fopen failure). + * Otherwise, errors are treated as fatal. + */ +static FILE * +logfile_open(const char *filename, const char *mode, bool allow_errors) +{ + FILE *fh; + mode_t oumask; + + /* + * Note we do not let Log_file_mode disable IWUSR, since we certainly want + * to be able to write the files ourselves. + */ + oumask = umask((mode_t) ((~(Log_file_mode | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + int save_errno = errno; + + ereport(allow_errors ? 
LOG : FATAL, + (errmsg("could not open log file \"%s\": %m", + filename))); + errno = save_errno; + } + + return fh; +} + +/* + * perform logfile rotation + */ +void +logfile_rotate(bool time_based_rotation, int size_rotation_for) +{ + char *filename; + pg_time_t fntime; + FILE *fh; + + rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. + */ + if (time_based_rotation) + fntime = next_rotation_time; + else + fntime = time(NULL); + filename = logfile_getname(fntime, NULL); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * Log_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + * + * Note: last_file_name should never be NULL here, but if it is, append. + */ + if (time_based_rotation || (size_rotation_for & LOG_DESTINATION_STDERR)) + { + if (Log_truncate_on_rotation && time_based_rotation && + last_file_name != NULL && + strcmp(filename, last_file_name) != 0) + fh = logfile_open(filename, "w", true); + else + fh = logfile_open(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with Log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation (use SIGHUP to re-enable)"))); + rotation_disabled = true; + } + + if (filename) + pfree(filename); + return; + } + + fclose(gtmlogFile); + gtmlogFile = fh; + + /* instead of pfree'ing filename, remember it for next time */ + if (last_file_name != NULL) + pfree(last_file_name); + last_file_name = filename; + filename = NULL; + } + + if (filename) + pfree(filename); + + set_next_rotation_time(); +} + + +/* + * construct logfile name using timestamp information + * + * If suffix isn't NULL, append it to the name, replacing any ".log" + * that may be in the pattern. + * + * Result is palloc'd.postgresql-%Y-%m-%d_%H%M%S.log + */ +static char * +logfile_getname(pg_time_t timestamp, const char *suffix) +{ + char *filename; + int len; + time_t stamp_time; + struct tm timeinfo; + filename = palloc(MAXPGPATH); + + snprintf(filename, MAXPGPATH, "%s/", Log_directory); + + len = strlen(filename); + + stamp_time = time(NULL); + localtime_r(&stamp_time,&timeinfo); + /* treat Log_filename as a strftime pattern */ + strftime(filename + len, MAXPGPATH - len, Log_filename, + &timeinfo); + + if (suffix != NULL) + { + len = strlen(filename); + if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0)) + len -= 4; + strlcpy(filename + len, suffix, MAXPGPATH - len); + } + + return filename; +} + +/* + * Determine the next planned rotation time, and store in next_rotation_time. + */ +void +set_next_rotation_time(void) +{ + pg_time_t now; + struct tm timeinfo; + int rotinterval; + + /* nothing to do if time-based rotation is disabled */ + if (Log_RotationAge <= 0) + return; + + /* + * The requirements here are to choose the next time > now that is a + * "multiple" of the log rotation interval. "Multiple" can be interpreted + * fairly loosely. In this version we align to log_timezone rather than + * GMT. 
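 *
 * A worked example (numbers are illustrative only): with
 * Log_RotationAge = 60 and a local UTC offset of +08:00, a "now" of
 * 02:17 local time is shifted into local seconds, rounded down to
 * 02:00, advanced one interval to 03:00, and shifted back, so the next
 * rotation always lands on the next local hour boundary regardless of
 * when the previous rotation actually happened.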
+ */ + rotinterval = Log_RotationAge * SECS_PER_MINUTE; /* convert to seconds */ + now = (pg_time_t) time(NULL); + localtime_r(&now,&timeinfo); + now += timeinfo.tm_gmtoff; + now -= now % rotinterval; + now += rotinterval; + now -= timeinfo.tm_gmtoff; + next_rotation_time = now; +} + +/* + * Initialization of error output file + */ +void +GTM_LogFileInit(void) +{ + char *filename; + + /* + * Create log directory if not present; ignore errors + */ + mkdir(Log_directory, S_IRWXU); + + first_syslogger_file_time = time(NULL); + filename = logfile_getname(first_syslogger_file_time, NULL); + + gtmlogFile = logfile_open(filename, "a", false); + + pfree(filename); +} + +/* + * Send one byte to the signal pipe, to wake up syslogger + */ +void +GTM_SendNotifyByte(void) +{ + int rc; + char dummy = 0; + + if (signalPipe[1] == -1) + { + return; + } + +retry: + rc = write(signalPipe[1], &dummy, 1); + if (rc < 0) + { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + goto retry; + + /* + * If the pipe is full, we don't need to retry, the data that's there + * already is enough to wake up WaitLatch. + */ + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + + /* + * Oops, the write() failed for some other reason. We might be in a + * signal handler, so it's not safe to elog(). We have no choice but + * silently ignore the error. + */ + return; + } +} + +/* + * Read all available data from the signal pipe + */ +void +GTM_drainNotifyBytes(void) +{ + /* + * There shouldn't normally be more than one byte in the pipe, or maybe a + * few bytes if multiple processes run SetLatch at the same instant. + */ + char buf[16]; + int rc; + + if (signalPipe[0] == -1) + { + return; + } + + for (;;) + { + rc = read(signalPipe[0], buf, sizeof(buf)); + if (rc < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; /* the pipe is empty */ + else if (errno == EINTR) + continue; /* retry */ + else + { + elog(LOG, "read() on signalPipe failed: %m"); + break; + } + } + else if (rc == 0) + { + elog(LOG, "unexpected EOF on signalPipe"); + break; + } + else if (rc < sizeof(buf)) + { + /* we successfully drained the pipe; no need to read() again */ + break; + } + /* else buffer wasn't big enough, so read again */ + } +} + +int +GTM_InitSysloggerEpoll(void) +{ + int efd = -1; + struct epoll_event event; + + if (syslogPipe[0] == -1 || signalPipe[0] == -1) + { + return -1; + } + + efd = epoll_create1(0); + if(efd == -1) + { + elog(LOG, "failed to create epoll"); + return -1; + } + + event.data.fd = syslogPipe[0]; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (efd, EPOLL_CTL_ADD, syslogPipe[0], &event)) + { + elog(LOG, "failed to add socket to epoll"); + return -1; + } + + event.data.fd = signalPipe[0]; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (efd, EPOLL_CTL_ADD, signalPipe[0], &event)) + { + elog(LOG, "failed to add socket to epoll"); + return -1; + } + + return efd; +} diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c index f1a9418d..4029cc8f 100644 --- a/src/gtm/main/gtm_opt.c +++ b/src/gtm/main/gtm_opt.c @@ -30,7 +30,7 @@ #include "gtm/gtm_opt_tables.h" #include "gtm/gtm_opt.h" #include "gtm/gtm_standby.h" - +#include "gtm/gtm_time.h" #define CONFIG_FILENAME "gtm.conf" const char *config_filename = CONFIG_FILENAME; @@ -81,6 +81,11 @@ extern char* unix_socket_directory; extern char* unix_socket_group; extern int unix_socket_permissions; +extern char *Log_filename; +extern int Log_RotationAge; +extern int 
Log_RotationSize; +extern bool Log_truncate_on_rotation; + /* * We have different sets for client and server message level options because * they sort slightly different (see "log" level) @@ -212,15 +217,25 @@ struct config_bool ConfigureNamesBool[] = }, #endif { - {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, - gettext_noop("Nodes connected with gtm will be readonly."), - gettext_noop("Default value is off."), - 0 - }, - >MClusterReadOnly, - false, NULL, NULL, false, NULL + {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, + gettext_noop("Nodes connected with gtm will be readonly."), + gettext_noop("Default value is off."), + 0 + }, + >MClusterReadOnly, + false, NULL, NULL, false, NULL }, + { + {GTM_OPTNAME_LOG_TRUNCATE_ON_ROTATION, GTMC_SIGHUP, + gettext_noop("Truncate existing log files of same name during log rotation."), + gettext_noop("Default value is off."), + 0 + }, + &Log_truncate_on_rotation, + false, NULL, NULL, false, NULL + }, + /* End-of-list marker */ { {NULL, 0, NULL, NULL, 0}, NULL, false, NULL, NULL, false, NULL @@ -360,44 +375,68 @@ struct config_int ConfigureNamesInt[] = }, #endif { - { - GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, - gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), - NULL, - 0 - }, - >MGTSFreezeLimit, - 365 * 100, 0, INT_MAX, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, + gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), + NULL, + 0 + }, + >MGTSFreezeLimit, + 365 * 100, 0, INT_MAX, NULL, NULL, + 0, NULL }, { - { - GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, - gettext_noop("Add -d seconds to GTS when started"), - NULL, - 0 - }, - >MStartupGTSDelta, - 300 , 0, INT_MAX, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, + gettext_noop("Add -d seconds to GTS when started"), + NULL, + 0 + }, + >MStartupGTSDelta, + 300 , 0, INT_MAX, NULL, NULL, + 0, NULL }, { - { - GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, - gettext_noop("Sets the access permissions of the Unix-domain socket." - "Unix-domain sockets use the usual Unix file system " - "permission set. The parameter value is expected " - "to be a numeric mode specification in the form " - "accepted by the chmod and umask system calls. " - "(To use the customary octal format the number must " - "start with a 0 (zero).)"), - NULL, - 0 - }, - &unix_socket_permissions, - 0777, 0000, 0777, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, + gettext_noop("Sets the access permissions of the Unix-domain socket." + "Unix-domain sockets use the usual Unix file system " + "permission set. The parameter value is expected " + "to be a numeric mode specification in the form " + "accepted by the chmod and umask system calls. 
" + "(To use the customary octal format the number must " + "start with a 0 (zero).)"), + NULL, + 0 + }, + &unix_socket_permissions, + 0777, 0000, 0777, NULL, NULL, + 0, NULL + }, + + { + { + GTM_OPTNAME_LOG_ROTATION_AGE, GTMC_SIGHUP, + gettext_noop("Automatic log file rotation will occur after N minutes."), + NULL, + 0 + }, + &Log_RotationAge, + HOURS_PER_DAY * MINS_PER_HOUR, 0, INT_MAX / SECS_PER_MINUTE, NULL, NULL, + 0, NULL + }, + + { + { + GTM_OPTNAME_LOG_ROTATION_SIZE, GTMC_SIGHUP, + gettext_noop("Automatic log file rotation will occur after N kilobytes."), + NULL, + 0 + }, + &Log_RotationSize, + 10 * 1024, 0, INT_MAX / 1024, NULL, NULL, + 0, NULL }, /* End-of-list marker */ @@ -492,6 +531,18 @@ struct config_string ConfigureNamesString[] = NULL, NULL }, + { + {GTM_OPTNAME_LOG_FILENAME_PATTERN, GTMC_SIGHUP, + gettext_noop("Sets the file name pattern for log files."), + NULL, + 0 + }, + &Log_filename, + "gtm-%Y-%m-%d_%H%M%S.log", + NULL, NULL, + NULL, NULL + }, + { {GTM_OPTNAME_ERROR_REPORTER, GTMC_STARTUP, gettext_noop("Command to report various errors."), diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index ec2b2143..7d9563f1 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -59,6 +59,7 @@ #include "gtm/gtm_time.h" #include "gtm/gtm_stat.h" #include "gtm/gtm_stat_error.h" +#include "gtm/syslogger.h" #ifdef __TBASE__ #include "gtm/gtm_store.h" @@ -76,6 +77,7 @@ extern char *optarg; #define GTM_DEFAULT_PORT 6666 #define GTM_PID_FILE "gtm.pid" #define GTM_LOG_FILE "gtm.log" +#define GTM_LOG_FILE_DIR "gtm_log" #define LOOPS_UNTIL_HIBERNATE 50 #define HIBERNATE_FACTOR 25 @@ -138,6 +140,8 @@ GTM_ThreadInfo *g_timekeeper_thread = NULL; GTM_ThreadInfo *g_timebackup_thread = NULL; GTM_ThreadInfo *g_timer_thread = NULL; GTM_ThreadInfo *g_logcollector_thread = NULL; +GTM_ThreadInfo *g_syslogger_thread = NULL; + void *GTM_ThreadLogCollector(void *argp); extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); GTM_ThreadInfo *g_standby_pre_server_thread = NULL; @@ -202,6 +206,8 @@ void GTM_PortCleanup(Port *con_port); #endif void *GTM_ThreadMain(void *argp); void *GTM_ThreadTimeKeeper(void *argp); +static void *GTM_ThreadSysLogger(void *argp); +static bool GTM_SysLoggerStart(void); #ifdef __XLOG__ void *GTM_ThreadCheckPointer(void *argp); @@ -247,7 +253,7 @@ static void ProcessBarrierCommand(Port *myport, GTM_MessageType mtype, StringInf static int GTMInitConnection(GTM_ConnectionInfo *conninfo); static void SetNonBlockConnection(GTM_ConnectionInfo *conninfo); -static void gtm_standby_pre_server_loop(const char *data_dir); +static void gtm_standby_pre_server_loop(char *data_dir); #ifdef __XLOG__ static void thread_replication_clean(GTM_StandbyReplication *replication); @@ -256,6 +262,8 @@ static void WaitRedoertoExit(void); static void GTMSigHupHandler(void); #endif +void GTM_Exit(void); + /* * One-time initialization. 
It's called immediately after the main process * starts @@ -337,6 +345,8 @@ MainThreadInit() memset(thrinfo->locks_hold, 0x00, sizeof(void*) * g_max_lock_number); #endif + /* thread main is syslogger before syslogger thread create */ + thrinfo->am_syslogger = true; GTM_RWLockInit(&thrinfo->thr_lock); GTM_RWLockAcquire(&thrinfo->thr_lock, GTM_LOCKMODE_WRITE); @@ -387,16 +397,22 @@ BaseInit(char *data_dir) SpinLockInit(&g_last_sync_gts_lock); #endif + if (Log_directory == NULL) + { + Log_directory = (char *) malloc(GTM_MAX_PATH); + sprintf(Log_directory, "%s/%s", GTMDataDir, GTM_LOG_FILE_DIR); + } + if (GTMLogFile == NULL) { GTMLogFile = (char *) malloc(GTM_MAX_PATH); - sprintf(GTMLogFile, "%s/%s", GTMDataDir, GTM_LOG_FILE); + sprintf(GTMLogFile, "%s/%s", Log_directory, GTM_LOG_FILE); } /* Save Node Register File in register.c */ Recovery_SaveRegisterFileName(GTMDataDir); - DebugFileOpen(); + GTM_LogFileInit(); GTM_InitTxnManager(); GTM_InitSeqManager(); @@ -1460,6 +1476,14 @@ main(int argc, char *argv[]) process_thread_num = g_max_thread_number < process_thread_num ? g_max_thread_number : process_thread_num; } + /* start syslogger thread to handle log */ + if (!GTM_SysLoggerStart()) + { + elog(ERROR, "Failed to create syslogger thread."); + exit(1); + } + util_thread_cnt++; + /* Create GTM threads handling requests */ g_timekeeper_thread = GTM_ThreadCreate(GTM_ThreadTimeKeeper, g_max_lock_number); if (NULL == g_timekeeper_thread) @@ -1874,7 +1898,7 @@ gtm_add_connection_standby_pre_server(Port *port) * handle loop before establish a connection to active-GTM */ static void -gtm_standby_pre_server_loop(const char *data_dir) +gtm_standby_pre_server_loop(char *data_dir) { fd_set readmask; int nSockets; @@ -5912,4 +5936,312 @@ void CheckStandbyConnect(GTM_ThreadInfo *my_threadinfo, GTM_ConnectionInfo *conn } #endif + + +/* + * syslogger thread, handle log rotation and write log to logfile. + */ +static void* +GTM_ThreadSysLogger(void *argp) +{ +#define GTM_SYSLOGGER_WAIT_EVENTS 2 + GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; + sigjmp_buf local_sigjmp_buf; + pg_time_t now = 0; + int efd = -1; + int n = 0; + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; + char *currentLogFilename = NULL; + int currentLogRotationAge = 0; + bool got_SIGHUP = false; + bool pipe_eof_seen = false; + sigset_t mask; + struct epoll_event events[GTM_SYSLOGGER_WAIT_EVENTS]; + + my_threadinfo->am_syslogger = true; + + /* ignore signal */ + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, NULL); + + elog(DEBUG8, "Starting the syslogger thread"); + + bind_service_threads(); + + efd = GTM_InitSysloggerEpoll(); + if (efd == -1) + { + elog(LOG, "failed to init syslogger epoll"); + exit(1); + } + + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. 
(If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { +#ifdef __TBASE__ + RWLockCleanUp(); +#endif + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + /* remember active logfile parameters */ + currentLogFilename = pstrdup(Log_filename); + currentLogRotationAge = Log_RotationAge; + + /* set next planned rotation time */ + set_next_rotation_time(); + + for(;;) + { + bool time_based_rotation = false; + int size_rotation_for = 0; + long cur_timeout; + int i = 0; + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + + if (strcmp(Log_filename, currentLogFilename) != 0) + { + pfree(currentLogFilename); + currentLogFilename = pstrdup(Log_filename); + rotation_requested = true; + } + + /* + * If rotation time parameter changed, reset next rotation time, + * but don't immediately force a rotation. + */ + if (currentLogRotationAge != Log_RotationAge) + { + currentLogRotationAge = Log_RotationAge; + set_next_rotation_time(); + } + + /* + * If we had a rotation-disabling failure, re-enable rotation + * attempts after SIGHUP, and force one immediately. + */ + if (rotation_disabled) + { + rotation_disabled = false; + rotation_requested = true; + } + } + + if (Log_RotationAge > 0 && !rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= next_rotation_time) + rotation_requested = time_based_rotation = true; + } + + if (!rotation_requested && Log_RotationSize > 0 && !rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(gtmlogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_STDERR; + } + } + + if (rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_logfile. + */ + if (!time_based_rotation && size_rotation_for == 0) + size_rotation_for = LOG_DESTINATION_STDERR; + logfile_rotate(time_based_rotation, size_rotation_for); + } + + /* + * Calculate time till next time-based rotation, so that we don't + * sleep longer than that. We assume the value of "now" obtained + * above is still close enough. Note we can't make this calculation + * until after calling logfile_rotate(), since it will advance + * next_rotation_time. + * + * Also note that we need to beware of overflow in calculation of the + * timeout: with large settings of Log_RotationAge, next_rotation_time + * could be more than INT_MAX msec in the future. In that case we'll + * wait no more than INT_MAX msec, and try again. 
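 *
 * A small worked example: if next_rotation_time is 90 seconds away,
 * cur_timeout below becomes 90 * 1000 = 90000 msec; if it is more than
 * INT_MAX / 1000 seconds away (roughly 24 days), the wait is clamped
 * and the remaining delay is simply recomputed on the next pass through
 * the loop.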
+ */ + if (Log_RotationAge > 0 && !rotation_disabled) + { + pg_time_t delay; + + delay = next_rotation_time - now; + if (delay > 0) + { + if (delay > INT_MAX / 1000) + delay = INT_MAX / 1000; + cur_timeout = delay * 1000L; /* msec */ + } + else + cur_timeout = 0; + } + else + { + cur_timeout = -1L; + } + + /* + * Sleep until there's something to do + */ + n = epoll_wait (efd, events, GTM_SYSLOGGER_WAIT_EVENTS, cur_timeout); + for(i = 0; i < n; i++) + { + if(events[i].events & EPOLLIN) + { + if (events[i].data.fd == signalPipe[0]) + { + got_SIGHUP = true; + GTM_drainNotifyBytes(); + elog(LOG, "Configuration update message received in syslogger thread."); + } + else + { + int bytesRead; + + bytesRead = read(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer); + if (bytesRead < 0) + { + if (errno != EINTR) + ereport(LOG, + (errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer, &pipe_eof_seen); + continue; + } + else + { + /* + * Zero bytes read when select() is saying read-ready means + * EOF on the pipe: that is, there are no longer any processes + * with the pipe write end open. Therefore, the postmaster + * and all backends are shut down, and we are done. + */ + pipe_eof_seen = true; + } + } + } + } + + if (pipe_eof_seen) + { + elog(LOG, "GTM syslogger exit(%d)", exit_flag); + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + + exit(exit_flag); + } + } + + return my_threadinfo; +} + +static bool +GTM_SysLoggerStart(void) +{ + if (syslogPipe[0] < 0) + { + if (pipe(syslogPipe) < 0) + ereport(FATAL, + (errmsg("could not create pipe for syslog: %m"))); + } + + if (signalPipe[0] < 0) + { + if (pipe(signalPipe) < 0) + ereport(FATAL, + (errmsg("could not create pipe for signal: %m"))); + } + + /* Create GTM threads handling requests */ + g_syslogger_thread = GTM_ThreadCreate(GTM_ThreadSysLogger, g_max_lock_number); + if (NULL == g_syslogger_thread) + { + return false; + } + + fflush(stdout); + if (dup2(syslogPipe[1], fileno(stdout)) < 0) + ereport(FATAL, + (errmsg("could not redirect stdout: %m"))); + fflush(stderr); + if (dup2(syslogPipe[1], fileno(stderr)) < 0) + ereport(FATAL, + (errmsg("could not redirect stderr: %m"))); + /* Now we are done with the write end of the pipe. 
*/ + close(syslogPipe[1]); + syslogPipe[1] = -1; + + GetMyThreadInfo->am_syslogger = false; + atexit(GTM_Exit); + return true; +} + +/* + * let exit in syslogger + */ +void +GTM_Exit(void) +{ + if (g_syslogger_thread != NULL && !GetMyThreadInfo->am_syslogger) + { + Assert(exit_flag != GTM_DEFAULT_EXIT_FLAG); + + /* notify syslogger to do the exit */ + elog(LOG, "notify syslogger to exit(%d).", exit_flag); + sleep(-1); + } +} + #endif diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index 70387e54..ffdc99c5 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -299,6 +299,10 @@ typedef enum #define LOG_DESTINATION_EVENTLOG 4 #define LOG_DESTINATION_CSVLOG 8 +#define GTM_DEFAULT_EXIT_FLAG 1024 +extern int exit_flag; +#define exit(x) (exit)(exit_flag = (x)) + /* Other exported functions */ extern void pg_re_throw(void); extern void DebugFileOpen(void); diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index 715d91cb..714587eb 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -31,60 +31,60 @@ extern char *GTMLogFile; typedef enum GTM_ThreadStatus { - GTM_THREAD_STARTING, - GTM_THREAD_RUNNING, - GTM_THREAD_EXITING, - GTM_THREAD_BACKUP, /* Backup to standby is in progress */ - /* Must be the last */ - GTM_THREAD_INVALID + GTM_THREAD_STARTING, + GTM_THREAD_RUNNING, + GTM_THREAD_EXITING, + GTM_THREAD_BACKUP, /* Backup to standby is in progress */ + /* Must be the last */ + GTM_THREAD_INVALID } GTM_ThreadStatus; struct GTM_ConnectionInfo; #define ERRORDATA_STACK_SIZE 20 -#define GTM_MAX_CONNECTIONS_PER_THREAD 1024 -#define MAX_LOCKS_PER_THREAD 256 -#define GTM_MIN_THREADS 32 /* Provision for minimum threads */ -#define GTM_MAX_THREADS 512 /* Max threads allowed in the GTM */ +#define GTM_MAX_CONNECTIONS_PER_THREAD 1024 +#define MAX_LOCKS_PER_THREAD 256 +#define GTM_MIN_THREADS 32 /* Provision for minimum threads */ +#define GTM_MAX_THREADS 512 /* Max threads allowed in the GTM */ -#define GTM_TIMEOUT -1L -#define GTM_THREAD_FACTOR 1 +#define GTM_TIMEOUT -1L +#define GTM_THREAD_FACTOR 1 typedef int32 GTM_TimerHandle; #define INVALID_TIMER_HANDLE INVALID_STORAGE_HANDLE typedef struct GTM_ThreadInfo { - /* - * Initial few members get includes from gtm_common.h. This is to make sure - * that the GTMProxy_ThreadInfo and GTM_ThreadInfo structure can be - * typecasted to each other and these initial members can be safely - * accessed. If you need a member which should be common to both - * structures, consider adding them to GTM_COMMON_THREAD_INFO - */ - GTM_COMMON_THREAD_INFO - - GTM_ThreadStatus thr_status; - GTM_ConnectionInfo *thr_conn; + /* + * Initial few members get includes from gtm_common.h. This is to make sure + * that the GTMProxy_ThreadInfo and GTM_ThreadInfo structure can be + * typecasted to each other and these initial members can be safely + * accessed. If you need a member which should be common to both + * structures, consider adding them to GTM_COMMON_THREAD_INFO + */ + GTM_COMMON_THREAD_INFO + + GTM_ThreadStatus thr_status; + GTM_ConnectionInfo *thr_conn; #ifndef __XLOG__ - GTM_Conn *standby; + GTM_Conn *standby; #endif - GTM_RWLock thr_lock; /* Used to protect standby connection when new GTM standby registered. */ - - gtm_List *thr_cached_txninfo; - GTM_SnapshotData thr_snapshot; - - /* fields for epoll. */ - int thr_efd; - bool thr_epoll_ok; + GTM_RWLock thr_lock; /* Used to protect standby connection when new GTM standby registered. */ + + gtm_List *thr_cached_txninfo; + GTM_SnapshotData thr_snapshot; + + /* fields for epoll. 
*/ + int thr_efd; + bool thr_epoll_ok; - /* fields for lock track. */ - GTM_RWLock **locks_hold; - int max_lock_number; - int current_number; + /* fields for lock track. */ + GTM_RWLock **locks_hold; + int max_lock_number; + int current_number; - GTM_TimerHandle backup_timer_handle; + GTM_TimerHandle backup_timer_handle; int insert_lock_id; int insert_try_lock_id; @@ -92,38 +92,39 @@ typedef struct GTM_ThreadInfo #ifdef __XLOG__ XLogRegisterBuff *register_buff; time_t last_sync_gts; /* copy of g_last_sync_gts used to detect gts sync timeout */ - GTM_RWLock **write_locks_hold; + GTM_RWLock **write_locks_hold; int *write_counters; - int current_write_number; - bool xlog_inserting; + int current_write_number; + bool xlog_inserting; - XLogWaiter xlog_waiter; + XLogWaiter xlog_waiter; bool handle_standby; #endif GTM_WorkerStatistics *stat_handle; /* statistics hanndle */ DataPumpBuf *datapump_buff; /* log collection buff */ + bool am_syslogger; } GTM_ThreadInfo; typedef struct GTM_Threads { - uint32 gt_thread_count; - uint32 gt_start_thread_count; - uint32 gt_array_size; + uint32 gt_thread_count; + uint32 gt_start_thread_count; + uint32 gt_array_size; #ifndef __XLOG__ - bool gt_standby_ready; + bool gt_standby_ready; #endif - GTM_ThreadInfo **gt_threads; - uint32 gt_starting_client_id; - uint32 gt_next_client_id; - uint32 gt_next_thread; - bool gt_block_new_connection; - GTM_RWLock gt_lock; + GTM_ThreadInfo **gt_threads; + uint32 gt_starting_client_id; + uint32 gt_next_client_id; + uint32 gt_next_thread; + bool gt_block_new_connection; + GTM_RWLock gt_lock; } GTM_Threads; extern GTM_Threads *GTMThreads; typedef struct GTM_RestoreContext { - int version; + int version; } GTM_RestoreContext; int GTM_ThreadAdd(GTM_ThreadInfo *thrinfo); @@ -137,78 +138,78 @@ void GTM_DoForAllOtherThreads(void (* process_routine)(GTM_ThreadInfo *)); void GTM_SetInitialAndNextClientIdentifierAtPromote(void); GTM_ThreadInfo *GTM_ThreadCreate( - void *(* startroutine)(void *), int32 max_lock); + void *(* startroutine)(void *), int32 max_lock); GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid); #ifdef XCP extern void SaveControlInfo(void); void GTM_RestoreSeqInfo(FILE *ctlf, struct GTM_RestoreContext *context); -#define CONTROL_INTERVAL 50000 +#define CONTROL_INTERVAL 50000 #endif extern void GTM_ConnCleanup(GTM_ConnectionInfo *conn); extern void GTM_RemoveConnection(GTM_ConnectionInfo *conn); #ifdef __TBASE__ -extern bool enable_gtm_sequence_debug; -extern bool enable_gtm_debug; +extern bool enable_gtm_sequence_debug; +extern bool enable_gtm_debug; extern bool enable_sync_commit; extern int warnning_time_cost; #endif /* * pthread keys to get thread specific information */ -extern pthread_key_t threadinfo_key; -extern MemoryContext TopMostMemoryContext; -extern GTM_ThreadID TopMostThreadID; - -#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) -#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key)) -#define GetMyConnection(port) ((GTM_ConnectionInfo *)((port)->conn)) - -#define ThreadId (GetMyThreadInfo->thr_localid) -#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) -#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) -#define MessageContext (GetMyThreadInfo->thr_message_context) -#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) -#define ErrorContext (GetMyThreadInfo->thr_error_context) -#define errordata (GetMyThreadInfo->thr_error_data) -#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) -#define 
errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) -#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) - -#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) -#define MyConnection (GetMyThreadInfo->thr_conn) -#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \ - GetMyThreadInfo->thr_conn->con_port : \ - NULL) -#define MyThreadID (GetMyThreadInfo->thr_id) -#define IsMainThread() (GetMyThreadInfo->thr_id == TopMostThreadID) - -#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo) -#define GTM_HaveFreeCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo)) - -#define GTM_MAX_CACHED_TRANSINFO 0 -#define GTM_HaveEnoughCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO) +extern pthread_key_t threadinfo_key; +extern MemoryContext TopMostMemoryContext; +extern GTM_ThreadID TopMostThreadID; + +#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) +#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key)) +#define GetMyConnection(port) ((GTM_ConnectionInfo *)((port)->conn)) + +#define ThreadId (GetMyThreadInfo->thr_localid) +#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) +#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) +#define MessageContext (GetMyThreadInfo->thr_message_context) +#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) +#define ErrorContext (GetMyThreadInfo->thr_error_context) +#define errordata (GetMyThreadInfo->thr_error_data) +#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) +#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) +#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) + +#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) +#define MyConnection (GetMyThreadInfo->thr_conn) +#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \ + GetMyThreadInfo->thr_conn->con_port : \ + NULL) +#define MyThreadID (GetMyThreadInfo->thr_id) +#define IsMainThread() (GetMyThreadInfo->thr_id == TopMostThreadID) + +#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo) +#define GTM_HaveFreeCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo)) + +#define GTM_MAX_CACHED_TRANSINFO 0 +#define GTM_HaveEnoughCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO) #define START_CRIT_SECTION() (CritSectionCount++) #define END_CRIT_SECTION() \ - do { \ - Assert(CritSectionCount > 0); \ - CritSectionCount--; \ - } while(0) - - -#define GTM_CLIENT_ID_EQ(a, b) \ - ((a) == (b)) -#define GTM_CLIENT_ID_LT(a, b) \ - (((int32)((a) - (b)) < 0) ? true : false) -#define GTM_CLIENT_ID_GT(a, b) \ - (!GTM_CLIENT_ID_LT(a, b) && !GTM_CLIENT_ID_EQ(a, b)) -#define GTM_CLIENT_ID_NEXT(a) \ - ((((a) + 1) == UINT32_MAX) ? 1 : ((a) + 1)) - -#define GTM_CONTROL_FILE "gtm.control" + do { \ + Assert(CritSectionCount > 0); \ + CritSectionCount--; \ + } while(0) + + +#define GTM_CLIENT_ID_EQ(a, b) \ + ((a) == (b)) +#define GTM_CLIENT_ID_LT(a, b) \ + (((int32)((a) - (b)) < 0) ? true : false) +#define GTM_CLIENT_ID_GT(a, b) \ + (!GTM_CLIENT_ID_LT(a, b) && !GTM_CLIENT_ID_EQ(a, b)) +#define GTM_CLIENT_ID_NEXT(a) \ + ((((a) + 1) == UINT32_MAX) ? 
1 : ((a) + 1)) + +#define GTM_CONTROL_FILE "gtm.control" #define GTM_CONTROL_FILE_TMP "gtm.control.tmp" -#define GTM_CONTROL_VERSION 20180716 +#define GTM_CONTROL_VERSION 20180716 #endif diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b5257f1a..ad0be27e 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -385,6 +385,8 @@ typedef enum GTMStorageStatus_status_butty }GTMStorageCheckStatus; +typedef int64 pg_time_t; + /* * Add delta 100s by assuming 10 timstamp/us * at which rate the GTM can provide 1000w/s throughput. diff --git a/src/include/gtm/gtm_opt.h b/src/include/gtm/gtm_opt.h index 8bd33409..c0f202d6 100644 --- a/src/include/gtm/gtm_opt.h +++ b/src/include/gtm/gtm_opt.h @@ -341,6 +341,10 @@ const char *const config_type_names[] =\ #define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" #define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" #define GTM_OPTNAME_LOG_FILE "log_file" +#define GTM_OPTNAME_LOG_FILENAME_PATTERN "log_filename_pattern" +#define GTM_OPTNAME_LOG_ROTATION_AGE "log_rotation_age" +#define GTM_OPTNAME_LOG_ROTATION_SIZE "log_rotation_size" +#define GTM_OPTNAME_LOG_TRUNCATE_ON_ROTATION "log_truncate_on_rotation" #define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" #define GTM_OPTNAME_NODENAME "nodename" #define GTM_OPTNAME_PORT "port" diff --git a/src/include/gtm/gtm_time.h b/src/include/gtm/gtm_time.h index 71c763df..afeb1abd 100644 --- a/src/include/gtm/gtm_time.h +++ b/src/include/gtm/gtm_time.h @@ -16,21 +16,22 @@ #define GTM_TIME_H /* Julian-date equivalents of Day 0 in Unix and GTM reckoning */ -#define UNIX_EPOCH_JDATE 2440588 /* == date2j(1970, 1, 1) */ -#define GTM_EPOCH_JDATE 2451545 /* == date2j(2000, 1, 1) */ +#define UNIX_EPOCH_JDATE 2440588 /* == date2j(1970, 1, 1) */ +#define GTM_EPOCH_JDATE 2451545 /* == date2j(2000, 1, 1) */ -#define SECS_PER_YEAR (36525 * 864) /* avoid floating-point computation */ -#define SECS_PER_DAY 86400 -#define SECS_PER_HOUR 3600 -#define SECS_PER_MINUTE 60 -#define MINS_PER_HOUR 60 +#define SECS_PER_YEAR (36525 * 864) /* avoid floating-point computation */ +#define SECS_PER_DAY 86400 +#define SECS_PER_HOUR 3600 +#define SECS_PER_MINUTE 60 +#define MINS_PER_HOUR 60 +#define HOURS_PER_DAY 24 #ifdef HAVE_INT64_TIMESTAMP -#define USECS_PER_DAY INT64CONST(86400000000) -#define USECS_PER_HOUR INT64CONST(3600000000) -#define USECS_PER_MINUTE INT64CONST(60000000) -#define USECS_PER_SEC INT64CONST(1000000) -#define NSECS_PER_SEC INT64CONST(1000000000) +#define USECS_PER_DAY INT64CONST(86400000000) +#define USECS_PER_HOUR INT64CONST(3600000000) +#define USECS_PER_MINUTE INT64CONST(60000000) +#define USECS_PER_SEC INT64CONST(1000000) +#define NSECS_PER_SEC INT64CONST(1000000000) #endif @@ -43,9 +44,9 @@ extern GlobalTimestamp GTM_TimestampGetMonotonicRawPrecise(GlobalTimestamp *tv_sec, GlobalTimestamp *tv_nsec); void GTM_TimestampDifference(GTM_Timestamp start_time, GTM_Timestamp stop_time, - long *secs, int *microsecs); + long *secs, int *microsecs); bool GTM_TimestampDifferenceExceeds(GTM_Timestamp start_time, - GTM_Timestamp stop_time, - int msec); + GTM_Timestamp stop_time, + int msec); #endif diff --git a/src/include/gtm/syslogger.h b/src/include/gtm/syslogger.h new file mode 100644 index 00000000..3ebd66a0 --- /dev/null +++ b/src/include/gtm/syslogger.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * syslogger.h + * Exports from gtm/syslogger.c. 
+ * + * Copyright (c) 2021-Present TBase development team, Tencent + * + * src/include/gtm/syslogger.h + * + *------------------------------------------------------------------------- + */ +#ifndef _SYSLOGGER_H +#define _SYSLOGGER_H + +#include /* for PIPE_BUF */ + + +/* + * Primitive protocol structure for writing to syslogger pipe(s). The idea + * here is to divide long messages into chunks that are not more than + * PIPE_BUF bytes long, which according to POSIX spec must be written into + * the pipe atomically. The pipe reader then uses the protocol headers to + * reassemble the parts of a message into a single string. The reader can + * also cope with non-protocol data coming down the pipe, though we cannot + * guarantee long strings won't get split apart. + * + * We use non-nul bytes in is_last to make the protocol a tiny bit + * more robust against finding a false double nul byte prologue. But + * we still might find it in the len and/or pid bytes unless we're careful. + */ + +#ifdef PIPE_BUF +/* Are there any systems with PIPE_BUF > 64K? Unlikely, but ... */ +#if PIPE_BUF > 65536 +#define PIPE_CHUNK_SIZE 65536 +#else +#define PIPE_CHUNK_SIZE ((int) PIPE_BUF) +#endif +#else /* not defined */ +/* POSIX says the value of PIPE_BUF must be at least 512, so use that */ +#define PIPE_CHUNK_SIZE 512 +#endif + +/* + * We read() into a temp buffer twice as big as a chunk, so that any fragment + * left after processing can be moved down to the front and we'll still have + * room to read a full chunk. + */ +#define READ_BUF_SIZE (2 * PIPE_CHUNK_SIZE) + +typedef struct +{ + char nuls[2]; /* always \0\0 */ + uint16 len; /* size of this chunk (counts data only) */ + int32 pid; /* writer's pid */ + char is_last; /* last chunk of message? 't' or 'f' ('T' or + * 'F' for CSV case) */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data payload starts here */ +} PipeProtoHeader; + +typedef union +{ + PipeProtoHeader proto; + char filler[PIPE_CHUNK_SIZE]; +} PipeProtoChunk; + +#define PIPE_HEADER_SIZE offsetof(PipeProtoHeader, data) +#define PIPE_MAX_PAYLOAD ((int) (PIPE_CHUNK_SIZE - PIPE_HEADER_SIZE)) + + +/* GUC options */ +extern bool Logging_collector; +extern int Log_RotationAge; +extern int Log_RotationSize; +extern char *Log_directory; +extern char *Log_filename; +extern bool Log_truncate_on_rotation; +extern int Log_file_mode; + +extern int syslogPipe[2]; +extern int signalPipe[2]; +extern bool rotation_disabled; +extern pg_time_t next_rotation_time; +extern pg_time_t first_syslogger_file_time; +extern FILE *gtmlogFile; +extern bool rotation_requested; + +extern int SysLogger_Start(void); +extern void logfile_rotate(bool time_based_rotation, int size_rotation_for); +extern void write_syslogger_file(const char *buffer, int count, int dest); +extern void set_next_rotation_time(void); +extern void process_pipe_input(char *logbuffer, int *bytes_in_logbuffer, bool *pipe_eof_seen); +extern void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +extern void GTM_LogFileInit(void); +extern void GTM_SendNotifyByte(void); +extern void GTM_drainNotifyBytes(void); +extern int GTM_InitSysloggerEpoll(void); +#endif /* _SYSLOGGER_H */ From 2fd1374fddcdbd200464f00b9a689337631e7b2e Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 7 Apr 2021 19:47:21 +0800 Subject: [PATCH 352/578] Improve performance with type converter function when insert with multi values. (merge request !267) (cherry picked from commit f223a4f5) 8ba3cbdc Improve performance with type converter function when insert with multi values. 
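(Presumably, then, a multi-row INSERT whose VALUES items are not bare
Params but merely wrap constants in a type-conversion function can now
stay on the fast multi-values path, because such expressions pass the
pgxc_is_expr_shippable() check added below; the exact qualifying cases
are an assumption here, not spelled out in the change.)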
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086323123&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/parser/analyze.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index d0d5c909..a5a5e96b 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1146,7 +1146,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (IsExtendedQuery() && qry->isMultiValues && !qry->hasUnshippableTriggers) { /* - * simple insert if all values are params + * simple insert if all values are params or can be pushed down * * if not simple insert, do not transform insert into to copy from */ @@ -1154,7 +1154,8 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) foreach(cell, sublist) { Node *node = (Node *)lfirst(cell); - if (!IsA(node, Param)) + if (!IsA(node, Param) && + !pgxc_is_expr_shippable(node, NULL)) { qry->isMultiValues = false; break; From da91cb2bd70091ed7d40ba9a9631e5113b898d45 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 12 Apr 2021 10:47:37 +0800 Subject: [PATCH 353/578] fix warning --- src/gtm/common/elog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index a6c6c66d..597a252e 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -290,6 +290,7 @@ errfinish(int dummy,...) {// #lizard forgives ErrorData *edata = &errordata[errordata_stack_depth]; int elevel = edata->elevel; + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; MemoryContext oldcontext; recursion_depth++; @@ -324,7 +325,6 @@ errfinish(int dummy,...) } /* Emit the message to the right places */ - GTM_ThreadInfo *thrinfo = GetMyThreadInfo; if(thrinfo->thr_conn) { EmitErrorReport(thrinfo->thr_conn->con_port); From a172b03288d43fcb7b8d82c52a4561b45f1641fd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 16 Apr 2021 15:21:08 +0800 Subject: [PATCH 354/578] for http://tapd.oa.com/pgxz/prong/stories/view/1010092131863532547 (merge request !277) --- contrib/pg_unlock/pg_unlock.c | 75 ++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/contrib/pg_unlock/pg_unlock.c b/contrib/pg_unlock/pg_unlock.c index f070a071..50401a9f 100644 --- a/contrib/pg_unlock/pg_unlock.c +++ b/contrib/pg_unlock/pg_unlock.c @@ -41,6 +41,7 @@ PG_MODULE_MAGIC; #define MAX_RELNAME 64 #define MAX_MODE 30 #define MAX_DEADLOCK 10000 +#define MAX_DEADLOCK_CHECKLOOP (10) /*macros about space allocation and release*/ #define INIT(x)\ @@ -461,6 +462,11 @@ pg_unlock_execute(PG_FUNCTION_ARGS) if (Partxns->Ptxns[Partxns->Ptxns_count].txn_count > 0) { Partxns->Ptxns_count++; + if (Partxns->Ptxns_count >= MAX_DEADLOCK_CHECKLOOP) + { + /* avoid deadlock all the time */ + break; + } } DropAlldeadlocks(); DropAlltransactions(); @@ -1029,6 +1035,46 @@ void GetAllTransInfo(void) } } +/* + * BinarySearchGid -- Binary search gid in pgxc_transaction + * input: gid + * return: gid pos or insert pos, was gid found + */ +static int +BinarySearchGid(char *gid, bool *found) +{ + int low = 0; + int high = pgxc_transaction_count - 1; + int mid = 0; + int cmp_result = 0; + *found = false; + + while (low <= high) + { + mid = (low + high) / 2; + cmp_result = strcmp(gid, pgxc_transaction[mid].gid); + if (cmp_result == 0) + { + /* gid == pgxc_transaction[mid].gid */ + *found = true; + return mid; + } + else if (cmp_result > 0) + { + /* gid > pgxc_transaction[mid].gid */ + low = mid + 1; + } + else + { + /* gid < 
pgxc_transaction[mid].gid */ + high = mid - 1; + } + } + + /* return insert pos */ + return high + 1; +} + /* * LoadTransaction -- get transactions from certain node and stores them in pgxc_transaction * input: node oid @@ -1037,7 +1083,7 @@ void GetAllTransInfo(void) void LoadTransaction(Oid node) { const char *query_stmt = "select a1.pid::text, a1.locktype::text, a2.datname::text, a2.relname::text, " - "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text " + "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text, pg_findgxid(a1.pid::int)::text " "from (select locktype::text, database, relation, page::text, " "tuple::text, mode::text, granted::text, pid::text, transactionid::text " "from pg_locks where (locktype = 'relation' or locktype = 'page' or locktype = 'tuple' or locktype = 'transactionid')" @@ -1070,9 +1116,10 @@ void LoadTransaction(Oid node) char *gid = NULL; int nodeid = 0; lockinfo templock; - + bool found = false; + sprintf(query_txnid, query_stmt, MyProcPid); - execute_on_single_node(node, query_txnid, 10, &result_txnid); + execute_on_single_node(node, query_txnid, 11, &result_txnid); if (result_txnid.slot == NULL) { elog(DEBUG1, "pg_unlock: there is no transaction on node %s", get_pgxc_nodename(node)); @@ -1084,37 +1131,29 @@ void LoadTransaction(Oid node) { pid = strtoul(TTSgetvalue(&result_txnid, i, 0), NULL, 10); /*get global xid of pid on node*/ - gid = GetGxid(node, pid); - /*select for update apply for transactionid without global xid*/ + gid = TTSgetvalue(&result_txnid, i, 10); + /*select for update apply for transactionid without global xid*/ if (gid == NULL) { continue; } /*check whether the gid is already existed*/ - for (i_txn = 0; i_txn < pgxc_transaction_count; i_txn++) - { - if (strcmp(gid, pgxc_transaction[i_txn].gid) == 0) - { - break; - } - } - + i_txn = BinarySearchGid(gid, &found); /*insert this new transaction when gid is not find in pgxc_transaction*/ - if (i_txn >= pgxc_transaction_count) + if (!found) { RPALLOC(pgxc_transaction); - InitTransaction(pgxc_transaction_count); - memcpy(pgxc_transaction[pgxc_transaction_count].gid, gid, sizeof(char) * MAX_GID); + memmove(&pgxc_transaction[i_txn + 1], &pgxc_transaction[i_txn], (pgxc_transaction_count - i_txn) * sizeof(transaction)); + InitTransaction(i_txn); + memcpy(pgxc_transaction[i_txn].gid, gid, sizeof(char) * MAX_GID); pgxc_transaction_count++; - i_txn = pgxc_transaction_count-1; } add_pid_node(i_txn, pid, node); ptr = strtok(gid, ":"); nodeid = atoi(ptr); pgxc_transaction[i_txn].initiator = get_nodeoid_from_nodeid(nodeid, PGXC_NODE_COORDINATOR); //pgxc_transaction[i_txn].initiator = get_pgxc_nodeoid(ptr); - pfree(gid); /*read lockinfo from result_txnid*/ templock.m_pid = pid; From 1782234fe80950b3671c4c1bfd4df8b8e5b0ac36 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 23 Apr 2021 14:28:05 +0800 Subject: [PATCH 355/578] Fix gtm backup node memory leak problem during xlog redo (merge request !289) http://tapd.oa.com/TBase_C/bugtrace/bugs/view?bug_id=1020385652086953173 --- src/gtm/client/fe-connect.c | 72 +-------- src/gtm/client/fe-protocol.c | 239 +++++++++++++++++++++------ src/gtm/gtm_ctl/gtm_ctl.c | 2 + src/include/gtm/gtm_client.h | 281 ++++++++++++++++---------------- src/include/gtm/gtm_msg.h | 305 ++++++++++++++++++----------------- 5 files changed, 491 insertions(+), 408 deletions(-) diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 080af583..4a46c9b8 100644 --- 
a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -952,75 +952,9 @@ freeGTM_Conn(GTM_Conn *conn) if (conn->result->gr_snapshot.sn_xip) free(conn->result->gr_snapshot.sn_xip); - /* Depending on result type there could be allocated data */ - switch (conn->result->gr_type) - { - case SEQUENCE_INIT_RESULT: - case SEQUENCE_RESET_RESULT: - case SEQUENCE_CLOSE_RESULT: - case SEQUENCE_RENAME_RESULT: - case SEQUENCE_ALTER_RESULT: - case SEQUENCE_SET_VAL_RESULT: - case MSG_DB_SEQUENCE_RENAME_RESULT: - if (conn->result->gr_resdata.grd_seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seqkey.gsk_key); - break; - - case SEQUENCE_GET_NEXT_RESULT: - case SEQUENCE_GET_LAST_RESULT: - if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); - break; - - default: - break; - } - - -#ifdef __TBASE__ - if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) - { - free(conn->result->grd_storage_data.data); - conn->result->grd_storage_data.data = NULL; - conn->result->grd_storage_data.len = 0; - } - - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) - { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; - } - - if (conn->result->grd_errlog.len && conn->result->grd_errlog.errlog) - { - free(conn->result->grd_errlog.errlog); - conn->result->grd_errlog.errlog = NULL; - conn->result->grd_errlog.len = 0; - } - -#endif + /* release memory for one-time application */ + gtmpqFreeResultResource(conn->result); + free(conn->result); } #endif diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index d545384c..89bedf88 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -435,7 +435,7 @@ break; { int len = 0; int count = 0; - + memset(&result->gr_resdata.grd_gts, 0, sizeof(result->gr_resdata.grd_gts)); if (gtmpqGetnchar((char *) &result->gr_resdata.grd_gts.node_status, sizeof(int), conn)) @@ -713,14 +713,6 @@ result->gr_status = GTM_RESULT_ERROR; int data_len = 0; char *data_buf = NULL; - /* free result of last call */ - if (result->grd_storage_data.len && result->grd_storage_data.data) - { - free(result->grd_storage_data.data); - result->grd_storage_data.data = NULL; - result->grd_storage_data.len = 0; - } - #ifdef __XLOG__ /* get xlog start pos and timeline */ if (gtmpqGetInt64((int64 *)&result->grd_storage_data.start_pos, conn)) @@ -897,13 +889,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ { - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - if 
(gtmpqGetInt(&conn->result->grd_store_seq.count, sizeof(int32), conn)) { @@ -927,13 +912,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ { - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_txn.count, sizeof(int32), conn)) { @@ -959,13 +937,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ { - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, sizeof(int32), conn)) { @@ -990,13 +961,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ { - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) - { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, sizeof(int32), conn)) { @@ -1167,7 +1131,7 @@ result->gr_status = GTM_RESULT_ERROR; } if (result->gr_resdata.grd_txn_get_gid_data.nodelen != 0) { - /* Do necessary allocation */ + /* Do necessary allocation, free outside */ result->gr_resdata.grd_txn_get_gid_data.nodestring = (char *) malloc(sizeof(char *) * result->gr_resdata.grd_txn_get_gid_data.nodelen + 1); if (result->gr_resdata.grd_txn_get_gid_data.nodestring == NULL) @@ -1268,6 +1232,8 @@ result->gr_status = GTM_RESULT_ERROR; char *buf = NULL; int buf_size = 8192; + memset(result->gr_resdata.grd_node_list.nodeinfo, 0, sizeof(result->gr_resdata.grd_node_list.nodeinfo)); + if (gtmpqGetInt(&result->gr_resdata.grd_node_list.num_node, sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; @@ -1286,6 +1252,7 @@ result->gr_status = GTM_RESULT_ERROR; { int size; GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); + memset(data, 0, sizeof(GTM_PGXCNodeInfo)); if (gtmpqGetInt(&size, sizeof(int32), conn)) { @@ -1316,6 +1283,26 @@ result->gr_status = GTM_RESULT_ERROR; if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) { result->gr_status = GTM_RESULT_ERROR; + if (data->nodename) + { + genFree(data->nodename); + } + if (data->proxyname) + { + genFree(data->proxyname); + } + if (data->ipaddress) + { + genFree(data->ipaddress); + } + if (data->datafolder) + { + genFree(data->datafolder); + } + if (data->sessions) + { + genFree(data->sessions); + } free(data); break; } @@ -1392,6 +1379,8 @@ result->gr_status = GTM_RESULT_ERROR; int offset = 0; int pack_size = 0; int i = 0; + result->gr_resdata.grd_xlog_data.length = 0; + result->gr_resdata.grd_xlog_data.xlog_data = NULL; if (gtmpqGetInt64((int64 *)&result->gr_resdata.grd_xlog_data.flush, conn)) { @@ -1497,17 +1486,14 @@ gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn) return 0; } +/* + * release the one-time-applied memory. if the memory design is reused, + * please release it last in freeGTM_Conn + */ void -gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) -{// #lizard forgives - /* - * If we are running as a GTM proxy, we don't have anything to do. 
This may - * change though as we add more message types below and some of them may - * need cleanup even at the proxy level - */ - if (remote_type == GTM_NODE_GTM_PROXY) - return; - +gtmpqFreeResultResource(GTM_Result *result) +{ + int i = 0; switch (result->gr_type) { case SEQUENCE_INIT_RESULT: @@ -1546,8 +1532,165 @@ gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) * again shortly */ break; + case NODE_UNREGISTER_RESULT: + case NODE_REGISTER_RESULT: + if (result->gr_resdata.grd_node.node_name) + { + free(result->gr_resdata.grd_node.node_name); + result->gr_resdata.grd_node.node_name = NULL; + } + break; + case NODE_LIST_RESULT: + if (result->gr_resdata.grd_node_list.num_node) + { + for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + { + if (result->gr_resdata.grd_node_list.nodeinfo[i]) + { + GTM_PGXCNodeInfo *data = result->gr_resdata.grd_node_list.nodeinfo[i]; + if (data->nodename) + { + genFree(data->nodename); + data->nodename = NULL; + } + if (data->proxyname) + { + genFree(data->proxyname); + data->proxyname = NULL; + } + if (data->ipaddress) + { + genFree(data->ipaddress); + data->ipaddress = NULL; + } + if (data->datafolder) + { + genFree(data->datafolder); + data->datafolder = NULL; + } + if (data->sessions) + { + genFree(data->sessions); + data->sessions = NULL; + } + free(result->gr_resdata.grd_node_list.nodeinfo[i]); + result->gr_resdata.grd_node_list.nodeinfo[i] = NULL; + } + } + result->gr_resdata.grd_node_list.num_node = 0; + } + break; +#ifdef __XLOG__ + case MSG_REPLICATION_CONTENT: + if (result->gr_resdata.grd_xlog_data.length && result->gr_resdata.grd_xlog_data.xlog_data) + { + free(result->gr_resdata.grd_xlog_data.xlog_data); + result->gr_resdata.grd_xlog_data.xlog_data = NULL; + result->gr_resdata.grd_xlog_data.length = 0; + } + break; +#endif +#ifdef __TBASE__ + case TXN_CHECK_GTM_STATUS_RESULT: + if (result->gr_resdata.grd_gts.standby_count > 0 && + result->gr_resdata.grd_gts.standby_count <= GTM_MAX_WALSENDER) + { + if (result->gr_resdata.grd_gts.slave_is_sync) + { + free(result->gr_resdata.grd_gts.slave_is_sync); + result->gr_resdata.grd_gts.slave_is_sync = NULL; + } + if (result->gr_resdata.grd_gts.slave_timestamp) + { + free(result->gr_resdata.grd_gts.slave_timestamp); + result->gr_resdata.grd_gts.slave_timestamp = NULL; + } + + if (result->gr_resdata.grd_gts.slave_flush_ptr) + { + free(result->gr_resdata.grd_gts.slave_flush_ptr); + result->gr_resdata.grd_gts.slave_flush_ptr = NULL; + } + + for (i = 0; i < result->gr_resdata.grd_gts.standby_count; i++) + { + if (result->gr_resdata.grd_gts.application_name[i]) + { + free(result->gr_resdata.grd_gts.application_name[i]); + result->gr_resdata.grd_gts.application_name[i] = NULL; + } + } + + result->gr_resdata.grd_gts.standby_count = 0; + } + break; + case MSG_GET_GTM_ERRORLOG_RESULT: + if (result->grd_errlog.len && result->grd_errlog.errlog) + { + free(result->grd_errlog.errlog); + result->grd_errlog.errlog = NULL; + result->grd_errlog.len = 0; + } + break; + case STORAGE_TRANSFER_RESULT: + /* free result of last call */ + if (result->grd_storage_data.len && result->grd_storage_data.data) + { + free(result->grd_storage_data.data); + result->grd_storage_data.data = NULL; + result->grd_storage_data.len = 0; + } + break; + case MSG_LIST_GTM_STORE_SEQ_RESULT: + if (result->grd_store_seq.count && result->grd_store_seq.seqs) + { + free(result->grd_store_seq.seqs); + result->grd_store_seq.seqs = NULL; + result->grd_store_seq.count = 0; + } + break; + case MSG_LIST_GTM_TXN_STORE_RESULT: + 
if (result->grd_store_txn.count && result->grd_store_txn.txns) + { + free(result->grd_store_txn.txns); + result->grd_store_txn.txns = NULL; + result->grd_store_txn.count = 0; + } + break; + case MSG_CHECK_GTM_SEQ_STORE_RESULT: + if (result->grd_store_check_seq.count && result->grd_store_check_seq.seqs) + { + free(result->grd_store_check_seq.seqs); + result->grd_store_check_seq.seqs = NULL; + result->grd_store_check_seq.count = 0; + } + break; + case MSG_CHECK_GTM_TXN_STORE_RESULT: + if (result->grd_store_check_txn.count && result->grd_store_check_txn.txns) + { + free(result->grd_store_check_txn.txns); + result->grd_store_check_txn.txns = NULL; + result->grd_store_check_txn.count = 0; + } + break; +#endif default: break; } } + +void +gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) +{ + + /* + * If we are running as a GTM proxy, we don't have anything to do. This may + * change though as we add more message types below and some of them may + * need cleanup even at the proxy level + */ + if (remote_type == GTM_NODE_GTM_PROXY) + return; + + gtmpqFreeResultResource(result); +} diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 2b34ad0d..9aeeb322 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -1050,6 +1050,8 @@ do_status(void) exit(1); } + if (gtm_conn) + disconnect_gtm(gtm_conn); return ; } diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 2381286a..2ae03ff1 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -29,54 +29,54 @@ typedef union GTM_ResultData { - GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ + GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ - bool backup_result; /* BEGIN_BACKUP result */ - struct - { - GlobalTransactionId gxid; - GTM_Timestamp timestamp; - } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ - + bool backup_result; /* BEGIN_BACKUP result */ + struct + { + GlobalTransactionId gxid; + GTM_Timestamp timestamp; + } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ + - struct - { - GTM_Timestamp grd_gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ - bool gtm_readonly; /* read only mode for gtm */ - int node_status; /* Master or Slave, 0:master, 1 slave */ + struct + { + GTM_Timestamp grd_gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ + bool gtm_readonly; /* read only mode for gtm */ + int node_status; /* Master or Slave, 0:master, 1 slave */ #ifndef __XLOG__ - GTM_Timestamp grd_gts_standby; /* CHECK_GTM, GTS from standby. */ - char standbyhost[MAX_HOSTADDR_LEN]; - char standbyport[MAX_PORT_LEN]; + GTM_Timestamp grd_gts_standby; /* CHECK_GTM, GTS from standby. 
*/ + char standbyhost[MAX_HOSTADDR_LEN]; + char standbyport[MAX_PORT_LEN]; #else XLogRecPtr master_flush; int standby_count; - int *slave_is_sync; + int *slave_is_sync; char *application_name[GTM_MAX_WALSENDER]; - XLogRecPtr *slave_flush_ptr; - GTM_Timestamp *slave_timestamp; + XLogRecPtr *slave_flush_ptr; + GTM_Timestamp *slave_timestamp; #endif - }grd_gts; + }grd_gts; #ifdef __XLOG__ - struct - { - XLogRecPtr flush; - XLogRecPtr write; - XLogRecPtr apply; - } grd_replication; + struct + { + XLogRecPtr flush; + XLogRecPtr write; + XLogRecPtr apply; + } grd_replication; - struct - { - XLogRecPtr pos; - int length; - char* xlog_data; - int reply; - XLogRecPtr flush; - } grd_xlog_data; - + struct + { + XLogRecPtr pos; + int length; + char* xlog_data; + int reply; + XLogRecPtr flush; + } grd_xlog_data; + #endif GlobalTransactionId grd_gxid; /* TXN_PREPARE @@ -198,20 +198,20 @@ typedef union GTM_ResultData typedef struct GTM_Result { - GTM_ResultType gr_type; - int gr_msglen; - int gr_status; - GTM_ProxyMsgHeader gr_proxyhdr; - GTM_ResultData gr_resdata; - -#ifdef __TBASE__ - struct - { - int32 len; - char *data; + GTM_ResultType gr_type; + int gr_msglen; + int gr_status; + GTM_ProxyMsgHeader gr_proxyhdr; + GTM_ResultData gr_resdata; + +#ifdef __TBASE__ + struct + { + int32 len; + char *data; #ifdef __XLOG__ - XLogRecPtr start_pos; - TimeLineID time_line; + XLogRecPtr start_pos; + TimeLineID time_line; #endif } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ @@ -249,24 +249,24 @@ typedef struct GTM_Result } grd_errlog; #endif - /* - * We keep these two items outside the union to avoid repeated malloc/free - * of the xip array. If these items are pushed inside the union, they may - * get overwritten by other members in the union - */ - int gr_xip_size; - GTM_SnapshotData gr_snapshot; - - /* - * Similarly, keep the buffer for proxying data outside the union - */ - char *gr_proxy_data; - int gr_proxy_datalen; + /* + * We keep these two items outside the union to avoid repeated malloc/free + * of the xip array. If these items are pushed inside the union, they may + * get overwritten by other members in the union + */ + int gr_xip_size; + GTM_SnapshotData gr_snapshot; + + /* + * Similarly, keep the buffer for proxying data outside the union + */ + char *gr_proxy_data; + int gr_proxy_datalen; } GTM_Result; typedef struct Get_GTS_Result { - GTM_Timestamp gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ + GTM_Timestamp gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. 
*/ bool gtm_readonly; /* read only mode for gtm */ } Get_GTS_Result; @@ -288,19 +288,19 @@ size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **); * Transaction Management API */ GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, - const char *global_sessionid, - GTM_Timestamp *timestamp); + const char *global_sessionid, + GTM_Timestamp *timestamp); int bkup_begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, - bool read_only, const char *global_sessionid, - uint32 client_id, GTM_Timestamp timestamp); + bool read_only, const char *global_sessionid, + uint32 client_id, GTM_Timestamp timestamp); #ifdef __TBASE__ Get_GTS_Result get_global_timestamp(GTM_Conn *conn); #ifdef __XLOG__ int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecPtr *master_ptr, - int *standby_count,int **slave_is_sync, GTM_Timestamp **standby , - XLogRecPtr **slave_flush_ptr,char **application_name[GTM_MAX_WALSENDER],int timeout_seconds); + int *standby_count,int **slave_is_sync, GTM_Timestamp **standby , + XLogRecPtr **slave_flush_ptr,char **application_name[GTM_MAX_WALSENDER],int timeout_seconds); #else int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master, GTM_Timestamp *standby, char *standbyhost, char *standbyport, int32 buflen); #endif @@ -311,151 +311,151 @@ int get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len) #endif int bkup_begin_transaction_gxid(GTM_Conn *conn, GlobalTransactionId gxid, - GTM_IsolationLevel isolevel, bool read_only, - const char *global_sessionid, - uint32 client_id, GTM_Timestamp timestamp); + GTM_IsolationLevel isolevel, bool read_only, + const char *global_sessionid, + uint32 client_id, GTM_Timestamp timestamp); GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel); int bkup_begin_transaction_autovacuum(GTM_Conn *conn, GlobalTransactionId gxid, - GTM_IsolationLevel isolevel, - uint32 client_id); + GTM_IsolationLevel isolevel, + uint32 client_id); int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - int waited_xid_count, - GlobalTransactionId *waited_xids); + int waited_xid_count, + GlobalTransactionId *waited_xids); int bkup_commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - GlobalTransactionId prepared_gxid, - int waited_xid_count, - GlobalTransactionId *waited_xids); + GlobalTransactionId prepared_gxid, + int waited_xid_count, + GlobalTransactionId *waited_xids); int bkup_commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, GlobalTransactionId prepared_gxid); int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int bkup_abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int start_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, - char *nodestring); + char *nodestring); int log_commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid,const char *gid, - const char *nodestring, int node_count, bool isGlobal, bool isCommit, - GlobalTimestamp prepare_ts, GlobalTimestamp commit_ts); + const char *nodestring, int node_count, bool isGlobal, bool isCommit, + GlobalTimestamp prepare_ts, GlobalTimestamp commit_ts); int log_scan_transaction(GTM_Conn *conn, - GlobalTransactionId gxid, - const char *node_string, - GlobalTimestamp start_ts, - GlobalTimestamp local_start_ts, - GlobalTimestamp local_complete_ts, - int scan_type, - const char *rel_name, - int64 scan_number); + 
GlobalTransactionId gxid, + const char *node_string, + GlobalTimestamp start_ts, + GlobalTimestamp local_start_ts, + GlobalTimestamp local_complete_ts, + int scan_type, + const char *rel_name, + int64 scan_number); int backup_start_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, - char *nodestring); + char *nodestring); int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int bkup_prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int get_gid_data(GTM_Conn *conn, GTM_IsolationLevel isolevel, char *gid, - GlobalTransactionId *gxid, - GlobalTransactionId *prepared_gxid, - char **nodestring); + GlobalTransactionId *gxid, + GlobalTransactionId *prepared_gxid, + char **nodestring); /* * Multiple Transaction Management API */ int begin_transaction_multi(GTM_Conn *conn, int txn_count, GTM_IsolationLevel *txn_isolation_level, - bool *txn_read_only, GTMProxy_ConnID *txn_connid, - int *txn_count_out, GlobalTransactionId *gxid_out, GTM_Timestamp *ts_out); + bool *txn_read_only, GTMProxy_ConnID *txn_connid, + int *txn_count_out, GlobalTransactionId *gxid_out, GTM_Timestamp *ts_out); int bkup_begin_transaction_multi(GTM_Conn *conn, int txn_count, - GlobalTransactionId *gxid, GTM_IsolationLevel *isolevel, - bool *read_only, - const char *txn_global_sessionid[], - uint32 *client_id, - GTMProxy_ConnID *txn_connid); + GlobalTransactionId *gxid, GTM_IsolationLevel *isolevel, + bool *read_only, + const char *txn_global_sessionid[], + uint32 *client_id, + GTMProxy_ConnID *txn_connid); int commit_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out); + int *txn_count_out, int *status_out); int bkup_commit_transaction_multi(GTM_Conn *conn, int txn_count, - GlobalTransactionId *gxid); + GlobalTransactionId *gxid); int abort_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out); + int *txn_count_out, int *status_out); int bkup_abort_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid); int snapshot_get_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out, - GlobalTransactionId *xmin_out, GlobalTransactionId *xmax_out, - GlobalTransactionId *recent_global_xmin_out, int32 *xcnt_out); + int *txn_count_out, int *status_out, + GlobalTransactionId *xmin_out, GlobalTransactionId *xmax_out, + GlobalTransactionId *recent_global_xmin_out, int32 *xcnt_out); /* * Snapshot Management API */ GTM_SnapshotData *get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, - bool canbe_grouped); + bool canbe_grouped); /* * Node Registering management API */ int node_register(GTM_Conn *conn, - GTM_PGXCNodeType type, - GTM_PGXCNodePort port, - char *node_name, - char *datafolder); + GTM_PGXCNodeType type, + GTM_PGXCNodePort port, + char *node_name, + char *datafolder); int node_register(GTM_Conn *conn, GTM_PGXCNodeType type, GTM_PGXCNodePort port, - char *node_name, char *datafolder); -int node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, char *node_name, - char *datafolder, GTM_PGXCNodeStatus status); + char *node_name, char *datafolder); +int node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, char *node_name, + char *datafolder, GTM_PGXCNodeStatus status); int bkup_node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, - char *node_name, char 
*datafolder, - GTM_PGXCNodeStatus status); + char *node_name, char *datafolder, + GTM_PGXCNodeStatus status); int node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char *node_name); int bkup_node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char * node_name); int backend_disconnect(GTM_Conn *conn, bool is_postmaster, GTM_PGXCNodeType type, char *node_name); char *node_get_local_addr(GTM_Conn *conn, char *buf, size_t buflen, int *rc); int register_session(GTM_Conn *conn, const char *coord_name, int coord_procid, - int coord_backendid); + int coord_backendid); int report_global_xmin(GTM_Conn *conn, const char *node_name, - GTM_PGXCNodeType type, GlobalTransactionId gxid, - GlobalTransactionId *global_xmin, - GlobalTransactionId *latest_completed_xid, - int *errcode); + GTM_PGXCNodeType type, GlobalTransactionId gxid, + GlobalTransactionId *global_xmin, + GlobalTransactionId *latest_completed_xid, + int *errcode); /* * Sequence Management API */ int open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, bool cycle, - GlobalTransactionId gxid); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle, + GlobalTransactionId gxid); int bkup_open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, bool cycle, - GlobalTransactionId gxid); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle, + GlobalTransactionId gxid); int alter_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); int bkup_alter_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); int close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, - GTM_SequenceKey newkey, GlobalTransactionId gxid); + GTM_SequenceKey newkey, GlobalTransactionId gxid); int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, - GTM_SequenceKey newkey, GlobalTransactionId gxid); + GTM_SequenceKey newkey, GlobalTransactionId gxid); int get_current(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, GTM_Sequence *result); + char *coord_name, int coord_procid, GTM_Sequence *result); int get_next(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, - GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); + char *coord_name, int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); int bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, - GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); + char *coord_name, int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); int set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool 
iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); int bkup_reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); @@ -499,4 +499,5 @@ int32 check_storage_sequence(GTM_Conn *conn, GTMStorageSequneceStatus **store_se int32 check_storage_transaction(GTM_Conn *conn, GTMStorageTransactionStatus **store_txn, bool need_fix); int rename_db_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); #endif +void gtmpqFreeResultResource(GTM_Result *result); #endif diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index bb66c194..acedc926 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -22,112 +22,114 @@ */ typedef enum GTM_MessageType { - MSG_TYPE_INVALID, - MSG_SYNC_STANDBY, /* Message to sync woth GTM-Standby */ - MSG_NODE_REGISTER, /* Register a PGXC Node with GTM */ - MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */ - MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */ - MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */ - MSG_REGISTER_SESSION, /* Register distributed session with GTM */ - MSG_REPORT_XMIN, /* Report RecentGlobalXmin to GTM */ - MSG_BKUP_REPORT_XMIN, - MSG_NODE_LIST, /* Get node list */ - MSG_NODE_BEGIN_REPLICATION_INIT, - MSG_NODE_END_REPLICATION_INIT, - MSG_BEGIN_BACKUP, /* Start backup by Standby */ - MSG_END_BACKUP, /* End backup preparation by Standby */ - MSG_TXN_BEGIN, /* Start a new transaction */ - MSG_BKUP_TXN_BEGIN, /* Backup of MSG_TXN_BEGIN */ - MSG_BKUP_GLOBAL_TIMESTAMP, /* Backup of the latest issued global timestmap */ - MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ - MSG_BKUP_TXN_BEGIN_GETGXID, /* Backup of MSG_TXN_BEGIN_GETGXID */ - MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ - MSG_BKUP_TXN_BEGIN_GETGXID_MULTI, /* Backup of MSG_TXN_BEGIN_GETGXID_MULTI */ - MSG_TXN_START_PREPARED, /* Begins to prepare a transation for commit */ - MSG_BKUP_TXN_START_PREPARED, /* Backup of MSG_TXN_START_PREPARED */ - MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ - MSG_BKUP_TXN_COMMIT, /* Backup of MSG_TXN_COMMIT */ - MSG_TXN_LOG_GLOBAL_COMMIT, /* Log a committed transaction*/ - MSG_TXN_LOG_COMMIT, /* Log a committed transaction*/ - MSG_TXN_LOG_GLOBAL_SCAN, /* Log a global scan */ - MSG_TXN_LOG_SCAN, /* Log a scan */ - MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ - MSG_BKUP_TXN_COMMIT_MULTI, /* Bacukp of MSG_TXN_COMMIT_MULTI */ - MSG_TXN_COMMIT_PREPARED, /* Commit a prepared transaction */ - MSG_BKUP_TXN_COMMIT_PREPARED, /* Backup of MSG_TXN_COMMIT_PREPARED */ - MSG_TXN_PREPARE, /* Finish preparing a transaction */ - MSG_BKUP_TXN_PREPARE, /* Backup of MSG_TXN_PREPARE */ - MSG_TXN_ROLLBACK, /* Rollback a transaction */ - MSG_BKUP_TXN_ROLLBACK, /* Backup of MSG_TXN_ROLLBACK */ - MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ - MSG_BKUP_TXN_ROLLBACK_MULTI, /* Backup of MSG_TXN_ROLLBACK_MULTI */ - MSG_TXN_GET_GID_DATA, /* Get info associated with a GID, and get a GXID */ - MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ - MSG_BKUP_TXN_GET_GXID, - MSG_TXN_GET_NEXT_GXID, /* Get next GXID */ - MSG_TXN_GXID_LIST, - MSG_SNAPSHOT_GET, /* Get a global snapshot */ - MSG_SNAPSHOT_GET_MULTI, /* Get multiple 
global snapshots */ - MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ - MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ - MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */ - MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ - MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ - MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */ - MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */ - MSG_SEQUENCE_SET_VAL, /* Set values for sequence */ - MSG_BKUP_SEQUENCE_SET_VAL, /* Backup of MSG_SEQUENCE_SET_VAL */ - MSG_SEQUENCE_RESET, /* Reset the sequence */ - MSG_BKUP_SEQUENCE_RESET, /* Backup of MSG_SEQUENCE_RESET */ - MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */ - MSG_BKUP_SEQUENCE_CLOSE, /* Backup of MSG_SEQUENCE_CLOSE */ - MSG_SEQUENCE_RENAME, /* Rename a sequence */ - MSG_BKUP_SEQUENCE_RENAME, /* Backup of MSG_SEQUENCE_RENAME */ - MSG_SEQUENCE_ALTER, /* Alter a sequence */ - MSG_BKUP_SEQUENCE_ALTER, /* Backup of MSG_SEQUENCE_ALTER */ - MSG_SEQUENCE_LIST, /* Get a list of sequences */ - MSG_TXN_GET_STATUS, /* Get status of a given transaction */ - MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding - * prepared transactions */ - MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */ - MSG_BKUP_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Backup of MSG_TXN_BEGIN_GETGXID_AUTOVACUUM */ - MSG_DATA_FLUSH, /* flush pending data */ - MSG_BACKEND_DISCONNECT, /* tell GTM that the backend diconnected from the proxy */ - MSG_BARRIER, /* Tell the barrier was issued */ - MSG_BKUP_BARRIER, /* Backup barrier to standby */ + MSG_TYPE_INVALID, + MSG_SYNC_STANDBY, /* Message to sync woth GTM-Standby */ + MSG_NODE_REGISTER, /* Register a PGXC Node with GTM */ + MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */ + MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */ + MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */ + MSG_REGISTER_SESSION, /* Register distributed session with GTM */ + MSG_REPORT_XMIN, /* Report RecentGlobalXmin to GTM */ + MSG_BKUP_REPORT_XMIN, + MSG_NODE_LIST, /* Get node list */ + MSG_NODE_BEGIN_REPLICATION_INIT, + MSG_NODE_END_REPLICATION_INIT, + MSG_BEGIN_BACKUP, /* Start backup by Standby */ + MSG_END_BACKUP, /* End backup preparation by Standby */ + MSG_TXN_BEGIN, /* Start a new transaction */ + MSG_BKUP_TXN_BEGIN, /* Backup of MSG_TXN_BEGIN */ + MSG_BKUP_GLOBAL_TIMESTAMP, /* Backup of the latest issued global timestmap */ + MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ + MSG_BKUP_TXN_BEGIN_GETGXID, /* Backup of MSG_TXN_BEGIN_GETGXID */ + MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ + MSG_BKUP_TXN_BEGIN_GETGXID_MULTI, /* Backup of MSG_TXN_BEGIN_GETGXID_MULTI */ + MSG_TXN_START_PREPARED, /* Begins to prepare a transation for commit */ + MSG_BKUP_TXN_START_PREPARED, /* Backup of MSG_TXN_START_PREPARED */ + MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ + MSG_BKUP_TXN_COMMIT, /* Backup of MSG_TXN_COMMIT */ + MSG_TXN_LOG_GLOBAL_COMMIT, /* Log a committed transaction*/ + MSG_TXN_LOG_COMMIT, /* Log a committed transaction*/ + MSG_TXN_LOG_GLOBAL_SCAN, /* Log a global scan */ + MSG_TXN_LOG_SCAN, /* Log a scan */ + MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ + MSG_BKUP_TXN_COMMIT_MULTI, /* Bacukp of MSG_TXN_COMMIT_MULTI */ + MSG_TXN_COMMIT_PREPARED, /* Commit a prepared transaction */ + MSG_BKUP_TXN_COMMIT_PREPARED, /* Backup 
of MSG_TXN_COMMIT_PREPARED */ + MSG_TXN_PREPARE, /* Finish preparing a transaction */ + MSG_BKUP_TXN_PREPARE, /* Backup of MSG_TXN_PREPARE */ + MSG_TXN_ROLLBACK, /* Rollback a transaction */ + MSG_BKUP_TXN_ROLLBACK, /* Backup of MSG_TXN_ROLLBACK */ + MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ + MSG_BKUP_TXN_ROLLBACK_MULTI, /* Backup of MSG_TXN_ROLLBACK_MULTI */ + MSG_TXN_GET_GID_DATA, /* Get info associated with a GID, and get a GXID */ + MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ + MSG_BKUP_TXN_GET_GXID, + MSG_TXN_GET_NEXT_GXID, /* Get next GXID */ + MSG_TXN_GXID_LIST, + MSG_SNAPSHOT_GET, /* Get a global snapshot */ + MSG_SNAPSHOT_GET_MULTI, /* Get multiple global snapshots */ + MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ + MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ + MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */ + MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ + MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ + MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */ + MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */ + MSG_SEQUENCE_SET_VAL, /* Set values for sequence */ + MSG_BKUP_SEQUENCE_SET_VAL, /* Backup of MSG_SEQUENCE_SET_VAL */ + MSG_SEQUENCE_RESET, /* Reset the sequence */ + MSG_BKUP_SEQUENCE_RESET, /* Backup of MSG_SEQUENCE_RESET */ + MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */ + MSG_BKUP_SEQUENCE_CLOSE, /* Backup of MSG_SEQUENCE_CLOSE */ + MSG_SEQUENCE_RENAME, /* Rename a sequence */ + MSG_BKUP_SEQUENCE_RENAME, /* Backup of MSG_SEQUENCE_RENAME */ + MSG_SEQUENCE_ALTER, /* Alter a sequence */ + MSG_BKUP_SEQUENCE_ALTER, /* Backup of MSG_SEQUENCE_ALTER */ + MSG_SEQUENCE_LIST, /* Get a list of sequences */ + MSG_TXN_GET_STATUS, /* Get status of a given transaction */ + MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding + * prepared transactions */ + MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */ + MSG_BKUP_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Backup of MSG_TXN_BEGIN_GETGXID_AUTOVACUUM */ + MSG_DATA_FLUSH, /* flush pending data */ + MSG_BACKEND_DISCONNECT, /* tell GTM that the backend diconnected from the proxy */ + MSG_BARRIER, /* Tell the barrier was issued */ + MSG_BKUP_BARRIER, /* Backup barrier to standby */ #ifdef __TBASE__ - /* Gtm storage tags. */ - MSG_GET_STORAGE, /* Backup get storage file */ - MSG_TXN_FINISH_GID, /* Finish gid transaction in GTM */ - MSG_LIST_GTM_STORE, /* List gtm store info */ - MSG_LIST_GTM_STORE_SEQ, /* List gtm running sequence info */ - MSG_LIST_GTM_STORE_TXN, /* List gtm running transaction info */ - MSG_CHECK_GTM_STORE_SEQ, /* Check gtm sequence usage info */ - MSG_CHECK_GTM_STORE_TXN, /* Check gtm transaction usage info */ - MSG_CLEAN_SESSION_SEQ, /* clean up session related seq */ + /* Gtm storage tags. */ + MSG_GET_STORAGE, /* Backup get storage file */ + MSG_TXN_FINISH_GID, /* Finish gid transaction in GTM */ + MSG_LIST_GTM_STORE, /* List gtm store info */ + MSG_LIST_GTM_STORE_SEQ, /* List gtm running sequence info */ + MSG_LIST_GTM_STORE_TXN, /* List gtm running transaction info */ + MSG_CHECK_GTM_STORE_SEQ, /* Check gtm sequence usage info */ + MSG_CHECK_GTM_STORE_TXN, /* Check gtm transaction usage info */ + MSG_CLEAN_SESSION_SEQ, /* clean up session related seq */ - /* Global timestamp tags. */ - MSG_GETGTS, /* Get a global timestamp */ - MSG_GETGTS_MULTI, /* Get multiple global timestamps */ + /* Global timestamp tags. 
*/ + MSG_GETGTS, /* Get a global timestamp */ + MSG_GETGTS_MULTI, /* Get multiple global timestamps */ - MSG_CHECK_GTM_STATUS, /* Get global timestamp from both master and slave gtm. */ + MSG_CHECK_GTM_STATUS, /* Get global timestamp from both master and slave gtm. */ - MSG_DB_SEQUENCE_RENAME, /* Rename all sequence in database*/ - MSG_BKUP_DB_SEQUENCE_RENAME, + MSG_DB_SEQUENCE_RENAME, /* Rename all sequence in database*/ + MSG_BKUP_DB_SEQUENCE_RENAME, #endif #ifdef __XLOG__ - MSG_START_REPLICATION, + MSG_START_REPLICATION, MSG_GET_REPLICATION_STATUS, MSG_GET_REPLICATION_TRANSFER, #endif +#ifdef __TBASE__ MSG_GET_STATISTICS, MSG_GET_ERRORLOG, +#endif - /* - * Must be at the end - */ - MSG_TYPE_COUNT /* A dummmy entry just to count the message types */ + /* + * Must be at the end + */ + MSG_TYPE_COUNT /* A dummmy entry just to count the message types */ } GTM_MessageType; /* @@ -136,67 +138,67 @@ typedef enum GTM_MessageType */ typedef enum GTM_ResultType { - SYNC_STANDBY_RESULT, - NODE_REGISTER_RESULT, - NODE_UNREGISTER_RESULT, - REGISTER_SESSION_RESULT, - REPORT_XMIN_RESULT, - NODE_LIST_RESULT, - NODE_BEGIN_REPLICATION_INIT_RESULT, - NODE_END_REPLICATION_INIT_RESULT, - BEGIN_BACKUP_SUCCEED_RESULT, + SYNC_STANDBY_RESULT, + NODE_REGISTER_RESULT, + NODE_UNREGISTER_RESULT, + REGISTER_SESSION_RESULT, + REPORT_XMIN_RESULT, + NODE_LIST_RESULT, + NODE_BEGIN_REPLICATION_INIT_RESULT, + NODE_END_REPLICATION_INIT_RESULT, + BEGIN_BACKUP_SUCCEED_RESULT, #ifdef __TBASE__ - BEGIN_BACKUP_FAIL_RESULT, + BEGIN_BACKUP_FAIL_RESULT, #endif - END_BACKUP_RESULT, - TXN_BEGIN_RESULT, - TXN_BEGIN_GETGXID_RESULT, - TXN_BEGIN_GETGTS_RESULT, - TXN_BEGIN_GETGXID_MULTI_RESULT, - TXN_BEGIN_GETGTS_MULTI_RESULT, + END_BACKUP_RESULT, + TXN_BEGIN_RESULT, + TXN_BEGIN_GETGXID_RESULT, + TXN_BEGIN_GETGTS_RESULT, + TXN_BEGIN_GETGXID_MULTI_RESULT, + TXN_BEGIN_GETGTS_MULTI_RESULT, #ifdef __TBASE__ - TXN_CHECK_GTM_STATUS_RESULT, + TXN_CHECK_GTM_STATUS_RESULT, #endif - TXN_PREPARE_RESULT, - TXN_START_PREPARED_RESULT, - TXN_LOG_TRANSACTION_RESULT, - TXN_LOG_SCAN_RESULT, - TXN_COMMIT_PREPARED_RESULT, - TXN_COMMIT_RESULT, - TXN_COMMIT_MULTI_RESULT, - TXN_ROLLBACK_RESULT, - TXN_ROLLBACK_MULTI_RESULT, - TXN_GET_GID_DATA_RESULT, - TXN_GET_GXID_RESULT, - TXN_GET_NEXT_GXID_RESULT, - TXN_GXID_LIST_RESULT, - SNAPSHOT_GET_RESULT, - SNAPSHOT_GET_MULTI_RESULT, - SNAPSHOT_GXID_GET_RESULT, - SEQUENCE_INIT_RESULT, - SEQUENCE_GET_CURRENT_RESULT, - SEQUENCE_GET_NEXT_RESULT, - SEQUENCE_GET_LAST_RESULT, - SEQUENCE_SET_VAL_RESULT, - SEQUENCE_RESET_RESULT, - SEQUENCE_CLOSE_RESULT, - SEQUENCE_RENAME_RESULT, - SEQUENCE_ALTER_RESULT, - SEQUENCE_LIST_RESULT, - TXN_GET_STATUS_RESULT, - TXN_GET_ALL_PREPARED_RESULT, - TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, - BARRIER_RESULT, + TXN_PREPARE_RESULT, + TXN_START_PREPARED_RESULT, + TXN_LOG_TRANSACTION_RESULT, + TXN_LOG_SCAN_RESULT, + TXN_COMMIT_PREPARED_RESULT, + TXN_COMMIT_RESULT, + TXN_COMMIT_MULTI_RESULT, + TXN_ROLLBACK_RESULT, + TXN_ROLLBACK_MULTI_RESULT, + TXN_GET_GID_DATA_RESULT, + TXN_GET_GXID_RESULT, + TXN_GET_NEXT_GXID_RESULT, + TXN_GXID_LIST_RESULT, + SNAPSHOT_GET_RESULT, + SNAPSHOT_GET_MULTI_RESULT, + SNAPSHOT_GXID_GET_RESULT, + SEQUENCE_INIT_RESULT, + SEQUENCE_GET_CURRENT_RESULT, + SEQUENCE_GET_NEXT_RESULT, + SEQUENCE_GET_LAST_RESULT, + SEQUENCE_SET_VAL_RESULT, + SEQUENCE_RESET_RESULT, + SEQUENCE_CLOSE_RESULT, + SEQUENCE_RENAME_RESULT, + SEQUENCE_ALTER_RESULT, + SEQUENCE_LIST_RESULT, + TXN_GET_STATUS_RESULT, + TXN_GET_ALL_PREPARED_RESULT, + TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, + BARRIER_RESULT, #ifdef 
__TBASE__ - STORAGE_TRANSFER_RESULT, - TXN_FINISH_GID_RESULT, - MSG_LIST_GTM_STORE_RESULT, - MSG_LIST_GTM_STORE_SEQ_RESULT, /* List gtm running sequence info */ - MSG_LIST_GTM_TXN_STORE_RESULT, /* List gtm running transaction info */ - MSG_CHECK_GTM_SEQ_STORE_RESULT, /* Check gtm sequence usage info */ - MSG_CHECK_GTM_TXN_STORE_RESULT, /* Check gtm transaction usage info */ - MSG_CLEAN_SESSION_SEQ_RESULT, - MSG_DB_SEQUENCE_RENAME_RESULT, + STORAGE_TRANSFER_RESULT, + TXN_FINISH_GID_RESULT, + MSG_LIST_GTM_STORE_RESULT, + MSG_LIST_GTM_STORE_SEQ_RESULT, /* List gtm running sequence info */ + MSG_LIST_GTM_TXN_STORE_RESULT, /* List gtm running transaction info */ + MSG_CHECK_GTM_SEQ_STORE_RESULT, /* Check gtm sequence usage info */ + MSG_CHECK_GTM_TXN_STORE_RESULT, /* Check gtm transaction usage info */ + MSG_CLEAN_SESSION_SEQ_RESULT, + MSG_DB_SEQUENCE_RENAME_RESULT, #endif #ifdef __XLOG__ @@ -206,9 +208,10 @@ typedef enum GTM_ResultType MSG_REPLICATION_CONTENT, #endif +#ifdef __TBASE__ MSG_GET_GTM_STATISTICS_RESULT, MSG_GET_GTM_ERRORLOG_RESULT, - +#endif RESULT_TYPE_COUNT } GTM_ResultType; @@ -221,7 +224,7 @@ typedef enum GTM_ResultType */ typedef struct GTM_ProxyMsgHeader { - GTMProxy_ConnID ph_conid; + GTMProxy_ConnID ph_conid; } GTM_ProxyMsgHeader; #endif From 008ba88f444d5b07685979260309609fbf6762bd Mon Sep 17 00:00:00 2001 From: anthonyyan Date: Tue, 27 Apr 2021 14:47:01 +0800 Subject: [PATCH 356/578] avoid handle remote handles when dn_handles or co_handles is NULL(merge request !297) http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087088693 --- src/backend/pgxc/pool/execRemote.c | 13 ++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 6 ++++++ src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 8f0e5ab9..e02f262f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7623,6 +7623,11 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) struct rusage start_r; struct timeval start_t; + if (!is_pgxc_handles_init()) + { + return true; + } + clean_nodes = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (NumCoords + NumDataNodes)); cancel_dn_list = (int*)palloc(sizeof(int) * NumDataNodes); cancel_co_list = (int*)palloc(sizeof(int) * NumCoords); @@ -12859,8 +12864,14 @@ void SetCurrentHandlesReadonly(void) { int i = 0; PGXCNodeHandle *conn = NULL; - PGXCNodeAllHandles *handles = get_current_handles(); + PGXCNodeAllHandles *handles = NULL; + if (!is_pgxc_handles_init()) + { + return; + } + + handles = get_current_handles(); for (i = 0; i < handles->dn_conn_count; i++) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c7c630fa..6db7d43b 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5860,6 +5860,12 @@ is_ddl_leader_cn(char *first_cn) return strcmp(first_cn, PGXCNodeName) == 0; } + +inline bool +is_pgxc_handles_init() +{ + return (dn_handles != NULL && co_handles != NULL); +} #endif /* diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index be08deab..e5f9c6e1 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -297,7 +297,7 @@ inline bool is_ddl_leader_cn(char *leader_cn); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); -void 
CheckInvalidateRemoteHandles(void); +extern bool is_pgxc_handles_init(void); #endif #ifdef __AUDIT__ From 724c50e44a2398dbbbf81377d45fe654ebf6158c Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 21 Apr 2021 15:30:08 +0800 Subject: [PATCH 357/578] fix update with returning clause on partitioned table --- src/backend/executor/nodeModifyTable.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 3d6b9769..83042aac 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2275,6 +2275,11 @@ ExecModifyTable(PlanState *pstate) { subplanstate = node->partplans[node->part_whichplan]; part_resultRelInfo = resultRelInfo->part_relinfo[node->part_whichplan]; + /* when use update ... returning this fuction will be reentered, + * so the execution should ues the last state of part_resultRelInfo + * */ + junkfilter = resultRelInfo->ri_junkFilter; + estate->es_result_relation_info = part_resultRelInfo; } else { From 46179ceeefb5d3ecd4ce65b069777643b299dc58 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Mon, 26 Apr 2021 20:55:21 +0800 Subject: [PATCH 358/578] Fix several bugs when pull up or sublinks. 1.Fix if one of two Exists contains sublinks itself and is not related (not pullup). 2.Fix if some qual of one Exists constains only local vars (can pullup). 3.Fix if some qual of one Exists constains only upper vars (not pullup). 4.Fix if some qual of one Exists is too complicated to convert (not pullup). Add regress. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087122695 --- src/backend/optimizer/plan/subselect.c | 30 +++- src/backend/optimizer/prep/prepjointree.c | 2 +- src/test/regress/expected/subselect.out | 192 ++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 97 +++++++++++ 4 files changed, 311 insertions(+), 10 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 13150602..61647167 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -3029,7 +3029,7 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis } if (list_length(new_args) == 1) { - return (Node *)list_head(new_args); + return (Node *)linitial(new_args); } else if (list_length(new_args) == 0) { @@ -3044,7 +3044,7 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis Var *var; vars = pull_vars_of_level(node, 0); - + /* only support upper_var = local_var */ Assert(list_length(vars) == 1); *targetList = lappend(*targetList, lfirst(vars->head)); @@ -3584,30 +3584,39 @@ check_or_exist_qual_pullupable(PlannerInfo *root, Node *node) } else { - List *vars = pull_vars_of_level(node, 1); - if (vars == NIL) + bool result = false; + + if (pull_vars_of_level(node, 1) == NIL) return true; + /* If upper_var, only support upper_var = local_var */ + if (pull_vars_of_level(node, 0) == NIL) + return false; if (IsA(node, OpExpr)) { HeapTuple opertup; Form_pg_operator operform; char *oprname; - OpExpr *expr = (OpExpr *)node; + if (list_length(expr->args) != 2 || + !IsA(linitial(expr->args), Var) || + !IsA(llast(expr->args), Var)) + { + return false; + } + opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(expr->opno)); if (!HeapTupleIsValid(opertup)) return false; operform = (Form_pg_operator)GETSTRUCT(opertup); oprname = NameStr(operform->oprname); - + /* only support simple equal */ + result = (strcmp(oprname, "=") == 0); 
ReleaseSysCache(opertup); - if (strcmp(oprname, "=") == 0 && list_length(expr->args) == 2) - return true; } - return false; + return result; } return true; } @@ -3625,6 +3634,9 @@ bool check_or_exist_sublink_pullupable(PlannerInfo *root, Node *node) if (subselect->cteList) return false; + if (subselect->hasSubLinks) + return false; + if (!simplify_EXISTS_query(root, subselect)) return false; diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 93ceb77e..45e03eb6 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -528,7 +528,7 @@ pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, Node **jtlink, N } else { - BoolExpr *expr = (BoolExpr *)(*or_clause); + BoolExpr *expr = (BoolExpr *)(*orquals); if (expr->boolop == OR_EXPR) { *orquals = (Node *)make_andclause(list_make2(*orquals, diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c68869c3..32ed8e4f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2222,3 +2222,195 @@ drop table notin_t1; drop table notin_t2; drop function explain_sq_limit(); drop table sq_limit; +-- pull up or sublinks +set enable_pullup_subquery to on; +create table coltest( + c1 int, + c2 bigint, + c3 int2, + c4 bool, + c5 name, + c6 float4, + c7 float8, + c9 numeric, + c10 text, + c11 char(100), + c12 varchar, + c13 money, + c14 date, + c15 timestamp, + c16 timestamp with time zone, + c17 time, + c18 time with time zone, + c19 interval, + c20 abstime, + c21 reltime, + c22 tinterval, + c23 box, + c24 line, + c25 path, + c26 point, + c27 lseg, + c28 polygon, + c29 circle, + c30 inet, + c31 macaddr +); +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6)) + or + (EXISTS ( + select ref_2.c3 as c2 + from public.coltest as sample_3 + left join coltest as ref_2 on (true) + where (EXISTS ( + select sample_3.c1 as c1, ref_3.c11 as c7 + from public.coltest as ref_3 + where ref_3.c6 = sample_3.c6)))) +; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR $1) + InitPlan 2 (returns $1) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Semi Join + Join Filter: (sample_3.c6 = ref_3.c6) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest sample_3 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest ref_3 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest ref_2 + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_2 + Filter: (subq_2.c6 = c6) +(22 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where case when subq_2.c6 is NULL then sample_2.c14 else cast(null as date) end + = sample_2.c14)) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN 
+-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR (SubPlan 2)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_2 + Filter: (CASE WHEN (subq_2.c6 IS NULL) THEN c14 ELSE NULL::date END = c14) + SubPlan 2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_3 + Filter: (subq_2.c6 = c6) +(11 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and subq_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR (SubPlan 2)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Result + One-Time Filter: (subq_2.c10 = 'a'::text) + -> Seq Scan on coltest sample_2 + Filter: (subq_2.c6 = c6) + SubPlan 2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_3 + Filter: (subq_2.c6 = c6) +(13 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and sample_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (fake = subq_2.c6) + Filter: ((fake_1 IS NOT NULL) OR (fake_1 IS NOT NULL)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: fake + -> Finalize HashAggregate + Group Key: sample_3.c6 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Partial HashAggregate + Group Key: sample_3.c6 + -> Seq Scan on coltest sample_3 + -> Hash + -> Hash Left Join + Hash Cond: (subq_2.c6 = fake) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest subq_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: fake + -> Group + Group Key: sample_2.c6 + -> Sort + Sort Key: sample_2.c6 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Group + Group Key: sample_2.c6 + -> Sort + Sort Key: sample_2.c6 + -> Seq Scan on coltest sample_2 + Filter: (c10 = 'a'::text) +(34 rows) + +drop table coltest; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index f17f38f3..17f719d9 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -904,3 +904,100 @@ drop table notin_t2; drop function explain_sq_limit(); drop table sq_limit; + +-- pull up or sublinks +set enable_pullup_subquery to on; +create table coltest( + c1 int, + c2 bigint, + c3 int2, + c4 bool, + c5 name, + c6 float4, + c7 float8, + c9 numeric, + c10 text, + c11 char(100), + c12 varchar, + c13 money, + c14 date, + c15 timestamp, + c16 timestamp with time zone, + c17 
time, + c18 time with time zone, + c19 interval, + c20 abstime, + c21 reltime, + c22 tinterval, + c23 box, + c24 line, + c25 path, + c26 point, + c27 lseg, + c28 polygon, + c29 circle, + c30 inet, + c31 macaddr +); +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6)) + or + (EXISTS ( + select ref_2.c3 as c2 + from public.coltest as sample_3 + left join coltest as ref_2 on (true) + where (EXISTS ( + select sample_3.c1 as c1, ref_3.c11 as c7 + from public.coltest as ref_3 + where ref_3.c6 = sample_3.c6)))) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where case when subq_2.c6 is NULL then sample_2.c14 else cast(null as date) end + = sample_2.c14)) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and subq_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and sample_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +drop table coltest; From b529a4eef0d6336fae2db6f2173d70fc97e94428 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 19 Jan 2021 14:56:46 +0800 Subject: [PATCH 359/578] add switch for 2pc recovery file. 
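
The switch is a boolean GUC, enable_2pc_recovery_info, registered under
DEVELOPER_OPTIONS with postmaster context and enabled by default. When it is
turned off, the record_2pc_* and remove_2pc_records paths return early, so no
extra 2pc recovery records are written or removed.

A minimal sketch of disabling it (hypothetical postgresql.conf fragment; as a
postmaster-context GUC it only takes effect at server start):

    # postgresql.conf
    enable_2pc_recovery_info = off    # skip writing extra files for 2pc crash recovery
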
--- src/backend/access/transam/twophase.c | 30 ++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 9 ++++++++ src/include/access/twophase.h | 4 ++++ src/test/regress/expected/sysviews.out | 1 + 4 files changed, 44 insertions(+) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 37a041f6..2d1cbd4a 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -143,6 +143,9 @@ int max_prepared_xacts = 10000; /* We require 2PC */ #else int max_prepared_xacts = 0; #endif +#ifdef __TBASE__ +bool enable_2pc_recovery_info = true; +#endif static GlobalTransaction @@ -3148,6 +3151,12 @@ void record_2pc_redo_remove_gid_xid(TransactionId xid) int i; GlobalTransaction gxact = NULL; bool found = false; + + if(!enable_2pc_recovery_info) + { + return ; + } + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; @@ -3185,6 +3194,11 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif + if (!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record twophase txn gid: %s, startnode: %s, participants: %s", tid, startnode, nodestring); @@ -3333,6 +3347,11 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GlobalTransaction gxact = NULL; #endif + if (!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record twophase txn gid: %s, commit_timestamp: %ld", tid, commit_timestamp); @@ -3433,6 +3452,12 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; + + if (!enable_2pc_recovery_info) + { + return ; + } + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); /* no need to check file exists. 
since when it do not exists , unlink won't success */ @@ -3455,6 +3480,11 @@ void record_2pc_readonly(const char *gid) char path[MAXPGPATH]; char content[10] = "readonly"; + if(!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record readonly twophase txn gid: %s", gid); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d1bf813f..d7c85782 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2552,6 +2552,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_recovery_info", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("write extra file for 2pc crash recovery."), + NULL + }, + &enable_2pc_recovery_info, + true, + NULL, NULL, NULL + }, #endif #ifdef __AUDIT__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 8a4831b0..cbf83a3e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -92,6 +92,10 @@ extern int max_prepared_xacts; extern int transaction_threshold; +#ifdef __TBASE__ +extern bool enable_2pc_recovery_info; +#endif + extern Size TwoPhaseShmemSize(void); extern void TwoPhaseShmemInit(void); diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index c3f67c12..9c939d43 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -72,6 +72,7 @@ select count(*) >= 0 as ok from pg_prepared_xacts; select name, setting from pg_settings where name like 'enable%'; name | setting -----------------------------------+--------- + enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off enable_auditlogger_warning | off From e504107243f55360c7a3f5aeb2936ff841b527d5 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 28 Apr 2021 21:36:20 +0800 Subject: [PATCH 360/578] 2pc files opt: add 2pc hash table on shmem (merge request !300) --- contrib/pg_clean/pg_clean.c | 353 +++----- src/backend/access/transam/twophase.c | 1054 +++++++++++++++++++++--- src/backend/access/transam/xlog.c | 28 +- src/backend/storage/ipc/ipci.c | 7 + src/backend/utils/misc/guc.c | 66 ++ src/include/access/twophase.h | 18 + src/include/catalog/pg_control.h | 368 ++++----- src/test/regress/expected/sysviews.out | 6 +- 8 files changed, 1330 insertions(+), 570 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 8d1514f4..459a2fc0 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -303,6 +303,8 @@ void getTxnInfoOnOtherNodes(txn_info *txn); int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); +char *get2PCInfo(const char *tid); + void getTxnStatus(txn_info * txn, int node_idx); void recover2PCForDatabaseAll(void); void recover2PCForDatabase(database_info * db_info); @@ -1615,23 +1617,28 @@ void getTxnStatus(txn_info *txn, int node_idx) DropTupleTableSlots(&result); } -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +char *get2PCInfo(const char *tid) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - text *t_result = NULL; + char *result = NULL; + char *info = NULL; + int size = 0; + File fd = -1; + int ret = -1; struct stat filestate; - off_t fileSize; + char path[MAXPGPATH]; - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + info = get_2pc_info_from_cache(tid); + if (NULL != info) + { + 
size = strlen(info); + result = (char *)palloc0(size + 1); + memcpy(result, info, size); + return result; + } - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); if(access(path, F_OK) == 0) { if(stat(path, &filestate) == -1) @@ -1641,39 +1648,56 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) errmsg("could not get status of file \"%s\"", path))); } - fileSize = filestate.st_size; + size = filestate.st_size; - if (0 == fileSize) + if (0 == size) { - PG_RETURN_NULL(); + return NULL; } - result = (char *)palloc0(fileSize + 1); + result = (char *)palloc0(size + 1); fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); if (fd < 0) { + pfree(result); ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" for read", path))); } - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) { + pfree(result); ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\"", path))); } FileClose(fd); - if (result) + return result; + } + + return NULL; +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) { t_result = cstring_to_text(result); + pfree(result); return PointerGetDatum(t_result); } - } PG_RETURN_NULL(); } @@ -1682,63 +1706,26 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *nodename; + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { nodename = strstr(result, GET_NODE); - if (nodename) + if (NULL != nodename) { nodename += strlen(GET_NODE); nodename = strtok(nodename, "\n"); t_result = cstring_to_text(nodename); + pfree(result); return PointerGetDatum(t_result); } } - } + PG_RETURN_NULL(); } @@ -1746,61 +1733,24 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *nodename; + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = 
text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { nodename = strstr(result, GET_START_NODE); - if (nodename) + if (NULL != nodename) { nodename += strlen(GET_START_NODE); nodename = strtok(nodename, "\n"); t_result = cstring_to_text(nodename); + pfree(result); return PointerGetDatum(t_result); - } + } } PG_RETURN_NULL(); @@ -1810,63 +1760,25 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *startxid; + char *tid = NULL; + char *result = NULL; + char *startxid = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { startxid = strstr(result, GET_START_XID); - if (startxid) + if (NULL != startxid) { startxid += strlen(GET_START_XID); startxid = strtok(startxid, "\n"); t_result = cstring_to_text(startxid); + pfree(result); return PointerGetDatum(t_result); } } - } PG_RETURN_NULL(); } @@ -1875,63 +1787,25 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *commit_timestamp; + char *tid = NULL; + char *result = NULL; + char *commit_timestamp = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file 
\"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); - if (commit_timestamp) + if (NULL != commit_timestamp) { commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); commit_timestamp = strtok(commit_timestamp, "\n"); t_result = cstring_to_text(commit_timestamp); + pfree(result); return PointerGetDatum(t_result); } } - } PG_RETURN_NULL(); } @@ -1941,61 +1815,24 @@ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; + char *tid = NULL; + char *result = NULL; + char *str_xid = NULL; GlobalTransactionId xid; - char *result; - char *str_xid; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) + result = get2PCInfo(tid); + if (NULL != result) { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - str_xid = strstr(result, GET_XID); - if (str_xid) + if (NULL != str_xid) { str_xid += strlen(GET_XID); str_xid = strtok(str_xid, "\n"); xid = strtoul(str_xid, NULL, 10); + pfree(result); PG_RETURN_UINT32(xid); } - } PG_RETURN_NULL(); } @@ -2004,15 +1841,9 @@ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { -#define SLEEP_COUNT 1000 - char *tid = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); remove_2pc_records(tid, true); - pfree(tid); - PG_RETURN_BOOL(true); } @@ -2181,11 +2012,27 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) char *recordList = NULL; text *t_recordList = NULL; + /* get from hash table */ + recordList = get_2pc_list_from_cache(&count); + if (count >= MAXIMUM_OUTPUT_FILE) + { + Assert(NULL != recordList); + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + /* get from disk */ if(!(dir = opendir(TWOPHASE_RECORD_DIR))) { + if(NULL == recordList) + { PG_RETURN_NULL(); } + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + while((ptr = readdir(dir)) != NULL) { if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) @@ -2193,7 +2040,9 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) continue; } if (count >= MAXIMUM_OUTPUT_FILE) + { break; + } if(!recordList) { diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 2d1cbd4a..46570383 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -130,12 +130,13 @@ #define TWOPHASE_DIR "pg_twophase" 
#define TWOPHASE_RECORD_DIR "pg_2pc" + +#define GET_2PC_FILE_PATH(path, tid) \ + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid) + int transaction_threshold = 200000; -#define GET_START_XID "startxid:" -#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" + #define GET_START_NODE "startnode:" -#define GET_NODE "nodes:" -#define GET_XID "xid:" /* GUC variable, can't be changed after startup */ #ifdef PGXC @@ -147,6 +148,38 @@ int max_prepared_xacts = 0; bool enable_2pc_recovery_info = true; #endif +#ifdef __TWO_PHASE_TRANS__ +static HTAB *record_2pc_cache = NULL; + +bool enable_2pc_file_cache = true; +bool enable_2pc_file_check = true; +bool enable_2pc_entry_key_check = true; +bool enable_2pc_entry_trace = false; + +int record_2pc_cache_size = 50000; +int record_2pc_entry_size = 2048; +int record_2pc_partitions = 32; + +#define MAX_OUTPUT_FILE 1000 + +#define MAX_TID_SIZE MAXPGPATH +#define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) +#define DFLT_2PC_INFO_SIZE 1024 /* default size */ + +/* hash table entry for 2pc record */ +typedef struct Cache2pcInfo +{ + char key[MAX_TID_SIZE]; /* hash key: tid */ + char info[DFLT_2PC_INFO_SIZE]; + +} Cache2pcInfo; + +inline void +check_entry_key(const char *tid, const char *key, const char *func); + +void +check_2pc_file(const char *tid, const char *info, const char *func); +#endif static GlobalTransaction LookupGXact(const char *gid, Oid user); @@ -2107,6 +2140,12 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { remove_2pc_records(gid, false); } + else + { + /* rename 2pc file when rollback on the current node */ + rename_2pc_records(gid, 0); + } + ClearLocalTwoPhaseState(); if (isCommit) @@ -2263,6 +2302,17 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int i; int serialized_xacts = 0; +#ifdef __TWO_PHASE_TRANS__ + File fd = -1; + int ret = 0; + int size = 0; + Cache2pcInfo *entry = NULL; + bool found = false; + char path[MAXPGPATH]; +#endif + + elog(LOG, "[CheckPointTwoPhase] checkpoint: "UINT64_FORMAT, redo_horizon); + if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2300,17 +2350,167 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) char *buf; int len; + /* save to pg_twophase */ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); RecreateTwoPhaseFile(gxact->xid, buf, len); + pfree(buf); + +#ifdef __TWO_PHASE_TRANS__ + /* save to pg_2pc */ + if (NULL != record_2pc_cache) + { + Assert(strlen(gxact->gid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gxact->gid, HASH_FIND, &found); + if (found) + { + /* save to file */ + Assert(NULL != entry); + check_entry_key(gxact->gid, entry->key, "CheckPointTwoPhase"); + check_2pc_file(gxact->gid, entry->info, "CheckPointTwoPhase"); + + elog(LOG, "[CheckPointTwoPhase] %s is found " + "in hash table", gxact->gid); + + size = strlen(entry->info); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, gxact->gid); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " + "errMsg: %s", path, strerror(errno)); + } + + ret = write(fd, entry->info, size); + if(ret != size) + { + close(fd); + elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gxact->gid, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[CheckPointTwoPhase] %s is not found " 
+ "in hash table when remove it", gxact->gid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[CheckPointTwoPhase] %s is removed " + "from hash table", gxact->gid); + } + } + else + { + elog(LOG, "[CheckPointTwoPhase] %s is not found " + "in hash table", gxact->gid); + } + } +#endif + gxact->ondisk = true; gxact->prepare_start_lsn = InvalidXLogRecPtr; gxact->prepare_end_lsn = InvalidXLogRecPtr; - pfree(buf); serialized_xacts++; } } LWLockRelease(TwoPhaseStateLock); +#ifdef __TWO_PHASE_TRANS__ + /* start node maybe no in prepared xacts */ + if (IS_PGXC_COORDINATOR && NULL != record_2pc_cache) + { + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + char *start_node = NULL; + char info[MAX_2PC_INFO_SIZE]; + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + check_2pc_file(entry->key, entry->info, "CheckPointTwoPhase"); + + elog(LOG, "[CheckPointTwoPhase] key %s is found " + "in hash table", entry->key); + + if (IsXidImplicit(entry->key)) + { + memset(info, 0, MAX_2PC_INFO_SIZE); + memcpy(info, entry->info, strlen(entry->info)); + + start_node = strstr(info, GET_START_NODE); + if (NULL != start_node) + { + start_node += strlen(GET_START_NODE); + start_node = strtok(start_node, "\n"); + + if (0 != strcmp(start_node, PGXCNodeName)) + { + elog(LOG, "[CheckPointTwoPhase] %s start node is not %s", + entry->key, PGXCNodeName); + continue; + } + else + { + elog(LOG, "[CheckPointTwoPhase] %s start node is %s", + entry->key, PGXCNodeName); + } + } + else + { + elog(WARNING, "[CheckPointTwoPhase] %s get start node failed, " + "info: %s", entry->key, entry->info); + } + } + + size = strlen(entry->info); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, entry->key); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " + "errMsg: %s", path, strerror(errno)); + } + + ret = write(fd, entry->info, size); + if(ret != size) + { + close(fd); + elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + entry->key, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[CheckPointTwoPhase] %s is not found " + "in hash table when remove it", entry->key); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[CheckPointTwoPhase] %s is removed " + "from hash table", entry->key); + } + } + } +#endif + /* * Flush unconditionally the parent directory to make any information * durable on disk. Two-phase files could have been removed and those @@ -3146,6 +3346,117 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) } #ifdef __TWO_PHASE_TRANS__ +/* + * Check the entry key in the hash table is same with tid. + */ +inline void check_entry_key(const char *tid, const char *key, const char *func) +{ + if (!enable_2pc_entry_key_check) + { + return; + } + + if (0 != strcmp(tid, key)) + { + elog(PANIC, "[%s] %s get wrong key: %s", func, tid, key); + } +} + +/* + * Check whether the 2pc file is exist when it is saved in the hash table. 
+ */ +void check_2pc_file(const char *tid, const char *info, const char *func) +{ + if (enable_2pc_file_check) + { + int size = 0; + struct stat filestate; + char path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; + + Assert (NULL != tid); + Assert (NULL != info); + Assert (NULL != func); + + GET_2PC_FILE_PATH(path, tid); + if (0 != access(path, F_OK)) + { + return; + } + + elog(LOG, "[check_2pc_file][%s] node(%s) found file %s", + func, PGXCNodeName, path); + + if(stat(path, &filestate) == -1) + { + elog(ERROR, "[check_2pc_file][%s] could not get status of file %s", + func, path); + } + + size = filestate.st_size; + + if (0 != size) + { + int ret = 0; + File fd = -1; + char result[size + 1]; + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[check_2pc_file][%s] could not open file %s for read", + func, path); + } + + memset(result, 0, size +1); + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + FileClose(fd); + elog(ERROR, "[check_2pc_file][%s] read %s error, ret: %d, size: %d", + func, path, ret, size); + } + FileClose(fd); + + if (0 != strcmp(result, info)) + { + elog(LOG, "[check_2pc_file][%s] file %s result: %s, info: %s", + func, path, result, info); + } + } + else + { + elog(LOG, "[check_2pc_file][%s] get empty file %s, info: %s", + func, path, info); + } + + if (NULL == record_2pc_cache) + { + elog(LOG, "[check_2pc_file][%s] record_2pc_cache is NULL, " + "tid: %s, info: %s", func, tid, info); + return; + } + + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (!found) + { + elog(LOG, "[check_2pc_file][%s] %s is not found " + "in hash table, info: %s", func, tid, info); + return; + } + + Assert (NULL != entry); + + if (0 != strcmp(entry->info, info)) + { + elog(LOG, "[check_2pc_file][%s] %s info change from '%s' to '%s'", + func, tid, info, entry->info); + } + } +} + void record_2pc_redo_remove_gid_xid(TransactionId xid) { int i; @@ -3190,6 +3501,8 @@ void record_2pc_involved_nodes_xid(const char * tid, char path[MAXPGPATH]; off_t fileSize; char *result = NULL; + Cache2pcInfo *entry = NULL; + bool found = false; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; #endif @@ -3199,31 +3512,65 @@ void record_2pc_involved_nodes_xid(const char * tid, return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record twophase txn gid: %s, startnode: %s, participants: %s", tid, startnode, nodestring); + elog(LOG, "[record_2pc_involved_nodes_xid] record %s, " + "startnode: %s, participants: %s", + tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "record twophase txn GID is empty"); + elog(ERROR, "[record_2pc_involved_nodes_xid] gid is empty"); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "record twophase txn gid: %s, startnode is empty", tid); + elog(PANIC, "[record_2pc_involved_nodes_xid] %s startnode is empty", tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "record twophase txn gid: %s, participants is empty", tid); + elog(PANIC, "[record_2pc_involved_nodes_xid] %s participants is empty", tid); } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + initStringInfo(&content); + appendStringInfo(&content, "startnode:%s\n", startnode); + appendStringInfo(&content, "startxid:%u\n", startxid); + appendStringInfo(&content, "nodes:%s\n", nodestring); + appendStringInfo(&content, 
"xid:%u\n", xid); + size = content.len; + + Assert(size == strlen(content.data)); /* if in_pg_clean, then check whether the file exists */ if (g_twophase_state.in_pg_clean) { + /* if tid already exists, check content and return */ + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); + check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + + if (strncmp(entry->info, content.data, size) != 0) + { + elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " + "write %s info conflict, content: %s, info: %s", + tid, content.data, entry->info); + } + + resetStringInfo(&content); + pfree(content.data); + return; + } + } + + GET_2PC_FILE_PATH(path, tid); + /* if file already exists, check content and return */ if (stat(path, &fst) >= 0) { @@ -3235,40 +3582,111 @@ void record_2pc_involved_nodes_xid(const char * tid, { ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); + errmsg("[record_2pc_involved_nodes_xid] could not " + "open file %s for read", path))); } ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); if(ret != fileSize) { + FileClose(fd); ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); + errmsg("[record_2pc_involved_nodes_xid] could not " + "read file %s, ret: %d", path, ret))); } - FileClose(fd); - if (result) + + Assert(NULL != result); + + if (strncmp(result, content.data, size) != 0) { - initStringInfo(&content); - appendStringInfo(&content, "startnode:%s\n", startnode); - appendStringInfo(&content, "startxid:%u\n", startxid); - appendStringInfo(&content, "nodes:%s\n", nodestring); - appendStringInfo(&content, "xid:%u\n", xid); - if (strncmp(result, content.data, content.len) != 0) - { - elog(ERROR, "pg_clean attemp to write 2pc file conflict with file '%s', " - "attemp to write startnode: %s, startxid: %u, " - "nodestring: %s, xid: %u", tid, startnode, startxid, nodestring, xid); - } - else - { - resetStringInfo(&content); - pfree(content.data); - return; - } + elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " + "write %s info conflict, content: %s, info: %s", + tid, content.data, result); } + + pfree(result); + + resetStringInfo(&content); + pfree(content.data); + return; } } + if (!RecoveryInProgress()) + { + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)startnode, strlen(startnode) + 1); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); +#ifdef __TWO_PHASE_TESTS__ + xlogrec = +#endif + XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); +#ifdef __TWO_PHASE_TESTS__ + if (PART_PREPARE_AFTER_RECORD_2PC == twophase_exception_case && + g_twophase_state.is_start_node) + { + XLogFlush(xlogrec); + run_pg_clean = 1; + complish = true; + elog(STOP, "[record_2pc_involved_nodes_xid] twophase exception: " + "simulate kill start node after record 2pc file"); + } +#endif + } + + if (NULL != record_2pc_cache && size < MAX_2PC_INFO_SIZE) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_ENTER_NULL, &found); + if (NULL != entry) + { + check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); 
+ check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + + if (found) + { + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " + "in hash table in recovery mode", tid); + } + else + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " + "in hash table", tid); + } + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is added " + "to hash table", tid); + } + + memset(entry->info, 0, MAX_2PC_INFO_SIZE); + memcpy(entry->info, content.data, size); + + resetStringInfo(&content); + pfree(content.data); + return; + } + else + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s entry is NULL", tid); + } + } + else if (NULL != record_2pc_cache) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s size: %d, " + "max info size: %d", tid, size, MAX_2PC_INFO_SIZE); + } + + GET_2PC_FILE_PATH(path, tid); + /* * we open 2pc file under the following two different situations: * a. if in recovery mode, @@ -3287,49 +3705,23 @@ void record_2pc_involved_nodes_xid(const char * tid, } if (fd < 0) { - elog(ERROR, "could not create 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_involved_nodes_xid] could not create file %s, " + "errMsg: %s", path, strerror(errno)); return; } - initStringInfo(&content); - appendStringInfo(&content, "startnode:%s\n", startnode); - appendStringInfo(&content, "startxid:%u\n", startxid); - appendStringInfo(&content, "nodes:%s\n", nodestring); - appendStringInfo(&content, "xid:%u\n", xid); - size = strlen(content.data); ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { - elog(ERROR, "could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + FileClose(fd); + elog(ERROR, "[record_2pc_involved_nodes_xid] could not write file %s, " + "errMsg: %s, ret: %d, content: %s", + path, strerror(errno), ret, content.data); } - resetStringInfo(&content); - pfree(content.data); FileClose(fd); - if (!RecoveryInProgress()) - { - XLogBeginInsert(); - XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)startnode, strlen(startnode)+1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); - XLogRegisterData((char *)nodestring, strlen(nodestring)+1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); -#ifdef __TWO_PHASE_TESTS__ - xlogrec = -#endif - XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); -#ifdef __TWO_PHASE_TESTS__ - if (PART_PREPARE_AFTER_RECORD_2PC == twophase_exception_case && - g_twophase_state.is_start_node) - { - XLogFlush(xlogrec); - run_pg_clean = 1; - complish = true; - elog(STOP, "twophase exception: simulate kill start node after record 2pc file"); - } -#endif - } - + resetStringInfo(&content); + pfree(content.data); } /* record commit timestamp in 2pc file while twophase trans failed in commit phase in the current node */ @@ -3338,10 +3730,13 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta char path[MAXPGPATH]; char file_content[2048]; StringInfoData content; - File fd; - int ret; - int size; + File fd = -1; + int ret = 0; + int size = 0; + int new_size = 0; XLogRecPtr xlogrec = 0; + Cache2pcInfo *entry = NULL; + bool found = false; #if 0 int i; GlobalTransaction gxact = NULL; @@ -3352,30 +3747,146 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record twophase txn gid: %s, commit_timestamp: 
%ld", tid, commit_timestamp); + elog(LOG, "[record_2pc_commit_timestamp] %s commit_timestamp: " + INT64_FORMAT, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && (TWO_PHASE_COMMITTING == g_twophase_state.state || TWO_PHASE_COMMIT_END == g_twophase_state.state)) { - elog(ERROR, "can not commit transaction'%s' on node '%s' with InvalidGlobalTimestamp", tid, PGXCNodeName); + elog(ERROR, "[record_2pc_commit_timestamp] could not commit " + "transaction '%s' on node '%s' with InvalidGlobalTimestamp", + tid, PGXCNodeName); } + if (!RecoveryInProgress()) + { + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); + /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ + if (IS_PGXC_LOCAL_COORDINATOR) + { + XLogFlush(xlogrec); + SyncRepWaitForLSN(xlogrec, false); + } + } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + initStringInfo(&content); + appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", + commit_timestamp); + size = strlen(content.data); - /* the 2pc file exists already */ - fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR);//PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "record_2pc_commit_timestamp"); + check_2pc_file(tid, entry->info, "record_2pc_commit_timestamp"); + + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_commit_timestamp] %s is found " + "in hash table in recovery mode", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_commit_timestamp] %s is found " + "in hash table", tid); + } + + new_size = size + strlen(entry->info); + + if (new_size >= MAX_2PC_INFO_SIZE) + { + /* save to file */ + elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " + "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); + + GET_2PC_FILE_PATH(path, tid); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - if (enable_distri_print) + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); + } + else { - elog(LOG, "cannot open 2pc file %s", tid); + elog(ERROR, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); + } + return; } + + ret = write(fd, entry->info, strlen(entry->info)); + if(ret != new_size) + { + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write " + "file %s, errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + ret = write(fd, content.data, size); + if(ret != new_size) + { + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write " + "file %s, errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, content.data); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[record_2pc_commit_timestamp] %s is not found" + "in hash table when remove it", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_commit_timestamp] %s 
is removed " + "from hash table", entry->key); + } + + resetStringInfo(&content); + pfree(content.data); + return; + } + + /* save to hash table */ + memcpy(entry->info + strlen(entry->info), content.data, size); + + resetStringInfo(&content); + pfree(content.data); + return; + } + else + { + elog(LOG, "[record_2pc_commit_timestamp] %s is not found " + "in hash table", tid); + } + } + + GET_2PC_FILE_PATH(path, tid); + + /* the 2pc file exists already */ + fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + if (fd < 0) + { if (RecoveryInProgress()) { #if 0 @@ -3388,109 +3899,242 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (0 == strcmp(gxact->gid, tid)) { - elog(ERROR, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); } } #endif - elog(LOG, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(LOG, "[record_2pc_commit_timestamp] could not open file %s, " + "errMsg: %s", path, strerror(errno)); } else { - elog(ERROR, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_commit_timestamp] could not open file %s, " + "errMsg: %s", path, strerror(errno)); } return; } - if (!RecoveryInProgress()) - { - XLogBeginInsert(); - XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); - xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); - /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ - if (IS_PGXC_LOCAL_COORDINATOR) - { - XLogFlush(xlogrec); - SyncRepWaitForLSN(xlogrec, false); - } - } - if (enable_distri_print) { - (void) read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "before append 2pc file: %s, file_content: %s", tid, file_content); + memset(file_content, 0, 2048); + ret = read(fd, file_content, 2048); + elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " + "file_content: %s, content.data: %s, ret: %d", + path, file_content, content.data, ret); } - initStringInfo(&content); - appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", commit_timestamp); - size = strlen(content.data); - if (enable_distri_print) - { - elog(LOG, "before append 2pc file: %s, content.data: %s", tid, content.data); - } ret = write(fd, content.data, size); if(ret != size) { - if (enable_distri_print) - { - elog(LOG, "cannot append timestamp to 2pc file %s", tid); - } - elog(ERROR, "in could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " + "errMsg: %s", path, strerror(errno)); } + if (enable_distri_print) { memset(file_content, 0, 2048); lseek(fd, 0, SEEK_SET); ret = read(fd, file_content, 2048); - elog(LOG, "after append 2pc file: %s, file_content: %s, ret = %d", tid, file_content, ret); + elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " + "file_content: %s, ret: %d", tid, file_content, ret); } + + close(fd); + resetStringInfo(&content); pfree(content.data); - close(fd); } void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; if (!enable_2pc_recovery_info) { 
return ; } - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + if (enable_distri_print || enable_2pc_entry_trace) + { + elog(LOG, "[remove_2pc_records] %s record_in_xlog: %d", + tid, record_in_xlog); + } - /* no need to check file exists. since when it do not exists , unlink won't success */ if (!RecoveryInProgress() && record_in_xlog) { + char *type = "remove"; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid)+1); + XLogRegisterData((char *)type, strlen(type) + 1); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } + + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + if (enable_2pc_entry_key_check) + { + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "remove_2pc_records"); + check_2pc_file(tid, entry->info, "remove_2pc_records"); + } + } + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (found) + { + Assert(NULL != entry); + if (enable_2pc_entry_trace) + { + elog(LOG, "[remove_2pc_records] %s is removed " + "from hash table", tid); + } + return; + } + } + + GET_2PC_FILE_PATH(path, tid); + + /* + * no need to check file exists. + * since when it do not exists, unlink won't success. + */ if (0 != unlink(path)) { - elog(LOG, "node: %s fail to remove 2pc file: %s", PGXCNodeName, tid); + elog(LOG, "[remove_2pc_records] could not unlink file %s, " + "errMsg: %s", path, strerror(errno)); } } +void rename_2pc_records(const char *tid, TimestampTz timestamp) +{ + char path[MAXPGPATH]; + char new_path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; + File fd = 0; + int ret = 0; + + if (!enable_2pc_recovery_info) + { + return; + } + + if (enable_distri_print || enable_2pc_entry_trace) + { + elog(LOG, "[rename_2pc_records] %s timestamp: " + INT64_FORMAT, tid, timestamp); + } + + if (0 == timestamp) + { + timestamp = GetCurrentTimestamp(); + } + + if (!RecoveryInProgress()) + { + char *type = "rename"; + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)type, strlen(type) + 1); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); + XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); + } + + GET_2PC_FILE_PATH(path, tid); + snprintf(new_path, MAXPGPATH, "%s." 
INT64_FORMAT ".rollback", path, timestamp); + + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "rename_2pc_records"); + check_2pc_file(tid, entry->info, "rename_2pc_records"); + + fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[rename_2pc_records] could not create file %s, " + "errMsg: %s", new_path, strerror(errno)); + } + + ret = FileWrite(fd, entry->info, strlen(entry->info), + WAIT_EVENT_BUFFILE_WRITE); + if(ret != strlen(entry->info)) + { + FileClose(fd); + elog(ERROR, "[rename_2pc_records] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + FileClose(fd); + + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (!found) + { + elog(ERROR, "[rename_2pc_records] %s is not found " + "in hash table when remove it", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[rename_2pc_records] %s is removed " + "from hash table", tid); + } + return; + } + } + + if (0 != access(path, F_OK)) + { + elog(LOG, "[rename_2pc_records] could not access file %s, " + "errMsg: %s", path, strerror(errno)); + return; + } + if (0 != link(path, new_path)) + { + elog(ERROR, "[rename_2pc_records] could not link file %s to %s, " + "errMsg: %s", path, new_path, strerror(errno)); + } + if (0 != unlink(path)) + { + elog(WARNING, "[rename_2pc_records] could not unlink file %s, " + "errMsg: %s", path, strerror(errno)); + } +} + void record_2pc_readonly(const char *gid) { File fd = 0; int ret = 0; char path[MAXPGPATH]; char content[10] = "readonly"; + Cache2pcInfo *entry = NULL; + bool found = false; if(!enable_2pc_recovery_info) { return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record readonly twophase txn gid: %s", gid); + elog(LOG, "[record_2pc_readonly] %s is readonly", gid); } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", gid); if (!RecoveryInProgress()) { @@ -3500,6 +4144,45 @@ void record_2pc_readonly(const char *gid) XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); } + if (NULL != record_2pc_cache) + { + Assert(strlen(gid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gid, HASH_ENTER_NULL, &found); + if (NULL != entry) + { + check_entry_key(gid, entry->key, "record_2pc_readonly"); + check_2pc_file(gid, entry->info, "record_2pc_readonly"); + + if (found) + { + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_readonly] %s is found " + "in hash table in recovery mode", gid); + } + else + { + elog(LOG, "[record_2pc_readonly] %s is found " + "in hash table", gid); + } + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_readonly] %s is added " + "to hash table", gid); + } + memcpy(entry->info, content, strlen(content)); + return; + } + else + { + elog(LOG, "[record_2pc_readonly] %s entry is NULL", gid); + } + } + + /* the 2pc dir is already created in initdb */ + GET_2PC_FILE_PATH(path, gid); /* * we open 2pc file under the following two different situations: @@ -3507,7 +4190,8 @@ void record_2pc_readonly(const char *gid) * the existed 2pc file can be trucated and reused. * b. 
if not under recovery progress, * we not allowed the implicit trans gid existed, - * since the xid in startnode should not be truncate if the twophase trans is part commit or part abort. + * since the xid in startnode should not be truncate if the + * twophase trans is part commit or part abort. */ if (RecoveryInProgress()) { @@ -3519,18 +4203,142 @@ void record_2pc_readonly(const char *gid) } if (fd < 0) { - elog(ERROR, "could not create readonly 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_readonly] could not create file %s, " + "errMsg: %s", path, strerror(errno)); return; } ret = FileWrite(fd, content, strlen(content), WAIT_EVENT_BUFFILE_WRITE); if(ret != strlen(content)) { - elog(ERROR, "could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + FileClose(fd); + elog(ERROR, "[record_2pc_readonly] could not write file %s, " + "errMsg: %s, ret: %d, content: %s", + path, strerror(errno), ret, content); } FileClose(fd); +} +/* + * Get 2pc info from hash table. + */ +char *get_2pc_info_from_cache(const char *tid) +{ + Cache2pcInfo *entry = NULL; + bool found = false; + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + + check_entry_key(tid, entry->key, "get_2pc_info_from_cache"); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[get_2pc_info_from_cache] %s is found " + "in hast table, key: %s, info: %s", + tid, entry->key, entry->info); } -#endif + return entry->info; + } + if (enable_2pc_entry_trace) + { + elog(LOG, "[get_2pc_info_from_cache] %s is not found " + "in hast table", tid); + } + } + return NULL; +} + +/* + * Get 2pc list from hash table. + */ +char *get_2pc_list_from_cache(int *count) +{ + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + char *recordList = NULL; + + if (NULL == record_2pc_cache) + { + return NULL; + } + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + check_2pc_file(entry->key, entry->info, "get_2pc_list_from_cache"); + + if (NULL != count && *count >= MAX_OUTPUT_FILE) + { + break; + } + + if(NULL == recordList) + { + recordList = (char *)palloc0(strlen(entry->key) + 1); + sprintf(recordList, "%s", entry->key); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(entry->key) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, entry->key); + } + if (NULL != count) + { + (*count)++; + } + } + + return recordList; +} + +/* + * Initialize 2pc info cache using shared memory hash table. + */ +void +Record2pcCacheInit(void) +{ + HASHCTL info; + int flags = 0; + + if (!enable_2pc_file_cache) + { + record_2pc_cache = NULL; + return; + } + + info.keysize = MAX_TID_SIZE; + info.entrysize = record_2pc_entry_size; + info.num_partitions = record_2pc_partitions; + + flags = HASH_ELEM | HASH_PARTITION; + + record_2pc_cache = ShmemInitHash("Record 2pc Cache", + record_2pc_cache_size/4, record_2pc_cache_size, + &info, flags); +} + +/* + * Return 2pc info cache size. 
+ */ +Size +Record2pcCacheSize(void) +{ + long cache_size = 0; + if (enable_2pc_file_cache) + { + cache_size = (long)record_2pc_cache_size * record_2pc_entry_size; + } + return cache_size; +} + +#endif diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 18ed9b22..7044cd8b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10171,15 +10171,27 @@ xlog_redo(XLogReaderState *record) #ifdef __TWO_PHASE_TRANS__ else if (info == XLOG_CLEAN_2PC_FILE) { - char *gid; + char *pos = NULL; + char *gid = NULL; + char *type = NULL; + TimestampTz timestamp = 0; gid = XLogRecGetData(record); + type = gid + strlen(gid) + 1; + pos = type + strlen(type) + 1; + memcpy(×tamp, pos, sizeof(TimestampTz)); + if (0 == strcmp(type, "rename")) + { + rename_2pc_records(gid, timestamp); + } + else + { remove_2pc_records(gid, false); } + } else if (info == XLOG_CREATE_2PC_FILE) { TransactionId xid; TransactionId startxid; - GlobalTimestamp commit_timestamp = 0; char *gid; char *startnode; char *nodestring; @@ -10199,19 +10211,15 @@ xlog_redo(XLogReaderState *record) { startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) ; + pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) ; - if (IsXidImplicit(gid)) - { - memcpy(&commit_timestamp, pos, sizeof(GlobalTimestamp)); - } if (enable_distri_print) { - elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, startxid: %u, nodestring: %s, " - "xid: %u, commit_timestamp:"INT64_FORMAT, gid, startnode, startxid, nodestring, xid, commit_timestamp); + elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " + "startxid: %u, nodestring: %s, xid: %u", + gid, startnode, startxid, nodestring, xid); } #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index cdc96d59..3cdb9063 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -306,6 +306,9 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, PubStatDataShmemSize(g_PubStatHashSize, g_PubTableStatHashSize)); size = add_size(size, SubStatDataShmemSize(g_SubStatHashSize, g_SubTableStatHashSize)); #endif +#ifdef __TWO_PHASE_TRANS__ + size = add_size(size, Record2pcCacheSize()); +#endif #ifdef __COLD_HOT__ size = add_size(size, DualWriteTableSize()); #endif @@ -482,6 +485,10 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) InitSubStatData(g_SubStatHashSize, g_SubTableStatHashSize); #endif +#ifdef __TWO_PHASE_TRANS__ + Record2pcCacheInit(); +#endif + #ifdef __COLD_HOT__ DualWriteCtlInit(); #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d7c85782..9c699458 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2689,6 +2689,45 @@ static struct config_bool ConfigureNamesBool[] = }, #endif +#ifdef __TWO_PHASE_TRANS__ + { + {"enable_2pc_file_cache", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC cache."), + NULL + }, + &enable_2pc_file_cache, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_file_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC file check."), + NULL + }, + &enable_2pc_file_check, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_entry_key_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC entry key check."), + 
NULL + }, + &enable_2pc_entry_key_check, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_entry_trace", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC entry trace."), + NULL + }, + &enable_2pc_entry_trace, + false, + NULL, NULL, NULL + }, +#endif + #ifdef __TBASE__ { {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, @@ -4744,6 +4783,33 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, +#ifdef __TWO_PHASE_TRANS__ + { + {"record_2pc_cache_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache size."), + }, + &record_2pc_cache_size, + 50000, 100, INT_MAX, + NULL, NULL, NULL + }, + { + {"record_2pc_entry_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache entry size."), + }, + &record_2pc_entry_size, + 2048, 1200, INT_MAX, + NULL, NULL, NULL + }, + { + {"record_2pc_partitions", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache partition number."), + }, + &record_2pc_partitions, + 32, 1, INT_MAX, + NULL, NULL, NULL + }, +#endif + #ifdef __TBASE__ { {"account_lock_track_count", PGC_POSTMASTER, LOGGING, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index cbf83a3e..bd76266f 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -96,6 +96,17 @@ extern int transaction_threshold; extern bool enable_2pc_recovery_info; #endif +#ifdef __TWO_PHASE_TRANS__ +extern bool enable_2pc_file_cache; +extern bool enable_2pc_file_check; +extern bool enable_2pc_entry_key_check; +extern bool enable_2pc_entry_trace; + +extern int record_2pc_cache_size; +extern int record_2pc_entry_size; +extern int record_2pc_partitions; +#endif + extern Size TwoPhaseShmemSize(void); extern void TwoPhaseShmemInit(void); @@ -143,7 +154,14 @@ extern void record_2pc_involved_nodes_xid(const char * tid, GlobalTransactionId xid); extern void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timestamp); extern void remove_2pc_records(const char *tid, bool record_in_xlog); +extern void rename_2pc_records(const char *tid, TimestampTz timestamp); extern void record_2pc_readonly(const char *gid); + +extern char *get_2pc_info_from_cache(const char *tid); +extern char *get_2pc_list_from_cache(int *count); + +extern void Record2pcCacheInit(void); +extern Size Record2pcCacheSize(void); #endif #endif /* TWOPHASE_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 89a74b0c..8828efe7 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * pg_control.h - * The system control file "pg_control" is not a heap relation. - * However, we define it here so that the format is documented. + * The system control file "pg_control" is not a heap relation. + * However, we define it here so that the format is documented. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -16,15 +16,15 @@ #define PG_CONTROL_H #include "access/xlogdefs.h" -#include "pgtime.h" /* for pg_time_t */ +#include "pgtime.h" /* for pg_time_t */ #include "port/pg_crc32c.h" /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 1002 +#define PG_CONTROL_VERSION 1002 /* Nonce key length, see below */ -#define MOCK_AUTH_NONCE_LEN 32 +#define MOCK_AUTH_NONCE_LEN 32 /* * Body of CheckPoint XLOG records. 
This is declared here because we keep @@ -33,61 +33,61 @@ */ typedef struct CheckPoint { - XLogRecPtr redo; /* next RecPtr available when we began to - * create CheckPoint (i.e. REDO start point) */ - TimeLineID ThisTimeLineID; /* current TLI */ - TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new - * timeline (equals ThisTimeLineID otherwise) */ - bool fullPageWrites; /* current full_page_writes */ - uint32 nextXidEpoch; /* higher-order bits of nextXid */ - TransactionId nextXid; /* next free XID */ - Oid nextOid; /* next free OID */ - MultiXactId nextMulti; /* next free MultiXactId */ - MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ - TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ - Oid oldestXidDB; /* database with minimum datfrozenxid */ - MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */ - Oid oldestMultiDB; /* database with minimum datminmxid */ - pg_time_t time; /* time stamp of checkpoint */ - TransactionId oldestCommitTsXid; /* oldest Xid with valid commit - * timestamp */ - TransactionId newestCommitTsXid; /* newest Xid with valid commit - * timestamp */ + XLogRecPtr redo; /* next RecPtr available when we began to + * create CheckPoint (i.e. REDO start point) */ + TimeLineID ThisTimeLineID; /* current TLI */ + TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new + * timeline (equals ThisTimeLineID otherwise) */ + bool fullPageWrites; /* current full_page_writes */ + uint32 nextXidEpoch; /* higher-order bits of nextXid */ + TransactionId nextXid; /* next free XID */ + Oid nextOid; /* next free OID */ + MultiXactId nextMulti; /* next free MultiXactId */ + MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ + TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ + Oid oldestXidDB; /* database with minimum datfrozenxid */ + MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */ + Oid oldestMultiDB; /* database with minimum datminmxid */ + pg_time_t time; /* time stamp of checkpoint */ + TransactionId oldestCommitTsXid; /* oldest Xid with valid commit + * timestamp */ + TransactionId newestCommitTsXid; /* newest Xid with valid commit + * timestamp */ #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - GlobalTimestamp latestCommitTs; - GlobalTimestamp latestGTS; + GlobalTimestamp latestCommitTs; + GlobalTimestamp latestGTS; #endif - /* - * Oldest XID still running. This is only needed to initialize hot standby - * mode from an online checkpoint, so we only bother calculating this for - * online checkpoints and only when wal_level is replica. Otherwise it's - * set to InvalidTransactionId. - */ - TransactionId oldestActiveXid; + /* + * Oldest XID still running. This is only needed to initialize hot standby + * mode from an online checkpoint, so we only bother calculating this for + * online checkpoints and only when wal_level is replica. Otherwise it's + * set to InvalidTransactionId. 
+ */ + TransactionId oldestActiveXid; } CheckPoint; /* XLOG info values for XLOG rmgr */ -#define XLOG_CHECKPOINT_SHUTDOWN 0x00 -#define XLOG_CHECKPOINT_ONLINE 0x10 -#define XLOG_NOOP 0x20 -#define XLOG_NEXTOID 0x30 -#define XLOG_SWITCH 0x40 -#define XLOG_BACKUP_END 0x50 -#define XLOG_PARAMETER_CHANGE 0x60 -#define XLOG_RESTORE_POINT 0x70 -#define XLOG_FPW_CHANGE 0x80 -#define XLOG_END_OF_RECOVERY 0x90 -#define XLOG_FPI_FOR_HINT 0xA0 -#define XLOG_FPI 0xB0 +#define XLOG_CHECKPOINT_SHUTDOWN 0x00 +#define XLOG_CHECKPOINT_ONLINE 0x10 +#define XLOG_NOOP 0x20 +#define XLOG_NEXTOID 0x30 +#define XLOG_SWITCH 0x40 +#define XLOG_BACKUP_END 0x50 +#define XLOG_PARAMETER_CHANGE 0x60 +#define XLOG_RESTORE_POINT 0x70 +#define XLOG_FPW_CHANGE 0x80 +#define XLOG_END_OF_RECOVERY 0x90 +#define XLOG_FPI_FOR_HINT 0xA0 +#define XLOG_FPI 0xB0 #ifdef __TBASE__ -#define XLOG_MVCC 0xC0 +#define XLOG_MVCC 0xC0 #endif -/* remove 2pc file while 2pc is cleaned*/ -#define XLOG_CLEAN_2PC_FILE 0XD0 -#define XLOG_CREATE_2PC_FILE 0xE0 +/* remove or rename 2pc file when 2pc is cleaned */ +#define XLOG_CLEAN_2PC_FILE 0XD0 +#define XLOG_CREATE_2PC_FILE 0xE0 #define XLOG_RECORD_2PC_TIMESTAMP 0xF0 /* @@ -96,13 +96,13 @@ typedef struct CheckPoint */ typedef enum DBState { - DB_STARTUP = 0, - DB_SHUTDOWNED, - DB_SHUTDOWNED_IN_RECOVERY, - DB_SHUTDOWNING, - DB_IN_CRASH_RECOVERY, - DB_IN_ARCHIVE_RECOVERY, - DB_IN_PRODUCTION + DB_STARTUP = 0, + DB_SHUTDOWNED, + DB_SHUTDOWNED_IN_RECOVERY, + DB_SHUTDOWNING, + DB_IN_CRASH_RECOVERY, + DB_IN_ARCHIVE_RECOVERY, + DB_IN_PRODUCTION } DBState; /* @@ -111,150 +111,150 @@ typedef enum DBState typedef struct ControlFileData { - /* - * Unique system identifier --- to ensure we match up xlog files with the - * installation that produced them. - */ - uint64 system_identifier; + /* + * Unique system identifier --- to ensure we match up xlog files with the + * installation that produced them. + */ + uint64 system_identifier; - /* - * Version identifier information. Keep these fields at the same offset, - * especially pg_control_version; they won't be real useful if they move - * around. (For historical reasons they must be 8 bytes into the file - * rather than immediately at the front.) - * - * pg_control_version identifies the format of pg_control itself. - * catalog_version_no identifies the format of the system catalogs. - * - * There are additional version identifiers in individual files; for - * example, WAL logs contain per-page magic numbers that can serve as - * version cues for the WAL log. - */ - uint32 pg_control_version; /* PG_CONTROL_VERSION */ - uint32 catalog_version_no; /* see catversion.h */ + /* + * Version identifier information. Keep these fields at the same offset, + * especially pg_control_version; they won't be real useful if they move + * around. (For historical reasons they must be 8 bytes into the file + * rather than immediately at the front.) + * + * pg_control_version identifies the format of pg_control itself. + * catalog_version_no identifies the format of the system catalogs. + * + * There are additional version identifiers in individual files; for + * example, WAL logs contain per-page magic numbers that can serve as + * version cues for the WAL log. 
+ */ + uint32 pg_control_version; /* PG_CONTROL_VERSION */ + uint32 catalog_version_no; /* see catversion.h */ - /* - * System status data - */ - DBState state; /* see enum above */ - pg_time_t time; /* time stamp of last pg_control update */ - XLogRecPtr checkPoint; /* last check point record ptr */ - XLogRecPtr prevCheckPoint; /* previous check point record ptr */ + /* + * System status data + */ + DBState state; /* see enum above */ + pg_time_t time; /* time stamp of last pg_control update */ + XLogRecPtr checkPoint; /* last check point record ptr */ + XLogRecPtr prevCheckPoint; /* previous check point record ptr */ - CheckPoint checkPointCopy; /* copy of last check point record */ + CheckPoint checkPointCopy; /* copy of last check point record */ - XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */ + XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */ - /* - * These two values determine the minimum point we must recover up to - * before starting up: - * - * minRecoveryPoint is updated to the latest replayed LSN whenever we - * flush a data change during archive recovery. That guards against - * starting archive recovery, aborting it, and restarting with an earlier - * stop location. If we've already flushed data changes from WAL record X - * to disk, we mustn't start up until we reach X again. Zero when not - * doing archive recovery. - * - * backupStartPoint is the redo pointer of the backup start checkpoint, if - * we are recovering from an online backup and haven't reached the end of - * backup yet. It is reset to zero when the end of backup is reached, and - * we mustn't start up before that. A boolean would suffice otherwise, but - * we use the redo pointer as a cross-check when we see an end-of-backup - * record, to make sure the end-of-backup record corresponds the base - * backup we're recovering from. - * - * backupEndPoint is the backup end location, if we are recovering from an - * online backup which was taken from the standby and haven't reached the - * end of backup yet. It is initialized to the minimum recovery point in - * pg_control which was backed up last. It is reset to zero when the end - * of backup is reached, and we mustn't start up before that. - * - * If backupEndRequired is true, we know for sure that we're restoring - * from a backup, and must see a backup-end record before we can safely - * start up. If it's false, but backupStartPoint is set, a backup_label - * file was found at startup but it may have been a leftover from a stray - * pg_start_backup() call, not accompanied by pg_stop_backup(). - */ - XLogRecPtr minRecoveryPoint; - TimeLineID minRecoveryPointTLI; - XLogRecPtr backupStartPoint; - XLogRecPtr backupEndPoint; - bool backupEndRequired; + /* + * These two values determine the minimum point we must recover up to + * before starting up: + * + * minRecoveryPoint is updated to the latest replayed LSN whenever we + * flush a data change during archive recovery. That guards against + * starting archive recovery, aborting it, and restarting with an earlier + * stop location. If we've already flushed data changes from WAL record X + * to disk, we mustn't start up until we reach X again. Zero when not + * doing archive recovery. + * + * backupStartPoint is the redo pointer of the backup start checkpoint, if + * we are recovering from an online backup and haven't reached the end of + * backup yet. It is reset to zero when the end of backup is reached, and + * we mustn't start up before that. 
A boolean would suffice otherwise, but + * we use the redo pointer as a cross-check when we see an end-of-backup + * record, to make sure the end-of-backup record corresponds the base + * backup we're recovering from. + * + * backupEndPoint is the backup end location, if we are recovering from an + * online backup which was taken from the standby and haven't reached the + * end of backup yet. It is initialized to the minimum recovery point in + * pg_control which was backed up last. It is reset to zero when the end + * of backup is reached, and we mustn't start up before that. + * + * If backupEndRequired is true, we know for sure that we're restoring + * from a backup, and must see a backup-end record before we can safely + * start up. If it's false, but backupStartPoint is set, a backup_label + * file was found at startup but it may have been a leftover from a stray + * pg_start_backup() call, not accompanied by pg_stop_backup(). + */ + XLogRecPtr minRecoveryPoint; + TimeLineID minRecoveryPointTLI; + XLogRecPtr backupStartPoint; + XLogRecPtr backupEndPoint; + bool backupEndRequired; - /* - * Parameter settings that determine if the WAL can be used for archival - * or hot standby. - */ - int wal_level; - bool wal_log_hints; - int MaxConnections; - int max_worker_processes; - int max_prepared_xacts; - int max_locks_per_xact; - bool track_commit_timestamp; + /* + * Parameter settings that determine if the WAL can be used for archival + * or hot standby. + */ + int wal_level; + bool wal_log_hints; + int MaxConnections; + int max_worker_processes; + int max_prepared_xacts; + int max_locks_per_xact; + bool track_commit_timestamp; - /* - * This data is used to check for hardware-architecture compatibility of - * the database and the backend executable. We need not check endianness - * explicitly, since the pg_control version will surely look wrong to a - * machine of different endianness, but we do need to worry about MAXALIGN - * and floating-point format. (Note: storage layout nominally also - * depends on SHORTALIGN and INTALIGN, but in practice these are the same - * on all architectures of interest.) - * - * Testing just one double value is not a very bulletproof test for - * floating-point compatibility, but it will catch most cases. - */ - uint32 maxAlign; /* alignment requirement for tuples */ - double floatFormat; /* constant 1234567.0 */ -#define FLOATFORMAT_VALUE 1234567.0 + /* + * This data is used to check for hardware-architecture compatibility of + * the database and the backend executable. We need not check endianness + * explicitly, since the pg_control version will surely look wrong to a + * machine of different endianness, but we do need to worry about MAXALIGN + * and floating-point format. (Note: storage layout nominally also + * depends on SHORTALIGN and INTALIGN, but in practice these are the same + * on all architectures of interest.) + * + * Testing just one double value is not a very bulletproof test for + * floating-point compatibility, but it will catch most cases. + */ + uint32 maxAlign; /* alignment requirement for tuples */ + double floatFormat; /* constant 1234567.0 */ +#define FLOATFORMAT_VALUE 1234567.0 - /* - * This data is used to make sure that configuration of this database is - * compatible with the backend executable. 
- */ - uint32 blcksz; /* data block size for this DB */ - uint32 relseg_size; /* blocks per segment of large relation */ + /* + * This data is used to make sure that configuration of this database is + * compatible with the backend executable. + */ + uint32 blcksz; /* data block size for this DB */ + uint32 relseg_size; /* blocks per segment of large relation */ - uint32 xlog_blcksz; /* block size within WAL files */ - uint32 xlog_seg_size; /* size of each WAL segment */ + uint32 xlog_blcksz; /* block size within WAL files */ + uint32 xlog_seg_size; /* size of each WAL segment */ - uint32 nameDataLen; /* catalog name field width */ - uint32 indexMaxKeys; /* max number of columns in an index */ + uint32 nameDataLen; /* catalog name field width */ + uint32 indexMaxKeys; /* max number of columns in an index */ - uint32 toast_max_chunk_size; /* chunk size in TOAST tables */ - uint32 loblksize; /* chunk size in pg_largeobject */ + uint32 toast_max_chunk_size; /* chunk size in TOAST tables */ + uint32 loblksize; /* chunk size in pg_largeobject */ - /* flags indicating pass-by-value status of various types */ - bool float4ByVal; /* float4 pass-by-value? */ - bool float8ByVal; /* float8, int8, etc pass-by-value? */ + /* flags indicating pass-by-value status of various types */ + bool float4ByVal; /* float4 pass-by-value? */ + bool float8ByVal; /* float8, int8, etc pass-by-value? */ - /* Are data pages protected by checksums? Zero if no checksum version */ - uint32 data_checksum_version; + /* Are data pages protected by checksums? Zero if no checksum version */ + uint32 data_checksum_version; - /* - * Random nonce, used in authentication requests that need to proceed - * based on values that are cluster-unique, like a SASL exchange that - * failed at an early stage. - */ - char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; + /* + * Random nonce, used in authentication requests that need to proceed + * based on values that are cluster-unique, like a SASL exchange that + * failed at an early stage. + */ + char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; #ifdef __TBASE__ - /* - * need mvcc if page is all visible? + /* + * need mvcc if page is all visible? */ int32 need_mvcc; - /* reserved */ - int32 reserved_1; - int32 reserved_2; - int32 reserved_3; - int32 reserved_4; - int32 reserved_5; + /* reserved */ + int32 reserved_1; + int32 reserved_2; + int32 reserved_3; + int32 reserved_4; + int32 reserved_5; #endif - /* CRC of all above ... MUST BE LAST! */ - pg_crc32c crc; + /* CRC of all above ... MUST BE LAST! */ + pg_crc32c crc; } ControlFileData; /* @@ -263,7 +263,7 @@ typedef struct ControlFileData * means the active data can't be more than one disk sector, which is 512 * bytes on common hardware. Be very careful about raising this limit. */ -#define PG_CONTROL_MAX_SAFE_SIZE 512 +#define PG_CONTROL_MAX_SAFE_SIZE 512 /* * Physical size of the pg_control file. Note that this is considerably @@ -272,6 +272,6 @@ typedef struct ControlFileData * changes, so that ReadControlFile will deliver a suitable wrong-version * message instead of a read error if it's looking at an incompatible file. 
*/ -#define PG_CONTROL_FILE_SIZE 8192 +#define PG_CONTROL_FILE_SIZE 8192 -#endif /* PG_CONTROL_H */ +#endif /* PG_CONTROL_H */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 9c939d43..9cfd0f21 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -72,6 +72,10 @@ select count(*) >= 0 as ok from pg_prepared_xacts; select name, setting from pg_settings where name like 'enable%'; name | setting -----------------------------------+--------- + enable_2pc_entry_key_check | on + enable_2pc_entry_trace | off + enable_2pc_file_cache | on + enable_2pc_file_check | on enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -137,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(64 rows) +(67 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From ed7cbf0202143b594712bece67eba7e06d5f8e13 Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 28 Apr 2021 17:48:07 +0800 Subject: [PATCH 361/578] fix committed sequenced in gtm be dropped when subtranction abort http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087135229 --- src/backend/access/transam/xact.c | 14 ++++ src/test/regress/expected/create_table.out | 75 ++++++++++++++++++++++ src/test/regress/sql/create_table.sql | 55 ++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f43288dc..bc765d44 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6095,6 +6095,20 @@ CommitSubTransaction(void) s->parallelModeLevel = 0; } +#ifdef __TBASE__ + if (s->curTransactionOwner) + { + TransactionId xid = GetCurrentTransactionIdIfAny(); + + if (TransactionIdIsValid(xid)) + { + CheckGTMConnection(); + } + + FinishSeqOp(true); + } +#endif + /* Do the actual "commit", such as it is */ s->state = TRANS_COMMIT; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 2cf920b9..f3e08d39 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -949,3 +949,78 @@ Distribute By: HASH(a) Location Nodes: ALL DATANODES drop table boolspart; +drop function if exists create_multi_tables1(integer, varchar); +NOTICE: function create_multi_tables1(pg_catalog.int4,pg_catalog.varchar) does not exist, skipping +CREATE OR REPLACE FUNCTION create_multi_tables1(table_num_in integer, table_sql varchar) RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE +AS $BODY$ +declare + v_idx integer := 0; + v_strTable varchar :=''; + v_strSql varchar :=''; +begin + while v_idx < table_num_in loop + v_idx = v_idx+1; + v_strTable = CONCAT('simple_metadata_query_', v_idx); + v_strSql = table_sql||' '||v_strTable||'(c1 bigint, c31 smallserial);'; + RAISE NOTICE 'create %', v_strTable; + BEGIN + EXECUTE v_strSql; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + end loop; + + RAISE NOTICE 'finished .....'; +end +$BODY$; +drop function if exists del_multi_table1(varchar); +NOTICE: function del_multi_table1(pg_catalog.varchar) does not exist, skipping +CREATE FUNCTION del_multi_table1(table_sql varchar) RETURNS void AS $$ +DECLARE + tmp VARCHAR(512); +DECLARE names CURSOR FOR + select tablename from pg_tables 
where tablename like 'simple_metadata_query_%'; +BEGIN + FOR stmt IN names LOOP + tmp := table_sql||' '|| quote_ident(stmt.tablename) || ' CASCADE;'; + RAISE NOTICE '%', tmp; + BEGIN + EXECUTE tmp; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + END LOOP; + RAISE NOTICE 'finished .....'; +END +$$ LANGUAGE plpgsql VOLATILE COST 100; +CREATE TABLE simple_metadata_query_3(c1 int, c31 smallserial); +SELECT create_multi_tables1(5, 'create table'); +NOTICE: create simple_metadata_query_1 +NOTICE: create simple_metadata_query_2 +NOTICE: create simple_metadata_query_3 +NOTICE: ERROR: (relation "simple_metadata_query_3" already exists) +NOTICE: create simple_metadata_query_4 +NOTICE: create simple_metadata_query_5 +NOTICE: finished ..... + create_multi_tables1 +---------------------- + +(1 row) + +SELECT del_multi_table1('drop table if exists'); +NOTICE: drop table if exists simple_metadata_query_1 CASCADE; +NOTICE: drop table if exists simple_metadata_query_2 CASCADE; +NOTICE: drop table if exists simple_metadata_query_3 CASCADE; +NOTICE: drop table if exists simple_metadata_query_4 CASCADE; +NOTICE: drop table if exists simple_metadata_query_5 CASCADE; +NOTICE: finished ..... + del_multi_table1 +------------------ + +(1 row) + +DROP FUNCTION create_multi_tables1; +DROP FUNCTION del_multi_table1; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 82f1a87b..9fc3ae65 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -746,3 +746,58 @@ create table boolspart_t partition of boolspart for values in (true); create table boolspart_f partition of boolspart for values in (false); \d+ boolspart drop table boolspart; + +drop function if exists create_multi_tables1(integer, varchar); +CREATE OR REPLACE FUNCTION create_multi_tables1(table_num_in integer, table_sql varchar) RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE +AS $BODY$ +declare + v_idx integer := 0; + v_strTable varchar :=''; + v_strSql varchar :=''; +begin + while v_idx < table_num_in loop + v_idx = v_idx+1; + v_strTable = CONCAT('simple_metadata_query_', v_idx); + v_strSql = table_sql||' '||v_strTable||'(c1 bigint, c31 smallserial);'; + RAISE NOTICE 'create %', v_strTable; + BEGIN + EXECUTE v_strSql; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + end loop; + + RAISE NOTICE 'finished .....'; +end +$BODY$; + +drop function if exists del_multi_table1(varchar); +CREATE FUNCTION del_multi_table1(table_sql varchar) RETURNS void AS $$ +DECLARE + tmp VARCHAR(512); +DECLARE names CURSOR FOR + select tablename from pg_tables where tablename like 'simple_metadata_query_%'; +BEGIN + FOR stmt IN names LOOP + tmp := table_sql||' '|| quote_ident(stmt.tablename) || ' CASCADE;'; + RAISE NOTICE '%', tmp; + BEGIN + EXECUTE tmp; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + END LOOP; + RAISE NOTICE 'finished .....'; +END +$$ LANGUAGE plpgsql VOLATILE COST 100; + +CREATE TABLE simple_metadata_query_3(c1 int, c31 smallserial); +SELECT create_multi_tables1(5, 'create table'); +SELECT del_multi_table1('drop table if exists'); + +DROP FUNCTION create_multi_tables1; +DROP FUNCTION del_multi_table1; + From a8c5007545aa5dff9f8f10e92308f203cbafb48a Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 29 Apr 2021 14:31:29 +0800 Subject: [PATCH 362/578] Fix distinct agg regress. 
--- src/test/regress/expected/sysviews.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 9cfd0f21..842fabf5 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -141,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(67 rows) +(68 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 2759d7daf70c41b5a7a97a98d5c1194f1040c93c Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 30 Apr 2021 14:42:02 +0800 Subject: [PATCH 363/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), code opt --- src/backend/access/transam/twophase.c | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 46570383..2fc12062 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3539,7 +3539,6 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); size = content.len; - Assert(size == strlen(content.data)); /* if in_pg_clean, then check whether the file exists */ @@ -3667,8 +3666,7 @@ void record_2pc_involved_nodes_xid(const char * tid, "to hash table", tid); } - memset(entry->info, 0, MAX_2PC_INFO_SIZE); - memcpy(entry->info, content.data, size); + memcpy(entry->info, content.data, size + 1); resetStringInfo(&content); pfree(content.data); @@ -3779,7 +3777,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta initStringInfo(&content); appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", commit_timestamp); - size = strlen(content.data); + size = content.len; + Assert(size == strlen(content.data)); if (NULL != record_2pc_cache) { @@ -3804,8 +3803,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta new_size = size + strlen(entry->info); - if (new_size >= MAX_2PC_INFO_SIZE) + if (new_size < MAX_2PC_INFO_SIZE) { + /* save to hash table */ + memcpy(entry->info + strlen(entry->info), content.data, size + 1); + + resetStringInfo(&content); + pfree(content.data); + return; + } + /* save to file */ elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); @@ -3866,14 +3873,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta pfree(content.data); return; } - - /* save to hash table */ - memcpy(entry->info + strlen(entry->info), content.data, size); - - resetStringInfo(&content); - pfree(content.data); - return; - } else { elog(LOG, "[record_2pc_commit_timestamp] %s is not found " @@ -4147,6 +4146,7 @@ void record_2pc_readonly(const char *gid) if (NULL != record_2pc_cache) { Assert(strlen(gid) < MAX_TID_SIZE); + Assert(strlen(content) < MAX_2PC_INFO_SIZE); entry = (Cache2pcInfo *)hash_search(record_2pc_cache, gid, HASH_ENTER_NULL, &found); if (NULL != entry) @@ -4172,7 +4172,7 @@ void record_2pc_readonly(const char *gid) elog(LOG, "[record_2pc_readonly] %s is added " "to hash table", gid); } - memcpy(entry->info, content, strlen(content)); + memcpy(entry->info, content, strlen(content) + 1); return; } else From 
618f7530b441e996a2dd3fa9a8eef07dbe696aad Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 30 Apr 2021 15:56:27 +0800 Subject: [PATCH 364/578] precheck befor choose sequence name --- src/backend/parser/parse_utilcmd.c | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 25550493..88c6077e 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -86,6 +86,7 @@ #ifdef __TBASE__ #include "utils/fmgroids.h" #include "catalog/pgxc_class.h" +#include "utils/inval.h" #endif #ifdef XCP @@ -200,6 +201,8 @@ static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con, #ifdef __TBASE__ static void transformPartitionBy(ParseState *pstate, ColumnDef *partcol, PartitionBy *partitionby); +static char * ChooseSerialName(const char *relname, const char *colname, + const char *label, Oid namespaceid); #endif /* * transformCreateStmt - @@ -722,6 +725,52 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) return result; } +#ifdef __TBASE__ +/* + * Check relation exists before choose sequence name, if + * the relation already exists, no need to create sequence + * and relation. + */ +static char * +ChooseSerialName(const char *relname, const char *colname, + const char *label, Oid namespaceid) +{ + int pass = 0; + char modlabel[NAMEDATALEN]; + char *sqname; + Oid seqoid; + + /* try the unmodified label first */ + StrNCpy(modlabel, label, sizeof(modlabel)); + + for (;;) + { + sqname = makeObjectName(relname, colname, modlabel); + + AcceptInvalidationMessages(); + seqoid = get_relname_relid(sqname, namespaceid); + if (OidIsValid(seqoid)) + { + Relation rel = heap_open(seqoid, AccessShareLock); + if (OidIsValid(get_relname_relid(relname, namespaceid))) + { + heap_close(rel, AccessShareLock); + elog(ERROR, "relation \"%s\" already exists", relname); + } + heap_close(rel, AccessShareLock); + + /* found a conflict, so try a new name component */ + pfree(sqname); + snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass); + } + else + break; + } + + return sqname; +} +#endif + /* * generateSerialExtraStmts * Generate CREATE SEQUENCE and ALTER SEQUENCE ... 
OWNED BY statements @@ -801,6 +850,14 @@ generateSerialExtraStmts(CreateStmtContext *cxt, ColumnDef *column, RangeVarAdjustRelationPersistence(cxt->relation, snamespaceid); } snamespace = get_namespace_name(snamespaceid); +#ifdef __TBASE__ + if (strcmp("CREATE TABLE", cxt->stmtType) == 0) + sname = ChooseSerialName(cxt->relation->relname, + column->colname, + "seq", + snamespaceid); + else +#endif sname = ChooseRelationName(cxt->relation->relname, column->colname, "seq", From c9b82d01d5028413090f6c80f17e962c5f00e310 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 30 Apr 2021 17:37:11 +0800 Subject: [PATCH 365/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix --- src/backend/access/transam/twophase.c | 64 ++++++++++++++++----------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 2fc12062..78ae69ff 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -136,6 +136,8 @@ int transaction_threshold = 200000; +#define FILE_CONTENT_SIZE 2048 + #define GET_START_NODE "startnode:" /* GUC variable, can't be changed after startup */ @@ -3726,7 +3728,7 @@ void record_2pc_involved_nodes_xid(const char * tid, void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timestamp) {// #lizard forgives char path[MAXPGPATH]; - char file_content[2048]; + char file_content[FILE_CONTENT_SIZE]; StringInfoData content; File fd = -1; int ret = 0; @@ -3819,41 +3821,42 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) - { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + fd = PathNameOpenFile(path, O_RDWR | O_TRUNC | O_CREAT, + S_IRUSR | S_IWUSR); } else { + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + } + if (fd < 0) + { elog(ERROR, "[record_2pc_commit_timestamp] could not " "append timestamp in file %s, errMsg: %s", path, strerror(errno)); } - return; - } - ret = write(fd, entry->info, strlen(entry->info)); - if(ret != new_size) + ret = FileWrite(fd, entry->info, strlen(entry->info), + WAIT_EVENT_BUFFILE_WRITE); + if(ret != strlen(entry->info)) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write " "file %s, errMsg: %s, ret: %d, info: %s", path, strerror(errno), ret, entry->info); } - ret = write(fd, content.data, size); - if(ret != new_size) + ret = FileWrite(fd, content.data, size, + WAIT_EVENT_BUFFILE_WRITE); + if(ret != size) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write " "file %s, errMsg: %s, ret: %d, info: %s", path, strerror(errno), ret, content.data); } - close(fd); + FileClose(fd); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, @@ -3883,7 +3886,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); /* the 2pc file exists already */ - fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); if (fd < 0) { if (RecoveryInProgress()) @@ -3917,31 +3920,32 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print) { - memset(file_content, 0, 2048); - ret = 
read(fd, file_content, 2048); + memset(file_content, 0, FILE_CONTENT_SIZE); + ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " "file_content: %s, content.data: %s, ret: %d", path, file_content, content.data, ret); } - ret = write(fd, content.data, size); + ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " "errMsg: %s", path, strerror(errno)); } if (enable_distri_print) { - memset(file_content, 0, 2048); - lseek(fd, 0, SEEK_SET); - ret = read(fd, file_content, 2048); + memset(file_content, 0, FILE_CONTENT_SIZE); + FileSeek(fd, 0, SEEK_SET); + ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " - "file_content: %s, ret: %d", tid, file_content, ret); + "file_content: %s, ret: %d", + path, file_content, ret); } - close(fd); + FileClose(fd); resetStringInfo(&content); pfree(content.data); @@ -4063,8 +4067,16 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) check_entry_key(tid, entry->key, "rename_2pc_records"); check_2pc_file(tid, entry->info, "rename_2pc_records"); + if (RecoveryInProgress()) + { + fd = PathNameOpenFile(new_path, O_RDWR | O_TRUNC | O_CREAT, + S_IRUSR | S_IWUSR); + } + else + { fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + } if (fd < 0) { elog(ERROR, "[rename_2pc_records] could not create file %s, " From dc3ecb2072bd6768cc69c2dbad4105e4b03544d1 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 30 Apr 2021 17:58:59 +0800 Subject: [PATCH 366/578] fix warning --- src/backend/optimizer/util/pgxcship.c | 4 ++++ src/backend/parser/analyze.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index ec2f0504..b749e028 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -158,6 +158,7 @@ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_co static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); static bool pgxc_is_simple_subquery(Query *subquery); static bool pgxc_FQS_check_subquery_const(Query *query); +static ExecNodes *make_FQS_single_node(); #endif /* * Set the given reason in Shippability_context indicating why the query can not be @@ -1878,6 +1879,9 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } +/* + * Construct ExecNodes for single datanode to fqs + */ static ExecNodes * make_FQS_single_node() { diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a5a5e96b..539cd7c8 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1155,7 +1155,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) { Node *node = (Node *)lfirst(cell); if (!IsA(node, Param) && - !pgxc_is_expr_shippable(node, NULL)) + !pgxc_is_expr_shippable((Expr*)node, NULL)) { qry->isMultiValues = false; break; From 044e41270182b7616a7cd72060919b8d5b894229 Mon Sep 17 00:00:00 2001 From: challzhang Date: Mon, 3 May 2021 17:48:39 +0800 Subject: [PATCH 367/578] Merge to v2.15.19 from v3. Fix drop sequence will fail when cn drop sequence failed before. 
http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652086462863 --- src/backend/access/transam/gtm.c | 5 +- src/gtm/main/gtm_seq.c | 134 ++++++++++++++++++++++++++++++- src/include/gtm/gtm_seq.h | 125 ++++++++++++++-------------- 3 files changed, 199 insertions(+), 65 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 1e6c908b..1ac9e6d2 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -84,7 +84,8 @@ typedef struct List *g_CreateSeqList = NULL; List *g_DropSeqList = NULL; List *g_AlterSeqList = NULL; -#define GTM_SEQ_POSTFIX "_$TBASE$_" +/* constant postfix for sequence to avoid same name */ +#define GTM_SEQ_POSTFIX "_$TBASE$_sequence_temp_54312678712612" static void CheckConnection(void); static void ResetGTMConnection(void); static int GetGTMStoreStatus(GTMStorageStatus *header); @@ -156,7 +157,7 @@ void RegisterSeqDrop(char *name, int32 type) if (GTM_SEQ_FULL_NAME == type) { /* Here we can only add postfix for the temp sequence, or drop database will fail. */ - snprintf(temp, GTM_NAME_LEN, "%s_%d_%zu"GTM_SEQ_POSTFIX, name, MyProcPid, tp.tv_usec); + snprintf(temp, GTM_NAME_LEN, "%s"GTM_SEQ_POSTFIX, name); if (RenameSequenceGTM((char *)name, temp)) { elog(ERROR, "Deletion of sequences on database %s failed when backup old seq", name); diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 9941cb30..d103ce2b 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -633,6 +633,8 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, int32 ret = 0; #endif GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; #ifdef __TBASE__ if (NULL ==seqinfo) @@ -644,11 +646,30 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, if (seqinfo == NULL) { - ereport(LOG, + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { + ereport(ERROR, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -979,6 +1000,7 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, int32 ret = 0; #endif GTM_SeqInfo *seqinfo = NULL; + GTM_SeqInfo *newseqinfo = NULL; int errcode = 0; MemoryContext oldContext; GTM_SeqAlteredInfo *alterinfo; @@ -995,11 +1017,28 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, /* replace old key by new key */ if (seqinfo == NULL) { + newseqinfo = seq_find_seqinfo(newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(newseqkey); + newseqinfo = seq_find_seqinfo(newseqkey); + } +#endif + + if(newseqinfo == NULL) + { ereport(LOG, (EINVAL, errmsg("Sequence with the key:%s does not exist", seqkey->gsk_key))); return EINVAL; } + ereport(LOG, + (EEXIST, + errmsg("Sequence with the key:%s has been renamed to %s", seqkey->gsk_key, newseqkey->gsk_key))); + seq_release_seqinfo(newseqinfo); + return 0; + } oldContext = 
MemoryContextSwitchTo(TopMostMemoryContext); alterinfo = (GTM_SeqAlteredInfo *) palloc0(sizeof (GTM_SeqAlteredInfo)); @@ -1044,6 +1083,8 @@ GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, GTM_SeqInfo *seqinfo = NULL; int i; bool found = false; + GTM_SequenceKeyData newseqkey; + char *seqkey_copy = NULL; seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1059,11 +1100,30 @@ GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(ERROR, (EINVAL, errmsg("sequence \"%s\" does not exist", seqkey->gsk_key))); return; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ); @@ -1167,6 +1227,8 @@ GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, int32 ret = 0; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; #ifdef __TBASE__ if (NULL ==seqinfo) @@ -1178,12 +1240,29 @@ GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); - return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -1240,6 +1319,8 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, char buf[100] = {0}; GTM_Sequence used_count = 0; #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1252,11 +1333,30 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + 
GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -1515,6 +1615,8 @@ GTM_SeqReset(GTM_SequenceKey seqkey) #ifdef __TBASE__ int32 ret = 0; #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1527,11 +1629,30 @@ GTM_SeqReset(GTM_SequenceKey seqkey) if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); seqinfo->gs_value = seqinfo->gs_backedUpValue = seqinfo->gs_init_value; @@ -1597,6 +1718,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) MemoryContext oldContext; const char *data; GlobalTransactionId gxid; + char postfix[100]; if (Recovery_IsStandby()) { @@ -1612,6 +1734,14 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + /* Check whether the seqkey contains GTM_SEQ_POSTFIX */ + if (seqkey.gsk_keylen > strlen(GTM_SEQ_POSTFIX)) + { + strncpy(postfix, seqkey.gsk_key + (seqkey.gsk_keylen - strlen(GTM_SEQ_POSTFIX) - 1), strlen(GTM_SEQ_POSTFIX)); + if (!strcmp(postfix, GTM_SEQ_POSTFIX)) + elog(ERROR, "postfix of sequence key can not be _$TBASE$_sequence_temp_54312678712612."); + } + /* * Read various sequence parameters */ diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h index 14d5918c..d9dd072d 100644 --- a/src/include/gtm/gtm_seq.h +++ b/src/include/gtm/gtm_seq.h @@ -24,84 +24,87 @@ #define SEQ_RESERVE_COUNT 5000 #define SEQ_RESERVE_MIN_GAP 10 #endif +/* constant postfix for sequence to avoid same name */ +#define GTM_SEQ_POSTFIX "_$TBASE$_sequence_temp_54312678712612" +#define SEQ_KEY_LEN 256 typedef struct GTM_SeqLastVal { - char gs_coord_name[SP_NODE_NAME]; - int32 gs_coord_procid; - GTM_Sequence gs_last_value; + char gs_coord_name[SP_NODE_NAME]; + int32 gs_coord_procid; + GTM_Sequence gs_last_value; } GTM_SeqLastVal; typedef struct GTM_SeqInfo { - GTM_SequenceKey gs_key; - GTM_SequenceKey gs_oldkey; - GTM_Sequence gs_value; - GTM_Sequence gs_backedUpValue; - GTM_Sequence gs_init_value; - int32 gs_max_lastvals; - int32 gs_lastval_count; - GTM_SeqLastVal *gs_last_values; - GTM_Sequence gs_increment_by; /* increase step */ - GTM_Sequence gs_min_value; /* min value of the seq */ - GTM_Sequence gs_max_value; /* max value of the seq */ - bool gs_cycle; /* whether we are cycled */ - bool gs_called; - GlobalTransactionId gs_created_gxid; - - int32 gs_ref_count; - int32 gs_state; - GTM_RWLock gs_lock; + GTM_SequenceKey gs_key; + GTM_SequenceKey gs_oldkey; + GTM_Sequence gs_value; + GTM_Sequence gs_backedUpValue; + GTM_Sequence gs_init_value; + 
int32 gs_max_lastvals; + int32 gs_lastval_count; + GTM_SeqLastVal *gs_last_values; + GTM_Sequence gs_increment_by; /* increase step */ + GTM_Sequence gs_min_value; /* min value of the seq */ + GTM_Sequence gs_max_value; /* max value of the seq */ + bool gs_cycle; /* whether we are cycled */ + bool gs_called; + GlobalTransactionId gs_created_gxid; + + int32 gs_ref_count; + int32 gs_state; + GTM_RWLock gs_lock; #ifdef __TBASE__ - bool gs_reserved; /* whether we have reserve value*/ - GTMStorageHandle gs_store_handle; - int32 gs_left_reserve_seq_number; + bool gs_reserved; /* whether we have reserve value*/ + GTMStorageHandle gs_store_handle; + int32 gs_left_reserve_seq_number; #endif } GTM_SeqInfo; -#define SEQ_STATE_ACTIVE 1 -#define SEQ_STATE_DELETED 2 +#define SEQ_STATE_ACTIVE 1 +#define SEQ_STATE_DELETED 2 -#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0) -#define SEQ_IS_CYCLE(s) ((s)->gs_cycle) -#define SEQ_IS_CALLED(s) ((s)->gs_called) +#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0) +#define SEQ_IS_CYCLE(s) ((s)->gs_cycle) +#define SEQ_IS_CALLED(s) ((s)->gs_called) -#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL -#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1 +#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL +#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1 -#define SEQ_DEF_MAX_SEQVAL_DESCEND -0x1 -#define SEQ_DEF_MIN_SEQVAL_DESCEND -0x7ffffffffffffffeLL +#define SEQ_DEF_MAX_SEQVAL_DESCEND -0x1 +#define SEQ_DEF_MIN_SEQVAL_DESCEND -0x7ffffffffffffffeLL -#define SEQ_MAX_REFCOUNT 1024 +#define SEQ_MAX_REFCOUNT 1024 /* SEQUENCE Management */ void GTM_InitSeqManager(void); int GTM_SeqOpen(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - bool cycle, - GlobalTransactionId gxid - ); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + bool cycle, + GlobalTransactionId gxid + ); int GTM_SeqAlter(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - GTM_Sequence lastval, - bool cycle, - bool is_restart); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + GTM_Sequence lastval, + bool cycle, + bool is_restart); int GTM_SeqClose(GTM_SequenceKey seqkey, GlobalTransactionId gxid); int GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, - GlobalTransactionId gxid); + GlobalTransactionId gxid); int GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence range, - GTM_Sequence *result, GTM_Sequence *rangemax); + int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax); void GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence *result); + int coord_procid, GTM_Sequence *result); int GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int GTM_SeqReset(GTM_SequenceKey seqkey); void ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup); @@ -120,14 +123,14 @@ void ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_ba void decode_seq_key(char* value, GTM_SequenceKey seqkey); void GTM_SaveSeqInfo(FILE *ctlf); int GTM_SeqRestore(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - GTM_Sequence curval, - int32 
state, - bool cycle, - bool called); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + GTM_Sequence curval, + int32 state, + bool cycle, + bool called); void GTM_CleanupSeqSession(char *coord_name, int coord_procid); From 91408ee3c7ad6f24e98fac9b3defdc2bcc57b299 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 6 May 2021 17:27:49 +0800 Subject: [PATCH 368/578] fix gtm_ctl -l logfile http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652086076245 (merge request !313) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19' * fix gtm_ctl -l logfile --- src/gtm/gtm_ctl/gtm_ctl.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 9aeeb322..fa26bed5 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -316,8 +316,8 @@ start_gtm(void) snprintf(gtm_startup_gts,MAXPGPATH,"-g %s",startup_gts); if (log_file != NULL) - len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s -l %s %s &" , - gtm_app_path, gtmdata_opt, gtm_opts, log_file, gtm_startup_gts); + len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s %s >> \"%s\" 2>&1 &" , + gtm_app_path, gtmdata_opt, gtm_opts, gtm_startup_gts, log_file); else len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s %s < \"%s\" 2>&1 &" , gtm_app_path, gtmdata_opt, gtm_opts, gtm_startup_gts, DEVNULL); @@ -348,14 +348,6 @@ static int RunAsDaemon(char *cmd) int status; case 0: - /* - * Using fileno(xxx) may encounter trivial error because xxx may - * have been closed at somewhere else and fileno() may fail. - * Its safer to use literal file descriptor here. - */ - close(0); - close(1); - close(2); if ((status = system(cmd)) == -1) /* * Same behavior as /bin/sh could not be From 4e37d605af52c7d433e05398b71e61ca05922b0c Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Sat, 8 May 2021 15:46:18 +0800 Subject: [PATCH 369/578] =?UTF-8?q?=E3=80=90=E3=80=90=20TBase=E5=86=85?= =?UTF-8?q?=E6=A0=B8=20V2=E3=80=91TBase=20V2=E6=94=AF=E6=8C=81dblink?= =?UTF-8?q?=E6=8F=92=E4=BB=B6=EF=BC=8Cdblink=E6=8F=92=E4=BB=B6=E6=94=AF?= =?UTF-8?q?=E6=8C=81copy=E5=8A=9F=E8=83=BD=E3=80=91=20http://tapd.oa.com/p?= =?UTF-8?q?gxz/prong/stories/view/1010092131864638363=20(merge=20request?= =?UTF-8?q?=20!315)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'tbase_v2_dblink' into 'Tbase_v2.15.19' * 【【 TBase内核 V2】TBase V2支持dblink插件,dblink插件支持copy功能】 http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 --- contrib/dblink/dblink--1.2.sql | 5 + contrib/dblink/dblink.c | 4904 +++++++++++++++++--------------- src/backend/tcop/utility.c | 20 +- 3 files changed, 2578 insertions(+), 2351 deletions(-) diff --git a/contrib/dblink/dblink--1.2.sql b/contrib/dblink/dblink--1.2.sql index 405eccb0..fabe10fc 100644 --- a/contrib/dblink/dblink--1.2.sql +++ b/contrib/dblink/dblink--1.2.sql @@ -178,6 +178,11 @@ RETURNS int4 AS 'MODULE_PATHNAME', 'dblink_is_busy' LANGUAGE C STRICT PARALLEL RESTRICTED; +CREATE FUNCTION dblink_copy_table(text, text, text, text, text) +RETURNS int4 +AS 'MODULE_PATHNAME', 'dblink_copy_table' +LANGUAGE C STRICT PARALLEL RESTRICTED; + CREATE FUNCTION dblink_get_result(text) RETURNS SETOF record AS 'MODULE_PATHNAME', 'dblink_get_result' diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index bb39b2c6..fb1c99ce 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -44,12 +44,14 @@ #include "catalog/pg_foreign_server.h" #include "catalog/pg_type.h" 
#include "catalog/pg_user_mapping.h" +#include "commands/copy.h" #include "executor/spi.h" #include "foreign/foreign.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "parser/parse_relation.h" #include "parser/scansup.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -65,21 +67,21 @@ PG_MODULE_MAGIC; typedef struct remoteConn { - PGconn *conn; /* Hold the remote connection */ - int openCursorCount; /* The number of open cursors */ - bool newXactForCursor; /* Opened a transaction for a cursor */ + PGconn *conn; /* Hold the remote connection */ + int openCursorCount; /* The number of open cursors */ + bool newXactForCursor; /* Opened a transaction for a cursor */ } remoteConn; typedef struct storeInfo { - FunctionCallInfo fcinfo; - Tuplestorestate *tuplestore; - AttInMetadata *attinmeta; - MemoryContext tmpcontext; - char **cstrs; - /* temp storage for results to avoid leaks on exception */ - PGresult *last_res; - PGresult *cur_res; + FunctionCallInfo fcinfo; + Tuplestorestate *tuplestore; + AttInMetadata *attinmeta; + MemoryContext tmpcontext; + char **cstrs; + /* temp storage for results to avoid leaks on exception */ + PGresult *last_res; + PGresult *cur_res; } storeInfo; /* @@ -88,12 +90,12 @@ typedef struct storeInfo static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async); static void prepTuplestoreResult(FunctionCallInfo fcinfo); static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, - PGresult *res); + PGresult *res); static void materializeQueryResult(FunctionCallInfo fcinfo, - PGconn *conn, - const char *conname, - const char *sql, - bool fail); + PGconn *conn, + const char *conname, + const char *sql, + bool fail); static PGresult *storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql); static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first); static remoteConn *getConnectionByName(const char *name); @@ -106,22 +108,22 @@ static char *get_sql_insert(Relation rel, int *pkattnums, int pknumatts, char ** static char *get_sql_delete(Relation rel, int *pkattnums, int pknumatts, char **tgt_pkattvals); static char *get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals); static char *quote_ident_cstr(char *rawstr); -static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key); +static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key); static HeapTuple get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals); static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclMode aclmode); static char *generate_relation_name(Relation rel); static void dblink_connstr_check(const char *connstr); static void dblink_security_check(PGconn *conn, remoteConn *rconn); static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, - const char *dblink_context_msg, bool fail); + const char *dblink_context_msg, bool fail); static char *get_connect_string(const char *servername); static char *escape_param_str(const char *from); static void validate_pkattnums(Relation rel, - int2vector *pkattnums_arg, int32 pknumatts_arg, - int **pkattnums, int *pknumatts); + int2vector *pkattnums_arg, int32 pknumatts_arg, + int **pkattnums, int *pknumatts); static bool is_valid_dblink_option(const PQconninfoOption *options, - const char *option, Oid context); -static int applyRemoteGucs(PGconn *conn); + const char *option, Oid context); +static int 
applyRemoteGucs(PGconn *conn); static void restoreLocalGucs(int nestlevel); /* Global */ @@ -129,16 +131,16 @@ static remoteConn *pconn = NULL; static HTAB *remoteConnHash = NULL; /* - * Following is list that holds multiple remote connections. - * Calling convention of each dblink function changes to accept - * connection name as the first parameter. The connection list is - * much like ecpg e.g. a mapping between a name and a PGconn object. + * Following is list that holds multiple remote connections. + * Calling convention of each dblink function changes to accept + * connection name as the first parameter. The connection list is + * much like ecpg e.g. a mapping between a name and a PGconn object. */ typedef struct remoteConnHashEnt { - char name[NAMEDATALEN]; - remoteConn *rconn; + char name[NAMEDATALEN]; + remoteConn *rconn; } remoteConnHashEnt; /* initial number of connection hashes */ @@ -147,104 +149,104 @@ typedef struct remoteConnHashEnt static char * xpstrdup(const char *in) { - if (in == NULL) - return NULL; - return pstrdup(in); + if (in == NULL) + return NULL; + return pstrdup(in); } static void pg_attribute_noreturn() dblink_res_internalerror(PGconn *conn, PGresult *res, const char *p2) { - char *msg = pchomp(PQerrorMessage(conn)); + char *msg = pchomp(PQerrorMessage(conn)); - if (res) - PQclear(res); - elog(ERROR, "%s: %s", p2, msg); + if (res) + PQclear(res); + elog(ERROR, "%s: %s", p2, msg); } static void pg_attribute_noreturn() dblink_conn_not_avail(const char *conname) { - if (conname) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), - errmsg("connection \"%s\" not available", conname))); - else - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), - errmsg("connection not available"))); + if (conname) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), + errmsg("connection \"%s\" not available", conname))); + else + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), + errmsg("connection not available"))); } static void dblink_get_conn(char *conname_or_str, - PGconn *volatile *conn_p, char **conname_p, volatile bool *freeconn_p) + PGconn *volatile *conn_p, char **conname_p, volatile bool *freeconn_p) { - remoteConn *rconn = getConnectionByName(conname_or_str); - PGconn *conn; - char *conname; - bool freeconn; - - if (rconn) - { - conn = rconn->conn; - conname = conname_or_str; - freeconn = false; - } - else - { - const char *connstr; - - connstr = get_connect_string(conname_or_str); - if (connstr == NULL) - connstr = conname_or_str; - dblink_connstr_check(connstr); - conn = PQconnectdb(connstr); - if (PQstatus(conn) == CONNECTION_BAD) - { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg("could not establish connection"), - errdetail_internal("%s", msg))); - } - dblink_security_check(conn, rconn); - if (PQclientEncoding(conn) != GetDatabaseEncoding()) - PQsetClientEncoding(conn, GetDatabaseEncodingName()); - freeconn = true; - conname = NULL; - } - - *conn_p = conn; - *conname_p = conname; - *freeconn_p = freeconn; + remoteConn *rconn = getConnectionByName(conname_or_str); + PGconn *conn; + char *conname; + bool freeconn; + + if (rconn) + { + conn = rconn->conn; + conname = conname_or_str; + freeconn = false; + } + else + { + const char *connstr; + + connstr = get_connect_string(conname_or_str); + if (connstr == NULL) + connstr = conname_or_str; + dblink_connstr_check(connstr); + conn = PQconnectdb(connstr); + if 
(PQstatus(conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(conn)); + + PQfinish(conn); + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("could not establish connection"), + errdetail_internal("%s", msg))); + } + dblink_security_check(conn, rconn); + if (PQclientEncoding(conn) != GetDatabaseEncoding()) + PQsetClientEncoding(conn, GetDatabaseEncodingName()); + freeconn = true; + conname = NULL; + } + + *conn_p = conn; + *conname_p = conname; + *freeconn_p = freeconn; } static PGconn * dblink_get_named_conn(const char *conname) { - remoteConn *rconn = getConnectionByName(conname); + remoteConn *rconn = getConnectionByName(conname); - if (rconn) - return rconn->conn; + if (rconn) + return rconn->conn; - dblink_conn_not_avail(conname); - return NULL; /* keep compiler quiet */ + dblink_conn_not_avail(conname); + return NULL; /* keep compiler quiet */ } static void dblink_init(void) { - if (!pconn) - { - pconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, sizeof(remoteConn)); - pconn->conn = NULL; - pconn->openCursorCount = 0; - pconn->newXactForCursor = FALSE; - } + if (!pconn) + { + pconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, sizeof(remoteConn)); + pconn->conn = NULL; + pconn->openCursorCount = 0; + pconn->newXactForCursor = FALSE; + } } /* @@ -254,69 +256,69 @@ PG_FUNCTION_INFO_V1(dblink_connect); Datum dblink_connect(PG_FUNCTION_ARGS) { - char *conname_or_str = NULL; - char *connstr = NULL; - char *connname = NULL; - char *msg; - PGconn *conn = NULL; - remoteConn *rconn = NULL; - - dblink_init(); - - if (PG_NARGS() == 2) - { - conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(1)); - connname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else if (PG_NARGS() == 1) - conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(0)); - - if (connname) - rconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, - sizeof(remoteConn)); - - /* first check for valid foreign data server */ - connstr = get_connect_string(conname_or_str); - if (connstr == NULL) - connstr = conname_or_str; - - /* check password in connection string if not superuser */ - dblink_connstr_check(connstr); - conn = PQconnectdb(connstr); - - if (PQstatus(conn) == CONNECTION_BAD) - { - msg = pchomp(PQerrorMessage(conn)); - PQfinish(conn); - if (rconn) - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg("could not establish connection"), - errdetail_internal("%s", msg))); - } - - /* check password actually used if not superuser */ - dblink_security_check(conn, rconn); - - /* attempt to set client encoding to match server encoding, if needed */ - if (PQclientEncoding(conn) != GetDatabaseEncoding()) - PQsetClientEncoding(conn, GetDatabaseEncodingName()); - - if (connname) - { - rconn->conn = conn; - createNewConnection(connname, rconn); - } - else - { - if (pconn->conn) - PQfinish(pconn->conn); - pconn->conn = conn; - } - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + char *conname_or_str = NULL; + char *connstr = NULL; + char *connname = NULL; + char *msg; + PGconn *conn = NULL; + remoteConn *rconn = NULL; + + dblink_init(); + + if (PG_NARGS() == 2) + { + conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(1)); + connname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else if (PG_NARGS() == 1) + conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (connname) + rconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, + sizeof(remoteConn)); + + /* first check for valid foreign data server */ + 
connstr = get_connect_string(conname_or_str); + if (connstr == NULL) + connstr = conname_or_str; + + /* check password in connection string if not superuser */ + dblink_connstr_check(connstr); + conn = PQconnectdb(connstr); + + if (PQstatus(conn) == CONNECTION_BAD) + { + msg = pchomp(PQerrorMessage(conn)); + PQfinish(conn); + if (rconn) + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("could not establish connection"), + errdetail_internal("%s", msg))); + } + + /* check password actually used if not superuser */ + dblink_security_check(conn, rconn); + + /* attempt to set client encoding to match server encoding, if needed */ + if (PQclientEncoding(conn) != GetDatabaseEncoding()) + PQsetClientEncoding(conn, GetDatabaseEncodingName()); + + if (connname) + { + rconn->conn = conn; + createNewConnection(connname, rconn); + } + else + { + if (pconn->conn) + PQfinish(pconn->conn); + pconn->conn = conn; + } + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -326,35 +328,35 @@ PG_FUNCTION_INFO_V1(dblink_disconnect); Datum dblink_disconnect(PG_FUNCTION_ARGS) { - char *conname = NULL; - remoteConn *rconn = NULL; - PGconn *conn = NULL; - - dblink_init(); - - if (PG_NARGS() == 1) - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - else - conn = pconn->conn; - - if (!conn) - dblink_conn_not_avail(conname); - - PQfinish(conn); - if (rconn) - { - deleteConnection(conname); - pfree(rconn); - } - else - pconn->conn = NULL; - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + char *conname = NULL; + remoteConn *rconn = NULL; + PGconn *conn = NULL; + + dblink_init(); + + if (PG_NARGS() == 1) + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + else + conn = pconn->conn; + + if (!conn) + dblink_conn_not_avail(conname); + + PQfinish(conn); + if (rconn) + { + deleteConnection(conname); + pfree(rconn); + } + else + pconn->conn = NULL; + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -364,89 +366,89 @@ PG_FUNCTION_INFO_V1(dblink_open); Datum dblink_open(PG_FUNCTION_ARGS) { - PGresult *res = NULL; - PGconn *conn; - char *curname = NULL; - char *sql = NULL; - char *conname = NULL; - StringInfoData buf; - remoteConn *rconn = NULL; - bool fail = true; /* default to backward compatible behavior */ - - dblink_init(); - initStringInfo(&buf); - - if (PG_NARGS() == 2) - { - /* text,text */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - rconn = pconn; - } - else if (PG_NARGS() == 3) - { - /* might be text,text,text or text,text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - rconn = pconn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); - rconn = getConnectionByName(conname); - } - } - else if (PG_NARGS() == 4) - { - /* text,text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); - fail = PG_GETARG_BOOL(3); - rconn = getConnectionByName(conname); - } - - if (!rconn || !rconn->conn) - dblink_conn_not_avail(conname); - - conn = rconn->conn; - - /* If we are not in a transaction, 
start one */ - if (PQtransactionStatus(conn) == PQTRANS_IDLE) - { - res = PQexec(conn, "BEGIN"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - dblink_res_internalerror(conn, res, "begin error"); - PQclear(res); - rconn->newXactForCursor = TRUE; - - /* - * Since transaction state was IDLE, we force cursor count to - * initially be 0. This is needed as a previous ABORT might have wiped - * out our transaction without maintaining the cursor count for us. - */ - rconn->openCursorCount = 0; - } - - /* if we started a transaction, increment cursor count */ - if (rconn->newXactForCursor) - (rconn->openCursorCount)++; - - appendStringInfo(&buf, "DECLARE %s CURSOR FOR %s", curname, sql); - res = PQexec(conn, buf.data); - if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) - { - dblink_res_error(conn, conname, res, "could not open cursor", fail); - PG_RETURN_TEXT_P(cstring_to_text("ERROR")); - } - - PQclear(res); - PG_RETURN_TEXT_P(cstring_to_text("OK")); + PGresult *res = NULL; + PGconn *conn; + char *curname = NULL; + char *sql = NULL; + char *conname = NULL; + StringInfoData buf; + remoteConn *rconn = NULL; + bool fail = true; /* default to backward compatible behavior */ + + dblink_init(); + initStringInfo(&buf); + + if (PG_NARGS() == 2) + { + /* text,text */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rconn = pconn; + } + else if (PG_NARGS() == 3) + { + /* might be text,text,text or text,text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + rconn = pconn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); + rconn = getConnectionByName(conname); + } + } + else if (PG_NARGS() == 4) + { + /* text,text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); + fail = PG_GETARG_BOOL(3); + rconn = getConnectionByName(conname); + } + + if (!rconn || !rconn->conn) + dblink_conn_not_avail(conname); + + conn = rconn->conn; + + /* If we are not in a transaction, start one */ + if (PQtransactionStatus(conn) == PQTRANS_IDLE) + { + res = PQexec(conn, "BEGIN"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + dblink_res_internalerror(conn, res, "begin error"); + PQclear(res); + rconn->newXactForCursor = TRUE; + + /* + * Since transaction state was IDLE, we force cursor count to + * initially be 0. This is needed as a previous ABORT might have wiped + * out our transaction without maintaining the cursor count for us. 
+ */ + rconn->openCursorCount = 0; + } + + /* if we started a transaction, increment cursor count */ + if (rconn->newXactForCursor) + (rconn->openCursorCount)++; + + appendStringInfo(&buf, "DECLARE %s CURSOR FOR %s", curname, sql); + res = PQexec(conn, buf.data); + if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) + { + dblink_res_error(conn, conname, res, "could not open cursor", fail); + PG_RETURN_TEXT_P(cstring_to_text("ERROR")); + } + + PQclear(res); + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -456,83 +458,83 @@ PG_FUNCTION_INFO_V1(dblink_close); Datum dblink_close(PG_FUNCTION_ARGS) { - PGconn *conn; - PGresult *res = NULL; - char *curname = NULL; - char *conname = NULL; - StringInfoData buf; - remoteConn *rconn = NULL; - bool fail = true; /* default to backward compatible behavior */ - - dblink_init(); - initStringInfo(&buf); - - if (PG_NARGS() == 1) - { - /* text */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - rconn = pconn; - } - else if (PG_NARGS() == 2) - { - /* might be text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - rconn = pconn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - rconn = getConnectionByName(conname); - } - } - if (PG_NARGS() == 3) - { - /* text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - rconn = getConnectionByName(conname); - } - - if (!rconn || !rconn->conn) - dblink_conn_not_avail(conname); - - conn = rconn->conn; - - appendStringInfo(&buf, "CLOSE %s", curname); - - /* close the cursor */ - res = PQexec(conn, buf.data); - if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) - { - dblink_res_error(conn, conname, res, "could not close cursor", fail); - PG_RETURN_TEXT_P(cstring_to_text("ERROR")); - } - - PQclear(res); - - /* if we started a transaction, decrement cursor count */ - if (rconn->newXactForCursor) - { - (rconn->openCursorCount)--; - - /* if count is zero, commit the transaction */ - if (rconn->openCursorCount == 0) - { - rconn->newXactForCursor = FALSE; - - res = PQexec(conn, "COMMIT"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - dblink_res_internalerror(conn, res, "commit error"); - PQclear(res); - } - } - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + PGconn *conn; + PGresult *res = NULL; + char *curname = NULL; + char *conname = NULL; + StringInfoData buf; + remoteConn *rconn = NULL; + bool fail = true; /* default to backward compatible behavior */ + + dblink_init(); + initStringInfo(&buf); + + if (PG_NARGS() == 1) + { + /* text */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + rconn = pconn; + } + else if (PG_NARGS() == 2) + { + /* might be text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + rconn = pconn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rconn = getConnectionByName(conname); + } + } + if (PG_NARGS() == 3) + { + /* text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + rconn = getConnectionByName(conname); + } + + if (!rconn || !rconn->conn) + dblink_conn_not_avail(conname); + + conn = rconn->conn; + + appendStringInfo(&buf, "CLOSE %s", 
curname); + + /* close the cursor */ + res = PQexec(conn, buf.data); + if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) + { + dblink_res_error(conn, conname, res, "could not close cursor", fail); + PG_RETURN_TEXT_P(cstring_to_text("ERROR")); + } + + PQclear(res); + + /* if we started a transaction, decrement cursor count */ + if (rconn->newXactForCursor) + { + (rconn->openCursorCount)--; + + /* if count is zero, commit the transaction */ + if (rconn->openCursorCount == 0) + { + rconn->newXactForCursor = FALSE; + + res = PQexec(conn, "COMMIT"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + dblink_res_internalerror(conn, res, "commit error"); + PQclear(res); + } + } + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -542,91 +544,91 @@ PG_FUNCTION_INFO_V1(dblink_fetch); Datum dblink_fetch(PG_FUNCTION_ARGS) { - PGresult *res = NULL; - char *conname = NULL; - remoteConn *rconn = NULL; - PGconn *conn = NULL; - StringInfoData buf; - char *curname = NULL; - int howmany = 0; - bool fail = true; /* default to backward compatible */ - - prepTuplestoreResult(fcinfo); - - dblink_init(); - - if (PG_NARGS() == 4) - { - /* text,text,int,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - howmany = PG_GETARG_INT32(2); - fail = PG_GETARG_BOOL(3); - - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - else if (PG_NARGS() == 3) - { - /* text,text,int or text,int,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - howmany = PG_GETARG_INT32(1); - fail = PG_GETARG_BOOL(2); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - howmany = PG_GETARG_INT32(2); - - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - } - else if (PG_NARGS() == 2) - { - /* text,int */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - howmany = PG_GETARG_INT32(1); - conn = pconn->conn; - } - - if (!conn) - dblink_conn_not_avail(conname); - - initStringInfo(&buf); - appendStringInfo(&buf, "FETCH %d FROM %s", howmany, curname); - - /* - * Try to execute the query. Note that since libpq uses malloc, the - * PGresult will be long-lived even though we are still in a short-lived - * memory context. 
- */ - res = PQexec(conn, buf.data); - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - dblink_res_error(conn, conname, res, - "could not fetch from cursor", fail); - return (Datum) 0; - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* cursor does not exist - closed already or bad name */ - PQclear(res); - ereport(ERROR, - (errcode(ERRCODE_INVALID_CURSOR_NAME), - errmsg("cursor \"%s\" does not exist", curname))); - } - - materializeResult(fcinfo, conn, res); - return (Datum) 0; + PGresult *res = NULL; + char *conname = NULL; + remoteConn *rconn = NULL; + PGconn *conn = NULL; + StringInfoData buf; + char *curname = NULL; + int howmany = 0; + bool fail = true; /* default to backward compatible */ + + prepTuplestoreResult(fcinfo); + + dblink_init(); + + if (PG_NARGS() == 4) + { + /* text,text,int,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + howmany = PG_GETARG_INT32(2); + fail = PG_GETARG_BOOL(3); + + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + else if (PG_NARGS() == 3) + { + /* text,text,int or text,int,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + howmany = PG_GETARG_INT32(1); + fail = PG_GETARG_BOOL(2); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + howmany = PG_GETARG_INT32(2); + + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + } + else if (PG_NARGS() == 2) + { + /* text,int */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + howmany = PG_GETARG_INT32(1); + conn = pconn->conn; + } + + if (!conn) + dblink_conn_not_avail(conname); + + initStringInfo(&buf); + appendStringInfo(&buf, "FETCH %d FROM %s", howmany, curname); + + /* + * Try to execute the query. Note that since libpq uses malloc, the + * PGresult will be long-lived even though we are still in a short-lived + * memory context. 
+ */ + res = PQexec(conn, buf.data); + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + dblink_res_error(conn, conname, res, + "could not fetch from cursor", fail); + return (Datum) 0; + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* cursor does not exist - closed already or bad name */ + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_NAME), + errmsg("cursor \"%s\" does not exist", curname))); + } + + materializeResult(fcinfo, conn, res); + return (Datum) 0; } /* @@ -636,158 +638,382 @@ PG_FUNCTION_INFO_V1(dblink_record); Datum dblink_record(PG_FUNCTION_ARGS) { - return dblink_record_internal(fcinfo, false); + return dblink_record_internal(fcinfo, false); } PG_FUNCTION_INFO_V1(dblink_send_query); Datum dblink_send_query(PG_FUNCTION_ARGS) { - PGconn *conn; - char *sql; - int retval; - - if (PG_NARGS() == 2) - { - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - - /* async query send */ - retval = PQsendQuery(conn, sql); - if (retval != 1) - elog(NOTICE, "could not send query: %s", pchomp(PQerrorMessage(conn))); - - PG_RETURN_INT32(retval); + PGconn *conn; + char *sql; + int retval; + + if (PG_NARGS() == 2) + { + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + + /* async query send */ + retval = PQsendQuery(conn, sql); + if (retval != 1) + elog(NOTICE, "could not send query: %s", pchomp(PQerrorMessage(conn))); + + PG_RETURN_INT32(retval); } PG_FUNCTION_INFO_V1(dblink_get_result); Datum dblink_get_result(PG_FUNCTION_ARGS) { - return dblink_record_internal(fcinfo, true); + return dblink_record_internal(fcinfo, true); } static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async) { - PGconn *volatile conn = NULL; - volatile bool freeconn = false; - - prepTuplestoreResult(fcinfo); - - dblink_init(); - - PG_TRY(); - { - char *sql = NULL; - char *conname = NULL; - bool fail = true; /* default to backward compatible */ - - if (!is_async) - { - if (PG_NARGS() == 3) - { - /* text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - else if (PG_NARGS() == 2) - { - /* text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - } - else if (PG_NARGS() == 1) - { - /* text */ - conn = pconn->conn; - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - } - else /* is_async */ - { - /* get async result */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - - if (PG_NARGS() == 2) - { - /* text,bool */ - fail = PG_GETARG_BOOL(1); - conn = dblink_get_named_conn(conname); - } - else if (PG_NARGS() == 1) - { - /* text */ - conn = dblink_get_named_conn(conname); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - } - - if (!conn) - dblink_conn_not_avail(conname); - - if 
(!is_async) - { - /* synchronous query, use efficient tuple collection method */ - materializeQueryResult(fcinfo, conn, conname, sql, fail); - } - else - { - /* async result retrieval, do it the old way */ - PGresult *res = PQgetResult(conn); - - /* NULL means we're all done with the async results */ - if (res) - { - if (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK) - { - dblink_res_error(conn, conname, res, - "could not execute query", fail); - /* if fail isn't set, we'll return an empty query result */ - } - else - { - materializeResult(fcinfo, conn, res); - } - } - } - } - PG_CATCH(); - { - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - - return (Datum) 0; + PGconn *volatile conn = NULL; + volatile bool freeconn = false; + + prepTuplestoreResult(fcinfo); + + dblink_init(); + + PG_TRY(); + { + char *sql = NULL; + char *conname = NULL; + bool fail = true; /* default to backward compatible */ + + if (!is_async) + { + if (PG_NARGS() == 3) + { + /* text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + else if (PG_NARGS() == 2) + { + /* text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + } + else if (PG_NARGS() == 1) + { + /* text */ + conn = pconn->conn; + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + } + else /* is_async */ + { + /* get async result */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (PG_NARGS() == 2) + { + /* text,bool */ + fail = PG_GETARG_BOOL(1); + conn = dblink_get_named_conn(conname); + } + else if (PG_NARGS() == 1) + { + /* text */ + conn = dblink_get_named_conn(conname); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + } + + if (!conn) + dblink_conn_not_avail(conname); + + if (!is_async) + { + /* synchronous query, use efficient tuple collection method */ + materializeQueryResult(fcinfo, conn, conname, sql, fail); + } + else + { + /* async result retrieval, do it the old way */ + PGresult *res = PQgetResult(conn); + + /* NULL means we're all done with the async results */ + if (res) + { + if (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK) + { + dblink_res_error(conn, conname, res, + "could not execute query", fail); + /* if fail isn't set, we'll return an empty query result */ + } + else + { + materializeResult(fcinfo, conn, res); + } + } + } + } + PG_CATCH(); + { + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + + return (Datum) 0; +} + +static StringInfo copybuf = NULL; +static char *tmp_cbuf = NULL; +static PGconn *copy_conn = NULL; + +static int +receive_copy_data(PGconn *conn, char **buffer) +{ + int rawlen; + + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + + 
/* Try to receive a CopyData message */
+    rawlen = PQgetCopyData(conn, &tmp_cbuf, 0);
+
+    if (rawlen < -1)
+    {
+        if (tmp_cbuf != NULL)
+            PQfreemem(tmp_cbuf);
+        tmp_cbuf = NULL;
+
+        ereport(ERROR,
+                (errmsg("could not receive data from stream: %s",
+                        pchomp(PQerrorMessage(conn)))));
+    }
+
+    /* Return the received message to the caller */
+    *buffer = tmp_cbuf;
+    return rawlen;
+}
+
+static int
+copy_read_data(void *outbuf, int minread, int maxread)
+{
+    int         bytesread = 0;
+    int         avail;
+
+    /* If there is leftover data from a previous read, use it first. */
+    avail = copybuf->len - copybuf->cursor;
+    if (avail)
+    {
+        if (avail > maxread)
+            avail = maxread;
+        memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+        copybuf->cursor += avail;
+        maxread -= avail;
+        bytesread += avail;
+    }
+
+    while (maxread > 0 && bytesread < minread)
+    {
+        int         len;
+        char       *buf = NULL;
+
+        for (;;)
+        {
+            /* Try to read the data. */
+            len = receive_copy_data(copy_conn, &buf);
+
+            CHECK_FOR_INTERRUPTS();
+
+            if (len < 0)
+                return bytesread;
+            else
+            {
+                /* Process the data */
+                copybuf->data = buf;
+                copybuf->len = len;
+                copybuf->cursor = 0;
+
+                avail = copybuf->len - copybuf->cursor;
+                if (avail > maxread)
+                    avail = maxread;
+                memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+                outbuf = (void *) ((char *) outbuf + avail);
+                copybuf->cursor += avail;
+                maxread -= avail;
+                bytesread += avail;
+            }
+
+            if (maxread <= 0 || bytesread >= minread)
+                return bytesread;
+        }
+    }
+
+    return bytesread;
+}
+
+/*
+ * Return true if rtblname is actually a SELECT query (optionally wrapped in
+ * parentheses) rather than a plain table name.
+ */
+static bool isRemoteTableAsSelect(char * rtblname)
+{
+    char *tmp = rtblname;
+
+    while (tmp != NULL)
+    {
+        if (*tmp == ' ')
+        {
+            ++tmp;
+            continue;
+        }
+
+        if (*tmp == '(' )
+            ++ tmp;
+
+        if (0 == pg_strncasecmp(tmp, "SELECT", 6))
+            return true;
+        else
+            return false;
+    }
+
+    return false;
+}
+
+/*
+ * Copy a remote table to a local table.
+ *
+ * Issue 'COPY ... TO STDOUT' on the remote server and buffer the incoming
+ * data; the local server then uses COPY FROM to load that data into the
+ * target table directly.
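+ *
+ * A minimal usage sketch (schema, table, and connection values below are
+ * placeholders):
+ *
+ *     SELECT dblink_copy_table('public', 'local_tbl',
+ *                              'public', 'remote_tbl',
+ *                              'host=remotehost port=5432 dbname=testdb');
+ *
+ * This loads the contents of public.remote_tbl on the remote server into
+ * the existing local table public.local_tbl.  The last argument is resolved
+ * by dblink_get_conn(), so a named dblink connection or a foreign server
+ * name should work as well as a raw libpq connection string.  The fourth
+ * argument may also be a SELECT query (see isRemoteTableAsSelect()), in
+ * which case the query's result set is copied instead of a whole table.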
+ */ +static void +copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, + char *connstr) +{ + bool freeconn = false; + char *conname = connstr; + PGconn *conn = NULL; + ParseState *pstate = NULL; + Relation rel = NULL; + + dblink_init(); + + PG_TRY(); + { + StringInfoData cmd; + PGresult *res; + Oid namespaceId; + Oid relId; + CopyState cstate; + + dblink_get_conn(conname, &conn, &conname, &freeconn); + copy_conn = conn; + if (conn == NULL) + elog(ERROR, "failed to connect to remote server"); + + initStringInfo(&cmd); + + /* Send copy statement to remote server */ + if (isRemoteTableAsSelect(rtblname) ) + { + appendStringInfo(&cmd, "COPY (%s) TO STDOUT", rtblname); + } + else + { + appendStringInfo(&cmd, "COPY %s TO STDOUT", + quote_qualified_identifier(rnspname, rtblname)); + } + + if (!PQsendQuery(conn, cmd.data)) + elog(ERROR, "failed to get data stream from remote server"); + + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COPY_OUT) + elog(ERROR, "get bad stream status from remote server"); + + namespaceId = LookupExplicitNamespace(nspname, false); + relId = get_relname_relid(tblname, namespaceId); + rel = heap_open(relId, RowExclusiveLock); + + copybuf = makeStringInfo(); + pstate = make_parsestate(NULL); + addRangeTableEntryForRelation(pstate, rel, NULL, false, false); + + cstate = BeginCopyFrom(pstate, rel, NULL, false, copy_read_data, NULL, NIL); + + (void) CopyFrom(cstate); + EndCopyFrom(cstate); + + relation_close(rel, RowExclusiveLock); + } + PG_CATCH(); + { + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + if (freeconn) + PQfinish(conn); +} + +PG_FUNCTION_INFO_V1(dblink_copy_table); +Datum +dblink_copy_table(PG_FUNCTION_ARGS) +{ + char *nspname; + char *tblname; + char *rnspname; + char *rtblname; + char *connstr; + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3) + || PG_ARGISNULL(4)) + elog(ERROR, "function argument has null values"); + + nspname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + tblname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rnspname = text_to_cstring(PG_GETARG_TEXT_PP(2)); + rtblname = text_to_cstring(PG_GETARG_TEXT_PP(3)); + connstr = text_to_cstring(PG_GETARG_TEXT_PP(4)); + + copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); + + return (Datum) 0; } /* @@ -799,24 +1025,24 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async) static void prepTuplestoreResult(FunctionCallInfo fcinfo) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - - /* check to see if query supports us returning a tuplestore */ - if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("set-valued function called in context that cannot accept a set"))); - if (!(rsinfo->allowedModes & SFRM_Materialize)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("materialize mode required, but it is not allowed in this context"))); - - /* let the executor know we're sending back a tuplestore */ - rsinfo->returnMode = SFRM_Materialize; - - /* caller must fill these to return a non-empty result */ - rsinfo->setResult = NULL; - rsinfo->setDesc = NULL; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + /* check to see if query supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* let the executor know we're sending back a tuplestore */ + rsinfo->returnMode = SFRM_Materialize; + + /* caller must fill these to return a non-empty result */ + rsinfo->setResult = NULL; + rsinfo->setDesc = NULL; } /* @@ -827,139 +1053,139 @@ prepTuplestoreResult(FunctionCallInfo fcinfo) static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - - /* prepTuplestoreResult must have been called previously */ - Assert(rsinfo->returnMode == SFRM_Materialize); - - PG_TRY(); - { - TupleDesc tupdesc; - bool is_sql_cmd; - int ntuples; - int nfields; - - if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - is_sql_cmd = true; - - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - ntuples = 1; - nfields = 1; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - - is_sql_cmd = false; - - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } - - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - ntuples = PQntuples(res); - nfields = PQnfields(res); - } - - /* - * check result and tuple descriptor have the same number of columns - */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); - - if (ntuples > 0) - { - AttInMetadata *attinmeta; - int nestlevel = -1; - Tuplestorestate *tupstore; - MemoryContext oldcontext; - int row; - char **values; - - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - /* Set GUCs to ensure we read GUC-sensitive data types correctly */ - if (!is_sql_cmd) - nestlevel = applyRemoteGucs(conn); - - oldcontext = MemoryContextSwitchTo( - rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - values = (char **) palloc(nfields * sizeof(char *)); - - /* put all tuples into the tuplestore */ - for (row = 0; row < ntuples; row++) - { - HeapTuple tuple; - - if (!is_sql_cmd) - { - int i; - - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, row, i)) - values[i] = NULL; - else - values[i] = PQgetvalue(res, row, i); - } - } - else - { - values[0] = PQcmdStatus(res); - } - - /* build the tuple and put it into the tuplestore. 
*/ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); - } - - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); - - /* clean up and return the tuplestore */ - tuplestore_donestoring(tupstore); - } - - PQclear(res); - } - PG_CATCH(); - { - /* be sure to release the libpq result */ - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + /* prepTuplestoreResult must have been called previously */ + Assert(rsinfo->returnMode == SFRM_Materialize); + + PG_TRY(); + { + TupleDesc tupdesc; + bool is_sql_cmd; + int ntuples; + int nfields; + + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + is_sql_cmd = true; + + /* + * need a tuple descriptor representing one TEXT column to return + * the command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + ntuples = 1; + nfields = 1; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + + is_sql_cmd = false; + + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } + + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + ntuples = PQntuples(res); + nfields = PQnfields(res); + } + + /* + * check result and tuple descriptor have the same number of columns + */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); + + if (ntuples > 0) + { + AttInMetadata *attinmeta; + int nestlevel = -1; + Tuplestorestate *tupstore; + MemoryContext oldcontext; + int row; + char **values; + + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + /* Set GUCs to ensure we read GUC-sensitive data types correctly */ + if (!is_sql_cmd) + nestlevel = applyRemoteGucs(conn); + + oldcontext = MemoryContextSwitchTo( + rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + values = (char **) palloc(nfields * sizeof(char *)); + + /* put all tuples into the tuplestore */ + for (row = 0; row < ntuples; row++) + { + HeapTuple tuple; + + if (!is_sql_cmd) + { + int i; + + for (i = 0; i < nfields; i++) + { + if (PQgetisnull(res, row, i)) + values[i] = NULL; + else + values[i] = PQgetvalue(res, row, i); + } + } + else + { + values[0] = PQcmdStatus(res); + } + + /* build the tuple and put it into the tuplestore. 
*/ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); + } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + } + + PQclear(res); + } + PG_CATCH(); + { + /* be sure to release the libpq result */ + PQclear(res); + PG_RE_THROW(); + } + PG_END_TRY(); } /* @@ -972,117 +1198,117 @@ materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) */ static void materializeQueryResult(FunctionCallInfo fcinfo, - PGconn *conn, - const char *conname, - const char *sql, - bool fail) + PGconn *conn, + const char *conname, + const char *sql, + bool fail) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - PGresult *volatile res = NULL; - volatile storeInfo sinfo; - - /* prepTuplestoreResult must have been called previously */ - Assert(rsinfo->returnMode == SFRM_Materialize); - - /* initialize storeInfo to empty */ - memset((void *) &sinfo, 0, sizeof(sinfo)); - sinfo.fcinfo = fcinfo; - - PG_TRY(); - { - /* Create short-lived memory context for data conversions */ - sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, - "dblink temporary context", - ALLOCSET_DEFAULT_SIZES); - - /* execute query, collecting any tuples into the tuplestore */ - res = storeQueryResult(&sinfo, conn, sql); - - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - /* - * dblink_res_error will clear the passed PGresult, so we need - * this ugly dance to avoid doing so twice during error exit - */ - PGresult *res1 = res; - - res = NULL; - dblink_res_error(conn, conname, res1, - "could not execute query", fail); - /* if fail isn't set, we'll return an empty query result */ - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* - * storeRow didn't get called, so we need to convert the command - * status string to a tuple manually - */ - TupleDesc tupdesc; - AttInMetadata *attinmeta; - Tuplestorestate *tupstore; - HeapTuple tuple; - char *values[1]; - MemoryContext oldcontext; - - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - oldcontext = MemoryContextSwitchTo( - rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - values[0] = PQcmdStatus(res); - - /* build the tuple and put it into the tuplestore. 
*/ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); - - PQclear(res); - res = NULL; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - /* storeRow should have created a tuplestore */ - Assert(rsinfo->setResult != NULL); - - PQclear(res); - res = NULL; - } - - /* clean up data conversion short-lived memory context */ - if (sinfo.tmpcontext != NULL) - MemoryContextDelete(sinfo.tmpcontext); - sinfo.tmpcontext = NULL; - - PQclear(sinfo.last_res); - sinfo.last_res = NULL; - PQclear(sinfo.cur_res); - sinfo.cur_res = NULL; - } - PG_CATCH(); - { - /* be sure to release any libpq result we collected */ - PQclear(res); - PQclear(sinfo.last_res); - PQclear(sinfo.cur_res); - /* and clear out any pending data in libpq */ - while ((res = PQgetResult(conn)) != NULL) - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + PGresult *volatile res = NULL; + volatile storeInfo sinfo; + + /* prepTuplestoreResult must have been called previously */ + Assert(rsinfo->returnMode == SFRM_Materialize); + + /* initialize storeInfo to empty */ + memset((void *) &sinfo, 0, sizeof(sinfo)); + sinfo.fcinfo = fcinfo; + + PG_TRY(); + { + /* Create short-lived memory context for data conversions */ + sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "dblink temporary context", + ALLOCSET_DEFAULT_SIZES); + + /* execute query, collecting any tuples into the tuplestore */ + res = storeQueryResult(&sinfo, conn, sql); + + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + /* + * dblink_res_error will clear the passed PGresult, so we need + * this ugly dance to avoid doing so twice during error exit + */ + PGresult *res1 = res; + + res = NULL; + dblink_res_error(conn, conname, res1, + "could not execute query", fail); + /* if fail isn't set, we'll return an empty query result */ + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* + * storeRow didn't get called, so we need to convert the command + * status string to a tuple manually + */ + TupleDesc tupdesc; + AttInMetadata *attinmeta; + Tuplestorestate *tupstore; + HeapTuple tuple; + char *values[1]; + MemoryContext oldcontext; + + /* + * need a tuple descriptor representing one TEXT column to return + * the command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + oldcontext = MemoryContextSwitchTo( + rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + values[0] = PQcmdStatus(res); + + /* build the tuple and put it into the tuplestore. 
*/ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); + + PQclear(res); + res = NULL; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + /* storeRow should have created a tuplestore */ + Assert(rsinfo->setResult != NULL); + + PQclear(res); + res = NULL; + } + + /* clean up data conversion short-lived memory context */ + if (sinfo.tmpcontext != NULL) + MemoryContextDelete(sinfo.tmpcontext); + sinfo.tmpcontext = NULL; + + PQclear(sinfo.last_res); + sinfo.last_res = NULL; + PQclear(sinfo.cur_res); + sinfo.cur_res = NULL; + } + PG_CATCH(); + { + /* be sure to release any libpq result we collected */ + PQclear(res); + PQclear(sinfo.last_res); + PQclear(sinfo.cur_res); + /* and clear out any pending data in libpq */ + while ((res = PQgetResult(conn)) != NULL) + PQclear(res); + PG_RE_THROW(); + } + PG_END_TRY(); } /* @@ -1091,63 +1317,63 @@ materializeQueryResult(FunctionCallInfo fcinfo, static PGresult * storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) { - bool first = true; - int nestlevel = -1; - PGresult *res; - - if (!PQsendQuery(conn, sql)) - elog(ERROR, "could not send query: %s", pchomp(PQerrorMessage(conn))); - - if (!PQsetSingleRowMode(conn)) /* shouldn't fail */ - elog(ERROR, "failed to set single-row mode for dblink query"); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - sinfo->cur_res = PQgetResult(conn); - if (!sinfo->cur_res) - break; - - if (PQresultStatus(sinfo->cur_res) == PGRES_SINGLE_TUPLE) - { - /* got one row from possibly-bigger resultset */ - - /* - * Set GUCs to ensure we read GUC-sensitive data types correctly. - * We shouldn't do this until we have a row in hand, to ensure - * libpq has seen any earlier ParameterStatus protocol messages. - */ - if (first && nestlevel < 0) - nestlevel = applyRemoteGucs(conn); - - storeRow(sinfo, sinfo->cur_res, first); - - PQclear(sinfo->cur_res); - sinfo->cur_res = NULL; - first = false; - } - else - { - /* if empty resultset, fill tuplestore header */ - if (first && PQresultStatus(sinfo->cur_res) == PGRES_TUPLES_OK) - storeRow(sinfo, sinfo->cur_res, first); - - /* store completed result at last_res */ - PQclear(sinfo->last_res); - sinfo->last_res = sinfo->cur_res; - sinfo->cur_res = NULL; - first = true; - } - } - - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); - - /* return last_res */ - res = sinfo->last_res; - sinfo->last_res = NULL; - return res; + bool first = true; + int nestlevel = -1; + PGresult *res; + + if (!PQsendQuery(conn, sql)) + elog(ERROR, "could not send query: %s", pchomp(PQerrorMessage(conn))); + + if (!PQsetSingleRowMode(conn)) /* shouldn't fail */ + elog(ERROR, "failed to set single-row mode for dblink query"); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + sinfo->cur_res = PQgetResult(conn); + if (!sinfo->cur_res) + break; + + if (PQresultStatus(sinfo->cur_res) == PGRES_SINGLE_TUPLE) + { + /* got one row from possibly-bigger resultset */ + + /* + * Set GUCs to ensure we read GUC-sensitive data types correctly. + * We shouldn't do this until we have a row in hand, to ensure + * libpq has seen any earlier ParameterStatus protocol messages. 
+ */ + if (first && nestlevel < 0) + nestlevel = applyRemoteGucs(conn); + + storeRow(sinfo, sinfo->cur_res, first); + + PQclear(sinfo->cur_res); + sinfo->cur_res = NULL; + first = false; + } + else + { + /* if empty resultset, fill tuplestore header */ + if (first && PQresultStatus(sinfo->cur_res) == PGRES_TUPLES_OK) + storeRow(sinfo, sinfo->cur_res, first); + + /* store completed result at last_res */ + PQclear(sinfo->last_res); + sinfo->last_res = sinfo->cur_res; + sinfo->cur_res = NULL; + first = true; + } + } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); + + /* return last_res */ + res = sinfo->last_res; + sinfo->last_res = NULL; + return res; } /* @@ -1159,107 +1385,107 @@ storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first) { - int nfields = PQnfields(res); - HeapTuple tuple; - int i; - MemoryContext oldcontext; - - if (first) - { - /* Prepare for new result set */ - ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo; - TupleDesc tupdesc; - - /* - * It's possible to get more than one result set if the query string - * contained multiple SQL commands. In that case, we follow PQexec's - * traditional behavior of throwing away all but the last result. - */ - if (sinfo->tuplestore) - tuplestore_end(sinfo->tuplestore); - sinfo->tuplestore = NULL; - - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } - - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - - /* check result and tuple descriptor have the same number of columns */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); - - /* Prepare attinmeta for later data conversions */ - sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc); - - /* Create a new, empty tuplestore */ - oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); - sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = sinfo->tuplestore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - /* Done if empty resultset */ - if (PQntuples(res) == 0) - return; - - /* - * Set up sufficiently-wide string pointers array; this won't change - * in size so it's easy to preallocate. - */ - if (sinfo->cstrs) - pfree(sinfo->cstrs); - sinfo->cstrs = (char **) palloc(nfields * sizeof(char *)); - } - - /* Should have a single-row result if we get here */ - Assert(PQntuples(res) == 1); - - /* - * Do the following work in a temp context that we reset after each tuple. - * This cleans up not only the data we have direct access to, but any - * cruft the I/O functions might leak. - */ - oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext); - - /* - * Fill cstrs with null-terminated strings of column values. 
- */ - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, 0, i)) - sinfo->cstrs[i] = NULL; - else - sinfo->cstrs[i] = PQgetvalue(res, 0, i); - } - - /* Convert row to a tuple, and add it to the tuplestore */ - tuple = BuildTupleFromCStrings(sinfo->attinmeta, sinfo->cstrs); - - tuplestore_puttuple(sinfo->tuplestore, tuple); - - /* Clean up */ - MemoryContextSwitchTo(oldcontext); - MemoryContextReset(sinfo->tmpcontext); + int nfields = PQnfields(res); + HeapTuple tuple; + int i; + MemoryContext oldcontext; + + if (first) + { + /* Prepare for new result set */ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo; + TupleDesc tupdesc; + + /* + * It's possible to get more than one result set if the query string + * contained multiple SQL commands. In that case, we follow PQexec's + * traditional behavior of throwing away all but the last result. + */ + if (sinfo->tuplestore) + tuplestore_end(sinfo->tuplestore); + sinfo->tuplestore = NULL; + + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } + + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + + /* check result and tuple descriptor have the same number of columns */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); + + /* Prepare attinmeta for later data conversions */ + sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + /* Create a new, empty tuplestore */ + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = sinfo->tuplestore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + /* Done if empty resultset */ + if (PQntuples(res) == 0) + return; + + /* + * Set up sufficiently-wide string pointers array; this won't change + * in size so it's easy to preallocate. + */ + if (sinfo->cstrs) + pfree(sinfo->cstrs); + sinfo->cstrs = (char **) palloc(nfields * sizeof(char *)); + } + + /* Should have a single-row result if we get here */ + Assert(PQntuples(res) == 1); + + /* + * Do the following work in a temp context that we reset after each tuple. + * This cleans up not only the data we have direct access to, but any + * cruft the I/O functions might leak. + */ + oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext); + + /* + * Fill cstrs with null-terminated strings of column values. 
+ */ + for (i = 0; i < nfields; i++) + { + if (PQgetisnull(res, 0, i)) + sinfo->cstrs[i] = NULL; + else + sinfo->cstrs[i] = PQgetvalue(res, 0, i); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(sinfo->attinmeta, sinfo->cstrs); + + tuplestore_puttuple(sinfo->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(sinfo->tmpcontext); } /* @@ -1271,27 +1497,27 @@ PG_FUNCTION_INFO_V1(dblink_get_connections); Datum dblink_get_connections(PG_FUNCTION_ARGS) { - HASH_SEQ_STATUS status; - remoteConnHashEnt *hentry; - ArrayBuildState *astate = NULL; - - if (remoteConnHash) - { - hash_seq_init(&status, remoteConnHash); - while ((hentry = (remoteConnHashEnt *) hash_seq_search(&status)) != NULL) - { - /* stash away current value */ - astate = accumArrayResult(astate, - CStringGetTextDatum(hentry->name), - false, TEXTOID, CurrentMemoryContext); - } - } - - if (astate) - PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, - CurrentMemoryContext)); - else - PG_RETURN_NULL(); + HASH_SEQ_STATUS status; + remoteConnHashEnt *hentry; + ArrayBuildState *astate = NULL; + + if (remoteConnHash) + { + hash_seq_init(&status, remoteConnHash); + while ((hentry = (remoteConnHashEnt *) hash_seq_search(&status)) != NULL) + { + /* stash away current value */ + astate = accumArrayResult(astate, + CStringGetTextDatum(hentry->name), + false, TEXTOID, CurrentMemoryContext); + } + } + + if (astate) + PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, + CurrentMemoryContext)); + else + PG_RETURN_NULL(); } /* @@ -1299,53 +1525,53 @@ dblink_get_connections(PG_FUNCTION_ARGS) * * Returns 1 if the connection is busy, 0 otherwise * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_is_busy); Datum dblink_is_busy(PG_FUNCTION_ARGS) { - PGconn *conn; + PGconn *conn; - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - PQconsumeInput(conn); - PG_RETURN_INT32(PQisBusy(conn)); + PQconsumeInput(conn); + PG_RETURN_INT32(PQisBusy(conn)); } /* * Cancels a running request on a connection * * Returns text: - * "OK" if the cancel request has been sent correctly, - * an error message otherwise + * "OK" if the cancel request has been sent correctly, + * an error message otherwise * * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_cancel_query); Datum dblink_cancel_query(PG_FUNCTION_ARGS) { - int res; - PGconn *conn; - PGcancel *cancel; - char errbuf[256]; - - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - cancel = PQgetCancel(conn); - - res = PQcancel(cancel, errbuf, 256); - PQfreeCancel(cancel); - - if (res == 1) - PG_RETURN_TEXT_P(cstring_to_text("OK")); - else - PG_RETURN_TEXT_P(cstring_to_text(errbuf)); + int res; + PGconn *conn; + PGcancel *cancel; + char errbuf[256]; + + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + cancel = PQgetCancel(conn); + + res = PQcancel(cancel, errbuf, 256); + PQfreeCancel(cancel); + + if (res == 1) + PG_RETURN_TEXT_P(cstring_to_text("OK")); + else + PG_RETURN_TEXT_P(cstring_to_text(errbuf)); } @@ -1353,27 +1579,27 @@ dblink_cancel_query(PG_FUNCTION_ARGS) * Get error message from a connection * * Returns text: - * 
"OK" if no error, an error message otherwise + * "OK" if no error, an error message otherwise * * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_error_message); Datum dblink_error_message(PG_FUNCTION_ARGS) { - char *msg; - PGconn *conn; + char *msg; + PGconn *conn; - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - msg = PQerrorMessage(conn); - if (msg == NULL || msg[0] == '\0') - PG_RETURN_TEXT_P(cstring_to_text("OK")); - else - PG_RETURN_TEXT_P(cstring_to_text(pchomp(msg))); + msg = PQerrorMessage(conn); + if (msg == NULL || msg[0] == '\0') + PG_RETURN_TEXT_P(cstring_to_text("OK")); + else + PG_RETURN_TEXT_P(cstring_to_text(pchomp(msg))); } /* @@ -1383,101 +1609,101 @@ PG_FUNCTION_INFO_V1(dblink_exec); Datum dblink_exec(PG_FUNCTION_ARGS) { - text *volatile sql_cmd_status = NULL; - PGconn *volatile conn = NULL; - volatile bool freeconn = false; - - dblink_init(); - - PG_TRY(); - { - PGresult *res = NULL; - char *sql = NULL; - char *conname = NULL; - bool fail = true; /* default to backward compatible behavior */ - - if (PG_NARGS() == 3) - { - /* must be text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - else if (PG_NARGS() == 2) - { - /* might be text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - } - else if (PG_NARGS() == 1) - { - /* must be single text argument */ - conn = pconn->conn; - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - - if (!conn) - dblink_conn_not_avail(conname); - - res = PQexec(conn, sql); - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - dblink_res_error(conn, conname, res, - "could not execute command", fail); - - /* - * and save a copy of the command status string to return as our - * result tuple - */ - sql_cmd_status = cstring_to_text("ERROR"); - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* - * and save a copy of the command status string to return as our - * result tuple - */ - sql_cmd_status = cstring_to_text(PQcmdStatus(res)); - PQclear(res); - } - else - { - PQclear(res); - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("statement returning results not allowed"))); - } - } - PG_CATCH(); - { - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - - PG_RETURN_TEXT_P(sql_cmd_status); + text *volatile sql_cmd_status = NULL; + PGconn *volatile conn = NULL; + volatile bool freeconn = false; + + dblink_init(); + + PG_TRY(); + { + PGresult *res = NULL; + char *sql = NULL; + char *conname = NULL; + bool fail = true; /* default to backward compatible behavior */ + + if (PG_NARGS() == 3) + { + /* must be text,text,bool */ + 
conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + else if (PG_NARGS() == 2) + { + /* might be text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + } + else if (PG_NARGS() == 1) + { + /* must be single text argument */ + conn = pconn->conn; + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + + if (!conn) + dblink_conn_not_avail(conname); + + res = PQexec(conn, sql); + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + dblink_res_error(conn, conname, res, + "could not execute command", fail); + + /* + * and save a copy of the command status string to return as our + * result tuple + */ + sql_cmd_status = cstring_to_text("ERROR"); + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* + * and save a copy of the command status string to return as our + * result tuple + */ + sql_cmd_status = cstring_to_text(PQcmdStatus(res)); + PQclear(res); + } + else + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("statement returning results not allowed"))); + } + } + PG_CATCH(); + { + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + + PG_RETURN_TEXT_P(sql_cmd_status); } @@ -1491,104 +1717,104 @@ PG_FUNCTION_INFO_V1(dblink_get_pkey); Datum dblink_get_pkey(PG_FUNCTION_ARGS) { - int16 numatts; - char **results; - FuncCallContext *funcctx; - int32 call_cntr; - int32 max_calls; - AttInMetadata *attinmeta; - MemoryContext oldcontext; - - /* stuff done only on the first call of the function */ - if (SRF_IS_FIRSTCALL()) - { - Relation rel; - TupleDesc tupdesc; - - /* create a function context for cross-call persistence */ - funcctx = SRF_FIRSTCALL_INIT(); - - /* - * switch to memory context appropriate for multiple function calls - */ - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - /* open target relation */ - rel = get_rel_from_relname(PG_GETARG_TEXT_PP(0), AccessShareLock, ACL_SELECT); - - /* get the array of attnums */ - results = get_pkey_attnames(rel, &numatts); - - relation_close(rel, AccessShareLock); - - /* - * need a tuple descriptor representing one INT and one TEXT column - */ - tupdesc = CreateTemplateTupleDesc(2, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "position", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "colname", - TEXTOID, -1, 0); - - /* - * Generate attribute metadata needed later to produce tuples from raw - * C strings - */ - attinmeta = TupleDescGetAttInMetadata(tupdesc); - funcctx->attinmeta = attinmeta; - - if ((results != NULL) && (numatts > 0)) - { - funcctx->max_calls = numatts; - - /* got results, keep track of them */ - funcctx->user_fctx = results; - } - else - { - /* fast track when no results */ - MemoryContextSwitchTo(oldcontext); - SRF_RETURN_DONE(funcctx); - } - - MemoryContextSwitchTo(oldcontext); - } - - /* stuff done 
on every call of the function */ - funcctx = SRF_PERCALL_SETUP(); - - /* - * initialize per-call variables - */ - call_cntr = funcctx->call_cntr; - max_calls = funcctx->max_calls; - - results = (char **) funcctx->user_fctx; - attinmeta = funcctx->attinmeta; - - if (call_cntr < max_calls) /* do when there is more left to send */ - { - char **values; - HeapTuple tuple; - Datum result; - - values = (char **) palloc(2 * sizeof(char *)); - values[0] = psprintf("%d", call_cntr + 1); - values[1] = results[call_cntr]; - - /* build the tuple */ - tuple = BuildTupleFromCStrings(attinmeta, values); - - /* make the tuple into a datum */ - result = HeapTupleGetDatum(tuple); - - SRF_RETURN_NEXT(funcctx, result); - } - else - { - /* do when there is no more left */ - SRF_RETURN_DONE(funcctx); - } + int16 numatts; + char **results; + FuncCallContext *funcctx; + int32 call_cntr; + int32 max_calls; + AttInMetadata *attinmeta; + MemoryContext oldcontext; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + Relation rel; + TupleDesc tupdesc; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* open target relation */ + rel = get_rel_from_relname(PG_GETARG_TEXT_PP(0), AccessShareLock, ACL_SELECT); + + /* get the array of attnums */ + results = get_pkey_attnames(rel, &numatts); + + relation_close(rel, AccessShareLock); + + /* + * need a tuple descriptor representing one INT and one TEXT column + */ + tupdesc = CreateTemplateTupleDesc(2, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "position", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "colname", + TEXTOID, -1, 0); + + /* + * Generate attribute metadata needed later to produce tuples from raw + * C strings + */ + attinmeta = TupleDescGetAttInMetadata(tupdesc); + funcctx->attinmeta = attinmeta; + + if ((results != NULL) && (numatts > 0)) + { + funcctx->max_calls = numatts; + + /* got results, keep track of them */ + funcctx->user_fctx = results; + } + else + { + /* fast track when no results */ + MemoryContextSwitchTo(oldcontext); + SRF_RETURN_DONE(funcctx); + } + + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + + /* + * initialize per-call variables + */ + call_cntr = funcctx->call_cntr; + max_calls = funcctx->max_calls; + + results = (char **) funcctx->user_fctx; + attinmeta = funcctx->attinmeta; + + if (call_cntr < max_calls) /* do when there is more left to send */ + { + char **values; + HeapTuple tuple; + Datum result; + + values = (char **) palloc(2 * sizeof(char *)); + values[0] = psprintf("%d", call_cntr + 1); + values[1] = results[call_cntr]; + + /* build the tuple */ + tuple = BuildTupleFromCStrings(attinmeta, values); + + /* make the tuple into a datum */ + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + { + /* do when there is no more left */ + SRF_RETURN_DONE(funcctx); + } } @@ -1615,75 +1841,75 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_insert); Datum dblink_build_sql_insert(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); - 
Relation rel; - int *pkattnums; - int pknumatts; - char **src_pkattvals; - char **tgt_pkattvals; - int src_nitems; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. - */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Source array is made up of key values that will be used to locate the - * tuple of interest from the local system. - */ - src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); - - /* - * There should be one source array key value for each key attnum - */ - if (src_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("source key array length must match number of key " \ - "attributes"))); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_insert(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); + Relation rel; + int *pkattnums; + int pknumatts; + char **src_pkattvals; + char **tgt_pkattvals; + int src_nitems; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Source array is made up of key values that will be used to locate the + * tuple of interest from the local system. + */ + src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); + + /* + * There should be one source array key value for each key attnum + */ + if (src_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("source key array length must match number of key " \ + "attributes"))); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_insert(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); + + /* + * Now we can close the relation. 
+ */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } @@ -1706,57 +1932,57 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_delete); Datum dblink_build_sql_delete(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - Relation rel; - int *pkattnums; - int pknumatts; - char **tgt_pkattvals; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. - */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_delete(rel, pkattnums, pknumatts, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + Relation rel; + int *pkattnums; + int pknumatts; + char **tgt_pkattvals; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_delete(rel, pkattnums, pknumatts, tgt_pkattvals); + + /* + * Now we can close the relation. + */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } @@ -1783,75 +2009,75 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_update); Datum dblink_build_sql_update(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); - Relation rel; - int *pkattnums; - int pknumatts; - char **src_pkattvals; - char **tgt_pkattvals; - int src_nitems; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. 
- */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Source array is made up of key values that will be used to locate the - * tuple of interest from the local system. - */ - src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); - - /* - * There should be one source array key value for each key attnum - */ - if (src_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("source key array length must match number of key " \ - "attributes"))); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_update(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); + Relation rel; + int *pkattnums; + int pknumatts; + char **src_pkattvals; + char **tgt_pkattvals; + int src_nitems; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Source array is made up of key values that will be used to locate the + * tuple of interest from the local system. + */ + src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); + + /* + * There should be one source array key value for each key attnum + */ + if (src_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("source key array length must match number of key " \ + "attributes"))); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_update(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); + + /* + * Now we can close the relation. 
+ */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } /* @@ -1864,8 +2090,8 @@ PG_FUNCTION_INFO_V1(dblink_current_query); Datum dblink_current_query(PG_FUNCTION_ARGS) { - /* This is now just an alias for the built-in function current_query() */ - PG_RETURN_DATUM(current_query(fcinfo)); + /* This is now just an alias for the built-in function current_query() */ + PG_RETURN_DATUM(current_query(fcinfo)); } /* @@ -1876,77 +2102,77 @@ dblink_current_query(PG_FUNCTION_ARGS) * connection per default. * */ -#define DBLINK_NOTIFY_COLS 3 +#define DBLINK_NOTIFY_COLS 3 PG_FUNCTION_INFO_V1(dblink_get_notify); Datum dblink_get_notify(PG_FUNCTION_ARGS) { - PGconn *conn; - PGnotify *notify; - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - TupleDesc tupdesc; - Tuplestorestate *tupstore; - MemoryContext per_query_ctx; - MemoryContext oldcontext; - - prepTuplestoreResult(fcinfo); - - dblink_init(); - if (PG_NARGS() == 1) - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - else - conn = pconn->conn; - - /* create the tuplestore in per-query memory */ - per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; - oldcontext = MemoryContextSwitchTo(per_query_ctx); - - tupdesc = CreateTemplateTupleDesc(DBLINK_NOTIFY_COLS, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "notify_name", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "be_pid", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "extra", - TEXTOID, -1, 0); - - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - - MemoryContextSwitchTo(oldcontext); - - PQconsumeInput(conn); - while ((notify = PQnotifies(conn)) != NULL) - { - Datum values[DBLINK_NOTIFY_COLS]; - bool nulls[DBLINK_NOTIFY_COLS]; - - memset(values, 0, sizeof(values)); - memset(nulls, 0, sizeof(nulls)); - - if (notify->relname != NULL) - values[0] = CStringGetTextDatum(notify->relname); - else - nulls[0] = true; - - values[1] = Int32GetDatum(notify->be_pid); - - if (notify->extra != NULL) - values[2] = CStringGetTextDatum(notify->extra); - else - nulls[2] = true; - - tuplestore_putvalues(tupstore, tupdesc, values, nulls); - - PQfreemem(notify); - PQconsumeInput(conn); - } - - /* clean up and return the tuplestore */ - tuplestore_donestoring(tupstore); - - return (Datum) 0; + PGconn *conn; + PGnotify *notify; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + prepTuplestoreResult(fcinfo); + + dblink_init(); + if (PG_NARGS() == 1) + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + else + conn = pconn->conn; + + /* create the tuplestore in per-query memory */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupdesc = CreateTemplateTupleDesc(DBLINK_NOTIFY_COLS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "notify_name", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "be_pid", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "extra", + TEXTOID, -1, 0); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + PQconsumeInput(conn); + while ((notify = PQnotifies(conn)) != NULL) + { + Datum values[DBLINK_NOTIFY_COLS]; + bool 
nulls[DBLINK_NOTIFY_COLS]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (notify->relname != NULL) + values[0] = CStringGetTextDatum(notify->relname); + else + nulls[0] = true; + + values[1] = Int32GetDatum(notify->be_pid); + + if (notify->extra != NULL) + values[2] = CStringGetTextDatum(notify->extra); + else + nulls[2] = true; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + PQfreemem(notify); + PQconsumeInput(conn); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; } /* @@ -1960,61 +2186,61 @@ PG_FUNCTION_INFO_V1(dblink_fdw_validator); Datum dblink_fdw_validator(PG_FUNCTION_ARGS) { - List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); - Oid context = PG_GETARG_OID(1); - ListCell *cell; - - static const PQconninfoOption *options = NULL; - - /* - * Get list of valid libpq options. - * - * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. - */ - if (!options) - { - options = PQconndefaults(); - if (!options) /* assume reason for failure is OOM */ - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"), - errdetail("could not get libpq's default connection options"))); - } - - /* Validate each supplied option. */ - foreach(cell, options_list) - { - DefElem *def = (DefElem *) lfirst(cell); - - if (!is_valid_dblink_option(options, def->defname, context)) - { - /* - * Unknown option, or invalid option for the context specified, so - * complain about it. Provide a hint with list of valid options - * for the context. - */ - StringInfoData buf; - const PQconninfoOption *opt; - - initStringInfo(&buf); - for (opt = options; opt->keyword; opt++) - { - if (is_valid_dblink_option(options, opt->keyword, context)) - appendStringInfo(&buf, "%s%s", - (buf.len > 0) ? ", " : "", - opt->keyword); - } - ereport(ERROR, - (errcode(ERRCODE_FDW_OPTION_NAME_NOT_FOUND), - errmsg("invalid option \"%s\"", def->defname), - errhint("Valid options in this context are: %s", - buf.data))); - } - } - - PG_RETURN_VOID(); + List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); + Oid context = PG_GETARG_OID(1); + ListCell *cell; + + static const PQconninfoOption *options = NULL; + + /* + * Get list of valid libpq options. + * + * To avoid unnecessary work, we get the list once and use it throughout + * the lifetime of this backend process. We don't need to care about + * memory context issues, because PQconndefaults allocates with malloc. + */ + if (!options) + { + options = PQconndefaults(); + if (!options) /* assume reason for failure is OOM */ + ereport(ERROR, + (errcode(ERRCODE_FDW_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("could not get libpq's default connection options"))); + } + + /* Validate each supplied option. */ + foreach(cell, options_list) + { + DefElem *def = (DefElem *) lfirst(cell); + + if (!is_valid_dblink_option(options, def->defname, context)) + { + /* + * Unknown option, or invalid option for the context specified, so + * complain about it. Provide a hint with list of valid options + * for the context. + */ + StringInfoData buf; + const PQconninfoOption *opt; + + initStringInfo(&buf); + for (opt = options; opt->keyword; opt++) + { + if (is_valid_dblink_option(options, opt->keyword, context)) + appendStringInfo(&buf, "%s%s", + (buf.len > 0) ? 
", " : "", + opt->keyword); + } + ereport(ERROR, + (errcode(ERRCODE_FDW_OPTION_NAME_NOT_FOUND), + errmsg("invalid option \"%s\"", def->defname), + errhint("Valid options in this context are: %s", + buf.data))); + } + } + + PG_RETURN_VOID(); } @@ -2032,52 +2258,52 @@ dblink_fdw_validator(PG_FUNCTION_ARGS) static char ** get_pkey_attnames(Relation rel, int16 *numatts) { - Relation indexRelation; - ScanKeyData skey; - SysScanDesc scan; - HeapTuple indexTuple; - int i; - char **result = NULL; - TupleDesc tupdesc; - - /* initialize numatts to 0 in case no primary key exists */ - *numatts = 0; - - tupdesc = rel->rd_att; - - /* Prepare to scan pg_index for entries having indrelid = this rel. */ - indexRelation = heap_open(IndexRelationId, AccessShareLock); - ScanKeyInit(&skey, - Anum_pg_index_indrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationGetRelid(rel))); - - scan = systable_beginscan(indexRelation, IndexIndrelidIndexId, true, - NULL, 1, &skey); - - while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) - { - Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); - - /* we're only interested if it is the primary key */ - if (index->indisprimary) - { - *numatts = index->indnatts; - if (*numatts > 0) - { - result = (char **) palloc(*numatts * sizeof(char *)); - - for (i = 0; i < *numatts; i++) - result[i] = SPI_fname(tupdesc, index->indkey.values[i]); - } - break; - } - } - - systable_endscan(scan); - heap_close(indexRelation, AccessShareLock); - - return result; + Relation indexRelation; + ScanKeyData skey; + SysScanDesc scan; + HeapTuple indexTuple; + int i; + char **result = NULL; + TupleDesc tupdesc; + + /* initialize numatts to 0 in case no primary key exists */ + *numatts = 0; + + tupdesc = rel->rd_att; + + /* Prepare to scan pg_index for entries having indrelid = this rel. 
*/ + indexRelation = heap_open(IndexRelationId, AccessShareLock); + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(rel))); + + scan = systable_beginscan(indexRelation, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + + /* we're only interested if it is the primary key */ + if (index->indisprimary) + { + *numatts = index->indnatts; + if (*numatts > 0) + { + result = (char **) palloc(*numatts * sizeof(char *)); + + for (i = 0; i < *numatts; i++) + result[i] = SPI_fname(tupdesc, index->indkey.values[i]); + } + break; + } + } + + systable_endscan(scan); + heap_close(indexRelation, AccessShareLock); + + return result; } /* @@ -2087,255 +2313,255 @@ get_pkey_attnames(Relation rel, int16 *numatts) static char ** get_text_array_contents(ArrayType *array, int *numitems) { - int ndim = ARR_NDIM(array); - int *dims = ARR_DIMS(array); - int nitems; - int16 typlen; - bool typbyval; - char typalign; - char **values; - char *ptr; - bits8 *bitmap; - int bitmask; - int i; - - Assert(ARR_ELEMTYPE(array) == TEXTOID); - - *numitems = nitems = ArrayGetNItems(ndim, dims); - - get_typlenbyvalalign(ARR_ELEMTYPE(array), - &typlen, &typbyval, &typalign); - - values = (char **) palloc(nitems * sizeof(char *)); - - ptr = ARR_DATA_PTR(array); - bitmap = ARR_NULLBITMAP(array); - bitmask = 1; - - for (i = 0; i < nitems; i++) - { - if (bitmap && (*bitmap & bitmask) == 0) - { - values[i] = NULL; - } - else - { - values[i] = TextDatumGetCString(PointerGetDatum(ptr)); - ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); - } - - /* advance bitmap pointer if any */ - if (bitmap) - { - bitmask <<= 1; - if (bitmask == 0x100) - { - bitmap++; - bitmask = 1; - } - } - } - - return values; + int ndim = ARR_NDIM(array); + int *dims = ARR_DIMS(array); + int nitems; + int16 typlen; + bool typbyval; + char typalign; + char **values; + char *ptr; + bits8 *bitmap; + int bitmask; + int i; + + Assert(ARR_ELEMTYPE(array) == TEXTOID); + + *numitems = nitems = ArrayGetNItems(ndim, dims); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &typlen, &typbyval, &typalign); + + values = (char **) palloc(nitems * sizeof(char *)); + + ptr = ARR_DATA_PTR(array); + bitmap = ARR_NULLBITMAP(array); + bitmask = 1; + + for (i = 0; i < nitems; i++) + { + if (bitmap && (*bitmap & bitmask) == 0) + { + values[i] = NULL; + } + else + { + values[i] = TextDatumGetCString(PointerGetDatum(ptr)); + ptr = att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_align_nominal(ptr, typalign); + } + + /* advance bitmap pointer if any */ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + return values; } static char * get_sql_insert(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals) { - char *relname; - HeapTuple tuple; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - char *val; - int key; - int i; - bool needComma; - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); - if (!tuple) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source row not found"))); - - appendStringInfo(&buf, "INSERT 
INTO %s(", relname); - - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoChar(&buf, ','); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - needComma = true; - } - - appendStringInfoString(&buf, ") VALUES("); - - /* - * Note: i is physical column number (counting from 0). - */ - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoChar(&buf, ','); - - key = get_attnum_pk_pos(pkattnums, pknumatts, i); - - if (key >= 0) - val = tgt_pkattvals[key] ? pstrdup(tgt_pkattvals[key]) : NULL; - else - val = SPI_getvalue(tuple, tupdesc, i + 1); - - if (val != NULL) - { - appendStringInfoString(&buf, quote_literal_cstr(val)); - pfree(val); - } - else - appendStringInfoString(&buf, "NULL"); - needComma = true; - } - appendStringInfoChar(&buf, ')'); - - return (buf.data); + char *relname; + HeapTuple tuple; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + char *val; + int key; + int i; + bool needComma; + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); + if (!tuple) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source row not found"))); + + appendStringInfo(&buf, "INSERT INTO %s(", relname); + + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoChar(&buf, ','); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + needComma = true; + } + + appendStringInfoString(&buf, ") VALUES("); + + /* + * Note: i is physical column number (counting from 0). + */ + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoChar(&buf, ','); + + key = get_attnum_pk_pos(pkattnums, pknumatts, i); + + if (key >= 0) + val = tgt_pkattvals[key] ? 
pstrdup(tgt_pkattvals[key]) : NULL; + else + val = SPI_getvalue(tuple, tupdesc, i + 1); + + if (val != NULL) + { + appendStringInfoString(&buf, quote_literal_cstr(val)); + pfree(val); + } + else + appendStringInfoString(&buf, "NULL"); + needComma = true; + } + appendStringInfoChar(&buf, ')'); + + return (buf.data); } static char * get_sql_delete(Relation rel, int *pkattnums, int pknumatts, char **tgt_pkattvals) { - char *relname; - TupleDesc tupdesc; - StringInfoData buf; - int i; + char *relname; + TupleDesc tupdesc; + StringInfoData buf; + int i; - initStringInfo(&buf); + initStringInfo(&buf); - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); - tupdesc = rel->rd_att; + tupdesc = rel->rd_att; - appendStringInfo(&buf, "DELETE FROM %s WHERE ", relname); - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; + appendStringInfo(&buf, "DELETE FROM %s WHERE ", relname); + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; - if (i > 0) - appendStringInfoString(&buf, " AND "); + if (i > 0) + appendStringInfoString(&buf, " AND "); - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - if (tgt_pkattvals[i] != NULL) - appendStringInfo(&buf, " = %s", - quote_literal_cstr(tgt_pkattvals[i])); - else - appendStringInfoString(&buf, " IS NULL"); - } + if (tgt_pkattvals[i] != NULL) + appendStringInfo(&buf, " = %s", + quote_literal_cstr(tgt_pkattvals[i])); + else + appendStringInfoString(&buf, " IS NULL"); + } - return (buf.data); + return (buf.data); } static char * get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals) { - char *relname; - HeapTuple tuple; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - char *val; - int key; - int i; - bool needComma; - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); - if (!tuple) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source row not found"))); - - appendStringInfo(&buf, "UPDATE %s SET ", relname); - - /* - * Note: i is physical column number (counting from 0). - */ - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoString(&buf, ", "); - - appendStringInfo(&buf, "%s = ", - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - - key = get_attnum_pk_pos(pkattnums, pknumatts, i); - - if (key >= 0) - val = tgt_pkattvals[key] ? 
pstrdup(tgt_pkattvals[key]) : NULL; - else - val = SPI_getvalue(tuple, tupdesc, i + 1); - - if (val != NULL) - { - appendStringInfoString(&buf, quote_literal_cstr(val)); - pfree(val); - } - else - appendStringInfoString(&buf, "NULL"); - needComma = true; - } - - appendStringInfoString(&buf, " WHERE "); - - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; - - if (i > 0) - appendStringInfoString(&buf, " AND "); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - - val = tgt_pkattvals[i]; - - if (val != NULL) - appendStringInfo(&buf, " = %s", quote_literal_cstr(val)); - else - appendStringInfoString(&buf, " IS NULL"); - } - - return (buf.data); + char *relname; + HeapTuple tuple; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + char *val; + int key; + int i; + bool needComma; + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); + if (!tuple) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source row not found"))); + + appendStringInfo(&buf, "UPDATE %s SET ", relname); + + /* + * Note: i is physical column number (counting from 0). + */ + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoString(&buf, ", "); + + appendStringInfo(&buf, "%s = ", + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + + key = get_attnum_pk_pos(pkattnums, pknumatts, i); + + if (key >= 0) + val = tgt_pkattvals[key] ? pstrdup(tgt_pkattvals[key]) : NULL; + else + val = SPI_getvalue(tuple, tupdesc, i + 1); + + if (val != NULL) + { + appendStringInfoString(&buf, quote_literal_cstr(val)); + pfree(val); + } + else + appendStringInfoString(&buf, "NULL"); + needComma = true; + } + + appendStringInfoString(&buf, " WHERE "); + + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; + + if (i > 0) + appendStringInfoString(&buf, " AND "); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + + val = tgt_pkattvals[i]; + + if (val != NULL) + appendStringInfo(&buf, " = %s", quote_literal_cstr(val)); + else + appendStringInfoString(&buf, " IS NULL"); + } + + return (buf.data); } /* @@ -2345,136 +2571,136 @@ get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals static char * quote_ident_cstr(char *rawstr) { - text *rawstr_text; - text *result_text; - char *result; + text *rawstr_text; + text *result_text; + char *result; - rawstr_text = cstring_to_text(rawstr); - result_text = DatumGetTextPP(DirectFunctionCall1(quote_ident, - PointerGetDatum(rawstr_text))); - result = text_to_cstring(result_text); + rawstr_text = cstring_to_text(rawstr); + result_text = DatumGetTextPP(DirectFunctionCall1(quote_ident, + PointerGetDatum(rawstr_text))); + result = text_to_cstring(result_text); - return result; + return result; } static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key) { - int i; + int i; - /* - * Not likely a long list anyway, so just scan for the value - */ - for (i = 0; i < pknumatts; i++) - if (key == pkattnums[i]) - return i; + /* + * Not likely a long list anyway, so just scan for the value + */ + for (i = 0; i < pknumatts; i++) + if (key == pkattnums[i]) + return i; - return -1; + return -1; } static HeapTuple 
get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals) { - char *relname; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - int ret; - HeapTuple tuple; - int i; - - /* - * Connect to SPI manager - */ - if ((ret = SPI_connect()) < 0) - /* internal error */ - elog(ERROR, "SPI connect failure - returned %d", ret); - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - /* - * Build sql statement to look up tuple of interest, ie, the one matching - * src_pkattvals. We used to use "SELECT *" here, but it's simpler to - * generate a result tuple that matches the table's physical structure, - * with NULLs for any dropped columns. Otherwise we have to deal with two - * different tupdescs and everything's very confusing. - */ - appendStringInfoString(&buf, "SELECT "); - - for (i = 0; i < natts; i++) - { - if (i > 0) - appendStringInfoString(&buf, ", "); - - if (tupdesc->attrs[i]->attisdropped) - appendStringInfoString(&buf, "NULL"); - else - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - } - - appendStringInfo(&buf, " FROM %s WHERE ", relname); - - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; - - if (i > 0) - appendStringInfoString(&buf, " AND "); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - - if (src_pkattvals[i] != NULL) - appendStringInfo(&buf, " = %s", - quote_literal_cstr(src_pkattvals[i])); - else - appendStringInfoString(&buf, " IS NULL"); - } - - /* - * Retrieve the desired tuple - */ - ret = SPI_exec(buf.data, 0); - pfree(buf.data); - - /* - * Only allow one qualifying tuple - */ - if ((ret == SPI_OK_SELECT) && (SPI_processed > 1)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source criteria matched more than one record"))); - - else if (ret == SPI_OK_SELECT && SPI_processed == 1) - { - SPITupleTable *tuptable = SPI_tuptable; - - tuple = SPI_copytuple(tuptable->vals[0]); - SPI_finish(); - - return tuple; - } - else - { - /* - * no qualifying tuples - */ - SPI_finish(); - - return NULL; - } - - /* - * never reached, but keep compiler quiet - */ - return NULL; + char *relname; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + int ret; + HeapTuple tuple; + int i; + + /* + * Connect to SPI manager + */ + if ((ret = SPI_connect()) < 0) + /* internal error */ + elog(ERROR, "SPI connect failure - returned %d", ret); + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + /* + * Build sql statement to look up tuple of interest, ie, the one matching + * src_pkattvals. We used to use "SELECT *" here, but it's simpler to + * generate a result tuple that matches the table's physical structure, + * with NULLs for any dropped columns. Otherwise we have to deal with two + * different tupdescs and everything's very confusing. 
+ */ + appendStringInfoString(&buf, "SELECT "); + + for (i = 0; i < natts; i++) + { + if (i > 0) + appendStringInfoString(&buf, ", "); + + if (tupdesc->attrs[i]->attisdropped) + appendStringInfoString(&buf, "NULL"); + else + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + } + + appendStringInfo(&buf, " FROM %s WHERE ", relname); + + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; + + if (i > 0) + appendStringInfoString(&buf, " AND "); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + + if (src_pkattvals[i] != NULL) + appendStringInfo(&buf, " = %s", + quote_literal_cstr(src_pkattvals[i])); + else + appendStringInfoString(&buf, " IS NULL"); + } + + /* + * Retrieve the desired tuple + */ + ret = SPI_exec(buf.data, 0); + pfree(buf.data); + + /* + * Only allow one qualifying tuple + */ + if ((ret == SPI_OK_SELECT) && (SPI_processed > 1)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source criteria matched more than one record"))); + + else if (ret == SPI_OK_SELECT && SPI_processed == 1) + { + SPITupleTable *tuptable = SPI_tuptable; + + tuple = SPI_copytuple(tuptable->vals[0]); + SPI_finish(); + + return tuple; + } + else + { + /* + * no qualifying tuples + */ + SPI_finish(); + + return NULL; + } + + /* + * never reached, but keep compiler quiet + */ + return NULL; } /* @@ -2485,146 +2711,146 @@ get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pk static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclMode aclmode) { - RangeVar *relvar; - Relation rel; - AclResult aclresult; + RangeVar *relvar; + Relation rel; + AclResult aclresult; - relvar = makeRangeVarFromNameList(textToQualifiedNameList(relname_text)); - rel = heap_openrv(relvar, lockmode); + relvar = makeRangeVarFromNameList(textToQualifiedNameList(relname_text)); + rel = heap_openrv(relvar, lockmode); - aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), - aclmode); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_CLASS, - RelationGetRelationName(rel)); + aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), + aclmode); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_CLASS, + RelationGetRelationName(rel)); - return rel; + return rel; } /* * generate_relation_name - copied from ruleutils.c - * Compute the name to display for a relation + * Compute the name to display for a relation * * The result includes all necessary quoting and schema-prefixing. 
*/ static char * generate_relation_name(Relation rel) { - char *nspname; - char *result; + char *nspname; + char *result; - /* Qualify the name if not visible in search path */ - if (RelationIsVisible(RelationGetRelid(rel))) - nspname = NULL; - else - nspname = get_namespace_name(rel->rd_rel->relnamespace); + /* Qualify the name if not visible in search path */ + if (RelationIsVisible(RelationGetRelid(rel))) + nspname = NULL; + else + nspname = get_namespace_name(rel->rd_rel->relnamespace); - result = quote_qualified_identifier(nspname, RelationGetRelationName(rel)); + result = quote_qualified_identifier(nspname, RelationGetRelationName(rel)); - return result; + return result; } static remoteConn * getConnectionByName(const char *name) { - remoteConnHashEnt *hentry; - char *key; + remoteConnHashEnt *hentry; + char *key; - if (!remoteConnHash) - remoteConnHash = createConnHash(); + if (!remoteConnHash) + remoteConnHash = createConnHash(); - key = pstrdup(name); - truncate_identifier(key, strlen(key), false); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, - key, HASH_FIND, NULL); + key = pstrdup(name); + truncate_identifier(key, strlen(key), false); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, + key, HASH_FIND, NULL); - if (hentry) - return (hentry->rconn); + if (hentry) + return (hentry->rconn); - return (NULL); + return (NULL); } static HTAB * createConnHash(void) { - HASHCTL ctl; + HASHCTL ctl; - ctl.keysize = NAMEDATALEN; - ctl.entrysize = sizeof(remoteConnHashEnt); + ctl.keysize = NAMEDATALEN; + ctl.entrysize = sizeof(remoteConnHashEnt); - return hash_create("Remote Con hash", NUMCONN, &ctl, HASH_ELEM); + return hash_create("Remote Con hash", NUMCONN, &ctl, HASH_ELEM); } static void createNewConnection(const char *name, remoteConn *rconn) { - remoteConnHashEnt *hentry; - bool found; - char *key; - - if (!remoteConnHash) - remoteConnHash = createConnHash(); - - key = pstrdup(name); - truncate_identifier(key, strlen(key), true); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, - HASH_ENTER, &found); - - if (found) - { - PQfinish(rconn->conn); - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("duplicate connection name"))); - } - - hentry->rconn = rconn; - strlcpy(hentry->name, name, sizeof(hentry->name)); + remoteConnHashEnt *hentry; + bool found; + char *key; + + if (!remoteConnHash) + remoteConnHash = createConnHash(); + + key = pstrdup(name); + truncate_identifier(key, strlen(key), true); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, + HASH_ENTER, &found); + + if (found) + { + PQfinish(rconn->conn); + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("duplicate connection name"))); + } + + hentry->rconn = rconn; + strlcpy(hentry->name, name, sizeof(hentry->name)); } static void deleteConnection(const char *name) { - remoteConnHashEnt *hentry; - bool found; - char *key; + remoteConnHashEnt *hentry; + bool found; + char *key; - if (!remoteConnHash) - remoteConnHash = createConnHash(); + if (!remoteConnHash) + remoteConnHash = createConnHash(); - key = pstrdup(name); - truncate_identifier(key, strlen(key), false); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, - key, HASH_REMOVE, &found); + key = pstrdup(name); + truncate_identifier(key, strlen(key), false); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, + key, HASH_REMOVE, &found); - if (!hentry) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("undefined connection 
name"))); + if (!hentry) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("undefined connection name"))); } static void dblink_security_check(PGconn *conn, remoteConn *rconn) { - if (!superuser()) - { - if (!PQconnectionUsedPassword(conn)) - { - PQfinish(conn); - if (rconn) - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("password is required"), - errdetail("Non-superuser cannot connect if the server does not request a password."), - errhint("Target server's authentication method must be changed."))); - } - } + if (!superuser()) + { + if (!PQconnectionUsedPassword(conn)) + { + PQfinish(conn); + if (rconn) + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superuser cannot connect if the server does not request a password."), + errhint("Target server's authentication method must be changed."))); + } + } } /* @@ -2636,96 +2862,96 @@ dblink_security_check(PGconn *conn, remoteConn *rconn) static void dblink_connstr_check(const char *connstr) { - if (!superuser()) - { - PQconninfoOption *options; - PQconninfoOption *option; - bool connstr_gives_password = false; - - options = PQconninfoParse(connstr, NULL); - if (options) - { - for (option = options; option->keyword != NULL; option++) - { - if (strcmp(option->keyword, "password") == 0) - { - if (option->val != NULL && option->val[0] != '\0') - { - connstr_gives_password = true; - break; - } - } - } - PQconninfoFree(options); - } - - if (!connstr_gives_password) - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("password is required"), - errdetail("Non-superusers must provide a password in the connection string."))); - } + if (!superuser()) + { + PQconninfoOption *options; + PQconninfoOption *option; + bool connstr_gives_password = false; + + options = PQconninfoParse(connstr, NULL); + if (options) + { + for (option = options; option->keyword != NULL; option++) + { + if (strcmp(option->keyword, "password") == 0) + { + if (option->val != NULL && option->val[0] != '\0') + { + connstr_gives_password = true; + break; + } + } + } + PQconninfoFree(options); + } + + if (!connstr_gives_password) + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superusers must provide a password in the connection string."))); + } } static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, - const char *dblink_context_msg, bool fail) + const char *dblink_context_msg, bool fail) { - int level; - char *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - char *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); - char *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); - char *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); - char *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT); - int sqlstate; - char *message_primary; - char *message_detail; - char *message_hint; - char *message_context; - const char *dblink_context_conname = "unnamed"; - - if (fail) - level = ERROR; - else - level = NOTICE; - - if (pg_diag_sqlstate) - sqlstate = MAKE_SQLSTATE(pg_diag_sqlstate[0], - pg_diag_sqlstate[1], - pg_diag_sqlstate[2], - pg_diag_sqlstate[3], - pg_diag_sqlstate[4]); - else - sqlstate = ERRCODE_CONNECTION_FAILURE; - - message_primary = xpstrdup(pg_diag_message_primary); - message_detail = 
xpstrdup(pg_diag_message_detail); - message_hint = xpstrdup(pg_diag_message_hint); - message_context = xpstrdup(pg_diag_context); - - /* - * If we don't get a message from the PGresult, try the PGconn. This is - * needed because for connection-level failures, PQexec may just return - * NULL, not a PGresult at all. - */ - if (message_primary == NULL) - message_primary = pchomp(PQerrorMessage(conn)); - - if (res) - PQclear(res); - - if (conname) - dblink_context_conname = conname; - - ereport(level, - (errcode(sqlstate), - message_primary ? errmsg_internal("%s", message_primary) : - errmsg("could not obtain message string for remote error"), - message_detail ? errdetail_internal("%s", message_detail) : 0, - message_hint ? errhint("%s", message_hint) : 0, - message_context ? errcontext("%s", message_context) : 0, - errcontext("Error occurred on dblink connection named \"%s\": %s.", - dblink_context_conname, dblink_context_msg))); + int level; + char *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); + char *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); + char *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); + char *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); + char *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT); + int sqlstate; + char *message_primary; + char *message_detail; + char *message_hint; + char *message_context; + const char *dblink_context_conname = "unnamed"; + + if (fail) + level = ERROR; + else + level = NOTICE; + + if (pg_diag_sqlstate) + sqlstate = MAKE_SQLSTATE(pg_diag_sqlstate[0], + pg_diag_sqlstate[1], + pg_diag_sqlstate[2], + pg_diag_sqlstate[3], + pg_diag_sqlstate[4]); + else + sqlstate = ERRCODE_CONNECTION_FAILURE; + + message_primary = xpstrdup(pg_diag_message_primary); + message_detail = xpstrdup(pg_diag_message_detail); + message_hint = xpstrdup(pg_diag_message_hint); + message_context = xpstrdup(pg_diag_context); + + /* + * If we don't get a message from the PGresult, try the PGconn. This is + * needed because for connection-level failures, PQexec may just return + * NULL, not a PGresult at all. + */ + if (message_primary == NULL) + message_primary = pchomp(PQerrorMessage(conn)); + + if (res) + PQclear(res); + + if (conname) + dblink_context_conname = conname; + + ereport(level, + (errcode(sqlstate), + message_primary ? errmsg_internal("%s", message_primary) : + errmsg("could not obtain message string for remote error"), + message_detail ? errdetail_internal("%s", message_detail) : 0, + message_hint ? errhint("%s", message_hint) : 0, + message_context ? errcontext("%s", message_context) : 0, + errcontext("Error occurred on dblink connection named \"%s\": %s.", + dblink_context_conname, dblink_context_msg))); } /* @@ -2734,86 +2960,86 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, static char * get_connect_string(const char *servername) { - ForeignServer *foreign_server = NULL; - UserMapping *user_mapping; - ListCell *cell; - StringInfoData buf; - ForeignDataWrapper *fdw; - AclResult aclresult; - char *srvname; - - static const PQconninfoOption *options = NULL; - - initStringInfo(&buf); - - /* - * Get list of valid libpq options. - * - * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. 
- */ - if (!options) - { - options = PQconndefaults(); - if (!options) /* assume reason for failure is OOM */ - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"), - errdetail("could not get libpq's default connection options"))); - } - - /* first gather the server connstr options */ - srvname = pstrdup(servername); - truncate_identifier(srvname, strlen(srvname), false); - foreign_server = GetForeignServerByName(srvname, true); - - if (foreign_server) - { - Oid serverid = foreign_server->serverid; - Oid fdwid = foreign_server->fdwid; - Oid userid = GetUserId(); - - user_mapping = GetUserMapping(userid, serverid); - fdw = GetForeignDataWrapper(fdwid); - - /* Check permissions, user must have usage on the server. */ - aclresult = pg_foreign_server_aclcheck(serverid, userid, ACL_USAGE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_FOREIGN_SERVER, foreign_server->servername); - - foreach(cell, fdw->options) - { - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, ForeignDataWrapperRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - foreach(cell, foreign_server->options) - { - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, ForeignServerRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - foreach(cell, user_mapping->options) - { - - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, UserMappingRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - return buf.data; - } - else - return NULL; + ForeignServer *foreign_server = NULL; + UserMapping *user_mapping; + ListCell *cell; + StringInfoData buf; + ForeignDataWrapper *fdw; + AclResult aclresult; + char *srvname; + + static const PQconninfoOption *options = NULL; + + initStringInfo(&buf); + + /* + * Get list of valid libpq options. + * + * To avoid unnecessary work, we get the list once and use it throughout + * the lifetime of this backend process. We don't need to care about + * memory context issues, because PQconndefaults allocates with malloc. + */ + if (!options) + { + options = PQconndefaults(); + if (!options) /* assume reason for failure is OOM */ + ereport(ERROR, + (errcode(ERRCODE_FDW_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("could not get libpq's default connection options"))); + } + + /* first gather the server connstr options */ + srvname = pstrdup(servername); + truncate_identifier(srvname, strlen(srvname), false); + foreign_server = GetForeignServerByName(srvname, true); + + if (foreign_server) + { + Oid serverid = foreign_server->serverid; + Oid fdwid = foreign_server->fdwid; + Oid userid = GetUserId(); + + user_mapping = GetUserMapping(userid, serverid); + fdw = GetForeignDataWrapper(fdwid); + + /* Check permissions, user must have usage on the server. 
*/ + aclresult = pg_foreign_server_aclcheck(serverid, userid, ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_FOREIGN_SERVER, foreign_server->servername); + + foreach(cell, fdw->options) + { + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, ForeignDataWrapperRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + foreach(cell, foreign_server->options) + { + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, ForeignServerRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + foreach(cell, user_mapping->options) + { + + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, UserMappingRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + return buf.data; + } + else + return NULL; } /* @@ -2824,19 +3050,19 @@ get_connect_string(const char *servername) static char * escape_param_str(const char *str) { - const char *cp; - StringInfoData buf; + const char *cp; + StringInfoData buf; - initStringInfo(&buf); + initStringInfo(&buf); - for (cp = str; *cp; cp++) - { - if (*cp == '\\' || *cp == '\'') - appendStringInfoChar(&buf, '\\'); - appendStringInfoChar(&buf, *cp); - } + for (cp = str; *cp; cp++) + { + if (*cp == '\\' || *cp == '\'') + appendStringInfoChar(&buf, '\\'); + appendStringInfoChar(&buf, *cp); + } - return buf.data; + return buf.data; } /* @@ -2856,58 +3082,58 @@ escape_param_str(const char *str) */ static void validate_pkattnums(Relation rel, - int2vector *pkattnums_arg, int32 pknumatts_arg, - int **pkattnums, int *pknumatts) + int2vector *pkattnums_arg, int32 pknumatts_arg, + int **pkattnums, int *pknumatts) { - TupleDesc tupdesc = rel->rd_att; - int natts = tupdesc->natts; - int i; - - /* Don't take more array elements than there are */ - pknumatts_arg = Min(pknumatts_arg, pkattnums_arg->dim1); - - /* Must have at least one pk attnum selected */ - if (pknumatts_arg <= 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("number of key attributes must be > 0"))); - - /* Allocate output array */ - *pkattnums = (int *) palloc(pknumatts_arg * sizeof(int)); - *pknumatts = pknumatts_arg; - - /* Validate attnums and convert to internal form */ - for (i = 0; i < pknumatts_arg; i++) - { - int pkattnum = pkattnums_arg->values[i]; - int lnum; - int j; - - /* Can throw error immediately if out of range */ - if (pkattnum <= 0 || pkattnum > natts) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid attribute number %d", pkattnum))); - - /* Identify which physical column has this logical number */ - lnum = 0; - for (j = 0; j < natts; j++) - { - /* dropped columns don't count */ - if (tupdesc->attrs[j]->attisdropped) - continue; - - if (++lnum == pkattnum) - break; - } - - if (j < natts) - (*pkattnums)[i] = j; - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid attribute number %d", pkattnum))); - } + TupleDesc tupdesc = rel->rd_att; + int natts = tupdesc->natts; + int i; + + /* Don't take more array elements than there are */ + pknumatts_arg = Min(pknumatts_arg, pkattnums_arg->dim1); + + /* Must have at least one pk attnum selected */ + if (pknumatts_arg <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of key attributes must be > 0"))); + + /* Allocate output array */ + *pkattnums = (int *) 
palloc(pknumatts_arg * sizeof(int)); + *pknumatts = pknumatts_arg; + + /* Validate attnums and convert to internal form */ + for (i = 0; i < pknumatts_arg; i++) + { + int pkattnum = pkattnums_arg->values[i]; + int lnum; + int j; + + /* Can throw error immediately if out of range */ + if (pkattnum <= 0 || pkattnum > natts) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid attribute number %d", pkattnum))); + + /* Identify which physical column has this logical number */ + lnum = 0; + for (j = 0; j < natts; j++) + { + /* dropped columns don't count */ + if (tupdesc->attrs[j]->attisdropped) + continue; + + if (++lnum == pkattnum) + break; + } + + if (j < natts) + (*pkattnums)[i] = j; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid attribute number %d", pkattnum))); + } } /* @@ -2915,11 +3141,11 @@ validate_pkattnums(Relation rel, * * We basically allow whatever libpq thinks is an option, with these * restrictions: - * debug options: disallowed - * "client_encoding": disallowed - * "user": valid only in USER MAPPING options - * secure options (eg password): valid only in USER MAPPING options - * others: valid only in FOREIGN SERVER options + * debug options: disallowed + * "client_encoding": disallowed + * "user": valid only in USER MAPPING options + * secure options (eg password): valid only in USER MAPPING options + * others: valid only in FOREIGN SERVER options * * We disallow client_encoding because it would be overridden anyway via * PQclientEncoding; allowing it to be specified would merely promote @@ -2927,43 +3153,43 @@ validate_pkattnums(Relation rel, */ static bool is_valid_dblink_option(const PQconninfoOption *options, const char *option, - Oid context) + Oid context) { - const PQconninfoOption *opt; - - /* Look up the option in libpq result */ - for (opt = options; opt->keyword; opt++) - { - if (strcmp(opt->keyword, option) == 0) - break; - } - if (opt->keyword == NULL) - return false; - - /* Disallow debug options (particularly "replication") */ - if (strchr(opt->dispchar, 'D')) - return false; - - /* Disallow "client_encoding" */ - if (strcmp(opt->keyword, "client_encoding") == 0) - return false; - - /* - * If the option is "user" or marked secure, it should be specified only - * in USER MAPPING. Others should be specified only in SERVER. - */ - if (strcmp(opt->keyword, "user") == 0 || strchr(opt->dispchar, '*')) - { - if (context != UserMappingRelationId) - return false; - } - else - { - if (context != ForeignServerRelationId) - return false; - } - - return true; + const PQconninfoOption *opt; + + /* Look up the option in libpq result */ + for (opt = options; opt->keyword; opt++) + { + if (strcmp(opt->keyword, option) == 0) + break; + } + if (opt->keyword == NULL) + return false; + + /* Disallow debug options (particularly "replication") */ + if (strchr(opt->dispchar, 'D')) + return false; + + /* Disallow "client_encoding" */ + if (strcmp(opt->keyword, "client_encoding") == 0) + return false; + + /* + * If the option is "user" or marked secure, it should be specified only + * in USER MAPPING. Others should be specified only in SERVER. 
+ */ + if (strcmp(opt->keyword, "user") == 0 || strchr(opt->dispchar, '*')) + { + if (context != UserMappingRelationId) + return false; + } + else + { + if (context != ForeignServerRelationId) + return false; + } + + return true; } /* @@ -2979,50 +3205,50 @@ is_valid_dblink_option(const PQconninfoOption *options, const char *option, static int applyRemoteGucs(PGconn *conn) { - static const char *const GUCsAffectingIO[] = { - "DateStyle", - "IntervalStyle" - }; - - int nestlevel = -1; - int i; - - for (i = 0; i < lengthof(GUCsAffectingIO); i++) - { - const char *gucName = GUCsAffectingIO[i]; - const char *remoteVal = PQparameterStatus(conn, gucName); - const char *localVal; - - /* - * If the remote server is pre-8.4, it won't have IntervalStyle, but - * that's okay because its output format won't be ambiguous. So just - * skip the GUC if we don't get a value for it. (We might eventually - * need more complicated logic with remote-version checks here.) - */ - if (remoteVal == NULL) - continue; - - /* - * Avoid GUC-setting overhead if the remote and local GUCs already - * have the same value. - */ - localVal = GetConfigOption(gucName, false, false); - Assert(localVal != NULL); - - if (strcmp(remoteVal, localVal) == 0) - continue; - - /* Create new GUC nest level if we didn't already */ - if (nestlevel < 0) - nestlevel = NewGUCNestLevel(); - - /* Apply the option (this will throw error on failure) */ - (void) set_config_option(gucName, remoteVal, - PGC_USERSET, PGC_S_SESSION, - GUC_ACTION_SAVE, true, 0, false); - } - - return nestlevel; + static const char *const GUCsAffectingIO[] = { + "DateStyle", + "IntervalStyle" + }; + + int nestlevel = -1; + int i; + + for (i = 0; i < lengthof(GUCsAffectingIO); i++) + { + const char *gucName = GUCsAffectingIO[i]; + const char *remoteVal = PQparameterStatus(conn, gucName); + const char *localVal; + + /* + * If the remote server is pre-8.4, it won't have IntervalStyle, but + * that's okay because its output format won't be ambiguous. So just + * skip the GUC if we don't get a value for it. (We might eventually + * need more complicated logic with remote-version checks here.) + */ + if (remoteVal == NULL) + continue; + + /* + * Avoid GUC-setting overhead if the remote and local GUCs already + * have the same value. 
+ */ + localVal = GetConfigOption(gucName, false, false); + Assert(localVal != NULL); + + if (strcmp(remoteVal, localVal) == 0) + continue; + + /* Create new GUC nest level if we didn't already */ + if (nestlevel < 0) + nestlevel = NewGUCNestLevel(); + + /* Apply the option (this will throw error on failure) */ + (void) set_config_option(gucName, remoteVal, + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + } + + return nestlevel; } /* @@ -3031,7 +3257,7 @@ applyRemoteGucs(PGconn *conn) static void restoreLocalGucs(int nestlevel) { - /* Do nothing if no new nestlevel was created */ - if (nestlevel > 0) - AtEOXact_GUC(true, nestlevel); + /* Do nothing if no new nestlevel was created */ + if (nestlevel > 0) + AtEOXact_GUC(true, nestlevel); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index cf727d6f..73736d71 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -953,35 +953,31 @@ ProcessUtilityPre(PlannedStmt *pstmt, break; case T_CreateFdwStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support FOREIGN DATA WRAPPER yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterFdwStmt: + exec_type = EXEC_ON_ALL_NODES; break; case T_CreateForeignServerStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support SERVER yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterForeignServerStmt: + exec_type = EXEC_ON_ALL_NODES; break; case T_CreateUserMappingStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support USER MAPPING yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterUserMappingStmt: case T_DropUserMappingStmt: + exec_type = EXEC_ON_ALL_NODES; + break; case T_ImportForeignSchemaStmt: + break; case T_CompositeTypeStmt: /* CREATE TYPE (composite) */ case T_CreateEnumStmt: /* CREATE TYPE AS ENUM */ case T_CreateRangeStmt: /* CREATE TYPE AS RANGE */ From 35c5c7526b75d66f73b297fb25fff3c0b5225863 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Tue, 6 Apr 2021 17:34:13 +0800 Subject: [PATCH 370/578] fix coredump about crypt --- src/backend/utils/misc/relcrypt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index fa69b54d..954ff6c6 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1609,7 +1609,15 @@ Datum trsprt_crypt_decrypt_one_col_value(TranspCrypt*transp_crypt, if (TRANSP_CRYPT_INVALID_ALGORITHM_ID != transp_crypt->algo_id) { datum_text = decrypt_procedure(transp_crypt->algo_id, DatumGetTextP(inputval), INVALID_CONTEXT_LENGTH); + if (datum_text) + { datum_ret = transparent_crypt_text_get_datum(datum_text, attr); + } + else + { + datum_ret = transparent_crypt_text_get_datum(DatumGetTextP(inputval), attr); + } + return datum_ret; } From 2d5f2ba3cae3e7d8526f8489af44579df90a4f20 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 13 May 2021 19:24:35 +0800 Subject: [PATCH 371/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: 2pc file not found --- src/backend/access/transam/twophase.c | 398 +++++++++++++++----------- 1 file changed, 226 insertions(+), 172 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 
78ae69ff..f2fbc7e6 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -168,6 +168,8 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ +#define MAX_RETRY_TIMES 2 + /* hash table entry for 2pc record */ typedef struct Cache2pcInfo { @@ -179,6 +181,9 @@ typedef struct Cache2pcInfo inline void check_entry_key(const char *tid, const char *key, const char *func); +void +print_record_2pc_cache(const char *func); + void check_2pc_file(const char *tid, const char *info, const char *func); #endif @@ -2303,6 +2308,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) {// #lizard forgives int i; int serialized_xacts = 0; + char *func = "CheckPointTwoPhase"; #ifdef __TWO_PHASE_TRANS__ File fd = -1; @@ -2313,7 +2319,8 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) char path[MAXPGPATH]; #endif - elog(LOG, "[CheckPointTwoPhase] checkpoint: "UINT64_FORMAT, redo_horizon); + + elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, func, redo_horizon); if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2368,11 +2375,10 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) { /* save to file */ Assert(NULL != entry); - check_entry_key(gxact->gid, entry->key, "CheckPointTwoPhase"); - check_2pc_file(gxact->gid, entry->info, "CheckPointTwoPhase"); + check_entry_key(gxact->gid, entry->key, func); + check_2pc_file(gxact->gid, entry->info, func); - elog(LOG, "[CheckPointTwoPhase] %s is found " - "in hash table", gxact->gid); + elog(LOG, "[%s] %s is found in hash table", func, gxact->gid); size = strlen(entry->info); @@ -2382,38 +2388,54 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); } ret = write(fd, entry->info, size); if(ret != size) { close(fd); - elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } + + if (size != strlen(entry->info)) + { + elog(LOG, "[%s] %s size change from %d to %zu, info: %s", + func, gxact->gid, size, strlen(entry->info), entry->info); + + Assert(size < strlen(entry->info)); + ret = write(fd, entry->info + size, strlen(entry->info) - size); + if(ret != strlen(entry->info) - size) + { + close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } } close(fd); + fsync_fname(path, false); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, gxact->gid, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[CheckPointTwoPhase] %s is not found " - "in hash table when remove it", gxact->gid); + elog(WARNING, "[%s] %s is not found in hash table " + "when remove it", func, gxact->gid); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[CheckPointTwoPhase] %s is removed " - "from hash table", gxact->gid); + elog(LOG, "[%s] %s is removed from hash table", + func, gxact->gid); } } else { - elog(LOG, "[CheckPointTwoPhase] %s is not found " - "in hash table", gxact->gid); + elog(LOG, "[%s] %s is not found in hash table", func, gxact->gid); } } #endif @@ -2439,10 
+2461,9 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, "CheckPointTwoPhase"); + check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[CheckPointTwoPhase] key %s is found " - "in hash table", entry->key); + elog(LOG, "[%s] key %s is found in hash table", func, entry->key); if (IsXidImplicit(entry->key)) { @@ -2457,20 +2478,20 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if (0 != strcmp(start_node, PGXCNodeName)) { - elog(LOG, "[CheckPointTwoPhase] %s start node is not %s", - entry->key, PGXCNodeName); + elog(LOG, "[%s] %s start node is not %s", + func, entry->key, PGXCNodeName); continue; } else { - elog(LOG, "[CheckPointTwoPhase] %s start node is %s", - entry->key, PGXCNodeName); + elog(LOG, "[%s] %s start node is %s", + func, entry->key, PGXCNodeName); } } else { - elog(WARNING, "[CheckPointTwoPhase] %s get start node failed, " - "info: %s", entry->key, entry->info); + elog(WARNING, "[%s] %s get start node failed, info: %s", + func, entry->key, entry->info); } } @@ -2482,32 +2503,49 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); } ret = write(fd, entry->info, size); if(ret != size) { close(fd); - elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } + + if (size != strlen(entry->info)) + { + elog(LOG, "[%s] %s size change from %d to %zu, info: %s", + func, entry->key, size, strlen(entry->info), entry->info); + + Assert(size < strlen(entry->info)); + ret = write(fd, entry->info + size, strlen(entry->info) - size); + if(ret != strlen(entry->info) - size) + { close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } + } + close(fd); + fsync_fname(path, false); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, entry->key, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[CheckPointTwoPhase] %s is not found " - "in hash table when remove it", entry->key); + elog(WARNING, "[%s] %s is not found in hash table " + "when remove it", func, entry->key); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[CheckPointTwoPhase] %s is removed " - "from hash table", entry->key); + elog(LOG, "[%s] %s is removed from hash table", + func, entry->key); } } } @@ -3364,6 +3402,22 @@ inline void check_entry_key(const char *tid, const char *key, const char *func) } } +void print_record_2pc_cache(const char *func) +{ + if (NULL != record_2pc_cache) + { + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + elog(LOG, "[print_record_2pc_cache][%s] key: %s, info: %s", + func, entry->key, entry->info); + } + } +} /* * Check whether the 2pc file is exist when it is saved in the hash table. 
*/ @@ -3505,6 +3559,8 @@ void record_2pc_involved_nodes_xid(const char * tid, char *result = NULL; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "record_2pc_involved_nodes_xid"; + #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; #endif @@ -3516,23 +3572,22 @@ void record_2pc_involved_nodes_xid(const char * tid, if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_involved_nodes_xid] record %s, " - "startnode: %s, participants: %s", - tid, startnode, nodestring); + elog(LOG, "[%s] record %s, startnode: %s, participants: %s", + func, tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "[record_2pc_involved_nodes_xid] gid is empty"); + elog(ERROR, "[%s] gid is empty", func); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "[record_2pc_involved_nodes_xid] %s startnode is empty", tid); + elog(PANIC, "[%s] %s startnode is empty", func, tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "[record_2pc_involved_nodes_xid] %s participants is empty", tid); + elog(PANIC, "[%s] %s participants is empty", func, tid); } initStringInfo(&content); @@ -3554,14 +3609,14 @@ void record_2pc_involved_nodes_xid(const char * tid, if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); - check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (strncmp(entry->info, content.data, size) != 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " - "write %s info conflict, content: %s, info: %s", - tid, content.data, entry->info); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + func, tid, content.data, entry->info); } resetStringInfo(&content); @@ -3583,8 +3638,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { ereport(ERROR, (errcode_for_file_access(), - errmsg("[record_2pc_involved_nodes_xid] could not " - "open file %s for read", path))); + errmsg("[%s] could not open file %s for read", func, path))); } ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); if(ret != fileSize) @@ -3592,8 +3646,7 @@ void record_2pc_involved_nodes_xid(const char * tid, FileClose(fd); ereport(ERROR, (errcode_for_file_access(), - errmsg("[record_2pc_involved_nodes_xid] could not " - "read file %s, ret: %d", path, ret))); + errmsg("[%s] could not read file %s, ret: %d", func, path, ret))); } FileClose(fd); @@ -3601,9 +3654,9 @@ void record_2pc_involved_nodes_xid(const char * tid, if (strncmp(result, content.data, size) != 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " - "write %s info conflict, content: %s, info: %s", - tid, content.data, result); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + func, tid, content.data, result); } pfree(result); @@ -3633,8 +3686,8 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogFlush(xlogrec); run_pg_clean = 1; complish = true; - elog(STOP, "[record_2pc_involved_nodes_xid] twophase exception: " - "simulate kill start node after record 2pc file"); + elog(STOP, "[%s] twophase exception: simulate kill start node " + "after record 2pc file", func); } #endif } @@ -3646,29 +3699,28 @@ void record_2pc_involved_nodes_xid(const char * tid, tid, HASH_ENTER_NULL, &found); if (NULL != entry) { - check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); - check_2pc_file(tid, entry->info, 
"record_2pc_involved_nodes_xid"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (found) { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " - "in hash table in recovery mode", tid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, tid); } else { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " - "in hash table", tid); + elog(LOG, "[%s] %s is found in hash table", func, tid); } } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is added " - "to hash table", tid); + elog(LOG, "[%s] %s is added to hash table", func, tid); } memcpy(entry->info, content.data, size + 1); + check_entry_key(tid, entry->key, func); resetStringInfo(&content); pfree(content.data); @@ -3676,13 +3728,13 @@ void record_2pc_involved_nodes_xid(const char * tid, } else { - elog(LOG, "[record_2pc_involved_nodes_xid] %s entry is NULL", tid); + elog(LOG, "[%s] %s entry is NULL", func, tid); } } else if (NULL != record_2pc_cache) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s size: %d, " - "max info size: %d", tid, size, MAX_2PC_INFO_SIZE); + elog(LOG, "[%s] %s size: %d, max info size: %d", + func, tid, size, MAX_2PC_INFO_SIZE); } GET_2PC_FILE_PATH(path, tid); @@ -3705,8 +3757,8 @@ void record_2pc_involved_nodes_xid(const char * tid, } if (fd < 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); return; } @@ -3714,9 +3766,8 @@ void record_2pc_involved_nodes_xid(const char * tid, if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_involved_nodes_xid] could not write file %s, " - "errMsg: %s, ret: %d, content: %s", - path, strerror(errno), ret, content.data); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", + func, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3734,13 +3785,11 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta int ret = 0; int size = 0; int new_size = 0; + int retry_times = 0; XLogRecPtr xlogrec = 0; Cache2pcInfo *entry = NULL; bool found = false; -#if 0 - int i; - GlobalTransaction gxact = NULL; -#endif + char *func = "record_2pc_commit_timestamp"; if (!enable_2pc_recovery_info) { @@ -3749,17 +3798,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s commit_timestamp: " - INT64_FORMAT, tid, commit_timestamp); + elog(LOG, "[%s] %s commit_timestamp: "INT64_FORMAT, + func, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && (TWO_PHASE_COMMITTING == g_twophase_state.state || TWO_PHASE_COMMIT_END == g_twophase_state.state)) { - elog(ERROR, "[record_2pc_commit_timestamp] could not commit " - "transaction '%s' on node '%s' with InvalidGlobalTimestamp", - tid, PGXCNodeName); + elog(ERROR, "[%s] could not commit transaction '%s' on node '%s' " + "with InvalidGlobalTimestamp", func, tid, PGXCNodeName); } if (!RecoveryInProgress()) @@ -3782,25 +3830,24 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); - if (NULL != record_2pc_cache) + while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) { Assert(strlen(tid) < MAX_TID_SIZE); entry = (Cache2pcInfo 
*)hash_search(record_2pc_cache, tid, HASH_FIND, &found); if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "record_2pc_commit_timestamp"); - check_2pc_file(tid, entry->info, "record_2pc_commit_timestamp"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_commit_timestamp] %s is found " - "in hash table in recovery mode", tid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s is found " - "in hash table", tid); + elog(LOG, "[%s] %s is found in hash table", func, tid); } new_size = size + strlen(entry->info); @@ -3809,6 +3856,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { /* save to hash table */ memcpy(entry->info + strlen(entry->info), content.data, size + 1); + check_entry_key(tid, entry->key, func); resetStringInfo(&content); pfree(content.data); @@ -3816,8 +3864,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } /* save to file */ - elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " - "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); + elog(LOG, "[%s] %s new size(%d) overflow(%d)", + func, tid, new_size, MAX_2PC_INFO_SIZE); GET_2PC_FILE_PATH(path, tid); @@ -3833,9 +3881,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (fd < 0) { - elog(ERROR, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + elog(ERROR, "[%s] could not append timestamp in file %s, errMsg: %s", + func, path, strerror(errno)); } ret = FileWrite(fd, entry->info, strlen(entry->info), @@ -3843,18 +3890,18 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if(ret != strlen(entry->info)) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write " - "file %s, errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write " - "file %s, errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, content.data); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3863,24 +3910,32 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta tid, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[record_2pc_commit_timestamp] %s is not found" - "in hash table when remove it", tid); + elog(WARNING, "[%s] %s is not found in hash table when remove it", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s is removed " - "from hash table", entry->key); + elog(LOG, "[%s] %s is removed from hash table", func, entry->key); } resetStringInfo(&content); pfree(content.data); return; } - else - { - elog(LOG, "[record_2pc_commit_timestamp] %s is not found " - "in hash table", tid); + + /* not found */ + elog(LOG, "[%s] %s is not found in hash table, retry times: %d", + func, tid, retry_times); + + Assert(NULL == entry); + print_record_2pc_cache(func); + + pg_usleep(5000L); /* sleep 5ms */ } + + if (NULL != 
record_2pc_cache) + { + elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); } GET_2PC_FILE_PATH(path, tid); @@ -3892,6 +3947,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (RecoveryInProgress()) { #if 0 + int i; + GlobalTransaction gxact = NULL; for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; @@ -3901,19 +3958,18 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (0 == strcmp(gxact->gid, tid)) { - elog(ERROR, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + elog(ERROR, "[%s] could not append timestamp in file %s, " + "errMsg: %s", func, path, strerror(errno)); } } #endif - elog(LOG, "[record_2pc_commit_timestamp] could not open file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not open file %s, errMsg: %s", + func, path, strerror(errno)); } else { - elog(ERROR, "[record_2pc_commit_timestamp] could not open file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not open file %s, errMsg: %s", + func, path, strerror(errno)); } return; } @@ -3922,17 +3978,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { memset(file_content, 0, FILE_CONTENT_SIZE); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " - "file_content: %s, content.data: %s, ret: %d", - path, file_content, content.data, ret); + elog(LOG, "[%s] before append file: %s, file_content: %s, content.data: %s, " + "ret: %d", func, path, file_content, content.data, ret); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not write file %s, errMsg: %s", + func, path, strerror(errno)); } if (enable_distri_print) @@ -3940,9 +3995,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta memset(file_content, 0, FILE_CONTENT_SIZE); FileSeek(fd, 0, SEEK_SET); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " - "file_content: %s, ret: %d", - path, file_content, ret); + elog(LOG, "[%s] after append file: %s, file_content: %s, ret: %d", + func, path, file_content, ret); } FileClose(fd); @@ -3956,6 +4010,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) char path[MAXPGPATH]; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "remove_2pc_records"; if (!enable_2pc_recovery_info) { @@ -3964,8 +4019,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[remove_2pc_records] %s record_in_xlog: %d", - tid, record_in_xlog); + elog(LOG, "[%s] %s record_in_xlog: %d", func, tid, record_in_xlog); } if (!RecoveryInProgress() && record_in_xlog) @@ -3987,8 +4041,8 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "remove_2pc_records"); - check_2pc_file(tid, entry->info, "remove_2pc_records"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); } } entry = (Cache2pcInfo *)hash_search(record_2pc_cache, @@ -3998,8 +4052,7 @@ void 
remove_2pc_records(const char * tid, bool record_in_xlog) Assert(NULL != entry); if (enable_2pc_entry_trace) { - elog(LOG, "[remove_2pc_records] %s is removed " - "from hash table", tid); + elog(LOG, "[%s] %s is removed from hash table", func, tid); } return; } @@ -4013,8 +4066,8 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) */ if (0 != unlink(path)) { - elog(LOG, "[remove_2pc_records] could not unlink file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not unlink file %s, errMsg: %s", + func, path, strerror(errno)); } } @@ -4026,6 +4079,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) bool found = false; File fd = 0; int ret = 0; + char *func = "rename_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4034,8 +4088,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[rename_2pc_records] %s timestamp: " - INT64_FORMAT, tid, timestamp); + elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, func, tid, timestamp); } if (0 == timestamp) @@ -4064,8 +4117,8 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "rename_2pc_records"); - check_2pc_file(tid, entry->info, "rename_2pc_records"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (RecoveryInProgress()) { @@ -4079,8 +4132,8 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) } if (fd < 0) { - elog(ERROR, "[rename_2pc_records] could not create file %s, " - "errMsg: %s", new_path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, new_path, strerror(errno)); } ret = FileWrite(fd, entry->info, strlen(entry->info), @@ -4088,9 +4141,9 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if(ret != strlen(entry->info)) { FileClose(fd); - elog(ERROR, "[rename_2pc_records] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } FileClose(fd); @@ -4098,13 +4151,12 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) tid, HASH_REMOVE, &found); if (!found) { - elog(ERROR, "[rename_2pc_records] %s is not found " - "in hash table when remove it", tid); + elog(ERROR, "[%s] %s is not found in hash table when remove it", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[rename_2pc_records] %s is removed " - "from hash table", tid); + elog(LOG, "[%s] %s is removed from hash table", func, tid); } return; } @@ -4112,19 +4164,19 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (0 != access(path, F_OK)) { - elog(LOG, "[rename_2pc_records] could not access file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not access file %s, errMsg: %s", + func, path, strerror(errno)); return; } if (0 != link(path, new_path)) { - elog(ERROR, "[rename_2pc_records] could not link file %s to %s, " - "errMsg: %s", path, new_path, strerror(errno)); + elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", + func, path, new_path, strerror(errno)); } if (0 != unlink(path)) { - elog(WARNING, "[rename_2pc_records] could not unlink file %s, " - "errMsg: %s", path, strerror(errno)); + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + func, path, strerror(errno)); } } @@ -4136,6 +4188,7 @@ void 
record_2pc_readonly(const char *gid) char content[10] = "readonly"; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "record_2pc_readonly"; if(!enable_2pc_recovery_info) { @@ -4144,7 +4197,7 @@ void record_2pc_readonly(const char *gid) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_readonly] %s is readonly", gid); + elog(LOG, "[%s] %s is readonly", func, gid); } if (!RecoveryInProgress()) @@ -4163,33 +4216,34 @@ void record_2pc_readonly(const char *gid) gid, HASH_ENTER_NULL, &found); if (NULL != entry) { - check_entry_key(gid, entry->key, "record_2pc_readonly"); - check_2pc_file(gid, entry->info, "record_2pc_readonly"); + check_entry_key(gid, entry->key, func); + check_2pc_file(gid, entry->info, func); if (found) { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_readonly] %s is found " - "in hash table in recovery mode", gid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, gid); } else { - elog(LOG, "[record_2pc_readonly] %s is found " - "in hash table", gid); + elog(LOG, "[%s] %s is found in hash table", func, gid); } } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_readonly] %s is added " - "to hash table", gid); + elog(LOG, "[%s] %s is added to hash table", func, gid); } + memcpy(entry->info, content, strlen(content) + 1); + check_entry_key(gid, entry->key, func); + return; } else { - elog(LOG, "[record_2pc_readonly] %s entry is NULL", gid); + elog(LOG, "[%s] %s entry is NULL", func, gid); } } @@ -4215,8 +4269,8 @@ void record_2pc_readonly(const char *gid) } if (fd < 0) { - elog(ERROR, "[record_2pc_readonly] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); return; } @@ -4224,9 +4278,8 @@ void record_2pc_readonly(const char *gid) if(ret != strlen(content)) { FileClose(fd); - elog(ERROR, "[record_2pc_readonly] could not write file %s, " - "errMsg: %s, ret: %d, content: %s", - path, strerror(errno), ret, content); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", + func, path, strerror(errno), ret, content); } FileClose(fd); } @@ -4238,6 +4291,8 @@ char *get_2pc_info_from_cache(const char *tid) { Cache2pcInfo *entry = NULL; bool found = false; + char *func = "get_2pc_info_from_cache"; + if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); @@ -4247,13 +4302,12 @@ char *get_2pc_info_from_cache(const char *tid) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "get_2pc_info_from_cache"); + check_entry_key(tid, entry->key, func); if (enable_2pc_entry_trace) { - elog(LOG, "[get_2pc_info_from_cache] %s is found " - "in hast table, key: %s, info: %s", - tid, entry->key, entry->info); + elog(LOG, "[%s] %s is found in hast table, key: %s, info: %s", + func, tid, entry->key, entry->info); } return entry->info; @@ -4261,8 +4315,7 @@ char *get_2pc_info_from_cache(const char *tid) if (enable_2pc_entry_trace) { - elog(LOG, "[get_2pc_info_from_cache] %s is not found " - "in hast table", tid); + elog(LOG, "[%s] %s is not found in hast table", func, tid); } } return NULL; @@ -4276,6 +4329,7 @@ char *get_2pc_list_from_cache(int *count) HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; char *recordList = NULL; + char *func = "get_2pc_list_from_cache"; if (NULL == record_2pc_cache) { @@ -4286,7 +4340,7 @@ char *get_2pc_list_from_cache(int *count) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, 
"get_2pc_list_from_cache"); + check_2pc_file(entry->key, entry->info, func); if (NULL != count && *count >= MAX_OUTPUT_FILE) { @@ -4335,7 +4389,7 @@ Record2pcCacheInit(void) flags = HASH_ELEM | HASH_PARTITION; record_2pc_cache = ShmemInitHash("Record 2pc Cache", - record_2pc_cache_size/4, record_2pc_cache_size, + record_2pc_cache_size/2, record_2pc_cache_size, &info, flags); } From 89d29959851865d6f3d2d28d7af871db8530036f Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Thu, 13 May 2021 20:16:42 +0800 Subject: [PATCH 372/578] fix regress for http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 (merge request !324) Squash merge branch 'tbase_v2_hanway513' into 'Tbase_v2.15.19' * fix regress for http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 --- src/test/regress/expected/alter_generic.out | 30 +- src/test/regress/expected/event_trigger.out | 7 +- src/test/regress/expected/foreign_data.out | 1298 +++++++++++------ src/test/regress/expected/object_address.out | 219 ++- src/test/regress/expected/rolenames.out | 191 ++- .../regress/expected/xl_limitations_1.out | 5 +- 6 files changed, 1145 insertions(+), 605 deletions(-) diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 2d7998ff..788c5964 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -149,34 +149,28 @@ SELECT n.nspname, c.conname, a.rolname -- Foreign Data Wrapper and Foreign Server -- CREATE FOREIGN DATA WRAPPER alt_fdw1; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE FOREIGN DATA WRAPPER alt_fdw2; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER alt_fserv1 FOREIGN DATA WRAPPER alt_fdw1; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER alt_fserv2 FOREIGN DATA WRAPPER alt_fdw2; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported ALTER FOREIGN DATA WRAPPER alt_fdw1 RENAME TO alt_fdw2; -- failed (name conflict) -ERROR: foreign-data wrapper "alt_fdw1" does not exist +ERROR: foreign-data wrapper "alt_fdw2" already exists ALTER FOREIGN DATA WRAPPER alt_fdw1 RENAME TO alt_fdw3; -- OK -ERROR: foreign-data wrapper "alt_fdw1" does not exist ALTER SERVER alt_fserv1 RENAME TO alt_fserv2; -- failed (name conflict) -ERROR: server "alt_fserv1" does not exist +ERROR: server "alt_fserv2" already exists ALTER SERVER alt_fserv1 RENAME TO alt_fserv3; -- OK -ERROR: server "alt_fserv1" does not exist SELECT fdwname FROM pg_foreign_data_wrapper WHERE fdwname like 'alt_fdw%'; fdwname ---------- -(0 rows) +---------- + alt_fdw2 + alt_fdw3 +(2 rows) SELECT srvname FROM pg_foreign_server WHERE srvname like 'alt_fserv%'; srvname ---------- -(0 rows) +------------ + alt_fserv2 + alt_fserv3 +(2 rows) -- -- Procedural Language @@ -681,9 +675,9 @@ SELECT nspname, prsname --- \set VERBOSITY terse \\ -- suppress cascade details DROP FOREIGN DATA WRAPPER alt_fdw2 CASCADE; -ERROR: foreign-data wrapper "alt_fdw2" does not exist +NOTICE: drop cascades to server alt_fserv2 DROP FOREIGN DATA WRAPPER alt_fdw3 CASCADE; -ERROR: foreign-data wrapper "alt_fdw3" does not exist +NOTICE: drop cascades to server alt_fserv3 DROP LANGUAGE alt_lang2 CASCADE; DROP LANGUAGE alt_lang3 CASCADE; DROP SCHEMA alt_nsp1 CASCADE; diff --git a/src/test/regress/expected/event_trigger.out 
b/src/test/regress/expected/event_trigger.out index 6ff64a59..8c96ae64 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -109,14 +109,8 @@ comment on table event_trigger_fire1 is 'here is a comment'; revoke all on table event_trigger_fire1 from public; drop table event_trigger_fire1; create foreign data wrapper useless; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported create server useless_server foreign data wrapper useless; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported create user mapping for regress_evt_user server useless_server; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported alter default privileges for role regress_evt_user revoke delete on tables from regress_evt_user; -- alter owner to non-superuser should fail @@ -139,6 +133,7 @@ ERROR: event trigger "regress_event_trigger" does not exist drop role regress_evt_user; ERROR: role "regress_evt_user" cannot be dropped because some objects depend on it DETAIL: owner of default privileges on new relations belonging to role regress_evt_user +owner of user mapping for regress_evt_user on server useless_server -- cleanup before next test -- these are all OK; the second one should emit a NOTICE drop event trigger if exists regress_event_trigger2; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index dec9af7f..26e01f5f 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -14,18 +14,15 @@ CREATE ROLE regress_test_role_super SUPERUSER; CREATE ROLE regress_test_indirect; CREATE ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER dummy; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; -ERROR: foreign-data wrapper "dummy" does not exist CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported -- At this point we should have 2 built-in wrappers and no servers. 
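The extra DETAIL line added to event_trigger.out above comes from the USER MAPPING that this test can now actually create. A minimal sketch of the cleanup order that new dependency implies, using the test's own object names (the role also still owns default-privilege entries, which is why the plain DROP ROLE keeps failing in the expected output):

    DROP USER MAPPING FOR regress_evt_user SERVER useless_server;
    DROP OWNED BY regress_evt_user;   -- clears the default-privilege entries the role still owns
    DROP ROLE regress_evt_user;
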
SELECT fdwname, fdwhandler::regproc, fdwvalidator::regproc, fdwoptions FROM pg_foreign_data_wrapper ORDER BY 1, 2, 3; fdwname | fdwhandler | fdwvalidator | fdwoptions ----------+------------+--------------+------------ -(0 rows) +------------+------------+--------------------------+------------ + dummy | - | - | + postgresql | - | postgresql_fdw_validator | +(2 rows) SELECT srvname, srvoptions FROM pg_foreign_server; srvname | srvoptions @@ -39,60 +36,58 @@ SELECT * FROM pg_user_mapping; -- CREATE FOREIGN DATA WRAPPER CREATE FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: function bar(text[], oid) does not exist CREATE FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew List of foreign-data wrappers Name | Owner | Handler | Validator -------+-------+---------+----------- -(0 rows) +------------+---------------------------+---------+-------------------------- + dummy | regress_foreign_data_user | - | - + foo | regress_foreign_data_user | - | - + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator +(3 rows) CREATE FOREIGN DATA WRAPPER foo; -- duplicate -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "foo" already exists DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+---------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (testing '1') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', testing '2'); -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: option "testing" provided more than once CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', another '2'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (testing '1', another '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The 
feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foo" +HINT: Must be superuser to create a foreign-data wrapper. RESET ROLE; CREATE FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | postgresql_fdw_validator | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) -- ALTER FOREIGN DATA WRAPPER ALTER FOREIGN DATA WRAPPER foo; -- ERROR @@ -100,46 +95,53 @@ ERROR: syntax error at or near ";" LINE 1: ALTER FOREIGN DATA WRAPPER foo; ^ ALTER FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: function bar(text[], oid) does not exist ALTER FOREIGN DATA WRAPPER foo NO VALIDATOR; -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '1', b '2'); -ERROR: foreign-data wrapper "foo" does not exist ALTER FOREIGN DATA WRAPPER foo OPTIONS (SET c '4'); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP c); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (a '1', b '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP a, SET b '3', ADD c '4'); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4') | + postgresql | regress_foreign_data_user | - | 
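The ownership error just above follows from a single rule: a foreign-data wrapper must be owned by a superuser, and only a superuser may reassign it. A minimal sketch of that rule, where the role name fdw_admin is illustrative and not part of the test:

    CREATE ROLE fdw_admin SUPERUSER;
    ALTER FOREIGN DATA WRAPPER foo OWNER TO fdw_admin;         -- accepted: new owner is a superuser
    ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role; -- rejected: new owner is not
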
postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '2'); -ERROR: foreign-data wrapper "foo" does not exist ALTER FOREIGN DATA WRAPPER foo OPTIONS (b '4'); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "b" provided more than once \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-----------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) SET ROLE regress_test_role; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); -- ERROR @@ -147,17 +149,19 @@ ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied to change owner of foreign-data wrapper "foo" +HINT: The owner of a foreign-data wrapper must be a superuser. 
ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role_super; -ERROR: foreign-data wrapper "foo" does not exist ALTER ROLE regress_test_role_super NOSUPERUSER; SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD e '6'); -- ERROR @@ -167,19 +171,23 @@ RESET ROLE; \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo RENAME TO foo1; -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo1 | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo1 RENAME TO foo; -ERROR: foreign-data wrapper "foo1" does not exist -- DROP FOREIGN DATA WRAPPER DROP FOREIGN DATA WRAPPER nonexistent; -- ERROR ERROR: foreign-data wrapper "nonexistent" does not exist @@ -188,71 +196,78 @@ NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP ROLE regress_test_role_super; -- ERROR +ERROR: role "regress_test_role_super" cannot be dropped because some objects depend on it +DETAIL: owner of foreign-data wrapper foo SET ROLE regress_test_role_super; -ERROR: role "regress_test_role_super" does not exist DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist RESET ROLE; DROP ROLE regress_test_role_super; -ERROR: role "regress_test_role_super" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(2 rows) CREATE FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 
FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported COMMENT ON SERVER s1 IS 'foreign server'; -ERROR: server "s1" does not exist CREATE USER MAPPING FOR current_user SERVER s1; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: user mapping for "regress_foreign_data_user" already exists for server s1 CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+------+---------+-------------+---------------- + s1 | regress_foreign_data_user | foo | | | | | foreign server +(1 row) \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------- + s1 | regress_foreign_data_user | +(1 row) DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: cannot drop foreign-data wrapper foo because other objects depend on it +DETAIL: server s1 depends on foreign-data wrapper foo +user mapping for regress_foreign_data_user on server s1 depends on server s1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
SET ROLE regress_test_role; DROP FOREIGN DATA WRAPPER foo CASCADE; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo RESET ROLE; DROP FOREIGN DATA WRAPPER foo CASCADE; -ERROR: foreign-data wrapper "foo" does not exist +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to server s1 +drop cascades to user mapping for regress_foreign_data_user on server s1 \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(2 rows) \des+ List of foreign servers @@ -268,87 +283,84 @@ ERROR: foreign-data wrapper "foo" does not exist -- exercise CREATE SERVER CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS ("test wrapper" 'true'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: server "s1" already exists CREATE SERVER IF NOT EXISTS s1 FOREIGN DATA WRAPPER foo; -- No ERROR, just NOTICE -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +NOTICE: server "s1" already exists, skipping CREATE SERVER s2 FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s3 TYPE 'oracle' FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s4 TYPE 'oracle' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s5 VERSION '15.0' FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s6 VERSION '16.0' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s7 TYPE 'oracle' VERSION '17.0' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (foo '1'); -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: invalid option "foo" +HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (host 'localhost', dbname 's8db'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported \des+ 
List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | +(8 rows) SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; -- ERROR: no usage on FDW -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported RESET ROLE; \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | +(9 rows) REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_indirect; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) 
+------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) RESET ROLE; REVOKE regress_test_indirect FROM regress_test_role; @@ -360,72 +372,97 @@ LINE 1: ALTER SERVER s0; ALTER SERVER s0 OPTIONS (a '1'); -- ERROR ERROR: server "s0" does not exist ALTER SERVER s1 VERSION '1.0' OPTIONS (servername 's1'); -ERROR: server "s1" does not exist ALTER SERVER s2 VERSION '1.1'; -ERROR: server "s2" does not exist ALTER SERVER s3 OPTIONS ("tns name" 'orcl', port '1521'); -ERROR: server "s3" does not exist GRANT USAGE ON FOREIGN SERVER s1 TO regress_test_role; -ERROR: server "s1" does not exist GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role2 WITH GRANT OPTION; -ERROR: server "s6" does not exist \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 1.0 | (servername 's1') | + | | | regress_test_role=U/regress_foreign_data_user | | | | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) SET ROLE regress_test_role; ALTER SERVER s1 VERSION '1.1'; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be owner of foreign server s1 ALTER SERVER s1 OWNER TO regress_test_role; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be owner of foreign server s1 RESET ROLE; ALTER SERVER s1 OWNER TO regress_test_role; -ERROR: server "s1" does not exist GRANT regress_test_role2 TO regress_test_role; SET ROLE regress_test_role; ALTER SERVER s1 VERSION '1.1'; -ERROR: server "s1" does not exist ALTER SERVER s1 OWNER TO regress_test_role2; -- ERROR -ERROR: server "s1" does not exist +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; ALTER SERVER s8 OPTIONS (foo '1'); -- ERROR option validation -ERROR: 
server "s8" does not exist +ERROR: invalid option "foo" +HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib ALTER SERVER s8 OPTIONS (connect_timeout '30', SET dbname 'db1', DROP host); -ERROR: server "s8" does not exist SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be member of role "regress_test_indirect" RESET ROLE; GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -ERROR: server "s1" does not exist RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_indirect; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -ERROR: server "s1" does not exist RESET ROLE; DROP ROLE regress_test_indirect; -- ERROR +ERROR: role "regress_test_indirect" cannot be dropped because some objects depend on it +DETAIL: owner of server s1 +privileges for foreign-data wrapper foo \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- + s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (dbname 'db1', connect_timeout '30') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) ALTER SERVER s8 RENAME to s8new; -ERROR: server "s8" does not exist \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +-------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- + s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | 
regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8new | regress_foreign_data_user | postgresql | | | | (dbname 'db1', connect_timeout '30') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) ALTER SERVER s8new RENAME to s8; -ERROR: server "s8new" does not exist -- DROP SERVER DROP SERVER nonexistent; -- ERROR ERROR: server "nonexistent" does not exist @@ -434,51 +471,83 @@ NOTICE: server "nonexistent" does not exist, skipping \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s1 | regress_test_indirect | foo + s2 | regress_foreign_data_user | foo + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(10 rows) SET ROLE regress_test_role; DROP SERVER s2; -- ERROR -ERROR: server "s2" does not exist +ERROR: must be owner of foreign server s2 DROP SERVER s1; -ERROR: server "s1" does not exist RESET ROLE; \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s2 | regress_foreign_data_user | foo + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(9 rows) ALTER SERVER s2 OWNER TO regress_test_role; -ERROR: server "s2" does not exist SET ROLE regress_test_role; DROP SERVER s2; -ERROR: server "s2" does not exist RESET ROLE; \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(8 rows) CREATE USER MAPPING FOR current_user SERVER s3; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s3 | regress_foreign_data_user +(1 row) DROP SERVER s3; -- ERROR -ERROR: server "s3" does not exist +ERROR: cannot drop server s3 because other objects depend on it +DETAIL: user mapping for regress_foreign_data_user on server s3 depends on server s3 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
DROP SERVER s3 CASCADE; -ERROR: server "s3" does not exist +NOTICE: drop cascades to user mapping for regress_foreign_data_user on server s3 \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(7 rows) \deu List of user mappings @@ -488,59 +557,44 @@ List of user mappings -- CREATE USER MAPPING CREATE USER MAPPING FOR regress_test_missing_role SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: role "regress_test_missing_role" does not exist CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: server "s1" does not exist CREATE USER MAPPING FOR current_user SERVER s4; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR user SERVER s4; -- ERROR duplicate -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: user mapping for "regress_foreign_data_user" already exists for server s4 CREATE USER MAPPING FOR public SERVER s4 OPTIONS ("this mapping" 'is public'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR user SERVER s8 OPTIONS (username 'test', password 'secret'); -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: invalid option "username" +HINT: Valid options in this context are: user, password CREATE USER MAPPING FOR user SERVER s8 OPTIONS (user 'test', password 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER SERVER s5 OWNER TO regress_test_role; -ERROR: server "s5" does not exist ALTER SERVER s6 OWNER TO regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist SET ROLE regress_test_role; CREATE USER MAPPING FOR current_user SERVER s5; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s6 OPTIONS (username 'test'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s7; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign server s7 CREATE USER MAPPING FOR public SERVER s8; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s8 RESET ROLE; ALTER SERVER t1 OWNER TO regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist SET ROLE regress_test_role; CREATE USER MAPPING FOR current_user SERVER t1 OPTIONS (username 'bob', password 'boo'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR public SERVER t1; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently 
supported RESET ROLE; \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s4 | public + s4 | regress_foreign_data_user + s5 | regress_test_role + s6 | regress_test_role + s8 | regress_foreign_data_user + t1 | public + t1 | regress_test_role +(7 rows) -- ALTER USER MAPPING ALTER USER MAPPING FOR regress_test_missing_role SERVER s4 OPTIONS (gotcha 'true'); -- ERROR @@ -548,24 +602,29 @@ ERROR: role "regress_test_missing_role" does not exist ALTER USER MAPPING FOR user SERVER ss4 OPTIONS (gotcha 'true'); -- ERROR ERROR: server "ss4" does not exist ALTER USER MAPPING FOR public SERVER s5 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s5" does not exist +ERROR: user mapping for "public" does not exist for the server ALTER USER MAPPING FOR current_user SERVER s8 OPTIONS (username 'test'); -- ERROR -ERROR: server "s8" does not exist +ERROR: invalid option "username" +HINT: Valid options in this context are: user, password ALTER USER MAPPING FOR current_user SERVER s8 OPTIONS (DROP user, SET password 'public'); -ERROR: server "s8" does not exist SET ROLE regress_test_role; ALTER USER MAPPING FOR current_user SERVER s5 OPTIONS (ADD modified '1'); -ERROR: server "s5" does not exist ALTER USER MAPPING FOR public SERVER s4 OPTIONS (ADD modified '1'); -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR public SERVER t1 OPTIONS (ADD modified '1'); -ERROR: server "t1" does not exist RESET ROLE; \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+---------------------------------- + s4 | public | ("this mapping" 'is public') + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | (username 'test') + s8 | regress_foreign_data_user | (password 'public') + t1 | public | (modified '1') + t1 | regress_test_role | (username 'bob', password 'boo') +(7 rows) -- DROP USER MAPPING DROP USER MAPPING FOR regress_test_missing_role SERVER s4; -- ERROR @@ -573,33 +632,36 @@ ERROR: role "regress_test_missing_role" does not exist DROP USER MAPPING FOR user SERVER ss4; ERROR: server "ss4" does not exist DROP USER MAPPING FOR public SERVER s7; -- ERROR -ERROR: server "s7" does not exist +ERROR: user mapping for "public" does not exist for the server DROP USER MAPPING IF EXISTS FOR regress_test_missing_role SERVER s4; NOTICE: role "regress_test_missing_role" does not exist, skipping DROP USER MAPPING IF EXISTS FOR user SERVER ss4; NOTICE: server does not exist, skipping DROP USER MAPPING IF EXISTS FOR public SERVER s7; -NOTICE: server does not exist, skipping +NOTICE: user mapping for "public" does not exist for the server, skipping CREATE USER MAPPING FOR public SERVER s8; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported SET ROLE regress_test_role; DROP USER MAPPING FOR public SERVER s8; -- ERROR -ERROR: server "s8" does not exist +ERROR: must be owner of foreign server s8 RESET ROLE; DROP SERVER s7; -ERROR: server "s7" does not exist \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s4 | public + s4 | regress_foreign_data_user + s5 | regress_test_role + s6 | regress_test_role + s8 | public + s8 | regress_foreign_data_user + t1 | public + t1 | regress_test_role +(8 rows) -- CREATE FOREIGN TABLE CREATE SCHEMA foreign_schema; CREATE SERVER s0 
FOREIGN DATA WRAPPER dummy; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE FOREIGN TABLE ft1 (); -- ERROR ERROR: syntax error at or near ";" LINE 1: CREATE FOREIGN TABLE ft1 (); @@ -643,123 +705,132 @@ CREATE FOREIGN TABLE ft1 ( c3 date, CHECK (c3 BETWEEN '1994-01-01'::date AND '1994-01-31'::date) ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist COMMENT ON FOREIGN TABLE ft1 IS 'ft1'; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS 'ft1.c1'; -ERROR: relation "ft1" does not exist \d+ ft1 + Foreign table "public.ft1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+--------------------------------+----------+--------------+------------- + c1 | integer | | not null | | ("param 1" 'val1') | plain | | ft1.c1 + c2 | text | | | | (param2 'val2', param3 'val3') | extended | | + c3 | date | | | | | plain | | +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + \det+ List of foreign tables Schema | Table | Server | FDW options | Description ---------+-------+--------+-------------+------------- -(0 rows) +--------+-------+--------+-------------------------------------------------+------------- + public | ft1 | s0 | (delimiter ',', quote '"', "be quoted" 'value') | ft1 +(1 row) CREATE INDEX id_ft1_c2 ON ft1 (c2); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot create index on foreign table "ft1" SELECT * FROM ft1; -- ERROR -ERROR: relation "ft1" does not exist -LINE 1: SELECT * FROM ft1; - ^ +ERROR: foreign-data wrapper "dummy" has no handler EXPLAIN SELECT * FROM ft1; -- ERROR -ERROR: relation "ft1" does not exist -LINE 1: EXPLAIN SELECT * FROM ft1; - ^ +ERROR: foreign-data wrapper "dummy" has no handler -- ALTER FOREIGN TABLE COMMENT ON FOREIGN TABLE ft1 IS 'foreign table'; -ERROR: relation "ft1" does not exist COMMENT ON FOREIGN TABLE ft1 IS NULL; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS 'foreign column'; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c6 SET NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c7 DROP NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 TYPE char(10) USING '0'; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: "ft1" is not a table ALTER FOREIGN TABLE ft1 ALTER COLUMN 
c8 TYPE char(10); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET DATA TYPE text; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN xmin OPTIONS (ADD p1 'v1'); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot alter system column "xmin" ALTER FOREIGN TABLE ft1 ALTER COLUMN c7 OPTIONS (ADD p1 'v1', ADD p2 'v2'), ALTER COLUMN c8 OPTIONS (ADD p1 'v1', ADD p2 'v2'); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 OPTIONS (SET p2 'V2', DROP p1); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c1 SET STATISTICS 10000; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c1 SET (n_distinct = 100); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STATISTICS -1; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STORAGE PLAIN; -ERROR: relation "ft1" does not exist \d+ ft1 + Foreign table "public.ft1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+--------------------------------+----------+--------------+------------- + c1 | integer | | not null | | ("param 1" 'val1') | plain | 10000 | + c2 | text | | | | (param2 'val2', param3 'val3') | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | 0 | | plain | | + c5 | integer | | | | | plain | | + c6 | integer | | not null | | | plain | | + c7 | integer | | | | (p1 'v1', p2 'v2') | plain | | + c8 | text | | | | (p2 'V2') | plain | | + c9 | integer | | | | | plain | | + c10 | integer | | | | (p1 'v1') | plain | | +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- can't change the column type if it's used elsewhere CREATE TABLE use_ft1_column_type (x ft1); -ERROR: type "ft1" does not exist -LINE 1: CREATE TABLE use_ft1_column_type (x ft1); - ^ ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET DATA TYPE integer; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot alter foreign table "ft1" because column "use_ft1_column_type.x" uses its row type DROP TABLE use_ft1_column_type; -ERROR: table "use_ft1_column_type" does not exist ALTER FOREIGN TABLE ft1 ADD PRIMARY KEY (c7); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: primary key constraints are not supported on foreign tables +LINE 1: ALTER FOREIGN TABLE ft1 ADD PRIMARY KEY (c7); + ^ ALTER FOREIGN TABLE ft1 ADD CONSTRAINT ft1_c9_check CHECK (c9 < 0) NOT VALID; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER CONSTRAINT ft1_c9_check DEFERRABLE; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: "ft1" is not a table ALTER FOREIGN TABLE ft1 DROP CONSTRAINT ft1_c9_check; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP CONSTRAINT no_const; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: constraint "no_const" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP CONSTRAINT IF EXISTS no_const; -ERROR: relation "ft1" does not exist +NOTICE: constraint "no_const" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 SET WITH OIDS; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 OWNER TO regress_test_role; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); -ERROR: 
relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN no_column; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: column "no_column" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; -ERROR: relation "ft1" does not exist +NOTICE: column "no_column" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 DROP COLUMN c9; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE foreign_schema.ft1 RENAME c1 TO foreign_column_1; -ERROR: relation "foreign_schema.ft1" does not exist ALTER FOREIGN TABLE foreign_schema.ft1 RENAME TO foreign_table_1; -ERROR: relation "foreign_schema.ft1" does not exist \d foreign_schema.foreign_table_1 + Foreign table "foreign_schema.foreign_table_1" + Column | Type | Collation | Nullable | Default | FDW options +------------------+---------+-----------+----------+---------+-------------------------------- + foreign_column_1 | integer | | not null | | ("param 1" 'val1') + c2 | text | | | | (param2 'val2', param3 'val3') + c3 | date | | | | + c4 | integer | | | 0 | + c5 | integer | | | | + c6 | integer | | not null | | + c7 | integer | | | | (p1 'v1', p2 'v2') + c8 | text | | | | (p2 'V2') + c10 | integer | | | | (p1 'v1') +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (quote '~', "be quoted" 'value', escape '@') + -- alter noexisting table ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping @@ -807,77 +878,132 @@ NOTICE: relation "doesnt_exist_ft1" does not exist, skipping -- Information schema SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language -------------------------------+---------------------------+--------------------------+--------------+------------------------------- -(0 rows) +------------------------------+---------------------------+---------------------------+--------------+------------------------------- + regression | dummy | regress_foreign_data_user | | c + regression | foo | regress_foreign_data_user | | c + regression | postgresql | regress_foreign_data_user | | c +(3 rows) SELECT * FROM information_schema.foreign_data_wrapper_options ORDER BY 1, 2, 3; foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value -------------------------------+---------------------------+-------------+-------------- -(0 rows) +------------------------------+---------------------------+--------------+-------------- + regression | foo | test wrapper | true +(1 row) SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier -------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+-------------------------- -(0 rows) 
+------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+--------------------------- + regression | s0 | regression | dummy | | | regress_foreign_data_user + regression | s4 | regression | foo | oracle | | regress_foreign_data_user + regression | s5 | regression | foo | | 15.0 | regress_test_role + regression | s6 | regression | foo | | 16.0 | regress_test_indirect + regression | s8 | regression | postgresql | | | regress_foreign_data_user + regression | t1 | regression | foo | | | regress_test_indirect + regression | t2 | regression | foo | | | regress_test_role +(7 rows) SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; foreign_server_catalog | foreign_server_name | option_name | option_value -------------------------+---------------------+-------------+-------------- -(0 rows) +------------------------+---------------------+-----------------+-------------- + regression | s4 | dbname | b + regression | s4 | host | a + regression | s6 | dbname | b + regression | s6 | host | a + regression | s8 | connect_timeout | 30 + regression | s8 | dbname | db1 +(6 rows) SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_identifier), 2, 3; authorization_identifier | foreign_server_catalog | foreign_server_name ---------------------------+------------------------+--------------------- -(0 rows) +---------------------------+------------------------+--------------------- + PUBLIC | regression | s4 + PUBLIC | regression | s8 + PUBLIC | regression | t1 + regress_foreign_data_user | regression | s4 + regress_foreign_data_user | regression | s8 + regress_test_role | regression | s5 + regress_test_role | regression | s6 + regress_test_role | regression | t1 +(8 rows) SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorization_identifier), 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value ---------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) +---------------------------+------------------------+---------------------+--------------+-------------- + PUBLIC | regression | s4 | this mapping | is public + PUBLIC | regression | t1 | modified | 1 + regress_foreign_data_user | regression | s8 | password | public + regress_test_role | regression | s5 | modified | 1 + regress_test_role | regression | s6 | username | test + regress_test_role | regression | t1 | password | boo + regress_test_role | regression | t1 | username | bob +(7 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression 
| | s6 | FOREIGN SERVER | USAGE | YES +(4 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(4 rows) SELECT * FROM information_schema.foreign_tables ORDER BY 1, 2, 3; foreign_table_catalog | foreign_table_schema | foreign_table_name | foreign_server_catalog | foreign_server_name -----------------------+----------------------+--------------------+------------------------+--------------------- -(0 rows) + regression | foreign_schema | foreign_table_1 | regression | s0 +(1 row) SELECT * FROM information_schema.foreign_table_options ORDER BY 1, 2, 3, 4; foreign_table_catalog | foreign_table_schema | foreign_table_name | option_name | option_value -----------------------+----------------------+--------------------+-------------+-------------- -(0 rows) + regression | foreign_schema | foreign_table_1 | be quoted | value + regression | foreign_schema | foreign_table_1 | escape | @ + regression | foreign_schema | foreign_table_1 | quote | ~ +(3 rows) SET ROLE regress_test_role; SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value --------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) + PUBLIC | regression | t1 | modified | 1 + regress_test_role | regression | s5 | modified | 1 + regress_test_role | regression | s6 | username | test + regress_test_role | regression | t1 | password | boo + regress_test_role | regression | t1 | username | bob +(5 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(3 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable 
----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(3 rows) DROP USER MAPPING FOR current_user SERVER t1; -ERROR: server "t1" does not exist SET ROLE regress_test_role2; SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value --------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) + regress_test_role | regression | s6 | username | +(1 row) RESET ROLE; -- has_foreign_data_wrapper_privilege @@ -885,229 +1011,255 @@ SELECT has_foreign_data_wrapper_privilege('regress_test_role', (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege('regress_test_role', 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + SELECT has_foreign_data_wrapper_privilege('foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist SELECT has_foreign_data_wrapper_privilege('regress_test_role', 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + -- has_server_privilege SELECT has_server_privilege('regress_test_role', (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege ---------------------- - + f (1 row) SELECT has_server_privilege('regress_test_role', 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + f +(1 row) + SELECT has_server_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege ---------------------- - + f (1 row) SELECT has_server_privilege( (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege 
---------------------- - + t (1 row) SELECT has_server_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + f +(1 row) + SELECT has_server_privilege('s8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + t +(1 row) + GRANT USAGE ON FOREIGN SERVER s8 TO regress_test_role; -ERROR: server "s8" does not exist SELECT has_server_privilege('regress_test_role', 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + t +(1 row) + REVOKE USAGE ON FOREIGN SERVER s8 FROM regress_test_role; -ERROR: server "s8" does not exist GRANT USAGE ON FOREIGN SERVER s4 TO regress_test_role; -ERROR: server "s4" does not exist DROP USER MAPPING FOR public SERVER s4; -ERROR: server "s4" does not exist ALTER SERVER s6 OPTIONS (DROP host, DROP dbname); -ERROR: server "s6" does not exist ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (DROP username); -ERROR: server "s6" does not exist ALTER FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; -ERROR: foreign-data wrapper "foo" does not exist +WARNING: changing the foreign-data wrapper validator can cause the options for dependent objects to become invalid -- Privileges SET ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER foobar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foobar" +HINT: Must be superuser to create a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OPTIONS (gotcha 'true'); -- ERROR ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_unprivileged_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied to change owner of foreign-data wrapper "foo" +HINT: Must be superuser to change owner of a foreign-data wrapper. 
DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied for foreign-data wrapper foo CREATE SERVER s9 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo ALTER SERVER s4 VERSION '0.5'; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 ALTER SERVER s4 OWNER TO regress_unprivileged_role; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 DROP SERVER s4; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 GRANT USAGE ON FOREIGN SERVER s4 TO regress_test_role; -- ERROR -ERROR: server "s4" does not exist +ERROR: permission denied for foreign server s4 CREATE USER MAPPING FOR public SERVER s4; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_unprivileged_role; -ERROR: foreign-data wrapper "postgresql" does not exist GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_unprivileged_role WITH GRANT OPTION; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER foobar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foobar" +HINT: Must be superuser to create a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OPTIONS (gotcha 'true'); -- ERROR ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. 
DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_test_role; -- WARNING -ERROR: foreign-data wrapper "postgresql" does not exist +WARNING: no privileges were granted for "postgresql" GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist CREATE SERVER s9 FOREIGN DATA WRAPPER postgresql; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported ALTER SERVER s6 VERSION '0.5'; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role; -- ERROR -ERROR: server "s6" does not exist +ERROR: permission denied for foreign server s6 GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -ERROR: server "s9" does not exist CREATE USER MAPPING FOR public SERVER s6; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s6 CREATE USER MAPPING FOR public SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 RESET ROLE; REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: dependent privileges exist +HINT: Use CASCADE to revoke them too. REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role CASCADE; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_unprivileged_role; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied for foreign-data wrapper foo CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo ALTER SERVER s9 VERSION '1.1'; -ERROR: server "s9" does not exist GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -ERROR: server "s9" does not exist CREATE USER MAPPING FOR current_user SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported -- We use terse mode to avoid ordering issues in cascade detail output. 
\set VERBOSITY terse DROP SERVER s9 CASCADE; -ERROR: server "s9" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default RESET ROLE; CREATE SERVER s9 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported GRANT USAGE ON FOREIGN SERVER s9 TO regress_unprivileged_role; -ERROR: server "s9" does not exist SET ROLE regress_unprivileged_role; ALTER SERVER s9 VERSION '1.2'; -- ERROR -ERROR: server "s9" does not exist +ERROR: must be owner of foreign server s9 GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -- WARNING -ERROR: server "s9" does not exist +WARNING: no privileges were granted for "s9" CREATE USER MAPPING FOR current_user SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported DROP SERVER s9 CASCADE; -- ERROR -ERROR: server "s9" does not exist +ERROR: must be owner of foreign server s9 -- Check visibility of user mapping data SET ROLE regress_test_role; CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported -- owner of server can see some option fields \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------------- + s10 | public | ("user" 'secret') + s10 | regress_unprivileged_role | + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | + s9 | regress_unprivileged_role | + t1 | public | (modified '1') +(9 rows) RESET ROLE; -- superuser can see all option fields \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+--------------------- + s10 | public | ("user" 'secret') + s10 | regress_unprivileged_role | ("user" 'secret') + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | (password 'public') + s9 | regress_unprivileged_role | + t1 | public | (modified '1') +(9 rows) -- unprivileged user cannot see any option field SET ROLE regress_unprivileged_role; \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------- + s10 | public | + s10 | regress_unprivileged_role | + s4 | regress_foreign_data_user | + s5 | regress_test_role | + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | + s9 | regress_unprivileged_role | + t1 | public | +(9 rows) RESET ROLE; \set VERBOSITY terse DROP SERVER s10 CASCADE; -ERROR: server "s10" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default -- Triggers CREATE FUNCTION dummy_trigger() RETURNS TRIGGER AS $$ @@ -1154,18 +1306,18 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported ALTER FOREIGN TABLE foreign_schema.foreign_table_1 DISABLE TRIGGER trigtest_before_stmt; -ERROR: relation 
"foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist ALTER FOREIGN TABLE foreign_schema.foreign_table_1 ENABLE TRIGGER trigtest_before_stmt; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_before_stmt ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_before_row ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_row" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_after_stmt ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_after_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_after_row ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_after_row" for table "foreign_table_1" does not exist DROP FUNCTION dummy_trigger(); -- Table inheritance CREATE TABLE pt1 ( @@ -1175,7 +1327,6 @@ CREATE TABLE pt1 ( ); CREATE FOREIGN TABLE ft2 () INHERITS (pt1) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1183,12 +1334,22 @@ ERROR: server "s0" does not exist c1 | integer | | not null | | plain | | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + DROP FOREIGN TABLE ft2; -ERROR: foreign table "ft2" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1204,10 +1365,17 @@ CREATE FOREIGN TABLE ft2 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER FOREIGN TABLE ft2 INHERIT pt1; -ERROR: relation "ft2" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1215,20 +1383,41 @@ ERROR: relation "ft2" does not exist c1 | integer | | not null | | plain | | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable 
| Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + CREATE TABLE ct3() INHERITS(ft2); -ERROR: relation "ft2" does not exist +ERROR: inherited relation "ft2" is not a table CREATE FOREIGN TABLE ft3 ( c1 integer NOT NULL, c2 text, c3 date ) INHERITS(ft2) SERVER s0; -ERROR: relation "ft2" does not exist +ERROR: inherited relation "ft2" is not a table \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \d+ ct3 \d+ ft3 -- add attributes recursively @@ -1249,10 +1438,26 @@ ALTER TABLE pt1 ADD COLUMN c8 integer; c6 | integer | | | | plain | | c7 | integer | | not null | | plain | | c8 | integer | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | | | plain | | + c5 | integer | | | 0 | | plain | | + c6 | integer | | | | | plain | | + c7 | integer | | not null | | | plain | | + c8 | integer | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \d+ ct3 \d+ ft3 -- alter attributes recursively @@ -1261,6 +1466,7 @@ ALTER TABLE pt1 ALTER COLUMN c5 DROP DEFAULT; ALTER TABLE pt1 ALTER COLUMN c6 SET NOT NULL; ALTER TABLE pt1 ALTER COLUMN c7 DROP NOT NULL; ALTER TABLE pt1 ALTER COLUMN c8 TYPE char(10) USING '0'; -- ERROR +ERROR: "ft2" is not a table ALTER TABLE pt1 ALTER COLUMN c8 TYPE char(10); ALTER TABLE pt1 ALTER COLUMN c8 SET DATA TYPE text; ALTER TABLE pt1 ALTER COLUMN c1 SET STATISTICS 10000; @@ -1279,10 +1485,26 @@ ALTER TABLE pt1 ALTER COLUMN c8 SET STORAGE EXTERNAL; c6 | integer | | not null | | plain | | c7 | integer | | | | plain | | c8 | text | | | | external | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | 0 | | plain | | + c5 | integer | | | | | plain | | + c6 | integer | | not null | | | plain | | + c7 | integer | | | | | plain | | + c8 | text | | | | | external | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- drop attributes recursively ALTER TABLE pt1 DROP COLUMN c4; ALTER TABLE pt1 DROP COLUMN c5; @@ -1296,10 +1518,21 @@ ALTER TABLE pt1 DROP COLUMN c8; c1 | integer | | 
not null | | plain | 10000 | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- add constraints recursively ALTER TABLE pt1 ADD CONSTRAINT pt1chk1 CHECK (c1 > 0) NO INHERIT; ALTER TABLE pt1 ADD CONSTRAINT pt1chk2 CHECK (c2 <> ''); @@ -1325,13 +1558,25 @@ SELECT relname, conname, contype, conislocal, coninhcount, connoinherit Check constraints: "pt1chk1" CHECK (c1 > 0) NO INHERIT "pt1chk2" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \set VERBOSITY terse DROP FOREIGN TABLE ft2; -- ERROR -ERROR: foreign table "ft2" does not exist DROP FOREIGN TABLE ft2 CASCADE; ERROR: foreign table "ft2" does not exist \set VERBOSITY default @@ -1340,14 +1585,11 @@ CREATE FOREIGN TABLE ft2 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist -- child must have parent's INHERIT constraints ALTER FOREIGN TABLE ft2 INHERIT pt1; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: child table is missing constraint "pt1chk2" ALTER FOREIGN TABLE ft2 ADD CONSTRAINT pt1chk2 CHECK (c2 <> ''); -ERROR: relation "ft2" does not exist ALTER FOREIGN TABLE ft2 INHERIT pt1; -ERROR: relation "ft2" does not exist -- child does not inherit NO INHERIT constraints \d+ pt1 Table "public.pt1" @@ -1359,10 +1601,23 @@ ERROR: relation "ft2" does not exist Check constraints: "pt1chk1" CHECK (c1 > 0) NO INHERIT "pt1chk2" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- drop constraints recursively ALTER TABLE pt1 DROP CONSTRAINT pt1chk1 CASCADE; ALTER TABLE pt1 DROP CONSTRAINT pt1chk2 CASCADE; @@ -1378,10 +1633,24 @@ ALTER TABLE pt1 ADD CONSTRAINT pt1chk3 CHECK (c2 <> '') NOT VALID; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) NOT VALID +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | 
Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) NOT VALID +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- VALIDATE CONSTRAINT need do nothing on foreign tables ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3; \d+ pt1 @@ -1393,10 +1662,24 @@ ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- OID system column ALTER TABLE pt1 SET WITH OIDS; \d+ pt1 @@ -1408,13 +1691,28 @@ ALTER TABLE pt1 SET WITH OIDS; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Has OIDs: yes Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 +Has OIDs: yes + ALTER TABLE ft2 SET WITHOUT OIDS; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: cannot drop inherited column "oid" ALTER TABLE pt1 SET WITHOUT OIDS; \d+ pt1 Table "public.pt1" @@ -1425,10 +1723,24 @@ ALTER TABLE pt1 SET WITHOUT OIDS; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- changes name of an attribute recursively ALTER TABLE pt1 RENAME COLUMN c1 TO f1; ALTER TABLE pt1 RENAME COLUMN c2 TO f2; @@ -1444,36 +1756,55 @@ ALTER TABLE pt1 RENAME CONSTRAINT pt1chk3 TO f2_check; f3 | date | | | | plain | | Check constraints: "f2_check" CHECK (f2 <> ''::text) +Child tables: ft2 Distribute By: HASH(f1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | 
Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + f1 | integer | | not null | | | plain | | + f2 | text | | | | | extended | | + f3 | date | | | | | plain | | +Check constraints: + "f2_check" CHECK (f2 <> ''::text) + "pt1chk2" CHECK (f2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- TRUNCATE doesn't work on foreign tables, either directly or recursively TRUNCATE ft2; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: "ft2" is not a table TRUNCATE pt1; -- ERROR +ERROR: "ft2" is not a table DROP TABLE pt1 CASCADE; +NOTICE: drop cascades to foreign table ft2 -- IMPORT FOREIGN SCHEMA IMPORT FOREIGN SCHEMA s1 FROM SERVER s9 INTO public; -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 LIMIT TO (t1) FROM SERVER s9 INTO public; --ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 EXCEPT (t1) FROM SERVER s9 INTO public; -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 EXCEPT (t1, t2) FROM SERVER s9 INTO public OPTIONS (option1 'value1', option2 'value2'); -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler -- DROP FOREIGN TABLE DROP FOREIGN TABLE no_table; -- ERROR ERROR: foreign table "no_table" does not exist DROP FOREIGN TABLE IF EXISTS no_table; NOTICE: foreign table "no_table" does not exist, skipping DROP FOREIGN TABLE foreign_schema.foreign_table_1; -ERROR: foreign table "foreign_table_1" does not exist -- REASSIGN OWNED/DROP OWNED of foreign objects REASSIGN OWNED BY regress_test_role TO regress_test_role2; DROP OWNED BY regress_test_role2; +ERROR: cannot drop desired object(s) because other objects depend on them +DETAIL: user mapping for regress_test_role on server s5 depends on server s5 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
DROP OWNED BY regress_test_role2 CASCADE; +NOTICE: drop cascades to user mapping for regress_test_role on server s5 -- Foreign partition DDL stuff CREATE TABLE pt2 ( c1 integer NOT NULL, @@ -1482,7 +1813,6 @@ CREATE TABLE pt2 ( ) PARTITION BY LIST (c1); CREATE FOREIGN TABLE pt2_1 PARTITION OF pt2 FOR VALUES IN (1) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1491,26 +1821,45 @@ ERROR: server "s0" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- partition cannot have additional columns DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist CREATE FOREIGN TABLE pt2_1 ( c1 integer NOT NULL, c2 text, c3 date, c4 char ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+--------------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | character(1) | | | | | extended | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: table "pt2_1" contains column "c4" not found in parent "pt2" +DETAIL: New partition should contain only the columns present in parent. 
DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1528,11 +1877,18 @@ CREATE FOREIGN TABLE pt2_1 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- no attach partition validation occurs for foreign tables ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1541,19 +1897,28 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- cannot add column to a partition ALTER TABLE pt2_1 ADD c4 char; -ERROR: relation "pt2_1" does not exist +ERROR: cannot add column to a partition -- ok to have a partition's own constraints though ALTER TABLE pt2_1 ALTER c3 SET NOT NULL; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2_1 ADD CONSTRAINT p21chk CHECK (c2 <> ''); -ERROR: relation "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1562,17 +1927,29 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | not null | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- cannot drop inherited NOT NULL constraint from a partition ALTER TABLE pt2_1 ALTER c1 DROP NOT NULL; -ERROR: relation "pt2_1" does not exist +ERROR: column "c1" is marked NOT NULL in parent table -- partition must have parent's constraints ALTER TABLE pt2 DETACH PARTITION pt2_1; -ERROR: relation "pt2_1" does not exist 
ALTER TABLE pt2 ALTER c2 SET NOT NULL; \d+ pt2 Table "public.pt2" @@ -1587,14 +1964,22 @@ Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | not null | | | plain | | +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: column "c2" in child table must be marked NOT NULL ALTER FOREIGN TABLE pt2_1 ALTER c2 SET NOT NULL; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 DETACH PARTITION pt2_1; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); \d+ pt2 Table "public.pt2" @@ -1611,46 +1996,55 @@ Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | not null | | | extended | | + c3 | date | | not null | | | plain | | +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: child table is missing constraint "pt2chk1" ALTER FOREIGN TABLE pt2_1 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist -- TRUNCATE doesn't work on foreign tables, either directly or recursively TRUNCATE pt2_1; -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: "pt2_1" is not a table TRUNCATE pt2; -- ERROR +ERROR: "pt2_1" is not a table DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist DROP TABLE pt2; -- Cleanup DROP SCHEMA foreign_schema CASCADE; DROP ROLE regress_test_role; -- ERROR +ERROR: role "regress_test_role" cannot be dropped because some objects depend on it +DETAIL: privileges for server s4 +privileges for foreign-data wrapper foo +owner of user mapping for regress_test_role on server s6 DROP SERVER t1 CASCADE; -ERROR: server "t1" does not exist +NOTICE: drop cascades to user mapping for public on server t1 DROP USER MAPPING FOR regress_test_role SERVER s6; -ERROR: role "regress_test_role" does not exist \set VERBOSITY terse DROP FOREIGN DATA WRAPPER foo CASCADE; -ERROR: foreign-data wrapper "foo" does not exist +NOTICE: drop cascades to 5 other objects DROP SERVER s8 CASCADE; -ERROR: server "s8" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default DROP ROLE regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist DROP ROLE regress_test_role; -ERROR: role "regress_test_role" does not exist DROP ROLE regress_unprivileged_role; -- ERROR +ERROR: role "regress_unprivileged_role" cannot be dropped because some objects depend on it +DETAIL: privileges for 
foreign-data wrapper postgresql REVOKE ALL ON FOREIGN DATA WRAPPER postgresql FROM regress_unprivileged_role; -ERROR: foreign-data wrapper "postgresql" does not exist DROP ROLE regress_unprivileged_role; -ERROR: role "regress_unprivileged_role" does not exist DROP ROLE regress_test_role2; DROP FOREIGN DATA WRAPPER postgresql CASCADE; -ERROR: foreign-data wrapper "postgresql" does not exist DROP FOREIGN DATA WRAPPER dummy CASCADE; -ERROR: foreign-data wrapper "dummy" does not exist +NOTICE: drop cascades to server s0 \c DROP ROLE regress_foreign_data_user; -- At this point we should have no wrappers, no servers, and no mappings. diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 75cc6638..c0a5ceac 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -10,11 +10,7 @@ CREATE USER regress_addr_user; CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; CREATE FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER addr_fserv FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE TEXT SEARCH DICTIONARY addr_ts_dict (template=simple); CREATE TEXT SEARCH CONFIGURATION addr_ts_conf (copy=english); CREATE TEXT SEARCH TEMPLATE addr_ts_temp (lexize=dsimple_lexize); @@ -28,7 +24,6 @@ CREATE MATERIALIZED VIEW addr_nsp.genmatview AS SELECT * FROM addr_nsp.gentable; CREATE TYPE addr_nsp.gencomptype AS (a int); CREATE TYPE addr_nsp.genenum AS ENUM ('one', 'two'); CREATE FOREIGN TABLE addr_nsp.genftable (a int) SERVER addr_fserv; -ERROR: server "addr_fserv" does not exist CREATE AGGREGATE addr_nsp.genaggr(int4) (sfunc = int4pl, stype = int4); CREATE DOMAIN addr_nsp.gendomain AS int4 CONSTRAINT domconstr CHECK (value > 0); CREATE FUNCTION addr_nsp.trig() RETURNS TRIGGER LANGUAGE plpgsql AS $$ BEGIN END; $$; @@ -37,21 +32,17 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported CREATE POLICY genpol ON addr_nsp.gentable; CREATE SERVER "integer" FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_addr_user SERVER "integer"; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER DEFAULT PRIVILEGES FOR ROLE regress_addr_user IN SCHEMA public GRANT ALL ON TABLES TO regress_addr_user; ALTER DEFAULT PRIVILEGES FOR ROLE regress_addr_user REVOKE DELETE ON TABLES FROM regress_addr_user; CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -ERROR: Postgres-XL does not support CREATE PUBLICATION +ERROR: COORDINATOR does not support CREATE PUBLICATION DETAIL: The feature is not currently supported CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (connect = false, slot_name = NONE); -ERROR: Postgres-XL does not support CREATE SUBSCRIPTION +ERROR: COORDINATOR only supports CREATE TBASE SUBSCRIPTION DETAIL: The feature is not currently supported CREATE STATISTICS addr_nsp.gentable_stat ON a, b FROM addr_nsp.gentable; -- test some error cases @@ -77,8 +68,12 @@ BEGIN END LOOP; END; $$; -ERROR: Internal subtransactions not supported in Postgres-XL -CONTEXT: PL/pgSQL function 
inline_code_block line 8 during statement block entry +WARNING: error for toast table: unsupported object type "toast table" +WARNING: error for index column: unsupported object type "index column" +WARNING: error for sequence column: unsupported object type "sequence column" +WARNING: error for toast table column: unsupported object type "toast table column" +WARNING: error for view column: unsupported object type "view column" +WARNING: error for materialized view column: unsupported object type "materialized view column" -- miscellaneous other errors select * from pg_get_object_address('operator of access method', '{btree,integer_ops,1}', '{int4,bool}'); ERROR: operator 1 (int4, bool) of operator family integer_ops for access method btree does not exist @@ -121,8 +116,198 @@ BEGIN END LOOP; END; $$; -ERROR: Internal subtransactions not supported in Postgres-XL -CONTEXT: PL/pgSQL function inline_code_block line 24 during statement block entry +WARNING: error for table,{eins},{}: relation "eins" does not exist +WARNING: error for table,{eins},{integer}: relation "eins" does not exist +WARNING: error for table,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for table,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for table,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for table,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for index,{eins},{}: relation "eins" does not exist +WARNING: error for index,{eins},{integer}: relation "eins" does not exist +WARNING: error for index,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for index,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for index,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for index,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for sequence,{eins},{}: relation "eins" does not exist +WARNING: error for sequence,{eins},{integer}: relation "eins" does not exist +WARNING: error for sequence,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for sequence,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for sequence,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for sequence,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for view,{eins},{}: relation "eins" does not exist +WARNING: error for view,{eins},{integer}: relation "eins" does not exist +WARNING: error for view,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for view,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for view,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for view,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for materialized view,{eins},{}: relation "eins" does not exist +WARNING: error for materialized view,{eins},{integer}: relation "eins" does not exist +WARNING: error for materialized view,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for materialized view,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: 
error for materialized view,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for materialized view,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for foreign table,{eins},{}: relation "eins" does not exist +WARNING: error for foreign table,{eins},{integer}: relation "eins" does not exist +WARNING: error for foreign table,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for foreign table,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for foreign table,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for foreign table,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for table column,{eins},{}: column name must be qualified +WARNING: error for table column,{eins},{integer}: column name must be qualified +WARNING: error for table column,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for table column,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for table column,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for table column,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for foreign table column,{eins},{}: column name must be qualified +WARNING: error for foreign table column,{eins},{integer}: column name must be qualified +WARNING: error for foreign table column,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for foreign table column,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for foreign table column,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for foreign table column,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for aggregate,{eins},{}: aggregate eins(*) does not exist +WARNING: error for aggregate,{eins},{integer}: aggregate eins(integer) does not exist +WARNING: error for aggregate,{addr_nsp,zwei},{}: aggregate addr_nsp.zwei(*) does not exist +WARNING: error for aggregate,{addr_nsp,zwei},{integer}: aggregate addr_nsp.zwei(integer) does not exist +WARNING: error for aggregate,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for aggregate,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for function,{eins},{}: function eins() does not exist +WARNING: error for function,{eins},{integer}: function eins(integer) does not exist +WARNING: error for function,{addr_nsp,zwei},{}: function addr_nsp.zwei() does not exist +WARNING: error for function,{addr_nsp,zwei},{integer}: function addr_nsp.zwei(integer) does not exist +WARNING: error for function,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for function,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for type,{eins},{}: type "eins" does not exist +WARNING: error for type,{eins},{integer}: type "eins" does not exist +WARNING: error for type,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for type,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for type,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for type,{eins,zwei,drei},{integer}: name list length must be exactly 1 
+WARNING: error for cast,{eins},{}: argument list length must be exactly 1 +WARNING: error for cast,{eins},{integer}: type "eins" does not exist +WARNING: error for cast,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for cast,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for cast,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for cast,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for table constraint,{eins},{}: must specify relation and object name +WARNING: error for table constraint,{eins},{integer}: must specify relation and object name +WARNING: error for table constraint,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for table constraint,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for table constraint,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for table constraint,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for domain constraint,{eins},{}: argument list length must be exactly 1 +WARNING: error for domain constraint,{eins},{integer}: type "eins" does not exist +WARNING: error for domain constraint,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for domain constraint,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for domain constraint,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for domain constraint,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for conversion,{eins},{}: conversion "eins" does not exist +WARNING: error for conversion,{eins},{integer}: conversion "eins" does not exist +WARNING: error for conversion,{addr_nsp,zwei},{}: conversion "addr_nsp.zwei" does not exist +WARNING: error for conversion,{addr_nsp,zwei},{integer}: conversion "addr_nsp.zwei" does not exist +WARNING: error for conversion,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for conversion,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for default value,{eins},{}: column name must be qualified +WARNING: error for default value,{eins},{integer}: column name must be qualified +WARNING: error for default value,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for default value,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for default value,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for default value,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for operator,{eins},{}: argument list length must be exactly 2 +WARNING: error for operator,{eins},{integer}: argument list length must be exactly 2 +WARNING: error for operator,{addr_nsp,zwei},{}: argument list length must be exactly 2 +WARNING: error for operator,{addr_nsp,zwei},{integer}: argument list length must be exactly 2 +WARNING: error for operator,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for operator,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for operator class,{eins},{}: name list length must be at least 2 +WARNING: error for operator class,{eins},{integer}: name list length must be at least 2 +WARNING: error for operator class,{addr_nsp,zwei},{}: access method "addr_nsp" does not exist +WARNING: error for operator class,{addr_nsp,zwei},{integer}: 
access method "addr_nsp" does not exist +WARNING: error for operator class,{eins,zwei,drei},{}: access method "eins" does not exist +WARNING: error for operator class,{eins,zwei,drei},{integer}: access method "eins" does not exist +WARNING: error for operator family,{eins},{}: name list length must be at least 2 +WARNING: error for operator family,{eins},{integer}: name list length must be at least 2 +WARNING: error for operator family,{addr_nsp,zwei},{}: access method "addr_nsp" does not exist +WARNING: error for operator family,{addr_nsp,zwei},{integer}: access method "addr_nsp" does not exist +WARNING: error for operator family,{eins,zwei,drei},{}: access method "eins" does not exist +WARNING: error for operator family,{eins,zwei,drei},{integer}: access method "eins" does not exist +WARNING: error for rule,{eins},{}: must specify relation and object name +WARNING: error for rule,{eins},{integer}: must specify relation and object name +WARNING: error for rule,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for rule,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for rule,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for rule,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for trigger,{eins},{}: must specify relation and object name +WARNING: error for trigger,{eins},{integer}: must specify relation and object name +WARNING: error for trigger,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for trigger,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for trigger,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for trigger,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for text search parser,{eins},{}: text search parser "eins" does not exist +WARNING: error for text search parser,{eins},{integer}: text search parser "eins" does not exist +WARNING: error for text search parser,{addr_nsp,zwei},{}: text search parser "addr_nsp.zwei" does not exist +WARNING: error for text search parser,{addr_nsp,zwei},{integer}: text search parser "addr_nsp.zwei" does not exist +WARNING: error for text search parser,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search parser,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search dictionary,{eins},{}: text search dictionary "eins" does not exist +WARNING: error for text search dictionary,{eins},{integer}: text search dictionary "eins" does not exist +WARNING: error for text search dictionary,{addr_nsp,zwei},{}: text search dictionary "addr_nsp.zwei" does not exist +WARNING: error for text search dictionary,{addr_nsp,zwei},{integer}: text search dictionary "addr_nsp.zwei" does not exist +WARNING: error for text search dictionary,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search dictionary,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search template,{eins},{}: text search template "eins" does not exist +WARNING: error for text search template,{eins},{integer}: text search template "eins" does not exist +WARNING: error for text search template,{addr_nsp,zwei},{}: text search template "addr_nsp.zwei" does not exist +WARNING: error for text search template,{addr_nsp,zwei},{integer}: text search template 
"addr_nsp.zwei" does not exist +WARNING: error for text search template,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search template,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search configuration,{eins},{}: text search configuration "eins" does not exist +WARNING: error for text search configuration,{eins},{integer}: text search configuration "eins" does not exist +WARNING: error for text search configuration,{addr_nsp,zwei},{}: text search configuration "addr_nsp.zwei" does not exist +WARNING: error for text search configuration,{addr_nsp,zwei},{integer}: text search configuration "addr_nsp.zwei" does not exist +WARNING: error for text search configuration,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search configuration,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for policy,{eins},{}: must specify relation and object name +WARNING: error for policy,{eins},{integer}: must specify relation and object name +WARNING: error for policy,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for policy,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for policy,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for policy,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for user mapping,{eins},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{eins},{integer}: user mapping for user "eins" on server "integer" does not exist +WARNING: error for user mapping,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{addr_nsp,zwei},{integer}: user mapping for user "addr_nsp" on server "integer" does not exist +WARNING: error for user mapping,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{eins,zwei,drei},{integer}: user mapping for user "eins" on server "integer" does not exist +WARNING: error for default acl,{eins},{}: argument list length must be exactly 1 +WARNING: error for default acl,{eins},{integer}: unrecognized default ACL object type "i" +WARNING: error for default acl,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for default acl,{addr_nsp,zwei},{integer}: unrecognized default ACL object type "i" +WARNING: error for default acl,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for default acl,{eins,zwei,drei},{integer}: unrecognized default ACL object type "i" +WARNING: error for transform,{eins},{}: argument list length must be exactly 1 +WARNING: error for transform,{eins},{integer}: type "eins" does not exist +WARNING: error for transform,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for transform,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for transform,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for transform,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for operator of access method,{eins},{}: name list length must be at least 3 +WARNING: error for operator of access method,{eins},{integer}: name list length must be at least 3 +WARNING: error for operator of access method,{addr_nsp,zwei},{}: name list length must be at least 3 +WARNING: error for operator of access 
method,{addr_nsp,zwei},{integer}: name list length must be at least 3 +WARNING: error for operator of access method,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for operator of access method,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for function of access method,{eins},{}: name list length must be at least 3 +WARNING: error for function of access method,{eins},{integer}: name list length must be at least 3 +WARNING: error for function of access method,{addr_nsp,zwei},{}: name list length must be at least 3 +WARNING: error for function of access method,{addr_nsp,zwei},{integer}: name list length must be at least 3 +WARNING: error for function of access method,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for function of access method,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for publication relation,{eins},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{eins},{integer}: relation "eins" does not exist +WARNING: error for publication relation,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for publication relation,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" -- these object types cannot be qualified names SELECT pg_get_object_address('language', '{one}', '{}'); ERROR: language "one" does not exist @@ -278,11 +463,11 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, --- \set VERBOSITY terse \\ -- suppress cascade details DROP FOREIGN DATA WRAPPER addr_fdw CASCADE; -ERROR: foreign-data wrapper "addr_fdw" does not exist +NOTICE: drop cascades to 4 other objects DROP PUBLICATION addr_pub; ERROR: publication "addr_pub" does not exist DROP SUBSCRIPTION addr_sub; -ERROR: subscription "addr_sub" does not exist +ERROR: COORDINATOR only supports DROP TBASE SUBSCRIPTION DROP SCHEMA addr_nsp CASCADE; NOTICE: drop cascades to 12 other objects DROP OWNED BY regress_addr_user; diff --git a/src/test/regress/expected/rolenames.out b/src/test/regress/expected/rolenames.out index 1540568b..dce82f5d 100644 --- a/src/test/regress/expected/rolenames.out +++ b/src/test/regress/expected/rolenames.out @@ -609,59 +609,23 @@ SELECT p.proname, r.rolname -- CREATE USER MAPPING CREATE FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER sv1 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv2 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv3 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv4 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv5 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv6 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not 
support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv7 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv8 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv9 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (user 'CURRENT_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (user '"current_user"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR USER SERVER sv3 OPTIONS (user 'USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "user" SERVER sv4 OPTIONS (user '"USER"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (user 'SESSION_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR PUBLIC SERVER sv6 OPTIONS (user 'PUBLIC'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "Public" SERVER sv7 OPTIONS (user '"Public"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (user 'regress_testrolx'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR CURRENT_ROLE SERVER sv9 OPTIONS (user 'CURRENT_ROLE'); -- error ERROR: syntax error at or near "CURRENT_ROLE" @@ -669,38 +633,37 @@ LINE 1: CREATE USER MAPPING FOR CURRENT_ROLE SERVER sv9 ^ CREATE USER MAPPING FOR nonexistent SERVER sv9 OPTIONS (user 'nonexistent'); -- error; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: role "nonexistent" does not exist SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER} + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(8 rows) -- ALTER USER MAPPING ALTER USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (SET user 'CURRENT_USER_alt'); -ERROR: server "sv1" does not exist ALTER USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (SET user '"current_user"_alt'); -ERROR: server "sv2" does not exist ALTER USER MAPPING FOR USER SERVER sv3 OPTIONS (SET user 'USER_alt'); -ERROR: server "sv3" does not exist ALTER USER MAPPING FOR "user" SERVER sv4 OPTIONS (SET user '"user"_alt'); -ERROR: server "sv4" does not exist ALTER USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (SET user 'SESSION_USER_alt'); -ERROR: server "sv5" does not exist ALTER USER MAPPING FOR PUBLIC SERVER sv6 
OPTIONS (SET user 'public_alt'); -ERROR: server "sv6" does not exist ALTER USER MAPPING FOR "Public" SERVER sv7 OPTIONS (SET user '"Public"_alt'); -ERROR: server "sv7" does not exist ALTER USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (SET user 'regress_testrolx_alt'); -ERROR: server "sv8" does not exist ALTER USER MAPPING FOR CURRENT_ROLE SERVER sv9 OPTIONS (SET user 'CURRENT_ROLE_alt'); ERROR: syntax error at or near "CURRENT_ROLE" @@ -710,27 +673,27 @@ ALTER USER MAPPING FOR nonexistent SERVER sv9 OPTIONS (SET user 'nonexistent_alt'); -- error ERROR: role "nonexistent" does not exist SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER_alt} + current_user | sv2 | {"user=\"current_user\"_alt"} + regress_testrol2 | sv3 | {user=USER_alt} + user | sv4 | {"user=\"user\"_alt"} + regress_testrol1 | sv5 | {user=SESSION_USER_alt} + | sv6 | {user=public_alt} + Public | sv7 | {"user=\"Public\"_alt"} + regress_testrolx | sv8 | {user=regress_testrolx_alt} +(8 rows) -- DROP USER MAPPING DROP USER MAPPING FOR CURRENT_USER SERVER sv1; -ERROR: server "sv1" does not exist DROP USER MAPPING FOR "current_user" SERVER sv2; -ERROR: server "sv2" does not exist DROP USER MAPPING FOR USER SERVER sv3; -ERROR: server "sv3" does not exist DROP USER MAPPING FOR "user" SERVER sv4; -ERROR: server "sv4" does not exist DROP USER MAPPING FOR SESSION_USER SERVER sv5; -ERROR: server "sv5" does not exist DROP USER MAPPING FOR PUBLIC SERVER sv6; -ERROR: server "sv6" does not exist DROP USER MAPPING FOR "Public" SERVER sv7; -ERROR: server "sv7" does not exist DROP USER MAPPING FOR regress_testrolx SERVER sv8; -ERROR: server "sv8" does not exist DROP USER MAPPING FOR CURRENT_ROLE SERVER sv9; -- error ERROR: syntax error at or near "CURRENT_ROLE" LINE 1: DROP USER MAPPING FOR CURRENT_ROLE SERVER sv9; @@ -743,86 +706,98 @@ SELECT * FROM chkumapping(); (0 rows) CREATE USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (user 'CURRENT_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (user '"current_user"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR USER SERVER sv3 OPTIONS (user 'USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "user" SERVER sv4 OPTIONS (user '"USER"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (user 'SESSION_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR PUBLIC SERVER sv6 OPTIONS (user 'PUBLIC'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "Public" SERVER sv7 OPTIONS (user '"Public"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (user 'regress_testrolx'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported SELECT * FROM chkumapping(); - umname | umserver | umoptions 
---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER} + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(8 rows) -- DROP USER MAPPING IF EXISTS DROP USER MAPPING IF EXISTS FOR CURRENT_USER SERVER sv1; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(7 rows) DROP USER MAPPING IF EXISTS FOR "current_user" SERVER sv2; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(6 rows) DROP USER MAPPING IF EXISTS FOR USER SERVER sv3; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(5 rows) DROP USER MAPPING IF EXISTS FOR "user" SERVER sv4; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(4 rows) DROP USER MAPPING IF EXISTS FOR SESSION_USER SERVER sv5; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(3 rows) DROP USER MAPPING IF EXISTS FOR PUBLIC SERVER sv6; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(2 rows) DROP USER MAPPING IF EXISTS FOR "Public" SERVER sv7; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + 
umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrolx | sv8 | {user=regress_testrolx} +(1 row) DROP USER MAPPING IF EXISTS FOR regress_testrolx SERVER sv8; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); umname | umserver | umoptions --------+----------+----------- diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index 161cd7b4..b1dfd26c 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -730,12 +730,9 @@ SELECT sum(n) FROM t; --FDWs are not supported CREATE FOREIGN DATA WRAPPER xl_foo; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported RESET ROLE; CREATE FOREIGN DATA WRAPPER xl_foo VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "xl_foo" already exists --LISTEN/NOTIFY is not supported. Looks like they are supported now. --We would obviously have issues with LISTEN/NOTIFY if clients are connected to different coordinators. Need to test that manually as it is difficult via regression. --LISTEN notify_async1; From 26fafa56d99ba804d0bf0c21fe3dd6fb9fc9623e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 14 May 2021 16:18:02 +0800 Subject: [PATCH 373/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: add retry --- src/backend/access/transam/twophase.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index f2fbc7e6..bf6f5c1b 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -168,7 +168,7 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ -#define MAX_RETRY_TIMES 2 +#define MAX_RETRY_TIMES 10 /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -3716,7 +3716,8 @@ void record_2pc_involved_nodes_xid(const char * tid, } else if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is added to hash table", func, tid); + elog(LOG, "[%s] %s is added to hash table, entry: %p", + func, tid, entry); } memcpy(entry->info, content.data, size + 1); @@ -3830,6 +3831,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); + GET_2PC_FILE_PATH(path, tid); + while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) { Assert(strlen(tid) < MAX_TID_SIZE); @@ -3930,6 +3933,12 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta Assert(NULL == entry); print_record_2pc_cache(func); + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); + break; + } + pg_usleep(5000L); /* sleep 5ms */ } @@ -3938,8 +3947,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); } - GET_2PC_FILE_PATH(path, tid); - /* the 2pc file exists already */ fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); if (fd < 0) @@ -3968,7 +3975,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } else { - elog(ERROR, "[%s] could not open file %s, errMsg: %s", 
+ elog(PANIC, "[%s] could not open file %s, errMsg: %s", func, path, strerror(errno)); } return; From 8b92338d071d6096cc8443fcbf07889734d9f9dd Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 19 May 2021 17:37:48 +0800 Subject: [PATCH 374/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: rename conflict when startup --- src/backend/access/transam/twophase.c | 40 ++++++++++++++++++++++----- src/backend/utils/misc/guc.c | 4 +-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index bf6f5c1b..3124e174 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -158,7 +158,7 @@ bool enable_2pc_file_check = true; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; -int record_2pc_cache_size = 50000; +int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; int record_2pc_partitions = 32; @@ -4127,16 +4127,25 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) check_entry_key(tid, entry->key, func); check_2pc_file(tid, entry->info, func); + if (0 == access(new_path, F_OK)) + { if (RecoveryInProgress()) { - fd = PathNameOpenFile(new_path, O_RDWR | O_TRUNC | O_CREAT, - S_IRUSR | S_IWUSR); + elog(LOG, "[%s] file %s exist", func, new_path); } else { + elog(WARNING, "[%s] file %s exist", func, new_path); + } + if (0 != unlink(new_path)) + { + elog(ERROR, "[%s] could not unlink file %s, errMsg: %s", + func, new_path, strerror(errno)); + } + } + fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - } if (fd < 0) { elog(ERROR, "[%s] could not create file %s, errMsg: %s", @@ -4175,6 +4184,23 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) func, path, strerror(errno)); return; } + if (0 == access(new_path, F_OK)) + { + if (RecoveryInProgress()) + { + elog(LOG, "[%s] file %s exist", func, new_path); + } + else + { + elog(WARNING, "[%s] file %s exist", func, new_path); + } + if (0 != unlink(new_path)) + { + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + func, new_path, strerror(errno)); + return; + } + } if (0 != link(path, new_path)) { elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", @@ -4396,7 +4422,7 @@ Record2pcCacheInit(void) flags = HASH_ELEM | HASH_PARTITION; record_2pc_cache = ShmemInitHash("Record 2pc Cache", - record_2pc_cache_size/2, record_2pc_cache_size, + record_2pc_cache_size, record_2pc_cache_size, &info, flags); } @@ -4406,10 +4432,10 @@ Record2pcCacheInit(void) Size Record2pcCacheSize(void) { - long cache_size = 0; + Size cache_size = 0; if (enable_2pc_file_cache) { - cache_size = (long)record_2pc_cache_size * record_2pc_entry_size; + cache_size = hash_estimate_size(record_2pc_cache_size, record_2pc_entry_size); } return cache_size; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9c699458..5dda04d5 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4789,7 +4789,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache size."), }, &record_2pc_cache_size, - 50000, 100, INT_MAX, + 4096, 1, INT_MAX, NULL, NULL, NULL }, { @@ -4797,7 +4797,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache entry size."), }, &record_2pc_entry_size, - 2048, 1200, INT_MAX, + 2048, 1028, INT_MAX, NULL, NULL, NULL }, { From 5c5838201896f8f635af4ad4cbb027df8435da7e Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 19 May 
2021 17:48:42 +0800 Subject: [PATCH 375/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: rename conflict when startup, opt --- src/backend/utils/misc/guc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5dda04d5..5066d491 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4789,7 +4789,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache size."), }, &record_2pc_cache_size, - 4096, 1, INT_MAX, + 4096, 32, INT_MAX, NULL, NULL, NULL }, { From 11badf21ed44b64facf64f5e5fc2b44bbada1c56 Mon Sep 17 00:00:00 2001 From: gregsun Date: Tue, 27 Apr 2021 12:40:00 +0800 Subject: [PATCH 376/578] Release sub-transaction also in datanode. http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696087091551 Conflicts: src/test/regress/parallel_schedule --- src/backend/pgxc/pool/execRemote.c | 4 + src/test/regress/expected/pl_bugs.out | 1944 +++++++++++++++++++++++ src/test/regress/parallel_schedule | 3 + src/test/regress/sql/pl_bugs.sql | 2052 +++++++++++++++++++++++++ 4 files changed, 4003 insertions(+) create mode 100644 src/test/regress/expected/pl_bugs.out create mode 100644 src/test/regress/sql/pl_bugs.sql diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e02f262f..17e6f838 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12915,7 +12915,11 @@ SubTranscation_PreCommit_Remote(void) ALLOCSET_DEFAULT_SIZES); old = MemoryContextSwitchTo(temp); /* Only local coord can send down commit_subtxn when exec plpgsql */ +#ifdef _PG_ORCL_ + if (InPlpgsqlFunc()) +#else if (InPlpgsqlFunc() && IS_PGXC_LOCAL_COORDINATOR) +#endif { pgxc_node_remote_commit(TXN_TYPE_CommitSubTxn, false); } diff --git a/src/test/regress/expected/pl_bugs.out b/src/test/regress/expected/pl_bugs.out new file mode 100644 index 00000000..0930dd68 --- /dev/null +++ b/src/test/regress/expected/pl_bugs.out @@ -0,0 +1,1944 @@ +CREATE SCHEMA sync; +SET search_path = sync, pg_catalog; +set enable_oracle_compatible to on; +-- +-- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun +-- +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric + LANGUAGE plpgsql + AS $$ + declare v_netvalue text; +begin + begin + select p1 + into v_netvalue + from p + limit 1; + exception + when no_data_found then + return 1; + + end; + return 1; +end; + $$; +-- +-- Name: sp_b03_ts_remetrade(varchar2, varchar2, varchar2, varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE function sp_b03_ts_remetrade(p_start_date varchar2, p_work_date varchar2, INOUT err_num varchar2 DEFAULT 0, INOUT err_msg varchar2 DEFAULT NULL::varchar2) + LANGUAGE plpgsql + AS $$ + declare + V_START_DATE DATE; + V_END_DATE DATE; + V_WORK_DATE DATE; + V_SP_NAME VARCHAR(30); + V_TAB_LEVEL VARCHAR(20); + V_LOG_STEP_NO VARCHAR(20); + V_LOG_BEGIN_TIME DATE := SYSDATE; + V_LOG_END_TIME DATE; + V_LOG_ROWCOUNT NUMBER := 0; + V_ELAPSED NUMBER; + V_ALL_ELAPSED NUMBER; + V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; +BEGIN + + V_SP_NAME := 'SP_B03_TS_REMETRADE'; + V_TAB_LEVEL := 'B'; + + IF P_START_DATE IS NULL + THEN + RAISE EXCEPTION 'P_START_DATE IS NULL!'; + ELSE + V_START_DATE := TO_DATE(P_START_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_WORK_DATE := TO_DATE(P_WORK_DATE, 
'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_END_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + + + + V_LOG_STEP_NO := 'STEP_01'; + V_STEP_DESC := '清除目标表数据'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_STEP_DESC + , + V_LOG_BEGIN_TIME + , + V_LOG_END_TIME + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + V_ELAPSED + , + V_ALL_ELAPSED); + + CALL SP_PUB_DEL_TB('B03_TS_REMETRADE'); + /*DELETE FROM B03_TS_REMETRADE Y + WHERE Y.ENDDATE >=V_START_DATE;*/ + + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_LOG_BEGIN_TIME + , + SYSDATE::DATE + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC + , + V_ALL_ELAPSED); + + V_LOG_STEP_NO := 'STEP_02'; + V_STEP_DESC := '插入目标表B03_TS_REMETRADE'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_STEP_DESC, + V_LOG_BEGIN_TIME, + V_LOG_END_TIME, + V_WORK_DATE, + V_LOG_ROWCOUNT, + V_ELAPSED, + V_ALL_ELAPSED); + + INSERT INTO B03_TS_REMETRADE + (C_FUNDCODE, + C_FUNDNAME, + C_FUNDACCO, + F_NETVALUE, + C_AGENCYNAME, + C_CUSTNAME, + D_DATE, + D_CDATE, + F_CONFIRMBALANCE, + F_TRADEFARE, + F_CONFIRMSHARES, + F_RELBALANCE, + F_INTEREST, + INFO, + WORK_DATE, + LOAD_DATE) + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + V_WORK_DATE, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + WHERE T1.C_FLAG = '0') A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE 
C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + WHERE ST2.C_FLAG = '0') B + ON A.C_FUNDCODE = B.C_FUNDCODE + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO; + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_LOG_BEGIN_TIME, + SYSDATE, + V_WORK_DATE, + V_LOG_ROWCOUNT, + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC, + V_ALL_ELAPSED); + ERR_NUM := 0; + ERR_MSG := 'NORMAL,SUCCESSFUL COMPLETION'; +END; + $$; +ERROR: invalid type name "sys_stat_error_log.STEP_DESC%TYPE" +LINE 16: V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; + ^ +-- +-- Name: sp_pub_del_tb(varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + LANGUAGE plpgsql + AS $$ + declare n_sql varchar2(4000); +begin + + n_sql := 'truncate table '||p_tab_name; + + execute immediate n_sql; +exception + when no_data_found then null; + when others then raise; +end ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + ^ +-- +-- Name: sp_pub_insert_log_date(varchar2, varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_step_desc varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ + declare + BEGIN + INSERT INTO SYNC.SYS_STAT_ERROR_LOG + (PROC_NAME + ,TAB_LEVEL + ,STEP_NO + ,STEP_DESC + ,BEGIN_TIME + ,END_TIME + ,WORKDATE + ,ROW_NUM + ,ELAPSED + ,ALL_ELAPSED) + VALUES + (P_IN_PROC_NAME + ,P_IN_TAB_LEVEL + ,P_IN_STEP_NO + ,P_IN_STEP_DESC + ,P_IN_BEGIN_TIME + ,P_IN_END_TIME + ,P_IN_WORK_DATE + ,P_IN_ROW_NUM + ,P_IN_ELAPSED + ,P_IN_ALL_ELAPSED); + COMMIT; + END ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varch... 
+ ^ +-- +-- Name: sp_pub_update_log_date(varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ BEGIN + UPDATE SYNC.SYS_STAT_ERROR_LOG + SET END_TIME = P_IN_END_TIME + ,ROW_NUM = P_IN_ROW_NUM + ,ELAPSED = P_IN_ELAPSED + ,ALL_ELAPSED = P_IN_ALL_ELAPSED + WHERE PROC_NAME = P_IN_PROC_NAME + AND TAB_LEVEL = P_IN_TAB_LEVEL + AND STEP_NO = P_IN_STEP_NO + AND BEGIN_TIME = P_IN_BEGIN_TIME + AND WORKDATE = P_IN_WORK_DATE; + COMMIT; + END ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varch... + ^ +SET default_tablespace = ''; +SET default_with_oids = false; +-- +-- Name: b03_ts_remetrade; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE b03_ts_remetrade ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: b03_ts_remetrade_bak; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE b03_ts_remetrade_bak ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: ks0_fund_base_26; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE ks0_fund_base_26 ( + id1 numeric(48,0) NOT NULL, + acc_cd character varying(500) NOT NULL, + tdate timestamp(0) without time zone NOT NULL, + ins_cd character varying(500) NOT NULL, + cost_price_asset numeric(30,8), + pcol character varying(50) +) +DISTRIBUTE BY SHARD (id1) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
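The NOTICE above spells out the follow-up it expects: give the shard table a replica identity via ALTER TABLE. A minimal illustrative sketch (not part of this patch; the index name is invented here, and it assumes the NOT NULL key column id1 of ks0_fund_base_26 can be treated as unique):

-- Hypothetical follow-up, not from pl_bugs.out: add a replica identity to the shard table.
CREATE UNIQUE INDEX ks0_fund_base_26_id1_idx ON ks0_fund_base_26 (id1);
ALTER TABLE ks0_fund_base_26 REPLICA IDENTITY USING INDEX ks0_fund_base_26_id1_idx;
-- Or, if no suitable unique index exists, log the whole row instead:
-- ALTER TABLE ks0_fund_base_26 REPLICA IDENTITY FULL;

Either form supplies the replica identity the NOTICE asks for; which one is appropriate depends on whether id1 is actually unique in the data.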
+-- +-- Name: p; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE p ( + p1 text, + p2 text +) +DISTRIBUTE BY HASH (p1); +-- +-- Name: s017_taccoinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_taccoinfo ( + c_custno character varying(30) NOT NULL, + c_accounttype character(1), + c_fundacco character varying(30), + c_agencyno character(3), + c_netno character varying(30), + c_childnetno character varying(30), + d_opendate timestamp(0) without time zone, + d_lastmodify timestamp(0) without time zone, + c_accostatus character(1), + c_freezecause character(1), + d_backdate timestamp(0) without time zone, + l_changetime numeric(10,0), + d_firstinvest timestamp(0) without time zone, + c_password character varying(100), + c_bourseflag character(1), + c_operator character varying(100), + jy_custid numeric(10,0), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: s017_tacconet; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tacconet ( + c_fundacco character varying(30) NOT NULL, + c_agencyno character varying(6), + c_netno character varying(30), + c_tradeacco character varying(100), + c_openflag character varying(2), + c_bonustype character varying(2), + c_bankno character varying(500), + c_bankacco character varying(500), + c_nameinbank character varying(1000), + d_appenddate timestamp(0) without time zone, + c_childnetno character varying(30), + c_tradeaccobak character varying(100), + c_bankname character varying(500), + c_banklinecode character varying(100), + c_channelbankno character varying(30), + c_bankprovincecode character varying(30), + c_bankcityno character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundacco) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
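For context on the helper defined near the top of this test file: func_getlastnetvalue reads a single row from p but returns 1 on both the normal path and the no_data_found handler, so within this regression test it effectively always returns 1. A hypothetical smoke test (not part of the expected output; the fund code argument is made up) could look like:

-- Hypothetical check, not from pl_bugs.out: the helper returns 1 whether or not p has rows.
SELECT sync.func_getlastnetvalue('FUND001', current_date) AS before_seed;
INSERT INTO sync.p VALUES ('1.0234', 'seed');
SELECT sync.func_getlastnetvalue('FUND001', current_date) AS after_seed;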
+-- +-- Name: s017_tagencyinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tagencyinfo ( + c_agencyno character varying(6) NOT NULL, + c_agencyname character varying(1000), + c_fullname character varying(1000), + c_agncyaddress character varying(500), + c_agncyzipcode character varying(30), + c_agncycontact character varying(30), + c_agncyphone character varying(100), + c_agncyfaxno character varying(100), + c_agncymail character varying(100), + c_agncybankno character varying(24), + c_agncybankacco character varying(100), + c_agncybankname character varying(500), + d_agncyregdate timestamp(0) without time zone, + c_agncystatus character varying(2), + d_lastdate timestamp(0) without time zone, + c_agencytype character varying(2), + c_detail character varying(2), + c_right character varying(2), + c_zdcode character varying(30), + l_liquidateredeem numeric(10,0), + l_liquidateallot numeric(10,0), + l_liquidatebonus numeric(10,0), + l_liquidatesub numeric(10,0), + c_sharetypes character varying(30), + f_agio numeric(5,4), + c_ztgonestep character varying(2), + c_preassign character varying(2), + l_cserialno numeric(10,0), + c_comparetype character varying(2), + c_liquidatetype character varying(2), + c_multitradeacco character varying(2), + c_iversion character varying(6), + c_imode character varying(2), + c_changeonstep character varying(2), + f_outagio numeric(5,4), + f_agiohint numeric(5,4), + f_outagiohint numeric(5,4), + c_allotliqtype character varying(2), + c_redeemliqtype character varying(2), + c_centerflag character varying(2), + c_netno character varying(6), + c_littledealtype character varying(2), + c_overtimedeal character varying(2), + d_lastinputtime timestamp(0) without time zone, + f_interestrate numeric(5,4), + c_clearsite character varying(2), + c_isdeal character varying(2), + c_agencyenglishname character varying(100), + l_fundaccono numeric(10,0), + c_rationflag character varying(2), + c_splitflag character varying(2), + c_tacode character varying(30), + c_outdataflag character varying(2), + c_hasindex character varying(2), + c_transferbyadjust character varying(2), + c_sharedetailexptype character varying(2), + c_navexptype character varying(2), + c_ecdmode character varying(2), + c_agencytypedetail character varying(2), + c_advanceshrconfirm character varying(2), + c_ecdversion character varying(2), + c_capmode character varying(2), + c_internetplatform character varying(2), + c_capautoarrive character varying(2), + c_outcapitaldata character varying(30), + c_ecdcheckmode character varying(30), + c_ecddealmode character varying(30), + c_fileimpmode character varying(30), + c_isotc character varying(2), + c_enableecd character varying(30), + c_autoaccotype character varying(30), + c_tncheckmode numeric(10,0), + c_captureidinfo character varying(30), + c_realfreeze character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_agencyno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tconfirm_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tconfirm_all ( + c_businflag character(2) NOT NULL, + d_cdate timestamp(0) without time zone, + c_cserialno character varying(100), + d_date timestamp(0) without time zone, + l_serialno numeric(10,0), + c_agencyno character(3), + c_netno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + f_confirmbalance numeric(16,2), + f_confirmshares numeric(16,2), + f_tradefare numeric(16,2), + f_tafare numeric(16,2), + f_stamptax numeric(16,2), + f_backfare numeric(16,2), + f_otherfare1 numeric(16,2), + f_interest numeric(16,2), + f_interesttax numeric(16,2), + f_totalfare numeric(16,2), + f_agencyfare numeric(16,2), + f_netvalue numeric(12,4), + f_frozenbalance numeric(16,2), + f_unfrozenbalance numeric(16,2), + c_status character(1), + c_cause character varying(100), + c_taflag character(1), + c_custtype character(1), + c_custno character varying(30), + f_gainbalance numeric(16,2), + f_orifare numeric(16,2), + c_requestendflag character(1), + f_unbalance numeric(16,2), + f_unshares numeric(16,2), + c_reserve character varying(500), + f_interestshare numeric(16,2), + f_chincome numeric(16,2), + f_chshare numeric(16,2), + f_confirmincome numeric(16,2), + f_oritradefare numeric(16,2), + f_oritafare numeric(16,2), + f_oribackfare numeric(16,2), + f_oriotherfare1 numeric(16,2), + c_requestno character varying(100), + f_balance numeric(16,2), + f_shares numeric(16,2), + f_agio numeric(5,4), + f_lastshares numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_othercode character varying(30), + c_otheracco character varying(30), + c_otheragency character(3), + c_othernetno character varying(30), + c_bonustype character(1), + c_foriginalno character varying(500), + c_exceedflag character(1), + c_childnetno character varying(30), + c_othershare character(1), + c_actcode character(3), + c_acceptmode character(1), + c_freezecause character(1), + c_freezeenddate character varying(100), + f_totalbalance numeric(16,2), + f_totalshares numeric(16,2), + c_outbusinflag character(3), + c_protocolno character varying(30), + c_memo character varying(500), + f_registfare numeric(16,2), + f_fundfare numeric(16,2), + f_oriagio numeric(5,4), + c_shareclass character(1), + d_cisdate timestamp(0) without time zone, + c_bourseflag character(1), + c_fundtype character(1), + f_backfareagio numeric(5,4), + c_bankno character varying(30), + c_subfundmethod character varying(30), + c_combcode character varying(30), + f_returnfare numeric(16,2), + c_contractno character varying(100), + c_captype character(1), + l_contractserialno numeric(10,0), + l_othercontractserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + f_transferfee numeric(16,2), + f_oriconfirmbalance numeric(16,2), + f_extendnetvalue numeric(23,15), + l_remitserialno numeric(10,0), + c_zhxtht character varying(500), + c_improperredeem character(1), + f_untradefare numeric(16,2), + f_untradeinfare numeric(16,2), + f_untradeoutfare numeric(16,2), + c_profitnottransfer character(1), + f_outprofit numeric(9,6), + f_inprofit numeric(9,6), + c_totrustcontractid character varying(500), + d_repurchasedate timestamp(0) without time zone, + f_chengoutbalance numeric(16,2), + c_exporting character(1), + jy_fundid numeric(10,0), + jy_contractbh character varying(100), + jy_custid numeric(10,0), + jy_tocustid numeric(10,0), + jy_fare numeric(16,2), + 
c_trustcontractid character varying(500), + f_taagencyfare numeric(16,2), + f_taregisterfare numeric(16,2), + d_cdate_jy timestamp(0) without time zone, + jy_adjust character(1), + jy_subfundid numeric, + jy_adjust1114 character(1), + jy_cdate timestamp(0) without time zone, + c_bankacco character varying(500), + c_bankname character varying(500), + c_nameinbank character varying(1000), + f_riskcapital numeric(16,2), + f_replenishriskcapital numeric(16,2), + c_fromfundcode character varying(30), + c_fromtrustcontractid character varying(500), + c_trustagencyno character varying(100), + l_rdmschserialno numeric(10,0), + f_redeemprofit numeric(16,2), + f_redeemproyieldrate numeric(13,10), + d_redeemprobigdate timestamp(0) without time zone, + d_redeemproenddate timestamp(0) without time zone, + c_changeownerincomebelong character(1), + l_midremitserialno numeric(10,0), + c_fromtype character(1), + c_iscycinvest character(1), + l_fromserialno numeric(10,0), + l_frominterestconserialno numeric(10,0), + c_changeownerinterest character(1), + c_msgsendflag character(1), + l_sharedelaydays numeric(3,0), + c_istodayconfirm character(1), + f_newincome numeric(16,2), + f_floorincome numeric(10,9), + l_incomeremitserialno numeric(10,0), + c_isnetting character(1), + l_bankserialno numeric(10,0), + c_subfundcode character varying(30), + f_chengoutsum numeric(16,2), + f_chengoutprofit numeric(16,2), + l_confirmtransserialno numeric(10,0), + c_shareadjustgzexpflag character(1), + c_issend character(1), + c_exchangeflag character(1), + yh_date_1112 timestamp(0) without time zone, + l_banktocontractserialno numeric(10,0), + c_payfeetype character(1), + c_tobankno character varying(30), + c_tobankacco character varying(500), + c_tobankname character varying(500), + c_tonameinbank character varying(1000), + c_tobanklinecode character varying(100), + c_tobankprovincecode character varying(30), + c_tobankcityno character varying(30), + l_assetseperateno numeric(10,0), + c_sharecserialno character varying(100), + c_redeemprincipaltype character(1), + work_date timestamp(0) without time zone, + c_businname character varying(100) +) +DISTRIBUTE BY SHARD (c_businflag) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tdividenddetail; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tdividenddetail ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + d_regdate timestamp(0) without time zone, + d_date timestamp(0) without time zone, + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character varying(2), + c_agencyno character varying(6), + c_netno character varying(30), + f_totalshare numeric(16,2), + f_unitprofit numeric(7,4), + f_totalprofit numeric(16,2), + f_tax numeric(16,2), + c_flag character varying(2), + f_realbalance numeric(16,2), + f_reinvestbalance numeric(16,2), + f_realshares numeric(16,2), + f_fare numeric(16,2), + d_lastdate timestamp(0) without time zone, + f_netvalue numeric(7,4), + f_frozenbalance numeric(16,2), + f_frozenshares numeric(16,2), + f_incometax numeric(9,4), + c_reserve character varying(100), + d_requestdate timestamp(0) without time zone, + c_shareclass character varying(30), + l_contractserialno numeric(10,0), + l_specprjserialno numeric(10,0), + f_investadvisorratio numeric(9,8), + f_transferfee numeric(16,2), + l_profitserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + c_custid character varying(30), + jy_fundid numeric, + jy_subfundid numeric, + jy_custid numeric, + jy_contractbh character varying(100), + jy_profitsn numeric, + jy_profitmoney numeric, + jy_capitalmoney numeric, + jy_adjust character varying(2), + c_reinvestnetvalue character varying(2), + f_transferbalance numeric(16,2), + l_relatedserialno numeric(10,0), + c_printoperator character varying(100), + c_printauditor character varying(100), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone, + f_remainshares numeric(16,2) +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tfundday; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tfundday ( + d_date timestamp(0) without time zone, + d_cdate timestamp(0) without time zone, + c_fundcode varchar2(30), + c_todaystatus varchar2(2), + c_status varchar2(2), + f_netvalue numeric(7,4), + f_lastshares numeric(16,2), + f_lastasset numeric(16,2), + f_asucceed numeric(16,2), + f_rsucceed numeric(16,2), + c_vastflag varchar2(2), + f_encashratio numeric(9,8), + f_changeratio numeric(9,8), + c_excessflag varchar2(2), + f_subscriberatio numeric(9,8), + c_inputpersonnel varchar2(100), + c_checkpersonnel varchar2(100), + f_income numeric(16,2), + f_incomeratio numeric(9,6), + f_unassign numeric(16,2), + f_incomeunit numeric(10,5), + f_totalnetvalue numeric(7,4), + f_servicefare numeric(16,2), + f_assign numeric(16,2), + f_growthrate numeric(9,8), + c_netvalueflag varchar2(2), + f_managefare numeric(16,2), + d_exportdate timestamp(0) without time zone, + c_flag varchar2(2), + f_advisorfee numeric(16,2), + d_auditdate timestamp(0) without time zone, + f_extendnetvalue numeric(23,15), + f_extendtotalnetvalue numeric(23,15), + jy_fundcode varchar2(30), + f_yearincomeratio numeric(9,6), + f_riskcapital numeric(16,2), + f_totalincome numeric(16,2), + f_agencyexpyearincomeration numeric(9,6), + f_agencyexpincomeunit numeric(10,5), + f_agencyexpincomeration numeric(9,6), + f_agencyexpincome numeric(16,2), + c_isspecflag varchar2(2), + c_isasync varchar2(2), + sys_id varchar2(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone DEFAULT orcl_sysdate() +) +DISTRIBUTE BY HASH (d_date); +-- +-- Name: s017_tfundinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tfundinfo ( + c_fundcode character varying(30) NOT NULL, + c_fundname character varying(1000), + c_moneytype character varying(6), + c_managername character varying(100), + c_trusteecode character varying(30), + f_parvalue numeric(7,4), + f_issueprice numeric(12,4), + c_trusteeacco character varying(100), + d_issuedate timestamp(0) without time zone, + d_setupdate timestamp(0) without time zone, + f_maxbala numeric(16,2), + f_maxshares numeric(16,2), + f_minbala numeric(16,2), + f_minshares numeric(16,2), + l_elimitday numeric(10,0), + l_slimitday numeric(10,0), + l_alimitday numeric(10,0), + l_mincount numeric(10,0), + l_climitday numeric(10,0), + f_maxallot numeric(9,8), + f_maxredeem numeric(9,8), + c_fundcharacter character varying(500), + c_fundstatus character varying(2), + c_subscribemode character varying(2), + l_timelimit numeric(10,0), + l_subscribeunit numeric(10,0), + c_sharetypes character varying(30), + c_issuetype character varying(2), + f_factcollect numeric(16,2), + d_failuedate timestamp(0) without time zone, + f_allotratio numeric(9,8), + c_feeratiotype1 character varying(2), + c_feeratiotype2 character varying(2), + c_feetype character varying(2), + c_exceedpart character varying(2), + c_bonustype character varying(2), + c_forceredeem character varying(2), + c_interestdealtype character varying(2), + f_redeemfareratio numeric(5,4), + f_changefareratio numeric(5,4), + f_managerfee numeric(7,6), + f_right numeric(5,4), + c_property character varying(2), + d_evendate timestamp(0) without time zone, + f_totalbonus numeric(7,4), + c_changefree character varying(2), + c_reportcode character varying(30), + c_backfarecal character varying(2), + l_moneydate numeric(10,0), + l_netprecision numeric(10,0), + c_corpuscontent character varying(2), + f_corpusratio numeric(5,4), + 
c_farecaltype character varying(2), + l_liquidateallot numeric(10,0), + l_liquidateredeem numeric(10,0), + l_liquidatebonus numeric(10,0), + l_taspecialacco numeric(10,0), + c_fareprecision character varying(2), + d_issueenddate timestamp(0) without time zone, + c_farebelongasset character varying(2), + l_liquidatechange numeric(10,0), + l_liquidatefail numeric(10,0), + l_liquidateend numeric(10,0), + c_sharedetail character varying(2), + c_trusteebankname character varying(500), + c_boursetradeflag character varying(2), + c_fundenglishname character varying(100), + l_bankaccono numeric(10,0), + c_cleanflag character varying(2), + c_precision character varying(2), + c_upgradeflag character varying(2), + c_isdeal character varying(2), + c_farecltprecision character varying(2), + c_balanceprecision character varying(2), + c_shareprecision character varying(2), + c_bonusprecision character varying(2), + c_interestprecision character varying(2), + f_maxallotasset numeric(16,2), + f_maxallotshares numeric(16,2), + c_foreigntrustee character varying(6), + l_tnconfirm numeric(3,0), + c_rationallotstatus character varying(2), + f_trusteefee numeric(7,6), + c_fundacco character varying(30), + c_financetype character varying(2), + l_liquidatechangein numeric(10,0), + c_custname character varying(500), + c_identitytype character varying(2), + c_custtype character varying(2), + c_identityno character varying(100), + c_deductschemecode character varying(30), + c_customermanager character varying(30), + c_templateid character varying(30), + f_pr0 numeric(7,4), + f_deductratio numeric(5,4), + c_farecalculatetype character varying(2), + c_saletype character varying(2), + l_maxcount numeric(10,0), + l_zhallotliqdays numeric(10,0), + l_zhredeemliqdays numeric(10,0), + f_liqasset numeric(16,2), + l_zhallotexpdays numeric(10,0), + l_zhredeemexpdays numeric(10,0), + c_limitmode character varying(2), + c_ordermode character varying(2), + c_acntlmtdealmode character varying(2), + l_informdays numeric(2,0), + c_allowpartredeem character varying(2), + c_fundendmode character varying(2), + f_fundendagio numeric(10,9), + c_minbalalimitisconfirm character varying(2), + c_gradetype character varying(2), + c_qryfreqtype character varying(2), + l_qrydaysltd numeric(2,0), + d_contractenddate timestamp(0) without time zone, + c_useinopenday character varying(2), + c_allotcalinterst character varying(2), + c_fundrisk character varying(2), + c_exitallot character varying(2), + c_subinterestcalc character varying(2), + c_earlyexitredfee character varying(2), + c_navexpfqy character varying(2), + l_navexpday numeric(10,0), + c_isbounded character varying(2), + c_earlyexitfeecalc character varying(2), + c_designdptid character varying(100), + c_fixeddividway character varying(2), + c_trusttype character varying(2), + f_maxnaturalmoney numeric(16,2), + c_projectid character varying(30), + c_trustclass character varying(2), + f_trustscale numeric(16,2), + c_structflag character varying(2), + c_priconveyflag character varying(2), + c_repurchasetype character varying(2), + c_iswholerepurchase character varying(2), + f_repurchaseminbala numeric(16,2), + c_repurchasemainbody character varying(2), + c_canelyrepurchase character varying(2), + c_earlybacktime character varying(2), + c_repurchaseprice character varying(2), + c_premiumpaymenttime character varying(2), + c_liquisource character varying(2), + l_period numeric(3,0), + c_canextensionflag character varying(2), + c_canelyliquidflag character varying(2), + c_trustassetdesc 
character varying(100), + c_returnside character varying(2), + c_returnpaymentway character varying(2), + c_returnbase character varying(2), + c_refepaymentway character varying(2), + c_refeside character varying(2), + c_refebase character varying(2), + f_warnline numeric(5,4), + f_stopline numeric(5,4), + f_collectinterest numeric(11,8), + f_durationinterest numeric(7,4), + f_investadvisorratio numeric(7,6), + c_bonusschema character varying(2), + c_guaranteetype character varying(2), + c_guaranteedesc character varying(100), + c_expectedyieldtype character varying(2), + f_minexpectedyield numeric(12,4), + f_maxexpectedyield numeric(12,4), + c_incomecycletype character varying(2), + f_incomecyclevalue numeric(10,0), + c_subaccotype character varying(2), + c_allotaccotype character varying(2), + c_fundtype character varying(2), + c_cootype character varying(1000), + c_projecttype character varying(2), + c_investdirection character varying(30), + c_investdirectionfractionize character varying(2), + c_industrydetail character varying(1000), + c_initeresttype character varying(2), + c_isextended character varying(2), + d_extenddate timestamp(0) without time zone, + c_dealmanagetype character varying(2), + c_investarea character varying(2), + c_projectcode character varying(1000), + c_fundshortname character varying(500), + c_contractid character varying(500), + c_functype character varying(2), + c_specialbusintype character varying(1000), + c_investindustry character varying(2), + c_managetype character varying(2), + c_area character varying(500), + c_risk character varying(2), + c_iscommitteedisscuss character varying(2), + c_structtype character varying(2), + c_commendplace character varying(2), + l_npmaxcount numeric(5,0), + c_client character varying(100), + c_clientcusttype character varying(2), + c_clientidtype character varying(2), + c_clientidno character varying(100), + c_clientbankname character varying(100), + c_clientaccono character varying(100), + c_clientaddress character varying(500), + c_clientzipcode character varying(30), + c_clientphoneno1 character varying(100), + c_clientphoneno2 character varying(100), + c_clientfax character varying(100), + c_beneficiary character varying(100), + c_collectbankname character varying(500), + c_collectbankno character varying(6), + c_collectaccountname character varying(500), + c_collectbankacco character varying(100), + c_keeperbankname character varying(500), + c_keeperaccountname character varying(500), + c_keeperaccountno character varying(100), + c_keepername character varying(500), + c_keepercorporation character varying(500), + c_keeperaddress character varying(500), + c_keeperzipcode character varying(30), + c_keeperphoneno1 character varying(100), + c_keeperphoneno2 character varying(100), + c_keeperfax character varying(100), + c_incomedistributetype character varying(2), + c_alarmline character varying(1000), + c_stoplossline character varying(1000), + f_investadvisorfee numeric(12,2), + c_investadvisordeduct character varying(1000), + c_capitalacco character varying(500), + c_stockacconame character varying(500), + c_stocksalesdept character varying(500), + c_thirdpartybankno character varying(6), + c_thirdpartybankname character varying(500), + c_thirdpartyacconame character varying(500), + c_thirdpartyaccono character varying(100), + c_investadvisor character varying(500), + c_investadvisorbankno character varying(6), + c_investadvisorbankname character varying(500), + c_investadvisoracconame character varying(500), + 
c_investadvisoraccono character varying(100), + c_investadvisorcorporation character varying(500), + c_investadvisoraddress character varying(500), + c_investadvisorzipcode character varying(30), + c_investadvisorphoneno1 character varying(100), + c_investadvisorphoneno2 character varying(100), + c_investadvisorfax character varying(100), + c_authdelegate character varying(100), + c_loanfinanceparty character varying(500), + c_loanfinancepartycorporation character varying(500), + c_loanfinancepartyaddress character varying(500), + c_loanfinancepartyzipcode character varying(30), + c_loanfinancepartyphoneno1 character varying(100), + c_loanfinancepartyphoneno2 character varying(100), + c_loanfinancepartyfax character varying(100), + c_loaninteresttype character varying(2), + f_loaninterestrate numeric(7,4), + f_loanduration numeric(5,0), + c_loanmanagebank character varying(500), + f_loanmanagefee numeric(9,2), + f_loanfinancecost numeric(9,2), + f_creditattornduration numeric(5,0), + f_creditattorninterestduration numeric(7,4), + f_creditattornprice numeric(12,2), + f_billattornduration numeric(5,0), + f_billattorninterestduration numeric(7,4), + f_billattornprice numeric(12,2), + c_stkincfincparty character varying(1000), + c_stkincfincpartycorporation character varying(500), + c_stkincfincpartyaddress character varying(500), + c_stkincfincpartyzipcode character varying(30), + c_stkincfincpartyphoneno1 character varying(100), + c_stkincfincpartyphoneno2 character varying(100), + c_stkincfincpartyfax character varying(100), + c_stkincincomeannualizedrate numeric(7,4), + c_stkincinteresttype character varying(2), + f_stkincattornprice numeric(12,2), + f_stkincattornduration numeric(5,0), + f_stkincbail numeric(12,2), + f_stkincfinccost numeric(9,2), + c_stkincmemo1 character varying(1000), + c_stkincmemo2 character varying(1000), + c_debtincfincparty character varying(500), + c_debtincfincpartycorporation character varying(500), + c_debtincfincpartyaddress character varying(500), + c_debtincfincpartyzipcode character varying(30), + c_debtincfincpartyphoneno1 character varying(100), + c_debtincfincpartyphoneno2 character varying(100), + c_debtincfincpartyfax character varying(100), + c_debtincincomerate numeric(7,4), + c_debtincinteresttype character varying(2), + f_debtincattornprice numeric(12,2), + f_debtincattornduration numeric(5,0), + f_debtincbail numeric(12,2), + f_debtincfinccost numeric(9,2), + c_debtincmemo1 character varying(1000), + c_othinvfincparty character varying(500), + c_othinvfincpartycorporation character varying(500), + c_othinvfincpartyaddress character varying(500), + c_othinvfincpartyzipcode character varying(30), + c_othinvfincpartyphoneno1 character varying(100), + c_othinvfincpartyphoneno2 character varying(100), + c_othinvfincpartyfax character varying(100), + f_othinvfinccost numeric(9,2), + c_othinvmemo1 character varying(1000), + c_othinvmemo2 character varying(1000), + c_othinvmemo3 character varying(1000), + c_banktrustcoobank character varying(500), + c_banktrustproductname character varying(500), + c_banktrustproductcode character varying(100), + c_banktrustundertakingletter character varying(2), + c_trustgovgovname character varying(500), + c_trustgovprojecttype character varying(1000), + c_trustgovcootype character varying(4), + c_trustgovoptype character varying(4), + c_housecapital character varying(4), + c_houseispe character varying(2), + c_tradetype character varying(2), + c_businesstype character varying(2), + c_trustname character varying(500), + 
c_trustidtype character varying(2), + c_trustidno character varying(100), + d_trustidvaliddate timestamp(0) without time zone, + c_trustbankname character varying(500), + c_trustaccounttype character varying(2), + c_trustnameinbank character varying(100), + c_zhtrustbankname character varying(500), + c_zhtrustbankacco character varying(100), + c_issecmarket character varying(2), + c_fundoperation character varying(2), + c_trustmanager character varying(100), + c_tradeother character varying(4000), + c_watchdog character varying(500), + c_memo character varying(1000), + c_benefittype character varying(2), + c_redeemaccotype character varying(2), + c_bonusaccotype character varying(2), + c_fundendaccotype character varying(2), + c_collectfailaccotype character varying(2), + d_lastmodifydate timestamp(0) without time zone, + c_shareholdlimtype character varying(2), + c_redeemtimelimtype character varying(2), + c_isprincipalrepayment character varying(2), + c_principalrepaymenttype character varying(2), + l_interestyeardays numeric(3,0), + l_incomeyeardays numeric(3,0), + c_capuseprovcode character varying(30), + c_capusecitycode character varying(30), + c_capsourceprovcode character varying(30), + c_banktrustcoobankcode character varying(30), + c_banktrustisbankcap character varying(2), + c_trusteefeedesc character varying(4000), + c_managefeedesc character varying(4000), + c_investfeedesc character varying(4000), + f_investadvisordeductratio numeric(7,6), + c_investdeductdesc character varying(4000), + c_investadvisor2 character varying(500), + f_investadvisorratio2 numeric(7,6), + f_investadvisordeductratio2 numeric(7,6), + c_investfeedesc2 character varying(4000), + c_investdeductdesc2 character varying(4000), + c_investadvisor3 character varying(500), + f_investadvisorratio3 numeric(7,6), + f_investadvisordeductratio3 numeric(7,6), + c_investfeedesc3 character varying(4000), + c_investdeductdesc3 character varying(4000), + c_profitclassdesc character varying(4000), + c_deductratiodesc character varying(4000), + c_redeemfeedesc character varying(4000), + l_defaultprecision numeric(10,0), + c_allotfeeaccotype character varying(2), + c_isposf character varying(2), + c_opendaydesc character varying(4000), + c_actualmanager character varying(100), + c_subindustrydetail character varying(30), + c_isbankleading character varying(2), + c_subprojectcode character varying(500), + c_iscycleinvest character varying(2), + f_liquidationinterest numeric(13,10), + c_liquidationinteresttype character varying(2), + c_isbonusinvestfare character varying(2), + c_subfeeaccotype character varying(2), + c_redeemfeeaccotype character varying(2), + c_fundrptcode character varying(30), + c_ordertype character varying(2), + c_flag character varying(2), + c_allotliqtype character varying(2), + l_sharelimitday numeric(5,0), + c_iseverydayopen character varying(2), + c_tradebynetvalue character varying(2), + c_isstage character varying(2), + c_specbenfitmemo character varying(4000), + d_effectivedate timestamp(0) without time zone, + c_issueendflag character varying(2), + c_resharehasrdmfee character varying(2), + jy_fundcode numeric, + jy_fundid numeric, + jy_subfundid numeric, + jy_dptid numeric, + c_iswealth character varying(2), + c_interestcalctype character varying(2), + c_allotinterestcalctype character varying(2), + c_isriskcapital character varying(2), + c_fundstatus_1225 character varying(2), + c_isincomeeverydaycalc character varying(2), + c_isredeemreturninterest character varying(2), + 
c_isrefundrtninterest character varying(2), + d_estimatedsetupdate timestamp(0) without time zone, + f_estimatedfactcollect numeric(16,2), + c_isfinancialproducts character varying(2), + c_fundredeemtype character varying(2), + c_trademanualinput character varying(2), + f_clientmanageration numeric(7,6), + c_profitclassadjustment character varying(2), + c_mainfundcode character varying(30), + c_contractsealoff character varying(2), + c_permitnextperiod character varying(2), + c_preprofitschematype character varying(2), + c_fundredeemprofit character varying(2), + f_incomeration numeric(9,8), + c_incomecalctype character varying(2), + c_allocateaccoid character varying(30), + c_outfundcode character varying(500), + c_matchprofitclass character varying(30), + l_lastdays numeric(5,0), + c_contractprofitflag character varying(2), + c_agencysaleliqtype character varying(2), + l_delaydays numeric(3,0), + c_profitclassperiod character varying(2), + c_reportshowname character varying(1000), + c_currencyincometype character varying(2), + c_beforeredeemcapital character varying(2), + c_contractversion character varying(30), + c_confirmacceptedflag character varying(2), + c_selectcontract character varying(2), + f_schemainterest numeric(11,8), + c_riskgrade character varying(30), + l_sharedelaydays numeric(3,0), + l_reservationdays numeric(3,0), + c_transfertype character varying(2), + c_schemavoluntarily character varying(2), + l_schemadetaildata numeric(4,0), + c_schemadetailtype character varying(2), + c_iscurrencyconfirm character varying(2), + c_allowmultiaccobank character varying(2), + d_capverif timestamp(0) without time zone, + c_templatetype character varying(12), + c_capitalprecision character varying(2), + c_fundno character varying(100), + c_profittype character varying(2), + d_paydate timestamp(0) without time zone, + d_shelvedate timestamp(0) without time zone, + d_offshelvedate timestamp(0) without time zone, + c_schemabegindatetype character varying(2), + l_schemabegindatedays numeric(3,0), + c_isautoredeem character varying(2), + c_isnettingrequest character varying(2), + c_issuingquotedtype character varying(2), + d_firstdistributedate timestamp(0) without time zone, + c_bonusfrequency character varying(2), + c_interestbigdatetype character varying(2), + c_gzdatatype character varying(2), + f_allotfareratio numeric(5,4), + f_subfareratio numeric(5,4), + c_begindatebeyond character varying(2), + c_profitnotinterest character varying(2), + c_setuplimittype character varying(2), + c_limitredeemtype character varying(2), + c_bonusfrequencytype character varying(2), + c_rfaccotype character varying(2), + c_capitalfee character varying(2), + c_exceedflag character varying(2), + c_enableecd character varying(2), + c_isfixedtrade character varying(2), + c_profitcaltype character varying(2), + f_ominbala numeric(16,2), + f_stepbala numeric(16,2), + c_remittype character varying(30), + c_interestcycle character varying(30), + c_repayguaranteecopy character varying(30), + c_repaytype character varying(30), + c_fundprofitdes character varying(4000), + c_fundinfodes character varying(4000), + c_riskeval character varying(2), + l_maxage numeric(3,0), + l_minage numeric(3,0), + c_fundriskdes character varying(1000), + mig_l_assetid numeric(48,0), + l_faincomedays numeric(10,0), + c_producttype character varying(2), + c_otherbenefitproducttype character varying(2), + c_isotc character varying(2), + c_iseverydayprovision character varying(2), + c_incometogz character varying(2), + 
c_setuptransfundacco character varying(30), + c_issuefeeownerrequired character varying(2), + c_calcinterestbeforeallot character varying(30), + c_islimit300wnature character varying(2), + c_allowoverflow character varying(30), + c_trustfundtype character varying(30), + c_disclose character varying(2), + c_collectaccoid character varying(30), + c_isissuebymarket character varying(2), + c_setupstatus character varying(30), + c_isentitytrust character varying(2), + l_liquidatesub numeric(10,0), + c_incomeassigndesc character varying(4000), + c_keeporgancode character varying(30), + d_defaultbegincacldate timestamp(0) without time zone, + c_zcbborrower character varying(100), + c_zcbborroweridno character varying(100), + c_zcbremittype character varying(100), + c_registcode character varying(100), + c_redeeminvestaccotype character varying(2), + c_bonusinvestaccotype character varying(2), + c_isabsnotopentrade character varying(2), + l_interestdiffdays numeric(5,0), + c_outfundstatus character varying(2), + c_reqsyntype character varying(2), + c_allredeemtype character varying(2), + c_isabsopentrade character varying(2), + c_funddesc character varying(1000), + l_allotliquidays numeric(3,0), + l_subliquidays numeric(3,0), + c_autoupcontractenddaterule character varying(2), + c_fcsubaccotype character varying(2), + c_fcallotaccotype character varying(2), + c_fcredeemaccotype character varying(2), + c_fcbonusaccotype character varying(2), + c_captranslimitflag character varying(30), + c_redeemprincipaltype character varying(2), + c_interestcalcdealtype character varying(30), + c_collectconfirm character varying(30), + d_oldcontractenddate timestamp(0) without time zone, + c_tnvaluation character varying(30), + c_contractendnotify character varying(2), + c_rdmfeebase character varying(30), + c_exceedcfmratio character varying(30), + c_allowallotcustlimittype character varying(2), + c_yeardayscalctype character varying(2), + c_iscompoundinterest character varying(30), + c_dbcfm character varying(30), + c_limitaccountstype character varying(2), + c_cycleinvestrange character varying(2), + c_tncheckmode character varying(2), + c_enableearlyredeem character varying(2), + c_ispurceandredeemset character varying(30), + c_perfpaydealtype character varying(2), + c_allowappend character varying(2), + c_allowredeem character varying(2), + c_inputstatus character varying(2), + c_profitbalanceadjust character varying(2), + c_profitperiodadjust character varying(2), + c_autogeneratecontractid character varying(2), + c_transferneednetting character varying(100), + underwrite character varying(1000), + undertook character varying(1000), + undertake character varying(1000), + c_issmsend character varying(2), + d_contractshortenddate timestamp(0) without time zone, + d_contractlongenddate timestamp(0) without time zone, + c_assetseperatefundcodesrc character varying(30), + f_averageprofit numeric(11,8), + c_currencycontractlimittype character varying(2), + l_profitlastdays numeric(5,0), + l_liquidationlastdays numeric(5,0), + c_arlimitincludeallreq character varying(2), + c_reqfundchange character varying(2), + c_dealnetvaluerule character varying(2), + c_contractdealtype character varying(2), + c_bonusplanbeginday timestamp(0) without time zone, + c_contractbalaupright character varying(2), + c_isneedinterestrate character varying(2), + c_isneedexcessratio character varying(2), + c_riskgraderemark character varying(1000), + c_lossprobability character varying(2), + c_suitcusttype character varying(2), + 
c_createbonusschema character varying(2), + d_closedenddate timestamp(0) without time zone, + c_timelimitunit character varying(30), + c_exceedredeemdealtype character varying(2), + c_profitperiod character varying(2), + l_navgetintervaldays numeric(3,0), + load_date timestamp(0) without time zone, + sys_id character varying(10) DEFAULT 'S017'::character varying, + work_date timestamp(0) without time zone, + c_limittransfertype character varying(1), + c_transaccotype character varying(1), + c_incometaxbase character varying(1), + c_isredeemfareyearcalc character varying(1), + c_otherbenefitinputmode character varying(1), + c_aftdefaultinterestdeducttype character varying(1), + c_allowzerobalanceconfirm character varying(1), + c_incomejoinassign character varying(1), + l_liquidateliqbonus numeric(10,0), + c_predefaultinterestdeducttype character varying(1), + c_worktype character varying(1), + c_defaultinterestadduptype character varying(1), + c_issupportsubmode character varying(1), + f_expectedyield numeric(14,0), + c_recodecode character varying(40), + l_liquidatetransfer numeric(10,0), + c_ispayincometax character varying(1), + c_groupmainfundcode character varying(6), + c_redeemfeesplittype character varying(1), + c_capitalfromcrmorta character varying(1), + c_needcalcdefaultinterest character varying(1), + c_issuercode character varying(10), + l_redeemfareyeardays numeric(10,0), + c_floatyield character varying(30), + l_minriskscore numeric(3,0), + c_islocalmoneytypecollect character varying(1) +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: s017_tsharecurrents_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tsharecurrents_all ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + c_businflag character(2), + d_requestdate timestamp(0) without time zone, + c_requestno character varying(100), + c_custno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + c_agencyno character(3), + c_netno character varying(30), + f_occurshares numeric(16,2), + f_occurbalance numeric(16,2), + f_lastshares numeric(16,2), + f_occurfreeze numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_summary character varying(100), + f_gainbalance numeric(16,2), + d_sharevaliddate timestamp(0) without time zone, + c_bonustype character(1), + c_custtype character(1), + c_shareclass character(1), + c_bourseflag character varying(20), + d_exportdate timestamp(0) without time zone, + l_contractserialno numeric(10,0), + c_issend character(1), + c_sendbatch character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_ttrustclientinfo_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_ttrustclientinfo_all ( + c_custno character varying(30) NOT NULL, + c_custtype character(1), + c_custname character varying(500), + c_shortname character varying(500), + c_helpcode character varying(30), + c_identitytype character(1), + c_identityno character varying(500), + c_zipcode character varying(30), + c_address character varying(1000), + c_phone character varying(100), + c_faxno character varying(500), + c_mobileno character varying(100), + c_email character varying(500), + c_sex character(1), + c_birthday character varying(30), + c_vocation character(2), + c_education character(2), + c_income character varying(30), + c_contact character varying(100), + c_contype character(1), + c_contno character varying(100), + c_billsendflag character(1), + c_callcenter character(1), + c_internet character(1), + c_secretcode character varying(30), + c_nationality character(3), + c_cityno character varying(30), + c_lawname character varying(100), + c_shacco character varying(30), + c_szacco character varying(30), + c_broker character varying(100), + f_agio numeric(5,4), + c_memo character varying(4000), + c_reserve character varying(500), + c_corpname character varying(100), + c_corptel character varying(100), + c_specialcode character varying(100), + c_actcode character varying(30), + c_billsendpass character(1), + c_addressinvalid character(1), + d_appenddate timestamp(0) without time zone, + d_backdate timestamp(0) without time zone, + c_invalidaddress character varying(500), + c_backreason character varying(500), + c_modifyinfo character(2), + c_riskcontent character varying(4000), + l_querydaysltd numeric(3,0), + c_customermanager character varying(100), + c_custproperty character(1), + c_custclass character(1), + c_custright character varying(4000), + c_daysltdtype character(1), + d_idvaliddate timestamp(0) without time zone, + l_custgroup numeric(10,0), + c_recommender character varying(100), + c_recommendertype character(1), + d_idnovaliddate timestamp(0) without time zone, + c_organcode character(10), + c_othercontact character varying(100), + c_taxregistno character varying(100), + c_taxidentitytype character(1), + c_taxidentityno character varying(100), + d_legalvaliddate timestamp(0) without time zone, + c_shareholder character varying(500), + c_shareholderidtype character(1), + c_shareholderidno character varying(100), + d_holderidvaliddate timestamp(0) without time zone, + c_leader character varying(500), + c_leaderidtype character(1), + c_leaderidno character varying(100), + d_leadervaliddate timestamp(0) without time zone, + c_managercode character varying(100), + c_linemanager character varying(100), + c_clientinfoid character varying(30), + c_provincecode character varying(30), + c_countytown character varying(1000), + c_phone2 character varying(100), + c_clienttype character(1), + c_agencyno character(3), + c_industrydetail character varying(30), + c_isqualifiedcust character(1), + c_industryidentityno character varying(100), + c_lawidentitytype character(1), + c_lawidentityno character varying(100), + d_lawidvaliddate timestamp(0) without time zone, + d_conidvaliddate timestamp(0) without time zone, + c_conisrevmsg character(1), + c_conmobileno character varying(100), + c_conmoaddress character varying(1000), + c_conzipcode character varying(30), + c_conphone1 character varying(100), + c_conphone2 character varying(100), + c_conemail character varying(100), + c_confaxno 
character varying(500), + c_incomsource character varying(500), + c_zhidentityno character varying(500), + c_zhidentitytype character(1), + c_eastcusttype character varying(30), + jy_custid numeric(10,0), + c_idtype201201030 character(1), + c_emcontact character varying(500), + c_emcontactphone character varying(100), + c_instiregaddr character varying(1000), + c_regcusttype character varying(30), + c_riskgrade character varying(30), + c_riskgraderemark character varying(1000), + d_idvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddate timestamp(0) without time zone, + c_incomesourceotherdesc character varying(1000), + c_vocationotherdesc character varying(1000), + c_businscope character varying(4000), + d_conidvaliddatebeg timestamp(0) without time zone, + d_lawidvaliddatebeg timestamp(0) without time zone, + c_regmoneytype character(3), + f_regcapital numeric(15,2), + c_orgtype character(2), + c_contrholderno character varying(100), + c_contrholdername character varying(500), + c_contrholderidtype character(2), + c_contrholderidno character varying(500), + d_contrholderidvalidatebeg timestamp(0) without time zone, + d_contrholderidvalidate timestamp(0) without time zone, + c_responpername character varying(500), + c_responperidtype character(2), + c_responperidno character varying(500), + d_responperidvalidatebeg timestamp(0) without time zone, + d_responperidvalidate timestamp(0) without time zone, + c_lawphone character varying(100), + c_contrholderphone character varying(100), + c_responperphone character varying(100), + c_consex character(1), + c_conrelative character varying(500), + l_riskserialno numeric(10,0), + c_convocation character(2), + c_iscustrelated character(1), + c_businlicissuorgan character varying(500), + c_manageridno character varying(500), + c_manageridtype character varying(500), + c_managername character varying(500), + d_companyregdate timestamp(0) without time zone, + c_electronicagreement character(1), + c_householdregno character varying(500), + c_guardianrela character varying(500), + c_guardianname character varying(500), + c_guardianidtype character(1), + c_guardianidno character varying(500), + c_isfranchisingidstry character(1), + c_franchidstrybusinlic character varying(500), + c_workunittype character(2), + c_normalresidaddr character varying(1000), + c_domicile character varying(1000), + c_finainvestyears character(2), + c_parentidtype character(1), + c_parentidno character varying(500), + c_videono character varying(1000), + c_bonustype character(1), + d_retirementdate timestamp(0) without time zone, + c_issendbigcustbill character(1), + c_idaddress character varying(1000), + c_isproinvestor character(1), + c_sendkfflag character(1), + c_sendkfcause character varying(1000), + c_sendsaflag character(1), + c_sendsacause character varying(1000), + c_custrelationchannel character(1), + c_companytype character(1), + c_businlocation character varying(1000), + c_custodian character varying(500), + d_elecsigndate timestamp(0) without time zone, + d_riskinputdate timestamp(0) without time zone, + c_circno character varying(1000), + c_financeindustrydetail character varying(30), + c_outclientinfoid character varying(30), + d_duediligencedate timestamp(0) without time zone, + c_duediligencestatus character(1), + c_inputstatus character(1), + c_address2 character varying(1000), + c_reportcusttype character(1), + c_reportcusttypedetail character varying(30), + c_custsource character 
varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: sys_stat_error_log; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE sys_stat_error_log ( + proc_name varchar2(50) NOT NULL, + tab_level varchar2(20), + step_no varchar2(20), + step_desc varchar2(500), + begin_time timestamp(0) without time zone, + end_time timestamp(0) without time zone, + workdate timestamp(0) without time zone, + row_num numeric, + elapsed numeric, + all_elapsed numeric, + sql_code varchar2(20), + sql_errm varchar2(500) +) +DISTRIBUTE BY SHARD (proc_name) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Data for Name: b03_ts_remetrade; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY b03_ts_remetrade (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +-- +-- Data for Name: b03_ts_remetrade_bak; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY b03_ts_remetrade_bak (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +-- +-- Data for Name: ks0_fund_base_26; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY ks0_fund_base_26 (id1, acc_cd, tdate, ins_cd, cost_price_asset, pcol) FROM stdin; +-- +-- Data for Name: p; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY p (p1, p2) FROM stdin; +-- +-- Data for Name: s017_taccoinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_taccoinfo (c_custno, c_accounttype, c_fundacco, c_agencyno, c_netno, c_childnetno, d_opendate, d_lastmodify, c_accostatus, c_freezecause, d_backdate, l_changetime, d_firstinvest, c_password, c_bourseflag, c_operator, jy_custid, work_date) FROM stdin; +-- +-- Data for Name: s017_tacconet; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tacconet (c_fundacco, c_agencyno, c_netno, c_tradeacco, c_openflag, c_bonustype, c_bankno, c_bankacco, c_nameinbank, d_appenddate, c_childnetno, c_tradeaccobak, c_bankname, c_banklinecode, c_channelbankno, c_bankprovincecode, c_bankcityno, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tagencyinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tagencyinfo (c_agencyno, c_agencyname, c_fullname, c_agncyaddress, c_agncyzipcode, c_agncycontact, c_agncyphone, c_agncyfaxno, c_agncymail, c_agncybankno, c_agncybankacco, c_agncybankname, d_agncyregdate, c_agncystatus, d_lastdate, c_agencytype, c_detail, c_right, c_zdcode, l_liquidateredeem, l_liquidateallot, l_liquidatebonus, l_liquidatesub, c_sharetypes, f_agio, c_ztgonestep, c_preassign, l_cserialno, c_comparetype, c_liquidatetype, c_multitradeacco, c_iversion, c_imode, c_changeonstep, f_outagio, f_agiohint, f_outagiohint, c_allotliqtype, c_redeemliqtype, c_centerflag, c_netno, c_littledealtype, c_overtimedeal, d_lastinputtime, f_interestrate, c_clearsite, c_isdeal, c_agencyenglishname, l_fundaccono, c_rationflag, c_splitflag, c_tacode, c_outdataflag, c_hasindex, c_transferbyadjust, c_sharedetailexptype, c_navexptype, c_ecdmode, c_agencytypedetail, c_advanceshrconfirm, c_ecdversion, c_capmode, 
c_internetplatform, c_capautoarrive, c_outcapitaldata, c_ecdcheckmode, c_ecddealmode, c_fileimpmode, c_isotc, c_enableecd, c_autoaccotype, c_tncheckmode, c_captureidinfo, c_realfreeze, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tconfirm_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tconfirm_all (c_businflag, d_cdate, c_cserialno, d_date, l_serialno, c_agencyno, c_netno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, f_confirmbalance, f_confirmshares, f_tradefare, f_tafare, f_stamptax, f_backfare, f_otherfare1, f_interest, f_interesttax, f_totalfare, f_agencyfare, f_netvalue, f_frozenbalance, f_unfrozenbalance, c_status, c_cause, c_taflag, c_custtype, c_custno, f_gainbalance, f_orifare, c_requestendflag, f_unbalance, f_unshares, c_reserve, f_interestshare, f_chincome, f_chshare, f_confirmincome, f_oritradefare, f_oritafare, f_oribackfare, f_oriotherfare1, c_requestno, f_balance, f_shares, f_agio, f_lastshares, f_lastfreezeshare, c_othercode, c_otheracco, c_otheragency, c_othernetno, c_bonustype, c_foriginalno, c_exceedflag, c_childnetno, c_othershare, c_actcode, c_acceptmode, c_freezecause, c_freezeenddate, f_totalbalance, f_totalshares, c_outbusinflag, c_protocolno, c_memo, f_registfare, f_fundfare, f_oriagio, c_shareclass, d_cisdate, c_bourseflag, c_fundtype, f_backfareagio, c_bankno, c_subfundmethod, c_combcode, f_returnfare, c_contractno, c_captype, l_contractserialno, l_othercontractserialno, d_exportdate, f_transferfee, f_oriconfirmbalance, f_extendnetvalue, l_remitserialno, c_zhxtht, c_improperredeem, f_untradefare, f_untradeinfare, f_untradeoutfare, c_profitnottransfer, f_outprofit, f_inprofit, c_totrustcontractid, d_repurchasedate, f_chengoutbalance, c_exporting, jy_fundid, jy_contractbh, jy_custid, jy_tocustid, jy_fare, c_trustcontractid, f_taagencyfare, f_taregisterfare, d_cdate_jy, jy_adjust, jy_subfundid, jy_adjust1114, jy_cdate, c_bankacco, c_bankname, c_nameinbank, f_riskcapital, f_replenishriskcapital, c_fromfundcode, c_fromtrustcontractid, c_trustagencyno, l_rdmschserialno, f_redeemprofit, f_redeemproyieldrate, d_redeemprobigdate, d_redeemproenddate, c_changeownerincomebelong, l_midremitserialno, c_fromtype, c_iscycinvest, l_fromserialno, l_frominterestconserialno, c_changeownerinterest, c_msgsendflag, l_sharedelaydays, c_istodayconfirm, f_newincome, f_floorincome, l_incomeremitserialno, c_isnetting, l_bankserialno, c_subfundcode, f_chengoutsum, f_chengoutprofit, l_confirmtransserialno, c_shareadjustgzexpflag, c_issend, c_exchangeflag, yh_date_1112, l_banktocontractserialno, c_payfeetype, c_tobankno, c_tobankacco, c_tobankname, c_tonameinbank, c_tobanklinecode, c_tobankprovincecode, c_tobankcityno, l_assetseperateno, c_sharecserialno, c_redeemprincipaltype, work_date, c_businname) FROM stdin; +-- +-- Data for Name: s017_tdividenddetail; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tdividenddetail (d_cdate, c_cserialno, d_regdate, d_date, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_totalshare, f_unitprofit, f_totalprofit, f_tax, c_flag, f_realbalance, f_reinvestbalance, f_realshares, f_fare, d_lastdate, f_netvalue, f_frozenbalance, f_frozenshares, f_incometax, c_reserve, d_requestdate, c_shareclass, l_contractserialno, l_specprjserialno, f_investadvisorratio, f_transferfee, l_profitserialno, d_exportdate, c_custid, jy_fundid, jy_subfundid, jy_custid, jy_contractbh, jy_profitsn, jy_profitmoney, jy_capitalmoney, jy_adjust, c_reinvestnetvalue, f_transferbalance, l_relatedserialno, 
c_printoperator, c_printauditor, sys_id, work_date, load_date, f_remainshares) FROM stdin; +-- +-- Data for Name: s017_tfundday; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tfundday (d_date, d_cdate, c_fundcode, c_todaystatus, c_status, f_netvalue, f_lastshares, f_lastasset, f_asucceed, f_rsucceed, c_vastflag, f_encashratio, f_changeratio, c_excessflag, f_subscriberatio, c_inputpersonnel, c_checkpersonnel, f_income, f_incomeratio, f_unassign, f_incomeunit, f_totalnetvalue, f_servicefare, f_assign, f_growthrate, c_netvalueflag, f_managefare, d_exportdate, c_flag, f_advisorfee, d_auditdate, f_extendnetvalue, f_extendtotalnetvalue, jy_fundcode, f_yearincomeratio, f_riskcapital, f_totalincome, f_agencyexpyearincomeration, f_agencyexpincomeunit, f_agencyexpincomeration, f_agencyexpincome, c_isspecflag, c_isasync, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tfundinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tfundinfo (c_fundcode, c_fundname, c_moneytype, c_managername, c_trusteecode, f_parvalue, f_issueprice, c_trusteeacco, d_issuedate, d_setupdate, f_maxbala, f_maxshares, f_minbala, f_minshares, l_elimitday, l_slimitday, l_alimitday, l_mincount, l_climitday, f_maxallot, f_maxredeem, c_fundcharacter, c_fundstatus, c_subscribemode, l_timelimit, l_subscribeunit, c_sharetypes, c_issuetype, f_factcollect, d_failuedate, f_allotratio, c_feeratiotype1, c_feeratiotype2, c_feetype, c_exceedpart, c_bonustype, c_forceredeem, c_interestdealtype, f_redeemfareratio, f_changefareratio, f_managerfee, f_right, c_property, d_evendate, f_totalbonus, c_changefree, c_reportcode, c_backfarecal, l_moneydate, l_netprecision, c_corpuscontent, f_corpusratio, c_farecaltype, l_liquidateallot, l_liquidateredeem, l_liquidatebonus, l_taspecialacco, c_fareprecision, d_issueenddate, c_farebelongasset, l_liquidatechange, l_liquidatefail, l_liquidateend, c_sharedetail, c_trusteebankname, c_boursetradeflag, c_fundenglishname, l_bankaccono, c_cleanflag, c_precision, c_upgradeflag, c_isdeal, c_farecltprecision, c_balanceprecision, c_shareprecision, c_bonusprecision, c_interestprecision, f_maxallotasset, f_maxallotshares, c_foreigntrustee, l_tnconfirm, c_rationallotstatus, f_trusteefee, c_fundacco, c_financetype, l_liquidatechangein, c_custname, c_identitytype, c_custtype, c_identityno, c_deductschemecode, c_customermanager, c_templateid, f_pr0, f_deductratio, c_farecalculatetype, c_saletype, l_maxcount, l_zhallotliqdays, l_zhredeemliqdays, f_liqasset, l_zhallotexpdays, l_zhredeemexpdays, c_limitmode, c_ordermode, c_acntlmtdealmode, l_informdays, c_allowpartredeem, c_fundendmode, f_fundendagio, c_minbalalimitisconfirm, c_gradetype, c_qryfreqtype, l_qrydaysltd, d_contractenddate, c_useinopenday, c_allotcalinterst, c_fundrisk, c_exitallot, c_subinterestcalc, c_earlyexitredfee, c_navexpfqy, l_navexpday, c_isbounded, c_earlyexitfeecalc, c_designdptid, c_fixeddividway, c_trusttype, f_maxnaturalmoney, c_projectid, c_trustclass, f_trustscale, c_structflag, c_priconveyflag, c_repurchasetype, c_iswholerepurchase, f_repurchaseminbala, c_repurchasemainbody, c_canelyrepurchase, c_earlybacktime, c_repurchaseprice, c_premiumpaymenttime, c_liquisource, l_period, c_canextensionflag, c_canelyliquidflag, c_trustassetdesc, c_returnside, c_returnpaymentway, c_returnbase, c_refepaymentway, c_refeside, c_refebase, f_warnline, f_stopline, f_collectinterest, f_durationinterest, f_investadvisorratio, c_bonusschema, c_guaranteetype, c_guaranteedesc, c_expectedyieldtype, f_minexpectedyield, 
f_maxexpectedyield, c_incomecycletype, f_incomecyclevalue, c_subaccotype, c_allotaccotype, c_fundtype, c_cootype, c_projecttype, c_investdirection, c_investdirectionfractionize, c_industrydetail, c_initeresttype, c_isextended, d_extenddate, c_dealmanagetype, c_investarea, c_projectcode, c_fundshortname, c_contractid, c_functype, c_specialbusintype, c_investindustry, c_managetype, c_area, c_risk, c_iscommitteedisscuss, c_structtype, c_commendplace, l_npmaxcount, c_client, c_clientcusttype, c_clientidtype, c_clientidno, c_clientbankname, c_clientaccono, c_clientaddress, c_clientzipcode, c_clientphoneno1, c_clientphoneno2, c_clientfax, c_beneficiary, c_collectbankname, c_collectbankno, c_collectaccountname, c_collectbankacco, c_keeperbankname, c_keeperaccountname, c_keeperaccountno, c_keepername, c_keepercorporation, c_keeperaddress, c_keeperzipcode, c_keeperphoneno1, c_keeperphoneno2, c_keeperfax, c_incomedistributetype, c_alarmline, c_stoplossline, f_investadvisorfee, c_investadvisordeduct, c_capitalacco, c_stockacconame, c_stocksalesdept, c_thirdpartybankno, c_thirdpartybankname, c_thirdpartyacconame, c_thirdpartyaccono, c_investadvisor, c_investadvisorbankno, c_investadvisorbankname, c_investadvisoracconame, c_investadvisoraccono, c_investadvisorcorporation, c_investadvisoraddress, c_investadvisorzipcode, c_investadvisorphoneno1, c_investadvisorphoneno2, c_investadvisorfax, c_authdelegate, c_loanfinanceparty, c_loanfinancepartycorporation, c_loanfinancepartyaddress, c_loanfinancepartyzipcode, c_loanfinancepartyphoneno1, c_loanfinancepartyphoneno2, c_loanfinancepartyfax, c_loaninteresttype, f_loaninterestrate, f_loanduration, c_loanmanagebank, f_loanmanagefee, f_loanfinancecost, f_creditattornduration, f_creditattorninterestduration, f_creditattornprice, f_billattornduration, f_billattorninterestduration, f_billattornprice, c_stkincfincparty, c_stkincfincpartycorporation, c_stkincfincpartyaddress, c_stkincfincpartyzipcode, c_stkincfincpartyphoneno1, c_stkincfincpartyphoneno2, c_stkincfincpartyfax, c_stkincincomeannualizedrate, c_stkincinteresttype, f_stkincattornprice, f_stkincattornduration, f_stkincbail, f_stkincfinccost, c_stkincmemo1, c_stkincmemo2, c_debtincfincparty, c_debtincfincpartycorporation, c_debtincfincpartyaddress, c_debtincfincpartyzipcode, c_debtincfincpartyphoneno1, c_debtincfincpartyphoneno2, c_debtincfincpartyfax, c_debtincincomerate, c_debtincinteresttype, f_debtincattornprice, f_debtincattornduration, f_debtincbail, f_debtincfinccost, c_debtincmemo1, c_othinvfincparty, c_othinvfincpartycorporation, c_othinvfincpartyaddress, c_othinvfincpartyzipcode, c_othinvfincpartyphoneno1, c_othinvfincpartyphoneno2, c_othinvfincpartyfax, f_othinvfinccost, c_othinvmemo1, c_othinvmemo2, c_othinvmemo3, c_banktrustcoobank, c_banktrustproductname, c_banktrustproductcode, c_banktrustundertakingletter, c_trustgovgovname, c_trustgovprojecttype, c_trustgovcootype, c_trustgovoptype, c_housecapital, c_houseispe, c_tradetype, c_businesstype, c_trustname, c_trustidtype, c_trustidno, d_trustidvaliddate, c_trustbankname, c_trustaccounttype, c_trustnameinbank, c_zhtrustbankname, c_zhtrustbankacco, c_issecmarket, c_fundoperation, c_trustmanager, c_tradeother, c_watchdog, c_memo, c_benefittype, c_redeemaccotype, c_bonusaccotype, c_fundendaccotype, c_collectfailaccotype, d_lastmodifydate, c_shareholdlimtype, c_redeemtimelimtype, c_isprincipalrepayment, c_principalrepaymenttype, l_interestyeardays, l_incomeyeardays, c_capuseprovcode, c_capusecitycode, c_capsourceprovcode, c_banktrustcoobankcode, 
c_banktrustisbankcap, c_trusteefeedesc, c_managefeedesc, c_investfeedesc, f_investadvisordeductratio, c_investdeductdesc, c_investadvisor2, f_investadvisorratio2, f_investadvisordeductratio2, c_investfeedesc2, c_investdeductdesc2, c_investadvisor3, f_investadvisorratio3, f_investadvisordeductratio3, c_investfeedesc3, c_investdeductdesc3, c_profitclassdesc, c_deductratiodesc, c_redeemfeedesc, l_defaultprecision, c_allotfeeaccotype, c_isposf, c_opendaydesc, c_actualmanager, c_subindustrydetail, c_isbankleading, c_subprojectcode, c_iscycleinvest, f_liquidationinterest, c_liquidationinteresttype, c_isbonusinvestfare, c_subfeeaccotype, c_redeemfeeaccotype, c_fundrptcode, c_ordertype, c_flag, c_allotliqtype, l_sharelimitday, c_iseverydayopen, c_tradebynetvalue, c_isstage, c_specbenfitmemo, d_effectivedate, c_issueendflag, c_resharehasrdmfee, jy_fundcode, jy_fundid, jy_subfundid, jy_dptid, c_iswealth, c_interestcalctype, c_allotinterestcalctype, c_isriskcapital, c_fundstatus_1225, c_isincomeeverydaycalc, c_isredeemreturninterest, c_isrefundrtninterest, d_estimatedsetupdate, f_estimatedfactcollect, c_isfinancialproducts, c_fundredeemtype, c_trademanualinput, f_clientmanageration, c_profitclassadjustment, c_mainfundcode, c_contractsealoff, c_permitnextperiod, c_preprofitschematype, c_fundredeemprofit, f_incomeration, c_incomecalctype, c_allocateaccoid, c_outfundcode, c_matchprofitclass, l_lastdays, c_contractprofitflag, c_agencysaleliqtype, l_delaydays, c_profitclassperiod, c_reportshowname, c_currencyincometype, c_beforeredeemcapital, c_contractversion, c_confirmacceptedflag, c_selectcontract, f_schemainterest, c_riskgrade, l_sharedelaydays, l_reservationdays, c_transfertype, c_schemavoluntarily, l_schemadetaildata, c_schemadetailtype, c_iscurrencyconfirm, c_allowmultiaccobank, d_capverif, c_templatetype, c_capitalprecision, c_fundno, c_profittype, d_paydate, d_shelvedate, d_offshelvedate, c_schemabegindatetype, l_schemabegindatedays, c_isautoredeem, c_isnettingrequest, c_issuingquotedtype, d_firstdistributedate, c_bonusfrequency, c_interestbigdatetype, c_gzdatatype, f_allotfareratio, f_subfareratio, c_begindatebeyond, c_profitnotinterest, c_setuplimittype, c_limitredeemtype, c_bonusfrequencytype, c_rfaccotype, c_capitalfee, c_exceedflag, c_enableecd, c_isfixedtrade, c_profitcaltype, f_ominbala, f_stepbala, c_remittype, c_interestcycle, c_repayguaranteecopy, c_repaytype, c_fundprofitdes, c_fundinfodes, c_riskeval, l_maxage, l_minage, c_fundriskdes, mig_l_assetid, l_faincomedays, c_producttype, c_otherbenefitproducttype, c_isotc, c_iseverydayprovision, c_incometogz, c_setuptransfundacco, c_issuefeeownerrequired, c_calcinterestbeforeallot, c_islimit300wnature, c_allowoverflow, c_trustfundtype, c_disclose, c_collectaccoid, c_isissuebymarket, c_setupstatus, c_isentitytrust, l_liquidatesub, c_incomeassigndesc, c_keeporgancode, d_defaultbegincacldate, c_zcbborrower, c_zcbborroweridno, c_zcbremittype, c_registcode, c_redeeminvestaccotype, c_bonusinvestaccotype, c_isabsnotopentrade, l_interestdiffdays, c_outfundstatus, c_reqsyntype, c_allredeemtype, c_isabsopentrade, c_funddesc, l_allotliquidays, l_subliquidays, c_autoupcontractenddaterule, c_fcsubaccotype, c_fcallotaccotype, c_fcredeemaccotype, c_fcbonusaccotype, c_captranslimitflag, c_redeemprincipaltype, c_interestcalcdealtype, c_collectconfirm, d_oldcontractenddate, c_tnvaluation, c_contractendnotify, c_rdmfeebase, c_exceedcfmratio, c_allowallotcustlimittype, c_yeardayscalctype, c_iscompoundinterest, c_dbcfm, c_limitaccountstype, c_cycleinvestrange, 
c_tncheckmode, c_enableearlyredeem, c_ispurceandredeemset, c_perfpaydealtype, c_allowappend, c_allowredeem, c_inputstatus, c_profitbalanceadjust, c_profitperiodadjust, c_autogeneratecontractid, c_transferneednetting, underwrite, undertook, undertake, c_issmsend, d_contractshortenddate, d_contractlongenddate, c_assetseperatefundcodesrc, f_averageprofit, c_currencycontractlimittype, l_profitlastdays, l_liquidationlastdays, c_arlimitincludeallreq, c_reqfundchange, c_dealnetvaluerule, c_contractdealtype, c_bonusplanbeginday, c_contractbalaupright, c_isneedinterestrate, c_isneedexcessratio, c_riskgraderemark, c_lossprobability, c_suitcusttype, c_createbonusschema, d_closedenddate, c_timelimitunit, c_exceedredeemdealtype, c_profitperiod, l_navgetintervaldays, load_date, sys_id, work_date, c_limittransfertype, c_transaccotype, c_incometaxbase, c_isredeemfareyearcalc, c_otherbenefitinputmode, c_aftdefaultinterestdeducttype, c_allowzerobalanceconfirm, c_incomejoinassign, l_liquidateliqbonus, c_predefaultinterestdeducttype, c_worktype, c_defaultinterestadduptype, c_issupportsubmode, f_expectedyield, c_recodecode, l_liquidatetransfer, c_ispayincometax, c_groupmainfundcode, c_redeemfeesplittype, c_capitalfromcrmorta, c_needcalcdefaultinterest, c_issuercode, l_redeemfareyeardays, c_floatyield, l_minriskscore, c_islocalmoneytypecollect) FROM stdin; +-- +-- Data for Name: s017_tsharecurrents_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tsharecurrents_all (d_cdate, c_cserialno, c_businflag, d_requestdate, c_requestno, c_custno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_occurshares, f_occurbalance, f_lastshares, f_occurfreeze, f_lastfreezeshare, c_summary, f_gainbalance, d_sharevaliddate, c_bonustype, c_custtype, c_shareclass, c_bourseflag, d_exportdate, l_contractserialno, c_issend, c_sendbatch, work_date) FROM stdin; +-- +-- Data for Name: s017_ttrustclientinfo_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_ttrustclientinfo_all (c_custno, c_custtype, c_custname, c_shortname, c_helpcode, c_identitytype, c_identityno, c_zipcode, c_address, c_phone, c_faxno, c_mobileno, c_email, c_sex, c_birthday, c_vocation, c_education, c_income, c_contact, c_contype, c_contno, c_billsendflag, c_callcenter, c_internet, c_secretcode, c_nationality, c_cityno, c_lawname, c_shacco, c_szacco, c_broker, f_agio, c_memo, c_reserve, c_corpname, c_corptel, c_specialcode, c_actcode, c_billsendpass, c_addressinvalid, d_appenddate, d_backdate, c_invalidaddress, c_backreason, c_modifyinfo, c_riskcontent, l_querydaysltd, c_customermanager, c_custproperty, c_custclass, c_custright, c_daysltdtype, d_idvaliddate, l_custgroup, c_recommender, c_recommendertype, d_idnovaliddate, c_organcode, c_othercontact, c_taxregistno, c_taxidentitytype, c_taxidentityno, d_legalvaliddate, c_shareholder, c_shareholderidtype, c_shareholderidno, d_holderidvaliddate, c_leader, c_leaderidtype, c_leaderidno, d_leadervaliddate, c_managercode, c_linemanager, c_clientinfoid, c_provincecode, c_countytown, c_phone2, c_clienttype, c_agencyno, c_industrydetail, c_isqualifiedcust, c_industryidentityno, c_lawidentitytype, c_lawidentityno, d_lawidvaliddate, d_conidvaliddate, c_conisrevmsg, c_conmobileno, c_conmoaddress, c_conzipcode, c_conphone1, c_conphone2, c_conemail, c_confaxno, c_incomsource, c_zhidentityno, c_zhidentitytype, c_eastcusttype, jy_custid, c_idtype201201030, c_emcontact, c_emcontactphone, c_instiregaddr, c_regcusttype, c_riskgrade, c_riskgraderemark, d_idvaliddatebeg, 
d_industryidvaliddatebeg, d_industryidvaliddate, c_incomesourceotherdesc, c_vocationotherdesc, c_businscope, d_conidvaliddatebeg, d_lawidvaliddatebeg, c_regmoneytype, f_regcapital, c_orgtype, c_contrholderno, c_contrholdername, c_contrholderidtype, c_contrholderidno, d_contrholderidvalidatebeg, d_contrholderidvalidate, c_responpername, c_responperidtype, c_responperidno, d_responperidvalidatebeg, d_responperidvalidate, c_lawphone, c_contrholderphone, c_responperphone, c_consex, c_conrelative, l_riskserialno, c_convocation, c_iscustrelated, c_businlicissuorgan, c_manageridno, c_manageridtype, c_managername, d_companyregdate, c_electronicagreement, c_householdregno, c_guardianrela, c_guardianname, c_guardianidtype, c_guardianidno, c_isfranchisingidstry, c_franchidstrybusinlic, c_workunittype, c_normalresidaddr, c_domicile, c_finainvestyears, c_parentidtype, c_parentidno, c_videono, c_bonustype, d_retirementdate, c_issendbigcustbill, c_idaddress, c_isproinvestor, c_sendkfflag, c_sendkfcause, c_sendsaflag, c_sendsacause, c_custrelationchannel, c_companytype, c_businlocation, c_custodian, d_elecsigndate, d_riskinputdate, c_circno, c_financeindustrydetail, c_outclientinfoid, d_duediligencedate, c_duediligencestatus, c_inputstatus, c_address2, c_reportcusttype, c_reportcusttypedetail, c_custsource, work_date) FROM stdin; +-- +-- Data for Name: sys_stat_error_log; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY sys_stat_error_log (proc_name, tab_level, step_no, step_desc, begin_time, end_time, workdate, row_num, elapsed, all_elapsed, sql_code, sql_errm) FROM stdin; +-- +-- Name: ks0_fund_base_26 pk_ks0_fund_base_26; Type: CONSTRAINT; Schema: sync; Owner: gregsun +-- +ALTER TABLE ONLY ks0_fund_base_26 + ADD CONSTRAINT pk_ks0_fund_base_26 PRIMARY KEY (id1, acc_cd, ins_cd); +-- +-- PostgreSQL database dump complete +-- +create table newtab as + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + null, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE::date) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE 
AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + /*WHERE T1.C_FLAG = '0'*/) A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + -- WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + -- WHERE ST2.C_FLAG = '0' + ) B + ON A.C_FUNDCODE = B.C_FUNDCODE + /* + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO*/; +DROP SCHEMA sync cascade; +NOTICE: drop cascades to 16 other objects +DETAIL: drop cascades to function func_getlastnetvalue(varchar2,date) +drop cascades to table b03_ts_remetrade +drop cascades to table b03_ts_remetrade_bak +drop cascades to table ks0_fund_base_26 +drop cascades to table p +drop cascades to table s017_taccoinfo +drop cascades to table s017_tacconet +drop cascades to table s017_tagencyinfo +drop cascades to table s017_tconfirm_all +drop cascades to table s017_tdividenddetail +drop cascades to table s017_tfundday +drop cascades to table s017_tfundinfo +drop cascades to table s017_tsharecurrents_all +drop cascades to table s017_ttrustclientinfo_all +drop cascades to table sys_stat_error_log +drop cascades to table newtab diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 58485cf1..036a73c3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -164,3 +164,6 @@ test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table # This runs TBase specific tests test: tbase_explain + +test: redistribute_custom_types pl_bugs +test: nestloop_by_shard diff --git a/src/test/regress/sql/pl_bugs.sql b/src/test/regress/sql/pl_bugs.sql new file mode 100644 index 00000000..0059dc90 --- /dev/null +++ b/src/test/regress/sql/pl_bugs.sql @@ -0,0 +1,2052 @@ +CREATE SCHEMA sync; + +SET search_path = sync, pg_catalog; +set enable_oracle_compatible to on; + +-- +-- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun +-- + +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric + LANGUAGE plpgsql + AS $$ + declare v_netvalue text; +begin + begin + select p1 + into v_netvalue + from p + limit 1; + exception + when no_data_found then + return 1; + + end; + return 1; +end; + $$; + + +-- +-- Name: sp_b03_ts_remetrade(varchar2, varchar2, varchar2, varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE function sp_b03_ts_remetrade(p_start_date varchar2, p_work_date varchar2, INOUT err_num varchar2 DEFAULT 0, INOUT err_msg varchar2 DEFAULT NULL::varchar2) + LANGUAGE plpgsql + AS $$ + declare + V_START_DATE DATE; + V_END_DATE DATE; + V_WORK_DATE DATE; + V_SP_NAME VARCHAR(30); + V_TAB_LEVEL VARCHAR(20); + V_LOG_STEP_NO VARCHAR(20); + V_LOG_BEGIN_TIME DATE := SYSDATE; + 
V_LOG_END_TIME DATE; + V_LOG_ROWCOUNT NUMBER := 0; + V_ELAPSED NUMBER; + V_ALL_ELAPSED NUMBER; + V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; +BEGIN + + V_SP_NAME := 'SP_B03_TS_REMETRADE'; + V_TAB_LEVEL := 'B'; + + IF P_START_DATE IS NULL + THEN + RAISE EXCEPTION 'P_START_DATE IS NULL!'; + ELSE + V_START_DATE := TO_DATE(P_START_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_WORK_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_END_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + + + + V_LOG_STEP_NO := 'STEP_01'; + V_STEP_DESC := '清除目标表数据'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_STEP_DESC + , + V_LOG_BEGIN_TIME + , + V_LOG_END_TIME + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + V_ELAPSED + , + V_ALL_ELAPSED); + + CALL SP_PUB_DEL_TB('B03_TS_REMETRADE'); + /*DELETE FROM B03_TS_REMETRADE Y + WHERE Y.ENDDATE >=V_START_DATE;*/ + + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_LOG_BEGIN_TIME + , + SYSDATE::DATE + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC + , + V_ALL_ELAPSED); + + V_LOG_STEP_NO := 'STEP_02'; + V_STEP_DESC := '插入目标表B03_TS_REMETRADE'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_STEP_DESC, + V_LOG_BEGIN_TIME, + V_LOG_END_TIME, + V_WORK_DATE, + V_LOG_ROWCOUNT, + V_ELAPSED, + V_ALL_ELAPSED); + + INSERT INTO B03_TS_REMETRADE + (C_FUNDCODE, + C_FUNDNAME, + C_FUNDACCO, + F_NETVALUE, + C_AGENCYNAME, + C_CUSTNAME, + D_DATE, + D_CDATE, + F_CONFIRMBALANCE, + F_TRADEFARE, + F_CONFIRMSHARES, + F_RELBALANCE, + F_INTEREST, + INFO, + WORK_DATE, + LOAD_DATE) + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + V_WORK_DATE, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM 
S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + WHERE T1.C_FLAG = '0') A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + WHERE ST2.C_FLAG = '0') B + ON A.C_FUNDCODE = B.C_FUNDCODE + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO; + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_LOG_BEGIN_TIME, + SYSDATE, + V_WORK_DATE, + V_LOG_ROWCOUNT, + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC, + V_ALL_ELAPSED); + ERR_NUM := 0; + ERR_MSG := 'NORMAL,SUCCESSFUL COMPLETION'; +END; + $$; + + +-- +-- Name: sp_pub_del_tb(varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + LANGUAGE plpgsql + AS $$ + declare n_sql varchar2(4000); +begin + + n_sql := 'truncate table '||p_tab_name; + + execute immediate n_sql; +exception + when no_data_found then null; + when others then raise; +end ; + $$; + + +-- +-- Name: sp_pub_insert_log_date(varchar2, varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_step_desc varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ + declare + BEGIN + INSERT INTO SYNC.SYS_STAT_ERROR_LOG + (PROC_NAME + ,TAB_LEVEL + ,STEP_NO + ,STEP_DESC + ,BEGIN_TIME + ,END_TIME + ,WORKDATE + ,ROW_NUM + ,ELAPSED + ,ALL_ELAPSED) + VALUES + (P_IN_PROC_NAME + ,P_IN_TAB_LEVEL + ,P_IN_STEP_NO + ,P_IN_STEP_DESC + ,P_IN_BEGIN_TIME + ,P_IN_END_TIME + ,P_IN_WORK_DATE + ,P_IN_ROW_NUM + ,P_IN_ELAPSED + ,P_IN_ALL_ELAPSED); + COMMIT; + END ; + $$; + +-- +-- Name: sp_pub_update_log_date(varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ BEGIN + UPDATE SYNC.SYS_STAT_ERROR_LOG + SET END_TIME = P_IN_END_TIME + ,ROW_NUM = P_IN_ROW_NUM + ,ELAPSED = P_IN_ELAPSED + ,ALL_ELAPSED = P_IN_ALL_ELAPSED + WHERE PROC_NAME = P_IN_PROC_NAME + AND TAB_LEVEL = P_IN_TAB_LEVEL + AND STEP_NO = P_IN_STEP_NO + AND BEGIN_TIME = P_IN_BEGIN_TIME + AND 
WORKDATE = P_IN_WORK_DATE; + COMMIT; + END ; + $$; + + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: b03_ts_remetrade; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE b03_ts_remetrade ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: b03_ts_remetrade_bak; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE b03_ts_remetrade_bak ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: ks0_fund_base_26; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE ks0_fund_base_26 ( + id1 numeric(48,0) NOT NULL, + acc_cd character varying(500) NOT NULL, + tdate timestamp(0) without time zone NOT NULL, + ins_cd character varying(500) NOT NULL, + cost_price_asset numeric(30,8), + pcol character varying(50) +) +DISTRIBUTE BY SHARD (id1) to GROUP default_group; + +-- +-- Name: p; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE p ( + p1 text, + p2 text +) +DISTRIBUTE BY HASH (p1); + + +-- +-- Name: s017_taccoinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_taccoinfo ( + c_custno character varying(30) NOT NULL, + c_accounttype character(1), + c_fundacco character varying(30), + c_agencyno character(3), + c_netno character varying(30), + c_childnetno character varying(30), + d_opendate timestamp(0) without time zone, + d_lastmodify timestamp(0) without time zone, + c_accostatus character(1), + c_freezecause character(1), + d_backdate timestamp(0) without time zone, + l_changetime numeric(10,0), + d_firstinvest timestamp(0) without time zone, + c_password character varying(100), + c_bourseflag character(1), + c_operator character varying(100), + jy_custid numeric(10,0), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; + + +-- +-- Name: s017_tacconet; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tacconet ( + c_fundacco character varying(30) NOT NULL, + c_agencyno character varying(6), + c_netno character varying(30), + c_tradeacco character varying(100), + c_openflag character varying(2), + c_bonustype character varying(2), + c_bankno character varying(500), + c_bankacco character varying(500), + c_nameinbank character varying(1000), + d_appenddate timestamp(0) without time zone, + c_childnetno character varying(30), + c_tradeaccobak character varying(100), + c_bankname 
character varying(500), + c_banklinecode character varying(100), + c_channelbankno character varying(30), + c_bankprovincecode character varying(30), + c_bankcityno character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundacco) to GROUP default_group; + + +-- +-- Name: s017_tagencyinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tagencyinfo ( + c_agencyno character varying(6) NOT NULL, + c_agencyname character varying(1000), + c_fullname character varying(1000), + c_agncyaddress character varying(500), + c_agncyzipcode character varying(30), + c_agncycontact character varying(30), + c_agncyphone character varying(100), + c_agncyfaxno character varying(100), + c_agncymail character varying(100), + c_agncybankno character varying(24), + c_agncybankacco character varying(100), + c_agncybankname character varying(500), + d_agncyregdate timestamp(0) without time zone, + c_agncystatus character varying(2), + d_lastdate timestamp(0) without time zone, + c_agencytype character varying(2), + c_detail character varying(2), + c_right character varying(2), + c_zdcode character varying(30), + l_liquidateredeem numeric(10,0), + l_liquidateallot numeric(10,0), + l_liquidatebonus numeric(10,0), + l_liquidatesub numeric(10,0), + c_sharetypes character varying(30), + f_agio numeric(5,4), + c_ztgonestep character varying(2), + c_preassign character varying(2), + l_cserialno numeric(10,0), + c_comparetype character varying(2), + c_liquidatetype character varying(2), + c_multitradeacco character varying(2), + c_iversion character varying(6), + c_imode character varying(2), + c_changeonstep character varying(2), + f_outagio numeric(5,4), + f_agiohint numeric(5,4), + f_outagiohint numeric(5,4), + c_allotliqtype character varying(2), + c_redeemliqtype character varying(2), + c_centerflag character varying(2), + c_netno character varying(6), + c_littledealtype character varying(2), + c_overtimedeal character varying(2), + d_lastinputtime timestamp(0) without time zone, + f_interestrate numeric(5,4), + c_clearsite character varying(2), + c_isdeal character varying(2), + c_agencyenglishname character varying(100), + l_fundaccono numeric(10,0), + c_rationflag character varying(2), + c_splitflag character varying(2), + c_tacode character varying(30), + c_outdataflag character varying(2), + c_hasindex character varying(2), + c_transferbyadjust character varying(2), + c_sharedetailexptype character varying(2), + c_navexptype character varying(2), + c_ecdmode character varying(2), + c_agencytypedetail character varying(2), + c_advanceshrconfirm character varying(2), + c_ecdversion character varying(2), + c_capmode character varying(2), + c_internetplatform character varying(2), + c_capautoarrive character varying(2), + c_outcapitaldata character varying(30), + c_ecdcheckmode character varying(30), + c_ecddealmode character varying(30), + c_fileimpmode character varying(30), + c_isotc character varying(2), + c_enableecd character varying(30), + c_autoaccotype character varying(30), + c_tncheckmode numeric(10,0), + c_captureidinfo character varying(30), + c_realfreeze character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_agencyno) to GROUP default_group; + + +-- +-- Name: s017_tconfirm_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tconfirm_all ( + 
c_businflag character(2) NOT NULL, + d_cdate timestamp(0) without time zone, + c_cserialno character varying(100), + d_date timestamp(0) without time zone, + l_serialno numeric(10,0), + c_agencyno character(3), + c_netno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + f_confirmbalance numeric(16,2), + f_confirmshares numeric(16,2), + f_tradefare numeric(16,2), + f_tafare numeric(16,2), + f_stamptax numeric(16,2), + f_backfare numeric(16,2), + f_otherfare1 numeric(16,2), + f_interest numeric(16,2), + f_interesttax numeric(16,2), + f_totalfare numeric(16,2), + f_agencyfare numeric(16,2), + f_netvalue numeric(12,4), + f_frozenbalance numeric(16,2), + f_unfrozenbalance numeric(16,2), + c_status character(1), + c_cause character varying(100), + c_taflag character(1), + c_custtype character(1), + c_custno character varying(30), + f_gainbalance numeric(16,2), + f_orifare numeric(16,2), + c_requestendflag character(1), + f_unbalance numeric(16,2), + f_unshares numeric(16,2), + c_reserve character varying(500), + f_interestshare numeric(16,2), + f_chincome numeric(16,2), + f_chshare numeric(16,2), + f_confirmincome numeric(16,2), + f_oritradefare numeric(16,2), + f_oritafare numeric(16,2), + f_oribackfare numeric(16,2), + f_oriotherfare1 numeric(16,2), + c_requestno character varying(100), + f_balance numeric(16,2), + f_shares numeric(16,2), + f_agio numeric(5,4), + f_lastshares numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_othercode character varying(30), + c_otheracco character varying(30), + c_otheragency character(3), + c_othernetno character varying(30), + c_bonustype character(1), + c_foriginalno character varying(500), + c_exceedflag character(1), + c_childnetno character varying(30), + c_othershare character(1), + c_actcode character(3), + c_acceptmode character(1), + c_freezecause character(1), + c_freezeenddate character varying(100), + f_totalbalance numeric(16,2), + f_totalshares numeric(16,2), + c_outbusinflag character(3), + c_protocolno character varying(30), + c_memo character varying(500), + f_registfare numeric(16,2), + f_fundfare numeric(16,2), + f_oriagio numeric(5,4), + c_shareclass character(1), + d_cisdate timestamp(0) without time zone, + c_bourseflag character(1), + c_fundtype character(1), + f_backfareagio numeric(5,4), + c_bankno character varying(30), + c_subfundmethod character varying(30), + c_combcode character varying(30), + f_returnfare numeric(16,2), + c_contractno character varying(100), + c_captype character(1), + l_contractserialno numeric(10,0), + l_othercontractserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + f_transferfee numeric(16,2), + f_oriconfirmbalance numeric(16,2), + f_extendnetvalue numeric(23,15), + l_remitserialno numeric(10,0), + c_zhxtht character varying(500), + c_improperredeem character(1), + f_untradefare numeric(16,2), + f_untradeinfare numeric(16,2), + f_untradeoutfare numeric(16,2), + c_profitnottransfer character(1), + f_outprofit numeric(9,6), + f_inprofit numeric(9,6), + c_totrustcontractid character varying(500), + d_repurchasedate timestamp(0) without time zone, + f_chengoutbalance numeric(16,2), + c_exporting character(1), + jy_fundid numeric(10,0), + jy_contractbh character varying(100), + jy_custid numeric(10,0), + jy_tocustid numeric(10,0), + jy_fare numeric(16,2), + c_trustcontractid character varying(500), + f_taagencyfare numeric(16,2), + f_taregisterfare numeric(16,2), + d_cdate_jy 
timestamp(0) without time zone, + jy_adjust character(1), + jy_subfundid numeric, + jy_adjust1114 character(1), + jy_cdate timestamp(0) without time zone, + c_bankacco character varying(500), + c_bankname character varying(500), + c_nameinbank character varying(1000), + f_riskcapital numeric(16,2), + f_replenishriskcapital numeric(16,2), + c_fromfundcode character varying(30), + c_fromtrustcontractid character varying(500), + c_trustagencyno character varying(100), + l_rdmschserialno numeric(10,0), + f_redeemprofit numeric(16,2), + f_redeemproyieldrate numeric(13,10), + d_redeemprobigdate timestamp(0) without time zone, + d_redeemproenddate timestamp(0) without time zone, + c_changeownerincomebelong character(1), + l_midremitserialno numeric(10,0), + c_fromtype character(1), + c_iscycinvest character(1), + l_fromserialno numeric(10,0), + l_frominterestconserialno numeric(10,0), + c_changeownerinterest character(1), + c_msgsendflag character(1), + l_sharedelaydays numeric(3,0), + c_istodayconfirm character(1), + f_newincome numeric(16,2), + f_floorincome numeric(10,9), + l_incomeremitserialno numeric(10,0), + c_isnetting character(1), + l_bankserialno numeric(10,0), + c_subfundcode character varying(30), + f_chengoutsum numeric(16,2), + f_chengoutprofit numeric(16,2), + l_confirmtransserialno numeric(10,0), + c_shareadjustgzexpflag character(1), + c_issend character(1), + c_exchangeflag character(1), + yh_date_1112 timestamp(0) without time zone, + l_banktocontractserialno numeric(10,0), + c_payfeetype character(1), + c_tobankno character varying(30), + c_tobankacco character varying(500), + c_tobankname character varying(500), + c_tonameinbank character varying(1000), + c_tobanklinecode character varying(100), + c_tobankprovincecode character varying(30), + c_tobankcityno character varying(30), + l_assetseperateno numeric(10,0), + c_sharecserialno character varying(100), + c_redeemprincipaltype character(1), + work_date timestamp(0) without time zone, + c_businname character varying(100) +) +DISTRIBUTE BY SHARD (c_businflag) to GROUP default_group; + + +-- +-- Name: s017_tdividenddetail; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tdividenddetail ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + d_regdate timestamp(0) without time zone, + d_date timestamp(0) without time zone, + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character varying(2), + c_agencyno character varying(6), + c_netno character varying(30), + f_totalshare numeric(16,2), + f_unitprofit numeric(7,4), + f_totalprofit numeric(16,2), + f_tax numeric(16,2), + c_flag character varying(2), + f_realbalance numeric(16,2), + f_reinvestbalance numeric(16,2), + f_realshares numeric(16,2), + f_fare numeric(16,2), + d_lastdate timestamp(0) without time zone, + f_netvalue numeric(7,4), + f_frozenbalance numeric(16,2), + f_frozenshares numeric(16,2), + f_incometax numeric(9,4), + c_reserve character varying(100), + d_requestdate timestamp(0) without time zone, + c_shareclass character varying(30), + l_contractserialno numeric(10,0), + l_specprjserialno numeric(10,0), + f_investadvisorratio numeric(9,8), + f_transferfee numeric(16,2), + l_profitserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + c_custid character varying(30), + jy_fundid numeric, + jy_subfundid numeric, + jy_custid numeric, + jy_contractbh character varying(100), + jy_profitsn numeric, + jy_profitmoney 
numeric, + jy_capitalmoney numeric, + jy_adjust character varying(2), + c_reinvestnetvalue character varying(2), + f_transferbalance numeric(16,2), + l_relatedserialno numeric(10,0), + c_printoperator character varying(100), + c_printauditor character varying(100), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone, + f_remainshares numeric(16,2) +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; + + +-- +-- Name: s017_tfundday; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tfundday ( + d_date timestamp(0) without time zone, + d_cdate timestamp(0) without time zone, + c_fundcode varchar2(30), + c_todaystatus varchar2(2), + c_status varchar2(2), + f_netvalue numeric(7,4), + f_lastshares numeric(16,2), + f_lastasset numeric(16,2), + f_asucceed numeric(16,2), + f_rsucceed numeric(16,2), + c_vastflag varchar2(2), + f_encashratio numeric(9,8), + f_changeratio numeric(9,8), + c_excessflag varchar2(2), + f_subscriberatio numeric(9,8), + c_inputpersonnel varchar2(100), + c_checkpersonnel varchar2(100), + f_income numeric(16,2), + f_incomeratio numeric(9,6), + f_unassign numeric(16,2), + f_incomeunit numeric(10,5), + f_totalnetvalue numeric(7,4), + f_servicefare numeric(16,2), + f_assign numeric(16,2), + f_growthrate numeric(9,8), + c_netvalueflag varchar2(2), + f_managefare numeric(16,2), + d_exportdate timestamp(0) without time zone, + c_flag varchar2(2), + f_advisorfee numeric(16,2), + d_auditdate timestamp(0) without time zone, + f_extendnetvalue numeric(23,15), + f_extendtotalnetvalue numeric(23,15), + jy_fundcode varchar2(30), + f_yearincomeratio numeric(9,6), + f_riskcapital numeric(16,2), + f_totalincome numeric(16,2), + f_agencyexpyearincomeration numeric(9,6), + f_agencyexpincomeunit numeric(10,5), + f_agencyexpincomeration numeric(9,6), + f_agencyexpincome numeric(16,2), + c_isspecflag varchar2(2), + c_isasync varchar2(2), + sys_id varchar2(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone DEFAULT orcl_sysdate() +) +DISTRIBUTE BY HASH (d_date); + + +-- +-- Name: s017_tfundinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tfundinfo ( + c_fundcode character varying(30) NOT NULL, + c_fundname character varying(1000), + c_moneytype character varying(6), + c_managername character varying(100), + c_trusteecode character varying(30), + f_parvalue numeric(7,4), + f_issueprice numeric(12,4), + c_trusteeacco character varying(100), + d_issuedate timestamp(0) without time zone, + d_setupdate timestamp(0) without time zone, + f_maxbala numeric(16,2), + f_maxshares numeric(16,2), + f_minbala numeric(16,2), + f_minshares numeric(16,2), + l_elimitday numeric(10,0), + l_slimitday numeric(10,0), + l_alimitday numeric(10,0), + l_mincount numeric(10,0), + l_climitday numeric(10,0), + f_maxallot numeric(9,8), + f_maxredeem numeric(9,8), + c_fundcharacter character varying(500), + c_fundstatus character varying(2), + c_subscribemode character varying(2), + l_timelimit numeric(10,0), + l_subscribeunit numeric(10,0), + c_sharetypes character varying(30), + c_issuetype character varying(2), + f_factcollect numeric(16,2), + d_failuedate timestamp(0) without time zone, + f_allotratio numeric(9,8), + c_feeratiotype1 character varying(2), + c_feeratiotype2 character varying(2), + c_feetype character varying(2), + c_exceedpart character varying(2), + c_bonustype character varying(2), + c_forceredeem character varying(2), + c_interestdealtype character 
varying(2), + f_redeemfareratio numeric(5,4), + f_changefareratio numeric(5,4), + f_managerfee numeric(7,6), + f_right numeric(5,4), + c_property character varying(2), + d_evendate timestamp(0) without time zone, + f_totalbonus numeric(7,4), + c_changefree character varying(2), + c_reportcode character varying(30), + c_backfarecal character varying(2), + l_moneydate numeric(10,0), + l_netprecision numeric(10,0), + c_corpuscontent character varying(2), + f_corpusratio numeric(5,4), + c_farecaltype character varying(2), + l_liquidateallot numeric(10,0), + l_liquidateredeem numeric(10,0), + l_liquidatebonus numeric(10,0), + l_taspecialacco numeric(10,0), + c_fareprecision character varying(2), + d_issueenddate timestamp(0) without time zone, + c_farebelongasset character varying(2), + l_liquidatechange numeric(10,0), + l_liquidatefail numeric(10,0), + l_liquidateend numeric(10,0), + c_sharedetail character varying(2), + c_trusteebankname character varying(500), + c_boursetradeflag character varying(2), + c_fundenglishname character varying(100), + l_bankaccono numeric(10,0), + c_cleanflag character varying(2), + c_precision character varying(2), + c_upgradeflag character varying(2), + c_isdeal character varying(2), + c_farecltprecision character varying(2), + c_balanceprecision character varying(2), + c_shareprecision character varying(2), + c_bonusprecision character varying(2), + c_interestprecision character varying(2), + f_maxallotasset numeric(16,2), + f_maxallotshares numeric(16,2), + c_foreigntrustee character varying(6), + l_tnconfirm numeric(3,0), + c_rationallotstatus character varying(2), + f_trusteefee numeric(7,6), + c_fundacco character varying(30), + c_financetype character varying(2), + l_liquidatechangein numeric(10,0), + c_custname character varying(500), + c_identitytype character varying(2), + c_custtype character varying(2), + c_identityno character varying(100), + c_deductschemecode character varying(30), + c_customermanager character varying(30), + c_templateid character varying(30), + f_pr0 numeric(7,4), + f_deductratio numeric(5,4), + c_farecalculatetype character varying(2), + c_saletype character varying(2), + l_maxcount numeric(10,0), + l_zhallotliqdays numeric(10,0), + l_zhredeemliqdays numeric(10,0), + f_liqasset numeric(16,2), + l_zhallotexpdays numeric(10,0), + l_zhredeemexpdays numeric(10,0), + c_limitmode character varying(2), + c_ordermode character varying(2), + c_acntlmtdealmode character varying(2), + l_informdays numeric(2,0), + c_allowpartredeem character varying(2), + c_fundendmode character varying(2), + f_fundendagio numeric(10,9), + c_minbalalimitisconfirm character varying(2), + c_gradetype character varying(2), + c_qryfreqtype character varying(2), + l_qrydaysltd numeric(2,0), + d_contractenddate timestamp(0) without time zone, + c_useinopenday character varying(2), + c_allotcalinterst character varying(2), + c_fundrisk character varying(2), + c_exitallot character varying(2), + c_subinterestcalc character varying(2), + c_earlyexitredfee character varying(2), + c_navexpfqy character varying(2), + l_navexpday numeric(10,0), + c_isbounded character varying(2), + c_earlyexitfeecalc character varying(2), + c_designdptid character varying(100), + c_fixeddividway character varying(2), + c_trusttype character varying(2), + f_maxnaturalmoney numeric(16,2), + c_projectid character varying(30), + c_trustclass character varying(2), + f_trustscale numeric(16,2), + c_structflag character varying(2), + c_priconveyflag character varying(2), + c_repurchasetype 
character varying(2), + c_iswholerepurchase character varying(2), + f_repurchaseminbala numeric(16,2), + c_repurchasemainbody character varying(2), + c_canelyrepurchase character varying(2), + c_earlybacktime character varying(2), + c_repurchaseprice character varying(2), + c_premiumpaymenttime character varying(2), + c_liquisource character varying(2), + l_period numeric(3,0), + c_canextensionflag character varying(2), + c_canelyliquidflag character varying(2), + c_trustassetdesc character varying(100), + c_returnside character varying(2), + c_returnpaymentway character varying(2), + c_returnbase character varying(2), + c_refepaymentway character varying(2), + c_refeside character varying(2), + c_refebase character varying(2), + f_warnline numeric(5,4), + f_stopline numeric(5,4), + f_collectinterest numeric(11,8), + f_durationinterest numeric(7,4), + f_investadvisorratio numeric(7,6), + c_bonusschema character varying(2), + c_guaranteetype character varying(2), + c_guaranteedesc character varying(100), + c_expectedyieldtype character varying(2), + f_minexpectedyield numeric(12,4), + f_maxexpectedyield numeric(12,4), + c_incomecycletype character varying(2), + f_incomecyclevalue numeric(10,0), + c_subaccotype character varying(2), + c_allotaccotype character varying(2), + c_fundtype character varying(2), + c_cootype character varying(1000), + c_projecttype character varying(2), + c_investdirection character varying(30), + c_investdirectionfractionize character varying(2), + c_industrydetail character varying(1000), + c_initeresttype character varying(2), + c_isextended character varying(2), + d_extenddate timestamp(0) without time zone, + c_dealmanagetype character varying(2), + c_investarea character varying(2), + c_projectcode character varying(1000), + c_fundshortname character varying(500), + c_contractid character varying(500), + c_functype character varying(2), + c_specialbusintype character varying(1000), + c_investindustry character varying(2), + c_managetype character varying(2), + c_area character varying(500), + c_risk character varying(2), + c_iscommitteedisscuss character varying(2), + c_structtype character varying(2), + c_commendplace character varying(2), + l_npmaxcount numeric(5,0), + c_client character varying(100), + c_clientcusttype character varying(2), + c_clientidtype character varying(2), + c_clientidno character varying(100), + c_clientbankname character varying(100), + c_clientaccono character varying(100), + c_clientaddress character varying(500), + c_clientzipcode character varying(30), + c_clientphoneno1 character varying(100), + c_clientphoneno2 character varying(100), + c_clientfax character varying(100), + c_beneficiary character varying(100), + c_collectbankname character varying(500), + c_collectbankno character varying(6), + c_collectaccountname character varying(500), + c_collectbankacco character varying(100), + c_keeperbankname character varying(500), + c_keeperaccountname character varying(500), + c_keeperaccountno character varying(100), + c_keepername character varying(500), + c_keepercorporation character varying(500), + c_keeperaddress character varying(500), + c_keeperzipcode character varying(30), + c_keeperphoneno1 character varying(100), + c_keeperphoneno2 character varying(100), + c_keeperfax character varying(100), + c_incomedistributetype character varying(2), + c_alarmline character varying(1000), + c_stoplossline character varying(1000), + f_investadvisorfee numeric(12,2), + c_investadvisordeduct character varying(1000), + c_capitalacco 
character varying(500), + c_stockacconame character varying(500), + c_stocksalesdept character varying(500), + c_thirdpartybankno character varying(6), + c_thirdpartybankname character varying(500), + c_thirdpartyacconame character varying(500), + c_thirdpartyaccono character varying(100), + c_investadvisor character varying(500), + c_investadvisorbankno character varying(6), + c_investadvisorbankname character varying(500), + c_investadvisoracconame character varying(500), + c_investadvisoraccono character varying(100), + c_investadvisorcorporation character varying(500), + c_investadvisoraddress character varying(500), + c_investadvisorzipcode character varying(30), + c_investadvisorphoneno1 character varying(100), + c_investadvisorphoneno2 character varying(100), + c_investadvisorfax character varying(100), + c_authdelegate character varying(100), + c_loanfinanceparty character varying(500), + c_loanfinancepartycorporation character varying(500), + c_loanfinancepartyaddress character varying(500), + c_loanfinancepartyzipcode character varying(30), + c_loanfinancepartyphoneno1 character varying(100), + c_loanfinancepartyphoneno2 character varying(100), + c_loanfinancepartyfax character varying(100), + c_loaninteresttype character varying(2), + f_loaninterestrate numeric(7,4), + f_loanduration numeric(5,0), + c_loanmanagebank character varying(500), + f_loanmanagefee numeric(9,2), + f_loanfinancecost numeric(9,2), + f_creditattornduration numeric(5,0), + f_creditattorninterestduration numeric(7,4), + f_creditattornprice numeric(12,2), + f_billattornduration numeric(5,0), + f_billattorninterestduration numeric(7,4), + f_billattornprice numeric(12,2), + c_stkincfincparty character varying(1000), + c_stkincfincpartycorporation character varying(500), + c_stkincfincpartyaddress character varying(500), + c_stkincfincpartyzipcode character varying(30), + c_stkincfincpartyphoneno1 character varying(100), + c_stkincfincpartyphoneno2 character varying(100), + c_stkincfincpartyfax character varying(100), + c_stkincincomeannualizedrate numeric(7,4), + c_stkincinteresttype character varying(2), + f_stkincattornprice numeric(12,2), + f_stkincattornduration numeric(5,0), + f_stkincbail numeric(12,2), + f_stkincfinccost numeric(9,2), + c_stkincmemo1 character varying(1000), + c_stkincmemo2 character varying(1000), + c_debtincfincparty character varying(500), + c_debtincfincpartycorporation character varying(500), + c_debtincfincpartyaddress character varying(500), + c_debtincfincpartyzipcode character varying(30), + c_debtincfincpartyphoneno1 character varying(100), + c_debtincfincpartyphoneno2 character varying(100), + c_debtincfincpartyfax character varying(100), + c_debtincincomerate numeric(7,4), + c_debtincinteresttype character varying(2), + f_debtincattornprice numeric(12,2), + f_debtincattornduration numeric(5,0), + f_debtincbail numeric(12,2), + f_debtincfinccost numeric(9,2), + c_debtincmemo1 character varying(1000), + c_othinvfincparty character varying(500), + c_othinvfincpartycorporation character varying(500), + c_othinvfincpartyaddress character varying(500), + c_othinvfincpartyzipcode character varying(30), + c_othinvfincpartyphoneno1 character varying(100), + c_othinvfincpartyphoneno2 character varying(100), + c_othinvfincpartyfax character varying(100), + f_othinvfinccost numeric(9,2), + c_othinvmemo1 character varying(1000), + c_othinvmemo2 character varying(1000), + c_othinvmemo3 character varying(1000), + c_banktrustcoobank character varying(500), + c_banktrustproductname character 
varying(500), + c_banktrustproductcode character varying(100), + c_banktrustundertakingletter character varying(2), + c_trustgovgovname character varying(500), + c_trustgovprojecttype character varying(1000), + c_trustgovcootype character varying(4), + c_trustgovoptype character varying(4), + c_housecapital character varying(4), + c_houseispe character varying(2), + c_tradetype character varying(2), + c_businesstype character varying(2), + c_trustname character varying(500), + c_trustidtype character varying(2), + c_trustidno character varying(100), + d_trustidvaliddate timestamp(0) without time zone, + c_trustbankname character varying(500), + c_trustaccounttype character varying(2), + c_trustnameinbank character varying(100), + c_zhtrustbankname character varying(500), + c_zhtrustbankacco character varying(100), + c_issecmarket character varying(2), + c_fundoperation character varying(2), + c_trustmanager character varying(100), + c_tradeother character varying(4000), + c_watchdog character varying(500), + c_memo character varying(1000), + c_benefittype character varying(2), + c_redeemaccotype character varying(2), + c_bonusaccotype character varying(2), + c_fundendaccotype character varying(2), + c_collectfailaccotype character varying(2), + d_lastmodifydate timestamp(0) without time zone, + c_shareholdlimtype character varying(2), + c_redeemtimelimtype character varying(2), + c_isprincipalrepayment character varying(2), + c_principalrepaymenttype character varying(2), + l_interestyeardays numeric(3,0), + l_incomeyeardays numeric(3,0), + c_capuseprovcode character varying(30), + c_capusecitycode character varying(30), + c_capsourceprovcode character varying(30), + c_banktrustcoobankcode character varying(30), + c_banktrustisbankcap character varying(2), + c_trusteefeedesc character varying(4000), + c_managefeedesc character varying(4000), + c_investfeedesc character varying(4000), + f_investadvisordeductratio numeric(7,6), + c_investdeductdesc character varying(4000), + c_investadvisor2 character varying(500), + f_investadvisorratio2 numeric(7,6), + f_investadvisordeductratio2 numeric(7,6), + c_investfeedesc2 character varying(4000), + c_investdeductdesc2 character varying(4000), + c_investadvisor3 character varying(500), + f_investadvisorratio3 numeric(7,6), + f_investadvisordeductratio3 numeric(7,6), + c_investfeedesc3 character varying(4000), + c_investdeductdesc3 character varying(4000), + c_profitclassdesc character varying(4000), + c_deductratiodesc character varying(4000), + c_redeemfeedesc character varying(4000), + l_defaultprecision numeric(10,0), + c_allotfeeaccotype character varying(2), + c_isposf character varying(2), + c_opendaydesc character varying(4000), + c_actualmanager character varying(100), + c_subindustrydetail character varying(30), + c_isbankleading character varying(2), + c_subprojectcode character varying(500), + c_iscycleinvest character varying(2), + f_liquidationinterest numeric(13,10), + c_liquidationinteresttype character varying(2), + c_isbonusinvestfare character varying(2), + c_subfeeaccotype character varying(2), + c_redeemfeeaccotype character varying(2), + c_fundrptcode character varying(30), + c_ordertype character varying(2), + c_flag character varying(2), + c_allotliqtype character varying(2), + l_sharelimitday numeric(5,0), + c_iseverydayopen character varying(2), + c_tradebynetvalue character varying(2), + c_isstage character varying(2), + c_specbenfitmemo character varying(4000), + d_effectivedate timestamp(0) without time zone, + 
c_issueendflag character varying(2), + c_resharehasrdmfee character varying(2), + jy_fundcode numeric, + jy_fundid numeric, + jy_subfundid numeric, + jy_dptid numeric, + c_iswealth character varying(2), + c_interestcalctype character varying(2), + c_allotinterestcalctype character varying(2), + c_isriskcapital character varying(2), + c_fundstatus_1225 character varying(2), + c_isincomeeverydaycalc character varying(2), + c_isredeemreturninterest character varying(2), + c_isrefundrtninterest character varying(2), + d_estimatedsetupdate timestamp(0) without time zone, + f_estimatedfactcollect numeric(16,2), + c_isfinancialproducts character varying(2), + c_fundredeemtype character varying(2), + c_trademanualinput character varying(2), + f_clientmanageration numeric(7,6), + c_profitclassadjustment character varying(2), + c_mainfundcode character varying(30), + c_contractsealoff character varying(2), + c_permitnextperiod character varying(2), + c_preprofitschematype character varying(2), + c_fundredeemprofit character varying(2), + f_incomeration numeric(9,8), + c_incomecalctype character varying(2), + c_allocateaccoid character varying(30), + c_outfundcode character varying(500), + c_matchprofitclass character varying(30), + l_lastdays numeric(5,0), + c_contractprofitflag character varying(2), + c_agencysaleliqtype character varying(2), + l_delaydays numeric(3,0), + c_profitclassperiod character varying(2), + c_reportshowname character varying(1000), + c_currencyincometype character varying(2), + c_beforeredeemcapital character varying(2), + c_contractversion character varying(30), + c_confirmacceptedflag character varying(2), + c_selectcontract character varying(2), + f_schemainterest numeric(11,8), + c_riskgrade character varying(30), + l_sharedelaydays numeric(3,0), + l_reservationdays numeric(3,0), + c_transfertype character varying(2), + c_schemavoluntarily character varying(2), + l_schemadetaildata numeric(4,0), + c_schemadetailtype character varying(2), + c_iscurrencyconfirm character varying(2), + c_allowmultiaccobank character varying(2), + d_capverif timestamp(0) without time zone, + c_templatetype character varying(12), + c_capitalprecision character varying(2), + c_fundno character varying(100), + c_profittype character varying(2), + d_paydate timestamp(0) without time zone, + d_shelvedate timestamp(0) without time zone, + d_offshelvedate timestamp(0) without time zone, + c_schemabegindatetype character varying(2), + l_schemabegindatedays numeric(3,0), + c_isautoredeem character varying(2), + c_isnettingrequest character varying(2), + c_issuingquotedtype character varying(2), + d_firstdistributedate timestamp(0) without time zone, + c_bonusfrequency character varying(2), + c_interestbigdatetype character varying(2), + c_gzdatatype character varying(2), + f_allotfareratio numeric(5,4), + f_subfareratio numeric(5,4), + c_begindatebeyond character varying(2), + c_profitnotinterest character varying(2), + c_setuplimittype character varying(2), + c_limitredeemtype character varying(2), + c_bonusfrequencytype character varying(2), + c_rfaccotype character varying(2), + c_capitalfee character varying(2), + c_exceedflag character varying(2), + c_enableecd character varying(2), + c_isfixedtrade character varying(2), + c_profitcaltype character varying(2), + f_ominbala numeric(16,2), + f_stepbala numeric(16,2), + c_remittype character varying(30), + c_interestcycle character varying(30), + c_repayguaranteecopy character varying(30), + c_repaytype character varying(30), + c_fundprofitdes 
character varying(4000), + c_fundinfodes character varying(4000), + c_riskeval character varying(2), + l_maxage numeric(3,0), + l_minage numeric(3,0), + c_fundriskdes character varying(1000), + mig_l_assetid numeric(48,0), + l_faincomedays numeric(10,0), + c_producttype character varying(2), + c_otherbenefitproducttype character varying(2), + c_isotc character varying(2), + c_iseverydayprovision character varying(2), + c_incometogz character varying(2), + c_setuptransfundacco character varying(30), + c_issuefeeownerrequired character varying(2), + c_calcinterestbeforeallot character varying(30), + c_islimit300wnature character varying(2), + c_allowoverflow character varying(30), + c_trustfundtype character varying(30), + c_disclose character varying(2), + c_collectaccoid character varying(30), + c_isissuebymarket character varying(2), + c_setupstatus character varying(30), + c_isentitytrust character varying(2), + l_liquidatesub numeric(10,0), + c_incomeassigndesc character varying(4000), + c_keeporgancode character varying(30), + d_defaultbegincacldate timestamp(0) without time zone, + c_zcbborrower character varying(100), + c_zcbborroweridno character varying(100), + c_zcbremittype character varying(100), + c_registcode character varying(100), + c_redeeminvestaccotype character varying(2), + c_bonusinvestaccotype character varying(2), + c_isabsnotopentrade character varying(2), + l_interestdiffdays numeric(5,0), + c_outfundstatus character varying(2), + c_reqsyntype character varying(2), + c_allredeemtype character varying(2), + c_isabsopentrade character varying(2), + c_funddesc character varying(1000), + l_allotliquidays numeric(3,0), + l_subliquidays numeric(3,0), + c_autoupcontractenddaterule character varying(2), + c_fcsubaccotype character varying(2), + c_fcallotaccotype character varying(2), + c_fcredeemaccotype character varying(2), + c_fcbonusaccotype character varying(2), + c_captranslimitflag character varying(30), + c_redeemprincipaltype character varying(2), + c_interestcalcdealtype character varying(30), + c_collectconfirm character varying(30), + d_oldcontractenddate timestamp(0) without time zone, + c_tnvaluation character varying(30), + c_contractendnotify character varying(2), + c_rdmfeebase character varying(30), + c_exceedcfmratio character varying(30), + c_allowallotcustlimittype character varying(2), + c_yeardayscalctype character varying(2), + c_iscompoundinterest character varying(30), + c_dbcfm character varying(30), + c_limitaccountstype character varying(2), + c_cycleinvestrange character varying(2), + c_tncheckmode character varying(2), + c_enableearlyredeem character varying(2), + c_ispurceandredeemset character varying(30), + c_perfpaydealtype character varying(2), + c_allowappend character varying(2), + c_allowredeem character varying(2), + c_inputstatus character varying(2), + c_profitbalanceadjust character varying(2), + c_profitperiodadjust character varying(2), + c_autogeneratecontractid character varying(2), + c_transferneednetting character varying(100), + underwrite character varying(1000), + undertook character varying(1000), + undertake character varying(1000), + c_issmsend character varying(2), + d_contractshortenddate timestamp(0) without time zone, + d_contractlongenddate timestamp(0) without time zone, + c_assetseperatefundcodesrc character varying(30), + f_averageprofit numeric(11,8), + c_currencycontractlimittype character varying(2), + l_profitlastdays numeric(5,0), + l_liquidationlastdays numeric(5,0), + c_arlimitincludeallreq character 
varying(2), + c_reqfundchange character varying(2), + c_dealnetvaluerule character varying(2), + c_contractdealtype character varying(2), + c_bonusplanbeginday timestamp(0) without time zone, + c_contractbalaupright character varying(2), + c_isneedinterestrate character varying(2), + c_isneedexcessratio character varying(2), + c_riskgraderemark character varying(1000), + c_lossprobability character varying(2), + c_suitcusttype character varying(2), + c_createbonusschema character varying(2), + d_closedenddate timestamp(0) without time zone, + c_timelimitunit character varying(30), + c_exceedredeemdealtype character varying(2), + c_profitperiod character varying(2), + l_navgetintervaldays numeric(3,0), + load_date timestamp(0) without time zone, + sys_id character varying(10) DEFAULT 'S017'::character varying, + work_date timestamp(0) without time zone, + c_limittransfertype character varying(1), + c_transaccotype character varying(1), + c_incometaxbase character varying(1), + c_isredeemfareyearcalc character varying(1), + c_otherbenefitinputmode character varying(1), + c_aftdefaultinterestdeducttype character varying(1), + c_allowzerobalanceconfirm character varying(1), + c_incomejoinassign character varying(1), + l_liquidateliqbonus numeric(10,0), + c_predefaultinterestdeducttype character varying(1), + c_worktype character varying(1), + c_defaultinterestadduptype character varying(1), + c_issupportsubmode character varying(1), + f_expectedyield numeric(14,0), + c_recodecode character varying(40), + l_liquidatetransfer numeric(10,0), + c_ispayincometax character varying(1), + c_groupmainfundcode character varying(6), + c_redeemfeesplittype character varying(1), + c_capitalfromcrmorta character varying(1), + c_needcalcdefaultinterest character varying(1), + c_issuercode character varying(10), + l_redeemfareyeardays numeric(10,0), + c_floatyield character varying(30), + l_minriskscore numeric(3,0), + c_islocalmoneytypecollect character varying(1) +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: s017_tsharecurrents_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tsharecurrents_all ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + c_businflag character(2), + d_requestdate timestamp(0) without time zone, + c_requestno character varying(100), + c_custno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + c_agencyno character(3), + c_netno character varying(30), + f_occurshares numeric(16,2), + f_occurbalance numeric(16,2), + f_lastshares numeric(16,2), + f_occurfreeze numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_summary character varying(100), + f_gainbalance numeric(16,2), + d_sharevaliddate timestamp(0) without time zone, + c_bonustype character(1), + c_custtype character(1), + c_shareclass character(1), + c_bourseflag character varying(20), + d_exportdate timestamp(0) without time zone, + l_contractserialno numeric(10,0), + c_issend character(1), + c_sendbatch character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; + + +-- +-- Name: s017_ttrustclientinfo_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_ttrustclientinfo_all ( + c_custno character varying(30) NOT NULL, + c_custtype character(1), + c_custname character varying(500), + c_shortname character varying(500), + c_helpcode character 
varying(30), + c_identitytype character(1), + c_identityno character varying(500), + c_zipcode character varying(30), + c_address character varying(1000), + c_phone character varying(100), + c_faxno character varying(500), + c_mobileno character varying(100), + c_email character varying(500), + c_sex character(1), + c_birthday character varying(30), + c_vocation character(2), + c_education character(2), + c_income character varying(30), + c_contact character varying(100), + c_contype character(1), + c_contno character varying(100), + c_billsendflag character(1), + c_callcenter character(1), + c_internet character(1), + c_secretcode character varying(30), + c_nationality character(3), + c_cityno character varying(30), + c_lawname character varying(100), + c_shacco character varying(30), + c_szacco character varying(30), + c_broker character varying(100), + f_agio numeric(5,4), + c_memo character varying(4000), + c_reserve character varying(500), + c_corpname character varying(100), + c_corptel character varying(100), + c_specialcode character varying(100), + c_actcode character varying(30), + c_billsendpass character(1), + c_addressinvalid character(1), + d_appenddate timestamp(0) without time zone, + d_backdate timestamp(0) without time zone, + c_invalidaddress character varying(500), + c_backreason character varying(500), + c_modifyinfo character(2), + c_riskcontent character varying(4000), + l_querydaysltd numeric(3,0), + c_customermanager character varying(100), + c_custproperty character(1), + c_custclass character(1), + c_custright character varying(4000), + c_daysltdtype character(1), + d_idvaliddate timestamp(0) without time zone, + l_custgroup numeric(10,0), + c_recommender character varying(100), + c_recommendertype character(1), + d_idnovaliddate timestamp(0) without time zone, + c_organcode character(10), + c_othercontact character varying(100), + c_taxregistno character varying(100), + c_taxidentitytype character(1), + c_taxidentityno character varying(100), + d_legalvaliddate timestamp(0) without time zone, + c_shareholder character varying(500), + c_shareholderidtype character(1), + c_shareholderidno character varying(100), + d_holderidvaliddate timestamp(0) without time zone, + c_leader character varying(500), + c_leaderidtype character(1), + c_leaderidno character varying(100), + d_leadervaliddate timestamp(0) without time zone, + c_managercode character varying(100), + c_linemanager character varying(100), + c_clientinfoid character varying(30), + c_provincecode character varying(30), + c_countytown character varying(1000), + c_phone2 character varying(100), + c_clienttype character(1), + c_agencyno character(3), + c_industrydetail character varying(30), + c_isqualifiedcust character(1), + c_industryidentityno character varying(100), + c_lawidentitytype character(1), + c_lawidentityno character varying(100), + d_lawidvaliddate timestamp(0) without time zone, + d_conidvaliddate timestamp(0) without time zone, + c_conisrevmsg character(1), + c_conmobileno character varying(100), + c_conmoaddress character varying(1000), + c_conzipcode character varying(30), + c_conphone1 character varying(100), + c_conphone2 character varying(100), + c_conemail character varying(100), + c_confaxno character varying(500), + c_incomsource character varying(500), + c_zhidentityno character varying(500), + c_zhidentitytype character(1), + c_eastcusttype character varying(30), + jy_custid numeric(10,0), + c_idtype201201030 character(1), + c_emcontact character varying(500), + c_emcontactphone 
character varying(100), + c_instiregaddr character varying(1000), + c_regcusttype character varying(30), + c_riskgrade character varying(30), + c_riskgraderemark character varying(1000), + d_idvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddate timestamp(0) without time zone, + c_incomesourceotherdesc character varying(1000), + c_vocationotherdesc character varying(1000), + c_businscope character varying(4000), + d_conidvaliddatebeg timestamp(0) without time zone, + d_lawidvaliddatebeg timestamp(0) without time zone, + c_regmoneytype character(3), + f_regcapital numeric(15,2), + c_orgtype character(2), + c_contrholderno character varying(100), + c_contrholdername character varying(500), + c_contrholderidtype character(2), + c_contrholderidno character varying(500), + d_contrholderidvalidatebeg timestamp(0) without time zone, + d_contrholderidvalidate timestamp(0) without time zone, + c_responpername character varying(500), + c_responperidtype character(2), + c_responperidno character varying(500), + d_responperidvalidatebeg timestamp(0) without time zone, + d_responperidvalidate timestamp(0) without time zone, + c_lawphone character varying(100), + c_contrholderphone character varying(100), + c_responperphone character varying(100), + c_consex character(1), + c_conrelative character varying(500), + l_riskserialno numeric(10,0), + c_convocation character(2), + c_iscustrelated character(1), + c_businlicissuorgan character varying(500), + c_manageridno character varying(500), + c_manageridtype character varying(500), + c_managername character varying(500), + d_companyregdate timestamp(0) without time zone, + c_electronicagreement character(1), + c_householdregno character varying(500), + c_guardianrela character varying(500), + c_guardianname character varying(500), + c_guardianidtype character(1), + c_guardianidno character varying(500), + c_isfranchisingidstry character(1), + c_franchidstrybusinlic character varying(500), + c_workunittype character(2), + c_normalresidaddr character varying(1000), + c_domicile character varying(1000), + c_finainvestyears character(2), + c_parentidtype character(1), + c_parentidno character varying(500), + c_videono character varying(1000), + c_bonustype character(1), + d_retirementdate timestamp(0) without time zone, + c_issendbigcustbill character(1), + c_idaddress character varying(1000), + c_isproinvestor character(1), + c_sendkfflag character(1), + c_sendkfcause character varying(1000), + c_sendsaflag character(1), + c_sendsacause character varying(1000), + c_custrelationchannel character(1), + c_companytype character(1), + c_businlocation character varying(1000), + c_custodian character varying(500), + d_elecsigndate timestamp(0) without time zone, + d_riskinputdate timestamp(0) without time zone, + c_circno character varying(1000), + c_financeindustrydetail character varying(30), + c_outclientinfoid character varying(30), + d_duediligencedate timestamp(0) without time zone, + c_duediligencestatus character(1), + c_inputstatus character(1), + c_address2 character varying(1000), + c_reportcusttype character(1), + c_reportcusttypedetail character varying(30), + c_custsource character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; + + +-- +-- Name: sys_stat_error_log; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE sys_stat_error_log ( + proc_name varchar2(50) NOT NULL, + tab_level varchar2(20), + 
step_no varchar2(20), + step_desc varchar2(500), + begin_time timestamp(0) without time zone, + end_time timestamp(0) without time zone, + workdate timestamp(0) without time zone, + row_num numeric, + elapsed numeric, + all_elapsed numeric, + sql_code varchar2(20), + sql_errm varchar2(500) +) +DISTRIBUTE BY SHARD (proc_name) to GROUP default_group; + + +-- +-- Data for Name: b03_ts_remetrade; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY b03_ts_remetrade (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +\. + + +-- +-- Data for Name: b03_ts_remetrade_bak; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY b03_ts_remetrade_bak (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +\. + + +-- +-- Data for Name: ks0_fund_base_26; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY ks0_fund_base_26 (id1, acc_cd, tdate, ins_cd, cost_price_asset, pcol) FROM stdin; +\. + + +-- +-- Data for Name: p; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY p (p1, p2) FROM stdin; +2021-12-12 2021-12-12 +2021-12-13 2021-12-12 +2020-12-13 2021-12-12 +\. + + +-- +-- Data for Name: s017_taccoinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_taccoinfo (c_custno, c_accounttype, c_fundacco, c_agencyno, c_netno, c_childnetno, d_opendate, d_lastmodify, c_accostatus, c_freezecause, d_backdate, l_changetime, d_firstinvest, c_password, c_bourseflag, c_operator, jy_custid, work_date) FROM stdin; +\. + + +-- +-- Data for Name: s017_tacconet; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tacconet (c_fundacco, c_agencyno, c_netno, c_tradeacco, c_openflag, c_bonustype, c_bankno, c_bankacco, c_nameinbank, d_appenddate, c_childnetno, c_tradeaccobak, c_bankname, c_banklinecode, c_channelbankno, c_bankprovincecode, c_bankcityno, sys_id, work_date, load_date) FROM stdin; +\. 
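--
-- Note on the distribution clauses used by the tables above: in TBase,
-- DISTRIBUTE BY HASH (col) hash-distributes rows across the datanodes on that
-- column, while DISTRIBUTE BY SHARD (col) to GROUP <group> maps rows to shards
-- owned by the named node group. A minimal sketch of both forms follows
-- (it assumes a TBase cluster in which the default_group node group exists;
-- the table and column names are illustrative only, not part of the fixture):
--
CREATE TABLE distribute_demo_hash  (id integer, payload text) DISTRIBUTE BY HASH (id);
CREATE TABLE distribute_demo_shard (id integer, payload text) DISTRIBUTE BY SHARD (id) to GROUP default_group;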
+ + +-- +-- Data for Name: s017_tagencyinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tagencyinfo (c_agencyno, c_agencyname, c_fullname, c_agncyaddress, c_agncyzipcode, c_agncycontact, c_agncyphone, c_agncyfaxno, c_agncymail, c_agncybankno, c_agncybankacco, c_agncybankname, d_agncyregdate, c_agncystatus, d_lastdate, c_agencytype, c_detail, c_right, c_zdcode, l_liquidateredeem, l_liquidateallot, l_liquidatebonus, l_liquidatesub, c_sharetypes, f_agio, c_ztgonestep, c_preassign, l_cserialno, c_comparetype, c_liquidatetype, c_multitradeacco, c_iversion, c_imode, c_changeonstep, f_outagio, f_agiohint, f_outagiohint, c_allotliqtype, c_redeemliqtype, c_centerflag, c_netno, c_littledealtype, c_overtimedeal, d_lastinputtime, f_interestrate, c_clearsite, c_isdeal, c_agencyenglishname, l_fundaccono, c_rationflag, c_splitflag, c_tacode, c_outdataflag, c_hasindex, c_transferbyadjust, c_sharedetailexptype, c_navexptype, c_ecdmode, c_agencytypedetail, c_advanceshrconfirm, c_ecdversion, c_capmode, c_internetplatform, c_capautoarrive, c_outcapitaldata, c_ecdcheckmode, c_ecddealmode, c_fileimpmode, c_isotc, c_enableecd, c_autoaccotype, c_tncheckmode, c_captureidinfo, c_realfreeze, sys_id, work_date, load_date) FROM stdin; +1 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tconfirm_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tconfirm_all (c_businflag, d_cdate, c_cserialno, d_date, l_serialno, c_agencyno, c_netno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, f_confirmbalance, f_confirmshares, f_tradefare, f_tafare, f_stamptax, f_backfare, f_otherfare1, f_interest, f_interesttax, f_totalfare, f_agencyfare, f_netvalue, f_frozenbalance, f_unfrozenbalance, c_status, c_cause, c_taflag, c_custtype, c_custno, f_gainbalance, f_orifare, c_requestendflag, f_unbalance, f_unshares, c_reserve, f_interestshare, f_chincome, f_chshare, f_confirmincome, f_oritradefare, f_oritafare, f_oribackfare, f_oriotherfare1, c_requestno, f_balance, f_shares, f_agio, f_lastshares, f_lastfreezeshare, c_othercode, c_otheracco, c_otheragency, c_othernetno, c_bonustype, c_foriginalno, c_exceedflag, c_childnetno, c_othershare, c_actcode, c_acceptmode, c_freezecause, c_freezeenddate, f_totalbalance, f_totalshares, c_outbusinflag, c_protocolno, c_memo, f_registfare, f_fundfare, f_oriagio, c_shareclass, d_cisdate, c_bourseflag, c_fundtype, f_backfareagio, c_bankno, c_subfundmethod, c_combcode, f_returnfare, c_contractno, c_captype, l_contractserialno, l_othercontractserialno, d_exportdate, f_transferfee, f_oriconfirmbalance, f_extendnetvalue, l_remitserialno, c_zhxtht, c_improperredeem, f_untradefare, f_untradeinfare, f_untradeoutfare, c_profitnottransfer, f_outprofit, f_inprofit, c_totrustcontractid, d_repurchasedate, f_chengoutbalance, c_exporting, jy_fundid, jy_contractbh, jy_custid, jy_tocustid, jy_fare, c_trustcontractid, f_taagencyfare, f_taregisterfare, d_cdate_jy, jy_adjust, jy_subfundid, jy_adjust1114, jy_cdate, c_bankacco, c_bankname, c_nameinbank, f_riskcapital, f_replenishriskcapital, c_fromfundcode, c_fromtrustcontractid, c_trustagencyno, l_rdmschserialno, f_redeemprofit, f_redeemproyieldrate, d_redeemprobigdate, d_redeemproenddate, c_changeownerincomebelong, l_midremitserialno, c_fromtype, c_iscycinvest, l_fromserialno, l_frominterestconserialno, c_changeownerinterest, 
c_msgsendflag, l_sharedelaydays, c_istodayconfirm, f_newincome, f_floorincome, l_incomeremitserialno, c_isnetting, l_bankserialno, c_subfundcode, f_chengoutsum, f_chengoutprofit, l_confirmtransserialno, c_shareadjustgzexpflag, c_issend, c_exchangeflag, yh_date_1112, l_banktocontractserialno, c_payfeetype, c_tobankno, c_tobankacco, c_tobankname, c_tonameinbank, c_tobanklinecode, c_tobankprovincecode, c_tobankcityno, l_assetseperateno, c_sharecserialno, c_redeemprincipaltype, work_date, c_businname) FROM stdin; +1 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tdividenddetail; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tdividenddetail (d_cdate, c_cserialno, d_regdate, d_date, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_totalshare, f_unitprofit, f_totalprofit, f_tax, c_flag, f_realbalance, f_reinvestbalance, f_realshares, f_fare, d_lastdate, f_netvalue, f_frozenbalance, f_frozenshares, f_incometax, c_reserve, d_requestdate, c_shareclass, l_contractserialno, l_specprjserialno, f_investadvisorratio, f_transferfee, l_profitserialno, d_exportdate, c_custid, jy_fundid, jy_subfundid, jy_custid, jy_contractbh, jy_profitsn, jy_profitmoney, jy_capitalmoney, jy_adjust, c_reinvestnetvalue, f_transferbalance, l_relatedserialno, c_printoperator, c_printauditor, sys_id, work_date, load_date, f_remainshares) FROM stdin; +2021-04-26 20:34:00 \N \N \N \N \N 2 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tfundday; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tfundday (d_date, d_cdate, c_fundcode, c_todaystatus, c_status, f_netvalue, f_lastshares, f_lastasset, f_asucceed, f_rsucceed, c_vastflag, f_encashratio, f_changeratio, c_excessflag, f_subscriberatio, c_inputpersonnel, c_checkpersonnel, f_income, f_incomeratio, f_unassign, f_incomeunit, f_totalnetvalue, f_servicefare, f_assign, f_growthrate, c_netvalueflag, f_managefare, d_exportdate, c_flag, f_advisorfee, d_auditdate, f_extendnetvalue, f_extendtotalnetvalue, jy_fundcode, f_yearincomeratio, f_riskcapital, f_totalincome, f_agencyexpyearincomeration, f_agencyexpincomeunit, f_agencyexpincomeration, f_agencyexpincome, c_isspecflag, c_isasync, sys_id, work_date, load_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: s017_tfundinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tfundinfo (c_fundcode, c_fundname, c_moneytype, c_managername, c_trusteecode, f_parvalue, f_issueprice, c_trusteeacco, d_issuedate, d_setupdate, f_maxbala, f_maxshares, f_minbala, f_minshares, l_elimitday, l_slimitday, l_alimitday, l_mincount, l_climitday, f_maxallot, f_maxredeem, c_fundcharacter, c_fundstatus, c_subscribemode, l_timelimit, l_subscribeunit, c_sharetypes, c_issuetype, f_factcollect, d_failuedate, f_allotratio, c_feeratiotype1, c_feeratiotype2, c_feetype, c_exceedpart, c_bonustype, c_forceredeem, c_interestdealtype, f_redeemfareratio, f_changefareratio, f_managerfee, f_right, c_property, d_evendate, f_totalbonus, c_changefree, c_reportcode, c_backfarecal, l_moneydate, l_netprecision, c_corpuscontent, f_corpusratio, c_farecaltype, l_liquidateallot, l_liquidateredeem, l_liquidatebonus, l_taspecialacco, c_fareprecision, d_issueenddate, c_farebelongasset, l_liquidatechange, l_liquidatefail, l_liquidateend, c_sharedetail, c_trusteebankname, c_boursetradeflag, c_fundenglishname, l_bankaccono, c_cleanflag, c_precision, c_upgradeflag, c_isdeal, c_farecltprecision, c_balanceprecision, c_shareprecision, c_bonusprecision, c_interestprecision, f_maxallotasset, f_maxallotshares, c_foreigntrustee, l_tnconfirm, c_rationallotstatus, f_trusteefee, c_fundacco, c_financetype, l_liquidatechangein, c_custname, c_identitytype, c_custtype, c_identityno, c_deductschemecode, c_customermanager, c_templateid, f_pr0, f_deductratio, c_farecalculatetype, c_saletype, l_maxcount, l_zhallotliqdays, l_zhredeemliqdays, f_liqasset, l_zhallotexpdays, l_zhredeemexpdays, c_limitmode, c_ordermode, c_acntlmtdealmode, l_informdays, c_allowpartredeem, c_fundendmode, f_fundendagio, c_minbalalimitisconfirm, c_gradetype, c_qryfreqtype, l_qrydaysltd, d_contractenddate, c_useinopenday, c_allotcalinterst, c_fundrisk, c_exitallot, c_subinterestcalc, c_earlyexitredfee, c_navexpfqy, l_navexpday, c_isbounded, c_earlyexitfeecalc, c_designdptid, c_fixeddividway, c_trusttype, f_maxnaturalmoney, c_projectid, c_trustclass, f_trustscale, c_structflag, c_priconveyflag, c_repurchasetype, c_iswholerepurchase, f_repurchaseminbala, c_repurchasemainbody, c_canelyrepurchase, c_earlybacktime, c_repurchaseprice, c_premiumpaymenttime, c_liquisource, l_period, c_canextensionflag, c_canelyliquidflag, c_trustassetdesc, c_returnside, c_returnpaymentway, c_returnbase, c_refepaymentway, c_refeside, c_refebase, f_warnline, f_stopline, f_collectinterest, f_durationinterest, f_investadvisorratio, c_bonusschema, c_guaranteetype, c_guaranteedesc, c_expectedyieldtype, f_minexpectedyield, f_maxexpectedyield, c_incomecycletype, f_incomecyclevalue, c_subaccotype, c_allotaccotype, c_fundtype, c_cootype, c_projecttype, c_investdirection, c_investdirectionfractionize, c_industrydetail, c_initeresttype, c_isextended, d_extenddate, c_dealmanagetype, c_investarea, c_projectcode, c_fundshortname, c_contractid, c_functype, c_specialbusintype, c_investindustry, c_managetype, c_area, c_risk, c_iscommitteedisscuss, c_structtype, c_commendplace, l_npmaxcount, c_client, c_clientcusttype, c_clientidtype, c_clientidno, c_clientbankname, c_clientaccono, c_clientaddress, c_clientzipcode, c_clientphoneno1, c_clientphoneno2, c_clientfax, c_beneficiary, c_collectbankname, c_collectbankno, c_collectaccountname, c_collectbankacco, c_keeperbankname, c_keeperaccountname, c_keeperaccountno, c_keepername, c_keepercorporation, c_keeperaddress, c_keeperzipcode, c_keeperphoneno1, 
c_keeperphoneno2, c_keeperfax, c_incomedistributetype, c_alarmline, c_stoplossline, f_investadvisorfee, c_investadvisordeduct, c_capitalacco, c_stockacconame, c_stocksalesdept, c_thirdpartybankno, c_thirdpartybankname, c_thirdpartyacconame, c_thirdpartyaccono, c_investadvisor, c_investadvisorbankno, c_investadvisorbankname, c_investadvisoracconame, c_investadvisoraccono, c_investadvisorcorporation, c_investadvisoraddress, c_investadvisorzipcode, c_investadvisorphoneno1, c_investadvisorphoneno2, c_investadvisorfax, c_authdelegate, c_loanfinanceparty, c_loanfinancepartycorporation, c_loanfinancepartyaddress, c_loanfinancepartyzipcode, c_loanfinancepartyphoneno1, c_loanfinancepartyphoneno2, c_loanfinancepartyfax, c_loaninteresttype, f_loaninterestrate, f_loanduration, c_loanmanagebank, f_loanmanagefee, f_loanfinancecost, f_creditattornduration, f_creditattorninterestduration, f_creditattornprice, f_billattornduration, f_billattorninterestduration, f_billattornprice, c_stkincfincparty, c_stkincfincpartycorporation, c_stkincfincpartyaddress, c_stkincfincpartyzipcode, c_stkincfincpartyphoneno1, c_stkincfincpartyphoneno2, c_stkincfincpartyfax, c_stkincincomeannualizedrate, c_stkincinteresttype, f_stkincattornprice, f_stkincattornduration, f_stkincbail, f_stkincfinccost, c_stkincmemo1, c_stkincmemo2, c_debtincfincparty, c_debtincfincpartycorporation, c_debtincfincpartyaddress, c_debtincfincpartyzipcode, c_debtincfincpartyphoneno1, c_debtincfincpartyphoneno2, c_debtincfincpartyfax, c_debtincincomerate, c_debtincinteresttype, f_debtincattornprice, f_debtincattornduration, f_debtincbail, f_debtincfinccost, c_debtincmemo1, c_othinvfincparty, c_othinvfincpartycorporation, c_othinvfincpartyaddress, c_othinvfincpartyzipcode, c_othinvfincpartyphoneno1, c_othinvfincpartyphoneno2, c_othinvfincpartyfax, f_othinvfinccost, c_othinvmemo1, c_othinvmemo2, c_othinvmemo3, c_banktrustcoobank, c_banktrustproductname, c_banktrustproductcode, c_banktrustundertakingletter, c_trustgovgovname, c_trustgovprojecttype, c_trustgovcootype, c_trustgovoptype, c_housecapital, c_houseispe, c_tradetype, c_businesstype, c_trustname, c_trustidtype, c_trustidno, d_trustidvaliddate, c_trustbankname, c_trustaccounttype, c_trustnameinbank, c_zhtrustbankname, c_zhtrustbankacco, c_issecmarket, c_fundoperation, c_trustmanager, c_tradeother, c_watchdog, c_memo, c_benefittype, c_redeemaccotype, c_bonusaccotype, c_fundendaccotype, c_collectfailaccotype, d_lastmodifydate, c_shareholdlimtype, c_redeemtimelimtype, c_isprincipalrepayment, c_principalrepaymenttype, l_interestyeardays, l_incomeyeardays, c_capuseprovcode, c_capusecitycode, c_capsourceprovcode, c_banktrustcoobankcode, c_banktrustisbankcap, c_trusteefeedesc, c_managefeedesc, c_investfeedesc, f_investadvisordeductratio, c_investdeductdesc, c_investadvisor2, f_investadvisorratio2, f_investadvisordeductratio2, c_investfeedesc2, c_investdeductdesc2, c_investadvisor3, f_investadvisorratio3, f_investadvisordeductratio3, c_investfeedesc3, c_investdeductdesc3, c_profitclassdesc, c_deductratiodesc, c_redeemfeedesc, l_defaultprecision, c_allotfeeaccotype, c_isposf, c_opendaydesc, c_actualmanager, c_subindustrydetail, c_isbankleading, c_subprojectcode, c_iscycleinvest, f_liquidationinterest, c_liquidationinteresttype, c_isbonusinvestfare, c_subfeeaccotype, c_redeemfeeaccotype, c_fundrptcode, c_ordertype, c_flag, c_allotliqtype, l_sharelimitday, c_iseverydayopen, c_tradebynetvalue, c_isstage, c_specbenfitmemo, d_effectivedate, c_issueendflag, c_resharehasrdmfee, jy_fundcode, jy_fundid, 
jy_subfundid, jy_dptid, c_iswealth, c_interestcalctype, c_allotinterestcalctype, c_isriskcapital, c_fundstatus_1225, c_isincomeeverydaycalc, c_isredeemreturninterest, c_isrefundrtninterest, d_estimatedsetupdate, f_estimatedfactcollect, c_isfinancialproducts, c_fundredeemtype, c_trademanualinput, f_clientmanageration, c_profitclassadjustment, c_mainfundcode, c_contractsealoff, c_permitnextperiod, c_preprofitschematype, c_fundredeemprofit, f_incomeration, c_incomecalctype, c_allocateaccoid, c_outfundcode, c_matchprofitclass, l_lastdays, c_contractprofitflag, c_agencysaleliqtype, l_delaydays, c_profitclassperiod, c_reportshowname, c_currencyincometype, c_beforeredeemcapital, c_contractversion, c_confirmacceptedflag, c_selectcontract, f_schemainterest, c_riskgrade, l_sharedelaydays, l_reservationdays, c_transfertype, c_schemavoluntarily, l_schemadetaildata, c_schemadetailtype, c_iscurrencyconfirm, c_allowmultiaccobank, d_capverif, c_templatetype, c_capitalprecision, c_fundno, c_profittype, d_paydate, d_shelvedate, d_offshelvedate, c_schemabegindatetype, l_schemabegindatedays, c_isautoredeem, c_isnettingrequest, c_issuingquotedtype, d_firstdistributedate, c_bonusfrequency, c_interestbigdatetype, c_gzdatatype, f_allotfareratio, f_subfareratio, c_begindatebeyond, c_profitnotinterest, c_setuplimittype, c_limitredeemtype, c_bonusfrequencytype, c_rfaccotype, c_capitalfee, c_exceedflag, c_enableecd, c_isfixedtrade, c_profitcaltype, f_ominbala, f_stepbala, c_remittype, c_interestcycle, c_repayguaranteecopy, c_repaytype, c_fundprofitdes, c_fundinfodes, c_riskeval, l_maxage, l_minage, c_fundriskdes, mig_l_assetid, l_faincomedays, c_producttype, c_otherbenefitproducttype, c_isotc, c_iseverydayprovision, c_incometogz, c_setuptransfundacco, c_issuefeeownerrequired, c_calcinterestbeforeallot, c_islimit300wnature, c_allowoverflow, c_trustfundtype, c_disclose, c_collectaccoid, c_isissuebymarket, c_setupstatus, c_isentitytrust, l_liquidatesub, c_incomeassigndesc, c_keeporgancode, d_defaultbegincacldate, c_zcbborrower, c_zcbborroweridno, c_zcbremittype, c_registcode, c_redeeminvestaccotype, c_bonusinvestaccotype, c_isabsnotopentrade, l_interestdiffdays, c_outfundstatus, c_reqsyntype, c_allredeemtype, c_isabsopentrade, c_funddesc, l_allotliquidays, l_subliquidays, c_autoupcontractenddaterule, c_fcsubaccotype, c_fcallotaccotype, c_fcredeemaccotype, c_fcbonusaccotype, c_captranslimitflag, c_redeemprincipaltype, c_interestcalcdealtype, c_collectconfirm, d_oldcontractenddate, c_tnvaluation, c_contractendnotify, c_rdmfeebase, c_exceedcfmratio, c_allowallotcustlimittype, c_yeardayscalctype, c_iscompoundinterest, c_dbcfm, c_limitaccountstype, c_cycleinvestrange, c_tncheckmode, c_enableearlyredeem, c_ispurceandredeemset, c_perfpaydealtype, c_allowappend, c_allowredeem, c_inputstatus, c_profitbalanceadjust, c_profitperiodadjust, c_autogeneratecontractid, c_transferneednetting, underwrite, undertook, undertake, c_issmsend, d_contractshortenddate, d_contractlongenddate, c_assetseperatefundcodesrc, f_averageprofit, c_currencycontractlimittype, l_profitlastdays, l_liquidationlastdays, c_arlimitincludeallreq, c_reqfundchange, c_dealnetvaluerule, c_contractdealtype, c_bonusplanbeginday, c_contractbalaupright, c_isneedinterestrate, c_isneedexcessratio, c_riskgraderemark, c_lossprobability, c_suitcusttype, c_createbonusschema, d_closedenddate, c_timelimitunit, c_exceedredeemdealtype, c_profitperiod, l_navgetintervaldays, load_date, sys_id, work_date, c_limittransfertype, c_transaccotype, c_incometaxbase, c_isredeemfareyearcalc, 
c_otherbenefitinputmode, c_aftdefaultinterestdeducttype, c_allowzerobalanceconfirm, c_incomejoinassign, l_liquidateliqbonus, c_predefaultinterestdeducttype, c_worktype, c_defaultinterestadduptype, c_issupportsubmode, f_expectedyield, c_recodecode, l_liquidatetransfer, c_ispayincometax, c_groupmainfundcode, c_redeemfeesplittype, c_capitalfromcrmorta, c_needcalcdefaultinterest, c_issuercode, l_redeemfareyeardays, c_floatyield, l_minriskscore, c_islocalmoneytypecollect) FROM stdin; +2 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N S017 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tsharecurrents_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tsharecurrents_all (d_cdate, c_cserialno, c_businflag, d_requestdate, c_requestno, c_custno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_occurshares, f_occurbalance, f_lastshares, f_occurfreeze, f_lastfreezeshare, c_summary, f_gainbalance, d_sharevaliddate, c_bonustype, c_custtype, c_shareclass, c_bourseflag, d_exportdate, l_contractserialno, c_issend, c_sendbatch, work_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: s017_ttrustclientinfo_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_ttrustclientinfo_all (c_custno, c_custtype, c_custname, c_shortname, c_helpcode, c_identitytype, c_identityno, c_zipcode, c_address, c_phone, c_faxno, c_mobileno, c_email, c_sex, c_birthday, c_vocation, c_education, c_income, c_contact, c_contype, c_contno, c_billsendflag, c_callcenter, c_internet, c_secretcode, c_nationality, c_cityno, c_lawname, c_shacco, c_szacco, c_broker, f_agio, c_memo, c_reserve, c_corpname, c_corptel, c_specialcode, c_actcode, c_billsendpass, c_addressinvalid, d_appenddate, d_backdate, c_invalidaddress, c_backreason, c_modifyinfo, c_riskcontent, l_querydaysltd, c_customermanager, c_custproperty, c_custclass, c_custright, c_daysltdtype, d_idvaliddate, l_custgroup, c_recommender, c_recommendertype, d_idnovaliddate, c_organcode, c_othercontact, c_taxregistno, c_taxidentitytype, c_taxidentityno, d_legalvaliddate, c_shareholder, c_shareholderidtype, c_shareholderidno, d_holderidvaliddate, c_leader, c_leaderidtype, c_leaderidno, d_leadervaliddate, c_managercode, c_linemanager, c_clientinfoid, c_provincecode, c_countytown, c_phone2, c_clienttype, c_agencyno, c_industrydetail, c_isqualifiedcust, c_industryidentityno, c_lawidentitytype, c_lawidentityno, d_lawidvaliddate, d_conidvaliddate, c_conisrevmsg, c_conmobileno, c_conmoaddress, c_conzipcode, c_conphone1, c_conphone2, c_conemail, c_confaxno, c_incomsource, c_zhidentityno, c_zhidentitytype, c_eastcusttype, jy_custid, c_idtype201201030, c_emcontact, c_emcontactphone, c_instiregaddr, c_regcusttype, c_riskgrade, c_riskgraderemark, d_idvaliddatebeg, d_industryidvaliddatebeg, d_industryidvaliddate, c_incomesourceotherdesc, c_vocationotherdesc, c_businscope, d_conidvaliddatebeg, d_lawidvaliddatebeg, c_regmoneytype, f_regcapital, c_orgtype, c_contrholderno, c_contrholdername, c_contrholderidtype, c_contrholderidno, d_contrholderidvalidatebeg, d_contrholderidvalidate, c_responpername, c_responperidtype, c_responperidno, d_responperidvalidatebeg, d_responperidvalidate, c_lawphone, c_contrholderphone, c_responperphone, c_consex, c_conrelative, l_riskserialno, c_convocation, c_iscustrelated, c_businlicissuorgan, c_manageridno, c_manageridtype, c_managername, d_companyregdate, c_electronicagreement, c_householdregno, c_guardianrela, c_guardianname, c_guardianidtype, c_guardianidno, c_isfranchisingidstry, c_franchidstrybusinlic, c_workunittype, c_normalresidaddr, c_domicile, c_finainvestyears, c_parentidtype, c_parentidno, c_videono, c_bonustype, d_retirementdate, c_issendbigcustbill, c_idaddress, c_isproinvestor, c_sendkfflag, c_sendkfcause, c_sendsaflag, c_sendsacause, c_custrelationchannel, c_companytype, c_businlocation, c_custodian, d_elecsigndate, d_riskinputdate, c_circno, c_financeindustrydetail, c_outclientinfoid, d_duediligencedate, c_duediligencestatus, c_inputstatus, c_address2, c_reportcusttype, c_reportcusttypedetail, c_custsource, work_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: sys_stat_error_log; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY sys_stat_error_log (proc_name, tab_level, step_no, step_desc, begin_time, end_time, workdate, row_num, elapsed, all_elapsed, sql_code, sql_errm) FROM stdin; +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +\. 
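The CREATE TABLE newtab AS SELECT statement further down in this dump leans on TBase's Oracle-compatible functions (NVL, DECODE, SYSDATE). As a minimal illustrative sketch only — assuming these functions follow the usual Oracle semantics, and using a hypothetical table t with made-up column names rather than anything defined in this dump — the Oracle-style expressions map onto standard SQL as follows:

SELECT NVL(f_occurbalance, f_relbalance)                AS bal_oracle,
       COALESCE(f_occurbalance, f_relbalance)           AS bal_standard,   -- same result
       DECODE(c_businflag,
              '03', f_confirmbalance + f_tradefare,
              '53', f_confirmbalance + f_tradefare,
                    f_confirmbalance)                    AS amt_oracle,
       CASE c_businflag
            WHEN '03' THEN f_confirmbalance + f_tradefare
            WHEN '53' THEN f_confirmbalance + f_tradefare
            ELSE f_confirmbalance
       END                                               AS amt_standard   -- same result
  FROM t;

Reading DECODE as a positional CASE and NVL as a two-argument COALESCE makes the large query below considerably easier to follow.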
+ + +-- +-- Name: ks0_fund_base_26 pk_ks0_fund_base_26; Type: CONSTRAINT; Schema: sync; Owner: gregsun +-- + +ALTER TABLE ONLY ks0_fund_base_26 + ADD CONSTRAINT pk_ks0_fund_base_26 PRIMARY KEY (id1, acc_cd, ins_cd); + + +-- +-- PostgreSQL database dump complete +-- + +create table newtab as + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + null, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE::date) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + /*WHERE T1.C_FLAG = '0'*/) A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + -- WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + -- WHERE ST2.C_FLAG = '0' + ) B + ON A.C_FUNDCODE = B.C_FUNDCODE + /* + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO*/; + +DROP SCHEMA sync cascade; From c9db8471c320b7d65320bf15696e38f050811d38 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 20 May 2021 11:41:34 +0800 Subject: [PATCH 377/578] fix GTM standby lost when xlog of GTM host is not available http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087099711 (merge request !333) Squash merge branch 'sigmalin_oracle' into 'Tbase_v5.09' * fix GTM standby lost when 
xlog of GTM host is not available --- src/gtm/client/fe-protocol.c | 11 ++++++++++ src/gtm/main/gtm_xlog.c | 39 ++++++++++++++++++++++++++---------- src/gtm/main/main.c | 8 ++++++++ src/include/gtm/gtm_client.h | 3 ++- src/include/gtm/gtm_xlog.h | 3 ++- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 89bedf88..a575510e 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -1382,6 +1382,17 @@ result->gr_status = GTM_RESULT_ERROR; result->gr_resdata.grd_xlog_data.length = 0; result->gr_resdata.grd_xlog_data.xlog_data = NULL; + if (gtmpqGetInt(&result->gr_resdata.grd_xlog_data.status, sizeof(int), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (result->gr_resdata.grd_xlog_data.status != Send_OK) + { + break; + } + if (gtmpqGetInt64((int64 *)&result->gr_resdata.grd_xlog_data.flush, conn)) { result->gr_status = GTM_RESULT_ERROR; diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index d38aea68..3d46942e 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -816,7 +816,7 @@ GTM_GetReplicationResultIfAny(GTM_StandbyReplication *replication,Port *port) return 1; } -static bool +static int ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segment_no) { char path[MAXFNAMELEN]; @@ -829,7 +829,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo if(fd == -1) { elog(LOG,"Fail to open xlog %s : %s",path,strerror(errno)); - return false; + return Send_Error; } buff->total_length = 0; @@ -842,7 +842,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo { elog(LOG,"Read xlog file %s fails : %s",path,strerror(errno)); close(fd); - return false; + return Send_Error; } if(bytes == 0) @@ -857,10 +857,10 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo if(enalbe_gtm_xlog_debug) elog(LOG,"read xlog file %s with bytes %d",path,buff->total_length); - return true; + return Send_OK; } -static bool +static int ReadXLogFileToBuff(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segment_no) { char path[MAXFNAMELEN]; @@ -881,7 +881,8 @@ ReadXLogFileToBuff(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segme if(access(path,F_OK) < 0) { elog(LOG,"xlog file %s not found ,that is not support to happen.",path); - return false; + /* need to tell the standby that the required xlog is not available */ + return Send_XlogFile_Not_Found; } if(enalbe_gtm_xlog_debug) @@ -986,6 +987,7 @@ GetXLogFileSize(TimeLineID timeline,XLogSegNo segment_no) static int SendXLogDataFromFileBuff(GTM_StandbyReplication *replication,StringInfo message_buff) { + int ret = Send_OK; GTM_XLogSegmentBuff *local_buff = &replication->xlog_read_buff; XLogSegNo request_segment = GetSegmentNo(replication->send_ptr); @@ -998,8 +1000,9 @@ SendXLogDataFromFileBuff(GTM_StandbyReplication *replication,StringInfo message_ return Send_Data_Not_Found; } - if(ReadXLogFileToBuff(local_buff,replication->time_line,request_segment) == false) - return Send_Error; + ret = ReadXLogFileToBuff(local_buff,replication->time_line,request_segment); + if(ret != Send_OK) + return ret; SendXLogDataFromFileBuffInternal(replication,message_buff); @@ -1070,10 +1073,10 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) int bytes; StringInfoData out_message; - initStringInfo(&out_message); - pq_beginmessage(&out_message, 'S'); pq_sendint(&out_message, 
MSG_REPLICATION_CONTENT, 4); + /* send the processing result status to the standby */ + pq_sendint(&out_message, Send_OK, sizeof(int)); pq_sendint64(&out_message, GetReplicationSendRequestPtr(replication)); /* request send reply */ @@ -1081,7 +1084,16 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) bytes = SendXLogData(replication,&out_message); - if(bytes == Send_Error) + if (bytes == Send_XlogFile_Not_Found) + { + pfree(out_message.data); + out_message.data = NULL; + pq_beginmessage(&out_message, 'S'); + pq_sendint(&out_message, MSG_REPLICATION_CONTENT, 4); + /* send the processing result status to the standby */ + pq_sendint(&out_message, Send_XlogFile_Not_Found, sizeof(int)); + } + else if(bytes == Send_Error) goto send_fail; pq_endmessage(port,&out_message); @@ -1089,6 +1101,11 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) if(pq_flush(port)) goto send_fail; + if (bytes == Send_XlogFile_Not_Found) + { + return false; + } + return true; send_fail: diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 7d9563f1..e894a5bc 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -3309,6 +3309,14 @@ GTM_ThreadWalReceiver(void *argp) Assert(res->gr_status == GTM_RESULT_OK); Assert(res->gr_type == MSG_REPLICATION_CONTENT); + if (res->gr_resdata.grd_xlog_data.status != Send_OK) + { + Assert(res->gr_resdata.grd_xlog_data.status == Send_XlogFile_Not_Found); + elog(LOG,"xlog file not found in master, exit now"); + exit(1); + } + + Assert(res->gr_resdata.grd_xlog_data.status == Send_OK); size = res->gr_resdata.grd_xlog_data.length; start_pos = res->gr_resdata.grd_xlog_data.pos; end_pos = start_pos + size; diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 2ae03ff1..f85e7d13 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -75,7 +75,8 @@ typedef union GTM_ResultData char* xlog_data; int reply; XLogRecPtr flush; - } grd_xlog_data; + int status; + } grd_xlog_data; #endif diff --git a/src/include/gtm/gtm_xlog.h b/src/include/gtm/gtm_xlog.h index d2f9c050..986f73c3 100644 --- a/src/include/gtm/gtm_xlog.h +++ b/src/include/gtm/gtm_xlog.h @@ -115,7 +115,8 @@ enum XLogSendResult Send_OK = 1, Send_No_data = 0, Send_Data_Not_Found = -1, - Send_Error = -2 + Send_Error = -2, + Send_XlogFile_Not_Found = -3 }; typedef struct XLogCtlData From 7ed6b5643124e68d8d88870410cff2edf4692ab9 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 21 May 2021 17:23:04 +0800 Subject: [PATCH 378/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), hash table optimize --- src/backend/access/transam/twophase.c | 85 ++++++++++++++++++++++----- src/backend/utils/hash/dynahash.c | 56 +++++++++++++++++- src/backend/utils/misc/guc.c | 9 +++ src/include/access/twophase.h | 1 + src/include/utils/hsearch.h | 2 + 5 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3124e174..56d80bf1 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -157,6 +157,7 @@ bool enable_2pc_file_cache = true; bool enable_2pc_file_check = true; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; +bool enable_2pc_hash_table_check = true; int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; @@ -168,7 +169,8 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ 
-#define MAX_RETRY_TIMES 10 +#define HASH_TAB_RETRY_MAX 10 +#define HASH_TAB_RETRY_SLEEP 2000 /* sleep time: 2ms */ /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -2463,10 +2465,15 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) Assert(NULL != entry); check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[%s] key %s is found in hash table", func, entry->key); + elog(LOG, "[%s] %s is found in hash table seq", func, entry->key); if (IsXidImplicit(entry->key)) { + if (0 == strlen(entry->info)) + { + elog(WARNING, "[%s] %s info length is 0", func, entry->key); + continue; + } memset(info, 0, MAX_2PC_INFO_SIZE); memcpy(info, entry->info, strlen(entry->info)); @@ -2482,12 +2489,10 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) func, entry->key, PGXCNodeName); continue; } - else - { + elog(LOG, "[%s] %s start node is %s", func, entry->key, PGXCNodeName); } - } else { elog(WARNING, "[%s] %s get start node failed, info: %s", @@ -3565,6 +3570,8 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif + enable_hash_table_trace = false; + if (!enable_2pc_recovery_info) { return ; @@ -3711,18 +3718,64 @@ void record_2pc_involved_nodes_xid(const char * tid, } else { - elog(LOG, "[%s] %s is found in hash table", func, tid); + elog(LOG, "[%s] %s is added to hash table, entry: %p, " + "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, + record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); } } - else if (enable_2pc_entry_trace) + else if (enable_2pc_entry_trace || enable_2pc_hash_table_check) { - elog(LOG, "[%s] %s is added to hash table, entry: %p", - func, tid, entry); + elog(LOG, "[%s] %s is added to hash table, entry: %p, " + "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, + record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); } memcpy(entry->info, content.data, size + 1); check_entry_key(tid, entry->key, func); + if (enable_2pc_hash_table_check) + { + int retry_times = 0; + Cache2pcInfo *entry_debug = NULL; + + GET_2PC_FILE_PATH(path, tid); + + while (retry_times++ < HASH_TAB_RETRY_MAX) + { + entry_debug = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry_debug); + check_entry_key(tid, entry_debug->key, func); + break; + } + + /* not found */ + elog(LOG, "[%s] %s is not found in hash table, retry times: %d", + func, tid, retry_times); + + Assert(NULL == entry_debug); + + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); + break; + } + + print_record_2pc_cache(func); + pg_usleep(HASH_TAB_RETRY_SLEEP); + enable_hash_table_trace = true; + } + + enable_hash_table_trace = false; + + if (retry_times >= HASH_TAB_RETRY_MAX) + { + elog(PANIC, "[%s] %s is not found in hash table", func, tid); + } + } + resetStringInfo(&content); pfree(content.data); return; @@ -3833,7 +3886,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); - while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) + while (NULL != record_2pc_cache && retry_times++ < HASH_TAB_RETRY_MAX) { Assert(strlen(tid) < MAX_TID_SIZE); entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); @@ -3884,7 +3937,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (fd < 0) { - elog(ERROR, "[%s] could not append timestamp in file %s, errMsg: %s", + elog(ERROR, "[%s] could not append timestamp, file %s, errMsg: %s", func, path, strerror(errno)); } @@ -3931,17 
+3984,21 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta func, tid, retry_times); Assert(NULL == entry); - print_record_2pc_cache(func); - if (0 == access(path, F_OK)) { elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); break; } - pg_usleep(5000L); /* sleep 5ms */ + print_record_2pc_cache(func); + + pg_usleep(HASH_TAB_RETRY_SLEEP); + + enable_hash_table_trace = true; } + enable_hash_table_trace = false; + if (NULL != record_2pc_cache) { elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 8e62e871..0e751a96 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -115,6 +115,8 @@ /* Number of freelists to be used for a partitioned hash table. */ #define NUM_FREELISTS 32 +bool enable_hash_table_trace = false; + /* A hash bucket is a linked list of HASHELEMENTs */ typedef HASHELEMENT *HASHBUCKET; @@ -926,9 +928,15 @@ hash_search_with_hash_value(HTAB *hashp, long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; + HASHBUCKET *firstBucketPtr; HASHBUCKET *prevBucketPtr; + HASHBUCKET *prevBucketPtrCheck; HashCompareFunc match; + char *func = "hash_search_with_hash_value"; + bool is_trace = (enable_hash_table_trace && + 0 == strcmp(hashp->tabname, "Record 2pc Cache")); + #if HASH_STATISTICS hash_accesses++; hctl->accesses++; @@ -965,12 +973,27 @@ hash_search_with_hash_value(HTAB *hashp, segp = hashp->dir[segment_num]; + if (is_trace) + { + elog(LOG, "[%s] %s hashvalue: %u, freelist_idx: %d, IS_PARTITIONED: %d, " + "bucket: %u, segment_num: %ld, segment_ndx %ld, segp: %p", + func, (char *)keyPtr, hashvalue, freelist_idx, IS_PARTITIONED(hctl), + bucket, segment_num, segment_ndx, segp); + } + if (segp == NULL) hash_corrupted(hashp); - prevBucketPtr = &segp[segment_ndx]; + firstBucketPtr = &segp[segment_ndx]; + prevBucketPtr = firstBucketPtr; currBucket = *prevBucketPtr; + if (is_trace) + { + elog(LOG, "[%s] %s prevBucketPtr: %p, currBucket: %p", + func, (char *)keyPtr, prevBucketPtr, currBucket); + } + /* * Follow collision chain looking for matching key */ @@ -979,9 +1002,19 @@ hash_search_with_hash_value(HTAB *hashp, while (currBucket != NULL) { + if (is_trace) + { + elog(LOG, "[%s] %s currBucket: %p", func, (char *)keyPtr, currBucket); + } if (currBucket->hashvalue == hashvalue && match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) + { + if (is_trace) + { + elog(LOG, "[%s] %s break currBucket: %p", func, (char *)keyPtr, currBucket); + } break; + } prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #if HASH_STATISTICS @@ -1065,10 +1098,31 @@ hash_search_with_hash_value(HTAB *hashp, errmsg("out of memory"))); } + prevBucketPtrCheck = prevBucketPtr; + + /* if partitioned, must lock freeList */ + if (IS_PARTITIONED(hctl)) + SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); + + prevBucketPtr = firstBucketPtr; + while (*prevBucketPtr != NULL) + { + prevBucketPtr = &((*prevBucketPtr)->link); + } + + if (prevBucketPtr != prevBucketPtrCheck) + { + elog(LOG, "[%s] prevBucketPtr(%p) != prevBucketPtrCheck(%p)", + func, prevBucketPtr, prevBucketPtrCheck); + } + /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; + if (IS_PARTITIONED(hctl)) + SpinLockRelease(&hctl->freeList[freelist_idx].mutex); + /* copy key into record */ currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); diff --git a/src/backend/utils/misc/guc.c 
b/src/backend/utils/misc/guc.c index 5066d491..ec9db352 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2726,6 +2726,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_hash_table_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC hash table check."), + NULL + }, + &enable_2pc_hash_table_check, + false, + NULL, NULL, NULL + }, #endif #ifdef __TBASE__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index bd76266f..e0fe09d2 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -101,6 +101,7 @@ extern bool enable_2pc_file_cache; extern bool enable_2pc_file_check; extern bool enable_2pc_entry_key_check; extern bool enable_2pc_entry_trace; +extern bool enable_2pc_hash_table_check; extern int record_2pc_cache_size; extern int record_2pc_entry_size; diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 651b3b59..15c7049a 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -116,6 +116,8 @@ typedef struct HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; +extern bool enable_hash_table_trace; + /* * prototypes for functions in dynahash.c */ From bc4b6bcde4a8808f5cc0f9163fdd2b51197c313e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 21 May 2021 18:08:38 +0800 Subject: [PATCH 379/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), hash table optimize, regress fix --- src/test/regress/expected/sysviews.out | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 842fabf5..f9926dda 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -76,6 +76,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_2pc_entry_trace | off enable_2pc_file_cache | on enable_2pc_file_check | on + enable_2pc_hash_table_check | off enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -141,7 +142,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(68 rows) +(69 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 8ea9290d305748a6561c29301f74a00414cae73e Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 31 May 2021 20:15:40 +0800 Subject: [PATCH 380/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), remove hash table debug code --- src/backend/access/transam/twophase.c | 9 ----- src/backend/utils/hash/dynahash.c | 56 +-------------------------- src/include/utils/hsearch.h | 2 - 3 files changed, 1 insertion(+), 66 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 56d80bf1..14935855 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3570,8 +3570,6 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif - enable_hash_table_trace = false; - if (!enable_2pc_recovery_info) { return ; @@ -3765,11 +3763,8 @@ void record_2pc_involved_nodes_xid(const char * tid, print_record_2pc_cache(func); pg_usleep(HASH_TAB_RETRY_SLEEP); - enable_hash_table_trace = true; } - enable_hash_table_trace = false; - if (retry_times >= HASH_TAB_RETRY_MAX) { elog(PANIC, "[%s] %s is not found in hash table", func, tid); @@ -3993,12 +3988,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta print_record_2pc_cache(func); pg_usleep(HASH_TAB_RETRY_SLEEP); - - enable_hash_table_trace = true; } - enable_hash_table_trace = false; - if (NULL != record_2pc_cache) { elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 0e751a96..fc7e8bf3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -115,8 +115,6 @@ /* Number of freelists to be used for a partitioned hash table. 
*/ #define NUM_FREELISTS 32 -bool enable_hash_table_trace = false; - /* A hash bucket is a linked list of HASHELEMENTs */ typedef HASHELEMENT *HASHBUCKET; @@ -928,15 +926,9 @@ hash_search_with_hash_value(HTAB *hashp, long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; - HASHBUCKET *firstBucketPtr; HASHBUCKET *prevBucketPtr; - HASHBUCKET *prevBucketPtrCheck; HashCompareFunc match; - char *func = "hash_search_with_hash_value"; - bool is_trace = (enable_hash_table_trace && - 0 == strcmp(hashp->tabname, "Record 2pc Cache")); - #if HASH_STATISTICS hash_accesses++; hctl->accesses++; @@ -973,27 +965,12 @@ hash_search_with_hash_value(HTAB *hashp, segp = hashp->dir[segment_num]; - if (is_trace) - { - elog(LOG, "[%s] %s hashvalue: %u, freelist_idx: %d, IS_PARTITIONED: %d, " - "bucket: %u, segment_num: %ld, segment_ndx %ld, segp: %p", - func, (char *)keyPtr, hashvalue, freelist_idx, IS_PARTITIONED(hctl), - bucket, segment_num, segment_ndx, segp); - } - if (segp == NULL) hash_corrupted(hashp); - firstBucketPtr = &segp[segment_ndx]; - prevBucketPtr = firstBucketPtr; + prevBucketPtr = &segp[segment_ndx]; currBucket = *prevBucketPtr; - if (is_trace) - { - elog(LOG, "[%s] %s prevBucketPtr: %p, currBucket: %p", - func, (char *)keyPtr, prevBucketPtr, currBucket); - } - /* * Follow collision chain looking for matching key */ @@ -1002,19 +979,9 @@ hash_search_with_hash_value(HTAB *hashp, while (currBucket != NULL) { - if (is_trace) - { - elog(LOG, "[%s] %s currBucket: %p", func, (char *)keyPtr, currBucket); - } if (currBucket->hashvalue == hashvalue && match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) - { - if (is_trace) - { - elog(LOG, "[%s] %s break currBucket: %p", func, (char *)keyPtr, currBucket); - } break; - } prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #if HASH_STATISTICS @@ -1098,31 +1065,10 @@ hash_search_with_hash_value(HTAB *hashp, errmsg("out of memory"))); } - prevBucketPtrCheck = prevBucketPtr; - - /* if partitioned, must lock freeList */ - if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); - - prevBucketPtr = firstBucketPtr; - while (*prevBucketPtr != NULL) - { - prevBucketPtr = &((*prevBucketPtr)->link); - } - - if (prevBucketPtr != prevBucketPtrCheck) - { - elog(LOG, "[%s] prevBucketPtr(%p) != prevBucketPtrCheck(%p)", - func, prevBucketPtr, prevBucketPtrCheck); - } - /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->freeList[freelist_idx].mutex); - /* copy key into record */ currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 15c7049a..651b3b59 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -116,8 +116,6 @@ typedef struct HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; -extern bool enable_hash_table_trace; - /* * prototypes for functions in dynahash.c */ From b46f0c53c128f1cd8df419f901eda90e12f55e4f Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 1 Jun 2021 17:35:42 +0800 Subject: [PATCH 381/578] release seqinfo before return error http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088349973&jump_count=1 --- src/gtm/main/gtm_seq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index d103ce2b..27e0a3e0 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -457,6 +457,10 @@ 
GTM_SeqOpen(GTM_SequenceKey seqkey, ereport(LOG, (EEXIST, errmsg("GTM_SeqOpen Sequence with key:%s found in hashtab", seqkey->gsk_key))); + /* + * Release sequence, otherwise the sequence will be busy when be dropped. + */ + seq_release_seqinfo(seqinfo); return EEXIST; } @@ -467,6 +471,10 @@ GTM_SeqOpen(GTM_SequenceKey seqkey, ereport(LOG, (EEXIST, errmsg("GTM_SeqOpen Sequence with key:%s found in store", seqkey->gsk_key))); + /* + * Release sequence, otherwise the sequence will be busy when be dropped. + */ + seq_release_seqinfo(seqinfo); return EEXIST; } #endif From 1915bc1979f759d363529a0521d5d02662dc6ed9 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 1 Jun 2021 17:43:48 +0800 Subject: [PATCH 382/578] parallel ddl, leader cn execute firstly --- src/backend/access/transam/xact.c | 23 + src/backend/catalog/dependency.c | 245 +++++- src/backend/catalog/heap.c | 15 +- src/backend/catalog/namespace.c | 2 + src/backend/catalog/objectaddress.c | 27 + src/backend/commands/dbcommands.c | 5 +- src/backend/commands/dropcmds.c | 138 ++- src/backend/commands/sequence.c | 69 +- src/backend/commands/tablecmds.c | 250 ++++-- src/backend/commands/tablespace.c | 67 +- src/backend/commands/user.c | 451 ++++++---- src/backend/commands/view.c | 74 +- src/backend/parser/parse_relation.c | 28 +- src/backend/parser/parse_utilcmd.c | 34 +- src/backend/pgxc/locator/redistrib.c | 4 + src/backend/pgxc/pool/execRemote.c | 608 ++++++------- src/backend/pgxc/pool/pgxcnode.c | 42 +- src/backend/tcop/postgres.c | 1 + src/backend/tcop/utility.c | 1209 +++++++++++++++++++++++--- src/backend/utils/misc/guc.c | 4 + src/include/catalog/dependency.h | 12 + src/include/catalog/objectaddress.h | 4 + src/include/commands/dbcommands.h | 2 +- src/include/commands/defrem.h | 4 +- src/include/commands/sequence.h | 6 + src/include/commands/tablecmds.h | 6 + src/include/commands/tablespace.h | 6 +- src/include/commands/user.h | 23 +- src/include/commands/view.h | 10 +- src/include/nodes/parsenodes.h | 10 + src/include/parser/parse_relation.h | 151 ++-- src/include/parser/parse_utilcmd.h | 11 +- src/include/pgxc/execRemote.h | 28 + src/include/pgxc/pgxcnode.h | 5 +- src/include/tcop/utility.h | 54 +- 35 files changed, 2755 insertions(+), 873 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index bc765d44..b6881ece 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -98,6 +98,7 @@ #include "pgxc/squeue.h" #include "postmaster/postmaster.h" #include "commands/extension.h" +#include "tcop/utility.h" #endif /* * User-tweakable parameters @@ -4501,6 +4502,9 @@ CommitTransactionCommand(void) } break; } +#ifdef __TBASE__ + leader_cn_executed_ddl = false; +#endif } /* @@ -4626,8 +4630,24 @@ AbortCurrentTransaction(void) * we get ROLLBACK. */ case TBLOCK_SUBINPROGRESS: + { + /* + * In parallel mode, leader cn execute before local cn, so when + * error occured, local cn will send ROLLBACK_SUBTXN to leader + * cn, we deal with subtxn abort there. 
+ */ + if (is_txn_has_parallel_ddl && !IS_PGXC_LOCAL_COORDINATOR) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (is_ddl_leader_cn(leaderCnHandle->nodename)) + { + break; + } + } AbortSubTransaction(); s->blockState = TBLOCK_SUBABORT; + } break; /* @@ -4654,6 +4674,9 @@ AbortCurrentTransaction(void) AbortCurrentTransaction(); break; } +#ifdef __TBASE__ + leader_cn_executed_ddl = false; +#endif } /* diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 737e549d..6b6d568b 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -109,6 +109,10 @@ #ifdef _MLS_ #include "utils/relcrypt.h" #endif +#ifdef __TBASE__ +#include "parser/scansup.h" +#include "catalog/catalog.h" +#endif /* * Deletion processing requires additional state for each ObjectAddress that * it's planning to delete. For simplicity and code-sharing we make the @@ -128,7 +132,6 @@ typedef struct #define DEPFLAG_EXTENSION 0x0010 /* reached via extension dependency */ #define DEPFLAG_REVERSE 0x0020 /* reverse internal/extension link */ - /* expansible list of ObjectAddresses */ struct ObjectAddresses { @@ -391,6 +394,246 @@ performDeletion(const ObjectAddress *object, heap_close(depRel, RowExclusiveLock); } +#ifdef __TBASE__ + +/* + * replace all invisible characters with ' ', + * leave no spaces next to ',' or '.' + */ +void +OmitqueryStringSpace(char *queryString) +{ + char *front = queryString; + char *last = queryString; + bool skip = false; + + if (queryString == NULL) + { + return; + } + + /* omit space */ + while (scanner_isspace(*front)) + { + ++front; + } + + while ((*front) != '\0') + { + if(scanner_isspace(*front) && skip == false) + { + while(scanner_isspace(*front)) + { + ++front; + } + + if ((*front) == ',' || (*front) == '.') + { + /* no need space */ + } + else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) + { + /* no need space */ + } + else + { + /* replace all invisible characters with ' ' */ + *last = ' '; + ++last; + continue; + } + } + + if ((*front) == '\"') + { + skip = (skip == true) ? 
false : true; + *last = *front; + ++front; + } + else + { + *last = *front; + ++front; + } + ++last; + } + *last = '\0'; +} + +/* + * remove object name in query string (replace with ' ') + */ +void +RemoveObjnameInQueryString(char *queryString, char *full_name) +{ + char *ptr = NULL; + char *tmp = NULL; + char *tmpStr = NULL; + char *start_ptr = queryString; + char *end_ptr = queryString + strlen(queryString) - 1; + int len = 0; + + tmpStr = queryString; + len = strlen(full_name); + while ((ptr = strstr(tmpStr, full_name)) != NULL) + { + /* is not independent string, skip */ + if (((ptr - 1) >= start_ptr && *(ptr - 1) != ' ' && (*(ptr - 1) != ',')) || + ((ptr + len) <= end_ptr && *(ptr + len) != ' ' && *(ptr + len) != ',' && *(ptr + len) != ';')) + { + if (((ptr - 1) >= start_ptr && *(ptr - 1) == '\"' && (ptr + len) <= end_ptr && *(ptr + len) == '\"') && + ((ptr - 2) < start_ptr || *(ptr - 2) != '.')) + { + *(ptr - 1) = ' '; + *(ptr + len) = ' '; + } + else + { + tmpStr = ptr + len; + continue; + } + } + + /* replace obj name with ' ' */ + MemSet(ptr, ' ', len); + + /* find the previous ',' */ + tmp = ptr - 1; + while (tmp >= start_ptr && *tmp == ' ') + { + tmp--; + } + + if (tmp >= start_ptr && *tmp == ',') + { + *tmp = ' '; + } + else + { + /* find the following ',' */ + tmp = ptr + len; + while (tmp <= end_ptr && *tmp == ' ') + { + tmp++; + } + + if (tmp <= end_ptr && *tmp == ',') + { + *tmp = ' '; + } + } + + tmpStr = ptr + len; + } +} + +/* + * Like RemoveRelations, implements drop relations. But the function + * only be used for local cn in parallel ddl mode. + */ +void +RemoveRelationsParallelMode(DropStmt *drop, ObjectAddresses* objects, + List *heap_list) +{ + int flags = 0; + int i = 0; + char relkind; + ListCell *lc; + Oid heap_oid; + + /* Determine required relkind */ + relkind = GetRemoveObjectRelkind(drop->removeType); + + if (drop->concurrent) + { + flags |= PERFORM_DELETION_CONCURRENTLY; + } + + /* + * In DROP INDEX, attempt to acquire lock on the parent table before + * locking the index. + */ + foreach(lc, heap_list) + { + heap_oid = lfirst_oid(lc); + if (flags & PERFORM_DELETION_CONCURRENTLY) + LockRelationOid(heap_oid, ShareUpdateExclusiveLock); + else + LockRelationOid(heap_oid, AccessExclusiveLock); + } + + for (i = 0; i < objects->numrefs; i++) + { + const ObjectAddress* thisobj = objects->refs + i; + Oid relOid = thisobj->objectId; + Relation child_rel = NULL; + + AcquireDeletionLock(thisobj, flags); + + /* could not drop child interval partition or its index */ + if (RELKIND_RELATION == relkind) + { + bool report_error = false; + + elog(DEBUG1, "drop table relOid: %u", relOid); + + if (RELKIND_RELATION == relkind) + { + child_rel = heap_open(relOid, NoLock); + } + else + { + child_rel = index_open(relOid, NoLock); + } + + if (RELATION_IS_CHILD(child_rel)) + { + report_error = true; + } + + if (RELKIND_RELATION == relkind) + { + heap_close(child_rel, NoLock); + } + else + { + index_close(child_rel, NoLock); + } + + if (report_error) + { + ; + } + } + } + + performMultipleDeletions(objects, drop->behavior, flags); +} + +/* + * Implements drop one or more objects such as schema/function/type. + * The function only be used for local cn in parallel ddl mode. 
+ */ +void +RemoveObjectsParallelMode(DropStmt *stmt, ObjectAddresses *objects) +{ + int i; + for (i = 0; i < objects->numrefs; i++) + { + const ObjectAddress* thisobj = objects->refs + i; + + if (IsSharedRelation(thisobj->classId)) + LockSharedObject(thisobj->classId, thisobj->objectId, + 0, AccessExclusiveLock); + else + LockDatabaseObject(thisobj->classId, thisobj->objectId, + 0, AccessExclusiveLock); + } + /* Here we really delete them. */ + performMultipleDeletions(objects, stmt->behavior, 0); +} +#endif + /* * performMultipleDeletions: Similar to performDeletion, but act on multiple * objects at once. diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 39a9c235..b79d4be3 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -116,6 +116,10 @@ #include "catalog/pgxc_key_values.h" #endif +#ifdef __TBASE__ +extern bool enable_parallel_ddl; +#endif + /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_heap_pg_class_oid = InvalidOid; Oid binary_upgrade_next_toast_pg_class_oid = InvalidOid; @@ -2806,8 +2810,15 @@ heap_drop_with_catalog(Oid relid) * shared-cache-inval notice that will make them update their index lists. */ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", relid); + +#ifdef __TBASE__ + if (enable_parallel_ddl && tuple == NULL) + { + elog(WARNING, "The tuple may have been dropped by parallel ddl"); + return; + } +#endif + if (((Form_pg_class) GETSTRUCT(tuple))->relispartition) { parentOid = get_partition_parent(relid); diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 3523874b..53be4dc7 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -385,7 +385,9 @@ RangeVarGetRelidExtended(const RangeVar *relation, LOCKMODE lockmode, if (!OidIsValid(relId)) AcceptInvalidationMessages(); else if (!nowait) + { LockRelationOid(relId, lockmode); + } else if (!ConditionalLockRelationOid(relId, lockmode)) { if (relation->schemaname) diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index d085bae1..7a2f0956 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -1164,6 +1164,33 @@ get_object_address(ObjectType objtype, Node *object, return address; } +#ifdef __TBASE__ +char *GetRemoveObjectName(ObjectType objtype, Node *object) +{ + switch (objtype) + { + case OBJECT_SCHEMA: + { + Value *strVal = (Value *)object; + return strVal(strVal); + } + case OBJECT_TYPE: + { + TypeName *typename = castNode(TypeName, object); + return TypeNameToString(typename); + } + case OBJECT_FUNCTION: + { + ObjectWithArgs *func = castNode(ObjectWithArgs, object); + return NameListToString(func->objname); + } + default: + break; + } + return NULL; +} +#endif + /* * Return an ObjectAddress based on a RangeVar and an object name. 
The * name of the relation identified by the RangeVar is prepended to the diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index ae8c18c1..070646fd 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -944,7 +944,7 @@ dropdb_prepare(const char *dbname, bool missing_ok) /* * DROP DATABASE */ -void +bool dropdb(const char *dbname, bool missing_ok) {// #lizard forgives Oid db_id; @@ -982,7 +982,7 @@ dropdb(const char *dbname, bool missing_ok) ereport(NOTICE, (errmsg("database \"%s\" does not exist, skipping", dbname))); - return; + return false; } } @@ -1156,6 +1156,7 @@ dropdb(const char *dbname, bool missing_ok) #endif } #endif + return true; } diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index 25dcf184..3d3c522a 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -29,13 +29,15 @@ #include "utils/syscache.h" #ifdef __TBASE__ #include "utils/rel.h" +#include "catalog/catalog.h" +#include "storage/lmgr.h" #endif #ifdef _MLS_ #include "utils/mls.h" #endif static void does_not_exist_skipping(ObjectType objtype, - Node *object); + Node *object, bool missing_ok); static bool owningrel_does_not_exist_skipping(List *object, const char **msg, char **name); static bool schema_does_not_exist_skipping(List *object, @@ -43,25 +45,19 @@ static bool schema_does_not_exist_skipping(List *object, static bool type_in_list_does_not_exist_skipping(List *typenames, const char **msg, char **name); - /* - * Drop one or more objects. - * - * We don't currently handle all object types here. Relations, for example, - * require special handling, because (for example) indexes have additional - * locking requirements. - * - * We look up all the objects first, and then delete them in a single - * performMultipleDeletions() call. This avoids unnecessary DROP RESTRICT - * errors if there are dependencies between them. + * Check object exists or not before remove. */ -void -RemoveObjects(DropStmt *stmt) -{// #lizard forgives +ObjectAddresses* PreCheckforRemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string, + bool need_unlock) +{ ObjectAddresses *objects; ListCell *cell1; + bool querystring_omit = false; objects = new_object_addresses(); + *need_drop = false; foreach(cell1, stmt->objects) { @@ -84,8 +80,23 @@ RemoveObjects(DropStmt *stmt) */ if (!OidIsValid(address.objectId)) { + char *relation_name = NULL; Assert(stmt->missing_ok); - does_not_exist_skipping(stmt->removeType, object); + + does_not_exist_skipping(stmt->removeType, object, missing_ok); + +#ifdef __TBASE__ + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + relation_name = GetRemoveObjectName(stmt->removeType, object); + RemoveObjnameInQueryString(query_string, relation_name); + } +#endif continue; } @@ -150,9 +161,36 @@ RemoveObjects(DropStmt *stmt) } } #endif + *need_drop = true; + if (need_unlock) + { + if (IsSharedRelation(address.classId)) + UnlockSharedObject(address.classId, address.objectId, 0, AccessExclusiveLock); + else + UnlockDatabaseObject(address.classId, address.objectId, 0, AccessExclusiveLock); + } + } + return objects; +} +/* + * Drop one or more objects. + * + * We don't currently handle all object types here. Relations, for example, + * require special handling, because (for example) indexes have additional + * locking requirements. 
+ * + * We look up all the objects first, and then delete them in a single + * performMultipleDeletions() call. This avoids unnecessary DROP RESTRICT + * errors if there are dependencies between them. + */ +void +RemoveObjects(DropStmt *stmt, bool missing_ok, bool *need_drop, char *query_string) +{ + ObjectAddresses *objects; - } + objects = PreCheckforRemoveObjects(stmt, missing_ok, need_drop, + query_string, false); /* Here we really delete them. */ performMultipleDeletions(objects, stmt->behavior, 0); @@ -276,16 +314,24 @@ type_in_list_does_not_exist_skipping(List *typenames, const char **msg, * get_object_address() in RemoveObjects would have thrown an ERROR. */ static void -does_not_exist_skipping(ObjectType objtype, Node *object) -{// #lizard forgives +does_not_exist_skipping(ObjectType objtype, Node *object, bool missing_ok) +{ const char *msg = NULL; char *name = NULL; char *args = NULL; + char *missmsg = "skipping"; + int elevel = NOTICE; + + if (!missing_ok) + { + missmsg = "can not skip in parallel ddl mode"; + elevel = ERROR; + } switch (objtype) { case OBJECT_ACCESS_METHOD: - msg = gettext_noop("access method \"%s\" does not exist, skipping"); + msg = gettext_noop("access method \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_TYPE: @@ -295,7 +341,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(typ->names, &msg, &name)) { - msg = gettext_noop("type \"%s\" does not exist, skipping"); + msg = gettext_noop("type \"%s\" does not exist, %s"); name = TypeNameToString(typ); } } @@ -303,58 +349,58 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_COLLATION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("collation \"%s\" does not exist, skipping"); + msg = gettext_noop("collation \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_CONVERSION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("conversion \"%s\" does not exist, skipping"); + msg = gettext_noop("conversion \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_SCHEMA: - msg = gettext_noop("schema \"%s\" does not exist, skipping"); + msg = gettext_noop("schema \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_STATISTIC_EXT: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("statistics object \"%s\" does not exist, skipping"); + msg = gettext_noop("statistics object \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSPARSER: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search parser \"%s\" does not exist, skipping"); + msg = gettext_noop("text search parser \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSDICTIONARY: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search dictionary \"%s\" does not exist, skipping"); + msg = gettext_noop("text search dictionary \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSTEMPLATE: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search template \"%s\" does not exist, skipping"); + msg = gettext_noop("text 
search template \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSCONFIGURATION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search configuration \"%s\" does not exist, skipping"); + msg = gettext_noop("text search configuration \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_EXTENSION: - msg = gettext_noop("extension \"%s\" does not exist, skipping"); + msg = gettext_noop("extension \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_FUNCTION: @@ -364,7 +410,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("function %s(%s) does not exist, skipping"); + msg = gettext_noop("function %s(%s) does not exist, %s"); name = NameListToString(owa->objname); args = TypeNameListToString(owa->objargs); } @@ -377,7 +423,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("aggregate %s(%s) does not exist, skipping"); + msg = gettext_noop("aggregate %s(%s) does not exist, %s"); name = NameListToString(owa->objname); args = TypeNameListToString(owa->objargs); } @@ -390,13 +436,13 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("operator %s does not exist, skipping"); + msg = gettext_noop("operator %s does not exist, %s"); name = NameListToString(owa->objname); } break; } case OBJECT_LANGUAGE: - msg = gettext_noop("language \"%s\" does not exist, skipping"); + msg = gettext_noop("language \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_CAST: @@ -405,7 +451,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) !type_in_list_does_not_exist_skipping(list_make1(lsecond(castNode(List, object))), &msg, &name)) { /* XXX quote or no quote? 
*/ - msg = gettext_noop("cast from type %s to type %s does not exist, skipping"); + msg = gettext_noop("cast from type %s to type %s does not exist, %s"); name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); args = TypeNameToString(lsecond_node(TypeName, castNode(List, object))); } @@ -414,7 +460,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_TRANSFORM: if (!type_in_list_does_not_exist_skipping(list_make1(linitial(castNode(List, object))), &msg, &name)) { - msg = gettext_noop("transform for type %s language \"%s\" does not exist, skipping"); + msg = gettext_noop("transform for type %s language \"%s\" does not exist, %s"); name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); args = strVal(lsecond(castNode(List, object))); } @@ -422,7 +468,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_TRIGGER: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("trigger \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("trigger \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); @@ -431,31 +477,31 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_POLICY: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("policy \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("policy \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); } break; case OBJECT_EVENT_TRIGGER: - msg = gettext_noop("event trigger \"%s\" does not exist, skipping"); + msg = gettext_noop("event trigger \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_RULE: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); } break; case OBJECT_FDW: - msg = gettext_noop("foreign-data wrapper \"%s\" does not exist, skipping"); + msg = gettext_noop("foreign-data wrapper \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_FOREIGN_SERVER: - msg = gettext_noop("server \"%s\" does not exist, skipping"); + msg = gettext_noop("server \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_OPCLASS: @@ -464,7 +510,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(opcname, &msg, &name)) { - msg = gettext_noop("operator class \"%s\" does not exist for access method \"%s\", skipping"); + msg = gettext_noop("operator class \"%s\" does not exist for access method \"%s\", %s"); name = NameListToString(opcname); args = strVal(linitial(castNode(List, object))); } @@ -476,14 +522,14 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(opfname, &msg, &name)) { - msg = gettext_noop("operator family \"%s\" does not exist for access method \"%s\", skipping"); + msg = gettext_noop("operator family 
\"%s\" does not exist for access method \"%s\", %s"); name = NameListToString(opfname); args = strVal(linitial(castNode(List, object))); } } break; case OBJECT_PUBLICATION: - msg = gettext_noop("publication \"%s\" does not exist, skipping"); + msg = gettext_noop("publication \"%s\" does not exist, %s"); name = strVal((Value *) object); break; default: @@ -492,7 +538,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) } if (!args) - ereport(NOTICE, (errmsg(msg, name))); + ereport(elevel, (errmsg(msg, name, missmsg))); else - ereport(NOTICE, (errmsg(msg, name, args))); + ereport(elevel, (errmsg(msg, name, args, missmsg))); } diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 3cb9c044..a248f5c0 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -28,6 +28,9 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/namespace.h" +#ifdef __TBASE__ +#include "catalog/pg_namespace.h" +#endif #include "catalog/objectaccess.h" #include "catalog/pg_sequence.h" #include "catalog/pg_type.h" @@ -155,13 +158,69 @@ static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity) extern bool g_GTM_skip_catalog; #endif +#ifdef __TBASE__ +extern bool is_txn_has_parallel_ddl; + +/* + * Check sequence exists or not + */ +bool PrecheckDefineSequence(CreateSeqStmt *seq) +{ + Oid seqoid; + Oid nspid; + bool need_send = true; + + if (g_GTM_skip_catalog && IS_PGXC_DATANODE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("skip_gtm_catalog can not be true on datanode."))); + } + + if (!g_GTM_skip_catalog) + { + /* Unlogged sequences are not implemented -- not clear if useful. */ + if (seq->sequence->relpersistence == RELPERSISTENCE_UNLOGGED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unlogged sequences are not supported"))); + + /* + * If if_not_exists was given and a relation with the same name already + * exists, bail out. (Note: we needn't check this when not if_not_exists, + * because DefineRelation will complain anyway.) 
+ */ + if (seq->if_not_exists) + { + nspid = RangeVarGetAndCheckCreationNamespace(seq->sequence, NoLock, + &seqoid); + if (OidIsValid(seqoid)) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists, skipping", + seq->sequence->relname))); + need_send = false; + } + UnlockDatabaseObject(NamespaceRelationId, nspid, 0, + AccessShareLock); + } + } + + return need_send; +} + /* * DefineSequence * Creates a new sequence relation */ ObjectAddress +DefineSequence(ParseState *pstate, CreateSeqStmt *seq, bool exists_ok) +#else +ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *seq) -{// #lizard forgives +#endif +{ FormData_pg_sequence seqform; FormData_pg_sequence_data seqdataform; bool need_seq_rewrite; @@ -214,6 +273,14 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) RangeVarGetAndCheckCreationNamespace(seq->sequence, NoLock, &seqoid); if (OidIsValid(seqoid)) { +#ifdef __TBASE__ + if (!exists_ok) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists", + seq->sequence->relname))); + else +#endif ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), errmsg("relation \"%s\" already exists, skipping", diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6f525512..eb5b2b6b 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -158,6 +158,9 @@ typedef struct OnCommitItem static List *on_commits = NIL; +#ifdef __TBASE__ +extern bool is_txn_has_parallel_ddl; +#endif /* * State information for ALTER TABLE @@ -1405,70 +1408,6 @@ DropErrorMsgWrongType(const char *relname, char wrongkind, char rightkind) #ifdef __TBASE__ -/* - * replace all invisible characters with ' ', - * leave no spaces next to ',' or '.' - */ -static void -OmitqueryStringSpace(char *queryString) -{ - char *front = queryString; - char *last = queryString; - bool skip = false; - - if (queryString == NULL) - { - return; - } - - /* omit space */ - while (scanner_isspace(*front)) - { - ++front; - } - - while ((*front) != '\0') - { - if(scanner_isspace(*front) && skip == false) - { - while(scanner_isspace(*front)) - { - ++front; - } - - if ((*front) == ',' || (*front) == '.') - { - /* no need space */ - } - else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) - { - /* no need space */ - } - else - { - /* replace all invisible characters with ' ' */ - *last = ' '; - ++last; - continue; - } - } - - if ((*front) == '\"') - { - skip = (skip == true) ? 
false : true; - *last = *front; - ++front; - } - else - { - *last = *front; - ++front; - } - ++last; - } - *last = '\0'; -} - /* * remove relname in query string (replace with ' ') */ @@ -1543,6 +1482,146 @@ RemoveRelnameInQueryString(char *queryString, RangeVar *rel) } } +char GetRemoveObjectRelkind(ObjectType removeType) +{ + char relkind; + switch (removeType) + { + case OBJECT_TABLE: + relkind = RELKIND_RELATION; + break; + + case OBJECT_INDEX: + relkind = RELKIND_INDEX; + break; + + case OBJECT_SEQUENCE: + relkind = RELKIND_SEQUENCE; + break; + + case OBJECT_VIEW: + relkind = RELKIND_VIEW; + break; + + case OBJECT_MATVIEW: + relkind = RELKIND_MATVIEW; + break; + + case OBJECT_FOREIGN_TABLE: + relkind = RELKIND_FOREIGN_TABLE; + break; + + default: + elog(ERROR, "unrecognized drop object type: %d", + (int)removeType); + relkind = 0; /* keep compiler quiet */ + break; + } + return relkind; +} + +/* + * PreCheckforRemoveRelation + * Check before implementing DROP TABLE, DROP INDEX, DROP SEQUENCE, + * DROP VIEW, DROP FOREIGN TABLE, DROP MATERIALIZED VIEW, return the + * object of existing relations. + */ +ObjectAddresses* PreCheckforRemoveRelation(DropStmt* drop, char* queryString, + bool *needDrop, List **heap_list) +{ + char relkind; + ListCell *cell; + LOCKMODE lockmode = AccessExclusiveLock; + bool querystring_omit = false; + ObjectAddresses* objects = NULL; + + /* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */ + if (drop->concurrent) + { + lockmode = ShareUpdateExclusiveLock; + Assert(drop->removeType == OBJECT_INDEX); + if (list_length(drop->objects) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DROP INDEX CONCURRENTLY does not support dropping multiple objects"))); + if (drop->behavior == DROP_CASCADE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DROP INDEX CONCURRENTLY does not support CASCADE"))); + } + + /* + * First we identify all the relations, then we delete them in a single + * performMultipleDeletions() call. This is to avoid unwanted DROP + * RESTRICT errors if one of the relations depends on another. + */ + + /* Determine required relkind */ + relkind = GetRemoveObjectRelkind(drop->removeType); + objects = new_object_addresses(); + *needDrop = false; + + foreach (cell, drop->objects) + { + RangeVar *rel = makeRangeVarFromNameList((List *) lfirst(cell)); + Oid relOid; + ObjectAddress obj; + struct DropRelationCallbackState state; + + /* + * These next few steps are a great deal like relation_openrv, but we + * don't bother building a relcache entry since we don't need it. + * + * Check for shared-cache-inval messages before trying to access the + * relation. This is needed to cover the case where the name + * identifies a rel that has been dropped and recreated since the + * start of our transaction: if we don't flush the old syscache entry, + * then we'll latch onto that entry and suffer an error later. + */ + AcceptInvalidationMessages(); + + /* Look up the appropriate relation using namespace search. */ + state.relkind = relkind; + state.heapOid = InvalidOid; + state.partParentOid = InvalidOid; + state.concurrent = drop->concurrent; + + relOid = RangeVarGetRelidExtended(rel, lockmode, true, false, + RangeVarCallbackForDropRelation, + (void*)&state); + /* Not there? 
*/ + if (!OidIsValid(relOid)) + { + DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); + if (!querystring_omit) + { + OmitqueryStringSpace(queryString); + querystring_omit = true; + } + + RemoveRelnameInQueryString(queryString, rel); + continue; + } + + /* OK, we're ready to delete this one */ + obj.classId = RelationRelationId; + obj.objectId = relOid; + obj.objectSubId = 0; + add_exact_object_address(&obj, objects); + *needDrop = true; + + if (OidIsValid(state.heapOid)) + { + LOCKMODE heapLockMode = AccessExclusiveLock; + if (state.concurrent) + heapLockMode = ShareUpdateExclusiveLock; + UnlockRelationOid(state.heapOid, heapLockMode); + *heap_list = list_append_unique_oid(*heap_list, state.heapOid); + } + UnlockRelationOid(relOid, lockmode); + } + return objects; +} #endif /* @@ -1661,7 +1740,12 @@ RemoveRelations(DropStmt *drop) /* Not there? */ if (!OidIsValid(relOid)) { - DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); + bool missing_ok = drop->missing_ok; +#ifdef __TBASE__ + if (IsConnFromCoord() && is_txn_has_parallel_ddl) + missing_ok = false; +#endif + DropErrorMsgNonExistent(rel, relkind, missing_ok); #ifdef __TBASE__ if (!querystring_omit) { @@ -1681,6 +1765,8 @@ RemoveRelations(DropStmt *drop) { bool report_error = false; + elog(LOG, "drop table relOid: %u", relOid); + if (RELKIND_RELATION == relkind) { child_rel = heap_open(relOid, NoLock); @@ -1761,6 +1847,19 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, */ if (relOid != oldRelOid && OidIsValid(state->heapOid)) { +#ifdef __TBASE__ + /* + * Unlock index before unlock table, or may cause deadlock + * when drop index and create same index executed concurrently. + */ + if (is_txn_has_parallel_ddl && relkind == RELKIND_INDEX) + { + Assert(OidIsValid(oldRelOid)); + UnlockRelationOid(oldRelOid, heap_lockmode); + elog(LOG, "Unlock index(name:oid):(%s:%u) before unlock table", + rel->relname, oldRelOid); + } +#endif UnlockRelationOid(state->heapOid, heap_lockmode); state->heapOid = InvalidOid; } @@ -1782,7 +1881,16 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); if (!HeapTupleIsValid(tuple)) + { +#ifdef __TBASE__ + if (is_txn_has_parallel_ddl && !state->concurrent) + { + elog(ERROR, "Can't get valid tuple, relation %s had been invalid" + "by other process in parallel ddl mode", rel->relname); + } +#endif return; /* concurrently dropped, so nothing to do */ + } classform = (Form_pg_class) GETSTRUCT(tuple); is_partition = classform->relispartition; @@ -1830,8 +1938,20 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, { state->heapOid = IndexGetRelation(relOid, true); if (OidIsValid(state->heapOid)) + { LockRelationOid(state->heapOid, heap_lockmode); } +#ifdef __TBASE__ + else + { + if (is_txn_has_parallel_ddl && !state->concurrent) + { + elog(ERROR, "Can't get valid tableoid, index %s had been invalid" + "by other process in parallel ddl mode", rel->relname); + } + } +#endif + } /* * Similarly, if the relation is a partition, we must acquire lock on its diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 1b208c6c..e88463ac 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -414,14 +414,70 @@ CreateTableSpace(CreateTableSpaceStmt *stmt) #endif /* HAVE_SYMLINK */ } +#ifdef __TBASE__ +bool +PreCheckforDropTableSpace(DropTableSpaceStmt *stmt) +{ +#ifdef HAVE_SYMLINK + char *tablespacename = 
stmt->tablespacename; + HeapScanDesc scandesc; + Relation rel; + HeapTuple tuple; + ScanKeyData entry[1]; + + /* + * Find the target tuple + */ + rel = heap_open(TableSpaceRelationId, RowExclusiveLock); + + ScanKeyInit(&entry[0], + Anum_pg_tablespace_spcname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(tablespacename)); + scandesc = heap_beginscan_catalog(rel, 1, entry); + tuple = heap_getnext(scandesc, ForwardScanDirection); + + if (!HeapTupleIsValid(tuple)) + { + if (!stmt->missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablespace \"%s\" does not exist", + tablespacename))); + } + else + { + ereport(NOTICE, + (errmsg("tablespace \"%s\" does not exist, skipping", + tablespacename))); + /* XXX I assume I need one or both of these next two calls */ + heap_endscan(scandesc); + heap_close(rel, RowExclusiveLock); + } + return false; + } + + heap_endscan(scandesc); + heap_close(rel, RowExclusiveLock); + +#else /* !HAVE_SYMLINK */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); +#endif + return true; +} +#endif + /* * Drop a table space * * Be careful to check that the tablespace is empty. */ -void -DropTableSpace(DropTableSpaceStmt *stmt) -{// #lizard forgives +bool +DropTableSpace(DropTableSpaceStmt *stmt, bool missing_ok) +{ #ifdef HAVE_SYMLINK char *tablespacename = stmt->tablespacename; HeapScanDesc scandesc; @@ -444,7 +500,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) if (!HeapTupleIsValid(tuple)) { - if (!stmt->missing_ok) + if (!missing_ok) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), @@ -460,7 +516,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) heap_endscan(scandesc); heap_close(rel, NoLock); } - return; + return false; } tablespaceoid = HeapTupleGetOid(tuple); @@ -573,6 +629,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif /* HAVE_SYMLINK */ + return true; } diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 038119db..e98cd209 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -1073,188 +1073,236 @@ AlterRoleSet(AlterRoleSetStmt *stmt) } -/* - * DROP ROLE - */ -void -DropRole(DropRoleStmt *stmt) -{// #lizard forgives - Relation pg_authid_rel, - pg_auth_members_rel; - ListCell *item; +void DropRoleByTuple(char *role, HeapTuple tuple, Relation pg_authid_rel, + Relation pg_auth_members_rel) +{ + HeapTuple tmp_tuple; + ScanKeyData scankey; + char *detail; + char *detail_log; + SysScanDesc sscan; + Oid roleid; + + roleid = HeapTupleGetOid(tuple); - if (!have_createrole_privilege()) + if (roleid == GetUserId()) ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied to drop role"))); + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("current user cannot be dropped"))); + if (roleid == GetOuterUserId()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("current user cannot be dropped"))); + if (roleid == GetSessionUserId()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("session user cannot be dropped"))); /* - * Scan the pg_authid relation to find the Oid of the role(s) to be - * deleted. 
- */ - pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); - pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); - - foreach(item, stmt->roles) - { - RoleSpec *rolspec = lfirst(item); - char *role; - HeapTuple tuple, - tmp_tuple; - ScanKeyData scankey; - char *detail; - char *detail_log; - SysScanDesc sscan; - Oid roleid; - - if (rolspec->roletype != ROLESPEC_CSTRING) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cannot use special role specifier in DROP ROLE"))); - role = rolspec->rolename; - - tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); - if (!HeapTupleIsValid(tuple)) - { - if (!stmt->missing_ok) - { - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("role \"%s\" does not exist", role))); - } - else - { - ereport(NOTICE, - (errmsg("role \"%s\" does not exist, skipping", - role))); - } - - continue; - } - - roleid = HeapTupleGetOid(tuple); - - if (roleid == GetUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("current user cannot be dropped"))); - if (roleid == GetOuterUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("current user cannot be dropped"))); - if (roleid == GetSessionUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("session user cannot be dropped"))); - - /* - * For safety's sake, we allow createrole holders to drop ordinary - * roles but not superuser roles. This is mainly to avoid the - * scenario where you accidentally drop the last superuser. - */ - if (((Form_pg_authid) GETSTRUCT(tuple))->rolsuper && - !superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to drop superusers"))); - - /* DROP hook for the role being removed */ - InvokeObjectDropHook(AuthIdRelationId, roleid, 0); + * For safety's sake, we allow createrole holders to drop ordinary + * roles but not superuser roles. This is mainly to avoid the + * scenario where you accidentally drop the last superuser. + */ + if (((Form_pg_authid) GETSTRUCT(tuple))->rolsuper && + !superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to drop superusers"))); - /* - * Lock the role, so nobody can add dependencies to her while we drop - * her. We keep the lock until the end of transaction. - */ - LockSharedObject(AuthIdRelationId, roleid, 0, AccessExclusiveLock); + /* DROP hook for the role being removed */ + InvokeObjectDropHook(AuthIdRelationId, roleid, 0); - /* Check for pg_shdepend entries depending on this role */ - if (checkSharedDependencies(AuthIdRelationId, roleid, - &detail, &detail_log)) - ereport(ERROR, - (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), - errmsg("role \"%s\" cannot be dropped because some objects depend on it", - role), - errdetail_internal("%s", detail), - errdetail_log("%s", detail_log))); + /* + * Lock the role, so nobody can add dependencies to her while we drop + * her. We keep the lock until the end of transaction. 
+ */ + LockSharedObject(AuthIdRelationId, roleid, 0, AccessExclusiveLock); + + /* Check for pg_shdepend entries depending on this role */ + if (checkSharedDependencies(AuthIdRelationId, roleid, + &detail, &detail_log)) + ereport(ERROR, + (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), + errmsg("role \"%s\" cannot be dropped because some objects depend on it", + role), + errdetail_internal("%s", detail), + errdetail_log("%s", detail_log))); #ifdef _MLS_ - if (true == mls_check_role_permission(roleid) || - true == cls_check_user_has_policy(roleid)) - { - elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", - role); - } + if (true == mls_check_role_permission(roleid) || + true == cls_check_user_has_policy(roleid)) + { + elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", + role); + } #endif - /* - * Remove the role from the pg_authid table - */ - CatalogTupleDelete(pg_authid_rel, &tuple->t_self); - - ReleaseSysCache(tuple); + /* + * Remove the role from the pg_authid table + */ + CatalogTupleDelete(pg_authid_rel, &tuple->t_self); - /* - * Remove role from the pg_auth_members table. We have to remove all - * tuples that show it as either a role or a member. - * - * XXX what about grantor entries? Maybe we should do one heap scan. - */ - ScanKeyInit(&scankey, - Anum_pg_auth_members_roleid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(roleid)); + ReleaseSysCache(tuple); - sscan = systable_beginscan(pg_auth_members_rel, AuthMemRoleMemIndexId, - true, NULL, 1, &scankey); + /* + * Remove role from the pg_auth_members table. We have to remove all + * tuples that show it as either a role or a member. + * + * XXX what about grantor entries? Maybe we should do one heap scan. + */ + ScanKeyInit(&scankey, + Anum_pg_auth_members_roleid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roleid)); + + sscan = systable_beginscan(pg_auth_members_rel, AuthMemRoleMemIndexId, + true, NULL, 1, &scankey); + + while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) + { + CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); + } - while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) - { - CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); - } + systable_endscan(sscan); - systable_endscan(sscan); + ScanKeyInit(&scankey, + Anum_pg_auth_members_member, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roleid)); - ScanKeyInit(&scankey, - Anum_pg_auth_members_member, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(roleid)); + sscan = systable_beginscan(pg_auth_members_rel, AuthMemMemRoleIndexId, + true, NULL, 1, &scankey); - sscan = systable_beginscan(pg_auth_members_rel, AuthMemMemRoleIndexId, - true, NULL, 1, &scankey); + while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) + { + CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); + } - while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) - { - CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); - } + systable_endscan(sscan); - systable_endscan(sscan); + /* + * Remove any comments or security labels on this role. + */ + DeleteSharedComments(roleid, AuthIdRelationId); + DeleteSharedSecurityLabel(roleid, AuthIdRelationId); - /* - * Remove any comments or security labels on this role. - */ - DeleteSharedComments(roleid, AuthIdRelationId); - DeleteSharedSecurityLabel(roleid, AuthIdRelationId); + /* + * Remove settings for this role. + */ + DropSetting(InvalidOid, roleid); - /* - * Remove settings for this role. 
- */ - DropSetting(InvalidOid, roleid); + /* + * Advance command counter so that later iterations of this loop will + * see the changes already made. This is essential if, for example, + * we are trying to drop both a role and one of its direct members --- + * we'll get an error if we try to delete the linking pg_auth_members + * tuple twice. (We do not need a CCI between the two delete loops + * above, because it's not allowed for a role to directly contain + * itself.) + */ + CommandCounterIncrement(); + + if (POOL_CONN_RELEASE_SUCCESS != PoolManagerClosePooledConnections(NULL, role)) + { + elog(ERROR, "failed to close pooled connection for role:%s", role); + } +} - /* - * Advance command counter so that later iterations of this loop will - * see the changes already made. This is essential if, for example, - * we are trying to drop both a role and one of its direct members --- - * we'll get an error if we try to delete the linking pg_auth_members - * tuple twice. (We do not need a CCI between the two delete loops - * above, because it's not allowed for a role to directly contain - * itself.) - */ - CommandCounterIncrement(); +#ifdef __TBASE__ +bool PreCheckDropRole(DropRoleStmt *stmt, char *query_string, + List **exist_roles) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + bool need_drop = false; + bool querystring_omit = false; + + if (!have_createrole_privilege()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to drop role"))); + + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, stmt->roles) + { + RoleSpec *rolspec = lfirst(item); + char *role; + HeapTuple tuple; + + if (rolspec->roletype != ROLESPEC_CSTRING) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot use special role specifier in DROP ROLE"))); + role = rolspec->rolename; + + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + if (!stmt->missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", role))); + } + else + { + ereport(NOTICE, + (errmsg("role \"%s\" does not exist, skipping", + role))); + } + + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + RemoveObjnameInQueryString(query_string, role); + } + + continue; + } + ReleaseSysCache(tuple); + *exist_roles = lappend(*exist_roles, role); + need_drop = true; + } + heap_close(pg_auth_members_rel, RowExclusiveLock); + heap_close(pg_authid_rel, RowExclusiveLock); + return need_drop; +} - if (POOL_CONN_RELEASE_SUCCESS != PoolManagerClosePooledConnections(NULL, role)) - { - elog(ERROR, "failed to close pooled connection for role:%s", role); - } - } +void DropRoleParallelMode(List *role_list) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + + /* + * Scan the pg_authid relation to find the Oid of the role(s) to be + * deleted. 
+ */ + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, role_list) + { + char *role; + HeapTuple tuple; + + role = lfirst(item); + /* tuple will be release by DropRoleByTuple below */ + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("Precheck role \"%s\" existed, but now does not exist", role))); + } + DropRoleByTuple(role, tuple, pg_authid_rel, pg_auth_members_rel); + } /* * Now we can clean up; but keep locks until commit. @@ -1263,6 +1311,87 @@ DropRole(DropRoleStmt *stmt) heap_close(pg_authid_rel, NoLock); } +#endif + +/* + * DROP ROLE + */ +bool +DropRole(DropRoleStmt *stmt, bool missing_ok, char *query_string) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + bool querystring_omit = false; + bool need_drop = false; + + if (!have_createrole_privilege()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to drop role"))); + + /* + * Scan the pg_authid relation to find the Oid of the role(s) to be + * deleted. + */ + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, stmt->roles) + { + RoleSpec *rolspec = lfirst(item); + HeapTuple tuple; + char *role; + + if (rolspec->roletype != ROLESPEC_CSTRING) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot use special role specifier in DROP ROLE"))); + role = rolspec->rolename; + + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + if (!missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", role))); + } + else + { + ereport(NOTICE, + (errmsg("role \"%s\" does not exist, skipping", + role))); + } + + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + RemoveObjnameInQueryString(query_string, role); + } + + continue; + } + + DropRoleByTuple(role, tuple, pg_authid_rel, pg_auth_members_rel); + + need_drop = true; + } + + /* + * Now we can clean up; but keep locks until commit. + */ + heap_close(pg_auth_members_rel, NoLock); + heap_close(pg_authid_rel, NoLock); + + return need_drop; +} + /* * Rename role */ diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index b2a9ebc6..30862aa4 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -416,26 +416,18 @@ UpdateRangeTableOfViewParse(Oid viewOid, Query *viewParse) return viewParse; } - /* - * DefineView - * Execute a CREATE VIEW command. + * MakeViewParse + * Run parse analysis to convert the raw parse tree to a Query. Note this + * also acquires sufficient locks on the source table(s). */ -ObjectAddress -DefineView(ViewStmt *stmt, const char *queryString, +Query * +MakeViewParse(ViewStmt* stmt, const char* query_string, int stmt_location, int stmt_len) -{// #lizard forgives +{ + Query *viewParse = NULL; RawStmt *rawstmt; - Query *viewParse; - RangeVar *view; - ListCell *cell; - bool check_option; - ObjectAddress address; - /* - * Run parse analysis to convert the raw parse tree to a Query. Note this - * also acquires sufficient locks on the source table(s). 
- * * Since parse analysis scribbles on its input, copy the raw parse tree; * this ensures we don't corrupt a prepared statement, for example. */ @@ -443,9 +435,59 @@ DefineView(ViewStmt *stmt, const char *queryString, rawstmt->stmt = (Node *) copyObject(stmt->query); rawstmt->stmt_location = stmt_location; rawstmt->stmt_len = stmt_len; + viewParse = parse_analyze(rawstmt, query_string, NULL, 0, NULL); + return viewParse; +} + +#ifdef __TBASE__ +/* + * IsViewTemp + * Check whethe we need a temporary view. + */ +bool +IsViewTemp(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len, + List **relation_list) +{ + Query *viewParse = NULL; + RangeVar *view = NULL; + + + /* don't corrupt original command */ + view = (RangeVar*)copyObject(stmt->view); + viewParse = MakeViewParse(stmt, query_string, stmt_location, stmt_len); - viewParse = parse_analyze(rawstmt, queryString, NULL, 0, NULL); + /* + * If the user didn't explicitly ask for a temporary view, check whether + * we need one implicitly. We allow TEMP to be inserted automatically as + * long as the CREATE command is consistent with that --- no explicit + * schema name. + */ + if (view->relpersistence == RELPERSISTENCE_PERMANENT && + CheckAndGetRelation(viewParse, relation_list)) + { + view->relpersistence = RELPERSISTENCE_TEMP; + } + + return view->relpersistence == RELPERSISTENCE_TEMP; +} +#endif + +/* + * DefineView + * Execute a CREATE VIEW command. + */ +ObjectAddress +DefineView(ViewStmt *stmt, const char *queryString, + int stmt_location, int stmt_len) +{ + Query *viewParse; + RangeVar *view; + ListCell *cell; + bool check_option; + ObjectAddress address; + viewParse = MakeViewParse(stmt, queryString, stmt_location, stmt_len); /* * The grammar should ensure that the result is a single SELECT Query. * However, it doesn't forbid SELECT INTO, so we have to check for that. 
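Note on the commands/view.c hunk above: parse analysis is split out of DefineView into MakeViewParse so that IsViewTemp can run it early, decide whether the view is (or must implicitly become) temporary, and hand back the OIDs of the relations it references. A minimal sketch of how a coordinator-side caller could use it follows; the wrapper name dispatch_create_view and both branch bodies are assumptions for illustration only, not part of this patch.

    /* Illustrative sketch, not part of the patch. */
    static void
    dispatch_create_view(ViewStmt *stmt, const char *queryString,
                         int stmt_location, int stmt_len)
    {
        List *rels = NIL;

        if (IsViewTemp(stmt, queryString, stmt_location, stmt_len, &rels))
        {
            /* Temp (or implicitly temp) view: keep it on this coordinator. */
        }
        else
        {
            /* Permanent view: the statement can be sent to the other nodes. */
        }

        /* rels holds the relation OIDs collected while walking the query. */
        list_free(rels);
    }
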
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 24196b21..10b20a9e 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -71,6 +71,12 @@ static int specialAttNum(const char *attname); #endif static bool isQueryUsingTempRelation_walker(Node *node, void *context); +#ifdef __TBASE__ +typedef struct +{ + List *related_oids; /* the related tableoid list */ +} ViewRelatedContext; +#endif /* * refnameRangeTblEntry @@ -3425,6 +3431,19 @@ errorMissingColumn(ParseState *pstate, } } +#ifdef __TBASE__ +bool +CheckAndGetRelation(Query *query, List **relation_list) +{ + bool tmp = false; + ViewRelatedContext context; + + context.related_oids = NIL; + tmp = isQueryUsingTempRelation_walker((Node *) query, &context); + *relation_list = context.related_oids; + return tmp; +} +#endif /* * Examine a fully-parsed query, and return TRUE iff any relation underlying @@ -3455,8 +3474,15 @@ isQueryUsingTempRelation_walker(Node *node, void *context) { Relation rel = heap_open(rte->relid, AccessShareLock); char relpersistence = rel->rd_rel->relpersistence; - heap_close(rel, AccessShareLock); +#ifdef __TBASE__ + if (context) + { + ViewRelatedContext *vrContext = (ViewRelatedContext *)context; + vrContext->related_oids = lappend_oid(vrContext->related_oids, + RelationGetRelid(rel)); + } +#endif if (relpersistence == RELPERSISTENCE_TEMP) return true; } diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 88c6077e..08cb09e6 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -218,7 +218,11 @@ static char * ChooseSerialName(const char *relname, const char *colname, * then expand those into multiple IndexStmt blocks. * - thomas 1997-12-02 */ -#ifdef XCP +#ifdef __TBASE__ +List * +transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute, Oid *nspaceid, bool existsok) +#elif XCP List * transformCreateStmt(CreateStmt *stmt, const char *queryString, bool autodistribute) @@ -312,16 +316,36 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) &existing_relid); cancel_parser_errposition_callback(&pcbstate); +#ifdef __TBASE__ + if (nspaceid) + *nspaceid = namespaceid; +#endif + /* * If the relation already exists and the user specified "IF NOT EXISTS", * bail out with a NOTICE. */ if (stmt->if_not_exists && OidIsValid(existing_relid)) { + if (existsok) + { ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), errmsg("relation \"%s\" already exists, skipping", stmt->relation->relname))); + } + else + { + /* + * In PARALLEL DDL mode, remote node emit error if relation + * already exists to keep consistency with local cn. + */ + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists, skipping", + stmt->relation->relname))); + } + return NIL; } @@ -3317,6 +3341,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, RangeTblEntry *rte; #ifdef __TBASE__ List *createlist = NULL; + List *partlist = NIL; #endif /* * We must not scribble on the passed-in AlterTableStmt, so copy it. 
(This @@ -3550,7 +3575,14 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, createpart->partbound = NULL; createpart->partspec = NULL; +#ifdef __TBASE__ + partlist = transformCreateStmt(createpart, + queryString, true, + NULL, true); + createlist = list_concat(createlist, partlist); +#else createlist = list_concat(createlist, transformCreateStmt(createpart, queryString, true)); +#endif } } else diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c index 6a011400..ad13088a 100644 --- a/src/backend/pgxc/locator/redistrib.c +++ b/src/backend/pgxc/locator/redistrib.c @@ -969,7 +969,11 @@ distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes) /* Redistribution operations only concern Datanodes */ step->exec_type = EXEC_ON_DATANODES; +#ifdef __TBASE__ + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step->sql_statement); pfree(step); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 17e6f838..1bb82166 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -123,8 +123,7 @@ static bool temp_object_included = false; static abort_callback_type dbcleanup_info = { NULL, NULL }; static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, - GlobalTransactionId gxid, bool need_tran_block, - bool readOnly, char node_type); + GlobalTransactionId gxid, bool need_tran_block, bool readOnly); static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate, ExecNodes *exec_nodes, @@ -3424,16 +3423,14 @@ is_data_node_ready(PGXCNodeHandle * conn) return false; } - /* * Send BEGIN command to the Datanodes or Coordinators and receive responses. * Also send the GXID for the transaction. */ static int pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, - GlobalTransactionId gxid, bool need_tran_block, - bool readOnly, char node_type) -{// #lizard forgives + GlobalTransactionId gxid, bool need_tran_block, bool readOnly) +{ #define SET_CMD_LENGTH 128 int i; struct timeval *timeout = NULL; @@ -4725,7 +4722,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) { int conn_count = 0; - if (!enable_parallel_ddl || !is_txn_has_parallel_ddl) + if (!is_txn_has_parallel_ddl) { /* normal cases */ conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); @@ -5899,7 +5896,8 @@ DataNodeCopyBegin(RemoteCopyData *rcstate) gxid = GetCurrentTransactionId(); /* Start transaction on connections where it is not started */ - if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) + + if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6718,6 +6716,211 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, return true; } +/* + * Get snapshot and gxid for remote utility. 
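+ * Fills *gxid and *snapshot for the utility statement carried by "node":
+ * statements that do not need a transaction (e.g. ROLLBACK, SET) skip the
+ * gxid, and on a non-local coordinator the gxid is reset to
+ * InvalidTransactionId, since distributed DDL is dispatched only from the
+ * requesting coordinator.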
+ */ +void +GetGlobInfoForRemoteUtility(RemoteQuery *node, GlobalTransactionId *gxid, + Snapshot *snapshot) +{ + bool utility_need_transcation = true; + +#ifdef __TBASE__ + /* Some DDL such as ROLLBACK, SET does not need transaction */ + utility_need_transcation = + (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); + + if (utility_need_transcation) +#endif + { + elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); + *gxid = GetCurrentTransactionId(); + } + + if (ActiveSnapshotSet()) + *snapshot = GetActiveSnapshot(); + +#ifdef __TBASE__ + if (utility_need_transcation) +#endif + { + if (!GlobalTransactionIdIsValid(*gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(!IS_PGXC_LOCAL_COORDINATOR) + { + /* + * Distributed DDLs only dispatch from the requested coordinator, thus + * we skip sending gxid to avoid cycling. + * + * Note: except for 'set_config_option'. + */ + *gxid = InvalidTransactionId; + } + +#endif +} + +/* + * Send snapshot/cmdid/query to remote node. + */ +void +SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, + CommandId cid, Snapshot snapshot) +{ + if (conn->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(conn); + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to %s", conn->nodename))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to %s", conn->nodename))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to %s", conn->nodename))); + } +} + +/* + * Check response of remote connection. + */ +bool +CheckRemoteRespond(PGXCNodeHandle *conn, ResponseCombiner *combiner, + int *index, int *conn_count) +{ + int res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + (*index)++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if ((*index) < --(*conn_count)) + return true; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from %s pid %d", + conn->nodename, conn->backend_pid))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from %s pid %d", + conn->nodename, conn->backend_pid))); + } + return false; +} + +/* + * Receive remote response and chek receive status. + */ +void RemoteReceiveAndCheck(int conn_count, PGXCNodeHandle **conns, + ResponseCombiner *combiner) +{ + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ + while (conn_count > 0) + { + int i = 0; + bool remote_ready = false; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(conn_count, + conns, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? 
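+		 * For now we simply stop polling; any error text collected in the
+		 * combiner is reported later by the caller via pgxc_node_report_error().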
+ */ + break; + } + + while (i < conn_count) + { + PGXCNodeHandle *conn = NULL; + if (remote_ready) + { + conns[i] = conns[conn_count]; + } + conn = conns[i]; + remote_ready = CheckRemoteRespond(conn, combiner, &i, &conn_count); + } + } +} + +#ifdef __TBASE__ +/* + * Send ddl to leader cn, the function only be invoked + * in parallel ddl mode. + */ +void +LeaderCnExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ResponseCombiner *combiner, + bool need_tran_block, + GlobalTransactionId gxid, + Snapshot snapshot, + CommandId cid) +{ + int cn_cout = 1; + char *init_str = PGXCNodeGetSessionParamStr(); + if (init_str) + { + pgxc_node_set_query(leader_cn_conn, init_str); + } + + SetPlpgsqlTransactionBegin(leader_cn_conn); + if (pgxc_node_begin(cn_cout, &leader_cn_conn, gxid, + need_tran_block, false)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on leader coordinator"))); + } + + /* Send other txn related messages to leader cn */ + SendTxnInfo(node, leader_cn_conn, cid, snapshot); + + RemoteReceiveAndCheck(cn_cout, &leader_cn_conn, combiner); +} + /* * Execute utility statement on multiple Datanodes * It does approximately the same as @@ -6730,8 +6933,11 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, * like allocating tuple slots. */ void +ExecRemoteUtility(RemoteQuery *node, PGXCNodeHandle *leader_cn_conn, ParallelDDLRemoteType type) +#else ExecRemoteUtility(RemoteQuery *node) -{// #lizard forgives +#endif +{ RemoteQueryState *remotestate; ResponseCombiner *combiner; bool force_autocommit = node->force_autocommit; @@ -6739,13 +6945,12 @@ ExecRemoteUtility(RemoteQuery *node) GlobalTransactionId gxid = InvalidGlobalTransactionId; Snapshot snapshot = NULL; PGXCNodeAllHandles *pgxc_connections; - int co_conn_count; - int dn_conn_count; + int co_conn_count = 0; + int dn_conn_count = 0; bool need_tran_block; ExecDirectType exec_direct_type = node->exec_direct_type; int i; CommandId cid = GetCurrentCommandId(true); - bool utility_need_transcation = true; if (!force_autocommit) RegisterTransactionLocalNode(true); @@ -6761,6 +6966,13 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type, exec_direct_type != EXEC_DIRECT_UTILITY); +#ifdef __TBASE__ + if (type == EXCLUED_LEADER_DDL) + { + delete_leadercn_handle(pgxc_connections, leader_cn_conn); + } +#endif + dn_conn_count = pgxc_connections->dn_conn_count; co_conn_count = pgxc_connections->co_conn_count; @@ -6792,172 +7004,31 @@ ExecRemoteUtility(RemoteQuery *node) "transaction block"))); } -#ifdef __TBASE__ - /* Some DDL such as ROLLBACK, SET does not need transaction */ - utility_need_transcation = - (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); - - if (utility_need_transcation) -#endif - { - elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); - gxid = GetCurrentTransactionId(); - } - - if (ActiveSnapshotSet()) - snapshot = GetActiveSnapshot(); + GetGlobInfoForRemoteUtility(node, &gxid, &snapshot); #ifdef __TBASE__ - if (utility_need_transcation) -#endif + if (type == ONLY_LEADER_DDL) { - if (!GlobalTransactionIdIsValid(gxid)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to get next transaction ID"))); - } - -#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(!IS_PGXC_LOCAL_COORDINATOR) - { - /* - * Distributed DDLs only dispatch from the requested coordinator, thus - * we skip sending gxid to avoid cycling. 
- * - * Note: except for 'set_config_option'. - */ - gxid = InvalidTransactionId; - } - -#endif - -#ifdef __TBASE__ - /* Set node begin transaction in plpgsql function for CN/DN */ - for (i = 0; i < dn_conn_count; i++) - { - SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); - } - - for (i = 0; i < co_conn_count; i++) - { - SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); - } -#endif - - /* - * DDL will firstly be executed on coordinators then datanodes - * which will avoid deadlocks in cluster. - * Let us assume that user sql and ddl hold conflict locks, - * then there will be two situations: - * 1. The coordinator is not locked, user sql will see datanodes with no lock. - * 2. The coordinator is locked, user sql will wait for ddl to complete. - * - * Send BEGIN control command to all coordinator nodes - */ - if (pgxc_node_begin(co_conn_count, - pgxc_connections->coord_handles, - gxid, - need_tran_block, - false, - PGXC_NODE_COORDINATOR)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on coordinators"))); - } - - /* Send other txn related messages to coordinator nodes */ - for (i = 0; i < co_conn_count; i++) - { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + LeaderCnExecRemoteUtility(node, leader_cn_conn, combiner, + need_tran_block, gxid, snapshot, cid); + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); + return; } - - /* - * Stop if all commands are completed or we got a data row and - * initialized state node for subsequent invocations - */ - while (co_conn_count > 0) + else { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(co_conn_count, - pgxc_connections->coord_handles, - NULL)) + /* Set node begin transaction in plpgsql function for CN/DN */ + for (i = 0; i < dn_conn_count; i++) { - /* - * Got error - * TODO(Tbase): How do we check the error here? 
- */ - break; - } - - while (i < co_conn_count) + SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); + } + + for (i = 0; i < co_conn_count; i++) { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - int res = handle_response(conn, combiner); - - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, - pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - } - } + SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); + } + } +#endif /* * DDL will firstly be executed on coordinators then datanodes @@ -6973,8 +7044,7 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections->coord_handles, gxid, need_tran_block, - false, - PGXC_NODE_COORDINATOR)) + false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6985,94 +7055,24 @@ ExecRemoteUtility(RemoteQuery *node) for (i = 0; i < co_conn_count; i++) { PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + SendTxnInfo(node, conn, cid, snapshot); } /* * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations */ - while (co_conn_count > 0) - { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(co_conn_count, + RemoteReceiveAndCheck(co_conn_count, pgxc_connections->coord_handles, - NULL)) - { - /* - * Got error - * TODO(Tbase): How do we check the error here? 
- */ - break; - } + combiner); - while (i < co_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - int res = handle_response(conn, combiner); - - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, - pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL && combiner && combiner->errorMessage) { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - } + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); } +#endif /* * Send BEGIN control command to all data nodes @@ -7081,8 +7081,7 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections->datanode_handles, gxid, need_tran_block, - false, - PGXC_NODE_DATANODE)) + false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -7093,98 +7092,12 @@ ExecRemoteUtility(RemoteQuery *node) for (i = 0; i < dn_conn_count; i++) { PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - - if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send snapshot to Datanodes"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to Datanodes"))); - } + SendTxnInfo(node, conn, cid, snapshot); } - - /* Make the same for data nodes */ - while (dn_conn_count > 0) - { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(dn_conn_count, + RemoteReceiveAndCheck(dn_conn_count, pgxc_connections->datanode_handles, - NULL)) - { - /* - * Got error - * TODO(Tbase): How do we check the error here? - */ - break; - } - - /* - * Handle input from the data nodes. We do not expect data nodes - * returning tuples when running utility command. If we got EOF, move - * to the next connection, will receive more data on the next - * iteration. 
- */ - while (i < dn_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - int res = handle_response(conn, combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --dn_conn_count) - pgxc_connections->datanode_handles[i] = - pgxc_connections->datanode_handles[dn_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - } - } + combiner); /* * We have processed all responses from nodes and if we have error message @@ -9086,7 +8999,7 @@ ExecRemoteQuery(PlanState *pstate) combiner->current_conn = 0; #endif if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) + step->read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -9155,8 +9068,9 @@ ExecRemoteQuery(PlanState *pstate) #ifdef __TBASE__ connections[i]->recv_datarows = 0; #endif + if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) + step->read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -10593,8 +10507,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) { PGXCNodeHandle *connection = combiner->connections[i]; - if (pgxc_node_begin(1, &connection, gxid, true, - is_read_only, PGXC_NODE_DATANODE)) + if (pgxc_node_begin(1, &connection, gxid, true, is_read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -12641,8 +12554,7 @@ ExecRemoteDML(ModifyTableState *mtstate, ItemPointer tupleid, HeapTuple oldtuple { gxid = GetCurrentTransactionIdIfAny(); - if (pgxc_node_begin(1, &connections[i], gxid, true, - false, PGXC_NODE_DATANODE)) + if (pgxc_node_begin(1, &connections[i], gxid, true, false)) { elog(ERROR, "Could not begin transaction on datanode in ExecRemoteDML, nodeid:%d.", connections[i]->nodeid); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 6db7d43b..5424a200 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2701,6 +2701,7 @@ pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query, { int strLen; int msgLen; + /* * Its appropriate to send ROLLBACK commands on a failed connection, but * for everything else we expect the connection to be in a sane state @@ -5829,23 +5830,22 @@ PGXCGetAllDnOid(Oid *nodelist) /* * Return the name of ascii-minimized coordinator as ddl leader cn */ -inline char* +PGXCNodeHandle* find_ddl_leader_cn(void) { int i = 0; - char* result = NULL; + char *name = NULL; + PGXCNodeHandle *result = NULL; for (i = 0; i < NumCoords; i++) { - if(result == NULL || strcmp(co_handles[i].nodename, result) < 0) + if(name == NULL || strcmp(co_handles[i].nodename, name) < 0) { - result = co_handles[i].nodename; + name = 
co_handles[i].nodename; + result = &co_handles[i]; } } - if(result) - result = pstrdup(result); - return result; } @@ -5866,6 +5866,34 @@ is_pgxc_handles_init() { return (dn_handles != NULL && co_handles != NULL); } + +/* + * Remove leader_cn_handle from pgxc_connections + */ +void +delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, + PGXCNodeHandle* leader_cn_handle) +{ + int co_conn_count = 0; + int i = 0; + + if (!pgxc_connections || !leader_cn_handle) + return; + + co_conn_count = pgxc_connections->co_conn_count; + for (i = 0; i < co_conn_count; i++) + { + if (pgxc_connections->coord_handles[i] == leader_cn_handle) + { + if (i+1 < co_conn_count) + pgxc_connections->coord_handles[i] = pgxc_connections->coord_handles[i+1]; + else + pgxc_connections->coord_handles[i] = NULL; + pgxc_connections->co_conn_count--; + break; + } + } +} #endif /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 1ea3d3ea..a607a515 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5195,6 +5195,7 @@ PostgresMain(int argc, char *argv[], #ifdef __TBASE__ /* Clear parallel DDL flag */ is_txn_has_parallel_ddl = false; + leader_cn_executed_ddl = false; #endif /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 73736d71..a984b9e6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -17,6 +17,7 @@ *------------------------------------------------------------------------- */ #include "postgres.h" +#include "stdio.h" #include "access/htup_details.h" #include "access/reloptions.h" @@ -104,6 +105,8 @@ #include "utils/ruleutils.h" #include "utils/memutils.h" #include "catalog/index.h" +#include "catalog/pg_namespace.h" +#include "storage/lmgr.h" #endif #ifdef __AUDIT__ @@ -152,8 +155,12 @@ extern bool g_GTM_skip_catalog; bool is_txn_has_parallel_ddl; bool enable_parallel_ddl; +bool leader_cn_executed_ddl; + #endif +static RemoteQueryExecType GetRenameExecType(RenameStmt *stmt, bool *is_temp); + #endif /* Hook for plugins to get control in ProcessUtility() */ @@ -665,10 +672,14 @@ ProcessUtilityPre(PlannedStmt *pstmt, /* Clean also remote Coordinators */ snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); + ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false, false); - if (!stmt->prepare) + /* + * parallel ddl mode, we send drop db prepare in standard_ProcessUtility + */ + if (!stmt->prepare && !is_txn_has_parallel_ddl) { /* Lock database and check the constraints before we actually dropping */ if (stmt->missing_ok) @@ -779,7 +790,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && !isRestoreMode && IS_PGXC_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -790,7 +803,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && IS_PGXC_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -813,7 +828,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && IS_PGXC_COORDINATOR) { - 
ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -844,33 +861,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, if (IS_PGXC_LOCAL_COORDINATOR) { - /* - * Get the necessary details about the relation before we - * run ExecRenameStmt locally. Otherwise we may not be able - * to look-up using the old relation name. - */ - if (stmt->relation) - { - /* - * If the table does not exist, don't send the query to - * the remote nodes. The local node will eventually - * report an error, which is then sent back to the - * client. - */ - Oid relid = RangeVarGetRelid(stmt->relation, NoLock, true); - - if (OidIsValid(relid)) - exec_type = ExecUtilityFindNodes(stmt->renameType, - relid, - &is_temp); - else - exec_type = EXEC_ON_NONE; - } - else - exec_type = ExecUtilityFindNodes(stmt->renameType, - InvalidOid, - &is_temp); + exec_type = GetRenameExecType(stmt, &is_temp); #ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + exec_type = EXEC_ON_NONE; /* clean connections of the old name first. */ if (OBJECT_DATABASE == stmt->renameType) { @@ -879,8 +873,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, DropDBCleanConnection(stmt->subname); /* Clean also remote nodes */ sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", stmt->subname); - ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, query, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } #endif } @@ -899,7 +894,27 @@ ProcessUtilityPre(PlannedStmt *pstmt, * it will cause a deadlock in the cluster at Datanode levels. */ if (!IsConnFromCoord()) + { +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle* leaderCnHandle = find_ddl_leader_cn(); + RemoteQueryExecType execType = ((RemoteQuery *) parsetree)->exec_type; + if ((execType == EXEC_ON_ALL_NODES || execType == EXEC_ON_COORDS)) + { + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + Assert(leader_cn_executed_ddl); + } + ExecRemoteUtility((RemoteQuery *) parsetree, + leaderCnHandle, EXCLUED_LEADER_DDL); + } + else + ExecRemoteUtility((RemoteQuery *) parsetree, + NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility((RemoteQuery *) parsetree); +#endif + } break; case T_CleanConnStmt: @@ -1245,7 +1260,6 @@ ProcessUtilityPre(PlannedStmt *pstmt, ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); - return all_done; } @@ -1369,16 +1383,21 @@ ProcessUtilityPost(PlannedStmt *pstmt, add_context = true; exec_type = EXEC_ON_ALL_NODES; break; - + case T_DropdbStmt: + case T_DropRoleStmt: case T_DropTableSpaceStmt: +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + break; +#endif + exec_type = EXEC_ON_ALL_NODES; + break; case T_AlterTableSpaceOptionsStmt: case T_GrantRoleStmt: case T_AlterDatabaseSetStmt: - case T_DropdbStmt: case T_CreateRoleStmt: case T_AlterRoleStmt: case T_AlterRoleSetStmt: - case T_DropRoleStmt: case T_ReassignOwnedStmt: case T_LockStmt: case T_AlterOwnerStmt: @@ -1658,6 +1677,10 @@ ProcessUtilityPost(PlannedStmt *pstmt, break; case T_CreateSeqStmt: +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + break; +#endif if (IS_PGXC_LOCAL_COORDINATOR) { CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; @@ -1781,22 +1804,117 @@ ProcessUtilityPost(PlannedStmt *pstmt, static void parallel_ddl_process(Node *node) { - if (!enable_parallel_ddl || 
!IS_PGXC_LOCAL_COORDINATOR) + /* + * set is_txn_has_parallel_ddl to be false in case of combination command + * that include some type support parallel ddl and some unsupport parallel + * ddl. eg: create extension which include T_CreateFunctionStmt and + * T_CreateOpClassStmt and so on. + */ + if (is_txn_has_parallel_ddl && nodeTag(node) != T_RemoteQuery) + { + is_txn_has_parallel_ddl = false; + } + + if (!enable_parallel_ddl) { return ; } switch (nodeTag(node)) { + case T_AlterTableStmt: + case T_AlterDatabaseStmt: + case T_AlterDatabaseSetStmt: + case T_AlterRoleSetStmt: + break; + case T_AlterOwnerStmt: + { + AlterOwnerStmt *stmt = (AlterOwnerStmt *) node; + switch (stmt->objectType) + { + case OBJECT_DATABASE: + case OBJECT_SCHEMA: + case OBJECT_TABLE: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; + case T_AlterObjectSchemaStmt: + { + AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) node; + switch (stmt->objectType) + { + case OBJECT_TABLE: + case OBJECT_FUNCTION: + case OBJECT_VIEW: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; + case T_AlterSeqStmt: case T_CreateStmt: case T_CreateForeignTableStmt: case T_CreateTableAsStmt: case T_CreateSchemaStmt: - case T_AlterTableStmt: - case T_DefineStmt: + case T_CreateTableSpaceStmt: + case T_CreatedbStmt: + case T_CreateRoleStmt: + case T_CompositeTypeStmt: + case T_CreateEnumStmt: + case T_CreateRangeStmt: + case T_CreateSeqStmt: + case T_CreateFunctionStmt: + case T_ViewStmt: + case T_DropTableSpaceStmt: + case T_DropdbStmt: + case T_DropRoleStmt: + break; case T_DropStmt: + { + DropStmt *stmt = (DropStmt *)node; + switch (stmt->removeType) + { + case OBJECT_INDEX: + case OBJECT_SEQUENCE: + case OBJECT_TABLE: + case OBJECT_VIEW: + case OBJECT_MATVIEW: + case OBJECT_FOREIGN_TABLE: + case OBJECT_SCHEMA: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; case T_RenameStmt: - case T_TruncateStmt: + { + RenameStmt *stmt = (RenameStmt *)node; + switch (stmt->renameType) + { + case OBJECT_DATABASE: + case OBJECT_SCHEMA: + case OBJECT_ROLE: + case OBJECT_TABLE: + case OBJECT_INDEX: + case OBJECT_VIEW: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; case T_IndexStmt: /* CONCURRENT INDEX is not supported */ if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) @@ -1804,6 +1922,9 @@ parallel_ddl_process(Node *node) return ; } break; + case T_TruncateStmt: + case T_ReindexStmt: + break; default: return ; } @@ -2067,16 +2188,72 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "CREATE TABLESPACE"); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
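+			 * (The leader CN is the coordinator with the smallest node name;
+			 * see find_ddl_leader_cn(). Executing the DDL there first gives
+			 * statements issued from different coordinators a single ordering
+			 * point.)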
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif CreateTableSpace((CreateTableSpaceStmt *) parsetree); break; case T_DropTableSpaceStmt: - /* no event triggers for global objects */ - /* Allow this to be run inside transaction block on remote nodes */ + { + DropTableSpaceStmt *stmt = (DropTableSpaceStmt *)parsetree; + /* + * no event triggers for global objects + * Allow this to be run inside transaction block on remote nodes + */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "DROP TABLESPACE"); - DropTableSpace((DropTableSpaceStmt *) parsetree); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + if (PreCheckforDropTableSpace(stmt)) + { + SendLeaderCNUtility(queryString, false); + DropTableSpace(stmt, false); + ExecUtilityStmtOnNodes(parsetree, queryString, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + else if (DropTableSpace(stmt, stmt->missing_ok)) + { + ExecUtilityStmtOnNodes(parsetree, queryString, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + /* From remote cn */ + else if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + DropTableSpace(stmt, false); + } + /* non parallel ddl mode */ + else + { + DropTableSpace(stmt, stmt->missing_ok); + } +#else + DropTableSpace(stmt, stmt->missing_ok); +#endif + } break; case T_AlterTableSpaceOptionsStmt: @@ -2085,6 +2262,31 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_TruncateStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + ListCell *cell; + foreach (cell, ((TruncateStmt *) parsetree)->relations) + { + Oid relid; + RangeVar* rel = (RangeVar*)lfirst(cell); + + relid = RangeVarGetRelid(rel, NoLock, false); + + if (IsTempTable(relid)) + { + is_temp = true; + break; + } + } + SendLeaderCNUtility(queryString, is_temp); + } +#endif ExecuteTruncate((TruncateStmt *) parsetree); break; @@ -2127,31 +2329,118 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "CREATE DATABASE"); + +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif + createdb(pstate, (CreatedbStmt *) parsetree); break; case T_AlterDatabaseStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + /* + * If this is not a SET TABLESPACE statement, just propogate + * the cmd as usual. 
+ */ + if (IsSetTableSpace((AlterDatabaseStmt*) parsetree)) + SendLeaderCNUtility(queryString, false); + else + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif /* no event triggers for global objects */ AlterDatabase(pstate, (AlterDatabaseStmt *) parsetree, isTopLevel); break; case T_AlterDatabaseSetStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ AlterDatabaseSet((AlterDatabaseSetStmt *) parsetree); break; case T_DropdbStmt: { + char prepareQuery[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; - if (!stmt->prepare) { + bool missing_ok = stmt->missing_ok; /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) { PreventTransactionChain(isTopLevel, "DROP DATABASE"); } - dropdb(stmt->dbname, stmt->missing_ok); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to drop firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + Oid db_oid = InvalidOid; + leaderCnHandle = find_ddl_leader_cn(); + + db_oid = get_database_oid(stmt->dbname, missing_ok); + + if (OidIsValid(db_oid)) + { + snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", + quote_identifier(stmt->dbname)); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + SendLeaderCNUtility(prepareQuery, false); + else + dropdb_prepare(stmt->dbname, false); + ExecUtilityStmtOnNodes(parsetree, prepareQuery, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + SendLeaderCNUtility(queryString, false); + } + else + break; + } + /* + * In parallel ddl mode, we only send cmd to remote when + * database exists, so database can not miss when the cmd + * come from remote cn. + */ + if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + missing_ok = false; + } + + if (dropdb(stmt->dbname, missing_ok) && LOCAL_PARALLEL_DDL) + { + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } } else { @@ -2301,6 +2590,16 @@ standard_ProcessUtility(PlannedStmt *pstmt, * ******************************** ROLE statements **** */ case T_CreateRoleStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ CreateRole(pstate, (CreateRoleStmt *) parsetree); break; @@ -2311,13 +2610,29 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_AlterRoleSetStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ AlterRoleSet((AlterRoleSetStmt *) parsetree); break; case T_DropRoleStmt: + { +#ifdef __TBASE__ + CheckAndDropRole(parsetree, sentToRemote, queryString); +#else /* no event triggers for global objects */ - DropRole((DropRoleStmt *) parsetree); + DropRole(stmt, stmt->missing_ok, NULL); +#endif + } break; case T_ReassignOwnedStmt: @@ -2368,9 +2683,17 @@ standard_ProcessUtility(PlannedStmt *pstmt, switch (stmt->kind) { case REINDEX_OBJECT_INDEX: +#ifdef __TBASE__ + CheckAndSendLeaderCNReindex(sentToRemote, stmt, + queryString); +#endif ReindexIndex(stmt->relation, stmt->options); break; case REINDEX_OBJECT_TABLE: +#ifdef __TBASE__ + CheckAndSendLeaderCNReindex(sentToRemote, stmt, + queryString); +#endif ReindexTable(stmt->relation, stmt->options); break; case REINDEX_OBJECT_SCHEMA: @@ -2437,11 +2760,34 @@ standard_ProcessUtility(PlannedStmt *pstmt, RenameStmt *stmt = (RenameStmt *) parsetree; if (EventTriggerSupportsObjectType(stmt->renameType)) + { ProcessUtilitySlow(pstate, pstmt, queryString, context, params, queryEnv, dest, sentToRemote, completionTag); + } +#ifdef __TBASE__ + else if (LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + bool is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename); + RemoteQueryExecType exec_type = GetRenameExecType(stmt, &is_temp); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_leader_cn) + { + SendLeaderCNUtility(queryString, is_temp); + } + ExecRenameStmt(stmt); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } +#endif else ExecRenameStmt(stmt); @@ -2489,8 +2835,20 @@ standard_ProcessUtility(PlannedStmt *pstmt, sentToRemote, completionTag); else + { +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif ExecAlterOwnerStmt(stmt); } + } break; case T_CommentStmt: @@ -2550,7 +2908,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* only if am the original session I will revoke other nodes to do the create sharding job */ if(IS_PGXC_COORDINATOR && !IsConnFromCoord()) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); execnodes = (ExecNodes *)makeNode(ExecNodes); for(i = 0; i < nodenum; i++) @@ -2566,8 +2926,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -2622,7 +2982,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send Move Data Command to All Coordinator, * BUT,it is necessary to add new node to all the Coordinators independently */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); /* generate new query string to datanode s*/ switch (stmt->strategy) @@ -2702,8 +3064,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send Move Data Command to Data Node */ ExecUtilityStmtOnNodes(parsetree, movecmd, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); pfree(qstring_tonode->data); pfree(qstring_tonode); @@ -2754,7 +3116,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, ExecNodes *execnodes; /* drop remote coord sharding map */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); /* drop datanodes sharding map */ GetGroupNodesByNameOrder(group, nodeIndex, &nodenum); @@ -2773,8 +3137,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); } ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); pfree(execnodes); } @@ -2810,7 +3174,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send cleansharding msg to all other cn and dn */ if (IS_PGXC_LOCAL_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } /* Then cleansharding self */ ForceRefreshShardMap(InvalidOid); @@ -2867,11 +3233,13 @@ standard_ProcessUtility(PlannedStmt *pstmt, elog(ERROR, "innel error: datanode %d cannot be found.", tooid); execnodes->nodeList = lappend_int(execnodes->nodeList,toidx); ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); //second clean sharding of all cooridnators - ExecUtilityStmtOnNodes(parsetree, queryString, 
NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); //and self ForceRefreshShardMap(InvalidOid); @@ -2895,8 +3263,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeindex); } ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); //finally clean sharding at from datanode @@ -2909,8 +3277,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList,fromidx); ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); pfree(execnodes); } @@ -3000,7 +3368,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, || (CREATE_KEY_VALUE_EXEC_CN == g_create_key_value_mode)) { /* first tell other coord node to create */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); } if ((CREATE_KEY_VALUE_EXEC_ALL == g_create_key_value_mode) @@ -3021,8 +3391,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -3045,8 +3415,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -3148,6 +3518,16 @@ ProcessUtilitySlow(ParseState *pstate, * relation and attribute manipulation */ case T_CreateSchemaStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif CreateSchemaCommand((CreateSchemaStmt *) parsetree, queryString, sentToRemote, pstmt->stmt_location, @@ -3172,6 +3552,13 @@ ProcessUtilitySlow(ParseState *pstate, PGXCSubCluster *subcluster = NULL; #endif +#ifdef __TBASE__ + Oid nspaceid; + bool exist_ok = true; + + if (is_txn_has_parallel_ddl && IsConnFromCoord()) + exist_ok = false; + /* Run parse analysis ... */ /* * If sentToRemote is set it is either EXECUTE DIRECT or part @@ -3181,14 +3568,18 @@ ProcessUtilitySlow(ParseState *pstate, * it should explicitly specify distribution. 
*/ stmts = transformCreateStmt((CreateStmt *) parsetree, - queryString, !is_local && !sentToRemote); + queryString, !is_local && !sentToRemote, + &nspaceid, exist_ok); -#ifdef __TBASE__ if (NULL == stmts) { commandCollected = true; break; } + +#else + stmts = transformCreateStmt((CreateStmt *) parsetree, + queryString, !is_local && !sentToRemote); #endif if (IS_PGXC_LOCAL_COORDINATOR) @@ -3278,6 +3669,29 @@ ProcessUtilitySlow(ParseState *pstate, } } } +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leader_cn = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leader_cn->nodename)) + { + /* + * Unlock namespace before send to Leader CN + * in case of concurrent drop schema and create + * schema.xxx dead lock. + */ + UnlockDatabaseObject(NamespaceRelationId, nspaceid, + 0, AccessShareLock); + SendLeaderCNUtility(queryString, is_temp); + LockDatabaseObject(NamespaceRelationId, nspaceid, + 0, AccessShareLock); + } + } +#endif #ifdef __COLD_HOT__ /* Add check overlap remote query on top of query tree */ if (subcluster && distributeby) @@ -3474,8 +3888,9 @@ ProcessUtilitySlow(ParseState *pstate, { if (auditString != NULL) { - ExecUtilityStmtOnNodes(parsetree, auditString, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, auditString, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } } @@ -3493,8 +3908,9 @@ ProcessUtilitySlow(ParseState *pstate, { if (cleanString != NULL) { - ExecUtilityStmtOnNodes(parsetree, cleanString, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, cleanString, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } } @@ -3520,6 +3936,36 @@ ProcessUtilitySlow(ParseState *pstate, * permissions. */ lockmode = AlterTableGetLockLevel(atstmt->cmds); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + relid = RangeVarGetRelid(atstmt->relation, + lockmode, true); + if (OidIsValid(relid)) + { + ExecUtilityFindNodes(atstmt->relkind, + relid, &is_temp); + UnlockRelationOid(relid, lockmode); + SendLeaderCNUtility(queryString, is_temp); + } + else + { + ereport(NOTICE, + (errmsg("relation \"%s\" does not exist, skipping", + atstmt->relation->relname))); + break; + } + } + } +#endif relid = AlterTableLookupRelation(atstmt, lockmode); if (OidIsValid(relid)) @@ -3543,7 +3989,6 @@ ProcessUtilitySlow(ParseState *pstate, exec_type = ExecUtilityFindNodes(atstmt->relkind, relid, &is_temp); - stmts = AddRemoteQueryNode(stmts, queryString, exec_type); } } @@ -3736,12 +4181,44 @@ ProcessUtilitySlow(ParseState *pstate, List *inheritors = NIL; #ifdef __TBASE__ Relation rel = NULL; + bool istemp = false; #endif if (stmt->concurrent) PreventTransactionChain(isTopLevel, "CREATE INDEX CONCURRENTLY"); +#ifdef __TBASE__ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + relid = RangeVarGetRelidExtended(stmt->relation, + AccessShareLock, true, + false, NULL, NULL); + if (OidIsValid(relid)) + { + RemoteQueryExecType exectype; + exectype = ExecUtilityFindNodes(OBJECT_INDEX, + relid, &istemp); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (exectype == EXEC_ON_ALL_NODES || + exectype == EXEC_ON_COORDS) + { + PGXCNodeHandle *leaderCnHandle; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + UnlockRelationOid(relid, AccessShareLock); + SendLeaderCNUtility(queryString, istemp); + } + } + } + } +#endif + /* * Look up the relation OID just once, right here at the * beginning, so that we don't end up repeating the name @@ -3758,7 +4235,6 @@ ProcessUtilitySlow(ParseState *pstate, false, false, RangeVarCallbackOwnsRelation, NULL); - #if 0 /* could not create index on interval child table directly */ if (OidIsValid(relid)) @@ -3984,7 +4460,11 @@ ProcessUtilitySlow(ParseState *pstate, queryString); /* Send prepare extension msg to all other cn and dn */ extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, + extension_query_string, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, + false, false); /* stage 2 */ ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); @@ -3994,7 +4474,11 @@ ProcessUtilitySlow(ParseState *pstate, queryString); /* Send execute extension msg to all other cn and dn */ extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, + extension_query_string, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, + false, false); pfree(qstring->data); pfree(qstring); @@ -4063,17 +4547,46 @@ ProcessUtilitySlow(ParseState *pstate, case T_CompositeTypeStmt: /* CREATE TYPE (composite) */ { CompositeTypeStmt *stmt = (CompositeTypeStmt *) parsetree; - +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineCompositeType(stmt->typevar, stmt->coldeflist); } break; case T_CreateEnumStmt: /* CREATE TYPE AS ENUM */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineEnum((CreateEnumStmt *) parsetree); break; case T_CreateRangeStmt: /* CREATE TYPE AS RANGE */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineRange((CreateRangeStmt *) parsetree); break; @@ -4083,6 +4596,37 @@ ProcessUtilitySlow(ParseState *pstate, case T_ViewStmt: /* CREATE VIEW */ EventTriggerAlterTableStart(parsetree); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + List *relation_list = NIL; + ListCell *lc; + bool tmp = IsViewTemp(((ViewStmt*)parsetree), + queryString, + pstmt->stmt_location, + pstmt->stmt_len, + &relation_list); + + /* Unlock before we send to leander cn */ + foreach(lc, relation_list) + { + Oid reloid = lfirst_oid(lc); + UnlockRelationOid(reloid, AccessShareLock); + } + if (!tmp) + SendLeaderCNUtility(queryString, tmp); + + } + } +#endif address = DefineView((ViewStmt *) parsetree, queryString, pstmt->stmt_location, pstmt->stmt_len); EventTriggerCollectSimpleCommand(address, secondaryObject, @@ -4093,6 +4637,16 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_CreateFunctionStmt: /* CREATE FUNCTION */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = CreateFunction(pstate, (CreateFunctionStmt *) parsetree); break; @@ -4104,27 +4658,91 @@ ProcessUtilitySlow(ParseState *pstate, address = DefineRule((RuleStmt *) parsetree, queryString); break; - case T_CreateSeqStmt: - address = DefineSequence(pstate, (CreateSeqStmt *) parsetree); + case T_CreateSeqStmt: +#ifdef __TBASE__ + { + bool need_send = false; + bool is_temp = false; + bool exist_ok = !is_txn_has_parallel_ddl; + CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; + if (!stmt->is_serial) + { + is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP; + } + + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + need_send = PrecheckDefineSequence(stmt); + leaderCnHandle = find_ddl_leader_cn(); + + if (!need_send) + break; + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + if (!is_temp && need_send) + SendLeaderCNUtility(queryString, is_temp); + } + } + + address = DefineSequence(pstate, stmt, exist_ok); + + if (is_temp) + { + PoolManagerSetCommand(NULL, 0, POOL_CMD_TEMP, NULL); + } + + if (need_send) + { + RemoteQueryExecType exec_type = + is_temp ? EXEC_ON_DATANODES : EXEC_ON_ALL_NODES; + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } + } +#else + address = DefineSequence(pstate, (CreateSeqStmt *) parsetree); +#endif + break; + + case T_AlterSeqStmt: #ifdef __TBASE__ + if (!sentToRemote && LOCAL_PARALLEL_DDL) { + AlterSeqStmt *stmt = (AlterSeqStmt *) parsetree; bool is_temp = false; - CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; - - if (!stmt->is_serial) + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) { - is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP; + Oid relid = RangeVarGetRelid(stmt->sequence, + NoLock, stmt->missing_ok); + RemoteQueryExecType exec_type = EXEC_ON_NONE; + if (!OidIsValid(relid)) + { + break; } - - if (is_temp) + exec_type = ExecUtilityFindNodes(OBJECT_SEQUENCE, + relid, &is_temp); + if (exec_type == EXEC_ON_ALL_NODES || + exec_type == EXEC_ON_COORDS) { - PoolManagerSetCommand(NULL, 0, POOL_CMD_TEMP, NULL); + SendLeaderCNUtility(queryString, is_temp); } } -#endif - break; - case T_AlterSeqStmt: + } +#endif address = AlterSequence(pstate, (AlterSeqStmt *) parsetree); break; @@ -4133,6 +4751,16 @@ ProcessUtilitySlow(ParseState *pstate, CreateTableAsStmt *stmt = (CreateTableAsStmt *) parsetree; if (IS_PGXC_DATANODE && stmt->relkind == OBJECT_MATVIEW) stmt->into->skipData = true; +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecCreateTableAs((CreateTableAsStmt *) parsetree, queryString, params, queryEnv, completionTag); @@ -4286,7 +4914,33 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_RenameStmt: - address = ExecRenameStmt((RenameStmt *) parsetree); + { + RenameStmt * stmt = (RenameStmt *) parsetree; +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + bool is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename); + RemoteQueryExecType exec_type = GetRenameExecType(stmt, &is_temp); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_leader_cn) + { + SendLeaderCNUtility(queryString, is_temp); + } + address = ExecRenameStmt(stmt); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } + else +#endif + address = ExecRenameStmt(stmt); + } break; case T_AlterObjectDependsStmt: @@ -4296,12 +4950,32 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_AlterObjectSchemaStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecAlterObjectSchemaStmt((AlterObjectSchemaStmt *) parsetree, &secondaryObject); break; case T_AlterOwnerStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecAlterOwnerStmt((AlterOwnerStmt *) parsetree); break; @@ -4444,6 +5118,122 @@ ProcessUtilitySlow(ParseState *pstate, EventTriggerEndCompleteQuery(); } +#ifdef __TBASE__ +/* + * SendLeaderCNUtility + * For parallel ddl, we execute ddl in leader cn firstly + * to avoid deadlock. + */ +void SendLeaderCNUtility(const char *queryString, + bool temp) +{ + PGXCNodeHandle *leaderCnHandle = NULL; + RemoteQuery *step = NULL; + + leaderCnHandle = find_ddl_leader_cn(); + if (is_ddl_leader_cn(leaderCnHandle->nodename)) + return; + + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->sql_statement = pstrdup(queryString); + step->exec_type = temp ? 
EXEC_ON_NONE : EXEC_ON_COORDS; + step->exec_nodes = NULL; + step->is_temp = temp; + ExecRemoteUtility(step, leaderCnHandle, ONLY_LEADER_DDL); + pfree(step); + + leader_cn_executed_ddl = true; +} + +void SendLeaderCNUtilityWithContext(const char *queryString, + bool temp) +{ + PG_TRY(); + { + SendLeaderCNUtility(queryString, temp); + } + PG_CATCH(); + { + + /* + * Some nodes failed. Add context about what all nodes the query + * failed + */ + ExecNodes* coord_success_nodes = NULL; + ExecNodes* data_success_nodes = NULL; + char* msg_failed_nodes = NULL; + + pgxc_all_success_nodes(&data_success_nodes, &coord_success_nodes, &msg_failed_nodes); + if (msg_failed_nodes != NULL) + errcontext("%s", msg_failed_nodes); + PG_RE_THROW(); + } + PG_END_TRY(); +} + +void CheckAndSendLeaderCNReindex(bool sentToRemote, ReindexStmt *stmt, + const char *queryString) +{ + RemoteQueryExecType exec_type = EXEC_ON_NONE; + PGXCNodeHandle *leaderCnHandle = NULL; + + if (sentToRemote || !LOCAL_PARALLEL_DDL) + return; + + /* + * If I am the main execute CN but not Leader CN, notify the Leader CN + * to reindex firstly. + */ + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + bool is_temp = false; + Oid relid = RangeVarGetRelid(stmt->relation, AccessShareLock, false); + if (OidIsValid(relid)) + { + exec_type = ExecUtilityFindNodes(stmt->kind, relid, &is_temp); + UnlockRelationOid(relid, AccessShareLock); + } + if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) + { + SendLeaderCNUtility(queryString, is_temp); + } + } +} + +#endif + +static RemoteQueryExecType GetRenameExecType(RenameStmt *stmt, bool *is_temp) +{ + RemoteQueryExecType exec_type = EXEC_ON_NONE; + /* + * Get the necessary details about the relation before we + * run ExecRenameStmt locally. Otherwise we may not be able + * to look-up using the old relation name. + */ + if (stmt->relation) + { + /* + * If the table does not exist, don't send the query to + * the remote nodes. The local node will eventually + * report an error, which is then sent back to the + * client. + */ + Oid relid = RangeVarGetRelid(stmt->relation, + NoLock, true); + if (OidIsValid(relid)) + exec_type = ExecUtilityFindNodes(stmt->renameType, + relid, is_temp); + else + exec_type = EXEC_ON_NONE; + } + else + exec_type = ExecUtilityFindNodes(stmt->renameType, + InvalidOid, is_temp); + return exec_type; +} + /* * Dispatch function for DropStmt */ @@ -4489,32 +5279,77 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) #ifdef PGXC { bool is_temp = false; + RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; #ifdef __TBASE__ - int drop_cnt = 0; char *new_query_string = pstrdup(queryString); + ObjectAddresses *new_objects = NULL; + PGXCNodeHandle *leaderCnHandle = NULL; + bool need_sendto_leadercn = false; #endif - RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif - #ifdef __TBASE__ - drop_cnt = RemoveRelations(stmt, new_query_string); + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + need_sendto_leadercn = true; + } + if (need_sendto_leadercn) + { + /* + * For DROP TABLE/INDEX/VIEW/... IF EXISTS query, only + * notice is emitted, if the referred objects are not + * found. 
In such cases, the atomicity and consistency of
+                 * the query or transaction among local CN and remote nodes
+                 * cannot be guaranteed against concurrent CREATE TABLE/
+                 * INDEX/VIEW/... query.
+                 *
+                 * To ensure such atomicity and consistency, we only refer
+                 * to local CN about the visibility of the objects to be
+                 * deleted and rewrite the query into new_query_string
+                 * without the invisible objects. Later, if the objects in
+                 * new_query_string are not found on remote nodes, which
+                 * should not happen, just ERROR.
+                 */
+                bool need_drop = false;
+                List *heap_list = NIL;
+                new_objects = PreCheckforRemoveRelation(stmt,
+                                                        new_query_string,
+                                                        &need_drop,
+                                                        &heap_list);
+                if (need_drop)
+                {
+                    /*
+                     * If I am the main execute CN but not the Leader CN,
+                     * notify the Leader CN to execute the DROP first.
+                     */
+                    SendLeaderCNUtility(new_query_string, is_temp);
+                    RemoveRelationsParallelMode(stmt, new_objects,
+                                                heap_list);
+                    free_object_addresses(new_objects);
+                }
+                else
+                {
+                    pfree(new_query_string);
+                    free_object_addresses(new_objects);
+                    break;
+                }
+            }
+            else if (RemoveRelations(stmt, new_query_string) == 0)
+            {
+                pfree(new_query_string);
+                break;
+            }
 #else
             RemoveRelations(stmt);
 #endif
 #ifdef PGXC
 #ifdef __TBASE__
-            /* if drop nothing, skip */
-            if (drop_cnt == 0)
-            {
-                pfree(new_query_string);
-                break;
-            }
-
             /* DROP is done depending on the object type and its temporary type */
             if (IS_PGXC_LOCAL_COORDINATOR)
                 ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, sentToRemote, false,
@@ -4529,17 +5364,101 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel)
             }
 #endif
             break;
+#ifdef __TBASE__
+        case OBJECT_SCHEMA:
+        case OBJECT_FUNCTION:
+        case OBJECT_TYPE:
+            {
+                bool is_temp = false;
+                bool need_drop = false;
+                RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES;
+                ObjectAddresses *new_objects = NULL;
+                PGXCNodeHandle *leaderCnHandle = NULL;
+                bool is_leader_cn = false;
+                char *new_query_string = pstrdup(queryString);
+
+                /* Check restrictions on objects dropped */
+                DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote,
+                                     &is_temp, &exec_type);
+
+                if (!sentToRemote && LOCAL_PARALLEL_DDL)
+                {
+                    leaderCnHandle = find_ddl_leader_cn();
+                    is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename);
+                    if (!is_leader_cn)
+                    {
+                        /*
+                         * To ensure such atomicity and consistency, we only refer
+                         * to local CN about the visibility of the objects to be
+                         * deleted and rewrite the query into new_query_string
+                         * without the invisible objects. Later, if the objects in
+                         * new_query_string are not found on remote nodes, which
+                         * should not happen, just ERROR.
+                         */
+                        new_objects = PreCheckforRemoveObjects(stmt,
+                                                               true,
+                                                               &need_drop,
+                                                               new_query_string,
+                                                               true);
+                        if (need_drop)
+                        {
+                            /*
+                             * If I am the main execute CN but not the Leader CN,
+                             * notify the Leader CN to execute the DROP first.
+ */ + SendLeaderCNUtility(new_query_string, is_temp); + RemoveObjectsParallelMode(stmt, new_objects); + free_object_addresses(new_objects); + } + else + { + free_object_addresses(new_objects); + pfree(new_query_string); + break; + } + } + else + { + RemoveObjects(stmt, true, &need_drop, + new_query_string); + if (!need_drop) + { + pfree(new_query_string); + break; + } + } + } + else if (is_txn_has_parallel_ddl) + { + /* parallel ddl mode, from remote cn, can't miss object */ + RemoveObjects(stmt, false, &need_drop, NULL); + } + else + { + /* non parallel ddl mode */ + RemoveObjects(stmt, true, &need_drop, NULL); + } + + if (IS_PGXC_LOCAL_COORDINATOR) + ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, + sentToRemote, false, exec_type, + is_temp, false); + pfree(new_query_string); + } + break; +#endif default: #ifdef PGXC { bool is_temp = false; + bool need_drop = false; RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif - RemoveObjects(stmt); + RemoveObjects(stmt, true, &need_drop, NULL); #ifdef PGXC if (IS_PGXC_LOCAL_COORDINATOR) ExecUtilityStmtOnNodes(NULL, queryString, NULL, sentToRemote, false, @@ -4550,6 +5469,70 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) } } +#ifdef __TBASE__ +void +CheckAndDropRole(Node *parsetree, bool sentToRemote, const char *queryString) +{ + DropRoleStmt *stmt = (DropRoleStmt *) parsetree; + char *new_query_string = pstrdup(queryString); + bool need_drop = true; + + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + List *role_list = NIL; + need_drop = PreCheckDropRole(stmt, new_query_string, &role_list); + if (!need_drop) + { + pfree(new_query_string); + return; + } + SendLeaderCNUtility(new_query_string, false); + DropRoleParallelMode(role_list); + ExecUtilityStmtOnNodes(parsetree, new_query_string, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + else + { + if (!DropRole(stmt, stmt->missing_ok, new_query_string)) + { + pfree(new_query_string); + return; + } + ExecUtilityStmtOnNodes(parsetree, new_query_string, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + /* From remote cn */ + else if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + /* + * In parallel ddl mode, we only send cmd to remote when + * database exists, so database can not miss when the cmd + * come from remote cn. 
+ */ + DropRole(stmt, false, NULL); + } + /* Non parallel ddl mode */ + else + { + DropRole(stmt, stmt->missing_ok, NULL); + } + pfree(new_query_string); +} +#endif /* * UtilityReturnsTuples @@ -6421,8 +7404,11 @@ GetCommandLogLevel(Node *parsetree) #ifdef PGXC static void -ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, ExecNodes *nodes, bool sentToRemote, - bool force_autocommit, RemoteQueryExecType exec_type, bool is_temp) +ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, + ExecNodes *nodes, bool sentToRemote, + bool force_autocommit, + RemoteQueryExecType exec_type, + bool is_temp) { /* Return if query is launched on no nodes */ if (exec_type == EXEC_ON_NONE) @@ -6449,7 +7435,18 @@ ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, ExecNod step->force_autocommit = force_autocommit; step->exec_type = exec_type; step->parsetree = parsetree; +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL && + (exec_type == EXEC_ON_COORDS || exec_type == EXEC_ON_ALL_NODES)) + { + PGXCNodeHandle* leaderCnHandle = find_ddl_leader_cn(); + ExecRemoteUtility(step, leaderCnHandle, EXCLUED_LEADER_DDL); + } + else + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step->sql_statement); pfree(step); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ec9db352..1f205d34 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -9356,7 +9356,11 @@ set_config_option(const char *name, const char *value, step->force_autocommit = true; step->exec_type = EXEC_ON_CURRENT; step->is_set = true; +#ifdef __TBASE__ + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step); pfree(poolcmd.data); } diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index c2c0c9b1..589bbaab 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -281,6 +281,18 @@ extern void performDeletion(const ObjectAddress *object, extern void performMultipleDeletions(const ObjectAddresses *objects, DropBehavior behavior, int flags); +#ifdef __TBASE__ +extern void RemoveRelationsParallelMode(DropStmt *drop, + ObjectAddresses* objects, + List *heap_list); +extern void RemoveObjectsParallelMode(DropStmt *stmt, ObjectAddresses *objects); +extern void OmitqueryStringSpace(char *queryString); +extern void RemoveObjnameInQueryString(char *queryString, char *full_name); +extern ObjectAddresses* PreCheckforRemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string, + bool need_unlock); +#endif + #ifdef PGXC extern void performRename(const ObjectAddress *object, const char *oldname, diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index 0d80f74c..ec2cd56f 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -108,6 +108,10 @@ extern const ObjectAddress InvalidObjectAddress; #define ObjectAddressSet(addr, class_id, object_id) \ ObjectAddressSubSet(addr, class_id, object_id, 0) +#ifdef __TBASE__ +extern char *GetRemoveObjectName(ObjectType objtype, Node *object); +#endif + extern ObjectAddress get_object_address(ObjectType objtype, Node *object, Relation *relp, LOCKMODE lockmode, bool missing_ok); diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index cb5844ff..539d5ac2 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -20,7 +20,7 
@@ #include "nodes/parsenodes.h" extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); -extern void dropdb(const char *dbname, bool missing_ok); +extern bool dropdb(const char *dbname, bool missing_ok); extern void dropdb_prepare(const char *dbname, bool missing_ok); extern ObjectAddress RenameDatabase(const char *oldname, const char *newname); extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 1d3959b2..7b276490 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -19,8 +19,8 @@ #include "utils/array.h" /* commands/dropcmds.c */ -extern void RemoveObjects(DropStmt *stmt); - +extern void RemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string); /* commands/indexcmds.c */ extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 729d73c4..a9f5ddca 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -121,7 +121,13 @@ extern int64 nextval_internal(Oid relid, bool check_permissions); extern Datum nextval(PG_FUNCTION_ARGS); extern List *sequence_options(Oid relid); +#ifdef __TBASE__ +extern ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *seq, + bool exists_ok); +extern bool PrecheckDefineSequence(CreateSeqStmt *seq); +#else extern ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *stmt); +#endif extern ObjectAddress AlterSequence(ParseState *pstate, AlterSeqStmt *stmt); extern void DeleteSequenceTuple(Oid relid); extern void ResetSequence(Oid seq_relid); diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index ea788476..cdc9eb05 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -26,11 +26,17 @@ extern ObjectAddress DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, ObjectAddress *typaddress, const char *queryString); #ifdef __TBASE__ +extern ObjectAddresses* PreCheckforRemoveRelation(DropStmt* drop, + char* queryString, + bool *needDrop, + List **heap_list); extern int RemoveRelations(DropStmt *drop, char* queryString); #else extern void RemoveRelations(DropStmt *drop); #endif +extern char GetRemoveObjectRelkind(ObjectType removeType); + extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); extern void AlterTable(Oid relid, LOCKMODE lockmode, AlterTableStmt *stmt); diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 32805ab4..f4ad6b41 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -43,7 +43,7 @@ typedef struct TableSpaceOpts } TableSpaceOpts; extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); -extern void DropTableSpace(DropTableSpaceStmt *stmt); +extern bool DropTableSpace(DropTableSpaceStmt *stmt, bool missing_ok); extern ObjectAddress RenameTableSpace(const char *oldname, const char *newname); extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); @@ -63,4 +63,8 @@ extern void tblspc_redo(XLogReaderState *rptr); extern void tblspc_desc(StringInfo buf, XLogReaderState *rptr); extern const char *tblspc_identify(uint8 info); +#ifdef __TBASE__ +extern bool PreCheckforDropTableSpace(DropTableSpaceStmt *stmt); +#endif + #endif /* TABLESPACE_H */ diff --git a/src/include/commands/user.h b/src/include/commands/user.h index 69e9aa46..e172d500 100644 --- 
a/src/include/commands/user.h +++ b/src/include/commands/user.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * user.h - * Commands for manipulating roles (formerly called users). + * Commands for manipulating roles (formerly called users). * * * src/include/commands/user.h @@ -17,21 +17,30 @@ #include "parser/parse_node.h" /* GUC. Is actually of type PasswordType. */ -extern int Password_encryption; +extern int Password_encryption; /* Hook to check passwords in CreateRole() and AlterRole() */ typedef void (*check_password_hook_type) (const char *username, const char *shadow_pass, PasswordType password_type, Datum validuntil_time, bool validuntil_null); extern PGDLLIMPORT check_password_hook_type check_password_hook; -extern Oid CreateRole(ParseState *pstate, CreateRoleStmt *stmt); -extern Oid AlterRole(AlterRoleStmt *stmt); -extern Oid AlterRoleSet(AlterRoleSetStmt *stmt); -extern void DropRole(DropRoleStmt *stmt); +extern Oid CreateRole(ParseState *pstate, CreateRoleStmt *stmt); +extern Oid AlterRole(AlterRoleStmt *stmt); +extern Oid AlterRoleSet(AlterRoleSetStmt *stmt); +extern void DropRoleByTuple(char *role, HeapTuple tuple, + Relation pg_authid_rel, + Relation pg_auth_members_rel); +extern bool DropRole(DropRoleStmt *stmt, bool missing_ok, char *query_string); extern void GrantRole(GrantRoleStmt *stmt); extern ObjectAddress RenameRole(const char *oldname, const char *newname); extern void DropOwnedObjects(DropOwnedStmt *stmt); extern void ReassignOwnedObjects(ReassignOwnedStmt *stmt); extern List *roleSpecsToIds(List *memberNames); -#endif /* USER_H */ +#ifdef __TBASE__ +extern bool PreCheckDropRole(DropRoleStmt *stmt, char *query_string, + List **exist_roles); +extern void DropRoleParallelMode(List *role_list); +#endif + +#endif /* USER_H */ diff --git a/src/include/commands/view.h b/src/include/commands/view.h index 270996b8..facf592c 100644 --- a/src/include/commands/view.h +++ b/src/include/commands/view.h @@ -20,8 +20,14 @@ extern void validateWithCheckOption(char *value); extern ObjectAddress DefineView(ViewStmt *stmt, const char *queryString, - int stmt_location, int stmt_len); + int stmt_location, int stmt_len); extern void StoreViewQuery(Oid viewOid, Query *viewParse, bool replace); -#endif /* VIEW_H */ +extern Query *MakeViewParse(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len); +#ifdef __TBASE__ +extern bool IsViewTemp(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len, List **relation_list); +#endif +#endif /* VIEW_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 57111155..5554ee7b 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -66,6 +66,16 @@ typedef enum SortByNulls SORTBY_NULLS_LAST } SortByNulls; +#ifdef __TBASE__ +typedef enum ParallelDDLRemoteType +{ + NON_PARALLEL_DDL, /* non parallel ddl mode, exec_type decides */ + /* execution nodes */ + ONLY_LEADER_DDL, /* only leader cn will execute ddl */ + EXCLUED_LEADER_DDL /* remove leader cn from execution nodes */ +} ParallelDDLRemoteType; +#endif + /* * Grantable rights are encoded so that we can OR them together in a bitmask. 
* The present representation of AclItem limits us to 16 distinct rights, diff --git a/src/include/parser/parse_relation.h b/src/include/parser/parse_relation.h index 896a543a..e22afd54 100644 --- a/src/include/parser/parse_relation.h +++ b/src/include/parser/parse_relation.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * parse_relation.h - * prototypes for parse_relation.c. + * prototypes for parse_relation.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -26,111 +26,114 @@ */ typedef struct { - int distance; /* Weighted distance (lowest so far) */ - RangeTblEntry *rfirst; /* RTE of first */ - AttrNumber first; /* Closest attribute so far */ - RangeTblEntry *rsecond; /* RTE of second */ - AttrNumber second; /* Second closest attribute so far */ + int distance; /* Weighted distance (lowest so far) */ + RangeTblEntry *rfirst; /* RTE of first */ + AttrNumber first; /* Closest attribute so far */ + RangeTblEntry *rsecond; /* RTE of second */ + AttrNumber second; /* Second closest attribute so far */ } FuzzyAttrMatchState; extern RangeTblEntry *refnameRangeTblEntry(ParseState *pstate, - const char *schemaname, - const char *refname, - int location, - int *sublevels_up); + const char *schemaname, + const char *refname, + int location, + int *sublevels_up); extern CommonTableExpr *scanNameSpaceForCTE(ParseState *pstate, - const char *refname, - Index *ctelevelsup); + const char *refname, + Index *ctelevelsup); extern bool scanNameSpaceForENR(ParseState *pstate, const char *refname); extern void checkNameSpaceConflicts(ParseState *pstate, List *namespace1, - List *namespace2); + List *namespace2); extern int RTERangeTablePosn(ParseState *pstate, - RangeTblEntry *rte, - int *sublevels_up); + RangeTblEntry *rte, + int *sublevels_up); extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate, - int varno, - int sublevels_up); + int varno, + int sublevels_up); extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, - int rtelevelsup); + int rtelevelsup); extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, - char *colname, int location, - int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); + char *colname, int location, + int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly, - int location); + int location); extern void markVarForSelectPriv(ParseState *pstate, Var *var, - RangeTblEntry *rte); + RangeTblEntry *rte); extern Relation parserOpenTable(ParseState *pstate, const RangeVar *relation, - int lockmode); + int lockmode); extern RangeTblEntry *addRangeTableEntry(ParseState *pstate, - RangeVar *relation, - Alias *alias, - bool inh, - bool inFromCl); + RangeVar *relation, + Alias *alias, + bool inh, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForRelation(ParseState *pstate, - Relation rel, - Alias *alias, - bool inh, - bool inFromCl); + Relation rel, + Alias *alias, + bool inh, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForSubquery(ParseState *pstate, - Query *subquery, - Alias *alias, - bool lateral, - bool inFromCl); + Query *subquery, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForFunction(ParseState *pstate, - List *funcnames, - List *funcexprs, - List *coldeflists, - RangeFunction *rangefunc, - bool lateral, - bool inFromCl); + List *funcnames, + List *funcexprs, + List *coldeflists, + RangeFunction *rangefunc, + 
bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForValues(ParseState *pstate, - List *exprs, - List *coltypes, - List *coltypmods, - List *colcollations, - Alias *alias, - bool lateral, - bool inFromCl); + List *exprs, + List *coltypes, + List *coltypmods, + List *colcollations, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForTableFunc(ParseState *pstate, - TableFunc *tf, - Alias *alias, - bool lateral, - bool inFromCl); + TableFunc *tf, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForJoin(ParseState *pstate, - List *colnames, - JoinType jointype, - List *aliasvars, - Alias *alias, - bool inFromCl); + List *colnames, + JoinType jointype, + List *aliasvars, + Alias *alias, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForCTE(ParseState *pstate, - CommonTableExpr *cte, - Index levelsup, - RangeVar *rv, - bool inFromCl); + CommonTableExpr *cte, + Index levelsup, + RangeVar *rv, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForENR(ParseState *pstate, - RangeVar *rv, - bool inFromCl); + RangeVar *rv, + bool inFromCl); extern bool isLockedRefname(ParseState *pstate, const char *refname); extern void addRTEtoQuery(ParseState *pstate, RangeTblEntry *rte, - bool addToJoinList, - bool addToRelNameSpace, bool addToVarNameSpace); + bool addToJoinList, + bool addToRelNameSpace, bool addToVarNameSpace); extern void errorMissingRTE(ParseState *pstate, RangeVar *relation) pg_attribute_noreturn(); extern void errorMissingColumn(ParseState *pstate, - char *relname, char *colname, int location) pg_attribute_noreturn(); + char *relname, char *colname, int location) pg_attribute_noreturn(); extern void expandRTE(RangeTblEntry *rte, int rtindex, int sublevels_up, - int location, bool include_dropped, - List **colnames, List **colvars); + int location, bool include_dropped, + List **colnames, List **colvars); extern List *expandRelAttrs(ParseState *pstate, RangeTblEntry *rte, - int rtindex, int sublevels_up, int location); -extern int attnameAttNum(Relation rd, const char *attname, bool sysColOK); + int rtindex, int sublevels_up, int location); +extern int attnameAttNum(Relation rd, const char *attname, bool sysColOK); extern Name attnumAttName(Relation rd, int attid); -extern Oid attnumTypeId(Relation rd, int attid); -extern Oid attnumCollationId(Relation rd, int attid); +extern Oid attnumTypeId(Relation rd, int attid); +extern Oid attnumCollationId(Relation rd, int attid); extern bool isQueryUsingTempRelation(Query *query); +#ifdef __TBASE__ +extern bool CheckAndGetRelation(Query *query, List **relation_list); +#endif #ifdef PGXC -extern int specialAttNum(const char *attname); +extern int specialAttNum(const char *attname); #endif -#endif /* PARSE_RELATION_H */ +#endif /* PARSE_RELATION_H */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index 6cb25dbc..b6a0be60 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -82,13 +82,20 @@ extern bool loose_unique_index; #endif -#ifdef XCP -extern bool loose_constraints; +#ifdef __TBASE__ +extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute, Oid *nspaceid, bool existsok); +#elif XCP extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString, bool autodistribute); #else extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString); #endif + +#ifdef XCP +extern bool loose_constraints; 
+#endif + extern List *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, const char *queryString); extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt, diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 98d51719..236979f8 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -373,7 +373,13 @@ extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node); extern TupleTableSlot* ExecRemoteSubplan(PlanState *pstate); extern void ExecEndRemoteSubplan(RemoteSubplanState *node); extern void ExecReScanRemoteSubplan(RemoteSubplanState *node); +#ifdef __TBASE__ +extern void ExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ParallelDDLRemoteType type); +#else extern void ExecRemoteUtility(RemoteQuery *node); +#endif extern bool is_data_node_ready(PGXCNodeHandle * conn); @@ -439,8 +445,30 @@ extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); extern bool SetSnapshot(EState *state); + +extern void ExecRemoteUtility_ParallelDDLMode(RemoteQuery *node, + PGXCNodeHandle *leader_cn_handle); +extern void LeaderCnExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ResponseCombiner *combiner, + bool need_tran_block, + GlobalTransactionId gxid, + Snapshot snapshot, + CommandId cid); #endif +extern void GetGlobInfoForRemoteUtility(RemoteQuery *node, + GlobalTransactionId *gxid, + Snapshot *snapshot); +extern void SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, + CommandId cid, Snapshot snapshot); +extern bool CheckRemoteRespond(PGXCNodeHandle *conn, + ResponseCombiner *combiner, + int *index, int *conn_count); +extern void RemoteReceiveAndCheck(int conn_count, + PGXCNodeHandle **conns, + ResponseCombiner *combiner); + #ifdef __SUBSCRIPTION__ extern void pgxc_node_report_error(ResponseCombiner *combiner); extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index e5f9c6e1..22075b68 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -292,12 +292,15 @@ void pgxc_set_coordinator_proc_pid(int proc_pid); int pgxc_get_coordinator_proc_pid(void); void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); TransactionId pgxc_get_coordinator_proc_vxid(void); -inline char* find_ddl_leader_cn(void); +PGXCNodeHandle* find_ddl_leader_cn(void); inline bool is_ddl_leader_cn(char *leader_cn); +void CheckInvalidateRemoteHandles(void); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); extern bool is_pgxc_handles_init(void); +void delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, + PGXCNodeHandle* leader_cn_handle); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 92605dff..aca694be 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * utility.h - * prototypes for utility.c. + * prototypes for utility.c. 
* * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -20,33 +20,33 @@ #endif typedef enum { - PROCESS_UTILITY_TOPLEVEL, /* toplevel interactive command */ - PROCESS_UTILITY_QUERY, /* a complete query, but not toplevel */ - PROCESS_UTILITY_SUBCOMMAND /* a portion of a query */ + PROCESS_UTILITY_TOPLEVEL, /* toplevel interactive command */ + PROCESS_UTILITY_QUERY, /* a complete query, but not toplevel */ + PROCESS_UTILITY_SUBCOMMAND /* a portion of a query */ } ProcessUtilityContext; /* Hook for plugins to get control in ProcessUtility() */ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt, - const char *queryString, ProcessUtilityContext context, - ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + const char *queryString, ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook; extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString, - ProcessUtilityContext context, ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern void standard_ProcessUtility(PlannedStmt *pstmt, const char *queryString, - ProcessUtilityContext context, ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern bool UtilityReturnsTuples(Node *parsetree); @@ -71,5 +71,17 @@ extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; extern bool is_txn_has_parallel_ddl; /* Parallel DDL switch */ extern bool enable_parallel_ddl; + +#define LOCAL_PARALLEL_DDL \ + (IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) +extern void CheckAndDropRole(Node *parsetree, bool sentToRemote, + const char *queryString); +extern void CheckAndSendLeaderCNReindex(bool sentToRemote, ReindexStmt *stmt, + const char *queryString); + +/* Has leader CN executed ddl */ +extern bool leader_cn_executed_ddl; +extern void SendLeaderCNUtility(const char *queryString, bool temp); +extern void SendLeaderCNUtilityWithContext(const char *queryString, bool temp); #endif -#endif /* UTILITY_H */ +#endif /* UTILITY_H */ From 70b06646068e6bcda93623d1fb8d9ef93e3d939b Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 3 Jun 2021 11:08:05 +0800 Subject: [PATCH 383/578] Skip generating remote path for single node shard distribution Skip shutdown remote subplan node if execute locally tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088416973 --- src/backend/optimizer/plan/planner.c | 6 +- src/backend/pgxc/pool/execRemote.c | 3 +- src/test/regress/expected/tbase_explain.out | 87 ++++++++++++++++++++- src/test/regress/sql/tbase_explain.sql | 13 ++- 4 files changed, 104 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index de1c8ab4..df9a5333 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8322,7 +8322,8 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) * already have Result path, and if the distribution is one 
of * * a) 'hash' restricted to a single node - * b) 'replicate' without volatile functions in the target list + * b) 'shard' restricted to a single node + * c) 'replicate' without volatile functions in the target list * * In those cases we don't need the RemoteSubplan. * @@ -8330,7 +8331,8 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) * See planner.c:2730 in 9.5. */ if (!(IsA(path, ResultPath) && /* FIXME missing (result_plan->lefttree == NULL) condition */ - ((root->distribution->distributionType == 'H' && bms_num_members(root->distribution->restrictNodes) == 1) || + (((root->distribution->distributionType == 'H' || root->distribution->distributionType == 'S') && + bms_num_members(root->distribution->restrictNodes) == 1) || (root->distribution->distributionType == 'R' && !contain_mutable_functions((Node *)parse->targetList))))) path = create_remotesubplan_path(root, path, root->distribution); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1bb82166..74bdc28e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11161,7 +11161,8 @@ ExecShutdownRemoteSubplan(RemoteSubplanState *node) Plan *plan = ps->plan; EState *estate = ps->state; - if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + /* do nothing if explain only or execute locally */ + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0 || node->local_exec) return; elog(DEBUG1, "shutdown remote subplan worker %d, plan_node_id %d", ParallelWorkerNumber, plan->plan_node_id); diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index 691d1bb5..d91ef65e 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -1,10 +1,95 @@ --explain analyze create table a1(id int, num int, name text); create table a2(id int, num int, name text); +--fqs case +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(1,generate_series(1,100),'a'); -insert into a1 values(2,generate_series(1,100),'b'); + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Fast Query Execution (actual rows=0 loops=1) + Output: 1, generate_series(1, 100), 'a'::text + Node expr: 1 + Remote query: INSERT INTO a1 (id, num, name) VALUES (1, generate_series(1, 100), 'a'::text) +(4 rows) + +set enable_fast_query_shipping to off; +--insert into single value +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,1,'b'); + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + Output: 2, 1, 'b'::text +(8 rows) + +--insert with set returning function +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,generate_series(2,100),'b'); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + -> Remote Subquery Scan on local node + DN (actual rows=99..99 loops=1..1) + - datanode_1 (actual rows=99 loops=1) + Output: 2, generate_series(2, 100), 'b'::text + Distribute results by H: 2 
+ -> ProjectSet + DN (actual rows=99..99 loops=1..1) + - datanode_1 (actual rows=99 loops=1) + Output: 2, generate_series(2, 100), 'b'::text + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) +(16 rows) + +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(3,generate_series(1,100),'c'); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_2 (actual rows=0 loops=1) + -> Remote Subquery Scan on local node + DN (actual rows=100..100 loops=1..1) + - datanode_2 (actual rows=100 loops=1) + Output: 3, generate_series(1, 100), 'c'::text + Distribute results by H: 3 + -> ProjectSet + DN (actual rows=100..100 loops=1..1) + - datanode_2 (actual rows=100 loops=1) + Output: 3, generate_series(1, 100), 'c'::text + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_2 (actual rows=1 loops=1) +(16 rows) + +explain (costs off,timing off,summary off,analyze,verbose) insert into a2 select * from a1; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=0 loops=1) + -> Insert on public.a2 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=0 loops=1) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name +(10 rows) + +reset enable_fast_query_shipping; --normal cases explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1; diff --git a/src/test/regress/sql/tbase_explain.sql b/src/test/regress/sql/tbase_explain.sql index 7e212bc7..d15c7c2c 100644 --- a/src/test/regress/sql/tbase_explain.sql +++ b/src/test/regress/sql/tbase_explain.sql @@ -1,10 +1,21 @@ --explain analyze create table a1(id int, num int, name text); create table a2(id int, num int, name text); +--fqs case +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(1,generate_series(1,100),'a'); -insert into a1 values(2,generate_series(1,100),'b'); +set enable_fast_query_shipping to off; +--insert into single value +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,1,'b'); +--insert with set returning function +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,generate_series(2,100),'b'); +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(3,generate_series(1,100),'c'); +explain (costs off,timing off,summary off,analyze,verbose) insert into a2 select * from a1; +reset enable_fast_query_shipping; --normal cases explain (costs off,timing off,summary off,analyze,verbose) From 54d384f5be8661b7c71cae5f54dd7cbbab92b8f4 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 4 Jun 2021 10:07:37 +0800 Subject: [PATCH 384/578] fix rename bug http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088349973&jump_count=1 --- src/gtm/main/gtm_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 27e0a3e0..c9bb59c8 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -1027,7 +1027,7 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, { newseqinfo = seq_find_seqinfo(newseqkey); #ifdef __TBASE__ - if (NULL 
== seqinfo) + if (NULL == newseqinfo) { GTM_FormSeqOfStore(newseqkey); newseqinfo = seq_find_seqinfo(newseqkey); From 50c66c9fb5ddcd3f7c7e234f6c4c22797f6e1e4c Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 4 Jun 2021 16:10:53 +0800 Subject: [PATCH 385/578] fix warning --- src/backend/executor/execMain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 3bc95f7d..d30ee629 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -56,6 +56,7 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "optimizer/clauses.h" +#include "optimizer/pgxcship.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" #include "storage/bufmgr.h" From 3e805b5786b598c21882e97c8ebb14bc42d179e6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 4 Jun 2021 21:12:46 +0800 Subject: [PATCH 386/578] Bugfix: run tpcc core after 2pc files opt, ID88129643 (merge request !361) --- src/backend/access/transam/twophase.c | 1194 ++++++++++++------------ src/backend/storage/lmgr/lwlock.c | 7 + src/backend/utils/misc/guc.c | 19 +- src/include/access/twophase.h | 2 - src/include/storage/lwlock.h | 9 +- src/test/regress/expected/sysviews.out | 5 +- 6 files changed, 595 insertions(+), 641 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 14935855..61bc6b50 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -138,8 +138,6 @@ int transaction_threshold = 200000; #define FILE_CONTENT_SIZE 2048 -#define GET_START_NODE "startnode:" - /* GUC variable, can't be changed after startup */ #ifdef PGXC int max_prepared_xacts = 10000; /* We require 2PC */ @@ -154,14 +152,12 @@ bool enable_2pc_recovery_info = true; static HTAB *record_2pc_cache = NULL; bool enable_2pc_file_cache = true; -bool enable_2pc_file_check = true; +bool enable_2pc_file_check = false; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; -bool enable_2pc_hash_table_check = true; int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; -int record_2pc_partitions = 32; #define MAX_OUTPUT_FILE 1000 @@ -169,8 +165,21 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ -#define HASH_TAB_RETRY_MAX 10 -#define HASH_TAB_RETRY_SLEEP 2000 /* sleep time: 2ms */ +uint32 Record2pcCacheHashCode(const char *tid); + +/* + * The 2pc info cache is partitioned to reduce contention. + * To determine which partition lock a given tid requires, compute the tid's + * hash code with Record2pcCacheHashCode(), then apply Cache2pcPartitionLock(). + * NB: NUM_CACHE_2PC_PARTITIONS must be a power of 2! 
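+ * (A power of two keeps the Cache2pcHashPartition() modulo cheap and lets the
+ * low-order hash bits spread entries evenly across the partition locks.)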
+ */ +#define Cache2pcHashPartition(hashcode) \ + ((hashcode) % NUM_CACHE_2PC_PARTITIONS) +#define Cache2pcPartitionLock(hashcode) \ + (&MainLWLockArray[CACHE_2PC_LWLOCK_OFFSET + \ + Cache2pcHashPartition(hashcode)].lock) +#define Cache2pcPartitionLockByIndex(i) \ + (&MainLWLockArray[CACHE_2PC_LWLOCK_OFFSET + (i)].lock) /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -180,14 +189,20 @@ typedef struct Cache2pcInfo } Cache2pcInfo; -inline void -check_entry_key(const char *tid, const char *key, const char *func); +inline void check_entry_key(const char *tid, const char *key); -void -print_record_2pc_cache(const char *func); +bool add_2pc_info(const char *tid, const char *info); + +bool append_2pc_info(const char *tid, const char *info, bool *overflow); + +bool remove_2pc_info(const char *tid); + +bool get_2pc_info(const char *tid, char *info); + +bool save_and_remove_2pc_info(const char *tid); + +void check_2pc_file(const char *tid, const char *info, const char *func); -void -check_2pc_file(const char *tid, const char *info, const char *func); #endif static GlobalTransaction @@ -2310,19 +2325,8 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) {// #lizard forgives int i; int serialized_xacts = 0; - char *func = "CheckPointTwoPhase"; - -#ifdef __TWO_PHASE_TRANS__ - File fd = -1; - int ret = 0; - int size = 0; - Cache2pcInfo *entry = NULL; - bool found = false; - char path[MAXPGPATH]; -#endif - - elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, func, redo_horizon); + elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, __FUNCTION__, redo_horizon); if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2355,97 +2359,41 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; if ((gxact->valid || gxact->inredo) && - !gxact->ondisk && gxact->prepare_end_lsn <= redo_horizon) { char *buf; int len; + if (!gxact->ondisk) + { /* save to pg_twophase */ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); RecreateTwoPhaseFile(gxact->xid, buf, len); pfree(buf); + gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; + serialized_xacts++; + } + #ifdef __TWO_PHASE_TRANS__ /* save to pg_2pc */ if (NULL != record_2pc_cache) { Assert(strlen(gxact->gid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gxact->gid, HASH_FIND, &found); - if (found) - { - /* save to file */ - Assert(NULL != entry); - check_entry_key(gxact->gid, entry->key, func); - check_2pc_file(gxact->gid, entry->info, func); - - elog(LOG, "[%s] %s is found in hash table", func, gxact->gid); - - size = strlen(entry->info); - - memset(path, 0, MAXPGPATH); - GET_2PC_FILE_PATH(path, gxact->gid); - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) + if (!save_and_remove_2pc_info(gxact->gid)) { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = write(fd, entry->info, size); - if(ret != size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - - if (size != strlen(entry->info)) - { - elog(LOG, "[%s] %s size change from %d to %zu, info: %s", - func, gxact->gid, size, strlen(entry->info), entry->info); - - Assert(size < strlen(entry->info)); - ret = write(fd, entry->info + size, strlen(entry->info) - size); - if(ret != strlen(entry->info) - size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - 
"ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - } - close(fd); - fsync_fname(path, false); - - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gxact->gid, HASH_REMOVE, &found); - if (!found) - { - elog(WARNING, "[%s] %s is not found in hash table " - "when remove it", func, gxact->gid); - } - else - { - elog(LOG, "[%s] %s is removed from hash table", - func, gxact->gid); - } + elog(LOG, "[%s] %s save to file failed", + __FUNCTION__, gxact->gid); } else { - elog(LOG, "[%s] %s is not found in hash table", func, gxact->gid); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); } } #endif - - gxact->ondisk = true; - gxact->prepare_start_lsn = InvalidXLogRecPtr; - gxact->prepare_end_lsn = InvalidXLogRecPtr; - serialized_xacts++; } } LWLockRelease(TwoPhaseStateLock); @@ -2456,101 +2404,71 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) { HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; - char *start_node = NULL; - char info[MAX_2PC_INFO_SIZE]; + char tid[MAX_TID_SIZE]; + char start_node[MAX_TID_SIZE]; + char *pos = NULL; + int size = 0; + + /* + * set start_node likes ":cn001:" + * use to check whether the tid is started from this node + */ + memset(start_node, 0, MAX_TID_SIZE); + size = strlen(PGXCNodeName); + if (size + 2 >= MAX_TID_SIZE) + { + elog(PANIC, "[%s] node name length(%d) overflow", __FUNCTION__, size); + } + start_node[0] = ':'; + memcpy(start_node + 1, PGXCNodeName, size); + start_node[size + 1] = ':'; hash_seq_init(&seq, record_2pc_cache); while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[%s] %s is found in hash table seq", func, entry->key); - - if (IsXidImplicit(entry->key)) + size = strlen(entry->key); + Assert(size < MAX_TID_SIZE); + if (0 == size) { - if (0 == strlen(entry->info)) - { - elog(WARNING, "[%s] %s info length is 0", func, entry->key); + elog(LOG, "[%s] entry key is empty", __FUNCTION__); continue; } - memset(info, 0, MAX_2PC_INFO_SIZE); - memcpy(info, entry->info, strlen(entry->info)); - - start_node = strstr(info, GET_START_NODE); - if (NULL != start_node) - { - start_node += strlen(GET_START_NODE); - start_node = strtok(start_node, "\n"); - if (0 != strcmp(start_node, PGXCNodeName)) + memset(tid, 0, MAX_TID_SIZE); + memcpy(tid, entry->key, size + 1); + if (0 == strlen(tid)) { - elog(LOG, "[%s] %s start node is not %s", - func, entry->key, PGXCNodeName); + elog(LOG, "[%s] tid is empty", __FUNCTION__); continue; } + Assert(strlen(tid) < MAX_TID_SIZE); - elog(LOG, "[%s] %s start node is %s", - func, entry->key, PGXCNodeName); - } - else + if (enable_2pc_file_check) { - elog(WARNING, "[%s] %s get start node failed, info: %s", - func, entry->key, entry->info); - } + elog(LOG, "[%s] %s is found in hash table seq", __FUNCTION__, tid); } - size = strlen(entry->info); - - memset(path, 0, MAXPGPATH); - GET_2PC_FILE_PATH(path, entry->key); - - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) + if (IsXidImplicit(tid)) { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = write(fd, entry->info, size); - if(ret != size) + pos = strstr(tid, start_node); + if (NULL == pos) { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); + elog(LOG, "[%s] %s is not on start node", __FUNCTION__, tid); + continue; } - if (size != strlen(entry->info)) - { - 
elog(LOG, "[%s] %s size change from %d to %zu, info: %s", - func, entry->key, size, strlen(entry->info), entry->info); - - Assert(size < strlen(entry->info)); - ret = write(fd, entry->info + size, strlen(entry->info) - size); - if(ret != strlen(entry->info) - size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } + elog(LOG, "[%s] %s is on start node", __FUNCTION__, tid); } - close(fd); - fsync_fname(path, false); - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - entry->key, HASH_REMOVE, &found); - if (!found) + if (!save_and_remove_2pc_info(tid)) { - elog(WARNING, "[%s] %s is not found in hash table " - "when remove it", func, entry->key); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } else { - elog(LOG, "[%s] %s is removed from hash table", - func, entry->key); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, tid); } } } @@ -3392,37 +3310,315 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) #ifdef __TWO_PHASE_TRANS__ /* - * Check the entry key in the hash table is same with tid. + * check_entry_key: check the entry key in the hash table whether is same with tid. */ -inline void check_entry_key(const char *tid, const char *key, const char *func) +inline void check_entry_key(const char *tid, const char *key) { - if (!enable_2pc_entry_key_check) + if (enable_2pc_entry_key_check) { - return; - } - if (0 != strcmp(tid, key)) { - elog(PANIC, "[%s] %s get wrong key: %s", func, tid, key); + elog(PANIC, "%s(hashvalue: 0x%x) mismatch with %s(hashvalue: 0x%x)", + tid, Record2pcCacheHashCode(tid), key, Record2pcCacheHashCode(key)); + } } } -void print_record_2pc_cache(const char *func) -{ - if (NULL != record_2pc_cache) +/* + * add_2pc_info: add 2pc info to hash table + * return true: add success + * return false: add failed + */ +bool add_2pc_info(const char *tid, const char *info) { - HASH_SEQ_STATUS seq; + bool found = false; Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); - hash_seq_init(&seq, record_2pc_cache); - while ((entry = hash_seq_search(&seq)) != NULL) + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + Assert(NULL != info); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_ENTER_NULL, &found); + if (NULL == entry) { + LWLockRelease(lock); + return false; + } + Assert(NULL != entry); - elog(LOG, "[print_record_2pc_cache][%s] key: %s, info: %s", - func, entry->key, entry->info); + check_entry_key(tid, entry->key); + + memcpy(entry->info, info, strlen(info) + 1); + + LWLockRelease(lock); + + if (found) + { + elog(WARNING, "[%s] found %s", __FUNCTION__, tid); + return true; + } + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is added to hash table, entry: %p, info: %s", + __FUNCTION__, tid, entry, info); + } + + return true; +} + +/* + * append_2pc_info: append 2pc info to hash table + * return true: append success + * return false: append failed + */ +bool append_2pc_info(const char *tid, const char *info, bool *overflow) + { + bool found = false; + int cur_size = 0; + int app_size = 0; + int new_size = 0; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert (NULL != 
tid); + Assert (NULL != info); + Assert(NULL != overflow); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + + *overflow = false; + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + cur_size = strlen(entry->info); + app_size = strlen(info); + new_size = cur_size + app_size; + if (new_size >= MAX_2PC_INFO_SIZE) + { + /* overflow */ + LWLockRelease(lock); + elog(LOG, "[%s] %s new size(%d) overflow(%d)", + __FUNCTION__, tid, new_size, MAX_2PC_INFO_SIZE); + *overflow = true; + return false; + } + + memcpy(entry->info + cur_size, info, app_size + 1); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + + LWLockRelease(lock); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is found in hash table", __FUNCTION__, tid); + } + + return true; + } + +/* + * remove_2pc_info: remove 2pc info from hash table + * return true: remove success + * return false: remove failed + */ +bool remove_2pc_info(const char *tid) + { + bool found = false; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_REMOVE, &found); + + LWLockRelease(lock); + + if (!found) + { + /* not found */ + Assert(NULL == entry); + return false; + } + + /* found */ + Assert (NULL != entry); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is removed from hash table, entry: %p", + __FUNCTION__, tid, entry); + } + + return true; +} + +/* + * get_2pc_info: get 2pc info from hash table + * return true: get success + * return false: get failed + */ +bool get_2pc_info(const char *tid, char *info) +{ + bool found = false; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + Assert(NULL != info); + + LWLockAcquire(lock, LW_SHARED); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + Assert(NULL == entry); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + memcpy(info, entry->info, strlen(entry->info) + 1); + + LWLockRelease(lock); + return true; +} + +/* + * save_and_remove_2pc_info: save 2pc info from hash table to disk file, + * then remove it + * return true: save and remove success + * return false: save and remove failed + */ +bool save_and_remove_2pc_info(const char *tid) +{ + bool found = false; + Cache2pcInfo *entry = NULL; + File fd = -1; + int ret = 0; + int size = 0; + char path[MAXPGPATH]; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, tid); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* get 2pc info */ + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + 
Assert(NULL == entry); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + + if (0 == access(path, F_OK)) + { + /* file exist */ + if (enable_2pc_file_check) + { + elog(LOG, "[%s] found file %s", __FUNCTION__, path); + } + + /* remove file */ + if (0 != unlink(path)) + { + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } + else + { + elog(LOG, "[%s] unlink file %s", __FUNCTION__, path); + } + } + + /* save to file */ + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + LWLockRelease(lock); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); + } + + size = strlen(entry->info); + ret = write(fd, entry->info, size); + if(ret != size) + { + LWLockRelease(lock); + close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, size: %d, info: %s", + __FUNCTION__, path, strerror(errno), ret, size, entry->info); } + close(fd); + + /* remove 2pc info */ + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_REMOVE, &found); + + LWLockRelease(lock); + + Assert(found); + Assert(NULL != entry); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is removed from hash table, entry: %p", + __FUNCTION__, tid, entry); + } + + return true; } + /* * Check whether the 2pc file is exist when it is saved in the hash table. */ @@ -3433,12 +3629,13 @@ void check_2pc_file(const char *tid, const char *info, const char *func) int size = 0; struct stat filestate; char path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; + int ret = 0; + File fd = -1; + char result[MAX_2PC_INFO_SIZE]; - Assert (NULL != tid); - Assert (NULL != info); - Assert (NULL != func); + Assert(NULL != tid); + Assert(NULL != info); + Assert(NULL != func); GET_2PC_FILE_PATH(path, tid); if (0 != access(path, F_OK)) @@ -3446,106 +3643,86 @@ void check_2pc_file(const char *tid, const char *info, const char *func) return; } - elog(LOG, "[check_2pc_file][%s] node(%s) found file %s", - func, PGXCNodeName, path); + elog(LOG, "[check_2pc_file][%s] found file %s", func, path); if(stat(path, &filestate) == -1) { - elog(ERROR, "[check_2pc_file][%s] could not get status of file %s", - func, path); + elog(WARNING, "[check_2pc_file][%s] could not stat file %s, info: %s", + func, path, info); + return; } size = filestate.st_size; - if (0 != size) - { - int ret = 0; - File fd = -1; - char result[size + 1]; - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - elog(ERROR, "[check_2pc_file][%s] could not open file %s for read", - func, path); - } - - memset(result, 0, size +1); - ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); - if(ret != size) - { - FileClose(fd); - elog(ERROR, "[check_2pc_file][%s] read %s error, ret: %d, size: %d", - func, path, ret, size); - } - FileClose(fd); - - if (0 != strcmp(result, info)) - { - elog(LOG, "[check_2pc_file][%s] file %s result: %s, info: %s", - func, path, result, info); - } - } - else + if (0 == size) { - elog(LOG, "[check_2pc_file][%s] get empty file %s, info: %s", + elog(WARNING, "[check_2pc_file][%s] file %s is empty, info: %s", func, path, info); + return; } - if (NULL == record_2pc_cache) + if (size >= MAX_2PC_INFO_SIZE) { - elog(LOG, "[check_2pc_file][%s] record_2pc_cache is NULL, " - "tid: %s, info: %s", func, tid, info); + elog(WARNING, "[check_2pc_file][%s] file %s 
size(%d) overflow(%d)", + func, path, size, MAX_2PC_INFO_SIZE); return; } - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (!found) + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) { - elog(LOG, "[check_2pc_file][%s] %s is not found " - "in hash table, info: %s", func, tid, info); + elog(WARNING, "[check_2pc_file][%s] could not open file %s, " + "errMsg: %s", func, path, strerror(errno)); return; } - Assert (NULL != entry); + memset(result, 0, size +1); + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + FileClose(fd); + elog(WARNING, "[check_2pc_file][%s] could not read file %s, " + "ret: %d, file size: %d", func, path, ret, size); + return; + } + FileClose(fd); - if (0 != strcmp(entry->info, info)) + if (0 != strcmp(result, info)) { - elog(LOG, "[check_2pc_file][%s] %s info change from '%s' to '%s'", - func, tid, info, entry->info); + elog(LOG, "[check_2pc_file][%s] file %s mismatch, " + "result: %s, info: %s", func, path, result, info); } } } void record_2pc_redo_remove_gid_xid(TransactionId xid) { - int i; - GlobalTransaction gxact = NULL; - bool found = false; + int i; + GlobalTransaction gxact = NULL; + bool found = false; if(!enable_2pc_recovery_info) { - return ; + return; } - for (i = 0; i < TwoPhaseState->numPrepXacts; i++) - { - gxact = TwoPhaseState->prepXacts[i]; + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) - { - found = true; - break; - } - } + if (gxact->xid == xid) + { + found = true; + break; + } + } - Assert(RecoveryInProgress()); + Assert(RecoveryInProgress()); - if (found) - { - remove_2pc_records(gxact->gid, false); - } + if (found) + { + remove_2pc_records(gxact->gid, false); + } } void record_2pc_involved_nodes_xid(const char * tid, @@ -3553,46 +3730,42 @@ void record_2pc_involved_nodes_xid(const char * tid, GlobalTransactionId startxid, char * nodestring, GlobalTransactionId xid) -{// #lizard forgives - File fd = 0; - int ret = 0; - int size = 0; - StringInfoData content; - struct stat fst; - char path[MAXPGPATH]; - off_t fileSize; - char *result = NULL; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_involved_nodes_xid"; +{ + File fd = 0; + int ret = 0; + int size = 0; + StringInfoData content; + struct stat fst; + char path[MAXPGPATH]; + char *result = NULL; #ifdef __TWO_PHASE_TESTS__ - XLogRecPtr xlogrec = 0; + XLogRecPtr xlogrec = 0; #endif - - if (!enable_2pc_recovery_info) - { - return ; - } + + if (!enable_2pc_recovery_info) + { + return; + } if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", - func, tid, startnode, nodestring); + __FUNCTION__, tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "[%s] gid is empty", func); + elog(ERROR, "[%s] gid is empty", __FUNCTION__); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "[%s] %s startnode is empty", func, tid); + elog(PANIC, "[%s] %s startnode is empty", __FUNCTION__, tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "[%s] %s participants is empty", func, tid); + elog(PANIC, "[%s] %s participants is empty", __FUNCTION__, tid); } initStringInfo(&content); @@ -3609,19 +3782,20 @@ void record_2pc_involved_nodes_xid(const char * tid, /* if tid already exists, check content and return */ if (NULL != record_2pc_cache) { + char info[MAX_2PC_INFO_SIZE]; + Assert(strlen(tid) < 
MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); - if (found) + + if (get_2pc_info(tid, info)) { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); - if (strncmp(entry->info, content.data, size) != 0) + if (strncmp(info, content.data, size) != 0) { elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", - func, tid, content.data, entry->info); + "content: %s, info: %s", __FUNCTION__, tid, + content.data, info); } resetStringInfo(&content); @@ -3635,23 +3809,21 @@ void record_2pc_involved_nodes_xid(const char * tid, /* if file already exists, check content and return */ if (stat(path, &fst) >= 0) { - fileSize = fst.st_size; - result = (char *)palloc0(fileSize + 1); + int file_size = fst.st_size; + result = (char *)palloc0(file_size + 1); fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); if (fd < 0) { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("[%s] could not open file %s for read", func, path))); + elog(ERROR, "[%s] could not open file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - if(ret != fileSize) + ret = FileRead(fd, result, file_size, WAIT_EVENT_BUFFILE_READ); + if(ret != file_size) { FileClose(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("[%s] could not read file %s, ret: %d", func, path, ret))); + elog(ERROR, "[%s] could not read file %s, ret: %d, file_size: %d", + __FUNCTION__, path, ret, file_size); } FileClose(fd); @@ -3661,7 +3833,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " "content: %s, info: %s", - func, tid, content.data, result); + __FUNCTION__, tid, content.data, result); } pfree(result); @@ -3692,98 +3864,34 @@ void record_2pc_involved_nodes_xid(const char * tid, run_pg_clean = 1; complish = true; elog(STOP, "[%s] twophase exception: simulate kill start node " - "after record 2pc file", func); + "after record 2pc file", __FUNCTION__); } #endif } - if (NULL != record_2pc_cache && size < MAX_2PC_INFO_SIZE) - { - Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_ENTER_NULL, &found); - if (NULL != entry) - { - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - - if (found) - { - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, tid); - } - else - { - elog(LOG, "[%s] %s is added to hash table, entry: %p, " - "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, - record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); - } - } - else if (enable_2pc_entry_trace || enable_2pc_hash_table_check) - { - elog(LOG, "[%s] %s is added to hash table, entry: %p, " - "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, - record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); - } - - memcpy(entry->info, content.data, size + 1); - check_entry_key(tid, entry->key, func); - - if (enable_2pc_hash_table_check) + if (NULL != record_2pc_cache) { - int retry_times = 0; - Cache2pcInfo *entry_debug = NULL; - - GET_2PC_FILE_PATH(path, tid); - - while (retry_times++ < HASH_TAB_RETRY_MAX) + if (size < MAX_2PC_INFO_SIZE) { - entry_debug = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry_debug); - 
check_entry_key(tid, entry_debug->key, func); - break; - } - - /* not found */ - elog(LOG, "[%s] %s is not found in hash table, retry times: %d", - func, tid, retry_times); - - Assert(NULL == entry_debug); - - if (0 == access(path, F_OK)) - { - elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); - break; - } - - print_record_2pc_cache(func); - pg_usleep(HASH_TAB_RETRY_SLEEP); - } + Assert(strlen(tid) < MAX_TID_SIZE); - if (retry_times >= HASH_TAB_RETRY_MAX) + if (add_2pc_info(tid, content.data)) { - elog(PANIC, "[%s] %s is not found in hash table", func, tid); - } - } + check_2pc_file(tid, content.data, __FUNCTION__); resetStringInfo(&content); pfree(content.data); + return; } + + elog(LOG, "[%s] %s add to cache failed", __FUNCTION__, tid); + } else { - elog(LOG, "[%s] %s entry is NULL", func, tid); - } + elog(LOG, "[%s] %s info size(%d) overflow(%d)", + __FUNCTION__, tid, size, MAX_2PC_INFO_SIZE); } - else if (NULL != record_2pc_cache) - { - elog(LOG, "[%s] %s size: %d, max info size: %d", - func, tid, size, MAX_2PC_INFO_SIZE); } GET_2PC_FILE_PATH(path, tid); @@ -3807,7 +3915,7 @@ void record_2pc_involved_nodes_xid(const char * tid, if (fd < 0) { elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); return; } @@ -3816,7 +3924,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", - func, path, strerror(errno), ret, content.data); + __FUNCTION__, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3833,12 +3941,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta File fd = -1; int ret = 0; int size = 0; - int new_size = 0; - int retry_times = 0; XLogRecPtr xlogrec = 0; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_commit_timestamp"; if (!enable_2pc_recovery_info) { @@ -3848,7 +3951,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] %s commit_timestamp: "INT64_FORMAT, - func, tid, commit_timestamp); + __FUNCTION__, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && @@ -3856,7 +3959,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta TWO_PHASE_COMMIT_END == g_twophase_state.state)) { elog(ERROR, "[%s] could not commit transaction '%s' on node '%s' " - "with InvalidGlobalTimestamp", func, tid, PGXCNodeName); + "with InvalidGlobalTimestamp", __FUNCTION__, tid, PGXCNodeName); } if (!RecoveryInProgress()) @@ -3879,121 +3982,39 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); - GET_2PC_FILE_PATH(path, tid); - - while (NULL != record_2pc_cache && retry_times++ < HASH_TAB_RETRY_MAX) - { - Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, tid); - } - else if (enable_2pc_entry_trace) + if (NULL != record_2pc_cache) { - elog(LOG, "[%s] %s is found in hash table", func, tid); - } + bool overflow = false; - new_size = size + strlen(entry->info); + Assert(strlen(tid) < MAX_TID_SIZE); - 
if (new_size < MAX_2PC_INFO_SIZE) + if (append_2pc_info(tid, content.data, &overflow)) { - /* save to hash table */ - memcpy(entry->info + strlen(entry->info), content.data, size + 1); - check_entry_key(tid, entry->key, func); - resetStringInfo(&content); pfree(content.data); return; } - /* save to file */ - elog(LOG, "[%s] %s new size(%d) overflow(%d)", - func, tid, new_size, MAX_2PC_INFO_SIZE); - - GET_2PC_FILE_PATH(path, tid); - - if (RecoveryInProgress()) - { - fd = PathNameOpenFile(path, O_RDWR | O_TRUNC | O_CREAT, - S_IRUSR | S_IWUSR); - } - else - { - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - } - if (fd < 0) - { - elog(ERROR, "[%s] could not append timestamp, file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = FileWrite(fd, entry->info, strlen(entry->info), - WAIT_EVENT_BUFFILE_WRITE); - if(ret != strlen(entry->info)) - { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - ret = FileWrite(fd, content.data, size, - WAIT_EVENT_BUFFILE_WRITE); - if(ret != size) + if (overflow) { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, content.data); - } - FileClose(fd); + elog(LOG, "[%s] %s is overflow", __FUNCTION__, tid); - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (!found) + if (save_and_remove_2pc_info(tid)) { - elog(WARNING, "[%s] %s is not found in hash table when remove it", - func, tid); + elog(LOG, "[%s] %s save to file", __FUNCTION__, tid); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[%s] %s is removed from hash table", func, entry->key); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } - - resetStringInfo(&content); - pfree(content.data); - return; } - - /* not found */ - elog(LOG, "[%s] %s is not found in hash table, retry times: %d", - func, tid, retry_times); - - Assert(NULL == entry); - if (0 == access(path, F_OK)) + else { - elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); - break; + elog(LOG, "[%s] %s is not found in hash table", __FUNCTION__, tid); } - - print_record_2pc_cache(func); - - pg_usleep(HASH_TAB_RETRY_SLEEP); } - if (NULL != record_2pc_cache) - { - elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); - } + GET_2PC_FILE_PATH(path, tid); /* the 2pc file exists already */ fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); @@ -4014,17 +4035,17 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (0 == strcmp(gxact->gid, tid)) { elog(ERROR, "[%s] could not append timestamp in file %s, " - "errMsg: %s", func, path, strerror(errno)); + "errMsg: %s", __FUNCTION__, path, strerror(errno)); } } #endif elog(LOG, "[%s] could not open file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } else { elog(PANIC, "[%s] could not open file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } return; } @@ -4034,7 +4055,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta memset(file_content, 0, FILE_CONTENT_SIZE); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[%s] before append file: %s, file_content: %s, content.data: %s, " - "ret: %d", func, path, file_content, content.data, ret); + "ret: %d", __FUNCTION__, 
path, file_content, content.data, ret); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); @@ -4042,7 +4063,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } if (enable_distri_print) @@ -4051,7 +4072,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta FileSeek(fd, 0, SEEK_SET); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[%s] after append file: %s, file_content: %s, ret: %d", - func, path, file_content, ret); + __FUNCTION__, path, file_content, ret); } FileClose(fd); @@ -4063,9 +4084,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "remove_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4074,7 +4092,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s record_in_xlog: %d", func, tid, record_in_xlog); + elog(LOG, "[%s] %s record_in_xlog: %d", __FUNCTION__, tid, record_in_xlog); } if (!RecoveryInProgress() && record_in_xlog) @@ -4086,35 +4104,36 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } + GET_2PC_FILE_PATH(path, tid); + if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); + if (enable_2pc_entry_key_check) { - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) + char info[MAX_2PC_INFO_SIZE]; + if (get_2pc_info(tid, info)) { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); } } - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (found) + + /* remove from hash table */ + if (remove_2pc_info(tid)) { - Assert(NULL != entry); - if (enable_2pc_entry_trace) + if (enable_2pc_file_check) { - elog(LOG, "[%s] %s is removed from hash table", func, tid); + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] still found file %s", __FUNCTION__, path); + } } return; } } - GET_2PC_FILE_PATH(path, tid); - /* * no need to check file exists. * since when it do not exists, unlink won't success. 
@@ -4122,7 +4141,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (0 != unlink(path)) { elog(LOG, "[%s] could not unlink file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } } @@ -4130,11 +4149,6 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) { char path[MAXPGPATH]; char new_path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; - File fd = 0; - int ret = 0; - char *func = "rename_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4143,7 +4157,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, func, tid, timestamp); + elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, __FUNCTION__, tid, timestamp); } if (0 == timestamp) @@ -4161,103 +4175,64 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } - GET_2PC_FILE_PATH(path, tid); - snprintf(new_path, MAXPGPATH, "%s." INT64_FORMAT ".rollback", path, timestamp); - if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - if (0 == access(new_path, F_OK)) - { - if (RecoveryInProgress()) + if (save_and_remove_2pc_info(tid)) { - elog(LOG, "[%s] file %s exist", func, new_path); + elog(LOG, "[%s] %s save to file", __FUNCTION__, tid); } else { - elog(WARNING, "[%s] file %s exist", func, new_path); - } - if (0 != unlink(new_path)) - { - elog(ERROR, "[%s] could not unlink file %s, errMsg: %s", - func, new_path, strerror(errno)); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } } - fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - if (fd < 0) - { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, new_path, strerror(errno)); - } + GET_2PC_FILE_PATH(path, tid); + snprintf(new_path, MAXPGPATH, "%s." 
INT64_FORMAT ".rollback", path, timestamp); - ret = FileWrite(fd, entry->info, strlen(entry->info), - WAIT_EVENT_BUFFILE_WRITE); - if(ret != strlen(entry->info)) + if (0 != access(path, F_OK)) { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - FileClose(fd); - - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (!found) + if (RecoveryInProgress()) { - elog(ERROR, "[%s] %s is not found in hash table when remove it", - func, tid); + elog(LOG, "[%s] could not access file %s in recovery mode, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[%s] %s is removed from hash table", func, tid); - } - return; - } + elog(WARNING, "[%s] could not access file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - if (0 != access(path, F_OK)) - { - elog(LOG, "[%s] could not access file %s, errMsg: %s", - func, path, strerror(errno)); return; } if (0 == access(new_path, F_OK)) { if (RecoveryInProgress()) { - elog(LOG, "[%s] file %s exist", func, new_path); + elog(LOG, "[%s] file %s exist", __FUNCTION__, new_path); } else { - elog(WARNING, "[%s] file %s exist", func, new_path); + elog(WARNING, "[%s] file %s exist", __FUNCTION__, new_path); } if (0 != unlink(new_path)) { elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", - func, new_path, strerror(errno)); + __FUNCTION__, new_path, strerror(errno)); return; } } if (0 != link(path, new_path)) { elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", - func, path, new_path, strerror(errno)); + __FUNCTION__, path, new_path, strerror(errno)); } if (0 != unlink(path)) { elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } } @@ -4266,10 +4241,7 @@ void record_2pc_readonly(const char *gid) File fd = 0; int ret = 0; char path[MAXPGPATH]; - char content[10] = "readonly"; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_readonly"; + char *content = "readonly"; if(!enable_2pc_recovery_info) { @@ -4278,7 +4250,7 @@ void record_2pc_readonly(const char *gid) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is readonly", func, gid); + elog(LOG, "[%s] %s is readonly", __FUNCTION__, gid); } if (!RecoveryInProgress()) @@ -4293,39 +4265,14 @@ void record_2pc_readonly(const char *gid) { Assert(strlen(gid) < MAX_TID_SIZE); Assert(strlen(content) < MAX_2PC_INFO_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gid, HASH_ENTER_NULL, &found); - if (NULL != entry) - { - check_entry_key(gid, entry->key, func); - check_2pc_file(gid, entry->info, func); - if (found) - { - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, gid); - } - else - { - elog(LOG, "[%s] %s is found in hash table", func, gid); - } - } - else if (enable_2pc_entry_trace) + if (add_2pc_info(gid, content)) { - elog(LOG, "[%s] %s is added to hash table", func, gid); - } - - memcpy(entry->info, content, strlen(content) + 1); - check_entry_key(gid, entry->key, func); - + check_2pc_file(gid, content, __FUNCTION__); return; } - else - { - elog(LOG, "[%s] %s entry is NULL", func, gid); - } + + elog(LOG, "[%s] %s add to cache failed", __FUNCTION__, gid); } /* the 2pc dir is already created in initdb */ @@ -4351,7 +4298,7 @@ void record_2pc_readonly(const char *gid) if (fd < 0) { elog(ERROR, "[%s] could 
not create file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); return; } @@ -4360,7 +4307,7 @@ void record_2pc_readonly(const char *gid) { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", - func, path, strerror(errno), ret, content); + __FUNCTION__, path, strerror(errno), ret, content); } FileClose(fd); } @@ -4370,35 +4317,36 @@ void record_2pc_readonly(const char *gid) */ char *get_2pc_info_from_cache(const char *tid) { - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "get_2pc_info_from_cache"; + char *info = NULL; - if (NULL != record_2pc_cache) + if (NULL == record_2pc_cache) { + return NULL; + } + Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); + info = (char *)palloc0(MAX_2PC_INFO_SIZE); + if (get_2pc_info(tid, info)) + { + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is found in hast table, key: %s, info: %s", - func, tid, entry->key, entry->info); + elog(LOG, "[%s] %s is found in hash table", __FUNCTION__, tid); } - return entry->info; + return info; } + pfree(info); + if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is not found in hast table", func, tid); - } + elog(LOG, "[%s] %s is not found in hash table", __FUNCTION__, tid); } + return NULL; } @@ -4410,7 +4358,8 @@ char *get_2pc_list_from_cache(int *count) HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; char *recordList = NULL; - char *func = "get_2pc_list_from_cache"; + + Assert(NULL != count); if (NULL == record_2pc_cache) { @@ -4421,12 +4370,7 @@ char *get_2pc_list_from_cache(int *count) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, func); - - if (NULL != count && *count >= MAX_OUTPUT_FILE) - { - break; - } + check_2pc_file(entry->key, entry->info, __FUNCTION__); if(NULL == recordList) { @@ -4439,9 +4383,10 @@ char *get_2pc_list_from_cache(int *count) strlen(entry->key) + strlen(recordList) + 2); sprintf(recordList, "%s,%s", recordList, entry->key); } - if (NULL != count) + + if (++(*count) >= MAX_OUTPUT_FILE) { - (*count)++; + break; } } @@ -4465,11 +4410,11 @@ Record2pcCacheInit(void) info.keysize = MAX_TID_SIZE; info.entrysize = record_2pc_entry_size; - info.num_partitions = record_2pc_partitions; + info.num_partitions = NUM_CACHE_2PC_PARTITIONS; flags = HASH_ELEM | HASH_PARTITION; - record_2pc_cache = ShmemInitHash("Record 2pc Cache", + record_2pc_cache = ShmemInitHash("Record 2pc cache", record_2pc_cache_size, record_2pc_cache_size, &info, flags); } @@ -4488,4 +4433,21 @@ Record2pcCacheSize(void) return cache_size; } +/* + * Record2pcCacheHashCode + * Compute the hash code associated with a tid + * + * This must be passed to the lookup/insert/delete routines along with the + * tag. We do it like this because the callers need to know the hash code + * in order to determine which buffer partition to lock, and we don't want + * to do the hash computation twice. 
+ */ +uint32 +Record2pcCacheHashCode(const char *tid) +{ + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + return get_hash_value(record_2pc_cache, tid); +} + #endif diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 342621e3..2a555ebc 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -457,6 +457,13 @@ InitializeLWLocks(void) for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + /* Initialize 2pc info cache LWLocks in main array */ + lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + + NUM_BUFFER_PARTITIONS + NUM_LOCK_PARTITIONS + + NUM_PREDICATELOCK_PARTITIONS; + for (id = 0; id < NUM_CACHE_2PC_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_2PC_INFO_CACHE); + /* Initialize named tranches. */ if (NamedLWLockTrancheRequests > 0) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 1f205d34..378f9c7a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2705,7 +2705,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_2pc_file_check, - true, + false, NULL, NULL, NULL }, { @@ -2726,15 +2726,6 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - { - {"enable_2pc_hash_table_check", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Enable 2PC hash table check."), - NULL - }, - &enable_2pc_hash_table_check, - false, - NULL, NULL, NULL - }, #endif #ifdef __TBASE__ @@ -4809,14 +4800,6 @@ static struct config_int ConfigureNamesInt[] = 2048, 1028, INT_MAX, NULL, NULL, NULL }, - { - {"record_2pc_partitions", PGC_POSTMASTER, CUSTOM_OPTIONS, - gettext_noop("2PC info cache partition number."), - }, - &record_2pc_partitions, - 32, 1, INT_MAX, - NULL, NULL, NULL - }, #endif #ifdef __TBASE__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index e0fe09d2..06f9685e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -101,11 +101,9 @@ extern bool enable_2pc_file_cache; extern bool enable_2pc_file_check; extern bool enable_2pc_entry_key_check; extern bool enable_2pc_entry_trace; -extern bool enable_2pc_hash_table_check; extern int record_2pc_cache_size; extern int record_2pc_entry_size; -extern int record_2pc_partitions; #endif extern Size TwoPhaseShmemSize(void); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 4a088f5e..25ee91a8 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -173,6 +173,9 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; /* Number of partitions of the shared buffer mapping hashtable */ #define NUM_BUFFER_PARTITIONS 128 +/* Number of partitions of the 2pc info cache hashtable */ +#define NUM_CACHE_2PC_PARTITIONS 128 + /* Number of partitions the shared lock tables are divided into */ #define LOG2_NUM_LOCK_PARTITIONS 4 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) @@ -187,9 +190,10 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS) #define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \ (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS) -#define NUM_FIXED_LWLOCKS \ +#define CACHE_2PC_LWLOCK_OFFSET \ (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS) - +#define NUM_FIXED_LWLOCKS \ + (CACHE_2PC_LWLOCK_OFFSET + NUM_CACHE_2PC_PARTITIONS) typedef enum LWLockMode { LW_EXCLUSIVE, @@ -288,6 +292,7 @@ typedef enum 
BuiltinTrancheIds LWTRANCHE_PARALLEL_WORKER_DSA, #endif LWTRANCHE_TBM, + LWTRANCHE_2PC_INFO_CACHE, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index f9926dda..e13fdd2a 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -75,8 +75,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_2pc_entry_key_check | on enable_2pc_entry_trace | off enable_2pc_file_cache | on - enable_2pc_file_check | on - enable_2pc_hash_table_check | off + enable_2pc_file_check | off enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -142,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(69 rows) +(68 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 3c4d6a7696c2b85c1af5d8db4a6cb9c0a38ba145 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 3 Jun 2021 11:15:53 +0800 Subject: [PATCH 387/578] fix gtm standby promote when master shutdown http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087569165 (merge request !388) Squash merge branch 'sigmalin_v2.15.20' into 'Tbase_v2.15.19' * fix gtm standby promote when master shutdown http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087569165 --- src/gtm/main/gtm_standby.c | 22 +++++++++++++++------- src/gtm/main/main.c | 8 +++++--- src/include/gtm/gtm_standby.h | 2 +- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index f441c889..ba11b416 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -52,7 +52,7 @@ extern int GTMPortNumber; #ifndef __XLOG__ static GTM_Conn *gtm_standby_connect_to_standby_int(int *report_needed); #endif -static GTM_Conn *gtm_standby_connectToActiveGTM(void); +static GTM_Conn *gtm_standby_connectToActiveGTM(int timeout); static void AddBackupLabel(uint64 segment_no); /* Defined in main.c */ @@ -62,9 +62,9 @@ extern int GTM_Standby_Connetion_Timeout; int -gtm_standby_start_startup(void) +gtm_standby_start_startup(int timeout) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + GTM_ActiveConn = gtm_standby_connectToActiveGTM(timeout); if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) { int save_errno = errno; @@ -644,7 +644,7 @@ void gtm_standby_finishActiveConn(void) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + GTM_ActiveConn = gtm_standby_connectToActiveGTM(0); if (GTM_ActiveConn == NULL) { elog(DEBUG3, "Error in connection"); @@ -664,7 +664,7 @@ gtm_standby_finishActiveConn(void) } static GTM_Conn * -gtm_standby_connectToActiveGTM(void) +gtm_standby_connectToActiveGTM(int timeout) { char connect_string[1024]; int active_port = Recovery_StandbyGetActivePort(); @@ -673,8 +673,16 @@ gtm_standby_connectToActiveGTM(void) /* Need to connect to Active-GTM again here */ elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); - sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", - active_address, active_port, NodeName, GTM_NODE_GTM); + if (timeout != 0) + { + sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + active_address, active_port, NodeName, GTM_NODE_GTM, timeout); + } + else + { + sprintf(connect_string, "host=%s 
port=%d node_name=%s remote_type=%d", + active_address, active_port, NodeName, GTM_NODE_GTM); + } return PQconnectGTM(connect_string); } diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index e894a5bc..93fc1901 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -82,6 +82,8 @@ extern char *optarg; #define LOOPS_UNTIL_HIBERNATE 50 #define HIBERNATE_FACTOR 25 +#define GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT (2) + static char *progname = "gtm"; char *ListenAddresses; int GTMPortNumber; @@ -1097,7 +1099,7 @@ main(int argc, char *argv[]) */ if (Recovery_IsStandby()) { - if (!gtm_standby_start_startup()) + if (!gtm_standby_start_startup(GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT)) { #ifdef __TBASE__ elog(LOG, "Failed to establish a connection to active-GTM."); @@ -2046,7 +2048,7 @@ gtm_standby_pre_server_loop(char *data_dir) * retry establish a connection between the active and standby, * controlling frequency with select timeout */ - if (gtm_standby_start_startup()) + if (gtm_standby_start_startup(GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT)) { elog(LOG, "Standby GTM Startup connection established with active-GTM."); break; @@ -3233,7 +3235,7 @@ GTM_ThreadWalReceiver(void *argp) sleep(1); - if (!gtm_standby_start_startup()) + if (!gtm_standby_start_startup(0)) { elog(ERROR, "Failed to establish a connection to active-GTM."); } diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h index 01a037b7..406fed05 100644 --- a/src/include/gtm/gtm_standby.h +++ b/src/include/gtm/gtm_standby.h @@ -27,7 +27,7 @@ bool gtm_is_standby(void); void gtm_set_standby(bool standby); void gtm_set_active_conninfo(const char *addr, int port); -int gtm_standby_start_startup(void); +int gtm_standby_start_startup(int timeout); int gtm_standby_finish_startup(void); int gtm_standby_restore_next_gxid(void); From 9368787b4435047425ea52eec6a72559af6c056d Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 9 Jun 2021 11:17:50 +0800 Subject: [PATCH 388/578] Bugfix: use extended protocol cause perform bad in some case (merge request !392), ID88518281 http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088518281 --- src/backend/pgxc/pool/execRemote.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 74bdc28e..d5f96393 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8966,9 +8966,7 @@ ExecRemoteQuery(PlanState *pstate) if (step->force_autocommit) need_tran_block = false; else - need_tran_block = (step->statement && step->statement[0] != '\0') || - step->cursor || - node->rqs_num_params || + need_tran_block = step->cursor || (!step->read_only && total_conn_count > 1) || (TransactionBlockStatusCode() == 'T'); From 8f3b934a110320eb7f4f1fcbc27b872d20b11214 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 15:27:43 +0800 Subject: [PATCH 389/578] fix bug when accessing temp sequence in a redistribution-plan. 
http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088413441 --- src/backend/catalog/namespace.c | 9 +++++++++ src/backend/commands/sequence.c | 19 +++++++++++++++++-- src/test/regress/expected/xc_temp.out | 14 ++++++++++++++ src/test/regress/sql/xc_temp.sql | 8 ++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 53be4dc7..6af59c1c 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -4108,6 +4108,15 @@ recomputeNamespacePath(void) if (!list_member_oid(oidlist, PG_CATALOG_NAMESPACE)) oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist); +#ifdef __TBASE__ + /* + * If this is secondary backend of a distributed session, check if primary backend + * of the same session has created temporary namespace and wire it up. + */ + if (IsConnFromDatanode() && IS_PGXC_DATANODE && !OidIsValid(myTempNamespace)) + FindTemporaryNamespace(); +#endif + if (OidIsValid(myTempNamespace) && !list_member_oid(oidlist, myTempNamespace)) oidlist = lcons_oid(myTempNamespace, oidlist); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index a248f5c0..254b0d63 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1922,7 +1922,18 @@ GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schem char *seqname, *dbname, *relname; char namespace[NAMEDATALEN * 2]; int charlen; - bool is_temp = seqrel->rd_backend == MyBackendId; + bool is_temp = false; + +#ifdef PGXC + /* + * In case of distributed session use MyFirstBackendId for temp objects. + */ + if (OidIsValid(MyCoordId)) + is_temp = seqrel->rd_backend == MyFirstBackendId; + else +#endif + is_temp = seqrel->rd_backend == MyBackendId; + /* Get all the necessary relation names */ dbname = get_database_name(seqrel->rd_node.dbNode); @@ -1989,7 +2000,11 @@ IsTempSequence(Oid relid) /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); - +#ifdef PGXC + if (OidIsValid(MyCoordId)) + res = seqrel->rd_backend == MyFirstBackendId; + else +#endif res = seqrel->rd_backend == MyBackendId; relation_close(seqrel, NoLock); return res; diff --git a/src/test/regress/expected/xc_temp.out b/src/test/regress/expected/xc_temp.out index 6f779cc5..4a3843f6 100644 --- a/src/test/regress/expected/xc_temp.out +++ b/src/test/regress/expected/xc_temp.out @@ -1044,3 +1044,17 @@ CREATE TABLE table_child (like table_parent, b int); ERROR: relation "table_parent" does not exist DROP TABLE table_child; ERROR: table "table_child" does not exist +-- Access temp sequence in redistribution. +CREATE TEMP TABLE IF NOT EXISTS rep_tbl_temp (col_int int, col_bigserial bigserial, constraint pk_p_id_key primary key (col_int)) DISTRIBUTE BY REPLICATION; +INSERT INTO rep_tbl_temp values (0); +INSERT INTO rep_tbl_temp values (1); +INSERT INTO rep_tbl_temp values (2); +SELECT col_int, col_bigserial FROM rep_tbl_temp ORDER BY col_int; + col_int | col_bigserial +---------+--------------- + 0 | 1 + 1 | 2 + 2 | 3 +(3 rows) + +DROP TABLE rep_tbl_temp; diff --git a/src/test/regress/sql/xc_temp.sql b/src/test/regress/sql/xc_temp.sql index 539e1c07..1a8ccbed 100644 --- a/src/test/regress/sql/xc_temp.sql +++ b/src/test/regress/sql/xc_temp.sql @@ -141,3 +141,11 @@ DROP TABLE table_rep,table_hash,table_rb; CREATE TEMP TABLE table_parent (a int); CREATE TABLE table_child (like table_parent, b int); DROP TABLE table_child; + +-- Access temp sequence in redistribution. 
+CREATE TEMP TABLE IF NOT EXISTS rep_tbl_temp (col_int int, col_bigserial bigserial, constraint pk_p_id_key primary key (col_int)) DISTRIBUTE BY REPLICATION; +INSERT INTO rep_tbl_temp values (0); +INSERT INTO rep_tbl_temp values (1); +INSERT INTO rep_tbl_temp values (2); +SELECT col_int, col_bigserial FROM rep_tbl_temp ORDER BY col_int; +DROP TABLE rep_tbl_temp; \ No newline at end of file From f4bd333f3310dfec772e15b194c1bb808b9200a3 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 15:59:16 +0800 Subject: [PATCH 390/578] fix review --- src/backend/commands/sequence.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 254b0d63..35eb5109 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1929,10 +1929,10 @@ GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schem * In case of distributed session use MyFirstBackendId for temp objects. */ if (OidIsValid(MyCoordId)) - is_temp = seqrel->rd_backend == MyFirstBackendId; + is_temp = (seqrel->rd_backend == MyFirstBackendId); else #endif - is_temp = seqrel->rd_backend == MyBackendId; + is_temp = (seqrel->rd_backend == MyBackendId); /* Get all the necessary relation names */ dbname = get_database_name(seqrel->rd_node.dbNode); From 97522c2fc5bf4f03c5c0c0fa531e9eb9f4a28eba Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 16:00:35 +0800 Subject: [PATCH 391/578] fix review --- src/backend/commands/sequence.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 35eb5109..07209cc8 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -2000,9 +2000,10 @@ IsTempSequence(Oid relid) /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); + #ifdef PGXC if (OidIsValid(MyCoordId)) - res = seqrel->rd_backend == MyFirstBackendId; + res = (seqrel->rd_backend == MyFirstBackendId); else #endif res = seqrel->rd_backend == MyBackendId; From 4621fe6f8867fd69341b7d1879bda8ffeee2a4f8 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 16:06:21 +0800 Subject: [PATCH 392/578] fix review --- src/backend/commands/sequence.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 07209cc8..c21d7639 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -2006,7 +2006,7 @@ IsTempSequence(Oid relid) res = (seqrel->rd_backend == MyFirstBackendId); else #endif - res = seqrel->rd_backend == MyBackendId; + res = (seqrel->rd_backend == MyBackendId); relation_close(seqrel, NoLock); return res; } From a2a8e2bd1020a008bfa8c979e7b8f225339ce357 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 20:10:41 +0800 Subject: [PATCH 393/578] Skip invalid relid in group information check this could happen when from list contain CTE tables, but they are okay to join with tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696088748699 --- src/test/regress/expected/insert.out | 11 +++++++++++ src/test/regress/sql/insert.sql | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index d12e3494..528cd56d 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -975,3 +975,14 @@ insert into 
returningwrtest values (2, 'foo') returning returningwrtest; (1 row) drop table returningwrtest; +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index d8f352ab..b9b08d55 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -572,4 +572,4 @@ select count(*) from t2_new; drop table t2; drop table t2_rep; drop table t2_new; -reset default_locator_type; \ No newline at end of file +reset default_locator_type; From 09c0fcb2ff705e0953eff4619817306fac6e4394 Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Wed, 16 Jun 2021 10:30:04 +0800 Subject: [PATCH 394/578] =?UTF-8?q?=E3=80=90=E3=80=90TBase=E3=80=91?= =?UTF-8?q?=E3=80=902.15.19=E3=80=91dblink=E6=8F=92=E4=BB=B6=E7=9A=84DBLIN?= =?UTF-8?q?K=5FCOPY=5FTABLE=E4=B8=8D=E8=83=BD=E8=BF=94=E5=9B=9E=E6=8B=B7?= =?UTF-8?q?=E8=B4=9D=E7=9A=84=E8=A1=8C=E6=95=B0=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E8=BF=94=E5=9B=9E=E7=9A=84=E8=A1=8C=E6=95=B0=E3=80=82=E3=80=91?= =?UTF-8?q?http://tapd.oa.com/pgxz/bugtrace/bugs/view=3Fbug=5Fid=3D1010092?= =?UTF-8?q?131088867115=20(merge=20request=20!403)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'tbase_v2_hanway_616' into 'Tbase_v2.15.19' * 【【TBase】【2.15.19】dblink插件的DBLINK_COPY_TABLE不能返回拷贝的行数,增加返回的行数。】http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088867115 --- contrib/dblink/dblink--1.2.sql | 2 +- contrib/dblink/dblink.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/contrib/dblink/dblink--1.2.sql b/contrib/dblink/dblink--1.2.sql index fabe10fc..5def7be5 100644 --- a/contrib/dblink/dblink--1.2.sql +++ b/contrib/dblink/dblink--1.2.sql @@ -179,7 +179,7 @@ AS 'MODULE_PATHNAME', 'dblink_is_busy' LANGUAGE C STRICT PARALLEL RESTRICTED; CREATE FUNCTION dblink_copy_table(text, text, text, text, text) -RETURNS int4 +RETURNS int8 AS 'MODULE_PATHNAME', 'dblink_copy_table' LANGUAGE C STRICT PARALLEL RESTRICTED; diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index fb1c99ce..82f0e05d 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -912,15 +912,16 @@ static bool isRemoteTableAsSelect(char * rtblname) * then local server will use COPY FROM statement to copy data into table * directly. 
*/ -static void +static uint64 copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, char *connstr) { - bool freeconn = false; - char *conname = connstr; - PGconn *conn = NULL; - ParseState *pstate = NULL; - Relation rel = NULL; + bool freeconn = false; + char *conname = connstr; + PGconn *conn = NULL; + ParseState *pstate; + Relation rel; + uint64 processed = 0; dblink_init(); @@ -967,7 +968,8 @@ copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, cstate = BeginCopyFrom(pstate, rel, NULL, false, copy_read_data, NULL, NIL); - (void) CopyFrom(cstate); + processed = CopyFrom(cstate); + EndCopyFrom(cstate); relation_close(rel, RowExclusiveLock); @@ -989,12 +991,15 @@ copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, tmp_cbuf = NULL; if (freeconn) PQfinish(conn); + + return processed; } PG_FUNCTION_INFO_V1(dblink_copy_table); Datum dblink_copy_table(PG_FUNCTION_ARGS) { + uint64 processed = 0; char *nspname; char *tblname; char *rnspname; @@ -1011,9 +1016,9 @@ dblink_copy_table(PG_FUNCTION_ARGS) rtblname = text_to_cstring(PG_GETARG_TEXT_PP(3)); connstr = text_to_cstring(PG_GETARG_TEXT_PP(4)); - copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); + processed = copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); - return (Datum) 0; + PG_RETURN_INT64((int64)processed); } /* From 4be1335dbd3ed0a9e51821fd5aa5453cb4d3c035 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 22 Jun 2021 21:01:17 +0800 Subject: [PATCH 395/578] fix gtm coredump due to LogCollector free http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652089017999 (merge request !416) Squash merge branch 'sigmalin001' into 'Tbase_v2.15.20' * fix gtm coredump due to LogCollector free http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652089017999 --- src/gtm/main/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 93fc1901..43a9424a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -2767,7 +2767,6 @@ GTM_ThreadLogCollector(void *argp) GTM_ProcessLogCollection(); } - GTM_DeInitLogCollector(); elog(LOG, "GTM is shutting down, log collector exits!"); return my_threadinfo; } From 305560a00321c5ef7ff5119d1c4a549a143b8dfd Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 24 Jun 2021 10:00:50 +0800 Subject: [PATCH 396/578] fix the crypt table problems (merge request !419) * add function to clean the invalid elem in rel_crypt_hash table * drop rel will delete elem in rel crypt hash, tapd http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085590771 --- contrib/tbase_mls/tbase_mls.c | 47 +++++ src/backend/access/rmgrdesc/relcryptdesc.c | 8 +- src/backend/access/transam/twophase.c | 7 + src/backend/access/transam/xact.c | 13 ++ src/backend/catalog/storage.c | 9 + src/backend/storage/freespace/emapage.c | 7 + src/backend/storage/freespace/extent_xlog.c | 7 + src/backend/utils/cache/relcryptmap.c | 193 +++++++++++++++++++- src/include/utils/relcryptmap.h | 7 + 9 files changed, 291 insertions(+), 7 deletions(-) create mode 100644 contrib/tbase_mls/tbase_mls.c diff --git a/contrib/tbase_mls/tbase_mls.c b/contrib/tbase_mls/tbase_mls.c new file mode 100644 index 00000000..e2d53b31 --- /dev/null +++ b/contrib/tbase_mls/tbase_mls.c @@ -0,0 +1,47 @@ +#include "postgres.h" + +#include "catalog/catalog.h" +#include "catalog/storage.h" +#include "miscadmin.h" +#include "fmgr.h" +#include "postmaster/bgwriter.h" + +#include "storage/bufmgr.h" +#include "utils/relcrypt.h" 
+#include "utils/relcryptmap.h" +#include "utils/mls.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(pg_rel_crypt_hash_clean); + +/* + * Add function to clean the rel_crypt_hash table invalid elem + */ +Datum pg_rel_crypt_hash_clean(PG_FUNCTION_ARGS) +{ + RelCryptEntry *relcrypt; + List *mark_delete = NIL; + ListCell * lc; + + if (!is_mls_user()) + { + elog(ERROR, "execute by mls user please"); + } + + /* set to flush rel crypt map */ + RequestFlushRelcryptMap(); + + /* make rel crypt map for a backup file */ + rel_crypt_write_mapfile(true); + + mark_delete = MarkRelCryptInvalid(); + /* delete the elem one by one */ + foreach(lc, mark_delete) + { + relcrypt = (RelCryptEntry *) lfirst(lc); + rel_crypt_hash_delete(&(relcrypt->relfilenode), true); + } + + PG_RETURN_BOOL(true); +} diff --git a/src/backend/access/rmgrdesc/relcryptdesc.c b/src/backend/access/rmgrdesc/relcryptdesc.c index 09082689..2b05fe81 100644 --- a/src/backend/access/rmgrdesc/relcryptdesc.c +++ b/src/backend/access/rmgrdesc/relcryptdesc.c @@ -94,7 +94,6 @@ void rel_crypt_desc(StringInfo buf, XLogReaderState *record) xlrec->algo_id, xlrec->option, xlrec->keysize); break; } - break; case XLOG_CRYPT_KEY_DELETE: appendStringInfo(buf, "xlog type is comming, info:%u", XLOG_CRYPT_KEY_DELETE); break; @@ -107,8 +106,13 @@ void rel_crypt_desc(StringInfo buf, XLogReaderState *record) break; } case XLOG_REL_CRYPT_DELETE: - appendStringInfo(buf, "xlog type is comming, info:%u", XLOG_REL_CRYPT_DELETE); + { + xl_rel_crypt_delete *xlrec; + xlrec = (xl_rel_crypt_delete *) XLogRecGetData(record); + appendStringInfo(buf, "rel crypt delete, database:%u tablespace:%u relnode:%u, algo_id:%d", + xlrec->rnode.dbNode, xlrec->rnode.spcNode, xlrec->rnode.relNode, xlrec->algo_id); break; + } default: Assert(0); break; diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 61bc6b50..a9078bda 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -120,6 +120,7 @@ #ifdef __TBASE__ #include "access/gtm.h" #include "utils/timeout.h" +#include "utils/relcryptmap.h" #endif #include "pgxc/execRemote.h" @@ -2080,6 +2081,12 @@ FinishPreparedTransaction(const char *gid, bool isCommit) SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); smgrdounlink(srel, false); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), true); +#endif smgrclose(srel); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b6881ece..3e59c7f4 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -99,6 +99,7 @@ #include "postmaster/postmaster.h" #include "commands/extension.h" #include "tcop/utility.h" +#include "utils/relcryptmap.h" #endif /* * User-tweakable parameters @@ -7412,6 +7413,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, for (fork = 0; fork <= MAX_FORKNUM; fork++) XLogDropRelation(parsed->xnodes[i], fork); smgrdounlink(srel, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), false); +#endif smgrclose(srel); } } @@ -7537,6 +7544,12 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) for (fork = 0; fork <= MAX_FORKNUM; fork++) XLogDropRelation(parsed->xnodes[i], fork); smgrdounlink(srel, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + 
remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), false); +#endif smgrclose(srel); } } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index a1396e62..b9136469 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -97,6 +97,7 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/relcryptmap.h" /* * We keep a list of all relations (represented as RelFileNode values) @@ -466,7 +467,15 @@ smgrDoPendingDeletes(bool isCommit) smgrdounlinkall(srels, nrels, false); for (i = 0; i < nrels; i++) + { +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srels[i]->smgr_relcrypt), true); +#endif smgrclose(srels[i]); + } pfree(srels); } diff --git a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index 063d08f3..ade86acd 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -100,6 +100,7 @@ #include "utils/lsyscache.h" #include "funcapi.h" #include "lib/stringinfo.h" +#include "utils/relcryptmap.h" #define ExtentAssertEMEIsFree(eme) ExtentAssert((eme).is_occupied == 0) #define ExtentAssertEMEIsOccup(eme) ExtentAssert((eme).is_occupied == 1) @@ -7078,6 +7079,12 @@ RebuildExtentMap(Relation rel) //TODO: write xlog for truncate extent file RelationOpenSmgr(rel); smgrdounlinkfork(rel->rd_smgr, EXTENT_FORKNUM, false); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(rel->rd_smgr->smgr_relcrypt), true); +#endif RelationCloseSmgr(rel); INIT_EXLOG_TRUNCATE(&xlrec); diff --git a/src/backend/storage/freespace/extent_xlog.c b/src/backend/storage/freespace/extent_xlog.c index b08f3f64..1f51a047 100644 --- a/src/backend/storage/freespace/extent_xlog.c +++ b/src/backend/storage/freespace/extent_xlog.c @@ -83,6 +83,7 @@ #include "storage/extentmapping.h" #include "storage/extent_xlog.h" #include "storage/smgr.h" +#include "utils/relcryptmap.h" static void extent_xlog_apply_record(XLogReaderState *record); static void extent_xlog_apply_truncate(XLogReaderState *record); @@ -364,6 +365,12 @@ extent_xlog_apply_truncate(XLogReaderState *record) SMgrRelation reln; reln = smgropen(xlrec->rnode, InvalidBackendId); smgrdounlinkfork(reln, EXTENT_FORKNUM, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(reln->smgr_relcrypt), false); +#endif smgrclose(reln); } diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index fd16e5fb..69fc1a06 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -113,6 +113,9 @@ #include "utils/relcryptmisc.h" #include "storage/relcryptstorage.h" #include "utils/relcryptmap.h" +#include "catalog/indexing.h" +#include "utils/fmgroids.h" +#include "utils/relfilenodemap.h" #ifdef _MLS_ #include "utils/mls_extension.h" @@ -1212,8 +1215,18 @@ void rel_crypt_redo(XLogReaderState *record) break; } case XLOG_REL_CRYPT_DELETE: - elog(ERROR, "xlog type is comming, info:%u", XLOG_REL_CRYPT_DELETE); + { + xl_rel_crypt_delete *xlrec; + xlrec = (xl_rel_crypt_delete *) XLogRecGetData(record); + if (g_enable_crypt_debug) + { + elog(LOG, "REL_CRYPT_DELETE, redo XLOG_REL_CRYPT_DELETE, relfilenode:%d:%d:%d, algo_id:%d", + xlrec->rnode.dbNode, xlrec->rnode.spcNode, xlrec->rnode.relNode, + xlrec->algo_id); + } + rel_crypt_hash_delete(&(xlrec->rnode), false); break; + } 
default: elog(ERROR, "recrypt redo, unknown info, info:%u", info & XLR_RMGR_INFO_MASK); break; @@ -1275,6 +1288,89 @@ static int rel_crypt_hash_key_cmp (const void *key1, const void *key2, Size keys return 1; } +/* + * this function is used to remove hash elem + * + * if write_wal is true, remove action will write wal + */ +void remove_rel_crypt_hash_elem(RelCrypt relCrypt, bool write_wal) +{ + if (relCrypt != NULL) + { + /* + * if the algo_id is invalid, skip + */ + if (relCrypt->algo_id == TRANSP_CRYPT_INVALID_ALGORITHM_ID) + { + return; + } + /* + * do remove the rnode and algo_id map in rel_crypt_hash table + */ + rel_crypt_hash_delete(&(relCrypt->relfilenode), write_wal); + } +} + +/* + * do delete rel crypt hash elem about a rnode + */ +void rel_crypt_hash_delete(RelFileNode *rnode, bool write_wal) +{ + RelCrypt relCrypt; + bool found = false; + + uint32 hashcode; + int partitionno; + LWLock *partitionLock; + + hashcode = rel_crypt_hash_code(rnode); + partitionno = rel_crypt_hash_partition(hashcode); + partitionLock = rel_crypt_get_partition_lock(partitionno); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + relCrypt = (RelCrypt) hash_search_with_hash_value(g_rel_crypt_hash, + (void *) rnode, + hashcode, + HASH_REMOVE, + &found); + + if (found) + { + /* + * need to flush crypt map in next checkpoint + */ + RequestFlushRelcryptMap(); + } + + /* + * Critical section + */ + if (found && write_wal) + { + xl_rel_crypt_delete xlrec; + XLogRecPtr lsn; + + /* now errors are fatal ... */ + START_CRIT_SECTION(); + + xlrec.rnode = relCrypt->relfilenode; + xlrec.algo_id = relCrypt->algo_id; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xl_rel_crypt_delete)); + + lsn = XLogInsert(RM_REL_CRYPT_ID, XLOG_REL_CRYPT_DELETE); + + /* As always, WAL must hit the disk before the data update does */ + XLogFlush(lsn); + + END_CRIT_SECTION(); + } + + LWLockRelease(partitionLock); +} + void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, bool in_building_procedure) {// #lizard forgives RelCrypt relcrypt; @@ -1663,8 +1759,13 @@ static void rel_crypt_write_mapfile_post(RelCryptMapFile *map, int element_cnt, return; } -static void rel_crypt_write_mapfile(void) -{// #lizard forgives +/* + * if is_backup is true, it means to backup the pg_rel_crypt.map + * to pg_rel_crypt.map.backup, if is_backup is false, it means + * flush the data to disk + */ +void rel_crypt_write_mapfile(bool is_backup) +{ int loop; int lock_loop; char *mapfilename; @@ -1685,7 +1786,18 @@ static void rel_crypt_write_mapfile(void) mapfilename = palloc0(MAXPGPATH); mapfilename_new = palloc0(MAXPGPATH); + /* + * if backup the file, the filename will be renamed as pg_rel_crypt.map.backup + * else the file named as pg_rel_crypt.map + */ + if (is_backup) + { + snprintf(mapfilename, MAXPGPATH, "%s/%s.backup", "global", REL_CRYPT_MAP_FILENAME); + } + else + { snprintf(mapfilename, MAXPGPATH, "%s/%s", "global", REL_CRYPT_MAP_FILENAME); + } snprintf(mapfilename_new, MAXPGPATH, "%s/%s.%d", "global", REL_CRYPT_MAP_FILENAME, MyProcPid); buffile = BufFileOpen(mapfilename_new, (O_WRONLY|O_CREAT|PG_BINARY), (S_IRUSR|S_IWUSR), true, ERROR); @@ -2016,13 +2128,80 @@ Datum pg_crypt_key_hash_dump(PG_FUNCTION_ARGS) return (Datum) 0; } +/* + * Check the relfilenode exist + */ +bool CheckRelFileNodeExists(RelFileNode *rnode) +{ + Oid relid; + + if (rnode != NULL) + { + relid = RelidByRelfilenode(rnode->spcNode, rnode->relNode); + + if (OidIsValid(relid)) + { + return true; + } + } + + return false; +} + +/* + * 
mark the invalid elem in g_rel_crypt_hash to delete + */ +List * MarkRelCryptInvalid(void) +{ + List * result = NIL; + HASH_SEQ_STATUS status; + int lock_loop = 0; + RelCryptEntry *relcrypt; + bool is_exist = false; + + /* lock all partition lock */ + for (lock_loop = 0; lock_loop < REL_CRYPT_HASHTABLE_NUM_PARTITIONS; lock_loop++) + { + LWLockAcquire(rel_crypt_get_partition_lock(lock_loop), LW_SHARED); + } + + hash_seq_init(&status, g_rel_crypt_hash); + while ((relcrypt = (RelCryptEntry *) hash_seq_search(&status)) != NULL) + { + /* only deal with current database */ + if (relcrypt->relfilenode.dbNode != MyDatabaseId) + { + continue; + } + + is_exist = CheckRelFileNodeExists(&(relcrypt->relfilenode)); + if (!is_exist) + { + elog(DEBUG5, "check relfilenode exist, dbNode:%d, spcNode:%d, relNode:%d", + relcrypt->relfilenode.dbNode, relcrypt->relfilenode.spcNode, relcrypt->relfilenode.relNode); + result = lappend(result, relcrypt); + } + } + + /* release all */ + for (lock_loop = REL_CRYPT_HASHTABLE_NUM_PARTITIONS - 1; lock_loop >= 0; lock_loop--) + { + LWLockRelease(rel_crypt_get_partition_lock(lock_loop)); + } + + return result; +} + +/* + * do checkpoint to flush crypt map file to disk + */ void CheckPointRelCrypt(void) { if (g_enable_crypt_debug) { elog(LOG, "CheckPointRelCrypt check to flush crypt mapfile BEGIN"); } - rel_crypt_write_mapfile(); + rel_crypt_write_mapfile(false); crypt_key_info_write_mapfile(); if (g_enable_crypt_debug) { @@ -2031,13 +2210,16 @@ void CheckPointRelCrypt(void) return; } +/* + * if system in startup state, need to flush crypt map file + */ void StartupReachConsistentState(void) { if (g_enable_crypt_debug) { elog(LOG, "StartupReachConsistentState check to flush crypt mapfile BEGIN"); } - rel_crypt_write_mapfile(); + rel_crypt_write_mapfile(false); crypt_key_info_write_mapfile(); if (g_enable_crypt_debug) { @@ -2046,5 +2228,6 @@ void StartupReachConsistentState(void) return; } + #endif diff --git a/src/include/utils/relcryptmap.h b/src/include/utils/relcryptmap.h index 290ef886..ff434bbe 100644 --- a/src/include/utils/relcryptmap.h +++ b/src/include/utils/relcryptmap.h @@ -95,6 +95,8 @@ typedef struct xl_rel_crypt_insert int algo_id; } xl_rel_crypt_insert; +typedef xl_rel_crypt_insert xl_rel_crypt_delete; + extern void rel_crypt_redo(XLogReaderState *record); extern void rel_crypt_desc(StringInfo buf, XLogReaderState *record); extern const char * rel_crypt_identify(uint8 info); @@ -112,6 +114,11 @@ extern void crypt_key_info_load_default_key(void); extern void crypt_key_info_free(CryptKeyInfo cryptkey); extern CryptKeyInfo crypt_key_info_alloc(int option); extern void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, bool in_building_procedure); +extern void remove_rel_crypt_hash_elem(RelCrypt relCrypt, bool write_wal); +extern void rel_crypt_hash_delete(RelFileNode * rnode, bool write_wal); extern void crypt_key_info_hash_insert(CryptKeyInfo cryptkey_input, bool write_wal, bool in_building_procedure); extern int crypt_key_info_cal_key_size(CryptKeyInfo cryptkey); +extern bool CheckRelFileNodeExists(RelFileNode *rnode); +extern List* MarkRelCryptInvalid(void); +extern void rel_crypt_write_mapfile(bool is_backup); #endif /* RELCRYPT_MAP_H */ From 07353c592a502b44e3492aceba77cf6a39081b1a Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 23 Apr 2021 15:05:40 +0800 Subject: [PATCH 397/578] Fix explain of INSERT INTO part table --- src/backend/commands/explain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 2f7ea8e7..d08ccaa8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1956,7 +1956,10 @@ ExplainNode(PlanState *planstate, List *ancestors, { case T_ModifyTable: #ifdef __TBASE__ - if(((ModifyTable *) plan)->haspartparent) + /* compatible with make_modifytable */ + if (((ModifyTable *) plan)->haspartparent && + (((ModifyTable *) plan)->operation == CMD_UPDATE || + ((ModifyTable *) plan)->operation == CMD_DELETE)) { ExplainMemberNodes(((ModifyTable *) plan)->partplans, ((ModifyTableState *) planstate)->partplans, From 1f2eab940b8e4642d279ce1445b842cef03db038 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 22 Jun 2021 16:52:11 +0800 Subject: [PATCH 398/578] Give DN a proper session id tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131865528291 --- src/backend/pgxc/pool/pgxcnode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5424a200..7f4a9171 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -394,6 +394,10 @@ InitMultinodeExecutor(bool is_force) MemoryContextSwitchTo(oldcontext); PGXCSessionId[0] = '\0'; + if (IsConnFromApp()) + { + sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); + } if (IS_PGXC_COORDINATOR) { @@ -403,8 +407,6 @@ InitMultinodeExecutor(bool is_force) get_pgxc_nodename(co_handles[count].nodeoid)) == 0) PGXCNodeId = count + 1; } - - sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); } else /* DataNode */ { From 7787433330424817b40cba7488e95f0cc7c54d00 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 24 Jun 2021 17:41:16 +0800 Subject: [PATCH 399/578] bugfix: get error after dn switch when persistent_datanode_connections is on (merge request !418) http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088924419 --- src/backend/pgxc/pool/pgxcnode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7f4a9171..dd034e94 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5487,6 +5487,11 @@ PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter) int nid; PGXCNodeHandle *handle = NULL; + if (PersistentConnections && nodes_alter != NIL) + { + release_handles(true); + } + foreach(lc, nodes_alter) { char ntype = PGXC_NODE_NONE; From b7aac6119f15ea03c017cf6a1557f88c395ee365 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 25 Jun 2021 13:59:37 +0800 Subject: [PATCH 400/578] set enable_parallel_ddl on --- src/backend/utils/misc/guc.c | 2 +- src/test/regress/expected/sysviews.out | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 378f9c7a..42619d16 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2753,7 +2753,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_parallel_ddl, - false, + true, NULL, NULL, NULL }, { diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index e13fdd2a..58642165 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -121,7 +121,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off - enable_parallel_ddl | off + 
enable_parallel_ddl | on enable_partition_wise_join | off enable_pgbouncer | off enable_plpgsql_debug_print | off From 7bc615ae902fb998af22da21501bf795673c521e Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 24 Jun 2021 21:03:15 +0800 Subject: [PATCH 401/578] Allocate a page with enough free space when doing CLUSTER copy tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131089169139 --- src/backend/access/heap/rewriteheap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2bb4f98b..0e04a8d3 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -747,7 +747,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) #ifdef _SHARDING_ state->rs_buf = RelationGetBufferForTuple_shard(state->rs_new_rel, HeapTupleGetShardId(tup), - BLCKSZ/2, + len, InvalidBuffer, 0, NULL, From 635b1972668ac73160a5f03add1a06b84859d3cd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 25 Jun 2021 19:51:20 +0800 Subject: [PATCH 402/578] fix seq inconsistency when rename database http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087148145 (merge request !429) --- src/gtm/main/gtm_store.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 43a1c9ab..858c636f 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -4302,7 +4302,7 @@ GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, GTMStorageHandle bucket_handle = INVALID_STORAGE_HANDLE; GTM_StoredSeqInfo *seq_info = NULL; bool ret = false; - + Assert(seq_database_key->gsk_keylen <= SEQ_KEY_MAX_LENGTH); if (enable_gtm_sequence_debug) { @@ -4326,7 +4326,8 @@ GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, { seq_info = GetSeqStore(bucket_handle); - if(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) != 0) + if(!(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) == 0 && + seq_info->gs_key.gsk_key[seq_database_key->gsk_keylen - 1] == '.')) { bucket_handle = seq_info->gs_next; continue; From eb25d53f3d8d2a07e1ada46548582ddb8b0bf5d6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 1 Jul 2021 10:50:28 +0800 Subject: [PATCH 403/578] bugfix: prepare regress failed (merge request !440) (cherry picked from commit 817da2ec) 6122f119 bugfix: prepare regress failed --- src/backend/pgxc/pool/execRemote.c | 41 +++++++++++++++++++++++++----- src/backend/pgxc/pool/pgxcnode.c | 6 ----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d5f96393..7e2ad873 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3673,12 +3673,6 @@ pgxc_node_remote_cleanup_all(void) return; } - /* Do not cleanup connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - /* * Send down snapshot followed by DISCARD ALL command. 
*/ @@ -4679,9 +4673,16 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); @@ -4749,10 +4750,17 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } } + } clear_handles(); } @@ -4980,10 +4988,17 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } } + } clear_handles(); #endif @@ -5826,9 +5841,16 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_cleanup_all(); if (need_release_handle) { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(handles); @@ -8763,9 +8785,16 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); pfree(finish_cmd); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index dd034e94..5b36f087 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1554,12 +1554,6 @@ reset_handles(void) return; } - /* Do not reset connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - /* Reset Datanodes handles occupied memory */ for (i = 0; i < NumDataNodes; i++) { From a4a2e7b7d328b3ea866541edf0266baf7b4f6692 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 1 Jul 2021 16:42:12 +0800 Subject: [PATCH 404/578] Not reset global session info when subtrans end http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696089391431&url_cache_key=99b4551652ae6634ef20bbffc9885096 --- src/backend/pgxc/pool/execRemote.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 7e2ad873..492a3ac8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3649,13 +3649,17 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * specific stuff before releasing them to pool for reuse by other sessions. */ static void -pgxc_node_remote_cleanup_all(void) -{// #lizard forgives +pgxc_node_remote_cleanup_all(bool sub) +{ PGXCNodeAllHandles *handles = get_current_handles(); PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; int new_conn_count = 0; int i; - char *resetcmd = "RESET ALL;" + /* if it's called by sub-commit or sub-abort, DO NOT reset global_session */ + char *resetcmd = sub ? 
"RESET ALL;" + "RESET SESSION AUTHORIZATION;" + "RESET transaction_isolation;" : + "RESET ALL;" "RESET SESSION AUTHORIZATION;" "RESET transaction_isolation;" "RESET global_session"; @@ -4665,7 +4669,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(false); if (PersistentConnections) { @@ -4737,10 +4741,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); + /* do not cleanup remote session for subtrans */ if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { @@ -5838,7 +5844,8 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { if (HaveActiveDatanodeStatements()) @@ -8778,7 +8785,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(false); if (PersistentConnections) { reset_handles(); From 32340e531ee717eecd253cfe904e21212c790148 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 1 Jul 2021 21:09:17 +0800 Subject: [PATCH 405/578] fix syslogger coredump in process_pipe_input http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131089442025 --- src/gtm/common/elog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 597a252e..0bc8e0b9 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -778,7 +778,7 @@ write_pipe_chunks(char *data, int len, int dest) Assert(len > 0); p.proto.nuls[0] = p.proto.nuls[1] = '\0'; - p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) MyThreadID : 0; + p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) (ThreadId + 1) : 0; /* write all but the last chunk */ while (len > PIPE_MAX_PAYLOAD) From a749d0b7175bfd320ce16f0c8b9a33b8ec47a66a Mon Sep 17 00:00:00 2001 From: winter Date: Mon, 5 Jul 2021 14:23:24 +0800 Subject: [PATCH 406/578] fix 'could not read block xxxxx ... ' issue after master-slave switch --- src/backend/access/heap/hio.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index e65127f4..78e63235 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -182,7 +182,7 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, * amount which ramps up as the degree of contention ramps up, but limiting * the result to some sane overall value. 
*/ -static Buffer +static void RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) {// #lizard forgives Page page; @@ -194,7 +194,6 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) Buffer buffer; #ifdef _SHARDING_ - Buffer firstBuffer = InvalidBuffer; if(RelationHasExtent(relation) && !ShardIDIsValid(sid)) { elog(ERROR, "extent-organized relation must extend with shardid."); @@ -275,19 +274,12 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) if(bistate) bistate->sid = sid; #endif + UnlockReleaseBuffer(buffer); /* Remember first block number thus added. */ if (firstBlock == InvalidBlockNumber) - { firstBlock = blockNum; - firstBuffer = buffer; - } -#ifdef _SHARDING_ - else - { - UnlockReleaseBuffer(buffer); - } -#endif + /* * Immediately update the bottom level of the FSM. This has a good * chance of making this page visible to other concurrently inserting @@ -308,7 +300,6 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) */ UpdateFreeSpaceMap(relation, firstBlock, blockNum, freespace); - return firstBuffer; } #ifdef _SHARDING_ @@ -1056,23 +1047,12 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, UnlockRelationForExtension(relation, ExclusiveLock); goto loop; } + RelationAddExtraBlocks(relation, bistate, + RelationHasExtent(relation) ? sid : InvalidShardID); } } -#ifdef _SHARDING_ - /* - * We can be certain that locking the otherBuffer first is OK, since it - * must have a lower page number. - */ - if (otherBuffer != InvalidBuffer) - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - /* Time to bulk-extend. */ - buffer = RelationAddExtraBlocks(relation, bistate, - RelationHasExtent(relation) ? sid : InvalidShardID); -#endif - -#if 0 /* * In addition to whatever extension we performed above, we always add at * least one block to satisfy our own request. @@ -1095,7 +1075,6 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, * Now acquire lock on the new page. 
*/ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); -#endif /* * Release the file-extension lock; it's now OK for someone else to extend @@ -1117,14 +1096,12 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, page = BufferGetPage(buffer); -#if 0 if (!PageIsNew(page)) elog(ERROR, "page %u of relation \"%s\" should be empty but is not", BufferGetBlockNumber(buffer), RelationGetRelationName(relation)); - PageInit(page, BufferGetPageSize(buffer), 0, sid); -#endif + PageInit_shard(page, BufferGetPageSize(buffer), 0, sid, false); if (len > PageGetHeapFreeSpace(page)) { From 751cfdb1c3685339ff3d77f75b5acdc0ca48e64f Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 13 Jul 2021 17:18:50 +0800 Subject: [PATCH 407/578] fix rename db cause gtm metadata inconsistent http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087148145 --- src/backend/access/transam/gtm.c | 2 +- src/gtm/main/gtm_seq.c | 4 ++-- src/test/regress/expected/sequence.out | 19 +++++++++++++++++++ src/test/regress/sql/sequence.sql | 20 ++++++++++++++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 1ac9e6d2..5f95e859 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -118,7 +118,7 @@ void RegisterSeqCreate(char *name, int32 type) old_cxt = MemoryContextSwitchTo(TopMemoryContext); key = (GTM_SequenceKeyData*)palloc(sizeof(GTM_SequenceKeyData)); - key->gsk_keylen = strlen(name); + key->gsk_keylen = strlen(name) + 1; key->gsk_key = pstrdup(name); key->gsk_type = type; g_CreateSeqList = lappend(g_CreateSeqList, key); diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index c9bb59c8..9abe9580 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -3538,11 +3538,11 @@ ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) GTM_StoreGetSeqKey(handles[i], old_key); temp_seqkey.gsk_key = old_key; - temp_seqkey.gsk_keylen = strnlen(old_key, SEQ_KEY_MAX_LENGTH); + temp_seqkey.gsk_keylen = strnlen(old_key, SEQ_KEY_MAX_LENGTH) + 1; snprintf(new_key, SEQ_KEY_MAX_LENGTH, "%s%s", newseqkey.gsk_key, old_key + strnlen(seqkey.gsk_key, SEQ_KEY_MAX_LENGTH)); temp_newseqkey.gsk_key = new_key; - temp_newseqkey.gsk_keylen = strnlen(new_key, SEQ_KEY_MAX_LENGTH); + temp_newseqkey.gsk_keylen = strnlen(new_key, SEQ_KEY_MAX_LENGTH) + 1; if ((errcode = GTM_SeqRename(&temp_seqkey, &temp_newseqkey, gxid))) { ereport(ERROR, diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 10c76ead..2eae7bde 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -1,6 +1,13 @@ -- -- CREATE SEQUENCE -- +CREATE DATABASE db_seq1; +CREATE DATABASE db_seq2; +\c db_seq1 +CREATE SEQUENCE my_seq; +\c db_seq2 +CREATE SEQUENCE my_seq; +\c regression -- various error cases CREATE UNLOGGED SEQUENCE sequence_testx; ERROR: unlogged sequences are not supported @@ -833,3 +840,15 @@ SELECT nextval('test_seq1'); (1 row) DROP SEQUENCE test_seq1; +-- Test sequece when alter database +ALTER DATABASE db_seq1 RENAME TO db_seq3; +ALTER DATABASE db_seq2 RENAME TO db_seq1; +\c db_seq1 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\c db_seq3 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\q \ No newline at end of file diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 3ca98bb3..fda62262 100644 --- 
a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -2,6 +2,14 @@ -- CREATE SEQUENCE -- +CREATE DATABASE db_seq1; +CREATE DATABASE db_seq2; +\c db_seq1 +CREATE SEQUENCE my_seq; +\c db_seq2 +CREATE SEQUENCE my_seq; +\c regression + -- various error cases CREATE UNLOGGED SEQUENCE sequence_testx; CREATE SEQUENCE sequence_testx INCREMENT BY 0; @@ -414,3 +422,15 @@ SELECT nextval('test_seq1'); SELECT nextval('test_seq1'); DROP SEQUENCE test_seq1; +-- Test sequece when alter database +ALTER DATABASE db_seq1 RENAME TO db_seq3; +ALTER DATABASE db_seq2 RENAME TO db_seq1; +\c db_seq1 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\c db_seq3 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\q From 412c4d20f1364e1febc0a8a448514b384bfd1c87 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 21 Jul 2021 17:11:25 +0800 Subject: [PATCH 408/578] fix regress --- src/test/regress/expected/join_3.out | 120 ++++++++---------- .../regress/expected/updatable_views_1.out | 76 ++++------- 2 files changed, 78 insertions(+), 118 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 5b7dfb96..16264c50 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -5776,13 +5776,13 @@ select * from j1 inner join j2 on j1.id = j2.id; -> Nested Loop Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j2.id = j1.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j2.id = j1.id) (14 rows) -- ensure join is not unique when not an equi-join @@ -5795,17 +5795,15 @@ select * from j1 inner join j2 on j1.id > j2.id; Join Filter: (j1.id > j2.id) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j1.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Index Scan on j1_pkey -> Materialize Output: j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j2.id - -> Bitmap Heap Scan on public.j2 + -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Index Scan on j2_pkey -(15 rows) +(13 rows) -- ensure non-unique rel is not chosen as inner explain (verbose, costs off) @@ -5819,13 +5817,13 @@ select * from j1 inner join j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Materialize + Output: j1.id + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) (14 rows) -- ensure left join is marked as unique @@ -5840,13 +5838,13 @@ select * from j1 left join j2 on j1.id = j2.id; -> Nested Loop Left Join Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j1.id = j2.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j1.id = j2.id) (14 rows) -- ensure right join is marked as unique @@ -5859,13 +5857,13 @@ select * from j1 right join j2 on j1.id = j2.id; -> Nested Loop Left Join Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Heap Scan on public.j1 + -> Materialize + Output: 
j1.id + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j2.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j2.id) (12 rows) -- ensure full join is marked as unique @@ -5898,17 +5896,15 @@ select * from j1 cross join j2; Output: j1.id, j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j1.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Index Scan on j1_pkey -> Materialize Output: j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j2.id - -> Bitmap Heap Scan on public.j2 + -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Index Scan on j2_pkey -(14 rows) +(12 rows) -- ensure a natural join is marked as unique explain (verbose, costs off) @@ -5922,13 +5918,13 @@ select * from j1 natural join j2; -> Nested Loop Output: j1.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j2.id = j1.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j2.id = j1.id) (14 rows) -- ensure a distinct clause allows the inner to become unique @@ -5942,6 +5938,7 @@ inner join (select distinct id from j3) j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Unique Output: j3.id -> Sort @@ -5949,12 +5946,9 @@ inner join (select distinct id from j3) j3 on j1.id = j3.id; Sort Key: j3.id -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) -(17 rows) +(15 rows) -- ensure group by clause allows the inner to become unique explain (verbose, costs off) @@ -5967,6 +5961,7 @@ inner join (select id from j3 group by id) j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Group Output: j3.id Group Key: j3.id @@ -5975,12 +5970,9 @@ inner join (select id from j3 group by id) j3 on j1.id = j3.id; Sort Key: j3.id -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) -(18 rows) +(16 rows) drop table j1; drop table j2; @@ -6009,7 +6001,7 @@ inner join j2 on j1.id1 = j2.id1; -> Nested Loop Output: j1.id1, j1.id2, j2.id1, j2.id2 Join Filter: (j1.id1 = j2.id1) - -> Index Only Scan using j2_pkey on public.j2 + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 -> Seq Scan on public.j1 Output: j1.id1, j1.id2 @@ -6028,14 +6020,12 @@ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2; -> Nested Loop Output: j1.id1, j1.id2, j2.id1, j2.id2 Inner Unique: true - -> Index Only Scan using j2_pkey on public.j2 + Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) - -> Bitmap Index Scan on j1_pkey - Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) -(14 rows) +(12 rows) -- ensure we don't detect the join to be unique when quals are not part of the -- join condition @@ -6050,15 +6040,13 @@ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1; Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) -> Nested Loop 
Output: j1.id1, j1.id2, j2.id1, j2.id2 - Inner Unique: true - -> Index Only Scan using j2_pkey on public.j2 - Output: j2.id1, j2.id2 - -> Bitmap Heap Scan on public.j1 + Join Filter: (j1.id1 = j2.id1) + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) - -> Bitmap Index Scan on j1_pkey - Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) -(14 rows) + Filter: (j1.id2 = 1) + -> Seq Scan on public.j2 + Output: j2.id1, j2.id2 +(12 rows) -- as above, but for left joins. explain (verbose, costs off) @@ -6073,14 +6061,12 @@ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1; -> Nested Loop Left Join Output: j1.id1, j1.id2, j2.id1, j2.id2 Join Filter: (j1.id1 = j2.id1) - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: (j1.id2 = 1) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id2 = 1) - -> Index Only Scan using j2_pkey on public.j2 + Filter: (j1.id2 = 1) + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 -(14 rows) +(12 rows) -- validate logic in merge joins which skips mark and restore. -- it should only do this if all quals which were used to detect the unique @@ -6099,13 +6085,11 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; Node/s: datanode_1, datanode_2 -> Nested Loop Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) - -> Bitmap Heap Scan on j1 - Recheck Cond: ((id1 % 1000) = 1) - -> Bitmap Index Scan on j1_id1_idx - -> Bitmap Heap Scan on j1 j2 - Recheck Cond: ((id1 % 1000) = 1) - -> Bitmap Index Scan on j1_id1_idx -(10 rows) + -> Seq Scan on j1 + Filter: ((id1 % 1000) = 1) + -> Seq Scan on j1 j2 + Filter: ((id1 % 1000) = 1) +(8 rows) select * from j1 j1 inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index 4c2bfb95..e13b4537 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -2095,42 +2095,30 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; Update on public.t11 Update on public.t12 Update on public.t111 - -> Bitmap Heap Scan on public.t1 + -> Index Scan using t1_a_idx on public.t1 Output: 100, t1.b, t1.c, t1.ctid, t1.shardid - Recheck Cond: ((t1.a > 5) AND (t1.a < 7)) + Index Cond: ((t1.a > 5) AND (t1.a < 7)) Filter: ((t1.a <> 6) AND (SubPlan 1) AND snoop(t1.a) AND leakproof(t1.a)) - -> Bitmap Index Scan on t1_a_idx - Index Cond: ((t1.a > 5) AND (t1.a < 7)) SubPlan 1 -> Remote Subquery Scan on all (datanode_1) -> Append - -> Bitmap Heap Scan on public.t12 t12_1 - Recheck Cond: (t12_1.a = t1.a) - -> Bitmap Index Scan on t12_a_idx - Index Cond: (t12_1.a = t1.a) - -> Bitmap Heap Scan on public.t111 t111_1 - Recheck Cond: (t111_1.a = t1.a) - -> Bitmap Index Scan on t111_a_idx - Index Cond: (t111_1.a = t1.a) - -> Bitmap Heap Scan on public.t11 + -> Seq Scan on public.t12 t12_1 + Filter: (t12_1.a = t1.a) + -> Seq Scan on public.t111 t111_1 + Filter: (t111_1.a = t1.a) + -> Index Scan using t11_a_idx on public.t11 Output: 100, t11.b, t11.c, t11.d, t11.ctid, t11.shardid - Recheck Cond: ((t11.a > 5) AND (t11.a < 7)) + Index Cond: ((t11.a > 5) AND (t11.a < 7)) Filter: ((t11.a <> 6) AND (SubPlan 1) AND snoop(t11.a) AND leakproof(t11.a)) - -> Bitmap Index Scan on t11_a_idx - Index Cond: ((t11.a > 5) AND (t11.a < 7)) - -> Bitmap Heap Scan on public.t12 + -> Index Scan using t12_a_idx on public.t12 Output: 100, t12.b, t12.c, t12.e, t12.ctid, t12.shardid - Recheck Cond: ((t12.a > 5) AND 
(t12.a < 7)) + Index Cond: ((t12.a > 5) AND (t12.a < 7)) Filter: ((t12.a <> 6) AND (SubPlan 1) AND snoop(t12.a) AND leakproof(t12.a)) - -> Bitmap Index Scan on t12_a_idx - Index Cond: ((t12.a > 5) AND (t12.a < 7)) - -> Bitmap Heap Scan on public.t111 + -> Index Scan using t111_a_idx on public.t111 Output: 100, t111.b, t111.c, t111.d, t111.e, t111.ctid, t111.shardid - Recheck Cond: ((t111.a > 5) AND (t111.a < 7)) + Index Cond: ((t111.a > 5) AND (t111.a < 7)) Filter: ((t111.a <> 6) AND (SubPlan 1) AND snoop(t111.a) AND leakproof(t111.a)) - -> Bitmap Index Scan on t111_a_idx - Index Cond: ((t111.a > 5) AND (t111.a < 7)) -(41 rows) +(29 rows) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 @@ -2153,42 +2141,30 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; Update on public.t11 Update on public.t12 Update on public.t111 - -> Bitmap Heap Scan on public.t1 + -> Index Scan using t1_a_idx on public.t1 Output: (t1.a + 1), t1.b, t1.c, t1.ctid, t1.shardid - Recheck Cond: ((t1.a > 5) AND (t1.a = 8)) + Index Cond: ((t1.a > 5) AND (t1.a = 8)) Filter: ((SubPlan 1) AND snoop(t1.a) AND leakproof(t1.a)) - -> Bitmap Index Scan on t1_a_idx - Index Cond: ((t1.a > 5) AND (t1.a = 8)) SubPlan 1 -> Remote Subquery Scan on all (datanode_1) -> Append - -> Bitmap Heap Scan on public.t12 t12_1 - Recheck Cond: (t12_1.a = t1.a) - -> Bitmap Index Scan on t12_a_idx - Index Cond: (t12_1.a = t1.a) - -> Bitmap Heap Scan on public.t111 t111_1 - Recheck Cond: (t111_1.a = t1.a) - -> Bitmap Index Scan on t111_a_idx - Index Cond: (t111_1.a = t1.a) - -> Bitmap Heap Scan on public.t11 + -> Seq Scan on public.t12 t12_1 + Filter: (t12_1.a = t1.a) + -> Seq Scan on public.t111 t111_1 + Filter: (t111_1.a = t1.a) + -> Index Scan using t11_a_idx on public.t11 Output: (t11.a + 1), t11.b, t11.c, t11.d, t11.ctid, t11.shardid - Recheck Cond: ((t11.a > 5) AND (t11.a = 8)) + Index Cond: ((t11.a > 5) AND (t11.a = 8)) Filter: ((SubPlan 1) AND snoop(t11.a) AND leakproof(t11.a)) - -> Bitmap Index Scan on t11_a_idx - Index Cond: ((t11.a > 5) AND (t11.a = 8)) - -> Bitmap Heap Scan on public.t12 + -> Index Scan using t12_a_idx on public.t12 Output: (t12.a + 1), t12.b, t12.c, t12.e, t12.ctid, t12.shardid - Recheck Cond: ((t12.a > 5) AND (t12.a = 8)) + Index Cond: ((t12.a > 5) AND (t12.a = 8)) Filter: ((SubPlan 1) AND snoop(t12.a) AND leakproof(t12.a)) - -> Bitmap Index Scan on t12_a_idx - Index Cond: ((t12.a > 5) AND (t12.a = 8)) - -> Bitmap Heap Scan on public.t111 + -> Index Scan using t111_a_idx on public.t111 Output: (t111.a + 1), t111.b, t111.c, t111.d, t111.e, t111.ctid, t111.shardid - Recheck Cond: ((t111.a > 5) AND (t111.a = 8)) + Index Cond: ((t111.a > 5) AND (t111.a = 8)) Filter: ((SubPlan 1) AND snoop(t111.a) AND leakproof(t111.a)) - -> Bitmap Index Scan on t111_a_idx - Index Cond: ((t111.a > 5) AND (t111.a = 8)) -(41 rows) +(29 rows) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; From d14091ecfa4572f918c88129779bb736ff8e5edc Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 20 Jul 2021 16:49:10 +0800 Subject: [PATCH 409/578] fix: plan_id should be reduced by 1, reflecting the serial number in the array --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 8a10e344..e9916037 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3912,7 +3912,7 
@@ plantree_walk_initplans(List *plans, foreach(lc, plans) { Plan *splan = (Plan *) list_nth(subplans, - (lfirst_node(SubPlan, lc))->plan_id); + (lfirst_node(SubPlan, lc))->plan_id - 1); if (walker(splan, context)) return true; From 849322202aca767eb70a9c80f9946086960c9d8a Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 29 Jul 2021 17:32:20 +0800 Subject: [PATCH 410/578] Fix duplicate relfilenode with alter command. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131089349403&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/commands/tablecmds.c | 51 ++++++++++++++++++++++++++++---- src/backend/nodes/copyfuncs.c | 1 + src/include/nodes/parsenodes.h | 1 + 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index eb5b2b6b..4a44ba49 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -494,7 +494,7 @@ static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, bool rewrite); static void RebuildConstraintComment(AlteredTableInfo *tab, int pass, Oid objid, Relation rel, char *conname); -static void TryReuseIndex(Oid oldId, IndexStmt *stmt); +static void TryReuseIndex(Relation rel, Oid oldId, IndexStmt *stmt); static void TryReuseForeignKey(Oid oldId, Constraint *con); static void change_owner_fix_column_acls(Oid relationOid, Oid oldOwnerId, Oid newOwnerId); @@ -8715,6 +8715,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, bool check_rights; bool skip_build; bool quiet; + bool save_oldnode; ObjectAddress address; Assert(IsA(stmt, IndexStmt)); @@ -8725,8 +8726,10 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, /* suppress schema rights check when rebuilding existing index */ check_rights = !is_rebuild; + /* if we're resuing an old node */ + save_oldnode = OidIsValid(stmt->oldNode); /* skip index build if phase 3 will do it or we're reusing an old one */ - skip_build = tab->rewrite > 0 || OidIsValid(stmt->oldNode); + skip_build = tab->rewrite > 0 || save_oldnode; /* suppress notices when rebuilding existing index */ quiet = is_rebuild; @@ -8771,6 +8774,10 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, partidxstmt = (IndexStmt *)copyObject((void*)stmt); partidxstmt->relation->relname = GetPartitionName(RelationGetRelid(rel), i, false); partidxstmt->idxname = GetPartitionName(indexOid, i, true); + if (save_oldnode) + { + partidxstmt->oldNode = list_nth_oid(stmt->partsOldNode, i); + } partOid = get_relname_relid(partidxstmt->relation->relname, RelationGetNamespace(rel)); @@ -8789,6 +8796,19 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, false, /* check_not_in_use */ skip_build, /* skip_build */ quiet); /* quiet */ + /* + * If TryReuseIndex() stashed a relfilenode for us, we used it for the new + * index instead of building from scratch. The DROP of the old edition of + * this index will have scheduled the storage for deletion at commit, so + * cancel that pending deletion. + */ + if (save_oldnode) + { + Relation irel = index_open(addr.objectId, NoLock); + + RelationPreserveStorage(irel->rd_node, true); + index_close(irel, NoLock); + } /* Make dependency entries */ myself.classId = RelationRelationId; @@ -8819,7 +8839,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, * this index will have scheduled the storage for deletion at commit, so * cancel that pending deletion. 
*/ - if (OidIsValid(stmt->oldNode)) + if (save_oldnode) { Relation irel = index_open(address.objectId, NoLock); @@ -11912,7 +11932,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, AlterTableCmd *newcmd; if (!rewrite) - TryReuseIndex(oldId, stmt); + TryReuseIndex(rel, oldId, stmt); stmt->reset_default_tblspc = true; /* keep the index's comment */ stmt->idxcomment = GetComment(oldId, RelationRelationId, 0); @@ -11941,7 +11961,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, indoid = get_constraint_index(oldId); if (!rewrite) - TryReuseIndex(indoid, indstmt); + TryReuseIndex(rel, indoid, indstmt); /* keep any comment on the index */ indstmt->idxcomment = GetComment(indoid, RelationRelationId, 0); @@ -12028,7 +12048,7 @@ RebuildConstraintComment(AlteredTableInfo *tab, int pass, Oid objid, * for the real analysis, then mutates the IndexStmt based on that verdict. */ static void -TryReuseIndex(Oid oldId, IndexStmt *stmt) +TryReuseIndex(Relation rel, Oid oldId, IndexStmt *stmt) { if (CheckIndexCompatible(oldId, stmt->accessMethod, @@ -12039,6 +12059,25 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) stmt->oldNode = irel->rd_node.relNode; index_close(irel, NoLock); + + if (RELATION_IS_INTERVAL(rel)) + { + int nParts = 0; + int i = 0; + + nParts = RelationGetNParts(rel); + stmt->partsOldNode = NULL; + + for (i = 0; i < nParts; i++) + { + Relation iprel = index_open(RelationGetPartitionIndex(rel, oldId, i), + NoLock); + + stmt->partsOldNode = lappend_oid(stmt->partsOldNode, + iprel->rd_node.relNode); + index_close(iprel, NoLock); + } + } } } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 9ccd69bd..702eec38 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3753,6 +3753,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_SCALAR_FIELD(reset_default_tblspc); #ifdef __TBASE__ COPY_SCALAR_FIELD(parentIndexOid); + COPY_NODE_FIELD(partsOldNode); #endif return newnode; } diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 5554ee7b..01ab8277 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2903,6 +2903,7 @@ typedef struct IndexStmt #ifdef __TBASE__ /* used for interval partition */ Oid parentIndexOid; + List *partsOldNode; /* like oldNode just for partition tables */ #endif } IndexStmt; From 1e6affb7c02fccae909bd97d22e91a2ecd8c7576 Mon Sep 17 00:00:00 2001 From: challzhang Date: Mon, 2 Aug 2021 15:14:45 +0800 Subject: [PATCH 411/578] Fix tuple does not match the descriptor when executing insert in JDBC --- src/backend/tcop/pquery.c | 8 ++- src/test/regress/expected/insert_1.out | 52 ++++++++------------ src/test/regress/expected/sanity_check_1.out | 1 + src/test/regress/sql/insert.sql | 12 +++++ 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 1a2cb2cc..715d407b 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -188,6 +188,7 @@ ProcessQuery(PlannedStmt *plan, char *completionTag, int instrument) { + int eflags = 0; QueryDesc *queryDesc; /* @@ -206,10 +207,15 @@ ProcessQuery(PlannedStmt *plan, GetActiveSnapshot(), InvalidSnapshot, dest, params, queryEnv, instrument); + if (plan->hasReturning) + { + eflags |= EXEC_FLAG_RETURNING; + } + /* * Call ExecutorStart to prepare the plan for execution */ - ExecutorStart(queryDesc, 0); + ExecutorStart(queryDesc, eflags); /* * Run the plan to completion. 
diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 78de338c..21232dff 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -875,40 +875,30 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; --- Determine whether tables of different groups are allowed to insert. -set default_locator_type to shard; -drop table if exists t2; -NOTICE: table "t2" does not exist, skipping -drop table if exists t2_rep; -NOTICE: table "t2_rep" does not exist, skipping -drop table if exists t2_new; -NOTICE: table "t2_new" does not exist, skipping -create table t2(f1 int,f2 int); -NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. -create table t2_rep(f1 int,f2 int) distribute by replication; -insert into t2_rep values(1,1),(2,2); -insert into t2 select * from t2_rep; -select count(*) from t2_rep; - count -------- - 2 +-- test insert with returning in JDBC +drop table if exists insertwithret; +NOTICE: table "insertwithret" does not exist, skipping +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); +execute p0(1, 'abc', 1); + a +--- + 1 (1 row) -select count(*) from t2; - count -------- - 2 +execute p1(1, 'abc', 1); + a | b +---+----- + 1 | abc (1 row) -create table t2_new as select * from t2_rep; -NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
-select count(*) from t2_new; - count -------- - 2 +execute p2(1, 'abc', 1); + c +--- + 1 (1 row) -drop table t2; -drop table t2_rep; -drop table t2_new; -reset default_locator_type; +execute p3(1, 'abc', 1); diff --git a/src/test/regress/expected/sanity_check_1.out b/src/test/regress/expected/sanity_check_1.out index 8b55f563..dd80648c 100644 --- a/src/test/regress/expected/sanity_check_1.out +++ b/src/test/regress/expected/sanity_check_1.out @@ -60,6 +60,7 @@ inet_tbl|t inhf|f inhx|t insert_tbl|f +insertwithret|f int2_tbl|f int4_tbl|f int8_tbl|f diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index b9b08d55..5591b65e 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -573,3 +573,15 @@ drop table t2; drop table t2_rep; drop table t2_new; reset default_locator_type; +-- test insert with returning in JDBC +drop table if exists insertwithret; +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); + +execute p0(1, 'abc', 1); +execute p1(1, 'abc', 1); +execute p2(1, 'abc', 1); +execute p3(1, 'abc', 1); From 95d9ed0e8bc068c7fbaa747abe3d8cf6240a0b65 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 4 Aug 2021 10:53:55 +0800 Subject: [PATCH 412/578] Make cluster_activity visible in the same time as pg_stat_activity Achieved this by adding a hook in pgstat_report_activity tapd: http://tapd.oa.com/20418349/bugtrace/bugs/view?bug_id=1020418349090364325&url_cache_key=adcff0a8af8b863601a2454d42ec091b --- .../pg_stat_cluster_activity.c | 29 +++++++++++++++++-- src/backend/postmaster/pgstat.c | 5 ++++ src/include/pgstat.h | 2 ++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 0cc836d3..efe74c95 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -96,6 +96,7 @@ static PgClusterStatus *ClusterStatusArray = NULL; static PgClusterStatus *MyCSEntry = NULL; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static pgstat_report_hook_type prev_pgstat_report_hook = NULL; static PortalStart_hook_type prev_PortalStart = NULL; static PortalDrop_hook_type prev_PortalDrop = NULL; static ExecutorStart_hook_type prev_ExecutorStart = NULL; @@ -341,13 +342,34 @@ pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) /* ---------- * pgcs_report_query_activity + * + * Do nothing but set common field, just enable this cluster entry + * to make it visible in the same time as pg_stat_activity. Hooked + * in pgstat_report_activity, args are redundant. + */ +static void +pgcs_report_query_activity(BackendState state, const char *cmd_str) +{ + volatile PgClusterStatus *entry; + + pgcs_entry_initialize(); + entry = MyCSEntry; + + pgcs_report_common((PgClusterStatus *) entry, NULL); + + if (prev_pgstat_report_hook) + prev_pgstat_report_hook(state, cmd_str); +} + +/* ---------- + * pgcs_report_executor_activity * * Report fileds of per-query referred, hooked as ExecutorStart_hook * report planstate, cursors and common fields. 
* ---------- */ static void -pgcs_report_query_activity(QueryDesc *desc, int eflags) +pgcs_report_executor_activity(QueryDesc *desc, int eflags) { volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; @@ -1076,12 +1098,14 @@ _PG_init(void) */ prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = pgcs_shmem_startup; + prev_pgstat_report_hook = pgstat_report_hook; + pgstat_report_hook = pgcs_report_query_activity; prev_PortalStart = PortalStart_hook; PortalStart_hook = pgcs_report_activity; prev_PortalDrop = PortalDrop_hook; PortalDrop_hook = pgcs_report_activity; prev_ExecutorStart = ExecutorStart_hook; - ExecutorStart_hook = pgcs_report_query_activity; + ExecutorStart_hook = pgcs_report_executor_activity; } /* @@ -1092,6 +1116,7 @@ _PG_fini(void) { /* Uninstall hooks. */ shmem_startup_hook = prev_shmem_startup_hook; + pgstat_report_hook = prev_pgstat_report_hook; PortalStart_hook = prev_PortalStart; PortalDrop_hook = prev_PortalDrop; ExecutorStart_hook = prev_ExecutorStart; diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 0d77754a..76d4ff19 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -143,6 +143,8 @@ char *pgstat_stat_tmpname = NULL; */ PgStat_MsgBgWriter BgWriterStats; +pgstat_report_hook_type pgstat_report_hook = NULL; + /* ---------- * Local data * ---------- @@ -3128,6 +3130,9 @@ pgstat_report_activity(BackendState state, const char *cmd_str) } pgstat_increment_changecount_after(beentry); + + if (pgstat_report_hook) + pgstat_report_hook(state, cmd_str); } /*----------- diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 15dd8b59..2049d855 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1132,6 +1132,8 @@ typedef struct PgStat_FunctionCallUsage instr_time f_start; } PgStat_FunctionCallUsage; +typedef void (*pgstat_report_hook_type) (BackendState state, const char *cmd_str); +extern PGDLLIMPORT pgstat_report_hook_type pgstat_report_hook; /* ---------- * GUC parameters From 8bbdbb85d45b414391dcaf7442cfdae829188c4a Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 20 Jul 2021 15:44:20 +0800 Subject: [PATCH 413/578] fix deadlock in BufferConnection http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088847515 (merge request git status) --- src/backend/pgxc/pool/execRemote.c | 115 ++++++++++++++++++++++++++--- src/include/pgxc/execRemote.h | 4 +- 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 492a3ac8..19eff0ea 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1163,8 +1163,8 @@ ValidateAndCloseCombiner(ResponseCombiner *combiner) * connection should be buffered. */ void -BufferConnection(PGXCNodeHandle *conn) -{// #lizard forgives +BufferConnection(PGXCNodeHandle *conn, bool need_prefetch) +{ ResponseCombiner *combiner = conn->combiner; MemoryContext oldcontext; @@ -1375,15 +1375,112 @@ BufferConnection(PGXCNodeHandle *conn) continue; } - /* incomplete message, read more */ if (res == RESPONSE_EOF) { +#ifdef __TBASE__ + if (need_prefetch) + { + /* + * We encountered incomplete message, try to read more. + * Here if we read timeout, then we move to other connections to read, because we + * easily got deadlock if a specific cursor run as producer on two nodes. If we can + * consume data from all all connections, we can break the deadlock loop. 
+ */ + bool bComplete = false; + DNConnectionState state = DN_CONNECTION_STATE_IDLE; + int i = 0; + int ret = 0; + PGXCNodeHandle *save_conn = NULL; + struct timeval timeout; + timeout.tv_sec = 0; + timeout.tv_usec = 1000; + + save_conn = conn; + while (1) + { + conn = save_conn; + state = conn->state; /* Save the connection state. */ + ret = pgxc_node_receive(1, &conn, &timeout); + if (DNStatus_OK == ret) + { + /* We got data, handle it. */ + break; + } + else if (DNStatus_ERR == ret) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive more data from data node %u", conn->nodeoid))); + } + else + { + /* Restore the saved state of connection. */ + conn->state = state; + } + + /* Try to read data from other connections. */ + for (i = 0; i < combiner->conn_count; i ++) + { + conn = combiner->connections[i]; + if (save_conn != conn && conn != NULL) + { + /* Save the connection state. */ + state = conn->state; + if (state == DN_CONNECTION_STATE_QUERY) + { + ret = pgxc_node_receive(1, &conn, &timeout); + if (DNStatus_OK == ret) + { + /* We got data, prefetch it. */ + bComplete = PreFetchConnection(conn, i); + if (bComplete) + { + /* Receive Complete on one connection, we need retry to read from current_conn. */ + break; + } + else + { + /* Maybe Suspend or Expired, just move to next connection and read. */ + continue; + } + } + else if (DNStatus_EXPIRED == ret) + { + /* Restore the saved state of connection. */ + conn->state = state; + continue; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive more data from data node %u", conn->nodeoid))); + } + } + } + } + } + continue; + } + else + { + /* incomplete message, read more */ + if (pgxc_node_receive(1, &conn, NULL)) + { + PGXCNodeSetConnectionState(conn, + DN_CONNECTION_STATE_ERROR_FATAL); + add_error_message(conn, "Failed to fetch from data node"); + } + } +#else + /* incomplete message, read more */ if (pgxc_node_receive(1, &conn, NULL)) { PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); add_error_message(conn, "Failed to fetch from data node"); } +#endif } /* @@ -3464,7 +3561,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * any bugs reported */ if (connections[i]->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(connections[i]); + BufferConnection(connections[i], false); /* Send global session id */ if (pgxc_node_send_sessionid(connections[i])) @@ -3979,7 +4076,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) - BufferConnection(conn); + BufferConnection(conn, false); if (conn->read_only) { @@ -4857,7 +4954,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) { - BufferConnection(conn); + BufferConnection(conn, false); } #if 0 @@ -5513,7 +5610,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) { - BufferConnection(conn); + BufferConnection(conn, false); } /* @@ -6802,7 +6899,7 @@ SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, CommandId cid, Snapshot snapshot) { if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); + BufferConnection(conn, false); if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) { ereport(ERROR, @@ -7176,7 +7273,7 @@ ExecCloseRemoteStatement(const char 
*stmt_name, List *nodelist) for (i = 0; i < conn_count; i++) { if (connections[i]->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(connections[i]); + BufferConnection(connections[i], false); if (pgxc_node_send_close(connections[i], true, stmt_name) != 0) { /* diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 236979f8..c76b946a 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -392,7 +392,7 @@ extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const ch if ((conn)->state == DN_CONNECTION_STATE_QUERY && \ (conn)->combiner && \ (conn)->combiner != (ResponseCombiner *) (node)) \ - BufferConnection(conn); \ + BufferConnection(conn, true); \ (conn)->combiner = (ResponseCombiner *) (node); \ } while(0) @@ -400,7 +400,7 @@ extern TupleTableSlot *FetchTuple(ResponseCombiner *combiner); extern void InitResponseCombiner(ResponseCombiner *combiner, int node_count, CombineType combine_type); extern void CloseCombiner(ResponseCombiner *combiner); -extern void BufferConnection(PGXCNodeHandle *conn); +extern void BufferConnection(PGXCNodeHandle *conn, bool need_prefetch); extern bool PreFetchConnection(PGXCNodeHandle *conn, int32 node_index); extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt); From 65b808332da7f6e852d4cd347112cf8f86fcb357 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 17 Aug 2021 14:58:43 +0800 Subject: [PATCH 414/578] Support interval table pruning for IN (array) (merge request !598) Squash merge branch 'andrelin/in_pruning' into 'Tbase_v5.06' tapd: http://tapd.oa.com/pgxz/tobject/tobjects/view/10217?system_name=onlinebug --- src/backend/utils/adt/ruleutils.c | 267 ++++++++++++++++++++++-- src/include/catalog/pg_type.h | 3 + src/test/regress/expected/partition.out | 38 ++++ src/test/regress/sql/partition.sql | 19 ++ 4 files changed, 307 insertions(+), 20 deletions(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index eb2e5420..6b2dd38b 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -516,6 +516,7 @@ static char *flatten_reloptions(Oid relid); #ifdef __TBASE__ static Bitmapset *pruning_walker(Relation rel, Node *expr); static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); +static Bitmapset *pruning_scalar_array_opexpr(Relation rel, ScalarArrayOpExpr *expr); static Bitmapset *get_full_pruning_result(Relation rel); static int get_daysofmonth(int startmonth, int startday, int endmonth, int endday); @@ -12275,6 +12276,9 @@ pruning_walker(Relation rel, Node *expr) } } break; + case T_ScalarArrayOpExpr: + result = pruning_scalar_array_opexpr(rel, (ScalarArrayOpExpr*)expr); + break; default: result = get_full_pruning_result(rel); break; @@ -12283,6 +12287,49 @@ pruning_walker(Relation rel, Node *expr) return result; } +static int +find_partidx_by_const(Datum constvalue, int consttype, Form_pg_partition_interval routerinfo, QulificationType qualtype) +{ + int partidx = -1; /* full as default */ + + switch(consttype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64) value_int16, qualtype); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, 
(int64) value_int32, qualtype); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(constvalue), qualtype); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(constvalue), + qualtype); + break; + default: + elog(WARNING, "unsupported partidx type %d", consttype); + break; + } + + return partidx; +} + static Bitmapset * pruning_opexpr(Relation rel, OpExpr *expr) {// #lizard forgives @@ -12381,34 +12428,202 @@ pruning_opexpr(Relation rel, OpExpr *expr) switch(arg_const->consttype) { case INT2OID: /* int2 */ + case INT4OID: /* int4 */ + case INT8OID: /* int8 */ + case TIMESTAMPOID: /* timestamp */ { - int value_int16; - value_int16 = DatumGetInt16(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, qualtype); + partidx = find_partidx_by_const(arg_const->constvalue, arg_const->consttype, routerinfo, qualtype); } break; - case INT4OID: /* int4 */ + default: + elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + } + + npart = RelationGetNParts(rel); + if(npart <= 0) { - int value_int32; - value_int32 = DatumGetInt32(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, qualtype); + elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); + } + + if(partidx == PARTITION_ROUTER_RESULT_FULL) + return get_full_pruning_result(rel); + else if(partidx == PARTITION_ROUTER_RESULT_NULL) + return NULL; + else if(partidx >= 0) + { + char *partname = NULL; + Oid partoid = InvalidOid; + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + case QULIFICATION_TYPE_LE: + { + int i; + for(i = 0; i <= partidx; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } } break; - case INT8OID: /* int8 */ + case QULIFICATION_TYPE_EQUAL: { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_make_singleton(partidx); + } } break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + int i; + for(i = partidx; i < npart; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } + } break; default: - elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + //nerver occur + elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); + } + } + + return result; +} + +static Bitmapset * 
+pruning_scalar_array_opexpr(Relation rel, ScalarArrayOpExpr *expr) +{ + Bitmapset *result = NULL; + char *opname = NULL; + Node *leftarg = NULL; + Node *rightarg = NULL; + Var *arg_var = NULL; + Const *arg_const = NULL; + bool isswap = false; + int npart; + int partidx; + AttrNumber partkey; + QulificationType qualtype = QULIFICATION_TYPE_EQUAL; + Form_pg_partition_interval routerinfo; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + int elem_type; + bool *elem_nulls; + int i; + + partkey = RelationGetPartitionColumnIndex(rel); + + if(list_length(expr->args) != 2) + return get_full_pruning_result(rel); + + leftarg = (Node *)list_nth(expr->args,0); + rightarg = (Node *)list_nth(expr->args,1); + + if (IsA(leftarg,Var)) + { + arg_var = (Var *)leftarg; + arg_const = (Const *)rightarg; + } + else if (IsA(rightarg,Var)) + { + arg_var = (Var *)rightarg; + arg_const = (Const *)leftarg; + isswap = true; + } + else + { + return get_full_pruning_result(rel); + } + + if (arg_const == NULL || + (!IsA(arg_const, Const)) || + arg_var->varattno != partkey) + { + return get_full_pruning_result(rel); + } + + opname = get_opname(expr->opno); + + if(strcmp("<",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LS; + else + qualtype = QULIFICATION_TYPE_GT; + } + else if(strcmp("<=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LE; + else + qualtype = QULIFICATION_TYPE_GE; + } + else if(strcmp("=",opname) == 0) + { + qualtype = QULIFICATION_TYPE_EQUAL; + } + else if(strcmp(">=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GE; + else + qualtype = QULIFICATION_TYPE_LE; + } + else if(strcmp(">",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GT; + else + qualtype = QULIFICATION_TYPE_LS; + } + else + { + /* any other case, get full partitions */ + return get_full_pruning_result(rel); + } + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); + } + + switch(arg_const->consttype) + { + case INT2ARRAYOID: /* int2 */ + elem_type = INT2OID; + break; + case INT4ARRAYOID: /* int4 */ + elem_type = INT4OID; + break; + case INT8ARRAYOID: /* int8 */ + elem_type = INT8OID; + break; + case TIMESTAMPARRAYOID: /* timestamp */ + elem_type = TIMESTAMPOID; + break; + default: + return get_full_pruning_result(rel); } npart = RelationGetNParts(rel); @@ -12417,6 +12632,19 @@ pruning_opexpr(Relation rel, OpExpr *expr) elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); } + arrayval = DatumGetArrayTypeP(arg_const->constvalue); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + + for (i = 0; i < num_elems; i++) + { + partidx = find_partidx_by_const(elem_values[i], elem_type, routerinfo, qualtype); + if(partidx == PARTITION_ROUTER_RESULT_FULL) return get_full_pruning_result(rel); else if(partidx == PARTITION_ROUTER_RESULT_NULL) @@ -12431,7 +12659,6 @@ pruning_opexpr(Relation rel, OpExpr *expr) case QULIFICATION_TYPE_LS: case QULIFICATION_TYPE_LE: { - int i; for(i = 0; i <= partidx; i++) { partname = GetPartitionName(RelationGetRelid(rel), i, false); @@ -12449,14 +12676,13 @@ pruning_opexpr(Relation rel, OpExpr *expr) partoid = get_relname_relid(partname, 
RelationGetNamespace(rel)); if(partoid) { - result = bms_make_singleton(partidx); + result = bms_add_member(result, partidx); } } break; case QULIFICATION_TYPE_GE: case QULIFICATION_TYPE_GT: { - int i; for(i = partidx; i < npart; i++) { partname = GetPartitionName(RelationGetRelid(rel), i, false); @@ -12473,6 +12699,7 @@ pruning_opexpr(Relation rel, OpExpr *expr) elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); } } + } return result; } diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index e1b73fca..79cdcf48 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -538,6 +538,7 @@ DATA(insert OID = 1013 ( _oidvector PGNSP PGUID -1 f b A f t \054 0 30 0 arr DATA(insert OID = 1014 ( _bpchar PGNSP PGUID -1 f b A f t \054 0 1042 0 array_in array_out array_recv array_send bpchartypmodin bpchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); DATA(insert OID = 1015 ( _varchar PGNSP PGUID -1 f b A f t \054 0 1043 0 array_in array_out array_recv array_send varchartypmodin varchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); DATA(insert OID = 1016 ( _int8 PGNSP PGUID -1 f b A f t \054 0 20 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define INT8ARRAYOID 1016 DATA(insert OID = 1017 ( _point PGNSP PGUID -1 f b A f t \054 0 600 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1018 ( _lseg PGNSP PGUID -1 f b A f t \054 0 601 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1019 ( _path PGNSP PGUID -1 f b A f t \054 0 602 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); @@ -579,12 +580,14 @@ DATA(insert OID = 1114 ( timestamp PGNSP PGUID 8 FLOAT8PASSBYVAL b D f t DESCR("date and time"); #define TIMESTAMPOID 1114 DATA(insert OID = 1115 ( _timestamp PGNSP PGUID -1 f b A f t \054 0 1114 0 array_in array_out array_recv array_send timestamptypmodin timestamptypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define TIMESTAMPARRAYOID 1115 DATA(insert OID = 1182 ( _date PGNSP PGUID -1 f b A f t \054 0 1082 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1183 ( _time PGNSP PGUID -1 f b A f t \054 0 1083 0 array_in array_out array_recv array_send timetypmodin timetypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1184 ( timestamptz PGNSP PGUID 8 FLOAT8PASSBYVAL b D t t \054 0 0 1185 timestamptz_in timestamptz_out timestamptz_recv timestamptz_send timestamptztypmodin timestamptztypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("date and time with time zone"); #define TIMESTAMPTZOID 1184 DATA(insert OID = 1185 ( _timestamptz PGNSP PGUID -1 f b A f t \054 0 1184 0 array_in array_out array_recv array_send timestamptztypmodin timestamptztypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define TIMESTAMPTZARRAYOID 1185 DATA(insert OID = 1186 ( interval PGNSP PGUID 16 f b T t t \054 0 0 1187 interval_in interval_out interval_recv interval_send intervaltypmodin intervaltypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("@ , time interval"); #define INTERVALOID 1186 diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index 
46ec29a3..d63e6d2f 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -978,3 +978,41 @@ truncate table int_drop partition for(1000); ERROR: the value for locating a partition is out of range truncate table int_drop partition for(370); drop table int_drop; +-- IN expr partition pruning +create table t_in_test(a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2017-09-01 0:0:0') +step (interval '1 month') partitions (12) +distribute by shard (a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_in_test values(1,1,'20170901'); +insert into t_in_test values(2,2,'20171001'); +insert into t_in_test values(3,3,'20171101'); +insert into t_in_test values(3,3,'20171201'); +explain (costs off) select * from t_in_test where c in ('20171001', '20171201'); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on t_in_test (partition sequence: 1, name: t_in_test_part_1) + Filter: (c = ANY ('{"Sun Oct 01 00:00:00 2017","Fri Dec 01 00:00:00 2017"}'::timestamp without time zone[])) + -> Seq Scan on t_in_test (partition sequence: 3, name: t_in_test_part_3) + Filter: (c = ANY ('{"Sun Oct 01 00:00:00 2017","Fri Dec 01 00:00:00 2017"}'::timestamp without time zone[])) +(7 rows) + +set enable_fast_query_shipping to off; +explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on t_in_test (partition sequence: 0, name: t_in_test_part_0) + Filter: (c = ANY ('{"Fri Sep 01 00:00:00 2017","Wed Nov 01 00:00:00 2017"}'::timestamp without time zone[])) + -> Seq Scan on t_in_test (partition sequence: 2, name: t_in_test_part_2) + Filter: (c = ANY ('{"Fri Sep 01 00:00:00 2017","Wed Nov 01 00:00:00 2017"}'::timestamp without time zone[])) +(6 rows) + +reset enable_fast_query_shipping; +drop table t_in_test; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index b665cd81..cc2e7dd0 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -438,3 +438,22 @@ truncate table int_drop partition for(5); truncate table int_drop partition for(1000); truncate table int_drop partition for(370); drop table int_drop; + +-- IN expr partition pruning +create table t_in_test(a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2017-09-01 0:0:0') +step (interval '1 month') partitions (12) +distribute by shard (a) +to group default_group; + +insert into t_in_test values(1,1,'20170901'); +insert into t_in_test values(2,2,'20171001'); +insert into t_in_test values(3,3,'20171101'); +insert into t_in_test values(3,3,'20171201'); + +explain (costs off) select * from t_in_test where c in ('20171001', '20171201'); +set enable_fast_query_shipping to off; +explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); +reset enable_fast_query_shipping; +drop table t_in_test; From a4861f6964adc410f25369345fb750c4d23f3a72 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 17 Aug 2021 15:39:06 +0800 Subject: [PATCH 415/578] Replace datid, 
usesysid with datname, usename http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131090773427&url_cache_key=3896fc0d053f1c19ad00a42c3d3ccca6&action_entry_type=bugs --- .../pg_stat_cluster_activity--1.0.sql | 4 ++-- .../pg_stat_cluster_activity.c | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql index c5514458..72f71480 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -15,8 +15,8 @@ CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( OUT client_port integer, OUT nodename text, OUT role text, - OUT datid oid, - OUT usesysid oid, + OUT datname text, + OUT usename text, OUT wait_event_type text, OUT wait_event text, OUT state text, diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index efe74c95..1bc9f489 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -2,6 +2,7 @@ #include "catalog/pg_authid.h" #include "catalog/pg_type.h" +#include "commands/dbcommands.h" #include "commands/explain.h" #include "common/ip.h" #include "fmgr.h" @@ -715,12 +716,24 @@ pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) values[1] = Int32GetDatum(beentry->st_procpid); if (beentry->st_databaseid != InvalidOid) - values[7] = ObjectIdGetDatum(beentry->st_databaseid); + { + char *dbname = get_database_name(beentry->st_databaseid); + if (dbname != NULL) + values[7] = CStringGetTextDatum(dbname); + else + nulls[7] = true; + } else nulls[7] = true; if (beentry->st_userid != InvalidOid) - values[8] = ObjectIdGetDatum(beentry->st_userid); + { + char *usename = GetUserNameFromId(beentry->st_userid, true); + if (usename != NULL) + values[8] = CStringGetTextDatum(usename); + else + nulls[8] = true; + } else nulls[8] = true; From ce7f6f113161083ff35cf7f9fff24b5d04ee1569 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 10 Nov 2017 16:50:50 -0500 Subject: [PATCH 416/578] Account for the effect of lossy pages when costing bitmap scans. Dilip Kumar, reviewed by Alexander Kumenkov, Amul Sul, and me. Some final adjustments by me. Discussion: http://postgr.es/m/CAFiTN-sYtqUOXQ4SpuhTv0Z9gD0si3YxZGv_PQAAMX8qbOotcg@mail.gmail.com --- src/backend/nodes/tidbitmap.c | 37 +++++++++++------ src/backend/optimizer/path/costsize.c | 59 ++++++++++++++++++++++----- src/include/nodes/tidbitmap.h | 21 +++++----- 3 files changed, 85 insertions(+), 32 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 73820707..c3f800a5 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -265,7 +265,6 @@ TIDBitmap * tbm_create(long maxbytes, dsa_area *dsa) { TIDBitmap *tbm; - long nbuckets; /* Create the TIDBitmap struct and zero all its fields */ tbm = makeNode(TIDBitmap); @@ -273,17 +272,7 @@ tbm_create(long maxbytes, dsa_area *dsa) tbm->mcxt = CurrentMemoryContext; tbm->status = TBM_EMPTY; - /* - * Estimate number of hashtable entries we can have within maxbytes. This - * estimates the hash cost as sizeof(PagetableEntry), which is good enough - * for our purpose. Also count an extra Pointer per entry for the arrays - * created during iteration readout. 
- */ - nbuckets = maxbytes / - (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); - nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ - nbuckets = Max(nbuckets, 16); /* sanity limit */ - tbm->maxentries = (int) nbuckets; + tbm->maxentries = (int) tbm_calculate_entries(maxbytes); tbm->lossify_start = 0; tbm->dsa = dsa; tbm->dsapagetable = InvalidDsaPointer; @@ -1546,3 +1535,27 @@ pagetable_free(pagetable_hash *pagetable, void *pointer) tbm->dsapagetableold = InvalidDsaPointer; } } + +/* + * tbm_calculate_entries + * + * Estimate number of hashtable entries we can have within maxbytes. + */ +long +tbm_calculate_entries(double maxbytes) +{ + long nbuckets; + + /* + * Estimate number of hashtable entries we can have within maxbytes. This + * estimates the hash cost as sizeof(PagetableEntry), which is good enough + * for our purpose. Also count an extra Pointer per entry for the arrays + * created during iteration readout. + */ + nbuckets = maxbytes / + (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); + nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ + nbuckets = Max(nbuckets, 16); /* sanity limit */ + + return nbuckets; +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 18ca6a7d..f8ac09e8 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5417,6 +5417,8 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, double T; double pages_fetched; double tuples_fetched; + double heap_pages; + long maxentries; /* * Fetch total cost of obtaining the bitmap, as well as its total @@ -5431,6 +5433,24 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; + /* + * For a single scan, the number of heap pages that need to be fetched is + * the same as the Mackert and Lohman formula for the case T <= b (ie, no + * re-reads needed). + */ + pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + + /* + * Calculate the number of pages fetched from the heap. Then based on + * current work_mem estimate get the estimated maxentries in the bitmap. + * (Note that we always do this calculation based on the number of pages + * that would be fetched in a single iteration, even if loop_count > 1. + * That's correct, because only that number of entries will be stored in + * the bitmap at one time.) + */ + heap_pages = Min(pages_fetched, baserel->pages); + maxentries = tbm_calculate_entries(work_mem * 1024L); + if (loop_count > 1) { /* @@ -5445,22 +5465,41 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, root); pages_fetched /= loop_count; } - else - { - /* - * For a single scan, the number of heap pages that need to be fetched - * is the same as the Mackert and Lohman formula for the case T <= b - * (ie, no re-reads needed). - */ - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } if (pages_fetched >= T) pages_fetched = T; else pages_fetched = ceil(pages_fetched); + if (maxentries < heap_pages) + { + double exact_pages; + double lossy_pages; + + /* + * Crude approximation of the number of lossy pages. Because of the + * way tbm_lossify() is coded, the number of lossy pages increases + * very sharply as soon as we run short of memory; this formula has + * that property and seems to perform adequately in testing, but it's + * possible we could do better somehow. 
+ */ + lossy_pages = Max(0, heap_pages - maxentries / 2); + exact_pages = heap_pages - lossy_pages; + + /* + * If there are lossy pages then recompute the number of tuples + * processed by the bitmap heap node. We assume here that the chance + * of a given tuple coming from an exact page is the same as the + * chance that a given page is exact. This might not be true, but + * it's not clear how we can do any better. + */ + if (lossy_pages > 0) + tuples_fetched = + clamp_row_est(indexSelectivity * + (exact_pages / heap_pages) * baserel->tuples + + (lossy_pages / heap_pages) * baserel->tuples); + } + if (cost) *cost = indexTotalCost; if (tuple) diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index fbd75c20..d3ad0a55 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tidbitmap.h - * PostgreSQL tuple-id (TID) bitmap package + * PostgreSQL tuple-id (TID) bitmap package * * This module provides bitmap data structures that are spiritually * similar to Bitmapsets, but are specially adapted to store sets of @@ -39,11 +39,11 @@ typedef struct TBMSharedIterator TBMSharedIterator; /* Result structure for tbm_iterate */ typedef struct { - BlockNumber blockno; /* page number containing tuples */ - int ntuples; /* -1 indicates lossy result */ - bool recheck; /* should the tuples be rechecked? */ - /* Note: recheck is always true if ntuples < 0 */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } TBMIterateResult; /* function prototypes in nodes/tidbitmap.c */ @@ -53,8 +53,8 @@ extern void tbm_free(TIDBitmap *tbm); extern void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp); extern void tbm_add_tuples(TIDBitmap *tbm, - const ItemPointer tids, int ntids, - bool recheck); + const ItemPointer tids, int ntids, + bool recheck); extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno); extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); @@ -69,6 +69,7 @@ extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); extern void tbm_end_iterate(TBMIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, - dsa_pointer dp); + dsa_pointer dp); +extern long tbm_calculate_entries(double maxbytes); -#endif /* TIDBITMAP_H */ +#endif /* TIDBITMAP_H */ From 4156013532fb015044ada2af00c71c788332d100 Mon Sep 17 00:00:00 2001 From: guanhuawang Date: Tue, 24 Aug 2021 23:04:49 +0800 Subject: [PATCH 417/578] fix an error when perform materialized view concurrently refresh. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087752683 --- src/backend/commands/matview.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 102e2f36..b158aaa4 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -832,9 +832,13 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, /* Analyze the diff table. 
*/ resetStringInfo(&querybuf); - appendStringInfo(&querybuf, "ANALYZE %s", diffname); + /* + * Materialized view is stored on CN, use "(COORDINATOR)" option to force + * vacuum analyzing "diff table" on CN. + */ + appendStringInfo(&querybuf, "ANALYZE (COORDINATOR) %s", diffname); if (SPI_exec(querybuf.data, 0) != SPI_OK_UTILITY) - elog(ERROR, "SPI_exec failed: %s", querybuf.data); + elog(ERROR, "SPI_exec failed: %s", querybuf.data);; OpenMatViewIncrementalMaintenance(); From 8a4e9139a9c0fef1e6d1482b78250e591f72da64 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 7 Sep 2021 10:30:52 +0800 Subject: [PATCH 418/578] 2pc stop opt: add clean 2pc process (merge request 656) --- contrib/pg_clean/pg_clean.c | 134 ++- src/backend/access/transam/gtm.c | 9 + src/backend/access/transam/twophase.c | 2 +- src/backend/access/transam/varsup.c | 2 + src/backend/access/transam/xact.c | 109 ++- src/backend/pgxc/pool/execRemote.c | 22 +- src/backend/postmaster/Makefile | 2 +- src/backend/postmaster/clean2pc.c | 1002 ++++++++++++++++++++++ src/backend/postmaster/pgstat.c | 17 + src/backend/postmaster/postmaster.c | 96 +++ src/backend/storage/ipc/ipci.c | 4 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/lmgr/proc.c | 22 +- src/backend/utils/init/miscinit.c | 3 +- src/backend/utils/init/postinit.c | 9 +- src/backend/utils/misc/guc.c | 44 +- src/include/access/xact.h | 4 + src/include/pgstat.h | 4 +- src/include/postmaster/clean2pc.h | 43 + src/include/storage/pmsignal.h | 32 +- src/include/storage/proc.h | 2 + src/test/regress/expected/sysviews.out | 4 +- 22 files changed, 1512 insertions(+), 55 deletions(-) create mode 100644 src/backend/postmaster/clean2pc.c create mode 100644 src/include/postmaster/clean2pc.h diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 459a2fc0..08375f46 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,17 +63,17 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ -#endif -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; +#define LEAST_CLEAN_TIME_INTERVAL 1000000 /* should not clean twophase trans prepared in 1s or commit in 1s */ +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; PG_MODULE_MAGIC; -#define MAX_GID 50 +#define MAX_GID 64 + +#define CLEAN_CHECK_TIMES 3 +#define CLEAN_CHECK_INTERVAL 10000 + #define MAX_DBNAME 64 #define GET_START_XID "startxid:" #define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" @@ -2397,6 +2397,7 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2413,6 +2414,60 @@ bool check_2pc_belong_node(txn_info * txn) txn->belong_abnormal_node = true; return true; } + + if (InvalidOid == txn->origcoord) + { + char *startnode = NULL; + int node_oid = InvalidOid; + char gid[MAX_GID]; + + if (!IsXidImplicit(txn->gid)) + { + txn->belong_abnormal_node = true; + return true; + } + + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + 
strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); + + if (abnormal_nodeoid == node_oid) + { + txn->belong_abnormal_node = true; + return true; + } + } + txn->belong_abnormal_node = false; return false; } @@ -2432,6 +2487,10 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { + int i = 0; + bool check_ok = false; + MemoryContext current_context = NULL; + ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; @@ -2470,12 +2529,40 @@ void recover2PC(txn_info * txn) { txn->op = COMMIT; /* check whether all nodes can commit prepared */ + for (i = 0; i < CLEAN_CHECK_TIMES; i++) + { + check_ok = true; + current_context = CurrentMemoryContext; + PG_TRY(); + { if (!clean_2PC_iscommit(txn, true, true)) { + check_ok = false; + elog(LOG, "check commit 2PC transaction %s failed", + txn->gid); + } + } + PG_CATCH(); + { + (void)MemoryContextSwitchTo(current_context); + edata = CopyErrorData(); + FlushErrorState(); + + check_ok = false; + elog(WARNING, "check commit 2PC transaction %s error: %s", + txn->gid, edata->message); + } + PG_END_TRY(); + + if (!check_ok) + { txn->op_issuccess = false; - elog(LOG, "check commit 2PC transaction %s failed", txn->gid); return; } + + pg_usleep(CLEAN_CHECK_INTERVAL); + } + /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { @@ -2491,12 +2578,40 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; /* check whether all nodes can rollback prepared */ + for (i = 0; i < CLEAN_CHECK_TIMES; i++) + { + check_ok = true; + current_context = CurrentMemoryContext; + PG_TRY(); + { if (!clean_2PC_iscommit(txn, false, true)) { + check_ok = false; + elog(LOG, "check rollback 2PC transaction %s failed", + txn->gid); + } + } + PG_CATCH(); + { + check_ok = false; + (void)MemoryContextSwitchTo(current_context); + edata = CopyErrorData(); + FlushErrorState(); + + elog(WARNING, "check rollback 2PC transaction %s error: %s", + txn->gid, edata->message); + } + PG_END_TRY(); + + if (!check_ok) + { txn->op_issuccess = false; - elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); return; } + + pg_usleep(CLEAN_CHECK_INTERVAL); + } + /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { @@ -2620,7 +2735,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) { node_idx = find_node_index(abnormal_nodeoid); if (!check_2pc_belong_node(txn) || - !check_node_participate(txn, node_idx) || abnormal_time < txn->prepare_timestamp[node_idx]) { return TXN_STATUS_INPROGRESS; diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5f95e859..5fb8904a 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -22,6 +22,7 @@ #include "pgxc/pgxc.h" #include "gtm/gtm_c.h" #include 
"postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "storage/backendid.h" #include "tcop/tcopprot.h" @@ -1336,6 +1337,10 @@ InitGTM(void) elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); else if (IsClusterMonitorProcess() && GTMDebugPrint) elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); + else if (IsClean2pcWorker() && GTMDebugPrint) + elog(LOG, "Clean 2pc worker: connection established to GTM with string %s", conn_str); + else if (IsClean2pcLauncher() && GTMDebugPrint) + elog(LOG, "Clean 2pc launcher: connection established to GTM with string %s", conn_str); else if(GTMDebugPrint) elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); } @@ -1424,6 +1429,10 @@ CloseGTM(void) elog(DEBUG1, "Autovacuum launcher: connection to GTM closed"); else if (IsClusterMonitorProcess()) elog(DEBUG1, "Cluster monitor: connection to GTM closed"); + else if (IsClean2pcWorker()) + elog(DEBUG1, "Clean 2pc worker: connection to GTM closed"); + else if (IsClean2pcLauncher()) + elog(DEBUG1, "Clean 2pc launcher: connection to GTM closed"); else elog(DEBUG1, "Postmaster child: connection to GTM closed"); } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index a9078bda..387cdf73 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -4051,7 +4051,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } else { - elog(PANIC, "[%s] could not open file %s, errMsg: %s", + elog(ERROR, "[%s] could not open file %s, errMsg: %s", __FUNCTION__, path, strerror(errno)); } return; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 6fdadcc2..42baa98f 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -24,6 +24,7 @@ #include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "utils/syscache.h" @@ -356,6 +357,7 @@ GetNewTransactionId(bool isSubXact) (!IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsAutoVacuumLauncherProcess() || + IsAnyClean2pcProcess() || GetForceXidFromGTM() || (IsInitProcessingMode() && IsPostmasterEnvironment))) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 3e59c7f4..ed02dff9 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -32,6 +32,7 @@ #include "pgxc/pause.h" /* PGXC_DATANODE */ #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "libpq/pqformat.h" #include "libpq/libpq.h" #endif @@ -65,6 +66,7 @@ #include "storage/condition_variable.h" #include "storage/fd.h" #include "storage/lmgr.h" +#include "storage/pmsignal.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -101,6 +103,13 @@ #include "tcop/utility.h" #include "utils/relcryptmap.h" #endif + +#ifdef __TWO_PHASE_TESTS__ +#define TWO_PHASE_TEST_NOT_STOP 1 +#define TWO_PHASE_TEST_STOP_DN 2 +#define TWO_PHASE_TEST_STOP_ALL 3 +#endif + /* * User-tweakable parameters */ @@ -395,6 +404,10 @@ static bool XactLocalNodePrepared; static bool XactReadLocalNode; static bool XactWriteLocalNode; +#ifdef __TWO_PHASE_TRANS__ +bool enable_2pc_error_stop = false; +#endif + /* * Some commands want to force synchronous commit. 
*/ @@ -3743,13 +3756,17 @@ AbortTransaction(void) TransactionState s = CurrentTransactionState; TransactionId latestXid; bool is_parallel_worker; + bool can_abort = true; #ifdef __TWO_PHASE_TRANS__ StringInfoData errormsg; - if ( #ifdef __TWO_PHASE_TESTS__ + bool test_stop = (complish && run_pg_clean); +#endif + can_abort = !( +#ifdef __TWO_PHASE_TESTS__ (complish && run_pg_clean) || #endif TWO_PHASE_COMMITTING == g_twophase_state.state || @@ -3758,18 +3775,76 @@ AbortTransaction(void) TWO_PHASE_ABORT_END == g_twophase_state.state || TWO_PHASE_UNKNOW_STATUS == g_twophase_state.state || (TWO_PHASE_PREPARED == g_twophase_state.state && - false == g_twophase_state.is_start_node)) - { + false == g_twophase_state.is_start_node)); + if (!can_abort) + { if (false == g_twophase_state.isprinted) { print_twophase_state(&errormsg, false); + + if (enable_2pc_error_stop) + { elog(STOP, "errormsg in AbortTransaction:\n %s", errormsg.data); } +#ifdef __TWO_PHASE_TESTS__ + else if (test_stop) + { + switch (run_pg_clean) + { + case TWO_PHASE_TEST_NOT_STOP: + break; + case TWO_PHASE_TEST_STOP_DN: + if (IS_PGXC_LOCAL_COORDINATOR) + { + break; + } + case TWO_PHASE_TEST_STOP_ALL: + elog(STOP, "in test, in AbortTransaction:\n %s", errormsg.data); + break; + default: + break; + } + elog(WARNING, "in test, in AbortTransaction:\n %s", errormsg.data); + } +#endif + else + { + elog(WARNING, "errormsg in AbortTransaction:\n %s", errormsg.data); + } + } else { + if (enable_2pc_error_stop) + { elog(STOP, "STOP postmaster in AbortTransaction"); } +#ifdef __TWO_PHASE_TESTS__ + else if (test_stop) + { + switch (run_pg_clean) + { + case TWO_PHASE_TEST_NOT_STOP: + break; + case TWO_PHASE_TEST_STOP_DN: + if (IS_PGXC_LOCAL_COORDINATOR) + { + break; + } + case TWO_PHASE_TEST_STOP_ALL: + elog(STOP, "in test, postmaster in AbortTransaction"); + break; + default: + break; + } + elog(WARNING, "in test, postmaster in AbortTransaction"); + } +#endif + else + { + elog(WARNING, "WARNING postmaster in AbortTransaction"); + } + } } /* print prepare err in pgxc_node_remote_prepare */ @@ -3786,8 +3861,12 @@ AbortTransaction(void) * Cleanup the files created during database/tablespace operations. * This must happen before we release locks, because we want to hold the * locks acquired initially while we cleanup the files. + * If can_abort is false, needn't do DBCleanup, Createdb, movedb, createtablespace e.g. */ + if (can_abort) + { AtEOXact_DBCleanup(false); + } #ifdef __TBASE__ SqueueProducerExit(); @@ -3798,6 +3877,17 @@ AbortTransaction(void) * transaction at the GTM at thr end */ s->topGlobalTransansactionId = s->transactionId; + +#ifdef __TWO_PHASE_TRANS__ + if (IS_PGXC_LOCAL_COORDINATOR && g_twophase_state.state != TWO_PHASE_INITIALTRANS) + { + elog(LOG, "send signal to clean 2pc launcher, gid: %s", g_twophase_state.gid); + SendPostmasterSignal(PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER); + } +#endif + + if (can_abort) + { /* * Handle remote abort first. 
*/ @@ -3820,13 +3910,11 @@ AbortTransaction(void) FinishPreparedTransaction(savePrepareGID, false); XactLocalNodePrepared = false; } - else - { + } + #ifdef __TWO_PHASE_TRANS__ - g_twophase_state.state = TWO_PHASE_ABORTTED; ClearLocalTwoPhaseState(); #endif - } if(enable_distri_debug && is_distri_report && IS_PGXC_COORDINATOR) { @@ -4003,7 +4091,10 @@ AbortTransaction(void) #endif latestXid = RecordTransactionAbort(false); #ifdef __TBASE__ + if (can_abort) + { FinishSeqOp(false); + } #endif } else @@ -4049,7 +4140,10 @@ AbortTransaction(void) /* See comments in CommitTransaction */ #ifdef XCP + if (can_abort) + { AtEOXact_GlobalTxn(false); + } #endif ResourceOwnerRelease(TopTransactionResourceOwner, @@ -7839,6 +7933,7 @@ IsPGXCNodeXactDatanodeDirect(void) (IsPostmasterEnvironment || !useLocalXid) && IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess() && + !IsClean2pcLauncher() && #ifdef XCP !IsConnFromDatanode() && #endif diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 19eff0ea..1708343b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8384,6 +8384,17 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) GlobalTransactionId gxid, prepare_gxid; bool prepared_local = false; +#ifdef __TWO_PHASE_TRANS__ + /* + * Since g_twophase_state is cleared after prepare phase, + * g_twophase_state shoud be assigned here + */ + strncpy(g_twophase_state.gid, prepareGID, GIDSIZE); + strncpy(g_twophase_state.start_node_name, PGXCNodeName, NAMEDATALEN); + g_twophase_state.state = TWO_PHASE_PREPARED; + g_twophase_state.is_start_node = true; +#endif + /* * Get the list of nodes involved in this transaction. * @@ -8441,7 +8452,6 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) #endif #ifdef __TWO_PHASE_TRANS__ - /* * not allowed user commit residual transaction in xc_maintenance_mode, * since we need commit them in unified timestamp @@ -8450,19 +8460,11 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) { elog(ERROR, "can not commit transaction '%s' in xc_maintainence_mode", prepareGID); } - /* - *since g_twophase_state is cleared after prepare phase - *g_twophase_state shoud be assigned here - */ - strncpy(g_twophase_state.gid, prepareGID, GIDSIZE); - strncpy(g_twophase_state.start_node_name, PGXCNodeName, NAMEDATALEN); - g_twophase_state.state = TWO_PHASE_PREPARED; - g_twophase_state.is_start_node = true; + if (nodestring) { strncpy(g_twophase_state.participants, nodestring,((NAMEDATALEN+1) * (TBASE_MAX_DATANODE_NUMBER + TBASE_MAX_COORDINATOR_NUMBER))); } - #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 56d6a151..2b532af5 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global OBJS = auditlogger.o autovacuum.o bgworker.o bgwriter.o checkpointer.o clustermon.o \ - fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o + fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o clean2pc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c new file mode 100644 index 00000000..80ab7103 --- /dev/null +++ b/src/backend/postmaster/clean2pc.c @@ -0,0 +1,1002 @@ +/*------------------------------------------------------------------------- + * + * clean2pc.c + * + * The background clean 2pc processes are added by whalesong. + * They attempt to clean the abnormal 2pc. + * + * Portions Copyright (c) 1996-2021, TDSQL-PG Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/clean2pc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/pg_database.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "executor/executor.h" +#include "libpq/pqsignal.h" +#include "nodes/makefuncs.h" +#include "postmaster/clean2pc.h" +#include "postmaster/fork_process.h" +#include "postmaster/postmaster.h" +#include "pgxc/execRemote.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" + +#define MAX_GID 64 + +#define SQL_CMD_LEN 1024 +#define MAX_DB_SIZE 100 + +#define DB_TEMPLATE0 "template0" +#define DB_TEMPLATE1 "template1" +#define DB_DEFAULT "postgres" + +typedef enum +{ + Query2pcAttr_gid = 0, + Query2pcAttr_database = 1, + Query2pcAttr_global_status = 2, + Query2pcAttr_status_on_nodes = 3, + Query2pcAttr_butty +} Query2pcAttrEnum; + +bool enable_clean_2pc_launcher = true; + +int auto_clean_2pc_interval = 10; +int auto_clean_2pc_delay = 3; + +static volatile sig_atomic_t got_SIGTERM = false; +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGUSR2 = false; + +/* Flags to tell if we are in an clean 2pc process */ +static bool am_clean_2pc_launcher = false; +static bool am_clean_2pc_worker = false; + +static StringInfo result_str = NULL; + +#ifdef EXEC_BACKEND +static pid_t clean_2pc_launcher_forkexec(void); +static pid_t clean_2pc_worker_forkexec(void); +#endif + +NON_EXEC_STATIC void +Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); +NON_EXEC_STATIC void +Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); + +static void start_query_worker(void); +static void start_clean_worker(int count); + +static void do_query_2pc(void); +static void do_clean_2pc(void); + +static void clean_2pc_sigterm_handler(SIGNAL_ARGS); +static void clean_2pc_sighup_handler(SIGNAL_ARGS); +static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); + +static List *get_database_list(void); +static Oid get_default_database(void); + +static void ExitCleanRunning(int status, Datum arg); + +/* struct to keep track of databases in worker */ +typedef struct Clean2pcDBInfo +{ + Oid db_oid; + char *db_name; +} Clean2pcDBInfo; + +typedef struct +{ + bool worker_running; + Oid worker_db; + + int db_count; + Oid db_list[MAX_DB_SIZE]; +} Clean2pcShmemStruct; + +static Clean2pcShmemStruct *Clean2pcShmem = NULL; + +/* + * Main entry point for 2pc clean launcher, to be 
called from the + * postmaster. + */ +int +StartClean2pcLauncher(void) +{ + pid_t clean_2pc_pid = 0; + +#ifdef EXEC_BACKEND + switch ((clean_2pc_pid = clean_2pc_launcher_forkexec())) +#else + switch ((clean_2pc_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork 2pc clean launcher: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + Clean2pcLauncherMain(0, NULL); + break; +#endif + default: + return (int) clean_2pc_pid; + } + + return 0; +} + +/* + * Main loop for the 2pc clean launcher. + */ +NON_EXEC_STATIC void +Clean2pcLauncherMain(int argc, char *argv[]) +{ + int wait_time = 0; + + am_clean_2pc_launcher = true; + + /* Identify myself via ps */ + init_ps_display("2pc clean launcher", "", "", ""); + + elog(LOG, "2pc clean launcher start"); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, clean_2pc_sighup_handler); + pqsignal(SIGTERM, clean_2pc_sigterm_handler); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGQUIT, quickdie); + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, clean_2pc_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + PG_SETMASK(&UnBlockSig); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL); + + SetProcessingMode(NormalProcessing); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + Clean2pcShmem->worker_running = false; + Clean2pcShmem->db_count = 0; + Clean2pcShmem->worker_db = InvalidOid; + LWLockRelease(Clean2pcLock); + + if (result_str == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext); + result_str = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + wait_time = auto_clean_2pc_delay; + for (;;) + { + pg_usleep(1000000L * wait_time); + + if (got_SIGTERM) + { + elog(LOG, "2pc clean launcher got SIGTERM"); + got_SIGTERM = false; + proc_exit(0); + } + + if (got_SIGHUP) + { + elog(LOG, "2pc clean launcher got SIGHUP"); + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + wait_time = auto_clean_2pc_delay; + continue; + } + + if (got_SIGUSR2) + { + elog(LOG, "2pc clean launcher got SIGUSR2"); + got_SIGUSR2 = false; + wait_time = auto_clean_2pc_delay; + continue; + } + + start_query_worker(); + + if (got_SIGTERM || got_SIGHUP || got_SIGUSR2) + { + wait_time = 0; + } + else + { + wait_time = auto_clean_2pc_interval; + } + } +} + +/* + * Main entry point for 2pc clean worker, to be called from the + * postmaster. 
+ */ +int +StartClean2pcWorker(void) +{ + pid_t clean_2pc_pid = 0; + +#ifdef EXEC_BACKEND + switch ((clean_2pc_pid = clean_2pc_worker_forkexec())) +#else + switch ((clean_2pc_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork 2pc clean worker: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + Clean2pcWorkerMain(0, NULL); + break; +#endif + default: + return (int) clean_2pc_pid; + } + + return 0; +} + +/* + * Main for the 2pc clean worker. + */ +NON_EXEC_STATIC void +Clean2pcWorkerMain(int argc, char *argv[]) +{ + char db_name[NAMEDATALEN]; + Oid db_oid = InvalidOid; + int clean_db_count = 0; + + am_clean_2pc_worker = true; + + on_proc_exit(ExitCleanRunning, 0); + + /* Identify myself via ps */ + init_ps_display("2pc clean worker", "", "", ""); + + elog(LOG, "2pc clean worker start"); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, clean_2pc_sighup_handler); + pqsignal(SIGTERM, clean_2pc_sigterm_handler); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGQUIT, quickdie); + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, clean_2pc_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + PG_SETMASK(&UnBlockSig); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). 
+ */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + Clean2pcShmem->worker_running = true; + + db_oid = Clean2pcShmem->worker_db; + + Assert(OidIsValid(db_oid)); + + InitPostgres(NULL, db_oid, NULL, InvalidOid, db_name); + + SetProcessingMode(NormalProcessing); + + if (result_str == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext); + result_str = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + if (Clean2pcShmem->db_count == 0) + { + elog(DEBUG5, "query 2pc from db: %s", db_name); + do_query_2pc(); + clean_db_count = Clean2pcShmem->db_count; + } + else + { + elog(LOG, "clean 2pc for db: %s", db_name); + do_clean_2pc(); + } + + Clean2pcShmem->worker_running = false; + + LWLockRelease(Clean2pcLock); + + if (clean_db_count != 0) + { + start_clean_worker(clean_db_count); + } + + /* All done, go away */ + proc_exit(0); +} + +static void +do_query_2pc(void) +{ + int i = 0; + int count_db = 0; + int count_2pc = 0; + MemoryContext oldcontext = NULL; + char query[SQL_CMD_LEN]; + char gid[MAX_GID]; + char *startnode = NULL; + bool is_start_from = true; + Oid db_oid = InvalidOid; + Oid last_db_oid = InvalidOid; + EState *estate = NULL; + RemoteQuery *plan = NULL; + RemoteQueryState *pstate = NULL; + TupleTableSlot *result = NULL; + Var *dummy = NULL; + int attr_num = 4; + static const char *attr_name[] = {"gid", "database", + "global_transaction_status", + "transaction_status_on_allnodes"}; + + Assert(result_str != NULL); + resetStringInfo(result_str); + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(%d) " + "order by database limit 1000;", auto_clean_2pc_delay); + + elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + + StartTransactionCommand(); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_COORDS; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
+ */ + for (i = 1; i <= attr_num; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + InitMultinodeExecutor(false); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + Clean2pcShmem->db_count = 0; + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + is_start_from = true; + count_2pc++; + + for (i = 0; i < attr_num; i++) + { + char *value = text_to_cstring(DatumGetTextP(result->tts_values[i])); + appendStringInfo(result_str, "\t%s: %s", attr_name[i], value); + switch (i) + { + case Query2pcAttr_gid: /* value is gid */ + if (IsXidImplicit(value)) + { + /* get start node from gid */ + startnode = NULL; + + strcpy(gid, value); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + break; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + break; + } + + if (strcmp(startnode, PGXCNodeName) != 0) + { + is_start_from = false; + } + } + break; + case Query2pcAttr_database: /* value is database */ + if (is_start_from) + { + db_oid = get_database_oid(value, true); + if (!OidIsValid(db_oid)) + { + elog(WARNING, "get database(%s) oid failed", value); + } + else if (db_oid != last_db_oid) + { + if (Clean2pcShmem->db_count < MAX_DB_SIZE) + { + Clean2pcShmem->db_list[Clean2pcShmem->db_count++] = db_oid; + } + last_db_oid = db_oid; + count_db++; + } + } + break; + default: + break; + } + } + + appendStringInfo(result_str, "\n"); + + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + + CommitTransactionCommand(); + + if (count_2pc > 0) + { + Assert(result_str->data != NULL); + elog(LOG, "query remain 2pc count(%d), db count(%d):\n%s", + count_2pc, count_db, result_str->data); + } +} + +static void +do_clean_2pc(void) +{ + int i = 0; + int count = 0; + MemoryContext oldcontext = NULL; + char query[SQL_CMD_LEN]; + EState *estate = NULL; + RemoteQuery *plan = NULL; + RemoteQueryState *pstate = NULL; + TupleTableSlot *result = NULL; + Var *dummy = NULL; + int attr_num = 4; + static const char *attr_name[] = {"gid", "global_transaction_status", + "operation", "operation_status"}; + TimestampTz clean_time = 0; + + Assert(result_str != NULL); + resetStringInfo(result_str); + + clean_time = GetCurrentTimestamp() - USECS_PER_SEC * auto_clean_2pc_delay; + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" + " limit 1000;", PGXCNodeName, clean_time); + + elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + + StartTransactionCommand(); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_COORDS; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
+ */ + for (i = 1; i <= attr_num; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + InitMultinodeExecutor(false); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + count++; + + for (i = 0; i < attr_num; i++) + { + char *value = text_to_cstring(DatumGetTextP(result->tts_values[i])); + appendStringInfo(result_str, "\t%s: %s", attr_name[i], value); + } + + appendStringInfo(result_str, "\n"); + + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + + CommitTransactionCommand(); + + if (count > 0) + { + Assert(NULL != result_str->data); + elog(LOG, "clean 2pc count(%d):\n%s", count, result_str->data); + } +} + +/* SIGTERM: set flag to exit normally */ +static void +clean_2pc_sigterm_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGTERM: %d", postgres_signal_arg); + got_SIGTERM = true; +} + + +/* SIGHUP: set flag to re-read config file at next convenient time */ +static void +clean_2pc_sighup_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGHUP: %d", postgres_signal_arg); + got_SIGHUP = true; +} + +/* SIGUSR2: used for notify 2pc abnormal */ +static void +clean_2pc_sigusr2_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGUSR2: %d", postgres_signal_arg); + got_SIGUSR2 = true; +} + +/* + * IsClean2pcLauncher functions + * Return whether this is a 2pc clean launcher. + */ +bool +IsClean2pcLauncher(void) +{ + return am_clean_2pc_launcher; +} + +/* + * IsClean2pcWorker functions + * Return whether this is a 2pc clean worker. + */ +bool +IsClean2pcWorker(void) +{ + return am_clean_2pc_worker; +} + +/* + * get_database_list + * Return a list of all databases found in pg_database. + * + * The list and associated data is allocated in the caller's memory context, + * which is in charge of ensuring that it's properly cleaned up afterwards. + * + * Note: this is the only function in which the autovacuum launcher uses a + * transaction. Although we aren't attached to any particular database and + * therefore can't access most catalogs, we do have enough infrastructure + * to do a seqscan on pg_database. 
+ */ +static List * +get_database_list(void) +{ + List *dblist = NIL; + Relation rel; + HeapScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + StartTransactionCommand(); + + rel = heap_open(DatabaseRelationId, AccessShareLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup); + Clean2pcDBInfo *db_info; + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(resultcxt); + + db_info = (Clean2pcDBInfo *) palloc(sizeof(Clean2pcDBInfo)); + + db_info->db_oid = HeapTupleGetOid(tup); + db_info->db_name = pstrdup(NameStr(pgdatabase->datname)); + + dblist = lappend(dblist, db_info); + MemoryContextSwitchTo(oldcxt); + } + + heap_endscan(scan); + heap_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return dblist; +} + +static Oid +get_default_database(void) +{ + Oid default_db = InvalidOid; + Oid template0_db = InvalidOid; + Oid template1_db = InvalidOid; + List *dblist = NULL; + ListCell *cell = NULL; + Clean2pcDBInfo *db_info = NULL; + char *default_db_name = NULL; + + /* Get a list of databases */ + dblist = get_database_list(); + foreach(cell, dblist) + { + db_info = lfirst(cell); + + if (strcmp(db_info->db_name, DB_DEFAULT) == 0) + { + default_db = db_info->db_oid; + default_db_name = db_info->db_name; + break; + } + + if (strcmp(db_info->db_name, DB_TEMPLATE0) == 0) + { + template0_db = db_info->db_oid; + continue; + } + + if (strcmp(db_info->db_name, DB_TEMPLATE1) == 0) + { + template1_db = db_info->db_oid; + continue; + } + + if (!OidIsValid(default_db)) + { + default_db = db_info->db_oid; + default_db_name = db_info->db_name; + } + } + + if (!OidIsValid(default_db)) + { + if (OidIsValid(template1_db)) + { + default_db = template1_db; + default_db_name = DB_TEMPLATE1; + } else if (OidIsValid(template0_db)) + { + default_db = template0_db; + default_db_name = DB_TEMPLATE0; + } + } + + Assert(OidIsValid(default_db)); + + elog(DEBUG2, "get default db: oid(%d), name(%s)", default_db, default_db_name); + + return default_db; +} + +/* + * start query worker to query 2pc + */ +static void +start_query_worker(void) +{ + Oid db_oid = get_default_database(); + if (!OidIsValid(db_oid)) + { + elog(WARNING, "get default database failed"); + return; + } + + Assert(OidIsValid(db_oid)); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + while (Clean2pcShmem->worker_running) + { + LWLockRelease(Clean2pcLock); + + if (got_SIGTERM) + { + proc_exit(0); + } + + pg_usleep(1000000L); /* wait 1s */ + + elog(LOG, "waiting to db(%d)", Clean2pcShmem->worker_db); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + } + + Clean2pcShmem->worker_running = true; + Clean2pcShmem->db_count = 0; + Clean2pcShmem->worker_db = db_oid; + + LWLockRelease(Clean2pcLock); + + SendPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER); + + pg_usleep(1000000L); /* wait 1s */ +} + +/* + * start clean worker to clean 2pc + */ +static void +start_clean_worker(int count) +{ + int i = 0; + + for (i = 0; i < count; i++) + { + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + while (Clean2pcShmem->worker_running) + { + LWLockRelease(Clean2pcLock); + + if (got_SIGTERM) + { + proc_exit(0); + } + + pg_usleep(1000000L); /* wait 1s */ + + elog(LOG, "waiting to db(%d)", Clean2pcShmem->worker_db); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + } + + Clean2pcShmem->worker_db = 
Clean2pcShmem->db_list[i]; + + if (Clean2pcShmem->db_count != count) + { + elog(WARNING, "db_count(%d)!=count(%d)", Clean2pcShmem->db_count, count); + LWLockRelease(Clean2pcLock); + break; + } + + if (!OidIsValid(Clean2pcShmem->worker_db)) + { + elog(WARNING, "get invalid oid, count: %d, i: %d", count, i); + LWLockRelease(Clean2pcLock); + continue; + } + + Clean2pcShmem->worker_running = true; + SendPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER); + + LWLockRelease(Clean2pcLock); + + pg_usleep(1000000L); /* wait 1s */ + } +} + +/* + * on_proc_exit callback to set worker_running to false + */ +static void +ExitCleanRunning(int status, Datum arg) +{ + if (Clean2pcShmem->worker_running) + { + Clean2pcShmem->worker_running = false; + elog(LOG, "2pc clean worker exit abnormally"); + } + else + { + elog(DEBUG5, "2pc clean worker exit normally"); + } +} + +/* + * Clean2pcShmemSize + * Compute space needed for clean 2pc related shared memory + */ +Size +Clean2pcShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of WorkerInfoData. + */ + size = sizeof(Clean2pcShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * Clean2pcShmemInit + * Allocate and initialize clean 2pc related shared memory + */ +void +Clean2pcShmemInit(void) +{ + bool found; + Clean2pcShmem = (Clean2pcShmemStruct *) ShmemInitStruct("Clean 2pc Data", + Clean2pcShmemSize(), + &found); +} + +#ifdef EXEC_BACKEND +/* + * forkexec routine for the 2pc clean launcher process. + * + * Format up the arglist, then fork and exec. + */ +static pid_t +clean_2pc_launcher_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkclean2pclauncher"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * forkexec routine for the 2pc clean worker process. + * + * Format up the arglist, then fork and exec. 
+ */ +static pid_t +clean_2pc_worker_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkclean2pcworker"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +Clean2pcLauncherIAm(void) +{ + am_clean_2pc_launcher = true; +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +Clean2pcWorkerIAm(void) +{ + am_clean_2pc_worker = true; +} +#endif diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 76d4ff19..1286cd1d 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -49,6 +49,7 @@ #include "miscadmin.h" #include "pg_trace.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/fork_process.h" #include "postmaster/postmaster.h" #include "replication/walsender.h" @@ -2902,6 +2903,16 @@ pgstat_bestart(void) /* Autovacuum Worker */ beentry->st_backendType = B_AUTOVAC_WORKER; } + else if (IsClean2pcLauncher()) + { + /* Clean 2pc Launcher */ + beentry->st_backendType = B_CLEAN_2PC_LAUNCHER; + } + else if (IsClean2pcWorker()) + { + /* Clean 2pc Worker */ + beentry->st_backendType = B_CLEAN_2PC_WORKER; + } else if (am_walsender) { /* Wal sender */ @@ -4191,6 +4202,12 @@ pgstat_get_backend_desc(BackendType backendType) case B_PGXL_CLUSTER_MONITOR: backendDesc = "cluster monitor"; break; + case B_CLEAN_2PC_LAUNCHER: + backendDesc = "2pc clean launcher"; + break; + case B_CLEAN_2PC_WORKER: + backendDesc = "2pc clean worker"; + break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index c3fe228d..10be77cd 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -120,6 +120,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/clean2pc.h" #include "postmaster/fork_process.h" #include "postmaster/pgarch.h" #include "postmaster/postmaster.h" @@ -312,6 +313,7 @@ static pid_t StartupPID = 0, WalWriterPID = 0, WalReceiverPID = 0, AutoVacPID = 0, + Clean2pcPID = 0, PgArchPID = 0, PgStatPID = 0, #ifdef __TBASE__ @@ -2025,6 +2027,12 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } + if (IS_PGXC_COORDINATOR && Clean2pcPID == 0 && + pmState == PM_RUN && enable_clean_2pc_launcher) + { + Clean2pcPID = StartClean2pcLauncher(); + } + /* If we have lost the stats collector, try to start a new one */ if (PgStatPID == 0 && (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) @@ -2861,6 +2869,8 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(WalReceiverPID, SIGHUP); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGHUP); + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, SIGHUP); if (PgArchPID != 0) signal_child(PgArchPID, SIGHUP); if (SysLoggerPID != 0) @@ -2966,6 +2976,9 @@ pmdie(SIGNAL_ARGS) /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + /* and the clean 2pc launcher too */ + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, SIGTERM); /* and the bgwriter too */ if (BgWriterPID != 0) signal_child(BgWriterPID, SIGTERM); @@ -3093,6 +3106,9 @@ pmdie(SIGNAL_ARGS) /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + /* and the clean 2pc launcher too */ + if (Clean2pcPID != 0) + 
signal_child(Clean2pcPID, SIGTERM); /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); @@ -3272,6 +3288,9 @@ reaper(SIGNAL_ARGS) */ if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0) AutoVacPID = StartAutoVacLauncher(); + if (IS_PGXC_COORDINATOR && Clean2pcPID == 0 && + pmState == PM_RUN && enable_clean_2pc_launcher) + Clean2pcPID = StartClean2pcLauncher(); if (PgArchStartupAllowed() && PgArchPID == 0) PgArchPID = pgarch_start(); if (PgStatPID == 0) @@ -3431,6 +3450,21 @@ reaper(SIGNAL_ARGS) } /* + * Was it the clean 2pc launcher? Normal exit can be ignored; we'll + * start a new one at the next iteration of the postmaster's main + * loop, if necessary. Any other exit condition is treated as a + * crash. + */ + if (pid == Clean2pcPID) + { + Clean2pcPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("clean 2pc launcher process")); + continue; + } + + /* * Was it the archiver? If so, just try to start a new one; no need * to force reset of the rest of the system. (If fail, we'll try * again in future cycles of the main loop.). Unless we were waiting @@ -4001,6 +4035,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the clean 2pc process too */ + if (pid == Clean2pcPID) + Clean2pcPID = 0; + else if (Clean2pcPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) Clean2pcPID))); + signal_child(Clean2pcPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + #ifdef PGXC /* Take care of the pool manager too */ if (pid == PgPoolerPID) @@ -4237,6 +4283,7 @@ PostmasterStateMachine(void) (CheckpointerPID == 0 || (!FatalError && Shutdown < ImmediateShutdown)) && WalWriterPID == 0 && + Clean2pcPID == 0 && AutoVacPID == 0) { if (Shutdown >= ImmediateShutdown || FatalError) @@ -4345,6 +4392,7 @@ PostmasterStateMachine(void) Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); + Assert(Clean2pcPID == 0); /* syslogger is not considered here */ pmState = PM_NO_CHILDREN; } @@ -4558,6 +4606,8 @@ TerminateChildren(int signal) signal_child(WalReceiverPID, signal); if (AutoVacPID != 0) signal_child(AutoVacPID, signal); + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, signal); if (PgArchPID != 0) signal_child(PgArchPID, signal); if (PgStatPID != 0) @@ -5417,6 +5467,12 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkavworker") == 0) AutovacuumWorkerIAm(); + /* clean 2pc needs this set before calling InitProcess */ + if (strcmp(argv[1], "--forkclean2pclauncher") == 0) + Clean2pcLauncherIAm(); + if (strcmp(argv[1], "--forkclean2pcworker") == 0) + Clean2pcWorkerIAm(); + /* * Start our win32 signal implementation. 
This has to be done after we * read the backend variables, because we need to pick up the signal pipe @@ -5534,6 +5590,32 @@ SubPostmasterMain(int argc, char *argv[]) AutoVacWorkerMain(argc - 2, argv + 2); /* does not return */ } + if (strcmp(argv[1], "--forkclean2pclauncher") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(false, 0); + + Clean2pcLauncherMain(argc - 2, argv + 2); /* does not return */ + } + if (strcmp(argv[1], "--forkclean2pcworker") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(false, 0); + + Clean2pcWorkerMain(argc - 2, argv + 2); /* does not return */ + } if (strncmp(argv[1], "--forkbgworker=", 15) == 0) { int shmem_slot; @@ -5782,6 +5864,20 @@ sigusr1_handler(SIGNAL_ARGS) StartAutovacuumWorker(); } + if (CheckPostmasterSignal(PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER) && + Shutdown == NoShutdown && Clean2pcPID != 0) + { + /* send SIGUSR2 to clean 2pc launcher to trigger clean */ + signal_child(Clean2pcPID, SIGUSR2); + } + + if (CheckPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER) && + Shutdown == NoShutdown) + { + /* The clean 2pc launcher wants us to start a worker process. */ + StartClean2pcWorker(); + } + if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) { /* Startup Process wants us to start the walreceiver process. */ diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3cdb9063..ae1a9029 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -90,6 +90,7 @@ #include "postmaster/clustermon.h" #endif #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" @@ -245,6 +246,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, Clean2pcShmemSize()); #ifdef XCP if (IS_PGXC_DATANODE) size = add_size(size, SharedQueueShmemSize()); @@ -421,6 +423,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) WalRcvShmemInit(); ApplyLauncherShmemInit(); + Clean2pcShmemInit(); + #ifdef XCP /* * Set up distributed executor's shared queues diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index ba4cfcbf..320a55e9 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -76,4 +76,5 @@ DualWriteLock 58 #ifdef __TBASE__ AnalyzeInfoLock 59 UserAuthLock 60 +Clean2pcLock 61 #endif \ No newline at end of file diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 9ae8494f..9288b715 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -42,6 +42,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #ifdef PGXC #include "pgxc/pgxc.h" #include "pgxc/poolmgr.h" @@ -63,6 +64,10 @@ #include "storage/lock.h" #endif +#define AUTOVAC_LAUNCHER_NUM 1 +#define CLEAN_2PC_LAUNCHER_NUM 1 +#define CLEAN_2PC_WORKER_NUM 3 + /* 
GUC variables */ int DeadlockTimeout = 1000; int StatementTimeout = 0; @@ -187,6 +192,7 @@ InitProcGlobal(void) ProcGlobal->freeProcs = NULL; ProcGlobal->autovacFreeProcs = NULL; ProcGlobal->bgworkerFreeProcs = NULL; + ProcGlobal->clean2pcFreeProcs = NULL; ProcGlobal->startupProc = NULL; ProcGlobal->startupProcPid = 0; ProcGlobal->startupBufferPinWaitBufId = -1; @@ -256,13 +262,21 @@ InitProcGlobal(void) ProcGlobal->freeProcs = &procs[i]; procs[i].procgloballist = &ProcGlobal->freeProcs; } - else if (i < MaxConnections + autovacuum_max_workers + 1) + else if (i < MaxConnections + autovacuum_max_workers + AUTOVAC_LAUNCHER_NUM) { /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs; ProcGlobal->autovacFreeProcs = &procs[i]; procs[i].procgloballist = &ProcGlobal->autovacFreeProcs; } + else if (i < MaxConnections + autovacuum_max_workers + AUTOVAC_LAUNCHER_NUM + + CLEAN_2PC_LAUNCHER_NUM + CLEAN_2PC_WORKER_NUM) + { + /* PGPROC for clean 2pc, add to clean2pcFreeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->clean2pcFreeProcs; + ProcGlobal->clean2pcFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->clean2pcFreeProcs; + } else if (i < MaxBackends) { /* PGPROC for bgworker, add to bgworkerFreeProcs list */ @@ -314,6 +328,8 @@ InitProcess(void) procgloballist = &ProcGlobal->autovacFreeProcs; else if (IsBackgroundWorker) procgloballist = &ProcGlobal->bgworkerFreeProcs; + else if (IsAnyClean2pcProcess()) + procgloballist = &ProcGlobal->clean2pcFreeProcs; else procgloballist = &ProcGlobal->freeProcs; @@ -362,7 +378,7 @@ InitProcess(void) * cleaning up. (XXX autovac launcher currently doesn't participate in * this; it probably should.) */ - if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess() && !IsAnyClean2pcProcess()) MarkPostmasterChildActive(); /* @@ -921,7 +937,7 @@ ProcKill(int code, Datum arg) * way, so tell the postmaster we've cleaned up acceptably well. (XXX * autovac launcher should be included here someday) */ - if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess() && !IsAnyClean2pcProcess()) MarkPostmasterChildInactive(); /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */ diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index ba4c192d..460061f3 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -45,6 +45,7 @@ #endif #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/postmaster.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -612,7 +613,7 @@ InitializeSessionUserIdStandalone(void) * This function should only be called in single-user mode, in autovacuum * workers, and in background workers. 
*/ - AssertState(!IsUnderPostmaster || IsAutoVacuumWorkerProcess() || IsBackgroundWorker); + AssertState(!IsUnderPostmaster || IsAutoVacuumWorkerProcess() || IsBackgroundWorker || IsClean2pcWorker()); /* call only once */ AssertState(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index bfe9572d..0d529949 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -42,6 +42,7 @@ #include "postmaster/clustermon.h" #endif #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "postmaster/postmaster.h" #include "replication/walsender.h" @@ -333,7 +334,7 @@ CheckMyDatabase(const char *name, bool am_superuser) * * We do not enforce them for autovacuum worker processes either. */ - if (IsUnderPostmaster && !IsAutoVacuumWorkerProcess()) + if (IsUnderPostmaster && !IsAutoVacuumWorkerProcess() && !IsClean2pcWorker()) { /* * Check that the database is currently allowing connections. @@ -691,7 +692,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, before_shmem_exit(ShutdownPostgres, 0); /* The autovacuum launcher is done here */ - if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess()) + if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess() || IsClean2pcLauncher()) { /* report this backend in the PgBackendStatus array */ pgstat_bestart(); @@ -731,7 +732,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * In standalone mode and in autovacuum worker processes, we use a fixed * ID, otherwise we figure it out from the authenticated user name. */ - if (bootstrap || IsAutoVacuumWorkerProcess()) + if (bootstrap || IsAutoVacuumWorkerProcess() || IsClean2pcWorker()) { InitializeSessionUserIdStandalone(); am_superuser = true; @@ -1020,7 +1021,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ RelationCacheInitializePhase3(); #ifdef _MLS_ - if (bootstrap || IsAutoVacuumWorkerProcess() || !IsUnderPostmaster || IsBackgroundWorker) + if (bootstrap || IsAutoVacuumWorkerProcess() || IsClean2pcWorker() || !IsUnderPostmaster || IsBackgroundWorker) { ; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 42619d16..56abdb1a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -85,6 +85,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/clean2pc.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" #include "postmaster/walwriter.h" @@ -2726,8 +2727,27 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_error_stop", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC stop when commit prepared error."), + NULL + }, + &enable_2pc_error_stop, + false, + NULL, NULL, NULL + }, #endif + { + {"enable_clean_2pc_launcher", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable clean 2PC launcher."), + NULL + }, + &enable_clean_2pc_launcher, + true, + NULL, NULL, NULL + }, + #ifdef __TBASE__ { {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, @@ -4710,7 +4730,7 @@ static struct config_int ConfigureNamesInt[] = NULL }, &run_pg_clean, - 0, 0, 1, + 0, 0, 10, NULL, NULL, NULL }, #endif @@ -4833,6 +4853,28 @@ static struct config_int ConfigureNamesInt[] = }, #endif + { + {"auto_clean_2pc_interval", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc interval"), + NULL, + 
GUC_UNIT_S + }, + &auto_clean_2pc_interval, + 30, 1, 3600, + NULL, NULL, NULL + }, + + { + {"auto_clean_2pc_delay", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc delay"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_delay, + 3, 1, 600, + NULL, NULL, NULL + }, + { {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("reconnect gtm retry times"), diff --git a/src/include/access/xact.h b/src/include/access/xact.h index a06c14d4..0e312662 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -59,6 +59,10 @@ extern bool XactReadOnly; extern bool GTM_ReadOnly; #endif +#ifdef __TWO_PHASE_TRANS__ +extern bool enable_2pc_error_stop; +#endif + /* * Xact is deferrable -- only meaningful (currently) for read only * SERIALIZABLE transactions diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 2049d855..7976c39b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -719,7 +719,9 @@ typedef enum BackendType B_WAL_SENDER, B_WAL_WRITER, B_PGXL_CLUSTER_MONITOR, - B_PGXL_POOLER + B_PGXL_POOLER, + B_CLEAN_2PC_LAUNCHER, + B_CLEAN_2PC_WORKER, } BackendType; diff --git a/src/include/postmaster/clean2pc.h b/src/include/postmaster/clean2pc.h new file mode 100644 index 00000000..1d6df23f --- /dev/null +++ b/src/include/postmaster/clean2pc.h @@ -0,0 +1,43 @@ +/*-------------------------------------------------------------------- + * clean2pc.h + * A clean 2pc process is a process able to clean the abnormal 2pc. + * + * + * Portions Copyright (c) 1996-2021, TDSQL-PG Development Group + * + * IDENTIFICATION + * src/include/postmaster/clean2pc.h + *-------------------------------------------------------------------- + */ +#ifndef CLEAN2PC_H +#define CLEAN2PC_H + +#include "storage/block.h" + +extern bool enable_clean_2pc_launcher; + +extern int auto_clean_2pc_interval; +extern int auto_clean_2pc_delay; + +extern bool IsClean2pcLauncher(void); +extern bool IsClean2pcWorker(void); + +#define IsAnyClean2pcProcess() \ + (IsClean2pcLauncher() || IsClean2pcWorker()) + +extern int StartClean2pcLauncher(void); +extern int StartClean2pcWorker(void); + +#ifdef EXEC_BACKEND +extern void Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); +extern void Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); + +extern void Clean2pcLauncherIAm(void); +extern void Clean2pcWorkerIAm(void); +#endif + +/* shared memory stuff */ +extern Size Clean2pcShmemSize(void); +extern void Clean2pcShmemInit(void); + +#endif /* CLEAN2PC_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index d186137b..3adcb74a 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pmsignal.h - * routines for signaling the postmaster from its child processes + * routines for signaling the postmaster from its child processes * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,23 +22,25 @@ */ typedef enum { - PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ - PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ - PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ - PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ - PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ - PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */ - PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ - 
PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ - PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ + PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ + PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ + PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ + PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ + PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ + PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */ + PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ + PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ + PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ #ifdef __AUDIT__ - PMSIGNAL_ROTATE_AUDIT_LOGFILE, /* send SIGUSR1 to audit logger to rotate logfile */ - PMSIGNAL_WAKEN_AUDIT_LOGGER, /* send SIGUSR2 to audit logger to read audit log */ + PMSIGNAL_ROTATE_AUDIT_LOGFILE, /* send SIGUSR1 to audit logger to rotate logfile */ + PMSIGNAL_WAKEN_AUDIT_LOGGER, /* send SIGUSR2 to audit logger to read audit log */ #endif #ifdef __AUDIT_FGA__ PMSIGNAL_WAKEN_AUDIT_FGA_TRIGGER, /*send SIGUSR1 to audit fga bgworker to trigger function */ #endif - NUM_PMSIGNALS /* Must be last value of enum! */ + PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER, /* send SIGUSR2 to clean 2pc launcher to trigger clean */ + PMSIGNAL_START_CLEAN_2PC_WORKER, /* start an clean 2pc worker */ + NUM_PMSIGNALS /* Must be last value of enum! */ } PMSignalReason; /* PMSignalData is an opaque struct, details known only within pmsignal.c */ @@ -51,7 +53,7 @@ extern Size PMSignalShmemSize(void); extern void PMSignalShmemInit(void); extern void SendPostmasterSignal(PMSignalReason reason); extern bool CheckPostmasterSignal(PMSignalReason reason); -extern int AssignPostmasterChildSlot(void); +extern int AssignPostmasterChildSlot(void); extern bool ReleasePostmasterChildSlot(int slot); extern bool IsPostmasterChildWalSender(int slot); extern void MarkPostmasterChildActive(void); @@ -59,4 +61,4 @@ extern void MarkPostmasterChildInactive(void); extern void MarkPostmasterChildWalSender(void); extern bool PostmasterIsAlive(void); -#endif /* PMSIGNAL_H */ +#endif /* PMSIGNAL_H */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 48601659..32dea324 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -331,6 +331,8 @@ typedef struct PROC_HDR PGPROC *autovacFreeProcs; /* Head of list of bgworker free PGPROC structures */ PGPROC *bgworkerFreeProcs; + /* Head of list of clean 2pc process free PGPROC structures */ + PGPROC *clean2pcFreeProcs; /* First pgproc waiting for group XID clear */ pg_atomic_uint32 procArrayGroupFirst; /* WALWriter process's latch */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 58642165..4c0e6f5c 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -74,6 +74,7 @@ select name, setting from pg_settings where name like 'enable%'; -----------------------------------+--------- enable_2pc_entry_key_check | on enable_2pc_entry_trace | off + enable_2pc_error_stop | off enable_2pc_file_cache | on enable_2pc_file_check | off enable_2pc_recovery_info | on @@ -83,6 +84,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_bitmapscan | on enable_buffer_mprotect | on enable_check_password | off + enable_clean_2pc_launcher | on enable_clog_mprotect | on enable_cls | on enable_cold_hot_router_print | off @@ -141,7 +143,7 @@ 
select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(68 rows) +(70 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 898d391e52ec1e88446470fdf46e6518fe537e4e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 25 Aug 2021 17:11:26 +0800 Subject: [PATCH 419/578] fix could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131091148837 (merge request !621) --- src/backend/utils/cache/plancache.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 94579c4e..240a4f9d 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -2112,6 +2112,11 @@ SetRemoteSubplan(CachedPlanSource *plansource, const char *plan_string) */ PG_TRY(); { + /* + * Check for shared-cache-inval messages before restoring query plan, + * avoid oid conversion and other operations to find old data. + */ + AcceptInvalidationMessages(); set_portable_input(true); rstmt = (RemoteStmt *) stringToNode((char *) plan_string); } From 311302b2c1d4991e8bd14e3f4d1418f1c1cef663 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 12 Oct 2021 17:35:23 +0800 Subject: [PATCH 420/578] 2pc stop opt: add clean 2pc process (merge request 656), automatic test bugfix --- src/backend/postmaster/clean2pc.c | 63 +++++++++++++++++++++++-------- src/backend/utils/misc/guc.c | 11 ++++++ src/include/postmaster/clean2pc.h | 1 + 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index 80ab7103..5e0ceaf2 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -58,6 +58,7 @@ bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 10; int auto_clean_2pc_delay = 3; +int auto_clean_2pc_timeout = 0; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; @@ -79,11 +80,11 @@ Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); NON_EXEC_STATIC void Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); -static void start_query_worker(void); +static void start_query_worker(TimestampTz clean_time); static void start_clean_worker(int count); -static void do_query_2pc(void); -static void do_clean_2pc(void); +static void do_query_2pc(TimestampTz clean_time); +static void do_clean_2pc(TimestampTz clean_time); static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); @@ -103,6 +104,8 @@ typedef struct Clean2pcDBInfo typedef struct { + TimestampTz clean_time; + bool worker_running; Oid worker_db; @@ -157,6 +160,7 @@ NON_EXEC_STATIC void Clean2pcLauncherMain(int argc, char *argv[]) { int wait_time = 0; + TimestampTz clean_time = GetCurrentTimestamp(); am_clean_2pc_launcher = true; @@ -239,13 +243,15 @@ Clean2pcLauncherMain(int argc, char *argv[]) if (got_SIGUSR2) { - elog(LOG, "2pc clean launcher got SIGUSR2"); got_SIGUSR2 = false; + clean_time = GetCurrentTimestamp(); wait_time = auto_clean_2pc_delay; + elog(LOG, "2pc clean launcher got SIGUSR2, clean_time: " + INT64_FORMAT, clean_time); continue; } - start_query_worker(); + start_query_worker(clean_time); if (got_SIGTERM || got_SIGHUP || got_SIGUSR2) { @@ -371,13 +377,13 @@ Clean2pcWorkerMain(int argc, char *argv[]) if 
(Clean2pcShmem->db_count == 0) { elog(DEBUG5, "query 2pc from db: %s", db_name); - do_query_2pc(); + do_query_2pc(Clean2pcShmem->clean_time); clean_db_count = Clean2pcShmem->db_count; } else { elog(LOG, "clean 2pc for db: %s", db_name); - do_clean_2pc(); + do_clean_2pc(Clean2pcShmem->clean_time); } Clean2pcShmem->worker_running = false; @@ -394,7 +400,7 @@ Clean2pcWorkerMain(int argc, char *argv[]) } static void -do_query_2pc(void) +do_query_2pc(TimestampTz clean_time) { int i = 0; int count_db = 0; @@ -412,6 +418,8 @@ do_query_2pc(void) TupleTableSlot *result = NULL; Var *dummy = NULL; int attr_num = 4; + int64 check_time = 0; + TimestampTz curr_time = GetCurrentTimestamp(); static const char *attr_name[] = {"gid", "database", "global_transaction_status", "transaction_status_on_allnodes"}; @@ -419,10 +427,25 @@ do_query_2pc(void) Assert(result_str != NULL); resetStringInfo(result_str); - snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(%d) " - "order by database limit 1000;", auto_clean_2pc_delay); + check_time = (curr_time - clean_time)/USECS_PER_SEC; - elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + if (check_time < 0) + { + elog(WARNING, "Invalid check_time: " INT64_FORMAT + ", curr_time: " INT64_FORMAT ", clean_time: " INT64_FORMAT, + check_time, curr_time, clean_time); + return; + } + + if (check_time > INT32_MAX) + { + check_time = INT32_MAX; + } + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(" + INT64_FORMAT ") order by database limit 1000;", check_time); + + elog(DEBUG1, "node(%d) query: %s", PGXCNodeId, query); StartTransactionCommand(); @@ -544,7 +567,7 @@ do_query_2pc(void) } static void -do_clean_2pc(void) +do_clean_2pc(TimestampTz clean_time) { int i = 0; int count = 0; @@ -558,13 +581,10 @@ do_clean_2pc(void) int attr_num = 4; static const char *attr_name[] = {"gid", "global_transaction_status", "operation", "operation_status"}; - TimestampTz clean_time = 0; Assert(result_str != NULL); resetStringInfo(result_str); - clean_time = GetCurrentTimestamp() - USECS_PER_SEC * auto_clean_2pc_delay; - snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" " limit 1000;", PGXCNodeName, clean_time); @@ -798,7 +818,7 @@ get_default_database(void) * start query worker to query 2pc */ static void -start_query_worker(void) +start_query_worker(TimestampTz clean_time) { Oid db_oid = get_default_database(); if (!OidIsValid(db_oid)) @@ -809,8 +829,19 @@ start_query_worker(void) Assert(OidIsValid(db_oid)); + if (auto_clean_2pc_timeout != 0) + { + TimestampTz curr_time = GetCurrentTimestamp(); + if (curr_time - clean_time > auto_clean_2pc_timeout * USECS_PER_SEC) + { + clean_time = curr_time - auto_clean_2pc_timeout * USECS_PER_SEC; + } + } + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + Clean2pcShmem->clean_time = clean_time; + while (Clean2pcShmem->worker_running) { LWLockRelease(Clean2pcLock); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 56abdb1a..f1e0d2f7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4875,6 +4875,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"auto_clean_2pc_timeout", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc timeout"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_timeout, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("reconnect gtm retry times"), diff --git a/src/include/postmaster/clean2pc.h 
b/src/include/postmaster/clean2pc.h index 1d6df23f..2d94442b 100644 --- a/src/include/postmaster/clean2pc.h +++ b/src/include/postmaster/clean2pc.h @@ -18,6 +18,7 @@ extern bool enable_clean_2pc_launcher; extern int auto_clean_2pc_interval; extern int auto_clean_2pc_delay; +extern int auto_clean_2pc_timeout; extern bool IsClean2pcLauncher(void); extern bool IsClean2pcWorker(void); From eb008dcaa3064d7ec67f6016408ace9835163b7b Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 9 Oct 2021 20:27:13 +0800 Subject: [PATCH 421/578] fix ddl failed in non leader cn http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131093044435&jump_count=1 --- src/backend/pgxc/pool/pgxcnode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5b36f087..4d93ec43 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5877,6 +5877,7 @@ delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, { int co_conn_count = 0; int i = 0; + bool find_leader_handle = false; if (!pgxc_connections || !leader_cn_handle) return; @@ -5884,14 +5885,18 @@ delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, co_conn_count = pgxc_connections->co_conn_count; for (i = 0; i < co_conn_count; i++) { - if (pgxc_connections->coord_handles[i] == leader_cn_handle) + if (pgxc_connections->coord_handles[i] == leader_cn_handle || find_leader_handle) { if (i+1 < co_conn_count) pgxc_connections->coord_handles[i] = pgxc_connections->coord_handles[i+1]; else pgxc_connections->coord_handles[i] = NULL; + + if (!find_leader_handle) + { pgxc_connections->co_conn_count--; - break; + find_leader_handle = true; + } } } } From b40778324bec0c279e26e808cf46c246e04a1917 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 18 Oct 2021 18:09:18 +0800 Subject: [PATCH 422/578] Revert a wrong code causing concurrent UPDATE of partition table coredump tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696093297211 --- src/backend/executor/execMain.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d30ee629..ca60ff43 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3677,7 +3677,6 @@ EvalPlanQualEnd(EPQState *epqstate) /* Mark EPQState idle */ epqstate->estate = NULL; - epqstate->parentestate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; } From 3fcef588f29556f95e17140605e79910e66f2609 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 24 Oct 2021 08:42:25 +0800 Subject: [PATCH 423/578] save --- src/backend/optimizer/util/pgxcship.c | 4 +- src/backend/pgxc/locator/locator.c | 356 ++++++++++++++++++++++++-- src/include/optimizer/pgxcship.h | 1 + src/test/regress/sql/xc_FQS.sql | 21 +- 4 files changed, 347 insertions(+), 35 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index b749e028..6ad269f9 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -117,8 +117,6 @@ static bool pgxc_is_func_shippable(Oid funcid); /* Check equijoin conditions on given relations */ static Expr *pgxc_find_dist_equijoin_qual(Relids varnos_1, Relids varnos_2, Oid distcol_type, Node *quals, List *rtable); -/* Merge given execution nodes based on join shippability conditions */ -static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); /* Check if given Query includes distribution column 
*/ static bool pgxc_query_has_distcolgrouping(Query *query); @@ -2485,7 +2483,7 @@ pgxc_find_dist_equi_nodes(Relids varnos_1, * exec_node corresponds to the JOIN of respective relations. * If both exec_nodes can not be merged, it returns NULL. */ -static ExecNodes * +ExecNodes * pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2) {// #lizard forgives ExecNodes *merged_en = makeNode(ExecNodes); diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 20abfd91..3ed59f90 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -41,6 +41,7 @@ #include "utils/varbit.h" #include "nodes/nodes.h" #include "optimizer/clauses.h" +#include "optimizer/pgxcship.h" #include "parser/parse_coerce.h" #include "pgxc/nodemgr.h" #include "pgxc/locator.h" @@ -51,13 +52,19 @@ #include "catalog/pgxc_node.h" #include "catalog/namespace.h" #include "access/hash.h" + #ifdef XCP + #include "utils/date.h" #include "utils/memutils.h" + #ifdef __COLD_HOT__ + #include "catalog/pgxc_key_values.h" #include "pgxc/shardmap.h" + #endif + /* * Locator details are private */ @@ -73,6 +80,7 @@ struct _Locator Datum secValue, bool secIsNull, #endif bool *hasprimary); + Oid dataType; /* values of that type are passed to locateNodes function */ LocatorListType listType; bool primary; @@ -100,6 +108,7 @@ struct _Locator void *nodeMap; /* map index to node reference according to listType */ void *results; /* array to output results */ }; + #endif #ifdef __COLD_HOT__ @@ -116,38 +125,47 @@ int num_preferred_data_nodes = 0; Oid preferred_data_node[MAX_PREFERRED_NODES]; #ifdef XCP + static int modulo_value_len(Oid dataType); + static int locate_static(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_roundrobin(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_modulo_random(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_hash_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_hash_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + #ifdef _MIGRATE_ + static int locate_shard_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_shard_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, @@ -155,22 +173,26 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, bool *hasprimary); #endif + static int locate_modulo_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_modulo_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static Expr * pgxc_find_distcol_expr(Index varno, AttrNumber attrNum, Node *quals); #ifdef __COLD_HOT__ + static List * pgxc_find_distcol_exprs(Index varno, AttrNumber attrNum, Node *quals); @@ -189,6 +211,7 @@ static ExecNodes *GetRelationTimeStampRangeNodes(RelationLocInfo *rel_loc_info, static bool IsConstAligned(Oid reloid, Datum constvalue, AttrNumber secAttr); static bool TimeStampRange(Oid 
op); + #endif #endif @@ -218,14 +241,20 @@ GetPreferredReplicationNode(List *relNodes) { if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes], &nodetype) == lfirst_int(item)) + { nodeid = lfirst_int(item); } + } if (nodeid >= 0) + { break; } + } if (nodeid < 0) + { return list_make1_int(list_nth_int(relNodes, ((unsigned int) random()) % list_length(relNodes))); + } return list_make1_int(nodeid); } @@ -249,15 +278,19 @@ GetAnyDataNode(Bitmapset *nodes) /* OK, found one */ if (bms_is_member(nodeid, nodes)) + { preferred = bms_add_member(preferred, nodeid); } + } /* * If no preferred data nodes or they are not in the desired set, pick up * from the original set. */ if (bms_is_empty(preferred)) + { preferred = bms_copy(nodes); + } /* * Load balance. @@ -269,7 +302,9 @@ GetAnyDataNode(Bitmapset *nodes) /* If there is a single member nothing to balance */ if (nmembers == 1) + { return members[0]; + } /* * In general, the set may contain any number of nodes, and if we save @@ -307,12 +342,15 @@ char *pColName; pColName = GetRelationHashColumn(rel_loc_info); if (pColName == NULL) + { pColName = GetRelationModuloColumn(rel_loc_info); + } return pColName; } #ifdef _MIGRATE_ + /* * IsTypeDistributable * Returns whether the data type is distributable using a column value. @@ -350,10 +388,13 @@ IsTypeDistributable(Oid col_type) || col_type == NVARCHAR2OID #endif ) + { return true; + } return false; } + #endif /* @@ -377,9 +418,13 @@ GetRelationHashColumn(RelationLocInfo * rel_loc_info) char *column_str = NULL; if (rel_loc_info == NULL) + { column_str = NULL; + } else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + { column_str = NULL; + } else { int len = strlen(rel_loc_info->partAttrName); @@ -402,15 +447,21 @@ IsDistColumnForRelId(Oid relid, char *part_col_name) /* if no column is specified, we're done */ if (!part_col_name) + { return false; + } /* if no locator, we're done too */ if (!(rel_loc_info = GetRelationLocInfo(relid))) + { return false; + } /* is the table distributed by column value */ if (!IsRelationDistributedByValue(rel_loc_info)) + { return false; + } /* does the column name match the distribution column */ return !strcmp(part_col_name, rel_loc_info->partAttrName); @@ -438,9 +489,13 @@ GetRelationModuloColumn(RelationLocInfo * rel_loc_info) char *column_str = NULL; if (rel_loc_info == NULL) + { column_str = NULL; + } else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO) + { column_str = NULL; + } else { int len = strlen(rel_loc_info->partAttrName); @@ -471,10 +526,14 @@ GetRoundRobinNode(Oid relid) /* Move round robin indicator to next node */ if (rel->rd_locator_info->roundRobinNode->next != NULL) + { rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; + } else + { /* reset to first one */ rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->rl_nodeList->head; + } relation_close(rel, AccessShareLock); @@ -494,14 +553,18 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info) if (!OidIsValid(primary_data_node) || rel_loc_info == NULL || list_length(rel_loc_info->rl_nodeList = 0)) + { return false; + } foreach(item, rel_loc_info->rl_nodeList) { char ntype = PGXC_NODE_DATANODE; if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item)) + { return true; } + } return false; } @@ -521,20 +584,28 @@ IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info /* Same relation? */ if (rel_loc_info1->relid != rel_loc_info2->relid) + { return false; + } /* Same locator type? 
*/ if (rel_loc_info1->locatorType != rel_loc_info2->locatorType) + { return false; + } /* Same attribute number? */ if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum) + { return false; + } /* Same node list? */ if (list_difference_int(nodeList1, nodeList2) != NIL || list_difference_int(nodeList2, nodeList1) != NIL) + { return false; + } /* Everything is equal */ return true; @@ -592,7 +663,9 @@ GetLocatorType(Oid relid) RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid); if (ret_loc_info != NULL) + { ret = ret_loc_info->locatorType; + } return ret; } @@ -634,8 +707,10 @@ GetAllCoordNodes(void) */ if (i != PGXCNodeId - 1) + { nodeList = lappend_int(nodeList, i); } + } return nodeList; } @@ -727,7 +802,8 @@ RelationBuildLocator(Relation rel) curr_nodeoid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); if (InvalidOid == curr_nodeoid) { - elog(ERROR, "no such node:%s on PGXCMainClusterName %s PGXCClustername %s", PGXCNodeName, PGXCMainClusterName, PGXCClusterName); + elog(ERROR, "no such node:%s on PGXCMainClusterName %s PGXCClustername %s", PGXCNodeName, PGXCMainClusterName, + PGXCClusterName); } node_in_group = DatanodeInGroup(&(pgxc_class->nodeoids), curr_nodeoid); @@ -761,7 +837,8 @@ RelationBuildLocator(Relation rel) GetShardNodes(pgxc_class->pcoldgroup, &datanodes, &dn_num, NULL); for(j = 0; j < dn_num; j++) { - relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, datanodes[j]); + relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, + datanodes[j]); } pfree(datanodes); } @@ -780,7 +857,8 @@ RelationBuildLocator(Relation rel) GetShardNodes(groups[i], &datanodes, &dn_num, NULL); for(j = 0; j < dn_num; j++) { - relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, datanodes[j]); + relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, + datanodes[j]); } pfree(datanodes); } @@ -849,7 +927,9 @@ GetRelationLocInfo(Oid relid) Assert(rel->rd_isvalid); if (rel->rd_locator_info) + { ret_loc_info = CopyRelationLocInfo(rel->rd_locator_info); + } relation_close(rel, AccessShareLock); @@ -864,7 +944,9 @@ GetRelationLocType(Oid relid) { RelationLocInfo *locinfo = GetRelationLocInfo(relid); if (!locinfo) + { return LOCATOR_TYPE_NONE; + } return locinfo->locatorType; } @@ -885,7 +967,9 @@ CopyRelationLocInfo(RelationLocInfo * src_info) dest_info->locatorType = src_info->locatorType; dest_info->partAttrNum = src_info->partAttrNum; if (src_info->partAttrName) + { dest_info->partAttrName = pstrdup(src_info->partAttrName); + } #ifdef _MIGRATE_ dest_info->groupId = src_info->groupId; #endif @@ -898,7 +982,9 @@ CopyRelationLocInfo(RelationLocInfo * src_info) } #endif if (src_info->rl_nodeList) + { dest_info->rl_nodeList = list_copy(src_info->rl_nodeList); + } /* Note, for round robin, we use the relcache entry */ return dest_info; @@ -914,11 +1000,15 @@ FreeRelationLocInfo(RelationLocInfo *relationLocInfo) if (relationLocInfo) { if (relationLocInfo->partAttrName) + { pfree(relationLocInfo->partAttrName); + } #ifdef __COLD_HOT__ if (relationLocInfo->secAttrName) + { pfree(relationLocInfo->secAttrName); + } #endif list_free(relationLocInfo->rl_nodeList); @@ -937,7 +1027,9 @@ FreeExecNodes(ExecNodes **exec_nodes) /* Nothing to do */ if (!tmp_en) + { return; + } list_free(tmp_en->primarynodelist); list_free(tmp_en->nodeList); pfree(tmp_en); @@ -946,6 +1038,7 @@ FreeExecNodes(ExecNodes **exec_nodes) #ifdef XCP + /* * Determine value length in bytes for specified 
type for a module locator. * Return -1 if module locator is not supported for the type. @@ -1038,6 +1131,7 @@ hash_func_ptr(Oid dataType) } #ifdef _MIGRATE_ + Locator * createLocator(char locatorType, RelationAccessType accessType, Oid dataType, LocatorListType listType, int nodeCount, @@ -1103,8 +1197,7 @@ createLocator(char locatorType, RelationAccessType accessType, int *intptr; nodeMap = palloc(locator->nodeCount * sizeof(int)); intptr = (int *) nodeMap; - foreach(lc, l) - *intptr++ = lfirst_int(lc); + foreach(lc, l) *intptr++ = lfirst_int(lc); locator->listType = LOCATOR_LIST_INT; } else if (IsA(l, OidList)) @@ -1112,8 +1205,7 @@ createLocator(char locatorType, RelationAccessType accessType, Oid *oidptr; nodeMap = palloc(locator->nodeCount * sizeof(Oid)); oidptr = (Oid *) nodeMap; - foreach(lc, l) - *oidptr++ = lfirst_oid(lc); + foreach(lc, l) *oidptr++ = lfirst_oid(lc); locator->listType = LOCATOR_LIST_OID; } else if (IsA(l, List)) @@ -1121,8 +1213,7 @@ createLocator(char locatorType, RelationAccessType accessType, void **voidptr; nodeMap = palloc(locator->nodeCount * sizeof(void *)); voidptr = (void **) nodeMap; - foreach(lc, l) - *voidptr++ = lfirst(lc); + foreach(lc, l) *voidptr++ = lfirst(lc); locator->listType = LOCATOR_LIST_POINTER; } else @@ -1478,7 +1569,9 @@ createLocator(char locatorType, RelationAccessType accessType, } if (result) + { *result = locator->results; + } return locator; } @@ -1493,7 +1586,9 @@ freeLocator(Locator *locator) * do not free it twice */ if (locator->results != locator->nodeMap) + { pfree(locator->results); + } pfree(locator); } @@ -1510,7 +1605,9 @@ locate_static(Locator *self, Datum value, bool isnull, { /* TODO */ if (hasprimary) + { *hasprimary = false; + } return self->nodeCount; } @@ -1527,9 +1624,13 @@ locate_roundrobin(Locator *self, Datum value, bool isnull, {// #lizard forgives /* TODO */ if (hasprimary) + { *hasprimary = false; + } if (++self->roundRobinNode >= self->nodeCount) + { self->roundRobinNode = 0; + } switch (self->listType) { case LOCATOR_LIST_NONE: @@ -1570,7 +1671,9 @@ locate_modulo_random(Locator *self, Datum value, bool isnull, int offset; if (hasprimary) + { *hasprimary = false; + } Assert(self->nodeCount > 0); offset = compute_modulo(abs(rand()), self->nodeCount); @@ -1611,9 +1714,13 @@ locate_hash_insert(Locator *self, Datum value, bool isnull, {// #lizard forgives int index; if (hasprimary) + { *hasprimary = false; + } if (isnull) + { index = 0; + } else { unsigned int hash32; @@ -1645,6 +1752,7 @@ locate_hash_insert(Locator *self, Datum value, bool isnull, } #ifdef _MIGRATE_ + static int locate_shard_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, @@ -1922,6 +2030,7 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, } } } + #endif @@ -1937,7 +2046,9 @@ locate_hash_select(Locator *self, Datum value, bool isnull, bool *hasprimary) {// #lizard forgives if (hasprimary) + { *hasprimary = false; + } if (isnull) { int i; @@ -2010,23 +2121,37 @@ locate_modulo_insert(Locator *self, Datum value, bool isnull, {// #lizard forgives int index; if (hasprimary) + { *hasprimary = false; + } if (isnull) + { index = 0; + } else { uint64 val; if (self->valuelen == 8) + { val = (uint64) (GET_8_BYTES(value)); + } else if (self->valuelen == 4) + { val = (uint64) (GET_4_BYTES(value)); + } else if (self->valuelen == 2) + { val = (uint64) (GET_2_BYTES(value)); + } else if (self->valuelen == 1) + { val = (uint64) (GET_1_BYTE(value)); + } else + { val = 0; + } index = 
compute_modulo(val, self->nodeCount); } @@ -2065,7 +2190,9 @@ locate_modulo_select(Locator *self, Datum value, bool isnull, bool *hasprimary) {// #lizard forgives if (hasprimary) + { *hasprimary = false; + } if (isnull) { int i; @@ -2100,15 +2227,25 @@ locate_modulo_select(Locator *self, Datum value, bool isnull, int index; if (self->valuelen == 8) + { val = (uint64) (GET_8_BYTES(value)); + } else if (self->valuelen == 4) + { val = (unsigned int) (GET_4_BYTES(value)); + } else if (self->valuelen == 2) + { val = (unsigned int) (GET_2_BYTES(value)); + } else if (self->valuelen == 1) + { val = (unsigned int) (GET_1_BYTE(value)); + } else + { val = 0; + } index = compute_modulo(val, self->nodeCount); @@ -2151,6 +2288,7 @@ GET_NODES(Locator *self, Datum value, bool isnull, } #ifdef __TBASE__ + char getLocatorDisType(Locator *self) { @@ -2179,7 +2317,9 @@ int calcDistReplications(char distributionType, Bitmapset *nodes) { if (!nodes) + { return 1; + } if (IsLocatorReplicated(distributionType) || IsLocatorNone(distributionType)) @@ -2189,6 +2329,7 @@ calcDistReplications(char distributionType, Bitmapset *nodes) return 1; } + #endif void * @@ -2210,6 +2351,7 @@ getLocatorNodeCount(Locator *self) { return self->nodeCount; } + #endif /* @@ -2248,7 +2390,9 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, #endif if (rel_loc_info == NULL) + { return NULL; + } if (IsLocatorDistributedByValue(rel_loc_info->locatorType)) @@ -2339,7 +2483,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, {// #lizard forgives #define ONE_SECOND_DATUM 1000000 Expr *distcol_expr = NULL; - ExecNodes *exec_nodes; + ExecNodes *exec_nodes = NULL; Datum distcol_value; bool distcol_isnull; #ifdef __COLD_HOT__ @@ -2352,6 +2496,15 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Oid distcol_type = InvalidOid; Oid *opArray = NULL; bool *isswapArray = NULL; + Oid disttype; + int32 disttypmod; + + if (enable_distri_debug) + { + int r = 1; + while(r) + ; + } if (dis_qual) { @@ -2365,15 +2518,18 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, #endif if (!rel_loc_info) + { return NULL; + } /* * If the table distributed by value, check if we can reduce the Datanodes * by looking at the qualifiers for this relation */ + disttype = get_atttype(reloid, rel_loc_info->partAttrNum); + disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum); + if (IsRelationDistributedByValue(rel_loc_info)) { - Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum); - int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum); distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); /* @@ -2382,7 +2538,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, * will happen in case of inserting that type of expression value as the * distribution column value. */ - if (distcol_expr) + if (distcol_expr && !IsA(distcol_expr, ArrayExpr)) { distcol_expr = (Expr *)coerce_to_target_type(NULL, (Node *)distcol_expr, @@ -2553,7 +2709,9 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, if (isswapArray[i]) { /* const <= var */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? 
minStamp + : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; equal_min = true; } @@ -2577,7 +2735,9 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, else { /* var >= const */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp + : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; equal_min = true; } @@ -2677,9 +2837,68 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, } } #endif + + return GetRelationNodes(rel_loc_info, distcol_value, + distcol_isnull, + seccol_value, seccol_isnull, + relaccess); + } + else if (distcol_expr && IsA(distcol_expr, ArrayExpr) && + rel_loc_info->locatorType == LOCATOR_TYPE_SHARD && !seccol_list) + { + ArrayExpr *arrayExpr = (ArrayExpr *) distcol_expr; + ListCell *lc; + bool success = true; + Const *const_expr; + ExecNodes *temp; + + foreach(lc, arrayExpr->elements) + { + Node *expr = (Node *) lfirst(lc); + + /* convert to distribute column type */ + expr = coerce_to_target_type(NULL, + (Node *) expr, + exprType((Node *) expr), + disttype, disttypmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, -1); + expr = eval_const_expressions(NULL, + (Node *) expr); + if (!expr || !IsA(expr, Const)) + { + success = false; + break; + } + + const_expr = castNode(Const, expr); + temp = GetRelationNodes(rel_loc_info, const_expr->constvalue, + const_expr->constisnull, + seccol_value, seccol_isnull, + relaccess); + if (!temp) + { + success = false; + break; + } + + if (exec_nodes) + { + exec_nodes->nodeList = list_concat_unique(exec_nodes->nodeList, + temp->nodeList); } else { + exec_nodes = temp; + } + } + + if (success) + { + return exec_nodes; + } + } + distcol_value = (Datum) 0; distcol_isnull = true; #ifdef __TBASE__ @@ -2734,7 +2953,6 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, } } #endif - } exec_nodes = GetRelationNodes(rel_loc_info, distcol_value, distcol_isnull, @@ -2754,32 +2972,43 @@ GetRelationDistribColumn(RelationLocInfo *locInfo) { /* No relation, so simply leave */ if (!locInfo) + { return NULL; + } /* No distribution column if relation is not distributed with a key */ if (!IsRelationDistributedByValue(locInfo)) + { return NULL; + } /* Return column name */ return get_attname(locInfo->relid, locInfo->partAttrNum); } #ifdef __COLD_HOT__ + char * GetRelationSecDistribColumn(RelationLocInfo *locInfo) { /* No relation, so simply leave */ if (!locInfo) + { return NULL; + } /* No distribution column if relation is not distributed with a key */ if (!IsRelationDistributedByValue(locInfo)) + { return NULL; + } /* Return column name */ return get_attname(locInfo->relid, locInfo->secAttrNum); } + #endif + /* * pgxc_find_distcol_expr * Search through the quals provided and find out an expression which will give @@ -2802,13 +3031,19 @@ pgxc_find_distcol_expr(Index varno, /* If no quals, no distribution column expression */ if (!quals) + { return NULL; + } /* Convert the qualification into List if it's not already so */ if (!IsA(quals, List)) + { lquals = make_ands_implicit((Expr *)quals); + } else + { lquals = (List *)quals; + } /* * For every ANDed expression, check if that expression is of the form @@ -2817,21 +3052,45 @@ pgxc_find_distcol_expr(Index varno, foreach(qual_cell, lquals) { Expr *qual_expr = (Expr *)lfirst(qual_cell); - OpExpr *op; Expr *lexpr; Expr *rexpr; Var *var_expr; 
Expr *distcol_expr; + Oid opno; + + if (IsA(qual_expr, OpExpr)) + { + OpExpr *op; - if (!IsA(qual_expr, OpExpr)) - continue; op = (OpExpr *)qual_expr; + /* If not a binary operator, it can not be '='. */ if (list_length(op->args) != 2) + { continue; + } lexpr = linitial(op->args); rexpr = lsecond(op->args); + opno = op->opno; + } + else if (IsA(qual_expr, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *arrayOpExpr = (ScalarArrayOpExpr *) qual_expr; + + if (list_length(arrayOpExpr->args) != 2) + { + continue; + } + + lexpr = linitial(arrayOpExpr->args); + rexpr = lsecond(arrayOpExpr->args); + opno = arrayOpExpr->opno; + } + else + { + continue; + } /* * If either of the operands is a RelabelType, extract the Var in the RelabelType. @@ -2842,9 +3101,13 @@ pgxc_find_distcol_expr(Index varno, * should be shipped to one of the nodes only */ if (IsA(lexpr, RelabelType)) + { lexpr = ((RelabelType*)lexpr)->arg; + } if (IsA(rexpr, RelabelType)) + { rexpr = ((RelabelType*)rexpr)->arg; + } /* * If either of the operands is a Var expression, assume the other @@ -2862,32 +3125,43 @@ pgxc_find_distcol_expr(Index varno, distcol_expr = lexpr; } else + { continue; + } + /* * If Var found is not the distribution column of required relation, * check next qual */ if (var_expr->varno != varno || var_expr->varattno != attrNum) + { continue; + } + /* * If the operator is not an assignment operator, check next * constraint. An operator is an assignment operator if it's * mergejoinable or hashjoinable. Beware that not every assignment * operator is mergejoinable or hashjoinable, so we might leave some - * oportunity. But then we have to rely on the opname which may not + * opportunity. But then we have to rely on the opname which may not * be something we know to be equality operator as well. */ - if (!op_mergejoinable(op->opno, exprType((Node *)lexpr)) && - !op_hashjoinable(op->opno, exprType((Node *)lexpr))) + if (!op_mergejoinable(opno, exprType((Node *) var_expr)) && + !op_hashjoinable(opno, exprType((Node *) var_expr))) + { continue; + } + /* Found the distribution column expression return it */ return distcol_expr; + } /* Exhausted all quals, but no distribution column expression */ return NULL; } #ifdef __COLD_HOT__ + static bool IsConstAligned(Oid reloid, Datum constvalue, AttrNumber secAttr) {// #lizard forgives bool isalign = false; @@ -2974,13 +3248,19 @@ pgxc_find_distcol_exprs(Index varno, /* If no quals, no distribution column expression */ if (!quals) + { return NULL; + } /* Convert the qualification into List if it's not already so */ if (!IsA(quals, List)) + { lquals = make_ands_implicit((Expr *)quals); + } else + { lquals = (List *)quals; + } /* * For every ANDed expression, check if that expression is of the form @@ -3005,11 +3285,15 @@ pgxc_find_distcol_exprs(Index varno, } if (!IsA(qual_expr, OpExpr)) + { continue; + } op = (OpExpr *)qual_expr; /* If not a binary operator, it can not be '='. 
*/ if (list_length(op->args) != 2) + { continue; + } lexpr = linitial(op->args); rexpr = lsecond(op->args); @@ -3023,9 +3307,13 @@ pgxc_find_distcol_exprs(Index varno, * should be shipped to one of the nodes only */ if (IsA(lexpr, RelabelType)) + { lexpr = ((RelabelType*)lexpr)->arg; + } if (IsA(rexpr, RelabelType)) + { rexpr = ((RelabelType*)rexpr)->arg; + } /* * If either of the operands is a Var expression, assume the other @@ -3044,13 +3332,17 @@ pgxc_find_distcol_exprs(Index varno, isswap = true; } else + { continue; + } /* * If Var found is not the distribution column of required relation, * check next qual */ if (var_expr->varno != varno || var_expr->varattno != attrNum) + { continue; + } /* * If the operator is not an assignment operator, check next * constraint. An operator is an assignment operator if it's @@ -3100,7 +3392,9 @@ GetRelationTimeStampRangeNodes(RelationLocInfo *rel_loc_info, ExecNodes *exec_nodes; if (rel_loc_info == NULL) + { return NULL; + } switch (rel_loc_info->locatorType) @@ -3262,7 +3556,8 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu if (isswapArray[i]) { /* const <= var */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; } else @@ -3283,7 +3578,8 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu else { /* var >= const */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; } } @@ -3348,16 +3644,19 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu List *oids = NULL; if (minStamp && maxStamp) { - if (IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + if (IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* all hot data */ oids = lappend_oid(oids, rel_loc_info->groupId); } - else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && !IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + !IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* all cold data */ oids = lappend_oid(oids, rel_loc_info->coldGroupId); } - else if(!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* range across cold and hot group */ oids = lappend_oid(oids, rel_loc_info->groupId); @@ -3407,10 +3706,12 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo 
*rel_loc_info, Node *sec_qu return oids; } } + #endif #ifdef _MLS_ extern char* g_default_locator_type; + char get_default_locator_type(void) { if (strlen(g_default_locator_type) == 0) @@ -3440,6 +3741,7 @@ char get_default_locator_type(void) return LOCATOR_TYPE_HASH; } + int get_default_distype(void) { if (strlen(g_default_locator_type) == 0) diff --git a/src/include/optimizer/pgxcship.h b/src/include/optimizer/pgxcship.h index c42f3a04..d29c2b8f 100644 --- a/src/include/optimizer/pgxcship.h +++ b/src/include/optimizer/pgxcship.h @@ -38,5 +38,6 @@ extern bool pgxc_is_trigger_shippable(Trigger *trigger); extern Node *get_var_from_arg(Node *arg); extern bool is_var_distribute_column(Var *var, List *rtable); +extern ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); #endif #endif diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index bc99b709..53ce737e 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -276,7 +276,7 @@ explain (verbose on, costs off) delete from tab1_replicated where val = 7; select * from tab1_replicated where val = 7; -- Constant subquery -create table subquery_fqs(id int, a varchar, c int); +create table subquery_fqs(id int, a varchar, c int) distribute by shard(id); insert into subquery_fqs values(1,'gd', 2); insert into subquery_fqs values(1,'zj', 2); insert into subquery_fqs values(1,'sz', 2); @@ -284,15 +284,26 @@ explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual un select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); -- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); -explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +explain (num_nodes on, verbose on, nodes off, costs off) select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +set enable_oracle_compatible to true; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); 
+explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +set enable_oracle_compatible to false; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,3); + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; From b0348dd847513b6f746812889d707b269351e560 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 24 Oct 2021 09:26:59 +0800 Subject: [PATCH 424/578] final --- src/backend/pgxc/locator/locator.c | 11 +- src/test/regress/expected/xc_FQS.out | 571 +++++++++++++++++++-------- 2 files changed, 405 insertions(+), 177 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 3ed59f90..1c0b541e 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2499,13 +2499,6 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Oid disttype; int32 disttypmod; - if (enable_distri_debug) - { - int r = 1; - while(r) - ; - } - if (dis_qual) { *dis_qual = NULL; @@ -2843,6 +2836,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, seccol_value, seccol_isnull, relaccess); } + /* Only for shard table without cold hot seperation */ else if (distcol_expr && IsA(distcol_expr, ArrayExpr) && rel_loc_info->locatorType == LOCATOR_TYPE_SHARD && !seccol_list) { @@ -2884,7 +2878,8 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, if (exec_nodes) { - exec_nodes->nodeList = list_concat_unique(exec_nodes->nodeList, + Assert(exec_nodes->baselocatortype == temp->baselocatortype); + exec_nodes->nodeList = list_concat_unique_int(exec_nodes->nodeList, temp->nodeList); } else diff --git a/src/test/regress/expected/xc_FQS.out b/src/test/regress/expected/xc_FQS.out index 6fe94587..f4b88988 100644 --- a/src/test/regress/expected/xc_FQS.out +++ b/src/test/regress/expected/xc_FQS.out @@ -221,21 +221,29 @@ select val, val2 from tab1_rr where val2 = 8 group by val, val2; (1 row) explain (verbose on, nodes off, costs off) select val, val2 from tab1_rr where val2 = 8 group by val, val2; - QUERY PLAN ------------------------------------------------- - Group + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all Output: val, val2 - Group Key: tab1_rr.val, tab1_rr.val2 - -> Remote Subquery Scan on all + -> Group Output: val, val2 - Sort Key: tab1_rr.val + Group Key: tab1_rr.val, tab1_rr.val2 -> Sort Output: val, val2 Sort Key: tab1_rr.val - -> Seq Scan on public.tab1_rr + -> Remote Subquery Scan on all Output: val, val2 - Filter: (tab1_rr.val2 = 8) -(12 rows) + Distribute results by H: val + -> Group + Output: val, val2 + Group Key: tab1_rr.val, tab1_rr.val2 + -> Sort + Output: val, val2 + Sort Key: tab1_rr.val + -> Seq Scan on public.tab1_rr + Output: val, val2 + Filter: (tab1_rr.val2 = 8) +(20 rows) -- should not get FQSed because of HAVING clause select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; @@ -245,18 +253,24 @@ select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN ------------------------------------------- - GroupAggregate + QUERY 
PLAN +------------------------------------------------------ + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_rr.val2 - Filter: (sum(tab1_rr.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_rr - Output: val2, val - Filter: (tab1_rr.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_rr.val2 + Filter: (sum(tab1_rr.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_rr.val2 + -> Seq Scan on public.tab1_rr + Output: val, val2 + Filter: (tab1_rr.val2 = 2) +(15 rows) -- tests for node reduction by application of quals, for roundrobin node -- reduction is not applicable. Having query not FQSed because of existence of ORDER BY, @@ -416,14 +430,14 @@ explain (verbose on, nodes off, costs off) select distinct val2 from tab1_rr whe -- DMLs update tab1_rr set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_rr set val2 = 1000 where val = 7; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Remote Fast Query Execution - Output: 1000, tab1_rr.xc_node_id, tab1_rr.ctid + Output: tab1_rr.val, 1000, tab1_rr.xc_node_id, tab1_rr.ctid, tab1_rr.shardid Remote query: UPDATE tab1_rr SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_rr -> Seq Scan on public.tab1_rr - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_rr.val = 7) (7 rows) @@ -436,15 +450,15 @@ select * from tab1_rr where val = 7; delete from tab1_rr where val = 7; explain (verbose on, costs off) delete from tab1_rr where val = 7; - QUERY PLAN ------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_rr.xc_node_id, tab1_rr.ctid + Output: tab1_rr.xc_node_id, tab1_rr.ctid, tab1_rr.shardid Node/s: datanode_1, datanode_2 Remote query: DELETE FROM tab1_rr WHERE (val = 7) -> Delete on public.tab1_rr -> Seq Scan on public.tab1_rr - Output: ctid + Output: ctid, shardid Filter: (tab1_rr.val = 7) (8 rows) @@ -660,18 +674,24 @@ select sum(val) from tab1_hash where val2 = 2 group by val2 having sum(val) > 1; (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_hash where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN --------------------------------------------- - GroupAggregate + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_hash.val2 - Filter: (sum(tab1_hash.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_hash - Output: val2, val - Filter: (tab1_hash.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_hash.val2 + Filter: (sum(tab1_hash.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_hash.val2 + -> Seq Scan on public.tab1_hash + Output: val, val2 + Filter: (tab1_hash.val2 = 2) +(15 rows) -- tests for node reduction by application of quals. Having query FQSed because of -- existence of ORDER BY, implies that nodes got reduced. 
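The expected-output hunks above show aggregates on distributed tables now planned in two phases: a Partial aggregate runs on each datanode, the partial results are redistributed on the grouping key ("Distribute results by H: val2"), and a Finalize aggregate combines them. A minimal sketch of how to reproduce that plan shape, assuming a two-datanode cluster like the one this regression schedule runs against; the agg_demo table below is hypothetical and not part of the patch:

    -- hypothetical table, only to show the Partial/Finalize aggregate shape
    create table agg_demo(val int, val2 int) distribute by shard(val);
    insert into agg_demo select i, i % 4 from generate_series(1, 100) i;
    -- the plan should resemble the expected output above:
    --   Finalize GroupAggregate -> Remote Subquery Scan (distribute by val2)
    --     -> Partial GroupAggregate -> Seq Scan on agg_demo
    explain (verbose on, nodes off, costs off)
    select val2, sum(val) from agg_demo group by val2 having sum(val) > 1;
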
@@ -832,14 +852,14 @@ explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 fr -- DMLs update tab1_hash set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_hash set val2 = 1000 where val = 7; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Fast Query Execution - Output: 1000, tab1_hash.xc_node_id, tab1_hash.ctid + Output: tab1_hash.val, 1000, tab1_hash.xc_node_id, tab1_hash.ctid, tab1_hash.shardid Remote query: UPDATE tab1_hash SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_hash -> Seq Scan on public.tab1_hash - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_hash.val = 7) (7 rows) @@ -852,15 +872,15 @@ select * from tab1_hash where val = 7; delete from tab1_hash where val = 7; explain (verbose on, costs off) delete from tab1_hash where val = 7; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_hash.xc_node_id, tab1_hash.ctid + Output: tab1_hash.xc_node_id, tab1_hash.ctid, tab1_hash.shardid, tab1_hash.val Node/s: datanode_2 Remote query: DELETE FROM tab1_hash WHERE (val = 7) -> Delete on public.tab1_hash -> Seq Scan on public.tab1_hash - Output: ctid + Output: ctid, shardid Filter: (tab1_hash.val = 7) (8 rows) @@ -1076,18 +1096,24 @@ select sum(val) from tab1_modulo where val2 = 2 group by val2 having sum(val) > (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_modulo where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN ----------------------------------------------- - GroupAggregate + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_modulo.val2 - Filter: (sum(tab1_modulo.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_modulo - Output: val2, val - Filter: (tab1_modulo.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_modulo.val2 + Filter: (sum(tab1_modulo.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_modulo.val2 + -> Seq Scan on public.tab1_modulo + Output: val, val2 + Filter: (tab1_modulo.val2 = 2) +(15 rows) -- tests for node reduction by application of quals. Having query FQSed because of -- existence of ORDER BY, implies that nodes got reduced. 
@@ -1189,17 +1215,18 @@ select avg(val) from tab1_modulo where val = 7; (1 row) explain (verbose on, nodes off, costs off, num_nodes on) select avg(val) from tab1_modulo where val = 7; - QUERY PLAN -------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: avg(tab1_modulo.val) - Remote query: SELECT avg(val) AS avg FROM tab1_modulo WHERE (val = 7) - -> Aggregate - Output: avg(val) - -> Seq Scan on public.tab1_modulo - Output: val, val2 - Filter: (tab1_modulo.val = 7) -(8 rows) + QUERY PLAN +--------------------------------------------------- + Finalize Aggregate + Output: avg(val) + -> Remote Subquery Scan on all + Output: PARTIAL avg(val) + -> Partial Aggregate + Output: PARTIAL avg(val) + -> Seq Scan on public.tab1_modulo + Output: val, val2 + Filter: (tab1_modulo.val = 7) +(9 rows) select val, val2 from tab1_modulo where val = 7 order by val2; val | val2 @@ -1209,18 +1236,17 @@ select val, val2 from tab1_modulo where val = 7 order by val2; (2 rows) explain (verbose on, nodes off, costs off, num_nodes on) select val, val2 from tab1_modulo where val = 7 order by val2; - QUERY PLAN ---------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_modulo.val, tab1_modulo.val2 - Remote query: SELECT val, val2 FROM tab1_modulo WHERE (val = 7) ORDER BY val2 + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_modulo.val2 -> Seq Scan on public.tab1_modulo Output: val, val2 Filter: (tab1_modulo.val = 7) -(9 rows) +(8 rows) select distinct val2 from tab1_modulo where val = 7; val2 @@ -1230,12 +1256,11 @@ select distinct val2 from tab1_modulo where val = 7; (2 rows) explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 from tab1_modulo where val = 7; - QUERY PLAN ------------------------------------------------------------------------ - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_modulo.val2 - Remote query: SELECT DISTINCT val2 FROM tab1_modulo WHERE (val = 7) - -> Unique + QUERY PLAN +--------------------------------------------------- + Unique + Output: val2 + -> Remote Subquery Scan on all Output: val2 -> Sort Output: val2 @@ -1243,19 +1268,19 @@ explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 fr -> Seq Scan on public.tab1_modulo Output: val2 Filter: (tab1_modulo.val = 7) -(11 rows) +(10 rows) -- DMLs update tab1_modulo set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_modulo set val2 = 1000 where val = 7; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: 1000, tab1_modulo.xc_node_id, tab1_modulo.ctid + Output: tab1_modulo.val, 1000, tab1_modulo.xc_node_id, tab1_modulo.ctid, tab1_modulo.shardid Remote query: UPDATE tab1_modulo SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_modulo -> Seq Scan on public.tab1_modulo - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_modulo.val = 7) (7 rows) @@ -1268,15 +1293,15 @@ select * from tab1_modulo where val = 7; delete from tab1_modulo where val = 7; explain (verbose on, costs off) delete from tab1_modulo where val = 7; - QUERY PLAN 
---------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: tab1_modulo.xc_node_id, tab1_modulo.ctid + Output: tab1_modulo.xc_node_id, tab1_modulo.ctid, tab1_modulo.shardid, tab1_modulo.val Node/s: datanode_2 Remote query: DELETE FROM tab1_modulo WHERE (val = 7) -> Delete on public.tab1_modulo -> Seq Scan on public.tab1_modulo - Output: ctid + Output: ctid, shardid Filter: (tab1_modulo.val = 7) (8 rows) @@ -1310,7 +1335,7 @@ explain (verbose on, nodes off, costs off) insert into tab1_replicated values (9 (6 rows) -- simple select -select * from tab1_replicated; +select * from tab1_replicated order by val; val | val2 -----+------ 1 | 2 @@ -1337,16 +1362,15 @@ select sum(val), avg(val), count(*) from tab1_replicated; (1 row) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val), avg(val), count(*) from tab1_replicated; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), avg(tab1_replicated.val), count(*) - Remote query: SELECT sum(val) AS sum, avg(val) AS avg, count(*) AS count FROM tab1_replicated + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), count(*) -> Aggregate Output: sum(val), avg(val), count(*) -> Seq Scan on public.tab1_replicated Output: val, val2 -(7 rows) +(6 rows) select first_value(val) over (partition by val2 order by val) from tab1_replicated; first_value @@ -1359,19 +1383,18 @@ select first_value(val) over (partition by val2 order by val) from tab1_replicat (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select first_value(val) over (partition by val2 order by val) from tab1_replicated; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: first_value(tab1_replicated.val) OVER (?), tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT first_value(val) OVER (PARTITION BY val2 ORDER BY val) AS first_value FROM tab1_replicated - -> WindowAgg - Output: first_value(val) OVER (?), val, val2 + QUERY PLAN +------------------------------------------------------------------- + WindowAgg + Output: first_value(val) OVER (?), val, val2 + -> Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val2, tab1_replicated.val -> Seq Scan on public.tab1_replicated Output: val, val2 -(10 rows) +(9 rows) select * from tab1_replicated where val2 = 2 limit 2; val | val2 @@ -1381,17 +1404,18 @@ select * from tab1_replicated where val2 = 2 limit 2; (2 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated where val2 = 2 limit 2; - QUERY PLAN --------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated WHERE (val2 = 2) LIMIT 2 - -> Limit + QUERY PLAN +-------------------------------------------------------- + Limit + Output: val, val2 + -> Remote Subquery Scan on all Output: val, val2 - -> Seq Scan on public.tab1_replicated + -> Limit Output: val, val2 - 
Filter: (tab1_replicated.val2 = 2) -(8 rows) + -> Seq Scan on public.tab1_replicated + Output: val, val2 + Filter: (tab1_replicated.val2 = 2) +(9 rows) select * from tab1_replicated where val2 = 4 offset 1; val | val2 @@ -1399,17 +1423,16 @@ select * from tab1_replicated where val2 = 4 offset 1; (0 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated where val2 = 4 offset 1; - QUERY PLAN ---------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated WHERE (val2 = 4) OFFSET 1 - -> Limit + QUERY PLAN +-------------------------------------------------- + Limit + Output: val, val2 + -> Remote Subquery Scan on all Output: val, val2 -> Seq Scan on public.tab1_replicated Output: val, val2 Filter: (tab1_replicated.val2 = 4) -(8 rows) +(7 rows) select * from tab1_replicated order by val; val | val2 @@ -1422,17 +1445,16 @@ select * from tab1_replicated order by val; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated order by val; - QUERY PLAN --------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated ORDER BY val + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select distinct val, val2 from tab1_replicated order by 1, 2; val | val2 @@ -1445,11 +1467,10 @@ select distinct val, val2 from tab1_replicated order by 1, 2; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, val2 from tab1_replicated order by 1, 2; - QUERY PLAN ------------------------------------------------------------------------------------ - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT DISTINCT val, val2 FROM tab1_replicated ORDER BY val, val2 + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val, tab1_replicated.val2 @@ -1458,20 +1479,19 @@ explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, va Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(11 rows) +(10 rows) explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, val2 from tab1_replicated; - QUERY PLAN ------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT DISTINCT val, val2 FROM tab1_replicated + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> HashAggregate Output: val, val2 Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select val, val2 from tab1_replicated group by val, val2 order by 1, 2; val | val2 @@ -1484,11 +1504,10 @@ select val, val2 from 
tab1_replicated group by val, val2 order by 1, 2; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from tab1_replicated group by val, val2 order by 1, 2; - QUERY PLAN ---------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated GROUP BY val, val2 ORDER BY val, val2 + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val, tab1_replicated.val2 @@ -1497,20 +1516,19 @@ explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from t Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(11 rows) +(10 rows) explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from tab1_replicated group by val, val2; - QUERY PLAN --------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated GROUP BY val, val2 + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> HashAggregate Output: val, val2 Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by 1; sum @@ -1522,11 +1540,10 @@ select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by (4 rows) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by 1; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), tab1_replicated.val2 - Remote query: SELECT sum(val) AS sum FROM tab1_replicated GROUP BY val2 HAVING (sum(val) > 1) ORDER BY (sum(val)) + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), val2 -> Sort Output: (sum(val)), val2 Sort Key: (sum(tab1_replicated.val)) @@ -1536,33 +1553,32 @@ explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from ta Filter: (sum(tab1_replicated.val) > 1) -> Seq Scan on public.tab1_replicated Output: val, val2 -(12 rows) +(11 rows) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from tab1_replicated group by val2 having sum(val) > 1; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), tab1_replicated.val2 - Remote query: SELECT sum(val) AS sum FROM tab1_replicated GROUP BY val2 HAVING (sum(val) > 1) + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), val2 -> HashAggregate Output: sum(val), val2 Group Key: tab1_replicated.val2 Filter: (sum(tab1_replicated.val) > 1) -> Seq Scan on public.tab1_replicated Output: val, val2 -(9 rows) +(8 rows) -- DMLs update 
tab1_replicated set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_replicated set val2 = 1000 where val = 7; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: 1000, tab1_replicated.ctid + Output: tab1_replicated.val, 1000, tab1_replicated.ctid, tab1_replicated.shardid Remote query: UPDATE tab1_replicated SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_replicated -> Seq Scan on public.tab1_replicated - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_replicated.val = 7) (7 rows) @@ -1577,12 +1593,12 @@ explain (verbose on, costs off) delete from tab1_replicated where val = 7; QUERY PLAN ------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_replicated.ctid + Output: tab1_replicated.ctid, tab1_replicated.shardid Node/s: datanode_1, datanode_2 Remote query: DELETE FROM tab1_replicated WHERE (val = 7) -> Delete on public.tab1_replicated -> Seq Scan on public.tab1_replicated - Output: ctid + Output: ctid, shardid Filter: (tab1_replicated.val = 7) (8 rows) @@ -1591,8 +1607,225 @@ select * from tab1_replicated where val = 7; -----+------ (0 rows) +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int) distribute by shard(id); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + QUERY PLAN +------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1, datanode_2 + -> Hash Join (cost=0.19..25.60 rows=1 width=80) + Hash Cond: ((t.id = (1)) AND ((t.a)::text = ('gd'::text))) + -> Seq Scan on subquery_fqs t (cost=0.00..18.80 rows=880 width=40) + -> Hash (cost=0.14..0.14 rows=3 width=40) + -> HashAggregate (cost=0.08..0.11 rows=3 width=40) + Group Key: (1), ('gd'::text), (2) + -> Append (cost=0.00..0.06 rows=3 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) +(12 rows) + +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + id | a | c | id | a | c +----+----+---+----+----+--- + 1 | gd | 2 | 1 | gd | 2 + 1 | zj | 2 | 1 | zj | 2 + 1 | sz | 2 | 1 | sz | 2 +(3 rows) + +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: t1.id, t1.a, t1.c + -> Nested Loop Semi Join + Output: t1.id, t1.a, t1.c + Join Filter: (t1.c = t2.c) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: (t1.id = 1) + -> Materialize + Output: t2.c + -> Remote Subquery Scan on all + Output: t2.c + 
-> Seq Scan on public.subquery_fqs t2 + Output: t2.c + Filter: (t2.id = 1) +(15 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE ((id = 1) AND (c = (SELECT t2.c FROM subquery_fqs t2 WHERE (t2.id = 1) ORDER BY t2.c LIMIT 1))) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: ((t1.id = 1) AND (t1.c = $0)) + InitPlan 1 (returns $0) + -> Limit + Output: t2.c + -> Sort + Output: t2.c + Sort Key: t2.c + -> Seq Scan on public.subquery_fqs t2 + Output: t2.c + Filter: (t2.id = 1) +(15 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE ((id = 1) AND (c = (SELECT max(t2.c) AS max FROM subquery_fqs t2 WHERE (t2.id = 1)))) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: ((t1.id = 1) AND (t1.c = $0)) + InitPlan 1 (returns $0) + -> Aggregate + Output: max(t2.c) + -> Seq Scan on public.subquery_fqs t2 + Output: t2.id, t2.a, t2.c + Filter: (t2.id = 1) +(12 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: "__Alias_72__".id, "__Alias_72__".a, "__Alias_72__".c + Remote query: SELECT id, a, c FROM (SELECT subquery_fqs.id, subquery_fqs.a, subquery_fqs.c FROM subquery_fqs WHERE (subquery_fqs.id = 1) ORDER BY subquery_fqs.c LIMIT 1) "__Alias_72__" WHERE (c = 2) + -> Subquery Scan on "__Alias_72__" + Output: "__Alias_72__".id, "__Alias_72__".a, "__Alias_72__".c + Filter: ("__Alias_72__".c = 2) + -> Limit + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + -> Sort + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + Sort Key: subquery_fqs.c + -> Seq Scan on public.subquery_fqs + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + Filter: (subquery_fqs.id = 1) +(14 rows) + +select * from (select 
* from subquery_fqs where id = 1 order by c limit 1) where c = 2; + id | a | c +----+----+--- + 1 | gd | 2 +(1 row) + +set enable_oracle_compatible to true; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE (id = ANY (ARRAY[1, 1])) + -> Seq Scan on public.subquery_fqs t1 + Output: id, a, c + Filter: (t1.id = ANY ('{1,1}'::integer[])) +(6 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +set enable_oracle_compatible to false; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,3); + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=2) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE (id = ANY (ARRAY[1, 3])) + -> Seq Scan on public.subquery_fqs t1 + Output: id, a, c + Filter: (t1.id = ANY ('{1,3}'::integer[])) +(6 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); From e5e3206ae77ae544317fb7f9074a6c77ffafaa23 Mon 
Sep 17 00:00:00 2001 From: youngxie Date: Mon, 25 Oct 2021 17:24:51 +0800 Subject: [PATCH 425/578] Fix varchar --- src/backend/pgxc/locator/locator.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 1c0b541e..0c358c85 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2525,6 +2525,15 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, { distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); + + if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr) && + IsA(((ArrayCoerceExpr *)distcol_expr)->arg, ArrayExpr)) + { + ArrayCoerceExpr *arrayCoerceExpr = (ArrayCoerceExpr *) distcol_expr; + + distcol_expr = arrayCoerceExpr->arg; + } + /* * If the type of expression used to find the Datanode, is not same as * the distribution column type, try casting it. This is same as what From 8d9bccce5bf50399083253750809be9991b348a3 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:34:33 +0800 Subject: [PATCH 426/578] Revert autoformat --- src/backend/pgxc/locator/locator.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 0c358c85..b72b65f8 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -1197,7 +1197,10 @@ createLocator(char locatorType, RelationAccessType accessType, int *intptr; nodeMap = palloc(locator->nodeCount * sizeof(int)); intptr = (int *) nodeMap; - foreach(lc, l) *intptr++ = lfirst_int(lc); + foreach(lc, l) + { + *intptr++ = lfirst_int(lc); + } locator->listType = LOCATOR_LIST_INT; } else if (IsA(l, OidList)) @@ -1205,7 +1208,10 @@ createLocator(char locatorType, RelationAccessType accessType, Oid *oidptr; nodeMap = palloc(locator->nodeCount * sizeof(Oid)); oidptr = (Oid *) nodeMap; - foreach(lc, l) *oidptr++ = lfirst_oid(lc); + foreach(lc, l) + { + *oidptr++ = lfirst_oid(lc); + } locator->listType = LOCATOR_LIST_OID; } else if (IsA(l, List)) @@ -1213,7 +1219,10 @@ createLocator(char locatorType, RelationAccessType accessType, void **voidptr; nodeMap = palloc(locator->nodeCount * sizeof(void *)); voidptr = (void **) nodeMap; - foreach(lc, l) *voidptr++ = lfirst(lc); + foreach(lc, l) + { + *voidptr++ = lfirst(lc); + } locator->listType = LOCATOR_LIST_POINTER; } else From 9ec860267adafc16052ec5cf7fc07b822c0f3f25 Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 25 Oct 2021 10:37:16 +0800 Subject: [PATCH 427/578] bugfix: consistency check error after cn switch (merge request !846) http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093416231 (cherry picked from commit 58dfc7f6) fedb6262 bugfix: consistency check error after cn switch --- src/backend/pgxc/pool/pgxcnode.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 4d93ec43..d84611ee 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5481,11 +5481,6 @@ PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter) int nid; PGXCNodeHandle *handle = NULL; - if (PersistentConnections && nodes_alter != NIL) - { - release_handles(true); - } - foreach(lc, nodes_alter) { char ntype = PGXC_NODE_NONE; From 055897bc278d6fbd597d7bb00172ac2f5828dfef Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:49:30 +0800 Subject: [PATCH 428/578] adjust format --- src/backend/pgxc/locator/locator.c 
| 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index b72b65f8..4685ea1e 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2535,13 +2535,16 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); - if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr) && - IsA(((ArrayCoerceExpr *)distcol_expr)->arg, ArrayExpr)) + /* Remove ArrayCoerceExpr at first */ + if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr)) { - ArrayCoerceExpr *arrayCoerceExpr = (ArrayCoerceExpr *) distcol_expr; + ArrayCoerceExpr *arrayCoerceExpr = castNode(ArrayCoerceExpr, distcol_expr); + if (arrayCoerceExpr->arg && IsA(arrayCoerceExpr->arg, ArrayExpr)) + { distcol_expr = arrayCoerceExpr->arg; } + } /* * If the type of expression used to find the Datanode, is not same as From 30bd59a349ada2f2fac6ab32975918bbf567afe0 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:51:56 +0800 Subject: [PATCH 429/578] revert useless modification. --- src/backend/optimizer/util/pgxcship.c | 4 +++- src/include/optimizer/pgxcship.h | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 6ad269f9..b749e028 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -117,6 +117,8 @@ static bool pgxc_is_func_shippable(Oid funcid); /* Check equijoin conditions on given relations */ static Expr *pgxc_find_dist_equijoin_qual(Relids varnos_1, Relids varnos_2, Oid distcol_type, Node *quals, List *rtable); +/* Merge given execution nodes based on join shippability conditions */ +static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); /* Check if given Query includes distribution column */ static bool pgxc_query_has_distcolgrouping(Query *query); @@ -2483,7 +2485,7 @@ pgxc_find_dist_equi_nodes(Relids varnos_1, * exec_node corresponds to the JOIN of respective relations. * If both exec_nodes can not be merged, it returns NULL. 
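
The locator.c change in patches 425 and 428 makes GetRelationNodesByQuals() look through an ArrayCoerceExpr that wraps an ArrayExpr, so quals such as `id IN (1, 1)` can still be pruned to a single datanode (as the subquery_fqs plans earlier show). A toy, self-contained sketch of that unwrapping step; the NodeTag enum and Node struct here are stand-ins, not the real parse-node types:

#include <stdio.h>

typedef enum { T_ArrayExpr, T_ArrayCoerceExpr, T_Other } NodeTag;

typedef struct Node
{
    NodeTag      tag;
    struct Node *arg;   /* wrapped expression, if any */
} Node;

/* If the distribution-column expression is a coercion over an array
 * literal, look through the coercion so pruning can see the array. */
static Node *
strip_array_coercion(Node *expr)
{
    if (expr != NULL && expr->tag == T_ArrayCoerceExpr &&
        expr->arg != NULL && expr->arg->tag == T_ArrayExpr)
        return expr->arg;
    return expr;
}

int main(void)
{
    Node arr    = { T_ArrayExpr, NULL };
    Node coerce = { T_ArrayCoerceExpr, &arr };

    printf("%s\n", strip_array_coercion(&coerce) == &arr ? "unwrapped" : "unchanged");
    return 0;
}
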
*/ -ExecNodes * +static ExecNodes * pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2) {// #lizard forgives ExecNodes *merged_en = makeNode(ExecNodes); diff --git a/src/include/optimizer/pgxcship.h b/src/include/optimizer/pgxcship.h index d29c2b8f..c42f3a04 100644 --- a/src/include/optimizer/pgxcship.h +++ b/src/include/optimizer/pgxcship.h @@ -38,6 +38,5 @@ extern bool pgxc_is_trigger_shippable(Trigger *trigger); extern Node *get_var_from_arg(Node *arg); extern bool is_var_distribute_column(Var *var, List *rtable); -extern ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); #endif #endif From 32f1f9cf00d76de36770dc783f03f7db4100dae3 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 27 Oct 2021 18:00:30 +0800 Subject: [PATCH 430/578] 2pc stop opt: server time diff opt (merge request !849), http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131093399717&url_cache_key=80da2c20fd930784041c74db66ffd4d6&action_entry_type=bugs --- contrib/pg_clean/pg_clean.c | 157 ++++++++++++++++++++++++++---- src/backend/postmaster/clean2pc.c | 66 ++++++++++--- src/backend/utils/misc/guc.c | 17 +++- src/include/postmaster/clean2pc.h | 1 + 4 files changed, 204 insertions(+), 37 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 08375f46..0b2f6f98 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -62,17 +62,21 @@ int transaction_threshold = 200000; #define MAXIMUM_CLEAR_FILE 10000 #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" -#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#define LEAST_CLEAN_TIME_INTERVAL 1000000 /* should not clean twophase trans prepared in 1s or commit in 1s */ +#define DEFAULT_CLEAN_TIME_INTERVAL 120 +#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ +#define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; PG_MODULE_MAGIC; #define MAX_GID 64 -#define CLEAN_CHECK_TIMES 3 -#define CLEAN_CHECK_INTERVAL 10000 +#define CLEAN_CHECK_TIMES_DEFAULT 3 +#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 + +#define CLEAN_NODE_CHECK_TIMES 5 +#define CLEAN_NODE_CHECK_INTERVAL 500000 #define MAX_DBNAME 64 #define GET_START_XID "startxid:" @@ -316,6 +320,8 @@ bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const c bool check_2pc_belong_node(txn_info * txn); bool check_node_participate(txn_info * txn, int node_idx); +bool check_2pc_start_from_node(txn_info *txn); + void recover2PC(txn_info * txn); TXN_STATUS check_txn_global_status(txn_info *txn); @@ -395,11 +401,15 @@ Datum pg_clean_execute(PG_FUNCTION_ARGS) /*clear Global*/ ResetGlobalVariables(); execute = true; - clean_time_interval = PG_GETARG_INT32(0) * 1000000; + + clean_time_interval = PG_GETARG_INT32(0); if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) { + elog(WARNING, "least clean time interval is %ds", + LEAST_CLEAN_TIME_INTERVAL); clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; } + clean_time_interval *= USECS_PER_SEC; /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -538,9 +548,11 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time) + if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - elog(ERROR, "pg_clean_execute_on_node, abnormal 
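
pg_clean now takes its intervals in seconds, enforces the LEAST_* floor, and only then converts to microseconds with USECS_PER_SEC. A minimal standalone sketch of that clamp-and-convert step; the constants are taken from the patch, the helper name is illustrative:

#include <stdint.h>
#include <stdio.h>

typedef int64_t GlobalTimestamp;

#define USECS_PER_SEC             1000000
#define LEAST_CLEAN_TIME_INTERVAL 3          /* seconds, as in the patch */

/* Clamp a user-supplied interval (seconds) and convert it to microseconds,
 * mirroring the handling in pg_clean_execute(). */
static GlobalTimestamp
clamp_clean_interval(int seconds)
{
    if (seconds < LEAST_CLEAN_TIME_INTERVAL)
        seconds = LEAST_CLEAN_TIME_INTERVAL;
    return (GlobalTimestamp) seconds * USECS_PER_SEC;
}

int main(void)
{
    /* An argument of 1s is raised to the 3s floor -> 3000000 usecs. */
    printf("%lld\n", (long long) clamp_clean_interval(1));
    return 0;
}
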
time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " + "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, + LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } /*get node list*/ @@ -668,11 +680,15 @@ Datum pg_clean_check_txn(PG_FUNCTION_ARGS) /*clear Global*/ ResetGlobalVariables(); - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + clean_time_interval = PG_GETARG_INT32(0); + if (LEAST_CHECK_TIME_INTERVAL > clean_time_interval) { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + elog(WARNING, "least check time interval is %ds", + LEAST_CHECK_TIME_INTERVAL); + clean_time_interval = LEAST_CHECK_TIME_INTERVAL; } + clean_time_interval *= USECS_PER_SEC; + /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -1636,7 +1652,7 @@ char *get2PCInfo(const char *tid) return result; } - elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + elog(DEBUG1, "try to get 2pc info from disk, tid: %s", tid); snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); if(access(path, F_OK) == 0) @@ -2489,12 +2505,20 @@ void recover2PC(txn_info * txn) { int i = 0; bool check_ok = false; + int check_times = CLEAN_CHECK_TIMES_DEFAULT; + int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; + if (clear_2pc_belong_node) + { + check_times = CLEAN_NODE_CHECK_TIMES; + check_interval = CLEAN_NODE_CHECK_INTERVAL; + } + #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2529,7 +2553,7 @@ void recover2PC(txn_info * txn) { txn->op = COMMIT; /* check whether all nodes can commit prepared */ - for (i = 0; i < CLEAN_CHECK_TIMES; i++) + for (i = 0; i < check_times; i++) { check_ok = true; current_context = CurrentMemoryContext; @@ -2560,7 +2584,7 @@ void recover2PC(txn_info * txn) return; } - pg_usleep(CLEAN_CHECK_INTERVAL); + pg_usleep(check_interval); } /* send commit prepared to all nodes */ @@ -2578,7 +2602,7 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; /* check whether all nodes can rollback prepared */ - for (i = 0; i < CLEAN_CHECK_TIMES; i++) + for (i = 0; i < check_times; i++) { check_ok = true; current_context = CurrentMemoryContext; @@ -2609,7 +2633,7 @@ void recover2PC(txn_info * txn) return; } - pg_usleep(CLEAN_CHECK_INTERVAL); + pg_usleep(check_interval); } /* send rollback prepared to all nodes */ @@ -2733,10 +2757,39 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #endif if (clear_2pc_belong_node) { + if (!check_2pc_belong_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + if (!check_2pc_start_from_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + node_idx = find_node_index(abnormal_nodeoid); - if (!check_2pc_belong_node(txn) || - abnormal_time < txn->prepare_timestamp[node_idx]) + if (node_idx >= 0) + { + if (abnormal_time < txn->prepare_timestamp[node_idx]) + { + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, + abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); + + return TXN_STATUS_INPROGRESS; + } + } + else + { + elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + } + + if (abnormal_time < prepared_time) { + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepared 
time: " INT64_FORMAT, txn->gid, + abnormal_time, prepared_time); + return TXN_STATUS_INPROGRESS; } } @@ -3310,3 +3363,71 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } + +bool check_2pc_start_from_node(txn_info *txn) +{ + char node_type; + + Assert(InvalidOid != abnormal_nodeoid); + + if (abnormal_nodeoid == txn->origcoord) + { + return true; + } + + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'D') + { + return false; + } + + if (InvalidOid == txn->origcoord) + { + char *startnode = NULL; + int node_oid = InvalidOid; + char gid[MAX_GID]; + + if (!IsXidImplicit(txn->gid)) + { + return true; + } + + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; + } + + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + return false; + } + + elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); + + if (abnormal_nodeoid == node_oid) + { + return true; + } + } + + return false; +} diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index 5e0ceaf2..b7f8fa02 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -32,6 +32,7 @@ #include "storage/pmsignal.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/timeout.h" @@ -56,9 +57,10 @@ typedef enum bool enable_clean_2pc_launcher = true; -int auto_clean_2pc_interval = 10; -int auto_clean_2pc_delay = 3; -int auto_clean_2pc_timeout = 0; +int auto_clean_2pc_interval = 30; +int auto_clean_2pc_delay = 10; +int auto_clean_2pc_timeout = 300; +int auto_clean_2pc_max_check_time = 300; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; @@ -420,6 +422,9 @@ do_query_2pc(TimestampTz clean_time) int attr_num = 4; int64 check_time = 0; TimestampTz curr_time = GetCurrentTimestamp(); + Oid node_oid = 0; + char node_type = PGXC_NODE_COORDINATOR; + int node_index = 0; static const char *attr_name[] = {"gid", "database", "global_transaction_status", "transaction_status_on_allnodes"}; @@ -442,19 +447,37 @@ do_query_2pc(TimestampTz clean_time) check_time = INT32_MAX; } + if (auto_clean_2pc_max_check_time != 0) + { + if (check_time > auto_clean_2pc_max_check_time) + { + check_time = auto_clean_2pc_max_check_time; + } + } + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(" INT64_FORMAT ") order by database limit 1000;", check_time); - elog(DEBUG1, "node(%d) query: %s", PGXCNodeId, query); - StartTransactionCommand(); + InitMultinodeExecutor(false); + + node_oid = get_pgxc_nodeoid(PGXCNodeName); + if (!OidIsValid(node_oid)) + { + elog(ERROR, "get node(%s) oid failed", PGXCNodeName); + return; + } + node_index = PGXCNodeGetNodeId(node_oid, &node_type); + + elog(DEBUG1, "node(%d) query: %s", node_index, query); + plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; plan->exec_nodes = makeNode(ExecNodes); plan->exec_type = 
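
check_2pc_start_from_node() recovers the originating node by taking the second ':'-separated token of an implicit GID. A standalone sketch of that tokenisation; the GID value below is purely hypothetical:

#include <stdio.h>
#include <string.h>

/* Return the second ':'-separated token of gid (modified in place),
 * the position check_2pc_start_from_node() treats as the start node. */
static char *
gid_start_node(char *gid)
{
    if (strtok(gid, ":") == NULL)
        return NULL;
    return strtok(NULL, ":");
}

int main(void)
{
    char  gid[] = "_$XC$100123:cn0001:200456";   /* hypothetical implicit GID */
    char *node  = gid_start_node(gid);

    printf("start node: %s\n", node ? node : "(none)");
    return 0;
}
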
EXEC_ON_COORDS; - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, node_index); plan->sql_statement = (char*)query; plan->force_autocommit = false; @@ -470,8 +493,6 @@ do_query_2pc(TimestampTz clean_time) makeTargetEntry((Expr *) dummy, i, NULL, false)); } - InitMultinodeExecutor(false); - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); @@ -561,8 +582,9 @@ do_query_2pc(TimestampTz clean_time) if (count_2pc > 0) { Assert(result_str->data != NULL); - elog(LOG, "query remain 2pc count(%d), db count(%d):\n%s", - count_2pc, count_db, result_str->data); + elog(LOG, "query remain 2pc count(%d), db count(%d), sql: %s", + count_2pc, count_db, query); + elog(DEBUG1, "remain 2pc:\n%s", result_str->data); } } @@ -579,6 +601,9 @@ do_clean_2pc(TimestampTz clean_time) TupleTableSlot *result = NULL; Var *dummy = NULL; int attr_num = 4; + Oid node_oid = 0; + char node_type = PGXC_NODE_COORDINATOR; + int node_index = 0; static const char *attr_name[] = {"gid", "global_transaction_status", "operation", "operation_status"}; @@ -588,16 +613,26 @@ do_clean_2pc(TimestampTz clean_time) snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" " limit 1000;", PGXCNodeName, clean_time); - elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); - StartTransactionCommand(); + InitMultinodeExecutor(false); + + node_oid = get_pgxc_nodeoid(PGXCNodeName); + if (!OidIsValid(node_oid)) + { + elog(ERROR, "get node(%s) oid failed", PGXCNodeName); + return; + } + node_index = PGXCNodeGetNodeId(node_oid, &node_type); + + elog(DEBUG1, "node(%d) query: %s", node_index, query); + plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; plan->exec_nodes = makeNode(ExecNodes); plan->exec_type = EXEC_ON_COORDS; - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, node_index); plan->sql_statement = (char*)query; plan->force_autocommit = false; @@ -613,8 +648,6 @@ do_clean_2pc(TimestampTz clean_time) makeTargetEntry((Expr *) dummy, i, NULL, false)); } - InitMultinodeExecutor(false); - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); @@ -648,7 +681,8 @@ do_clean_2pc(TimestampTz clean_time) if (count > 0) { Assert(NULL != result_str->data); - elog(LOG, "clean 2pc count(%d):\n%s", count, result_str->data); + elog(LOG, "clean 2pc count(%d), sql: %s", count, query); + elog(LOG, "clean 2pc:\n%s", result_str->data); } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f1e0d2f7..fd1b4720 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4860,7 +4860,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 30, 1, 3600, + 30, 10, INT_MAX, NULL, NULL, NULL }, @@ -4871,7 +4871,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 3, 1, 600, + 10, 3, INT_MAX, NULL, NULL, NULL }, @@ -4882,7 +4882,18 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, - 0, 0, INT_MAX, + 300, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"auto_clean_2pc_max_check_time", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc max check time"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_max_check_time, + 300, 0, INT_MAX, NULL, 
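
The check window that do_query_2pc() passes to pg_clean_check_txn() is now bounded twice: by INT32_MAX and, when the new auto_clean_2pc_max_check_time GUC is non-zero, by that setting. A condensed standalone sketch of the capping; the helper name is illustrative and units are seconds, as in the patch:

#include <stdint.h>
#include <stdio.h>

/* Cap the 2PC check window the way do_query_2pc() does. */
static int64_t
cap_check_time(int64_t check_time, int max_check_time)
{
    if (check_time > INT32_MAX)
        check_time = INT32_MAX;
    if (max_check_time != 0 && check_time > max_check_time)
        check_time = max_check_time;
    return check_time;
}

int main(void)
{
    /* A 100000s window is capped to the 1200s maximum -> prints 1200. */
    printf("%lld\n", (long long) cap_check_time(100000, 1200));
    return 0;
}
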
NULL, NULL }, diff --git a/src/include/postmaster/clean2pc.h b/src/include/postmaster/clean2pc.h index 2d94442b..4754ab63 100644 --- a/src/include/postmaster/clean2pc.h +++ b/src/include/postmaster/clean2pc.h @@ -19,6 +19,7 @@ extern bool enable_clean_2pc_launcher; extern int auto_clean_2pc_interval; extern int auto_clean_2pc_delay; extern int auto_clean_2pc_timeout; +extern int auto_clean_2pc_max_check_time; extern bool IsClean2pcLauncher(void); extern bool IsClean2pcWorker(void); From 4ae15b63748d282a8fc9802387c4d8e3852109c8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 1 Nov 2021 10:52:39 +0800 Subject: [PATCH 431/578] fix PGXCNodeSendShowQuery coredump http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131093712025 --- src/backend/pgxc/pool/pgxcnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index d84611ee..9e645974 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5559,8 +5559,7 @@ PGXCNodeSendShowQuery(NODE_CONNECTION *conn, const char *sql_command) resStatus = PQresultStatus(result); if (resStatus == PGRES_TUPLES_OK || resStatus == PGRES_COMMAND_OK) { - /* ignore unit */ - snprintf(number, result->tuples[0][0].len, "%s", PQgetvalue(result, 0, 0)); + snprintf(number, 128, "%s", PQgetvalue(result, 0, 0)); } PQclear(result); From b32ab5e1c4a6f3f0f0106fc8bbe1bdc9cb32d0be Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 3 Nov 2021 15:13:19 +0800 Subject: [PATCH 432/578] auto clean 2pc guc configuration default value optimize --- src/backend/postmaster/clean2pc.c | 8 ++++---- src/backend/utils/misc/guc.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index b7f8fa02..def81c95 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -57,10 +57,10 @@ typedef enum bool enable_clean_2pc_launcher = true; -int auto_clean_2pc_interval = 30; -int auto_clean_2pc_delay = 10; -int auto_clean_2pc_timeout = 300; -int auto_clean_2pc_max_check_time = 300; +int auto_clean_2pc_interval = 60; +int auto_clean_2pc_delay = 300; +int auto_clean_2pc_timeout = 1200; +int auto_clean_2pc_max_check_time = 1200; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fd1b4720..af8cef6e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4860,7 +4860,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 30, 10, INT_MAX, + 60, 10, INT_MAX, NULL, NULL, NULL }, @@ -4871,7 +4871,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 10, 3, INT_MAX, + 300, 3, INT_MAX, NULL, NULL, NULL }, @@ -4882,7 +4882,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, - 300, 0, INT_MAX, + 1200, 0, INT_MAX, NULL, NULL, NULL }, @@ -4893,7 +4893,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, - 300, 0, INT_MAX, + 1200, 0, INT_MAX, NULL, NULL, NULL }, From 37f66eaa879fb32038cb6555729c0309b5a490ca Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 8 Sep 2021 16:35:53 +0800 Subject: [PATCH 433/578] Bugfix: procedure error, ID90798511 (merge request git push origin Tbase_v2.15.19.4) (cherry picked from commit e29ea02b) 69eeac67 bugfix: procedure error, add 
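
The PGXCNodeSendShowQuery() fix above replaces a size argument taken from the result tuple with the destination capacity. A standalone reminder of the rule the fix relies on: snprintf's size argument must describe the destination buffer, never the source string (the value below is made up):

#include <stdio.h>

int main(void)
{
    char        number[128];
    const char *value = "1048576kB";   /* stands in for PQgetvalue(result, 0, 0) */

    /* Size by the destination, so the copy can never overrun `number`. */
    snprintf(number, sizeof(number), "%s", value);
    printf("%s\n", number);
    return 0;
}
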
regress test cases, ID90798511 7ea215ed bugfix: procedure error, ID90798511 --- src/backend/access/transam/varsup.c | 28 ++++++++++++++++++++++++---- src/backend/storage/ipc/procarray.c | 5 ++++- src/include/storage/procarray.h | 3 ++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 42baa98f..e4733f82 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -93,7 +93,8 @@ GetForceXidFromGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static TransactionId local_xid = InvalidTransactionId; static TransactionId local_subxids[PGPROC_MAX_CACHED_SUBXIDS] = {}; -static int local_nsub; +static int local_nsub = 0; +static bool local_overflowed = false; /* exported information about parallel workers, see xact.c */ extern int nParallelCurrentXids; extern TransactionId *ParallelCurrentXids; @@ -129,7 +130,8 @@ StoreGlobalXid(const char *globalXid) else if(IsConnFromDatanode()) { - local_xid = GetLocalTransactionId(globalXid, local_subxids, &local_nsub); + local_xid = GetLocalTransactionId(globalXid, + local_subxids, &local_nsub, &local_overflowed); if(enable_distri_print) { elog (LOG, " global xid %s to local xid %d, %d subxids", globalXid, local_xid, local_nsub); @@ -192,8 +194,6 @@ GetSubTransactions(void) bool TransactIdIsCurentGlobalTransacId(TransactionId xid) { - int i; - if(enable_distri_print) { elog(LOG, "is current transaction xid %u local xid %d", xid, local_xid); @@ -205,12 +205,32 @@ TransactIdIsCurentGlobalTransacId(TransactionId xid) if (TransactionIdEquals(xid, local_xid)) return true; + if (!local_overflowed) + { /* check subxids */ + int i; for (i = 0; i < local_nsub; i++) { if (TransactionIdEquals(local_subxids[i], xid)) return true; } + } + else + { + TransactionId topxid = SubTransGetTopmostTransaction(xid); + Assert(local_nsub == PGPROC_MAX_CACHED_SUBXIDS); + if(enable_distri_print) + { + elog(LOG, "subtransaction overflowed: xid=%d, topxid=%d, local_xid=%d", + xid, topxid, local_xid); + } + + if (!TransactionIdIsValid(topxid)) + return false; + + if (TransactionIdEquals(topxid, local_xid)) + return true; + } return false; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 35636976..45a8304e 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1963,7 +1963,8 @@ GetMaxSnapshotSubxidCount(void) } #ifdef __TBASE__ -TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub) +TransactionId GetLocalTransactionId(const char *globalXid, + TransactionId *subxids, int *nsub, bool *overflowed) { ProcArrayStruct *arrayP = procArray; @@ -1996,6 +1997,8 @@ TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxid result = pgxact->xid; + *overflowed = pgxact->overflowed; + /* look for max xid in subtrans */ *nsub = pgxact->nxids; for (nxid = 0; nxid < pgxact->nxids; nxid++) diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index d6607bcf..3b84d623 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -123,7 +123,8 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsPrepared(TransactionId xid, Snapshot snapshot, GlobalTimestamp *prepare_ts); #endif #ifdef __TBASE__ -extern TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub); +extern TransactionId GetLocalTransactionId(const char 
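
With the overflowed flag carried back from GetLocalTransactionId(), TransactIdIsCurentGlobalTransacId() scans the cached subxid array only while the cache is complete, and otherwise resolves the xid to its topmost parent. A toy, self-contained model of that lookup order; the types and the parent-lookup callback are stand-ins, not the real backend API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t TransactionId;
#define InvalidTransactionId ((TransactionId) 0)

/* Stand-in for SubTransGetTopmostTransaction(). */
typedef TransactionId (*topmost_fn)(TransactionId xid);

static bool
is_current_global_xid(TransactionId xid, TransactionId top_xid,
                      const TransactionId *subxids, int nsub,
                      bool overflowed, topmost_fn topmost)
{
    if (xid == InvalidTransactionId || top_xid == InvalidTransactionId)
        return false;
    if (xid == top_xid)
        return true;

    if (!overflowed)
    {
        /* The cached subxid array is complete, so a scan is enough. */
        for (int i = 0; i < nsub; i++)
            if (subxids[i] == xid)
                return true;
        return false;
    }

    /* Cache overflowed: map the xid to its topmost parent instead. */
    xid = topmost(xid);
    return xid != InvalidTransactionId && xid == top_xid;
}

static TransactionId
fake_topmost(TransactionId xid)
{
    (void) xid;
    return 100;              /* pretend every subxid rolls up to xid 100 */
}

int main(void)
{
    TransactionId subs[] = { 101, 102 };

    /* xid 250 is not cached, but its topmost parent matches -> prints 1. */
    printf("%d\n", is_current_global_xid(250, 100, subs, 2, true, fake_topmost));
    return 0;
}
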
*globalXid, + TransactionId *subxids, int *nsub, bool *overflowed); #endif extern char *GetGlobalTransactionId(const TransactionId pid); extern bool TransactionIdIsActive(TransactionId xid); From 0ab1363e95844d07a15caa368bd0151aecd7fc22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Fri, 20 Aug 2021 17:05:53 +0800 Subject: [PATCH 434/578] [BUGFIX] Subtransaction commits should not reset session information (cherry-pick from ffca2f98b83e7375c001cf685c61aabef6f0638c) --- src/backend/pgxc/pool/execRemote.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1708343b..6ae97233 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4839,14 +4839,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); /* do not cleanup remote session for subtrans */ - if (!temp_object_included) + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { if (PersistentConnections) { reset_handles(); @@ -4863,7 +4861,6 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } } - } clear_handles(); } @@ -5938,13 +5935,12 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) * certain issues for aborted transactions, we drop the connections. * Revisit and fix the issue */ - if (!temp_object_included) + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { + if (HaveActiveDatanodeStatements()) { reset_handles(); @@ -5954,7 +5950,6 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(handles); From 045939ad5a7835066915d328ddbe9f1f0e76128a Mon Sep 17 00:00:00 2001 From: ningxpeng Date: Tue, 17 Aug 2021 11:13:42 +0800 Subject: [PATCH 435/578] [BUGFIX] snapshot still active in CTAS mode --- src/backend/rewrite/rewriteHandler.c | 5 ++- src/backend/tcop/pquery.c | 14 +++++++ src/backend/utils/time/snapmgr.c | 26 +++++++++++- src/include/utils/snapmgr.h | 62 ++++++++++++++++------------ 4 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 8df9fe35..b2f6df8f 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -4112,8 +4112,11 @@ QueryRewriteCTAS(Query *parsetree) ProcessUtility(wrapper, cquery.data, PROCESS_UTILITY_QUERY, NULL, NULL, NULL, false, NULL); - PopActiveSnapshot(); + /* Use new snapshot for insert and update the snapshot status. 
*/ + if (ActiveSnapshotSet()) + PopActiveSnapshot(); PushActiveSnapshot(GetTransactionSnapshot()); + UpdateActiveSnapshotStatus(S_FOR_CTAS); /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 715d407b..295dc2a2 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -2018,6 +2018,20 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, if (snapshot != NULL && ActiveSnapshotSet() && snapshot == GetActiveSnapshot()) PopActiveSnapshot(); + else + { + /* Clear snapshots created in process QueryRewriteCTAS */ + while (ActiveSnapshotSet()) + { + if (S_FOR_CTAS == GetActiveSnapshotStatus() || + snapshot == GetActiveSnapshot()) + { + PopActiveSnapshot(); + continue; + } + break; + } + } } /* diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 0f4158e4..d7da6a59 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -190,6 +190,7 @@ typedef struct ActiveSnapshotElt { Snapshot as_snap; int as_level; + SnapshotStatus status; struct ActiveSnapshotElt *as_next; } ActiveSnapshotElt; @@ -895,6 +896,7 @@ PushActiveSnapshot(Snapshot snap) newactive->as_next = ActiveSnapshot; newactive->as_level = GetCurrentTransactionNestLevel(); + newactive->status = S_DEFAULT; newactive->as_snap->active_count++; @@ -957,6 +959,22 @@ UpdateActiveSnapshotCommandId(void) #endif } +void +UpdateActiveSnapshotStatus(SnapshotStatus new_status) +{ + Assert(ActiveSnapshot != NULL); + + ActiveSnapshot->status = new_status; +} + +SnapshotStatus +GetActiveSnapshotStatus(void) +{ + Assert(ActiveSnapshot != NULL); + + return ActiveSnapshot->status; +} + /* * PopActiveSnapshot * @@ -1288,10 +1306,16 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) elog(WARNING, "registered snapshots seem to remain after cleanup"); /* complain about unpopped active snapshots */ - for (active = ActiveSnapshot; active != NULL; active = active->as_next) + for (active = ActiveSnapshot; active != NULL && active->status != S_FOR_CTAS; active = active->as_next) + { elog(WARNING, "snapshot %p still active", active); } + /* Resources to clean up, pop all active snapshots */ + while (ActiveSnapshotSet()) + PopActiveSnapshot(); + } + /* * And reset our state. We don't need to free the memory explicitly -- * it'll go away with TopTransactionContext. diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index e1054705..110ed378 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * snapmgr.h - * POSTGRES snapshot manager + * POSTGRES snapshot manager * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -34,13 +34,19 @@ #define OLD_SNAPSHOT_PADDING_ENTRIES 10 #define OLD_SNAPSHOT_TIME_MAP_ENTRIES (old_snapshot_threshold + OLD_SNAPSHOT_PADDING_ENTRIES) +typedef enum SnapshotStatus +{ + S_DEFAULT, + S_FOR_CTAS /* After creating a table, obtain a new snapshot in the QueryRewriteCTAS process */ +} SnapshotStatus; + /* * Common definition of relation properties that allow early pruning/vacuuming * when old_snapshot_threshold >= 0. 
*/ #define RelationAllowsEarlyPruning(rel) \ ( \ - RelationNeedsWAL(rel) \ + RelationNeedsWAL(rel) \ && !IsCatalogRelation(rel) \ && !RelationIsAccessibleInLogicalDecoding(rel) \ && !RelationHasUnloggedIndex(rel) \ @@ -66,7 +72,7 @@ extern TransactionId RecentGlobalDataXmin; extern GlobalTimestamp RecentCommitTs; extern GlobalTimestamp RecentDataTs; -extern int vacuum_delta; +extern int vacuum_delta; extern bool vacuum_debug_print; @@ -88,6 +94,8 @@ extern void InvalidateCatalogSnapshotConditionally(void); extern void PushActiveSnapshot(Snapshot snapshot); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); +void UpdateActiveSnapshotStatus(SnapshotStatus new_status); +SnapshotStatus GetActiveSnapshotStatus(void); extern void PopActiveSnapshot(void); extern Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); @@ -106,9 +114,9 @@ extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation); + Relation relation); extern void MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, - TransactionId xmin); + TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); @@ -142,28 +150,28 @@ extern bool LookupPreparedXid(TransactionId xid, GlobalTimestamp *prepare_timest static inline bool TestForOldTimestamp(GlobalTimestamp currentTimestamp, GlobalTimestamp oldestTimestamp) { - - if(IsInitProcessingMode()) - { - return true; - } - - if(CommitTimestampIsLocal(currentTimestamp)) - { - return true; - } - - if(currentTimestamp < oldestTimestamp) - { - elog(DEBUG12, "test for old time true ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); - return true; - } - else - { - elog(DEBUG12, "test for old time false ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); - return false; - } + + if(IsInitProcessingMode()) + { + return true; + } + + if(CommitTimestampIsLocal(currentTimestamp)) + { + return true; + } + + if(currentTimestamp < oldestTimestamp) + { + elog(DEBUG12, "test for old time true ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); + return true; + } + else + { + elog(DEBUG12, "test for old time false ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); + return false; + } } -#endif /* SNAPMGR_H */ +#endif /* SNAPMGR_H */ From 261e4744b01445c008e68435e4b9f7c6ed6dc295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 7 Sep 2021 15:35:59 +0800 Subject: [PATCH 436/578] [BUGFIX] The original snapshot status must be maintained in CTAS process snapshot replacement --- src/backend/rewrite/rewriteHandler.c | 15 +++++++++++++++ src/backend/utils/time/snapmgr.c | 14 ++++++++++++++ src/include/utils/snapmgr.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index b2f6df8f..35261bee 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -3970,6 +3970,8 @@ QueryRewriteCTAS(Query *parsetree) CreateTableAsStmt *stmt; IntoClause *into; ListCell *lc; + const int InvalidLevel = -1; + int old_level = InvalidLevel; if (parsetree->commandType != CMD_UTILITY || !IsA(parsetree->utilityStmt, CreateTableAsStmt)) @@ -4114,12 +4116,25 @@ QueryRewriteCTAS(Query *parsetree) /* Use 
new snapshot for insert and update the snapshot status. */ if (ActiveSnapshotSet()) + { + old_level = GetActiveSnapshotLevel(); PopActiveSnapshot(); + } + PushActiveSnapshot(GetTransactionSnapshot()); UpdateActiveSnapshotStatus(S_FOR_CTAS); /* + * Only snapshot replacement is performed to prevent abnormal snapshot clearing caused by sub-transactions. + * Active snapshots set by this subtransaction will be cleared. + */ + if (old_level != InvalidLevel) + { + SetActiveSnapshotLevel(old_level); + } + + /* * Now fold the CTAS statement into an INSERT INTO statement. The * utility is no more required. */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index d7da6a59..71a2f0a3 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -975,6 +975,20 @@ GetActiveSnapshotStatus(void) return ActiveSnapshot->status; } +int +GetActiveSnapshotLevel(void) +{ + Assert(ActiveSnapshot != NULL); + return ActiveSnapshot->as_level; +} + +void +SetActiveSnapshotLevel(int level) +{ + Assert(ActiveSnapshot != NULL); + ActiveSnapshot->as_level = level; +} + /* * PopActiveSnapshot * diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 110ed378..896a9ff9 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -96,6 +96,8 @@ extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); void UpdateActiveSnapshotStatus(SnapshotStatus new_status); SnapshotStatus GetActiveSnapshotStatus(void); +extern int GetActiveSnapshotLevel(void); +extern void SetActiveSnapshotLevel(int level); extern void PopActiveSnapshot(void); extern Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); From b027d47a1c451bb14f7f6cfd7c7b704afa892fdb Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 4 Nov 2021 15:46:27 +0800 Subject: [PATCH 437/578] fix probabilistic error could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093496729 (merge request !882) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.4' fix probabilistic error could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093496729 TAPD: --bug=093496729 --- src/backend/access/transam/xact.c | 2 +- src/backend/pgxc/pool/poolmgr.c | 19 +++++++++++++++++++ src/backend/utils/misc/guc.c | 12 +++++++++--- src/include/utils/guc.h | 5 +++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index ed02dff9..1d255395 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -757,7 +757,7 @@ AssignGlobalXidInternal(void) globalXidVersion++; if(enable_distri_print) { - elog(LOG, "assign global xid %s prono %d seq " UINT64_FORMAT UINT64_FORMAT, + elog(LOG, "assign global xid %s prono %d seq " UINT64_FORMAT" "UINT64_FORMAT, str.data, MyProc->pgprocno, seq, globalXidVersion); } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b9c3bd8c..2db4397e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -67,6 +67,9 @@ #include "port.h" #include #include +#ifdef __TBASE__ +#include "access/xlog.h" +#endif /* the mini use conut of a connection */ #define MINI_USE_COUNT 10 @@ -409,6 +412,9 @@ static void pooler_subthread_write_log(int elevel, int lineno, const char *filen #define MAX_THREAD_LOG_PIPE_LEN (2 * 1024) /* length of thread log pipe */ #define DEFAULT_LOG_BUF_LEN (1024) /* length of 
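
Patches 435 and 436 tag each active-snapshot stack entry with a status (S_FOR_CTAS) and let QueryRewriteCTAS keep the replaced entry's nesting level when it swaps in a fresh snapshot. A deliberately simplified, self-contained model of such a tagged stack; none of these structs are the real snapmgr ones:

#include <stdio.h>
#include <stdlib.h>

typedef enum { S_DEFAULT, S_FOR_CTAS } SnapStatus;

typedef struct SnapElt
{
    int             level;    /* transaction nesting level */
    SnapStatus      status;   /* why the snapshot was pushed */
    struct SnapElt *next;
} SnapElt;

static SnapElt *active = NULL;

static void
push_snapshot(int level)
{
    SnapElt *e = malloc(sizeof(SnapElt));

    if (e == NULL)
        abort();
    e->level  = level;
    e->status = S_DEFAULT;
    e->next   = active;
    active    = e;
}

static void
pop_snapshot(void)
{
    SnapElt *e = active;

    active = e->next;
    free(e);
}

int main(void)
{
    int old_level;

    /* Mimic QueryRewriteCTAS: remember the replaced entry's level, push a
     * fresh snapshot, tag it S_FOR_CTAS, and restore the old level so a
     * subtransaction cannot discard it too early. */
    push_snapshot(1);
    old_level = active->level;
    pop_snapshot();

    push_snapshot(2);
    active->status = S_FOR_CTAS;
    active->level  = old_level;

    printf("level=%d status=%d\n", active->level, active->status);
    pop_snapshot();
    return 0;
}
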
thread log length */ PGPipe *g_ThreadLogQueue = NULL; +#ifdef __TBASE__ +bool g_allow_distri_query_on_standby_node = false; +#endif static inline void RebuildAgentIndex(void); @@ -1830,6 +1836,19 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int j = 0; +#ifdef __TBASE__ + /* + * if it is the standby node of the main plane, the distributed query will be connected to + * the main data node, and the standby cn may generate the same global xid as the main cn, + * so disable the distributed query of the standby node on the main plane + */ + if (g_allow_distri_query_on_standby_node == false && + IsPGXCMainCluster && RecoveryInProgress()) + { + elog(ERROR, "can't do distributed query because it is the main plane standby node."); + } +#endif + HOLD_POOLER_RELOAD(); if (poolHandle == NULL) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index af8cef6e..8b9a9fe9 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2501,9 +2501,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - - - + { + {"allow_distri_query_on_standby_node", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("allow distributed query on main plane standby node"), + NULL + }, + &g_allow_distri_query_on_standby_node, + false, + NULL, NULL, NULL + }, { {"enable_committs_print", PGC_SUSET, CUSTOM_OPTIONS, gettext_noop("enable commit ts debug print"), diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 7ae45b95..95342821 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -312,6 +312,11 @@ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; extern bool trace_extent; #endif + +#ifdef __TBASE__ +extern bool g_allow_distri_query_on_standby_node; +#endif + #ifdef XCP extern char *global_session_string; #endif From 9ce9464560c4a0ec6846c979fe13695ad789884e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 13 Aug 2021 10:28:31 +0800 Subject: [PATCH 438/578] add 2pc protection fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131089315277 (merge request git status) --- src/backend/pgxc/pool/execRemote.c | 79 ++++++++++--- src/backend/pgxc/pool/pgxcnode.c | 177 +++++++++++++++++++++++++++++ src/include/pgxc/pgxcnode.h | 7 ++ 3 files changed, 245 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 6ae97233..28af8f31 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -2549,6 +2549,14 @@ FetchTuple(ResponseCombiner *combiner) } else if (res == RESPONSE_COMPLETE) { + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + /* * In case of Simple Query Protocol we should receive ReadyForQuery * before removing connection from the list. 
In case of Extended @@ -2656,13 +2664,6 @@ FetchTuple(ResponseCombiner *combiner) return NULL; } } - else if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } } else if (res == RESPONSE_ERROR) { @@ -3683,6 +3684,8 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, connections[i]->nodename, connections[i]->backend_pid); new_connections[new_count++] = connections[i]; + /* if send begin, register current connection */ + register_transaction_handles(connections[i]); } } @@ -3938,7 +3941,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) ResponseCombiner combiner; PGXCNodeHandle **connections = NULL; int conn_count = 0; - PGXCNodeAllHandles *handles = get_current_handles(); + /* get current transaction handles that we register when pgxc_node_begin */ + PGXCNodeAllHandles *handles = get_current_txn_handles(); #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ GlobalTimestamp global_prepare_ts = InvalidGlobalTimestamp; #endif @@ -4071,7 +4075,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Skip empty slots */ if (conn->sock == NO_SOCKET) - continue; + { + elog(ERROR, "pgxc_node_remote_prepare, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } else if (conn->transaction_status == 'T') { /* Read in any pending input */ @@ -4277,7 +4284,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Skip empty slots */ if (conn->sock == NO_SOCKET) - continue; + { + elog(ERROR, "pgxc_node_remote_prepare, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } else if (conn->transaction_status == 'T') { if (conn->read_only) @@ -5489,7 +5499,8 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) conn = handles->datanode_handles[i]; if (conn->sock == NO_SOCKET) { - continue; + elog(ERROR, "get_partnodes, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); } else if (conn->transaction_status == 'T') { @@ -5499,6 +5510,11 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) appendStringInfo(participants, "%s,", conn->nodename); } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "get_partnodes, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } for (i = 0; i < handles->co_conn_count; i++) @@ -5506,7 +5522,8 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) conn = handles->coord_handles[i]; if (conn->sock == NO_SOCKET) { - continue; + elog(ERROR, "get_partnodes, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); } else if (conn->transaction_status == 'T') { @@ -5516,6 +5533,11 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) appendStringInfo(participants, "%s,", conn->nodename); } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "get_partnodes, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } if (is_readonly && !IsXidImplicit(gid)) { @@ -7524,6 +7546,7 @@ void AtEOXact_Remote(void) { PGXCNodeResetParams(true); + reset_transaction_handles(); } /* @@ -8066,6 +8089,7 
@@ PostPrepare_Remote(char *prepareGID, bool implicit) if (log_gtm_stats) ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t); #endif + reset_transaction_handles(); } /* @@ -8133,8 +8157,8 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(ERROR, "IsTwoPhaseCommitRequired, Found %d sock fatal handles exist", sock_fatal_count); } #endif - - handles = get_current_handles(); + /* get current transaction handles that we register when pgxc_node_begin */ + handles = get_current_txn_handles(); for (i = 0; i < handles->dn_conn_count; i++) { PGXCNodeHandle *conn = handles->datanode_handles[i]; @@ -8143,8 +8167,12 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c", conn->nodename, conn->sock, conn->read_only, conn->transaction_status); #endif - if (conn->sock != NO_SOCKET && !conn->read_only && - conn->transaction_status == 'T') + if (conn->sock == NO_SOCKET) + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } + else if (!conn->read_only && conn->transaction_status == 'T') { if (found) { @@ -8156,6 +8184,11 @@ IsTwoPhaseCommitRequired(bool localWrite) found = true; /* first found */ } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } for (i = 0; i < handles->co_conn_count; i++) { @@ -8165,8 +8198,12 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c", conn->nodename, conn->sock, conn->read_only, conn->transaction_status); #endif - if (conn->sock != NO_SOCKET && !conn->read_only && - conn->transaction_status == 'T') + if (conn->sock == NO_SOCKET) + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } + else if (!conn->read_only && conn->transaction_status == 'T') { if (found) { @@ -8178,6 +8215,11 @@ IsTwoPhaseCommitRequired(bool localWrite) found = true; /* first found */ } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } pfree_pgxc_all_handles(handles); @@ -8898,6 +8940,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); + reset_transaction_handles(); pfree(finish_cmd); #ifdef __TWO_PHASE_TRANS__ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 9e645974..8968700d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -98,6 +98,8 @@ static PGXCNodeHandle *sdn_handles = NULL; */ static PGXCNodeHandle *co_handles = NULL; +PGXCNodeAllHandles *current_transaction_handles = NULL; + #ifdef __TBASE__ /* Hash key: nodeoid value: index in dn_handles or co_handles */ static HTAB *node_handles_hash = NULL; @@ -164,6 +166,8 @@ static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len) static PGXCNodeAllHandles * get_empty_handles(void); static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); +static void get_current_txn_dn_handles_internal(PGXCNodeAllHandles *result); 
+static void get_current_txn_cn_handles_internal(PGXCNodeAllHandles *result); #endif /* @@ -324,6 +328,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter primary datanode nodeoid: %d", node_handle_ent->nodeoid); } + dn_handles[count].node_type = PGXC_NODE_DATANODE; #endif } @@ -354,6 +359,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter slave datanode nodeoid: %d", node_handle_ent->nodeoid); } + sdn_handles[count].node_type = PGXC_NODE_SLAVEDATANODE; #endif } @@ -383,6 +389,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter coordinator nodeoid: %d", node_handle_ent->nodeoid); } + co_handles[count].node_type = PGXC_NODE_COORDINATOR; #endif } @@ -427,6 +434,8 @@ InitMultinodeExecutor(bool is_force) #ifdef __TBASE__ if(strcmp(PGXCMainClusterName, PGXCClusterName) == 0) IsPGXCMainCluster = true; + + init_transaction_handles(); #endif } @@ -4165,7 +4174,16 @@ get_current_handles(void) } #ifdef __TBASE__ +/* get current transaction handles that register in pgxc_node_begin */ +PGXCNodeAllHandles * +get_current_txn_handles(void) +{ + PGXCNodeAllHandles *result = get_empty_handles(); + get_current_txn_cn_handles_internal(result); + get_current_txn_dn_handles_internal(result); + return result; +} PGXCNodeAllHandles * get_current_cn_handles(void) @@ -4211,6 +4229,35 @@ get_current_dn_handles_internal(PGXCNodeAllHandles *result) } } +/* get current transaction dn handles that register in pgxc_node_begin */ +static void +get_current_txn_dn_handles_internal(PGXCNodeAllHandles *result) +{ + int i; + int count = 0; + + if (current_transaction_handles == NULL || current_transaction_handles->dn_conn_count == 0) + { + return; + } + + count = current_transaction_handles->dn_conn_count; + result->datanode_handles = (PGXCNodeHandle **) + palloc(count * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + result->dn_conn_count = 0; + for (i = 0; i < count; i++) + { + result->datanode_handles[result->dn_conn_count++] = current_transaction_handles->datanode_handles[i]; + } +} + static void get_current_cn_handles_internal(PGXCNodeAllHandles *result) { @@ -4237,6 +4284,35 @@ get_current_cn_handles_internal(PGXCNodeAllHandles *result) } } +/* get current transaction cn handles that register in pgxc_node_begin */ +static void +get_current_txn_cn_handles_internal(PGXCNodeAllHandles *result) +{ + int i; + int count = 0; + + if (current_transaction_handles == NULL || current_transaction_handles->co_conn_count == 0) + { + return; + } + + count = current_transaction_handles->co_conn_count; + result->coord_handles = (PGXCNodeHandle **) + palloc(count * sizeof(PGXCNodeHandle *)); + if (!result->coord_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + result->co_conn_count = 0; + for (i = 0; i < count; i++) + { + result->coord_handles[result->co_conn_count++] = current_transaction_handles->coord_handles[i]; + } +} + PGXCNodeAllHandles * get_sock_fatal_handles(void) { @@ -4290,6 +4366,107 @@ get_sock_fatal_handles(void) return result; } + +/* + * init current transaction handles for connections + */ +void +init_transaction_handles(void) +{ + MemoryContext oldcontext; + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + if (current_transaction_handles == NULL) + { + current_transaction_handles = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); + } + + current_transaction_handles->primary_handle = NULL; + + 
current_transaction_handles->dn_conn_count = 0; + if (current_transaction_handles->datanode_handles == NULL) + { + current_transaction_handles->datanode_handles = (PGXCNodeHandle **) palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + } + else + { + current_transaction_handles->datanode_handles = (PGXCNodeHandle **) repalloc(current_transaction_handles->datanode_handles, NumDataNodes * sizeof(PGXCNodeHandle *)); + } + + current_transaction_handles->co_conn_count = 0; + if (current_transaction_handles->coord_handles == NULL) + { + current_transaction_handles->coord_handles = (PGXCNodeHandle **) palloc(NumCoords * sizeof(PGXCNodeHandle *)); + } + else + { + current_transaction_handles->coord_handles = (PGXCNodeHandle **) repalloc(current_transaction_handles->coord_handles, NumCoords * sizeof(PGXCNodeHandle *)); + } + MemoryContextSwitchTo(oldcontext); + return; +} + +/* + * reset current transaction handles + */ +void +reset_transaction_handles(void) +{ + if (current_transaction_handles == NULL) + { + return; + } + + current_transaction_handles->dn_conn_count = 0; + current_transaction_handles->co_conn_count = 0; + return; +} + +/* + * register current transaction handle to current_transaction_handles + */ +void +register_transaction_handles(PGXCNodeHandle* handle) +{ + int i = 0; + char node_type = handle->node_type; + + if (!IS_PGXC_LOCAL_COORDINATOR) + { + return; + } + + Assert (current_transaction_handles != NULL); + + if (node_type == PGXC_NODE_DATANODE) + { + for (i = 0; i < current_transaction_handles->dn_conn_count; i++) + { + if (current_transaction_handles->datanode_handles[i] == handle) + { + return; + } + } + current_transaction_handles->datanode_handles[current_transaction_handles->dn_conn_count++] = handle; + Assert(current_transaction_handles->dn_conn_count <= NumDataNodes); + } + else if (node_type == PGXC_NODE_COORDINATOR) + { + for (i = 0; i < current_transaction_handles->co_conn_count; i++) + { + if (current_transaction_handles->coord_handles[i] == handle) + { + return; + } + } + current_transaction_handles->coord_handles[current_transaction_handles->co_conn_count++] = handle; + Assert(current_transaction_handles->co_conn_count <= NumCoords); + } + else + { + elog(ERROR, "invalid node_type %c in register_transaction_handles", node_type); + } +} + #endif /* Free PGXCNodeAllHandles structure */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 22075b68..8d51d1dc 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -129,6 +129,7 @@ struct pgxc_node_handle long recv_datarows; bool plpgsql_need_begin_sub_txn; bool plpgsql_need_begin_txn; + char node_type; #endif }; typedef struct pgxc_node_handle PGXCNodeHandle; @@ -143,6 +144,8 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern PGXCNodeAllHandles *current_transaction_handles; + extern volatile bool HandlesInvalidatePending; extern void InitMultinodeExecutor(bool is_force); @@ -178,9 +181,13 @@ extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ +extern PGXCNodeAllHandles *get_current_txn_handles(void); extern PGXCNodeAllHandles *get_current_cn_handles(void); extern PGXCNodeAllHandles *get_current_dn_handles(void); extern PGXCNodeAllHandles * get_sock_fatal_handles(void); +extern void init_transaction_handles(void); +extern void reset_transaction_handles(void); +extern void register_transaction_handles(PGXCNodeHandle* 
handle); #endif extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); From a9fd7bd7dc9d050bf3a782ba57c426d710954db7 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 11 Nov 2021 17:24:20 +0800 Subject: [PATCH 439/578] Fix concurrent update when existing initplan http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094082541 --- src/backend/executor/execMain.c | 24 +++++------ src/backend/pgxc/pool/execRemote.c | 20 +++++++-- src/test/regress/expected/insert.out | 63 ++++++++++++++++++++++++++++ src/test/regress/sql/insert.sql | 41 ++++++++++++++++++ 4 files changed, 131 insertions(+), 17 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index ca60ff43..aa69e7a3 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3188,12 +3188,7 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, epqstate->planstate = NULL; epqstate->origslot = NULL; /* ... and remember data that EvalPlanQualBegin will need */ - epqstate->plan = copyObject(subplan); - /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, - (estate->es_plannedstmt ? - estate->es_plannedstmt->subplans : NULL), - "epq"); + epqstate->plan = subplan; epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3209,12 +3204,7 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) /* If we have a live EPQ query, shut it down */ EvalPlanQualEnd(epqstate); /* And set/change the plan pointer */ - epqstate->plan = copyObject(subplan); - /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, - (epqstate->parentestate->es_plannedstmt ? - epqstate->parentestate->es_plannedstmt->subplans : NULL), - "epq"); + epqstate->plan = subplan; /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; } @@ -3448,7 +3438,7 @@ EvalPlanQualBegin(EPQState *epqstate, EState *parentestate) if (estate == NULL) { /* First time through, so create a child EState */ - EvalPlanQualStart(epqstate, parentestate, epqstate->plan); + EvalPlanQualStart(epqstate, parentestate, copyObject(epqstate->plan)); } else { @@ -3522,9 +3512,15 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) estate->es_snapshot = parentestate->es_snapshot; estate->es_crosscheck_snapshot = parentestate->es_crosscheck_snapshot; estate->es_range_table = parentestate->es_range_table; - estate->es_plannedstmt = parentestate->es_plannedstmt; + estate->es_plannedstmt = copyObject(parentestate->es_plannedstmt); estate->es_junkFilter = parentestate->es_junkFilter; estate->es_output_cid = parentestate->es_output_cid; + + ResetRemoteSubplanCursor(planTree, + (estate->es_plannedstmt ? 
+ estate->es_plannedstmt->subplans : NULL), + "epq"); + if (parentestate->es_num_result_relations > 0) { int numResultRelations = parentestate->es_num_result_relations; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 28af8f31..31ee6014 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10869,8 +10869,22 @@ encode_epqcontext(PlanState *planstate, char **result) for (i = 0; i < ntuples; i++) { - ItemPointerData tid = estate->es_epqTuple[i]->t_self; - int rtidx = i + 1; + ItemPointerData tid; + int16 rtidx; + int nodeid; + + if (estate->es_epqTuple[i] == NULL) + { + memset(&tid, 0, sizeof(ItemPointerData)); + rtidx = 0; + nodeid = 0; + } + else + { + tid = estate->es_epqTuple[i]->t_self; + rtidx = i + 1; + nodeid = estate->es_epqTuple[i]->t_xc_node_id; + } n16 = htons(rtidx); appendBinaryStringInfo(&buf, (char *) &n16, 2); @@ -10880,7 +10894,7 @@ encode_epqcontext(PlanState *planstate, char **result) appendBinaryStringInfo(&buf, (char *) &n16, 2); n16 = htons(tid.ip_posid); appendBinaryStringInfo(&buf, (char *) &n16, 2); - n32 = htonl(estate->es_epqTuple[i]->t_xc_node_id); + n32 = htonl(nodeid); appendBinaryStringInfo(&buf, (char *) &n32, 4); } diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 528cd56d..97d3c276 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -986,3 +986,66 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; +-- test insert with returning in JDBC +drop table if exists insertwithret; +NOTICE: table "insertwithret" does not exist, skipping +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); +execute p0(1, 'abc', 1); + a +--- + 1 +(1 row) + +execute p1(1, 'abc', 1); + a | b +---+----- + 1 | abc +(1 row) + +execute p2(1, 'abc', 1); + c +--- + 1 +(1 row) + +execute p3(1, 'abc', 1); +-- test complex INSERT +CREATE TABLE ods_time_record ( + id character(8), + mintime character varying(50), + describe character varying(50), + systemtime timestamp(6) without time zone DEFAULT orcl_sysdate(), + remarks character varying(255), + total numeric(255,0) +) +DISTRIBUTE BY SHARD (id) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE TABLE ods_today_st_river_r ( + stcd character(8) NOT NULL, + tm timestamp(6) without time zone NOT NULL, + z numeric(7,3), + q numeric(9,3), + xsa numeric(9,3), + xsavv numeric(5,3), + xsmxv numeric(5,3), + flwchrcd character(1), + wptn character(1), + msqmt character(1), + msamt character(1), + msvmt character(1), + moditime timestamp(6) without time zone +) +DISTRIBUTE BY SHARD (stcd) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+COPY ods_time_record (id, mintime, describe, systemtime, remarks, total) FROM stdin; +COPY ods_today_st_river_r (stcd, tm, z, q, xsa, xsavv, xsmxv, flwchrcd, wptn, msqmt, msamt, msvmt, moditime) FROM stdin; +ALTER TABLE ONLY ods_today_st_river_r +ADD CONSTRAINT ods_today_st_river_r_pkey PRIMARY KEY (tm, stcd); +insert into ods_time_record ("id",mintime,"describe",remarks,total) +select '1' as "id",max(moditime) as mintime ,'河道' as "describe" ,'st_river_r' as remarks, +(select count(1) from ods_today_st_river_r ) as total from ods_today_st_river_r; +drop table ods_today_st_river_r, ods_time_record; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 5591b65e..be52ef93 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -585,3 +585,44 @@ execute p0(1, 'abc', 1); execute p1(1, 'abc', 1); execute p2(1, 'abc', 1); execute p3(1, 'abc', 1); + +-- test complex INSERT +CREATE TABLE ods_time_record ( + id character(8), + mintime character varying(50), + describe character varying(50), + systemtime timestamp(6) without time zone DEFAULT orcl_sysdate(), + remarks character varying(255), + total numeric(255,0) +) +DISTRIBUTE BY SHARD (id) to GROUP default_group; +CREATE TABLE ods_today_st_river_r ( + stcd character(8) NOT NULL, + tm timestamp(6) without time zone NOT NULL, + z numeric(7,3), + q numeric(9,3), + xsa numeric(9,3), + xsavv numeric(5,3), + xsmxv numeric(5,3), + flwchrcd character(1), + wptn character(1), + msqmt character(1), + msamt character(1), + msvmt character(1), + moditime timestamp(6) without time zone +) +DISTRIBUTE BY SHARD (stcd) to GROUP default_group; +COPY ods_time_record (id, mintime, describe, systemtime, remarks, total) FROM stdin; +1 2021-11-04 00:00:00 st_river_r 2021-11-04 00:00:00 河道水情表 0 +1 2021-11-04 00:00:00 st_river_r 2021-11-04 00:00:00 河道水情表 0 +\. +COPY ods_today_st_river_r (stcd, tm, z, q, xsa, xsavv, xsmxv, flwchrcd, wptn, msqmt, msamt, msvmt, moditime) FROM stdin; +30702300 2021-11-11 12:30:00 96.710 1.150 \N \N 0.000 \N 5 1 \N \N 2021-11-11 12:31:54 +41400990 2021-11-11 12:25:00 1.020 \N \N \N 0.000 \N 6 \N \N \N 2021-11-11 12:31:54 +\. +ALTER TABLE ONLY ods_today_st_river_r +ADD CONSTRAINT ods_today_st_river_r_pkey PRIMARY KEY (tm, stcd); +insert into ods_time_record ("id",mintime,"describe",remarks,total) +select '1' as "id",max(moditime) as mintime ,'河道' as "describe" ,'st_river_r' as remarks, +(select count(1) from ods_today_st_river_r ) as total from ods_today_st_river_r; +drop table ods_today_st_river_r, ods_time_record; \ No newline at end of file From 984f5104c01bcb6547169ba72dbc9e19673eb8a5 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 12 Nov 2021 15:19:02 +0800 Subject: [PATCH 440/578] Fix a bug in DecodeMultiInsert, which caused the shardID in tuples decoded from MULTI_INSERT xlog always be 0. 
(merge request !918) TAPD: --story=869170029 --- src/backend/replication/logical/decode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index f3577766..fa3a078e 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -1381,6 +1381,9 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) header->t_infomask = xlhdr->t_infomask; header->t_infomask2 = xlhdr->t_infomask2; header->t_hoff = xlhdr->t_hoff; +#ifdef __STORAGE_SCALABLE__ + header->t_shardid = xlhdr->t_shardid; +#endif } /* From 75a5da40ac8725b22a952ab341866f9513ad902f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 10 Nov 2021 17:05:11 +0800 Subject: [PATCH 441/578] fix function scan cache lookup failed for type http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093147751 (merge request !906) Squash merge branch 'sigmalin001' into 'Tbase_v2.15.19.4' fix function scan cache lookup failed for type http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093147751 --- src/backend/nodes/outfuncs.c | 7 +++++++ src/backend/nodes/readfuncs.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 8266ad33..86320cfb 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4538,7 +4538,14 @@ _outRangeTblFunction(StringInfo str, const RangeTblFunction *node) WRITE_NODE_FIELD(funcexpr); WRITE_INT_FIELD(funccolcount); WRITE_NODE_FIELD(funccolnames); + if (portable_output) + { + WRITE_TYPID_LIST_FIELD(funccoltypes); + } + else + { WRITE_NODE_FIELD(funccoltypes); + } WRITE_NODE_FIELD(funccoltypmods); WRITE_NODE_FIELD(funccolcollations); WRITE_BITMAPSET_FIELD(funcparams); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index db2b9441..33d51a1e 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2231,7 +2231,14 @@ _readRangeTblFunction(void) READ_NODE_FIELD(funcexpr); READ_INT_FIELD(funccolcount); READ_NODE_FIELD(funccolnames); + if (portable_input) + { + READ_TYPID_LIST_FIELD(funccoltypes); + } + else + { READ_NODE_FIELD(funccoltypes); + } READ_NODE_FIELD(funccoltypmods); READ_NODE_FIELD(funccolcollations); READ_BITMAPSET_FIELD(funcparams); From bc3231c6269ba713ea64c5fae722a009b1287450 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 12 Nov 2021 16:25:59 +0800 Subject: [PATCH 442/578] disable PgxcNodeRefresh fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094064259 (merge request !919) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.3' disable PgxcNodeRefresh fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094064259 --- src/backend/pgxc/pool/poolutils.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index 0b684619..e69fae07 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -111,9 +111,11 @@ pgxc_pool_reload(PG_FUNCTION_ARGS) if (PgxcNodeRefresh()) PG_RETURN_BOOL(true); #endif +#if 0 + /* TODO: disable node refresh now, consider the handle fd state and enable refresh later */ /* Always send reload msg to pooler */ PgxcNodeRefresh(); - +#endif /* Session is being reloaded, drop prepared and temporary objects */ DropAllPreparedStatements(); From 2a3c5f3e93b5553efae20514911e71857eaf1528 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 13 Jul 2021 15:45:30 +0800 Subject: [PATCH 443/578] fix 
dblink error prepared statement http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696089823793 --- src/backend/tcop/postgres.c | 83 ++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a607a515..6e88c1f3 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1138,6 +1138,76 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } +/* + * get myself query string from original query string, + * if the query string contain multi stmt + */ +static char* +get_myself_query_string(char* query_string, char** out_query_string) +{ + char *string_delimeter = NULL; + char *myself_query_string = NULL; + int myself_query_string_len = 0; + int pos = 0; + bool in_quotation = false; + int query_string_len = 0; + + if (query_string && query_string[0] != '\0') + { + /* skip space and redundant ';' */ + while (*query_string != '\0') + { + if (ch_is_space(*query_string) || *query_string == ';') + { + query_string++; + } + else + { + break; + } + } + + if (*query_string == '\0') + { + *out_query_string = NULL; + return NULL; + } + + /* find ';' in query string, be careful of '\'' */ + query_string_len = strlen(query_string); + for (pos = 0; pos < query_string_len; pos++) + { + if (query_string[pos] == '\'') + { + in_quotation = (in_quotation) ? false : true; + } + + if (query_string[pos] == ';' && !in_quotation) + { + string_delimeter = &query_string[pos]; + break; + } + } + + if (string_delimeter == NULL) + { + myself_query_string = query_string; + query_string = NULL; + } + else + { + myself_query_string_len = string_delimeter - query_string; + myself_query_string = palloc(myself_query_string_len + 1); + memcpy(myself_query_string, query_string, myself_query_string_len); + myself_query_string[myself_query_string_len] = '\0'; + + query_string = string_delimeter + 1; + } + } + + *out_query_string = myself_query_string; + return query_string; +} /* * exec_simple_query @@ -1156,6 +1226,7 @@ exec_simple_query(const char *query_string) bool isTopLevel; char msec_str[32]; bool multiCommands = false; + char *query_string_tmp = NULL; /* * Report query to various monitoring facilities. @@ -1227,6 +1298,8 @@ exec_simple_query(const char *query_string) errmsg("COMMIT or ROLLBACK " "in multi-statement queries not allowed"))); } + + query_string_tmp = (char*) query_string; } /* @@ -1284,6 +1357,14 @@ exec_simple_query(const char *query_string) Portal portal; DestReceiver *receiver; int16 format; + char *myself_query_string = NULL; + + if (query_string_tmp && query_string_tmp[0] != '\0') + { + /* get this portal's query when has multi parse tree */ + query_string_tmp = get_myself_query_string(query_string_tmp, &myself_query_string); + } + #ifdef PGXC /* @@ -1446,7 +1527,7 @@ exec_simple_query(const char *query_string) */ PortalDefineQuery(portal, NULL, - query_string, + (myself_query_string) ? 
myself_query_string : query_string, commandTag, plantree_list, NULL); From 59f71a2ac33925d3977b8fe1609a843758c6d9d3 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 15 Nov 2021 16:14:09 +0800 Subject: [PATCH 444/578] fix compile error, cherry-pick from 693dda04e738c8a1c8aa9943d38d9367fb8f6a35 --- src/backend/tcop/postgres.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 6e88c1f3..f7bb987a 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1138,6 +1138,19 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } +static bool +ch_is_space(char ch) +{ + if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f') + { + return true; + } + else + { + return false; + } +} + /* * get myself query string from original query string, * if the query string contain multi stmt From a69038032c9d21d3031769698c85d063605eab63 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 2 Nov 2021 16:33:30 +0800 Subject: [PATCH 445/578] Set index valid after DNs have finished index creation when creating an index concurrently tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092311553 --- src/backend/commands/indexcmds.c | 40 ++++++++++------ src/backend/tcop/utility.c | 81 ++++++++++++++++++++++++++++---- src/include/catalog/index.h | 1 + 3 files changed, 98 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 03245150..3d5530b3 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1341,17 +1341,35 @@ DefineIndex(Oid relationId, * Index can now be marked valid -- update its pg_index entry */ #ifdef __TBASE__ - rel = heap_open(relationId, NoLock); - - if (!RELATION_IS_INTERVAL(rel)) + /* + * The local coordinator sets this after the command has been sent to the + * DNs and other CNs; see ProcessUtilityPost. + */ + if (!IS_PGXC_LOCAL_COORDINATOR) { #endif - index_set_state_flags(indexRelationId, INDEX_CREATE_SET_VALID); + IndexCreateSetValid(indexRelationId, heaprelid.relId); #ifdef __TBASE__ } - - heap_close(rel, NoLock); #endif + + /* + * Last thing to do is release the session-level lock on the parent table. + */ + UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); + + return address; +} + +/* + * Set the index as valid in pg_index; called after the third phase of + * concurrent index creation. Remember to call it on the CN AFTER the DNs do. + */ +void +IndexCreateSetValid(Oid index, Oid rel) +{ + index_set_state_flags(index, INDEX_CREATE_SET_VALID); + /* * The pg_index update will cause backends (including this one) to update * relcache entries for the index itself, but we should also send a @@ -1360,14 +1378,8 @@ DefineIndex(Oid relationId, * would be useful. (Note that our earlier commits did not create reasons * to replan; so relcache flush on the index itself was sufficient.) */ - CacheInvalidateRelcacheByRelid(heaprelid.relId); - - /* - * Last thing to do is release the session-level lock on the parent table.
- */ - UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); - - return address; + if (OidIsValid(rel)) + CacheInvalidateRelcacheByRelid(rel); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a984b9e6..c90f1759 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1648,6 +1648,9 @@ ProcessUtilityPost(PlannedStmt *pstmt, auto_commit = stmt->concurrent; if (stmt->isconstraint) exec_type = EXEC_ON_NONE; + + if (exec_type == EXEC_ON_ALL_NODES && stmt->concurrent) + exec_type = EXEC_ON_DATANODES; } break; @@ -1793,8 +1796,76 @@ ProcessUtilityPost(PlannedStmt *pstmt, #endif if (IS_PGXC_LOCAL_COORDINATOR) + { ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); + + if (IsA(parsetree, IndexStmt) && + ((IndexStmt *) parsetree)->concurrent) + { + /* + * When we get here, all DN have done with index creation, time to set index + * valid on CN. + */ + IndexStmt *stmt = (IndexStmt *) parsetree; + Oid indexid = InvalidOid; + Relation rel = relation_openrv_extended(stmt->relation, NoLock, true); + + /* exec_type can't be EXEC_ON_ALL_NODES, as changed in "switch case" above */ + Assert(exec_type != EXEC_ON_ALL_NODES); + + if (rel == NULL) + { + /* + * Failed to get enough message from stmt, have to guess a namespace. + * This should not happen but ... + */ + indexid = RelnameGetRelid(stmt->idxname); + CommitTransactionCommand(); + StartTransactionCommand(); + + IndexCreateSetValid(indexid, InvalidOid); + } + else + { + Oid relid = RelationGetRelid(rel); + Oid namespace = RelationGetNamespace(rel); + int nParts = 0; + int i; + Oid child_index; + Oid child_rel; + + indexid = get_relname_relid(stmt->idxname, namespace); + + if (rel != NULL && RELATION_IS_INTERVAL(rel)) + nParts = RelationGetNParts(rel); + relation_close(rel, NoLock); + + CommitTransactionCommand(); + StartTransactionCommand(); + IndexCreateSetValid(indexid, relid); + + /* if there are interval partitions, do the same thing */ + for (i = 0; i < nParts; i++) + { + child_index = get_relname_relid(GetPartitionName(indexid, i, true), namespace); + child_rel = get_relname_relid(GetPartitionName(relid, i, false), namespace); + + IndexCreateSetValid(child_index, child_rel); + } + + /* + * Notice: community version of partition table is not allow to build + * index concurrently, so don't bother here. + */ + } + + /* finally, tell other CN to create an index */ + if (exec_type != EXEC_ON_NONE) + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, + EXEC_ON_COORDS, is_temp, add_context); + } + } } #ifdef __TBASE__ @@ -4374,16 +4445,6 @@ ProcessUtilitySlow(ParseState *pstate, } MemoryContextDelete(temp); - - if (stmt->concurrent) - { - /* - * Commit this transaction to make the indisready update visible. 
- */ - CommitTransactionCommand(); - StartTransactionCommand(); - index_set_state_flags(indexOid, INDEX_CREATE_SET_VALID); - } } else if (RELATION_IS_CHILD(rel)) { diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index c60ad12f..d37be02e 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -211,5 +211,6 @@ extern bool index_is_interval(Oid indexId); #endif extern void IndexSetParentIndex(Relation idx, Oid parentOid); +extern void IndexCreateSetValid(Oid index, Oid rel); #endif /* INDEX_H */ From d4cb6fb47532bffd4f466e54b9fc8ecfe6bce340 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Nov 2021 11:46:53 +0800 Subject: [PATCH 446/578] fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 (merge request !938) Squash merge branch 'sigmalin002' into 'Tbase_v2.15.19.3' fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 TAPD: --bug=094179541 --- src/backend/pgxc/pool/execRemote.c | 18 ++++++++-- src/backend/pgxc/pool/pgxcnode.c | 57 ++++++++++++++++++++++-------- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 31ee6014..20cab418 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3733,7 +3733,15 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, { for (i = 0; i < new_count; i++) { - pgxc_node_set_query(new_connections[i], init_str); + if (pgxc_node_set_query(new_connections[i], init_str)) + { + /* + * print log here and return eof indicates execution failure + */ + elog(LOG, "pgxc_node_begin send %s to node %s, pid:%d failed", init_str, + new_connections[i]->nodename, new_connections[i]->backend_pid); + return EOF; + } elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, new_connections[i]->nodename, new_connections[i]->backend_pid); } @@ -7046,7 +7054,13 @@ LeaderCnExecRemoteUtility(RemoteQuery *node, char *init_str = PGXCNodeGetSessionParamStr(); if (init_str) { - pgxc_node_set_query(leader_cn_conn, init_str); + if (pgxc_node_set_query(leader_cn_conn, init_str)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + leader_cn_conn->nodename, leader_cn_conn->backend_pid))); + } } SetPlpgsqlTransactionBegin(leader_cn_conn); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8968700d..23225be6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -149,7 +149,7 @@ static bool DoRefreshRemoteHandles(void); #ifdef XCP static void pgxc_node_init(PGXCNodeHandle *handle, int sock, - bool global_session, int pid); + bool global_session, int pid, bool is_reset_handle); #else static void pgxc_node_init(PGXCNodeHandle *handle, int sock); #endif @@ -667,8 +667,8 @@ pgxc_node_all_free(void) * Structure stores state info and I/O buffers */ static void -pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) -{// #lizard forgives +pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, bool is_reset_handle) +{ char *init_str; handle->sock = sock; @@ -701,9 +701,20 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) if (global_session) { init_str = PGXCNodeGetSessionParamStr(); - if (init_str) + if (init_str && pgxc_node_set_query(handle, init_str)) + 
{ + if (is_reset_handle) + { + /* if it is a reset handle, do not throw error, just set handle as error state */ + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); + elog(WARNING, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + handle->nodename, handle->backend_pid); + } + else { - pgxc_node_set_query(handle, init_str); + elog(ERROR, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + handle->nodename, handle->backend_pid); + } } } @@ -1546,6 +1557,7 @@ release_handles(bool force) /* * Reset all Datanode and Coordinator connections occupied memory. + * TODO: fix implicit transaction do not commit on dn and remove reset_handles */ void reset_handles(void) @@ -1570,7 +1582,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } @@ -1580,7 +1592,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } @@ -1593,10 +1605,16 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } } + + if (validate_handles()) + { + elog(LOG, "found bad remote node connections, force release handles now"); + release_handles(true); + } } /* @@ -3727,7 +3745,7 @@ get_any_handle(List *datanodelist) node_handle = &dn_handles[node]; - pgxc_node_init(node_handle, fds[0], true, pids[0]); + pgxc_node_init(node_handle, fds[0], true, pids[0], false); datanode_count++; elog(DEBUG1, "Established a connection with datanode \"%s\"," @@ -4003,7 +4021,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); dn_handles[node] = *node_handle; datanode_count++; @@ -4068,7 +4086,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); co_handles[node] = *node_handle; coord_count++; @@ -5094,14 +5112,18 @@ PGXCNodeGetTransactionParamStr(void) /* * Send down specified query, read and discard all responses until ReadyForQuery */ -void +int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { if (pgxc_node_send_query(handle, set_query) != 0) { - ereport(ERROR, + /* + * print log only and decide whether to throw an error at the place where it is called + */ + ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send query %s",set_query))); + return EOF; } /* * Now read responses until ReadyForQuery. 
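With this change pgxc_node_set_query no longer raises an error itself; it logs the failure and returns EOF, and each call site chooses the severity. A minimal sketch of the intended caller pattern, using only names that appear in the hunks above (the wrapper function itself is illustrative, not part of the patch):

static void
send_session_params(PGXCNodeHandle *handle, const char *init_str)
{
	/* pgxc_node_set_query now returns 0 on success and EOF on failure */
	if (pgxc_node_set_query(handle, init_str))
	{
		/* the caller decides: mark the connection bad, then error out or only warn */
		PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
		elog(ERROR, "failed to send session parameters to node %s, pid %d",
			 handle->nodename, handle->backend_pid);
	}
}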
@@ -5142,8 +5164,11 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { PGXCNodeHandleError(handle, msg, msglen); PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - elog(ERROR,"pgxc_node_set_query: %s",handle->error); - break; + /* + * print log only and decide whether to throw an error at the place where it is called + */ + elog(LOG,"pgxc_node_set_query: %s",handle->error); + return EOF; } if (msgtype == 'Z') /* ReadyForQuery */ @@ -5154,6 +5179,8 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) break; } } + + return 0; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 8d51d1dc..91db953a 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -281,7 +281,7 @@ extern void PGXCNodeSetParam(bool local, const char *name, const char *value, extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); -extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); +extern int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); From 7149d0b4c2f486b6994358b6365b70fcbaf78f93 Mon Sep 17 00:00:00 2001 From: challzhang Date: Thu, 2 Dec 2021 20:02:51 +0800 Subject: [PATCH 447/578] cn analyze ignore toast fields in tuples gathered from dn --- src/backend/commands/analyze.c | 58 ++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 48cf8d22..d342a1ac 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1258,6 +1258,7 @@ acquire_sample_rows(Relation onerel, int elevel, { ItemId itemid; HeapTupleData targtuple; + HeapTuple newTuple = &targtuple; bool sample_it = false; itemid = PageGetItemId(targpage, targoffset); @@ -1351,6 +1352,59 @@ acquire_sample_rows(Relation onerel, int elevel, if (sample_it) { /* + * If connection is from Coordinator on datanodes, we discard TOAST fields in sample, + * which will lighten the load of memory usage on coordinator. + */ + if (IS_PGXC_DATANODE && IsConnFromCoord()) + { + Datum *values; + bool *nulls; + TupleDesc tupdesc = NULL; + int nattrs; + Form_pg_attribute *attrs; + int i; + + tupdesc = RelationGetDescr(onerel); + nattrs = tupdesc->natts; + attrs = tupdesc->attrs; + + values = (Datum *) palloc0(nattrs * sizeof(Datum)); + nulls = (bool *) palloc0(nattrs * sizeof(bool)); + + heap_deform_tuple(&targtuple, tupdesc, values, nulls); + + for (i = 0; i < nattrs; i++) + { + if (!attrs[i]->attbyval && attrs[i]->attlen == -1) + { + /* varlena */ + Pointer val = DatumGetPointer(values[i]); + if (val == NULL || VARATT_IS_EXTERNAL(val) || VARATT_IS_COMPRESSED(val)) + { + nulls[i] = true; + } + } + } + + newTuple = heap_form_tuple(tupdesc, values, nulls); + + pfree(values); + pfree(nulls); + + /* + * copy the identification info of the old tuple: t_ctid, t_self, and OID + * (if any) + */ + newTuple->t_data->t_ctid = targtuple.t_data->t_ctid; + newTuple->t_self = targtuple.t_self; + newTuple->t_tableOid = targtuple.t_tableOid; +#ifdef PGXC + newTuple->t_xc_node_id = targtuple.t_xc_node_id; +#endif + if (tupdesc->tdhasoid) + HeapTupleSetOid(newTuple, HeapTupleGetOid(&targtuple)); + } + /* * The first targrows sample rows are simply copied into the * reservoir. 
Then we start replacing tuples in the sample * until we reach the end of the relation. This algorithm is @@ -1363,7 +1417,7 @@ acquire_sample_rows(Relation onerel, int elevel, * the relation we're done. */ if (numrows < targrows) - rows[numrows++] = heap_copytuple(&targtuple); + rows[numrows++] = heap_copytuple(newTuple); else { /* @@ -1385,7 +1439,7 @@ acquire_sample_rows(Relation onerel, int elevel, Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); - rows[k] = heap_copytuple(&targtuple); + rows[k] = heap_copytuple(newTuple); } rowstoskip -= 1; From a6b238cb1236a6e9b9a8a9fb73a2f424a42043d0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 10 Nov 2021 15:12:38 +0800 Subject: [PATCH 448/578] fix g_commandTag coredump /an:tgit_woa_pro/ts:tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094029153 (merge request !902) Squash merge branch 'sigmalin_v2oid' into 'Tbase_v2.15.19.4' fix g_commandTag coredump /an:tgit_woa_pro/ts:tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094029153 TAPD: --bug=094029153 --- src/backend/tcop/postgres.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f7bb987a..5c80dfb5 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2452,6 +2452,17 @@ exec_bind_message(StringInfo input_message) /* Copy the plan's query string into the portal */ query_string = pstrdup(psrc->query_string); +#ifdef __AUDIT_FGA__ + if (portal && portal->commandTag) + { + g_commandTag = pnstrdup(portal->commandTag, strlen(portal->commandTag)); + } + else + { + g_commandTag = NULL; + } +#endif + /* Likewise make a copy of the statement name, unless it's unnamed */ if (stmt_name[0]) saved_stmt_name = pstrdup(stmt_name); From 3abf57fd2ff4fdc7dc37bb30ced676a3112e0f49 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 14 Dec 2021 15:13:11 +0800 Subject: [PATCH 449/578] Prune interval partition table before calculate total_table_pages (merge request !1018) Squash merge branch 'andrelin/partition_prune' into 'Tbase_v2.15.19.4' Prune interval partition table before calculate total_table_pages this affects cost evaluation of indexscan of interval partition table tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094930123 --- src/backend/optimizer/path/allpaths.c | 95 +++++++++++++++++++++++++++ src/backend/optimizer/path/costsize.c | 55 ---------------- src/backend/optimizer/plan/planmain.c | 2 + src/include/optimizer/paths.h | 3 + 4 files changed, 100 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 40bd2cf0..b5ddbfcd 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -47,12 +47,14 @@ #include "parser/parsetree.h" #include "partitioning/partprune.h" #include "pgxc/nodemgr.h" +#include "storage/lmgr.h" #ifdef PGXC #include "nodes/makefuncs.h" #include "miscadmin.h" #endif /* PGXC */ #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" +#include "utils/ruleutils.h" /* results of subquery_is_pushdown_safe */ @@ -3855,3 +3857,96 @@ debug_print_rel(PlannerInfo *root, RelOptInfo *rel) } #endif /* OPTIMIZER_DEBUG */ + +/* + * Prune children of interval partition table by qual, this happens + * before path generation phase, and adjust rel->pages and rel->tuples + * for a better cost evaluation. 
+ */ +void +prune_interval_base_rel(PlannerInfo *root) +{ + Index rti; + + for (rti = 1; rti < root->simple_rel_array_size; rti++) + { + RelOptInfo *rel = root->simple_rel_array[rti]; + + if (rel == NULL) + continue; + + Assert(rel->relid == rti); /* sanity check on array */ + + if (IS_DUMMY_REL(rel)) + continue; + + if (IS_SIMPLE_REL(rel) && rel->intervalparent && !rel->isdefault) + { + RangeTblEntry *rte; + Relation relation; + Oid partoid = InvalidOid; + Bitmapset *tmpset; + + rte = rt_fetch(rel->relid, root->parse->rtable); + relation = heap_open(rte->relid, AccessShareLock); + + /* pruning by qual */ + rel->childs = RelationGetPartitionsByQuals(relation, rel->baserestrictinfo); + +#ifdef __COLD_HOT__ + /* only datanode and SELECT command need to prune hot data */ + if (CMD_SELECT == root->parse->commandType && g_EnableDualWrite && IS_PGXC_DATANODE) + { + /* prune hot data */ + PruneHotData(RelationGetRelid(relation), rel->childs); + } +#endif + + tmpset = bms_copy(rel->childs); + + if (bms_num_members(tmpset) == 1) + { + Relids *attr_needed = rel->attr_needed; + int32 *attr_widths = rel->attr_widths; + rel->estimate_partidx = bms_first_member(tmpset); + partoid = RelationGetPartition(relation, rel->estimate_partidx, false); + + /* degrate from parent to a child of parent */ + rte->relid = partoid; + rel->intervalparent = false; + rel->isdefault = false; + rel->estimate_partidx = -1; + rel->indexlist = NULL; + LockRelationOid(partoid, AccessShareLock); + get_relation_info(root, partoid, false, rel); + rel->attr_needed = attr_needed; + rel->attr_widths = attr_widths; + check_index_predicates(root, rel); + + bms_free(rel->childs); + rel->childs = NULL; + } + else + { + int i; + Relation child; + + rel->pages = 0; + rel->tuples = 0; + + while ((i = bms_first_member(tmpset)) >= 0) + { + partoid = RelationGetPartition(relation, i, false); + + child = heap_open(partoid, AccessShareLock); + rel->pages += child->rd_rel->relpages; + rel->tuples += child->rd_rel->reltuples; + heap_close(child, AccessShareLock); + } + } + + bms_free(tmpset); + heap_close(relation, AccessShareLock); + } + } +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index f8ac09e8..5f129790 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -4226,61 +4226,6 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel) /* Should only be applied to base relations */ Assert(rel->relid > 0); -#ifdef __TBASE__ - if(rel->intervalparent && !rel->isdefault) - { - RangeTblEntry *rte; - Relation relation; - - rte = rt_fetch(rel->relid, root->parse->rtable); - relation = heap_open(rte->relid, AccessShareLock); - - //pruning - rel->childs = RelationGetPartitionsByQuals(relation, rel->baserestrictinfo); - -#ifdef __COLD_HOT__ - /* only datanode and SELECT command need to prune hot data */ - if (CMD_SELECT == root->parse->commandType && g_EnableDualWrite && IS_PGXC_DATANODE) - { - /* prune hot data */ - PruneHotData(RelationGetRelid(relation), rel->childs); - } -#endif - - if(bms_num_members(rel->childs) == 1) - { - Oid partoid = InvalidOid; - Relids *attr_needed = rel->attr_needed; - int32 *attr_widths = rel->attr_widths; - Bitmapset * bmscopy = bms_copy(rel->childs); - rel->estimate_partidx = bms_first_member(bmscopy); - partoid = RelationGetPartition(relation, rel->estimate_partidx, false); - - //degrate from parent to a child of parent - rte->relid = partoid; - rel->intervalparent = false; - rel->isdefault = false; - if(rel->childs) - { - 
bms_free(rel->childs); - rel->childs = NULL; - } - rel->estimate_partidx = -1; - rel->indexlist = NULL; - LockRelationOid(partoid,AccessShareLock); - get_relation_info(root, partoid, false, rel); - rel->attr_needed = attr_needed; - rel->attr_widths = attr_widths; - check_index_predicates(root, rel); - //UnlockRelationOid(partoid,AccessShareLock); - - bms_free(bmscopy); - } - - heap_close(relation, AccessShareLock); - } -#endif - nrows = rel->tuples * clauselist_selectivity(root, rel->baserestrictinfo, diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index a1a689ee..c04e838b 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -225,6 +225,8 @@ query_planner(PlannerInfo *root, List *tlist, */ extract_restriction_or_clauses(root); + prune_interval_base_rel(root); + /* * We should now have size estimates for every actual table involved in * the query, and we also know which if any have been deleted from the diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 48d6f994..ebeca134 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -236,4 +236,7 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root, #ifdef __TBASE__ extern double path_count_datanodes(Path *path); #endif + +extern void prune_interval_base_rel(PlannerInfo *root); + #endif /* PATHS_H */ From cf935e79c136d380f230f9faaab2862ce0894090 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 8 Dec 2021 21:15:59 +0800 Subject: [PATCH 450/578] fix: tpcc transaction inconsistent. resolve solution: release all handles when handle->transaction_status == 'E' in function pgxc_node_remote_abort (merge request !1009) Squash merge branch 'Tbase_v5.06_tpcc_inconsistent' into 'Tbase_v5.06' fix: tpcc transaction inconsistent. 
resolve solution: release all handles when handle->transaction_status == 'E' in function pgxc_node_remote_abort Signed-off-by: JennyJennyChen --- src/backend/pgxc/pool/execRemote.c | 8 ++++++-- src/backend/pgxc/pool/pgxcnode.c | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 20cab418..1d519162 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3390,6 +3390,8 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) conn->transaction_status = msg[0]; PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); conn->combiner = NULL; + + elog(DEBUG5, "remote_node %s remote_pid %d, conn->transaction_status %c", conn->nodename, conn->backend_pid, conn->transaction_status); #ifdef DN_CONNECTION_DEBUG conn->have_row_desc = false; #endif @@ -5661,6 +5663,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read responses from these */ sync_connections[sync_conn_count++] = conn; result = EOF; + elog(DEBUG5, "send SYNC command to CN nodename %s, backend_pid %d", conn->nodename, conn->backend_pid); } } } @@ -5695,6 +5698,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read responses from these */ sync_connections[sync_conn_count++] = conn; result = EOF; + elog(DEBUG5, "send SYNC command to DN nodename %s, backend_pid %d", conn->nodename, conn->backend_pid); } } } @@ -5934,13 +5938,13 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) { ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send SYNC to on one or more nodes errmsg:%s", combiner.errorMessage))); + errmsg("Failed to send ROLLBACK to on one or more nodes errmsg:%s", combiner.errorMessage))); } else { ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send SYNC to on one or more nodes"))); + errmsg("Failed to send ROLLBACK to on one or more nodes"))); } } CloseCombiner(&combiner); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 23225be6..11932767 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1648,6 +1648,13 @@ validate_handles(void) return true; } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } @@ -1674,6 +1681,13 @@ } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } @@ -1701,6 +1715,13 @@ return true; } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } } From bd290dc41b114ab64596c7915136c5f5eac96b9a Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 16 Dec 2021 16:03:11 +0800 Subject: [PATCH 451/578] bugfix: get gts error, get an earlier one (merge request !1023) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related bug report: http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696095181275 Problem description: while running TPCC tests with frequent consistency checks, data inconsistencies were occasionally detected. Root cause:
On the last retry of fetching the GTS, if a timeout occurs the GTM connection is not released, so the next GTS request may read the reply left over from the previous one and obtain an earlier, smaller GTS; this affects tuple visibility checks, exposes the wrong tuple version, and makes the consistency check fail. Solution: on the last retry of fetching the GTS, if the GTS obtained is still invalid, reset the GTM connection so that leftover messages cannot misalign subsequent message handling and produce a wrong GTS. (cherry picked from commit dc583306) 4b61a83b bugfix: get gts error, get a earlier one --- src/backend/access/transam/gtm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5fb8904a..8b11b530 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1505,6 +1505,14 @@ GetGlobalTimestampGTM(void) } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); + if (retry_cnt >= reconnect_gtm_retry_times && + !GlobalTimestampIsValid(gts_result.gts)) + { + elog(WARNING, "retry %d times, get a invalid global timestamp, " + "ResetGTMConnection", retry_cnt); + ResetGTMConnection(); + } + if (log_gtm_stats) ShowUsageCommon("BeginTranGTM", &start_r, &start_t); From 62fdfaaebb44014744e0029cc41a0043bb77cc5b Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 21 Dec 2021 11:05:44 +0800 Subject: [PATCH 452/578] bugfix: fix gts bug again, get an earlier one (merge request !1031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix again the problem that misaligned GTS messages cause an earlier GTS to be returned; after the previous change, self-testing found one more code path with the problem, which is fixed here. Previous change: https://git.woa.com/Tbase/PG-XL-v10/merge_requests/1023 (cherry picked from commit 778c5873) 8af73c72 bugfix: fix gts bug again, get a earlier one --- src/backend/access/transam/gtm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 8b11b530..03d76457 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1505,8 +1505,7 @@ GetGlobalTimestampGTM(void) } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); - if (retry_cnt >= reconnect_gtm_retry_times && - !GlobalTimestampIsValid(gts_result.gts)) + if (!GlobalTimestampIsValid(gts_result.gts)) { elog(WARNING, "retry %d times, get a invalid global timestamp, " "ResetGTMConnection", retry_cnt); From 440b2557c34a76433eb8ead912d092902a905d64 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 21 Dec 2021 14:09:12 +0800 Subject: [PATCH 453/578] fix: 2pc stops when the first distributed transaction is prepared successfully on all DNs and then rolled back on the CN due to an error; the second transaction, which aborts on a DN due to an error during execution, then also stops because the global g_twophase_state still holds the first transaction's status --- src/backend/access/transam/xact.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 1d255395..33cb58c9 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2707,7 +2707,11 @@ StartTransaction(void) * progress" */ s->state = TRANS_INPROGRESS; - + if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) + { + ClearLocalTwoPhaseState(); + elog(WARNING, "clear g_twophase_state when start transaction") + } ShowTransactionState("StartTransaction"); } From 0f3fcad11f3b4b2ab5c132ab75746c9a8304c447 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 1 Dec 2021 11:36:27 +0800 Subject: [PATCH 454/578] bugfix: rollback slower than commit, tpcc performance optimization (merge request !965) (cherry picked from commit cc4a42d3) 0fc32b96 bugfix: rollback slow than commit 2 b63fe1f1 bugfix: rollback slow than commit --- src/backend/pgxc/pool/execRemote.c | 9 ---------
src/backend/pgxc/pool/pgxcnode.c | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1d519162..45bac135 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7903,11 +7903,6 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) } } - -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif - /* * Now read and discard any data from the connections found "dirty" */ @@ -11765,10 +11760,6 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) { break; } - /* Sleep a while. */ -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif } } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 11932767..78f1a024 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2696,6 +2696,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) {// #lizard forgives bool is_ready= false; int read_result; + int wait_time = 1; if (handle == NULL) { @@ -2704,9 +2705,6 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) while(true) { -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif /* consume all data */ while (HAS_MESSAGE_BUFFERED(handle)) { @@ -2733,6 +2731,21 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) elog(LOG, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } + + if (PGXC_CANCEL_DELAY > 0) + { + elog(DEBUG5, "pgxc_node_flush_read sleep %dus", wait_time); + pg_usleep(wait_time); + + if (wait_time < PGXC_CANCEL_DELAY) + { + wait_time *= 2; + } + if (wait_time > PGXC_CANCEL_DELAY) + { + wait_time = PGXC_CANCEL_DELAY; + } + } } } From 2415002449689880f4b10aaf176d7f487eddde57 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 21 Dec 2021 17:40:02 +0800 Subject: [PATCH 455/578] fix: compile error --- src/backend/access/transam/xact.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 33cb58c9..6c393b81 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,7 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(WARNING, "clear g_twophase_state when start transaction") + elog(WARNING, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From a15f23bba9cc18ec802db8bbd45414b0d71e75ae Mon Sep 17 00:00:00 2001 From: guanhuawang Date: Fri, 17 Dec 2021 15:25:09 +0800 Subject: [PATCH 456/578] Fix coredump caused by int128 instructions http://tapd.oa.com/20421696/prong/stories/view/1020421696870813487 --- config/c-compiler.m4 | 41 +++++++++++++++++++++++--- configure | 69 ++++++++++++++++++++++++++++++++++++++++---- src/include/c.h | 1 - 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8d9844ab..cb35429c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -108,29 +108,62 @@ AC_DEFUN([PGAC_TYPE_128BIT_INT], [AC_CACHE_CHECK([for __int128], [pgac_cv__128bit_int], [AC_LINK_IFELSE([AC_LANG_PROGRAM([ /* + * We don't actually run this test, just link it to verify that any support + * functions needed for __int128 are present. + * * These are globals to discourage the compiler from folding all the * arithmetic tests down to compile-time constants. 
We do not have * convenient support for 64bit literals at this point... + * convenient support for 128bit literals at this point... */ __int128 a = 48828125; -__int128 b = 97656255; +__int128 b = 97656250; ],[ __int128 c,d; a = (a << 12) + 1; /* 200000000001 */ b = (b << 12) + 5; /* 400000000005 */ -/* use the most relevant arithmetic ops */ +/* try the most relevant arithmetic ops */ c = a * b; d = (c + b) / b; -/* return different values, to prevent optimizations */ +/* must use the results, else compiler may optimize arithmetic away */ if (d != a+1) - return 0; return 1; ])], [pgac_cv__128bit_int=yes], [pgac_cv__128bit_int=no])]) if test x"$pgac_cv__128bit_int" = xyes ; then + # Use of non-default alignment with __int128 tickles bugs in some compilers. + # If not cross-compiling, we can test for bugs and disable use of __int128 + # with buggy compilers. If cross-compiling, hope for the best. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83925 + AC_CACHE_CHECK([for __int128 alignment bug], [pgac_cv__128bit_int_bug], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([ +/* This must match the corresponding code in c.h: */ +#if defined(__GNUC__) || defined(__SUNPRO_C) || defined(__IBMC__) +#define pg_attribute_aligned(a) __attribute__((aligned(a))) +#endif +typedef __int128 int128a +#if defined(pg_attribute_aligned) +pg_attribute_aligned(8) +#endif +; +int128a holder; +void pass_by_val(void *buffer, int128a par) { holder = par; } +],[ +long int i64 = 97656225L << 12; +int128a q; +pass_by_val(main, (int128a) i64); +q = (int128a) i64; +if (q != holder) + return 1; +])], + [pgac_cv__128bit_int_bug=ok], + [pgac_cv__128bit_int_bug=broken], + [pgac_cv__128bit_int_bug="assuming ok"])]) + if test x"$pgac_cv__128bit_int_bug" != xbroken ; then AC_DEFINE(PG_INT128_TYPE, __int128, [Define to the name of a signed 128-bit integer type.]) AC_CHECK_ALIGNOF(PG_INT128_TYPE) + fi fi])# PGAC_TYPE_128BIT_INT diff --git a/configure b/configure index 26843895..9b92963c 100755 --- a/configure +++ b/configure @@ -15058,12 +15058,15 @@ else /* end confdefs.h. */ /* + * We don't actually run this test, just link it to verify that any support + * functions needed for __int128 are present. + * * These are globals to discourage the compiler from folding all the * arithmetic tests down to compile-time constants. We do not have - * convenient support for 64bit literals at this point... + + * convenient support for 128bit literals at this point... */ __int128 a = 48828125; -__int128 b = 97656255; +__int128 b = 97656250; int main () @@ -15072,12 +15075,11 @@ main () __int128 c,d; a = (a << 12) + 1; /* 200000000001 */ b = (b << 12) + 5; /* 400000000005 */ -/* use the most relevant arithmetic ops */ +/* try the most relevant arithmetic ops */ c = a * b; d = (c + b) / b; -/* return different values, to prevent optimizations */ +/* must use the results, else compiler may optimize arithmetic away */ if (d != a+1) - return 0; return 1; ; @@ -15095,6 +15097,61 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__128bit_int" >&5 $as_echo "$pgac_cv__128bit_int" >&6; } if test x"$pgac_cv__128bit_int" = xyes ; then + # Use of non-default alignment with __int128 tickles bugs in some compilers. + # If not cross-compiling, we can test for bugs and disable use of __int128 + # with buggy compilers. If cross-compiling, hope for the best. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83925 + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __int128 alignment bug" >&5 +$as_echo_n "checking for __int128 alignment bug... 
" >&6; } +if ${pgac_cv__128bit_int_bug+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + pgac_cv__128bit_int_bug="assuming ok" +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* This must match the corresponding code in c.h: */ +#if defined(__GNUC__) || defined(__SUNPRO_C) || defined(__IBMC__) +#define pg_attribute_aligned(a) __attribute__((aligned(a))) +#endif +typedef __int128 int128a +#if defined(pg_attribute_aligned) +pg_attribute_aligned(8) +#endif +; +int128a holder; +void pass_by_val(void *buffer, int128a par) { holder = par; } + +int +main () +{ + +long int i64 = 97656225L << 12; +int128a q; +pass_by_val(main, (int128a) i64); +q = (int128a) i64; +if (q != holder) + return 1; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + pgac_cv__128bit_int_bug=ok +else + pgac_cv__128bit_int_bug=broken +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__128bit_int_bug" >&5 +$as_echo "$pgac_cv__128bit_int_bug" >&6; } + if test x"$pgac_cv__128bit_int_bug" != xbroken ; then $as_echo "#define PG_INT128_TYPE __int128" >>confdefs.h @@ -15132,7 +15189,7 @@ cat >>confdefs.h <<_ACEOF #define ALIGNOF_PG_INT128_TYPE $ac_cv_alignof_PG_INT128_TYPE _ACEOF - + fi fi # Check for various atomic operations now that we have checked how to declare diff --git a/src/include/c.h b/src/include/c.h index f2c1d8c2..7a6ab8e2 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -377,7 +377,6 @@ typedef unsigned long long int uint64; /* * 128-bit signed and unsigned integers - * There currently is only limited support for such types. * E.g. 128bit literals and snprintf are not supported; but math is. * Also, because we exclude such types when choosing MAXIMUM_ALIGNOF, From 25d5f199de8673417735488aa1d720c1b77ef664 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 10 Dec 2021 14:48:38 +0800 Subject: [PATCH 457/578] fix deadlock by pg_blocking_pids http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093597037 (merge request !1011) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.4' fix deadlock by pg_blocking_pids http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093597037 TAPD: --bug=093597037 --- src/backend/storage/lmgr/lock.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 815124f6..f1570bb3 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3784,6 +3784,13 @@ GetBlockerStatusData(int blocked_pid) data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); /* + * Acquire lock on the entire shared lock data structure. See notes + * in GetLockStatusData(). + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* * In order to search the ProcArray for blocked_pid and assume that that * entry won't immediately disappear under us, we must hold ProcArrayLock. * In addition, to examine the lock grouping fields of any other backend, @@ -3801,13 +3808,6 @@ GetBlockerStatusData(int blocked_pid) /* Nothing to do if it's gone */ if (proc != NULL) { - /* - * Acquire lock on the entire shared lock data structure. See notes - * in GetLockStatusData(). 
- */ - for (i = 0; i < NUM_LOCK_PARTITIONS; i++) - LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); - if (proc->lockGroupLeader == NULL) { /* Easy case, proc is not a lock group member */ @@ -3827,17 +3827,17 @@ GetBlockerStatusData(int blocked_pid) } } + Assert(data->nprocs <= data->maxprocs); + } + + LWLockRelease(ProcArrayLock); + /* * And release locks. See notes in GetLockStatusData(). */ for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(LockHashPartitionLockByIndex(i)); - Assert(data->nprocs <= data->maxprocs); - } - - LWLockRelease(ProcArrayLock); - return data; } From 8add4a85dd3acefafac925071a3197a2deae98b5 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 22 Dec 2021 20:10:23 +0800 Subject: [PATCH 458/578] fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 (merge request !1039) Squash merge branch 'sigmalin_v5' into 'Tbase_v5.06.2' fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 TAPD: --bug=095449735 (cherry picked from commit a0c794fe) 53eb92ee fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 --- src/backend/commands/explain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d08ccaa8..3ae93589 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -4063,7 +4063,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp step->exec_type = EXEC_ON_DATANODES; dummy = makeVar(1, 1, TEXTOID, -1, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, makeTargetEntry((Expr *) dummy, 1, "QUERY PLAN", false)); estate = planstate->state; From 7ac5e8988e9967486cb9484f1ebe7b142b3f77db Mon Sep 17 00:00:00 2001 From: jadenchi Date: Thu, 23 Dec 2021 19:52:59 +0800 Subject: [PATCH 459/578] fix drop database failed caused by internal connection remaining. fix http://tapd.oa.com/TencentDB_for_TBase/prong/stories/view/1020418349870883157 --- src/backend/pgxc/locator/locator.c | 19 +++++++++++++++++++ src/backend/pgxc/pool/poolutils.c | 4 ++-- src/backend/tcop/utility.c | 7 +++++++ src/include/pgxc/locator.h | 1 + 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 4685ea1e..1fa85051 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -715,6 +715,25 @@ GetAllCoordNodes(void) return nodeList; } +/* + * Return a list of all Coordinators. + * Including local Coordinator. + * This is used to clean up pooler connections. 
+ */ +List * +GetEntireCoordNodes(void) +{ + int i; + List *nodeList = NIL; + + for (i = 0; i < NumCoords; i++) + { + nodeList = lappend_int(nodeList, i); + } + + return nodeList; +} + static bool DatanodeInGroup(oidvector* nodeoids, Oid nodeoid) { diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index e69fae07..ab95f7e2 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -354,7 +354,7 @@ CleanConnection(CleanConnStmt *stmt) dn_list = stmt_nodes; else { - co_list = GetAllCoordNodes(); + co_list = GetEntireCoordNodes(); dn_list = GetAllDataNodes(); } @@ -388,7 +388,7 @@ CleanConnection(CleanConnStmt *stmt) void DropDBCleanConnection(char *dbname) { - List *co_list = GetAllCoordNodes(); + List *co_list = GetEntireCoordNodes(); List *dn_list = GetAllDataNodes(); /* Check permissions for this database */ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index c90f1759..d58ae205 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2455,6 +2455,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, case T_DropdbStmt: { char prepareQuery[STRINGLENGTH]; + char query[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; if (!stmt->prepare) { @@ -2478,10 +2479,16 @@ standard_ProcessUtility(PlannedStmt *pstmt, if (OidIsValid(db_oid)) { + snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + quote_identifier(stmt->dbname)); + snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", quote_identifier(stmt->dbname)); if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + SendLeaderCNUtility(query, false); SendLeaderCNUtility(prepareQuery, false); + } else dropdb_prepare(stmt->dbname, false); ExecUtilityStmtOnNodes(parsetree, prepareQuery, diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 30926928..d5c9d543 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -225,6 +225,7 @@ extern ExecNodes *GetRelationNodesByQuals(Oid reloid, extern bool IsTypeHashDistributable(Oid col_type); extern List *GetAllDataNodes(void); extern List *GetAllCoordNodes(void); +extern List *GetEntireCoordNodes(void); extern int GetAnyDataNode(Bitmapset *nodes); extern void RelationBuildLocator(Relation rel); extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo); From cdf1442996a134486d1149ef952b3bcf266b20fb Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 24 Dec 2021 12:29:46 +0800 Subject: [PATCH 460/578] Prevent SQL injection in pg_stat_cluster_activity extension --- .../pg_stat_cluster_activity.c | 63 +++++++++++++------ 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 1bc9f489..74af0249 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -154,6 +154,41 @@ PG_FUNCTION_INFO_V1(pg_signal_session); PG_FUNCTION_INFO_V1(pg_terminate_session); PG_FUNCTION_INFO_V1(pg_cancel_session); + +static ParamListInfo +EvaluateSessionIDParam(const char *sessionid) +{ + int num_params = 1; + ParamListInfo paramLI = (ParamListInfo) + palloc0(offsetof(ParamListInfoData, params) + + num_params * sizeof(ParamExternData)); + + ParamExternData *prm; + + /* we have static list of params, so no hooks needed */ + paramLI->paramFetch = NULL; + paramLI->paramFetchArg = NULL; + paramLI->parserSetup = NULL; + 
paramLI->parserSetupArg = NULL; + paramLI->numParams = num_params; + paramLI->paramMask = NULL; + + prm = ¶mLI->params[0]; + prm->ptype = TEXTOID; + prm->pflags = PARAM_FLAG_CONST; + if (sessionid != NULL) + { + prm->value = CStringGetTextDatum(sessionid); + prm->isnull = false; + } + else + { + prm->isnull = true; + } + + return paramLI; +} + /* * walk through planstate tree and gets cursors it contains in * RemoteSubplan node, formed as a single string delimited each @@ -529,7 +564,7 @@ pgstat_fetch_stat_local_csentry(int beid) * ---------- */ static void -pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore) +pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore, TupleDesc tupdesc) { #define QUERY_LEN 1024 char query[QUERY_LEN]; @@ -545,10 +580,7 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta * Here we call pg_stat_get_cluster_activity in remote with args: * coordonly = false, localonly = true, to prevent recursive calls in remote nodes. */ - if (sessionid == NULL) - snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity(NULL, false, true)"); - else - snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity('%s', false, true)", sessionid); + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity($1, false, true)"); plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; @@ -569,22 +601,13 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta plan->exec_type = EXEC_ON_COORDS; } - /* - * We only need the target entry to determine result data type. - * So create dummy even if real expression is a function. - */ - for (i = 1; i <= PG_STAT_GET_ClUSTER_ACTIVITY_COLS; i++) - { - dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, - makeTargetEntry((Expr *) dummy, i, NULL, false)); - } - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); estate->es_snapshot = GetActiveSnapshot(); + estate->es_param_list_info = EvaluateSessionIDParam(sessionid); pstate = ExecInitRemoteQuery(plan, estate, 0); + ExecAssignResultType((PlanState *) pstate, tupdesc); MemoryContextSwitchTo(oldcontext); result = ExecRemoteQuery((PlanState *) pstate); @@ -598,7 +621,7 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta } ExecEndRemoteQuery(pstate); - return; + FreeExecutorState(estate); } /* ---------- @@ -660,7 +683,7 @@ pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) /* dispatch query to remote if needed */ if (!localonly && IS_PGXC_COORDINATOR) - pg_stat_get_remote_activity(sessionid, coordonly, tupstore); + pg_stat_get_remote_activity(sessionid, coordonly, tupstore, tupdesc); /* 1-based index */ for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) @@ -948,7 +971,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) Var *dummy; TupleTableSlot *result = NULL; - snprintf(query, QUERY_LEN, "select pg_signal_session('%s', %d, true)", sessionid, signal); + snprintf(query, QUERY_LEN, "select pg_signal_session($1, %d, true)", signal); plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; @@ -973,6 +996,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); estate->es_snapshot = GetActiveSnapshot(); + 
estate->es_param_list_info = EvaluateSessionIDParam(sessionid); pstate = ExecInitRemoteQuery(plan, estate, 0); MemoryContextSwitchTo(oldcontext); @@ -984,6 +1008,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) return false; } + FreeExecutorState(estate); return true; } From ce845350f1e3e0ac0ce0f654d4521e0be9bccee0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 14 Oct 2021 11:58:06 +0800 Subject: [PATCH 461/578] fix deadlock between pgxc_connections_cleanup and SharedQueueFinish http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093155193 (merge request !800) --- src/backend/pgxc/squeue/squeue.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index a145edc5..5744e22e 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -2868,6 +2868,14 @@ SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, //LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } + /* + * Check sq_error status to avoid endless loop here + */ + if (squeue->sq_error) + { + elog(ERROR, "SharedQueueFinish: shared_queue %s error because of query-cancel.", squeue->sq_key); + } + if (unfinish_tuplestore) { pg_usleep(1000L); From 891fedf6bcdf70c26bdce4269829f79124bc099b Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 26 May 2021 14:50:16 +0800 Subject: [PATCH 462/578] fix getmissingattr core after ALTER TABLE partabc ALTER COLUMN name DROP DEFAULT http://tapd.oa.com/10092131/bugtrace/bugs/view/1010092131087999713 --- src/backend/utils/cache/relcache.c | 2 +- src/test/regress/expected/fast_default.out | 64 +++++++++++----------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 55943dff..23a41d00 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -637,6 +637,7 @@ RelationBuildTupleDesc(Relation relation) attrdef[ndef].adnum = attnum; attrdef[ndef].adbin = NULL; ndef++; + } #ifdef _MLS_ /* Likewise for a missing value */ @@ -693,7 +694,6 @@ RelationBuildTupleDesc(Relation relation) } } #endif - } need--; if (need == 0) break; diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index d390a452..bf00c540 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -131,28 +131,28 @@ SELECT pk, c_int, c_bpchar, c_text, c_date, c_timestamp, FROM T ORDER BY pk; pk | c_int | c_bpchar | c_text | c_date | c_timestamp | c_timestamp_null | c_array | c_small | c_small_null | c_big | c_num | c_time | c_interval | c_hugetext_origdef | c_hugetext_newdef ----+-------+----------+--------+------------+--------------------------+--------------------------+--------------------------+---------+--------------+-------------------+-------------------+----------+------------+--------------------+------------------- - 1 | 1 | | | | | | | | | | | | | | - 2 | 1 | | | | | | | | | | | | | | - 3 | 2 | hello | | | | | | | | | | | | | - 4 | 2 | hello | | | | | | | | | | | | | - 5 | 2 | dog | world | | | | | | | | | | | | - 6 | 2 | dog | world | | | | | | | | | | | | - 7 | 2 | dog | cat | 06-02-2016 | | | | | | | | | | | - 8 | 2 | dog | cat | 06-02-2016 | | | | | | | | | | | - 9 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | | | | | | | | | - 10 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | | | | | | | | | - 11 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | 
Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | | | | | | | | - 12 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | | | | | | | | - 13 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | | | | | | - 14 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | | | | | | - 15 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | | | | | - 16 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | | | | | - 17 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | | | | - 18 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | | | | - 19 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | | | - 20 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | | | - 21 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | | - 22 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | | + 1 | 1 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 2 | 1 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 3 | 2 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 4 | 2 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 5 | 2 | dog | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 6 | 2 | dog | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 7 | 2 | dog | cat | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 8 | 2 | dog | cat | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 9 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 10 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 11 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | -5 | | 
180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 12 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 13 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 14 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 15 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 16 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 17 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 18 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 19 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | @ 1 day | t | f + 20 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | @ 1 day | t | f + 21 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | t | f + 22 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | t | f 23 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 3 hours | t | f 24 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 3 hours | t | f 25 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | | f | t @@ -226,16 +226,16 @@ INSERT INTO T VALUES (15), (16); SELECT * FROM T order by 1; pk | c_int | c_bpchar | c_text | c_date | c_timestamp | c_array ----+-------+----------+--------------+------------+--------------------------+------------------------------- - 1 | 6 | | | | | - 2 | 6 | | | | | - 3 | 8 | abcd | | | | - 4 | 8 | abcd | | | | - 5 | 8 | abc | abcdef | | | - 6 | 8 | abc | abcdef | | | - 7 | 8 | abc | abcdefghijkl | 06-12-2016 | | - 8 | 8 | abc | abcdefghijkl | 06-12-2016 | | - 9 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | - 10 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | + 1 | 6 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 2 | 6 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 3 | 8 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | 
{This,is,abcd,the,real,world} + 4 | 8 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 5 | 8 | abc | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 6 | 8 | abc | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 7 | 8 | abc | abcdefghijkl | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 8 | 8 | abc | abcdefghijkl | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 9 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 10 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} 11 | 8 | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,abcd,the,real,world} 12 | 8 | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,abcd,the,real,world} 13 | | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,a,fantasy} From 192de069c410854af11f526bd6da263aaa0f3df4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 27 Dec 2021 16:44:20 +0800 Subject: [PATCH 463/578] Remove IS_PGXC_DATANODE constrains in epqcontext deparse of exec_bind_message tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131095603881 --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 2 -- src/backend/tcop/postgres.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 74af0249..625c4394 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -568,12 +568,10 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta { #define QUERY_LEN 1024 char query[QUERY_LEN]; - int i; EState *estate; MemoryContext oldcontext; RemoteQuery *plan; RemoteQueryState *pstate; - Var *dummy; TupleTableSlot *result = NULL; /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 5c80dfb5..735035de 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2800,7 +2800,7 @@ exec_bind_message(StringInfo input_message) } /* Get epq context, only datanodes need them */ - if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) + if (IsConnFromCoord() || IsConnFromDatanode()) { num_epq_tuple = pq_getmsgint(input_message, 2); if (num_epq_tuple > 0) From 31f0f877f657b29ff54d40737ac049ac5f1aa037 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 8 Dec 2021 15:44:07 +0800 Subject: [PATCH 464/578] fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 (merge request !1004) Squash merge branch 'sigmalin_v5' into 'Tbase_v5.06' fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 TAPD: --bug=094794629 (cherry picked from commit c3df25c4) 4064c9fe fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 --- src/backend/commands/explain.c | 2 +- src/backend/nodes/outfuncs.c | 6 ++++- src/backend/nodes/readfuncs.c | 8 +++++- src/backend/pgxc/pool/execRemote.c | 39 ++++++++++++++++-------------- src/backend/pgxc/squeue/squeue.c | 6 ++--- src/backend/tcop/pquery.c | 2 +- 
src/include/pgxc/execRemote.h | 2 +- src/include/pgxc/planner.h | 2 +- 8 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 3ae93589..5f0ec309 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1244,7 +1244,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (rsubplan->cursor) { if (rsubplan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", rsubplan->cursor, rsubplan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, rsubplan->cursor, rsubplan->unique); else strncpy(cursor, rsubplan->cursor, NAMEDATALEN); } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 86320cfb..f6b2295d 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -99,6 +99,10 @@ static void outChar(StringInfo str, char c); #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +/* Write an int64 field (anything written as ":fldname %d") */ +#define WRITE_INT64_FIELD(fldname) \ + appendStringInfo(str, " :" CppAsString(fldname) " "INT64_FORMAT, node->fldname) + #ifdef XCP /* Only allow output OIDs in not portable mode */ #define WRITE_OID_FIELD(fldname) \ @@ -1700,7 +1704,7 @@ _outRemoteSubplan(StringInfo str, const RemoteSubplan *node) WRITE_BOOL_FIELD(execOnAll); WRITE_NODE_FIELD(sort); WRITE_STRING_FIELD(cursor); - WRITE_INT_FIELD(unique); + WRITE_INT64_FIELD(unique); WRITE_BOOL_FIELD(parallelWorkerSendTuple); WRITE_BITMAPSET_FIELD(initPlanParams); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 33d51a1e..96f4ca05 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -116,6 +116,12 @@ set_portable_input(bool value) token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atoui(token) +/* Read an integer field (anything written as ":fldname %d") */ +#define READ_INT64_FIELD(fldname) \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get field value */ \ + local_node->fldname = atoll(token) + #ifdef XCP /* Read a long integer field (anything written as ":fldname %ld") */ #define READ_LONG_FIELD(fldname) \ @@ -3812,7 +3818,7 @@ _readRemoteSubplan(void) READ_BOOL_FIELD(execOnAll); READ_NODE_FIELD(sort); READ_STRING_FIELD(cursor); - READ_INT_FIELD(unique); + READ_INT64_FIELD(unique); READ_BOOL_FIELD(parallelWorkerSendTuple); READ_BITMAPSET_FIELD(initPlanParams); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 45bac135..4da43b60 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -9706,8 +9706,8 @@ ExecEndRemoteQuery(RemoteQueryState *node) * take writable copy of the plan tree. 
*/ void -RemoteSubplanMakeUnique(Node *plan, int unique) -{// #lizard forgives +RemoteSubplanMakeUnique(Node *plan, int unique, int pid) +{ if (plan == NULL) return; @@ -9716,7 +9716,7 @@ RemoteSubplanMakeUnique(Node *plan, int unique) ListCell *lc; foreach(lc, (List *) plan) { - RemoteSubplanMakeUnique(lfirst(lc), unique); + RemoteSubplanMakeUnique(lfirst(lc), unique, pid); } return; } @@ -9726,34 +9726,37 @@ RemoteSubplanMakeUnique(Node *plan, int unique) */ if (IsA(plan, RemoteSubplan)) { - int old = ((RemoteSubplan *)plan)->unique; - ((RemoteSubplan *)plan)->unique = old * MAX_NODES_NUMBER + unique; + /* + * add node information and pid to make it unique + */ + ((RemoteSubplan *)plan)->unique = ((int64)unique << 32) | pid; } + /* Otherwise it is a Plan descendant */ - RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique); - RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique, pid); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique, pid); /* Tranform special cases */ switch (nodeTag(plan)) { case T_Append: RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans, - unique); + unique, pid); break; case T_MergeAppend: RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans, - unique); + unique, pid); break; case T_BitmapAnd: RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans, - unique); + unique, pid); break; case T_BitmapOr: RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans, - unique); + unique, pid); break; case T_SubqueryScan: RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan, - unique); + unique, pid); break; default: break; @@ -10290,7 +10293,7 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) * traverse the subtree and change SharedQueue name to make it * unique. 
*/ - RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId - 1, MyProcPid); elog(DEBUG3, "RemoteSubplanMakeUnique for LOCATOR_TYPE_NONE unique: %d, cursor: %s", PGXCNodeId, node->cursor); } @@ -10577,7 +10580,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) Assert(plan->cursor); if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); @@ -10988,7 +10991,7 @@ ExecRemoteSubplan(PlanState *pstate) { fetch = PGXLRemoteFetchSize; if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11411,7 +11414,7 @@ ExecFinishRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11496,7 +11499,7 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11888,7 +11891,7 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 5744e22e..19387c66 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -1082,8 +1082,8 @@ SharedQueueBind(const char *sqname, List *consNodes, #endif Assert(consMap); - elog(DEBUG1, "Bind node %s to squeue of step %s as a producer", - PGXC_PARENT_NODE, sqname); + elog(DEBUG1, "Bind node %s to squeue of step %s as a producer, parentPGXCNode %s, parentPGXCPid %d", + PGXC_PARENT_NODE, sqname, parentPGXCNode, parentPGXCPid); /* Initialize the shared queue */ sq->sq_pid = MyProcPid; @@ -1328,7 +1328,7 @@ SharedQueueBind(const char *sqname, List *consNodes, elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d", sqname, sq->sq_nodeid, sq->sq_pid); - elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid); + elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d, parentPGXCNode %s, parentPGXCPid %d", PGXC_PARENT_NODE, sqname, sq->sq_pid, parentPGXCNode, parentPGXCPid); /* Sanity checks */ Assert(myindex); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 295dc2a2..bb11de0b 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -812,7 +812,7 @@ PortalStart(Portal portal, ParamListInfo params, */ RemoteSubplanMakeUnique( (Node *) queryDesc->plannedstmt->planTree, - PGXC_PARENT_NODE_ID); + PGXC_PARENT_NODE_ID, parentPGXCPid); elog(DEBUG3, "RemoteSubplanMakeUnique for PARAM_EXEC unique: %d, portal: %s", PGXC_PARENT_NODE_ID, portal->name); diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c76b946a..3ff1953b 100644 --- a/src/include/pgxc/execRemote.h +++ 
b/src/include/pgxc/execRemote.h @@ -367,7 +367,7 @@ extern RemoteQueryState *ExecInitRemoteQuery(RemoteQuery *node, EState *estate, extern TupleTableSlot* ExecRemoteQuery(PlanState *pstate); extern void ExecReScanRemoteQuery(RemoteQueryState *node); extern void ExecEndRemoteQuery(RemoteQueryState *step); -extern void RemoteSubplanMakeUnique(Node *plan, int unique); +extern void RemoteSubplanMakeUnique(Node *plan, int unique, int pid); extern RemoteSubplanState *ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags); extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node); extern TupleTableSlot* ExecRemoteSubplan(PlanState *pstate); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 9e16886d..2acef598 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -247,7 +247,7 @@ typedef struct bool execOnAll; SimpleSort *sort; char *cursor; - int unique; + int64 unique; #ifdef __TBASE__ /* * if gather is under remotesubplan, parallel worker can send tuples From 031a594970d2a515e4d83d80f6781e2edc415d4c Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 8 Dec 2021 22:32:24 +0800 Subject: [PATCH 465/578] fix: compile warning --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 625c4394..8518ae8a 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -204,7 +204,7 @@ cursorCollectWalker(PlanState *planstate, StringInfo str) { appendStringInfoString(str, plan->cursor); if (plan->unique) - appendStringInfo(str, "_%d", plan->unique); + appendStringInfo(str, "_"INT64_FORMAT, plan->unique); /* add a space as delimiter */ appendStringInfoString(str, " "); } From f8bcda3411bba47d58a210e5c6154ca74f19b102 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 28 Dec 2021 18:04:13 +0800 Subject: [PATCH 466/578] fix: change log level to LOG --- src/backend/access/transam/xact.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 6c393b81..c3f87e43 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,7 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(WARNING, "clear g_twophase_state when start transaction"); + elog(LOG, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From fcc044c55d3e9bc71c22503d1cf30916fe175dcc Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 28 Dec 2021 18:08:36 +0800 Subject: [PATCH 467/578] fix: remove log --- src/backend/access/transam/xact.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c3f87e43..88a9e05c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,6 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(LOG, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From 9b3f0cad1cdb00d0ed4d7d5e1cac27ddf06fc8ae Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Mon, 1 Nov 2021 14:59:52 +0800 Subject: [PATCH 468/578] fix: pgxc_abort_connections 
send too many sync msg --- src/backend/pgxc/pool/execRemote.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 4da43b60..b2d52c44 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11600,6 +11600,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) int ret = false; int i = 0; bool need_loop_check = false; + bool need_sync = true; if (all_handles) { @@ -11688,11 +11689,16 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections recheck node:%s not ready for query, status:%d, sync", handle->nodename, handle->state); - ret = pgxc_node_send_sync(handle); - if (!ret) + + if (need_sync) { - need_loop_check = true; + ret = pgxc_node_send_sync(handle); + if (ret != 0) + elog(WARNING, "pgxc_abort_connections failed to send sync to node %s", handle->nodename); } + + need_loop_check = true; + if (proc_exit_inprogress) { handle->state = DN_CONNECTION_STATE_IDLE; @@ -11729,11 +11735,16 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections recheck node:%s not ready for query, status:%d, sync", handle->nodename, handle->state); - ret = pgxc_node_send_sync(handle); - if (!ret) + + if (need_sync) { - need_loop_check = true; + ret = pgxc_node_send_sync(handle); + if (ret != 0) + elog(WARNING, "pgxc_abort_connections failed to send sync to node %s", handle->nodename); } + + need_loop_check = true; + if (proc_exit_inprogress) { handle->state = DN_CONNECTION_STATE_IDLE; @@ -11758,6 +11769,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) } } + need_sync = false; /* no need to recheck, break the loop. 
*/ if (!need_loop_check) { From 54e88388bbe904cb64c6e9e48bfb3d9cf4c89a3c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 16 Sep 2020 13:02:57 +0800 Subject: [PATCH 469/578] fix bug that the rollback does not take effect when the stored procedure is called --- src/backend/access/transam/xact.c | 8 +- src/backend/pgxc/pool/execRemote.c | 2 +- src/backend/tcop/postgres.c | 2 + src/backend/utils/error/elog.c | 2 +- src/include/pgxc/execRemote.h | 4 +- .../src/expected/plpgsql_transaction.out | 625 ++++++++++++++++++ src/pl/plpgsql/src/pl_exec.c | 4 +- .../plpgsql/src/sql/plpgsql_transaction.sql | 537 +++++++++++++++ 8 files changed, 1175 insertions(+), 9 deletions(-) create mode 100644 src/pl/plpgsql/src/expected/plpgsql_transaction.out create mode 100644 src/pl/plpgsql/src/sql/plpgsql_transaction.sql diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 88a9e05c..8efd0ff2 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -171,7 +171,7 @@ bool g_allow_force_ddl = false; #endif #ifdef __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; extern bool PlpgsqlDebugPrint; #endif @@ -4184,7 +4184,7 @@ AbortTransaction(void) #endif #ifdef __TBASE__ - SetExitPlpgsqlFunc(); + //SetExitPlpgsqlFunc(); SetExitCreateExtension(); SetCurrentHandlesReadonly(); AtEOXact_Global(); @@ -8248,12 +8248,12 @@ void SetTopXactNeedBeginTxn(void) void SetEnterPlpgsqlFunc(void) { - g_in_plpgsql_exec_fun = true; + g_in_plpgsql_exec_fun = g_in_plpgsql_exec_fun + 1; } void SetExitPlpgsqlFunc(void) { - g_in_plpgsql_exec_fun = false; + g_in_plpgsql_exec_fun = g_in_plpgsql_exec_fun - 1; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index b2d52c44..b9f9c4a2 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -82,7 +82,7 @@ int PGXLRemoteFetchSize; #ifdef __TBASE__ -bool g_in_plpgsql_exec_fun = false; +int g_in_plpgsql_exec_fun = 0; bool PlpgsqlDebugPrint = false; bool need_global_snapshot = false; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 735035de..7735fd18 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5248,6 +5248,8 @@ PostgresMain(int argc, char *argv[], end_query_requested = false; Executor_done = false; + g_in_plpgsql_exec_fun = 0; + ClearQueryAnalyzeInfo(); #endif diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index ca77995a..c03635cb 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -112,7 +112,7 @@ sigjmp_buf *PG_exception_stack = NULL; extern bool redirection_done; #ifdef __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; #endif #ifdef __TBASE__ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 3ff1953b..7047d510 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -338,8 +338,8 @@ typedef enum extern int PGXLRemoteFetchSize; -#if __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +#ifdef __TBASE__ +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; #endif diff --git a/src/pl/plpgsql/src/expected/plpgsql_transaction.out b/src/pl/plpgsql/src/expected/plpgsql_transaction.out new file mode 100644 index 00000000..d5fecb16 --- /dev/null +++ b/src/pl/plpgsql/src/expected/plpgsql_transaction.out @@ -0,0 +1,625 @@ +CREATE TABLE test1 (a int, b text); +CREATE PROCEDURE 
transaction_test1(x int, y text) +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..x LOOP + INSERT INTO test1 (a, b) VALUES (i, y); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; +CALL transaction_test1(9, 'foo'); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | foo + 2 | foo + 4 | foo + 6 | foo + 8 | foo +(5 rows) + +TRUNCATE test1; +DO +LANGUAGE plpgsql +$$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 2 | + 4 | + 6 | + 8 | +(5 rows) + +-- transaction commands not allowed when called in transaction block +START TRANSACTION; +CALL transaction_test1(9, 'error'); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +COMMIT; +START TRANSACTION; +DO LANGUAGE plpgsql $$ BEGIN COMMIT; END $$; +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function inline_code_block line 1 at COMMIT +COMMIT; +TRUNCATE test1; +-- not allowed in a function +CREATE FUNCTION transaction_test2() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; + RETURN 1; +END +$$; +SELECT transaction_test2(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test2() line 6 at COMMIT +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- also not allowed if procedure is called from a function +CREATE FUNCTION transaction_test3() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, 'error'); + RETURN 1; +END; +$$; +SELECT transaction_test3(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +SQL statement "CALL transaction_test1(9, 'error')" +PL/pgSQL function transaction_test3() line 3 at CALL +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- DO block inside function +CREATE FUNCTION transaction_test4() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$'; + RETURN 1; +END; +$$; +SELECT transaction_test4(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function inline_code_block line 1 at COMMIT +SQL statement "DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$" +PL/pgSQL function transaction_test4() line 3 at EXECUTE +-- proconfig settings currently disallow transaction statements +CREATE PROCEDURE transaction_test5() +LANGUAGE plpgsql +SET work_mem = 555 +AS $$ +BEGIN + COMMIT; +END; +$$; +CALL transaction_test5(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test5() line 3 at COMMIT +-- SECURITY DEFINER currently disallow transaction statements +CREATE PROCEDURE transaction_test5b() +LANGUAGE plpgsql +SECURITY DEFINER +AS $$ +BEGIN + COMMIT; +END; +$$; +CALL transaction_test5b(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test5b() line 3 at COMMIT +TRUNCATE test1; +-- nested procedure calls +CREATE PROCEDURE transaction_test6(c text) +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, c); +END; +$$; +CALL transaction_test6('bar'); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | bar + 2 | bar + 4 | bar + 6 | bar + 8 | bar +(5 rows) + +TRUNCATE test1; +CREATE PROCEDURE transaction_test7() +LANGUAGE plpgsql +AS $$ +BEGIN + DO 'BEGIN CALL transaction_test1(9, $x$baz$x$); END;'; +END; 
+$$; +CALL transaction_test7(); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | baz + 2 | baz + 4 | baz + 6 | baz + 8 | baz +(5 rows) + +CREATE PROCEDURE transaction_test8() +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'CALL transaction_test1(10, $x$baz$x$)'; +END; +$$; +CALL transaction_test8(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +SQL statement "CALL transaction_test1(10, $x$baz$x$)" +PL/pgSQL function transaction_test8() line 3 at EXECUTE +-- commit inside cursor loop +CREATE TABLE test2 (x int); +INSERT INTO test2 VALUES (0), (1), (2), (3), (4); +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + COMMIT; + END LOOP; +END; +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 1 | + 2 | + 3 | + 4 | +(5 rows) + +-- check that this doesn't leak a holdable portal +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- error in cursor loop with commit +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (12/(r.x-2)); + COMMIT; + END LOOP; +END; +$$; +ERROR: division by zero +CONTEXT: SQL statement "INSERT INTO test1 (a) VALUES (12/(r.x-2))" +PL/pgSQL function inline_code_block line 6 at SQL statement +SELECT * FROM test1; + a | b +-----+--- + -6 | + -12 | +(2 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- first commit then rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + IF r.x % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END; +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 2 | + 4 | +(3 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN UPDATE test2 SET x = x * 2 RETURNING x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; +ERROR: Distributed column or partition column "x" can't be updated in current version +CONTEXT: SQL statement "UPDATE test2 SET x = x * 2 RETURNING x" +PL/pgSQL function inline_code_block line 5 at FOR over SELECT rows +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +SELECT * FROM test2 ORDER BY x; + x +--- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time 
+------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- commit inside block with exception handler +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + COMMIT; + INSERT INTO test1 (a) VALUES (1/0); + COMMIT; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; +ERROR: cannot commit while a subtransaction is active +CONTEXT: PL/pgSQL function inline_code_block line 5 at COMMIT +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- rollback inside block with exception handler +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + ROLLBACK; + INSERT INTO test1 (a) VALUES (1/0); + ROLLBACK; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; +ERROR: cannot roll back while a subtransaction is active +CONTEXT: PL/pgSQL function inline_code_block line 5 at ROLLBACK +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- COMMIT failures +DO LANGUAGE plpgsql $$ +BEGIN + CREATE TABLE test3 (y int UNIQUE DEFERRABLE INITIALLY DEFERRED); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + INSERT INTO test3 (y) VALUES (2); + COMMIT; + INSERT INTO test3 (y) VALUES (3); -- won't get here +END; +$$; +ERROR: duplicate key value violates unique constraint "test3_y_key" +DETAIL: Key (y)=(1) already exists. +CONTEXT: PL/pgSQL function inline_code_block line 9 at COMMIT +SELECT * FROM test3; + y +--- + 1 +(1 row) + +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_commit(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_commit() line 6 at COMMIT +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_rollback(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_rollback() line 6 at ROLLBACK +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + +-- SET TRANSACTION +DO LANGUAGE plpgsql $$ +BEGIN + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + RESET TRANSACTION ISOLATION LEVEL; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; +END; +$$; +INFO: read committed +INFO: repeatable read +INFO: read committed +-- error cases +DO LANGUAGE plpgsql $$ +BEGIN + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +END; +$$; +ERROR: SET TRANSACTION ISOLATION LEVEL must be called before any query +CONTEXT: SQL statement "SET TRANSACTION ISOLATION LEVEL REPEATABLE READ" +PL/pgSQL function inline_code_block line 3 at SET +DO LANGUAGE plpgsql $$ +BEGIN + SAVEPOINT foo; +END; +$$; +ERROR: unsupported transaction command in PL/pgSQL 
+CONTEXT: PL/pgSQL function inline_code_block line 3 at SQL statement +DO LANGUAGE plpgsql $$ +BEGIN + EXECUTE 'COMMIT'; +END; +$$; +ERROR: EXECUTE of transaction commands is not implemented +CONTEXT: PL/pgSQL function inline_code_block line 3 at EXECUTE +-- snapshot handling test +TRUNCATE test2; +CREATE PROCEDURE transaction_test9() +LANGUAGE SQL +AS $$ +INSERT INTO test2 VALUES (42); +$$; +DO LANGUAGE plpgsql $$ +BEGIN + ROLLBACK; + CALL transaction_test9(); +END +$$; +SELECT * FROM test2; + x +---- + 42 +(1 row) + +-- Test transaction in procedure with output parameters. This uses a +-- different portal strategy and different code paths in pquery.c. +CREATE PROCEDURE transaction_test10a(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x + 1; + COMMIT; +END; +$$; +CALL transaction_test10a(10); + x +---- + 11 +(1 row) + +CREATE PROCEDURE transaction_test10b(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x - 1; + ROLLBACK; +END; +$$; +CALL transaction_test10b(10); + x +--- + 9 +(1 row) + +-- transaction timestamp vs. statement timestamp +CREATE PROCEDURE transaction_test11() +LANGUAGE plpgsql +AS $$ +DECLARE + s1 timestamp with time zone; + s2 timestamp with time zone; + s3 timestamp with time zone; + t1 timestamp with time zone; + t2 timestamp with time zone; + t3 timestamp with time zone; +BEGIN + s1 := statement_timestamp(); + t1 := transaction_timestamp(); + ASSERT s1 = t1; + PERFORM pg_sleep(0.001); + COMMIT; + s2 := statement_timestamp(); + t2 := transaction_timestamp(); + ASSERT s2 = s1; + ASSERT t2 > t1; + PERFORM pg_sleep(0.001); + ROLLBACK; + s3 := statement_timestamp(); + t3 := transaction_timestamp(); + ASSERT s3 = s1; + ASSERT t3 > t2; +END; +$$; +CALL transaction_test11(); +create table test(id int); +create procedure transaction_test12() as $$ +begin + insert into test values(1); + commit; +end; +$$ language plpgsql; +create procedure transaction_test13() as $$ +begin + insert into test values(100); + rollback; +end; +$$ language plpgsql; +create procedure transaction_test14() as $$ +begin + call transaction_test12(); + insert into test values(100); + rollback; +end; +$$ language plpgsql; +do $$ +begin + call transaction_test12(); + call transaction_test13(); + insert into test values(2); + rollback; + insert into test values(3); + commit; +end; +$$ language plpgsql; +select * from test order by 1; + id +---- + 1 + 3 +(2 rows) + +delete from test; +do $$ +begin + call transaction_test14(); + insert into test values(200); + rollback; +end; +$$ language plpgsql; +select * from test order by 1; + id +---- + 1 +(1 row) + +DROP TABLE test1; +DROP TABLE test2; +DROP TABLE test3; +DROP TABLE test; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 78331d9d..dd6cffb0 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -122,7 +122,7 @@ static SimpleEcontextStackEntry *simple_econtext_stack = NULL; MemoryContextAllocZero(get_eval_mcontext(estate), sz) #ifdef __TBASE__ -extern bool PGDLLIMPORT g_in_plpgsql_exec_fun; +extern int PGDLLIMPORT g_in_plpgsql_exec_fun; extern bool PGDLLIMPORT PlpgsqlDebugPrint; #endif @@ -1472,6 +1472,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); MemoryContextReset(stmt_mcontext); + + SetExitPlpgsqlFunc(); } PG_END_TRY(); diff --git a/src/pl/plpgsql/src/sql/plpgsql_transaction.sql b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql new file mode 100644 index 00000000..827e0eba --- 
/dev/null +++ b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql @@ -0,0 +1,537 @@ +CREATE TABLE test1 (a int, b text); + + +CREATE PROCEDURE transaction_test1(x int, y text) +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..x LOOP + INSERT INTO test1 (a, b) VALUES (i, y); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; + +CALL transaction_test1(9, 'foo'); + +SELECT * FROM test1 ORDER BY a; + + +TRUNCATE test1; + +DO +LANGUAGE plpgsql +$$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; + +SELECT * FROM test1 ORDER BY a; + + +-- transaction commands not allowed when called in transaction block +START TRANSACTION; +CALL transaction_test1(9, 'error'); +COMMIT; + +START TRANSACTION; +DO LANGUAGE plpgsql $$ BEGIN COMMIT; END $$; +COMMIT; + + +TRUNCATE test1; + +-- not allowed in a function +CREATE FUNCTION transaction_test2() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; + RETURN 1; +END +$$; + +SELECT transaction_test2(); + +SELECT * FROM test1; + + +-- also not allowed if procedure is called from a function +CREATE FUNCTION transaction_test3() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, 'error'); + RETURN 1; +END; +$$; + +SELECT transaction_test3(); + +SELECT * FROM test1; + + +-- DO block inside function +CREATE FUNCTION transaction_test4() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$'; + RETURN 1; +END; +$$; + +SELECT transaction_test4(); + + +-- proconfig settings currently disallow transaction statements +CREATE PROCEDURE transaction_test5() +LANGUAGE plpgsql +SET work_mem = 555 +AS $$ +BEGIN + COMMIT; +END; +$$; + +CALL transaction_test5(); + + +-- SECURITY DEFINER currently disallow transaction statements +CREATE PROCEDURE transaction_test5b() +LANGUAGE plpgsql +SECURITY DEFINER +AS $$ +BEGIN + COMMIT; +END; +$$; + +CALL transaction_test5b(); + + +TRUNCATE test1; + +-- nested procedure calls +CREATE PROCEDURE transaction_test6(c text) +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, c); +END; +$$; + +CALL transaction_test6('bar'); + +SELECT * FROM test1 ORDER BY a; + +TRUNCATE test1; + +CREATE PROCEDURE transaction_test7() +LANGUAGE plpgsql +AS $$ +BEGIN + DO 'BEGIN CALL transaction_test1(9, $x$baz$x$); END;'; +END; +$$; + +CALL transaction_test7(); + +SELECT * FROM test1 ORDER BY a; + +CREATE PROCEDURE transaction_test8() +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'CALL transaction_test1(10, $x$baz$x$)'; +END; +$$; + +CALL transaction_test8(); + + +-- commit inside cursor loop +CREATE TABLE test2 (x int); +INSERT INTO test2 VALUES (0), (1), (2), (3), (4); + +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + COMMIT; + END LOOP; +END; +$$; + +SELECT * FROM test1 ORDER BY a; + +-- check that this doesn't leak a holdable portal +SELECT * FROM pg_cursors; + + +-- error in cursor loop with commit +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (12/(r.x-2)); + COMMIT; + END LOOP; +END; +$$; + +SELECT * FROM test1; + +SELECT * FROM pg_cursors; + + +-- rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + 
FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; + +SELECT * FROM test1; + +SELECT * FROM pg_cursors; + + +-- first commit then rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + IF r.x % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END; +$$; + +SELECT * FROM test1 ORDER BY a; + +SELECT * FROM pg_cursors; + + +-- rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN UPDATE test2 SET x = x * 2 RETURNING x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; + +SELECT * FROM test1; +SELECT * FROM test2 ORDER BY x; + +SELECT * FROM pg_cursors; + + +-- commit inside block with exception handler +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + COMMIT; + INSERT INTO test1 (a) VALUES (1/0); + COMMIT; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; + +SELECT * FROM test1; + + +-- rollback inside block with exception handler +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + ROLLBACK; + INSERT INTO test1 (a) VALUES (1/0); + ROLLBACK; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; + +SELECT * FROM test1; + + +-- COMMIT failures +DO LANGUAGE plpgsql $$ +BEGIN + CREATE TABLE test3 (y int UNIQUE DEFERRABLE INITIALLY DEFERRED); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + INSERT INTO test3 (y) VALUES (2); + COMMIT; + INSERT INTO test3 (y) VALUES (3); -- won't get here +END; +$$; + +SELECT * FROM test3; + +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_commit(); + +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_rollback(); + +SELECT count(*) FROM test1; + + +-- SET TRANSACTION +DO LANGUAGE plpgsql $$ +BEGIN + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + RESET TRANSACTION ISOLATION LEVEL; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; +END; +$$; + +-- error cases +DO LANGUAGE plpgsql $$ +BEGIN + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +END; +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + SAVEPOINT foo; +END; +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + EXECUTE 'COMMIT'; +END; +$$; + + +-- snapshot handling test +TRUNCATE test2; + +CREATE PROCEDURE transaction_test9() +LANGUAGE SQL +AS $$ +INSERT INTO test2 VALUES (42); +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + ROLLBACK; + CALL 
transaction_test9(); +END +$$; + +SELECT * FROM test2; + + +-- Test transaction in procedure with output parameters. This uses a +-- different portal strategy and different code paths in pquery.c. +CREATE PROCEDURE transaction_test10a(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x + 1; + COMMIT; +END; +$$; + +CALL transaction_test10a(10); + +CREATE PROCEDURE transaction_test10b(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x - 1; + ROLLBACK; +END; +$$; + +CALL transaction_test10b(10); + + +-- transaction timestamp vs. statement timestamp +CREATE PROCEDURE transaction_test11() +LANGUAGE plpgsql +AS $$ +DECLARE + s1 timestamp with time zone; + s2 timestamp with time zone; + s3 timestamp with time zone; + t1 timestamp with time zone; + t2 timestamp with time zone; + t3 timestamp with time zone; +BEGIN + s1 := statement_timestamp(); + t1 := transaction_timestamp(); + ASSERT s1 = t1; + PERFORM pg_sleep(0.001); + COMMIT; + s2 := statement_timestamp(); + t2 := transaction_timestamp(); + ASSERT s2 = s1; + ASSERT t2 > t1; + PERFORM pg_sleep(0.001); + ROLLBACK; + s3 := statement_timestamp(); + t3 := transaction_timestamp(); + ASSERT s3 = s1; + ASSERT t3 > t2; +END; +$$; + +CALL transaction_test11(); + +create table test(id int); + +create procedure transaction_test12() as $$ +begin + insert into test values(1); + commit; +end; +$$ language plpgsql; + +create procedure transaction_test13() as $$ +begin + insert into test values(100); + rollback; +end; +$$ language plpgsql; + +create procedure transaction_test14() as $$ +begin + call transaction_test12(); + insert into test values(100); + rollback; +end; +$$ language plpgsql; + +do $$ +begin + call transaction_test12(); + call transaction_test13(); + insert into test values(2); + rollback; + insert into test values(3); + commit; +end; +$$ language plpgsql; + +select * from test order by 1; + +delete from test; + +do $$ +begin + call transaction_test14(); + insert into test values(200); + rollback; +end; +$$ language plpgsql; + +select * from test order by 1; + +DROP TABLE test1; +DROP TABLE test2; +DROP TABLE test3; +DROP TABLE test; From 4e65b9e7919e4a36051ab8843e645416e0e13bd1 Mon Sep 17 00:00:00 2001 From: bethding Date: Thu, 30 Dec 2021 20:34:14 +0800 Subject: [PATCH 470/578] fix prepare for fqs insert http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131095655877&jumpfrom=RTX --- src/backend/pgxc/pool/execRemote.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index b9f9c4a2..c543fc1f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6826,11 +6826,13 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, * exist */ if (exec_nodes && exec_nodes->need_rewrite == true) prepared = false; - else if (step->statement) + if (step->statement) prepared = ActivateDatanodeStatementOnNode(step->statement, PGXCNodeGetNodeId(connection->nodeoid, &nodetype)); + if (prepared && exec_nodes && exec_nodes->need_rewrite == true) + prepared = false; /* * execute and fetch rows only if they will be consumed From 960dbcd831141ba03f6cbc8e380dbecd4d5be377 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 5 Jan 2022 15:03:20 +0800 Subject: [PATCH 471/578] bugfix: rollback slow than commit, tpcc performce optimize(merge request 965), too many logs, adjust log level --- src/backend/pgxc/pool/pgxcnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c 
b/src/backend/pgxc/pool/pgxcnode.c index 78f1a024..f006dc41 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2720,7 +2720,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) /* break, only if the connection is ready for query. */ if (is_ready) { - elog(LOG, "pgxc_node_flush_read node:%s ready for query.", handle->nodename); + elog(DEBUG1, "pgxc_node_flush_read node:%s ready for query.", handle->nodename); break; } @@ -2728,7 +2728,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) read_result = pgxc_node_read_data(handle, true); if (read_result <= 0) { - elog(LOG, "pgxc_node_flush_read node:%s read failure.", handle->nodename); + elog(DEBUG1, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } From e87543794bf8bba933d9424e14ae0a51572dfb3e Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 5 Jan 2022 21:28:37 +0800 Subject: [PATCH 472/578] fix rename database failed caused by leader cn's connection remaining, same as drop database. fix http://tapd.oa.com/TencentDB_for_TBase/prong/stories/view/1020418349870883157 --- src/backend/tcop/utility.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d58ae205..fcdc89f5 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2455,7 +2455,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, case T_DropdbStmt: { char prepareQuery[STRINGLENGTH]; - char query[STRINGLENGTH]; + char cleanQuery[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; if (!stmt->prepare) { @@ -2479,14 +2479,14 @@ standard_ProcessUtility(PlannedStmt *pstmt, if (OidIsValid(db_oid)) { - snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", quote_identifier(stmt->dbname)); if (!is_ddl_leader_cn(leaderCnHandle->nodename)) { - SendLeaderCNUtility(query, false); + SendLeaderCNUtility(cleanQuery, false); SendLeaderCNUtility(prepareQuery, false); } else @@ -2858,6 +2858,13 @@ standard_ProcessUtility(PlannedStmt *pstmt, */ if (!is_leader_cn) { + if (OBJECT_DATABASE == stmt->renameType) { + char cleanQuery[STRINGLENGTH]; + snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + quote_identifier(stmt->subname)); + SendLeaderCNUtility(cleanQuery, false); + } + SendLeaderCNUtility(queryString, is_temp); } ExecRenameStmt(stmt); From 3a34ec77cb90e59649411dbc9f112ea6b1f6f0d4 Mon Sep 17 00:00:00 2001 From: jadenchi Date: Thu, 6 Jan 2022 10:37:31 +0800 Subject: [PATCH 473/578] modify line feed style --- src/backend/tcop/utility.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index fcdc89f5..4f753680 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2858,7 +2858,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, */ if (!is_leader_cn) { - if (OBJECT_DATABASE == stmt->renameType) { + if (OBJECT_DATABASE == stmt->renameType) + { char cleanQuery[STRINGLENGTH]; snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->subname)); From 7d3f95c0f160a9e9ed76be00b9ef2440f1c3b78d Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 6 Jan 2022 11:48:22 +0800 Subject: [PATCH 474/578] bugfix: uos testing core in getTxnInfoOnNode (merge request !1071) 
http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095841513&jump_count=1 (cherry picked from commit fcdbc285) 417a33be bugfix: uos testing core in getTxnInfoOnNode --- contrib/pg_clean/pg_clean.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 0b2f6f98..68d916a5 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -1033,6 +1033,21 @@ void getTxnInfoOnNode(Oid node) ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1))); + if (gid == NULL) + { + elog(ERROR, "node(%d) gid is null, xid: %d", node, xid); + } + else if (owner == NULL) + { + elog(ERROR, "node(%d) owner is null, xid: %d, gid: %s", + node, xid, gid); + } + else if (datname == NULL) + { + elog(ERROR, "node(%d) db name is null, xid: %d, gid: %s, owner: %s", + node, xid, gid, owner); + } + /*add txn to database*/ add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); if (total_twopc_txn >= MAX_TWOPC_TXN) From 73aa3b85f4645d6166378b97abdfb9ea7db0a7bc Mon Sep 17 00:00:00 2001 From: whalesong Date: Sat, 11 Dec 2021 16:24:43 +0800 Subject: [PATCH 475/578] arm compile error fix: inline func check_entry_key error --- src/backend/access/transam/twophase.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 387cdf73..15ddc9f3 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -190,7 +190,7 @@ typedef struct Cache2pcInfo } Cache2pcInfo; -inline void check_entry_key(const char *tid, const char *key); +static inline void check_entry_key(const char *tid, const char *key); bool add_2pc_info(const char *tid, const char *info); @@ -3319,7 +3319,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) /* * check_entry_key: check the entry key in the hash table whether is same with tid. 
*/ -inline void check_entry_key(const char *tid, const char *key) +static inline void check_entry_key(const char *tid, const char *key) { if (enable_2pc_entry_key_check) { From 9246bef0698a61a3ded01da9cf3bd9540ad430a0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 5 Jan 2022 16:52:51 +0800 Subject: [PATCH 476/578] fix modify the system parameters on slave cn http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696095817651 TAPD: --bug=095817651 (cherry picked from commit be98dc09) 5647a757 add fix a4b1d061 fix modify the system parameters on slave cn http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696095817651 --- src/backend/pgxc/pool/poolmgr.c | 3 +-- src/backend/tcop/utility.c | 4 +++- src/include/pgxc/pgxc.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 2db4397e..6e4f3283 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1842,8 +1842,7 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, * the main data node, and the standby cn may generate the same global xid as the main cn, * so disable the distributed query of the standby node on the main plane */ - if (g_allow_distri_query_on_standby_node == false && - IsPGXCMainCluster && RecoveryInProgress()) + if (g_allow_distri_query_on_standby_node == false && IS_PGXC_MAINCLUSTER_SLAVENODE) { elog(ERROR, "can't do distributed query because it is the main plane standby node."); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4f753680..799b83a2 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -7799,7 +7799,9 @@ IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString) case T_LockNodeStmt: #endif return ALLOW; - + case T_AlterSystemStmt: + /* allow if it's main cluster slave */ + return (IS_PGXC_MAINCLUSTER_SLAVENODE) ? 
ALLOW : DISALLOW; default: return DISALLOW; } diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 6c5abcf4..687be6c8 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -120,6 +120,8 @@ extern Datum xc_lockForBackupKey2; (IS_PGXC_COORDINATOR && !IsConnFromCoord()) #define IS_PGXC_REMOTE_COORDINATOR \ (IS_PGXC_COORDINATOR && IsConnFromCoord()) +#define IS_PGXC_MAINCLUSTER_SLAVENODE \ + (IsPGXCMainCluster && RecoveryInProgress()) #define PGXC_PARENT_NODE parentPGXCNode #define PGXC_PARENT_NODE_ID parentPGXCNodeId From 3c007cdc499676120d47456e2a4222f30558534f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 11 Jan 2022 11:16:02 +0800 Subject: [PATCH 477/578] fix core when ExecCloseRemoteStatement http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131095972651 --- src/backend/pgxc/pool/execRemote.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c543fc1f..798f91af 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7289,9 +7289,9 @@ PGXCNodeCleanAndRelease(int code, Datum arg) stat_log(); } -void -ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) -{// #lizard forgives +static void +ExecCloseRemoteStatementInternal(const char *stmt_name, List *nodelist) +{ PGXCNodeAllHandles *all_handles; PGXCNodeHandle **connections; ResponseCombiner combiner; @@ -7379,6 +7379,28 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) pfree_pgxc_all_handles(all_handles); } +/* + * close remote statement needs to be inside a transaction so that syscache can be accessed + */ +void +ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) +{ + bool need_abort = false; + + if (IsTransactionIdle()) + { + StartTransactionCommand(); + need_abort = true; + } + + ExecCloseRemoteStatementInternal(stmt_name, nodelist); + + if (need_abort) + { + AbortCurrentTransaction(); + } +} + /* * DataNodeCopyInBinaryForAll * From 0dd416baffc0fdc7f93c82c204092e326a793ce6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 18 Jan 2022 15:20:24 +0800 Subject: [PATCH 478/578] fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 (merge request !1106) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.5' fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 TAPD: --bug=096263845 --- src/gtm/client/gtm_client.c | 85 +++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 8ff8b131..f9cdbd65 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -179,7 +179,13 @@ begin_replication_initial_sync(GTM_Conn *conn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == NODE_BEGIN_REPLICATION_INIT_RESULT); + if (res->gr_type != NODE_BEGIN_REPLICATION_INIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + } + } else return 0; @@ -226,7 +232,13 @@ end_replication_initial_sync(GTM_Conn *conn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == NODE_END_REPLICATION_INIT_RESULT); + if (res->gr_type != NODE_END_REPLICATION_INIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + } + } return 1; @@ 
-270,6 +282,15 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) if ((res = GTMPQgetResult(conn)) == NULL) goto receive_failed; + if (res->gr_status == GTM_RESULT_OK) + { + Assert(res->gr_type == NODE_LIST_RESULT); + if (res->gr_type != NODE_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_LIST_RESULT); + } + } + /* * Do something here. */ @@ -287,9 +308,6 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) memcpy(&data[i], res->gr_resdata.grd_node_list.nodeinfo[i], sizeof(GTM_PGXCNodeInfo)); } - if (res->gr_status == GTM_RESULT_OK) - Assert(res->gr_type == NODE_LIST_RESULT); - return num_node; receive_failed: @@ -337,7 +355,13 @@ get_next_gxid(GTM_Conn *conn) next_gxid = res->gr_resdata.grd_next_gxid; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == TXN_GET_NEXT_GXID_RESULT); + if (res->gr_type != TXN_GET_NEXT_GXID_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + } + } /* FIXME: should be a number of gxids */ return next_gxid; @@ -382,7 +406,13 @@ get_txn_gxid_list(GTM_Conn *conn, GTM_Transactions *txn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == TXN_GXID_LIST_RESULT); + if (res->gr_type != TXN_GXID_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GXID_LIST_RESULT); + } + } txn_count = gtm_deserialize_transactions(txn, res->gr_resdata.grd_txn_gid_list.ptr, @@ -431,7 +461,13 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == SEQUENCE_LIST_RESULT); + if (res->gr_type != SEQUENCE_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, SEQUENCE_LIST_RESULT); + } + } *seq_list = res->gr_resdata.grd_seq_list.seq; @@ -951,6 +987,10 @@ commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, { Assert(res->gr_type == TXN_COMMIT_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_COMMIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_RESULT); + } if (waited_xid_count > 0) { @@ -1064,6 +1104,11 @@ commit_prepared_transaction_internal(GTM_Conn *conn, { Assert(res->gr_type == TXN_COMMIT_PREPARED_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_COMMIT_PREPARED_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + } + if (waited_xid_count > 0) { if (res->gr_resdata.grd_eof_txn.status == STATUS_DELAYED) @@ -1138,6 +1183,10 @@ abort_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_bac { Assert(res->gr_type == TXN_ROLLBACK_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_ROLLBACK_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_ROLLBACK_RESULT); + } } return res->gr_status; @@ -1211,6 +1260,10 @@ start_prepared_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, ch { Assert(res->gr_type == TXN_START_PREPARED_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_START_PREPARED_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_START_PREPARED_RESULT); + } } return res->gr_status; @@ -1311,6 +1364,10 @@ log_commit_transaction_internal(GTM_Conn *conn, { Assert(res->gr_type == TXN_LOG_TRANSACTION_RESULT); 
Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_LOG_TRANSACTION_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_TRANSACTION_RESULT);
+        }
     }
     return res->gr_status;
@@ -1398,6 +1455,10 @@ log_scan_transaction_internal(GTM_Conn *conn,
     {
         Assert(res->gr_type == TXN_LOG_SCAN_RESULT);
         Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_LOG_SCAN_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_SCAN_RESULT);
+        }
     }
     return res->gr_status;
@@ -1458,6 +1519,10 @@ prepare_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_b
     {
         Assert(res->gr_type == TXN_PREPARE_RESULT);
         Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_PREPARE_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_PREPARE_RESULT);
+        }
     }
     return res->gr_status;
@@ -1651,7 +1716,13 @@ get_storage_file(GTM_Conn *conn, char **data)
         goto receive_failed;
     if (res->gr_status == GTM_RESULT_OK)
+    {
         Assert(res->gr_type == STORAGE_TRANSFER_RESULT);
+        if (res->gr_type != STORAGE_TRANSFER_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, STORAGE_TRANSFER_RESULT);
+        }
+    }
 #ifdef __XLOG__
     *start_pos = res->grd_storage_data.start_pos;
@@ -1921,6 +1992,10 @@ get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped)
     if (res->gr_status == GTM_RESULT_OK)
     {
         Assert(res->gr_type == res_type);
+        if (res->gr_type != res_type)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, res_type);
+        }
         /*
          * !!FIXME - The following assertion fails when snapshots are requested
          * in non-grouping mode. We did some investigations and it appears that
@@ -2691,6 +2766,10 @@ static int node_register_worker(GTM_Conn *conn,
     {
         Assert(res->gr_resdata.grd_node.type == type);
         Assert((strcmp(res->gr_resdata.grd_node.node_name,node_name) == 0));
+        if (res->gr_type != NODE_REGISTER_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_REGISTER_RESULT);
+        }
     }
     return res->gr_status;

From c8724ed8012eb5d8f1523f37db74013caad58413 Mon Sep 17 00:00:00 2001
From: guanhuawang
Date: Tue, 18 Jan 2022 17:35:16 +0800
Subject: [PATCH 479/578] Fix stddev_samp error caused by numeric_poly_combine (merge request !1108)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Squash merge branch 'winter_tbase_v2.15.19.5_fixstddev' into 'Tbase_v2.15.19.5'

Fix stddev_samp error caused by numeric_poly_combine

The function's source and destination arguments were swapped. This is an old
PostgreSQL bug, fixed upstream in 2018; it was exposed here after applying the
patch that disables int128 support.
---
 src/backend/access/transam/twophase.c | 2 +-
 src/backend/utils/adt/numeric.c | 4 ++--
 src/bin/pg_rewind/copy_fetch.c | 10 ++++----
 src/test/regress/expected/aggregates_1.out | 28 ++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql | 18 ++++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 15ddc9f3..e78f9c53 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -190,7 +190,7 @@ typedef struct Cache2pcInfo
 } Cache2pcInfo;
-static inline void check_entry_key(const char *tid, const char *key);
+static void check_entry_key(const char *tid, const char *key);
 bool add_2pc_info(const char *tid, const char *info);
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index d159c430..1fd3fbb6 100644
--- a/src/backend/utils/adt/numeric.c
+++ 
b/src/backend/utils/adt/numeric.c @@ -4104,8 +4104,8 @@ numeric_poly_combine(PG_FUNCTION_ARGS) state1->sumX = state2->sumX; state1->sumX2 = state2->sumX2; #else - accum_sum_copy(&state2->sumX, &state1->sumX); - accum_sum_copy(&state2->sumX2, &state1->sumX2); + accum_sum_copy(&state1->sumX, &state2->sumX); + accum_sum_copy(&state1->sumX2, &state2->sumX2); #endif MemoryContextSwitchTo(old_context); diff --git a/src/bin/pg_rewind/copy_fetch.c b/src/bin/pg_rewind/copy_fetch.c index 7696a6fa..48a206d7 100644 --- a/src/bin/pg_rewind/copy_fetch.c +++ b/src/bin/pg_rewind/copy_fetch.c @@ -156,8 +156,8 @@ recurse_dir(const char *datadir, const char *parentpath, * If 'trunc' is true, any existing file with the same name is truncated. */ static void -rewind_copy_file_range(const char *path, off_t begin, off_t end, bool trunc) -{// #lizard forgives +tbase_copy_file_range(const char *path, off_t begin, off_t end, bool trunc) +{ char buf[BLCKSZ]; char srcpath[MAXPGPATH]; int srcfd; @@ -222,7 +222,7 @@ copy_executeFileMap(filemap_t *map) break; case FILE_ACTION_COPY: - rewind_copy_file_range(entry->path, 0, entry->newsize, true); + tbase_copy_file_range(entry->path, 0, entry->newsize, true); break; case FILE_ACTION_TRUNCATE: @@ -230,7 +230,7 @@ copy_executeFileMap(filemap_t *map) break; case FILE_ACTION_COPY_TAIL: - rewind_copy_file_range(entry->path, entry->oldsize, entry->newsize, false); + tbase_copy_file_range(entry->path, entry->oldsize, entry->newsize, false); break; case FILE_ACTION_CREATE: @@ -257,7 +257,7 @@ execute_pagemap(datapagemap_t *pagemap, const char *path) while (datapagemap_next(iter, &blkno)) { offset = blkno * BLCKSZ; - rewind_copy_file_range(path, offset, offset + BLCKSZ, false); + tbase_copy_file_range(path, offset, offset + BLCKSZ, false); /* Ok, this block has now been copied from new data dir to old */ } pg_free(iter); diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index 9602196b..2bfcbb7f 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -2000,3 +2000,31 @@ select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); (1 row) rollback; + -- test coverage for aggregate combine/serial/deserial functions + BEGIN ISOLATION LEVEL REPEATABLE READ; + SET parallel_setup_cost = 0; + SET parallel_tuple_cost = 0; + SET min_parallel_table_scan_size = 0; + SET max_parallel_workers_per_gather = 4; + SET enable_indexonlyscan = off; + -- variance(int4) covers numeric_poly_combine + -- sum(int8) covers int8_avg_combine + EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + QUERY PLAN + -------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Seq Scan on tenk1 + (6 rows) + + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + variance | sum + ----------------------+---------- + 8334166.666666666667 | 49995000 + (1 row) + + ROLLBACK; \ No newline at end of file diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 11d9db70..b9e0b2b4 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -844,3 +844,21 @@ create aggregate my_half_sum(int4) select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); rollback; + +-- test coverage for aggregate combine/serial/deserial 
functions +BEGIN ISOLATION LEVEL REPEATABLE READ; + +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 4; +SET enable_indexonlyscan = off; + +-- variance(int4) covers numeric_poly_combine +-- sum(int8) covers int8_avg_combine +EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + +SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + +ROLLBACK; From 84f0b9cb913c4dce4d69df6de88f7649f1e166a1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 18 Jan 2022 20:15:14 +0800 Subject: [PATCH 480/578] fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 --- src/gtm/client/gtm_client.c | 48 ++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index f9cdbd65..27232498 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -183,7 +183,8 @@ begin_replication_initial_sync(GTM_Conn *conn) Assert(res->gr_type == NODE_BEGIN_REPLICATION_INIT_RESULT); if (res->gr_type != NODE_BEGIN_REPLICATION_INIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + goto receive_failed; } } else @@ -236,7 +237,8 @@ end_replication_initial_sync(GTM_Conn *conn) Assert(res->gr_type == NODE_END_REPLICATION_INIT_RESULT); if (res->gr_type != NODE_END_REPLICATION_INIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + goto receive_failed; } } @@ -287,7 +289,8 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) Assert(res->gr_type == NODE_LIST_RESULT); if (res->gr_type != NODE_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_LIST_RESULT); + goto receive_failed; } } @@ -359,7 +362,8 @@ get_next_gxid(GTM_Conn *conn) Assert(res->gr_type == TXN_GET_NEXT_GXID_RESULT); if (res->gr_type != TXN_GET_NEXT_GXID_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + fprintf(stderr,"res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + goto receive_failed; } } @@ -410,7 +414,8 @@ get_txn_gxid_list(GTM_Conn *conn, GTM_Transactions *txn) Assert(res->gr_type == TXN_GXID_LIST_RESULT); if (res->gr_type != TXN_GXID_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GXID_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_GXID_LIST_RESULT); + goto receive_failed; } } @@ -465,7 +470,8 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list) Assert(res->gr_type == SEQUENCE_LIST_RESULT); if (res->gr_type != SEQUENCE_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, SEQUENCE_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, SEQUENCE_LIST_RESULT); + goto receive_failed; } } @@ -989,7 +995,8 @@ commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, Assert(res->gr_resdata.grd_gxid == gxid); if 
(res->gr_type != TXN_COMMIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_COMMIT_RESULT); + goto receive_failed; } if (waited_xid_count > 0) @@ -1106,7 +1113,8 @@ commit_prepared_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_COMMIT_PREPARED_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + goto receive_failed; } if (waited_xid_count > 0) @@ -1185,7 +1193,8 @@ abort_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_bac Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_ROLLBACK_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_ROLLBACK_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_ROLLBACK_RESULT); + goto receive_failed; } } @@ -1262,7 +1271,8 @@ start_prepared_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, ch Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_START_PREPARED_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_START_PREPARED_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_START_PREPARED_RESULT); + goto receive_failed; } } @@ -1366,7 +1376,8 @@ log_commit_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_LOG_TRANSACTION_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_TRANSACTION_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_LOG_TRANSACTION_RESULT); + goto receive_failed; } } @@ -1457,7 +1468,8 @@ log_scan_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_LOG_SCAN_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_SCAN_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_LOG_SCAN_RESULT); + goto receive_failed; } } @@ -1521,7 +1533,8 @@ prepare_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_b Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_PREPARE_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_PREPARE_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_PREPARE_RESULT); + goto receive_failed; } } @@ -1720,7 +1733,8 @@ get_storage_file(GTM_Conn *conn, char **data) Assert(res->gr_type == STORAGE_TRANSFER_RESULT); if (res->gr_type != STORAGE_TRANSFER_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, STORAGE_TRANSFER_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, STORAGE_TRANSFER_RESULT); + goto receive_failed; } } @@ -1994,7 +2008,8 @@ get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped) Assert(res->gr_type == res_type); if (res->gr_type != res_type) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, res_type); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, res_type); + goto receive_failed; } /* * !!FIXME - The following assertion fails when snapshots are requested @@ -2768,7 +2783,8 @@ 
static int node_register_worker(GTM_Conn *conn, Assert((strcmp(res->gr_resdata.grd_node.node_name,node_name) == 0)); if (res->gr_type != NODE_REGISTER_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_REGISTER_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_REGISTER_RESULT); + goto receive_failed; } } From 4df0b852896cbe342e2dd510db26339a4e2e4a83 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 19 Jan 2022 12:06:33 +0800 Subject: [PATCH 481/578] Remote DML with dropped column tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131096344771 --- src/backend/pgxc/plan/planner.c | 21 +++++++++++++++++++-- src/backend/pgxc/pool/execRemote.c | 5 +++-- src/test/regress/expected/alter_table_3.out | 19 +++++++++++++++++++ src/test/regress/sql/alter_table.sql | 19 ++++++++++++++++++- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index e2d7158f..9be80f2f 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -844,7 +844,10 @@ pgxc_build_upsert_statement(PlannerInfo *root, CmdType cmdtype, /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, col_att)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = exprType((Node *) tle->expr); @@ -948,13 +951,17 @@ pgxc_build_upsert_statement(PlannerInfo *root, CmdType cmdtype, natts = get_relnatts(res_rel->relid); - rqplan->su_param_types = (Oid *)palloc(natts * sizeof(Oid)); + /* natts + 1(xc_node_id) + 1(ctid) */ + rqplan->su_param_types = (Oid *)palloc((natts + 2) * sizeof(Oid)); for (attnum = 1; attnum <= natts; attnum++) { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, attnum)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = get_atttype(res_rel->relid, attnum); pgxc_add_param_as_tle(query_to_deparse, attnum, @@ -1212,7 +1219,10 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, col_att)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } /* * Create the param to be used for VALUES caluse ($1, $2 ...) @@ -1254,14 +1264,21 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, Oid type; int natts = get_relnatts(res_rel->relid); int attnum; + int appendix = 0; - rqplan->rq_param_types = (Oid *)palloc(natts * sizeof(Oid)); + /* count origin attrs and ctid, nodeid */ + appendix += node_id_found ? 1 : 0; + appendix += ctid_found ? 
1 : 0; + rqplan->rq_param_types = (Oid *)palloc((natts + appendix) * sizeof(Oid)); for (attnum = 1; attnum <= natts; attnum++) { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, attnum)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = get_atttype(res_rel->relid, attnum); pgxc_add_param_as_tle(query_to_deparse, attnum, diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 798f91af..73cfbde5 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12504,6 +12504,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl int numatts = tdesc->natts; ResponseCombiner *combiner = (ResponseCombiner *) node; RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan; + Oid *param_types = step->rq_param_types; Form_pg_attribute att; Oid typeOutput; bool typIsVarlena; @@ -12532,7 +12533,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl uint32 n32; Assert(attindex < numparams); - if (dataSlot->tts_isnull[attindex]) + if (dataSlot->tts_isnull[attindex] || !OidIsValid(param_types[attindex])) { n32 = htonl(-1); appendBinaryStringInfo(&buf, (char *) &n32, 4); @@ -12627,7 +12628,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl uint32 n32; Assert(attindex < numparams); - if (dataSlot->tts_isnull[attindex]) + if (dataSlot->tts_isnull[attindex] || !OidIsValid(param_types[attindex])) { n32 = htonl(-1); appendBinaryStringInfo(&buf, (char *) &n32, 4); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 50bc6605..d6a33ebf 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3684,3 +3684,22 @@ alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +NOTICE: this is a test +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index daa8f09d..42a9bbfe 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2545,4 +2545,21 @@ create table at_test_sql_partop_1 (a int); alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; -drop function at_test_sql_partop; \ No newline at end of file +drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; From 238bdcb9cd79945b9495eeff49b3b63647ffe594 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 21 Jan 2022 19:28:21 +0800 Subject: [PATCH 482/578] fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) Squash merge branch 'sigmalin_v2_tmp' into 'Tbase_v2.15.19.5' fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 TAPD: --bug=096383437 TAPD: --bug=087562597 --- src/backend/access/transam/gtm.c | 18 +++++ src/backend/commands/dbcommands.c | 18 +++++ src/gtm/client/fe-protocol.c | 2 + src/gtm/client/gtm_client.c | 44 ++++++++++ src/gtm/common/gtm_utils.c | 7 ++ src/gtm/main/gtm_seq.c | 106 +++++++++++++++++++++++++ src/gtm/main/gtm_store.c | 20 ++++- src/gtm/main/main.c | 4 + src/include/access/gtm.h | 1 + src/include/gtm/gtm_c.h | 10 +++ src/include/gtm/gtm_client.h | 2 + src/include/gtm/gtm_msg.h | 2 + src/include/gtm/gtm_seq.h | 1 + src/include/gtm/gtm_store.h | 1 + src/test/regress/expected/sequence.out | 29 +++++++ src/test/regress/sql/sequence.sql | 32 ++++++++ 16 files changed, 296 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 03d76457..ded4fdfb 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -2134,6 +2134,24 @@ RenameSequenceGTM(char *seqname, const char *newseqname) return conn ? 
rename_sequence(conn, &seqkey, &newseqkey, GetTopTransactionId()) : -1; } + +/* + * Copy the database sequences from src database + */ +int +CopyDataBaseSequenceGTM(char *src_dbname, char *dest_dbname) +{ + GTM_SequenceKeyData src_seqkey, dest_seqkey; + CheckConnection(); + src_seqkey.gsk_keylen = strlen(src_dbname) + 1; + src_seqkey.gsk_key = src_dbname; + + dest_seqkey.gsk_keylen = strlen(dest_dbname) + 1; + dest_seqkey.gsk_key = (char *) dest_dbname; + return conn ? copy_database_sequence(conn, &src_seqkey, &dest_seqkey, + GetTopTransactionId()) : -1; +} + /* * Register Given Node * Connection for registering is just used once then closed diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 070646fd..b25eed3c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -717,6 +717,24 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) sizeof(fparms.dest_dboid)); #endif + if (IS_PGXC_LOCAL_COORDINATOR) + { + /* + * If we use another database as the template database, and there are + * sequences in the template database, we need to create the sequences + * from template database in gtm as well, it's safe because the source + * database can't being accessed by other now. + */ + RegisterSeqCreate(dbname, GTM_SEQ_DB_NAME); + + if (CopyDataBaseSequenceGTM((char*)dbtemplate, dbname) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not create sequences for database %s from %s", dbname, dbtemplate))); + } + } + return dboid; } diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index a575510e..903e6268 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -679,6 +679,7 @@ break; case SEQUENCE_RESET_RESULT: case SEQUENCE_CLOSE_RESULT: case SEQUENCE_RENAME_RESULT: + case SEQUENCE_COPY_RESULT: case SEQUENCE_ALTER_RESULT: case SEQUENCE_SET_VAL_RESULT: case MSG_DB_SEQUENCE_RENAME_RESULT: @@ -1511,6 +1512,7 @@ gtmpqFreeResultResource(GTM_Result *result) case SEQUENCE_RESET_RESULT: case SEQUENCE_CLOSE_RESULT: case SEQUENCE_RENAME_RESULT: + case SEQUENCE_COPY_RESULT: case SEQUENCE_ALTER_RESULT: case SEQUENCE_SET_VAL_RESULT: case MSG_DB_SEQUENCE_RENAME_RESULT: diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 27232498..1677dfc4 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -2304,6 +2304,50 @@ rename_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey ne return -1; } +/* + * Copy the database sequences from src database + */ +int +copy_database_sequence(GTM_Conn *conn, GTM_SequenceKey src_key, GTM_SequenceKey dest_key, + GlobalTransactionId gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_COPY, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(src_key->gsk_keylen, 4, conn) || + gtmpqPutnchar(src_key->gsk_key, src_key->gsk_keylen, conn)|| + gtmpqPutInt(dest_key->gsk_keylen, 4, conn) || + gtmpqPutnchar(dest_key->gsk_key, dest_key->gsk_keylen, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + return res->gr_status; + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return -1; +} /* * Request from GTM current value of the specified sequence in the specified diff --git a/src/gtm/common/gtm_utils.c b/src/gtm/common/gtm_utils.c index 79eb782b..0aa1aabd 100644 --- a/src/gtm/common/gtm_utils.c +++ b/src/gtm/common/gtm_utils.c @@ -121,6 +121,10 @@ static struct enum_name message_name_tab[] = {MSG_GET_REPLICATION_STATUS,"MSG_GET_REPLICATION_STATUS"}, {MSG_GET_REPLICATION_TRANSFER,"MSG_GET_REPLICATION_TRANSFER"}, #endif + {MSG_GET_STATISTICS, "MSG_GET_STATISTICS"}, + {MSG_GET_ERRORLOG, "MSG_GET_ERRORLOG"}, + {MSG_SEQUENCE_COPY, "MSG_SEQUENCE_COPY"}, + {-1, NULL} }; @@ -174,6 +178,9 @@ static struct enum_name result_name_tab[] = {TXN_FINISH_GID_RESULT, "TXN_FINISH_GID_RESULT"}, {MSG_DB_SEQUENCE_RENAME_RESULT, "DB_SEQUENCE_RENAME_RESULT"}, #endif + {MSG_GET_GTM_STATISTICS_RESULT, "MSG_GET_GTM_STATISTICS_RESULT"}, + {MSG_GET_GTM_ERRORLOG_RESULT, "MSG_GET_GTM_ERRORLOG_RESULT"}, + {SEQUENCE_COPY_RESULT, "SEQUENCE_COPY_RESULT"}, {-1, NULL} }; diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 9abe9580..640f5bcd 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -3619,4 +3619,110 @@ ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) /* FIXME: need to check errors */ } +/* + * Process MSG_SEQUENCE_COPY message. + */ +void +ProcessCopyDataBaseSequenceCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData src_seqkey, dest_seqkey; + StringInfoData buf; + int errcode; + MemoryContext oldContext; + const char *data; + GlobalTransactionId gxid; + GTMStorageHandle *handles = NULL; + int32 i = 0; + int32 count = 0; + + if (Recovery_IsStandby()) + { + if (myport->remote_type != GTM_NODE_GTM) + { + elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); + } + } + + /* get src database name */ + src_seqkey.gsk_keylen = pq_getmsgint(message, sizeof (src_seqkey.gsk_keylen)); + src_seqkey.gsk_key = (char *)pq_getmsgbytes(message, src_seqkey.gsk_keylen); + + /* get dest database name */ + dest_seqkey.gsk_keylen = pq_getmsgint(message, sizeof (dest_seqkey.gsk_keylen)); + dest_seqkey.gsk_key = (char *)pq_getmsgbytes(message, dest_seqkey.gsk_keylen); + + data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + + + /* + * As when creating a sequence, we must use the TopMostMemoryContext + * because the sequence information is not bound to a thread and + * can outlive any of the thread specific contextes. 
+ */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + handles = GTM_StoreGetAllSeqInDatabase(&src_seqkey, &count); + if (handles) + { + for (i = 0; i < count; i++) + { + GTM_SeqCreateInfo create_info; + GTM_SequenceKeyData newseqkey; + char new_key[SEQ_KEY_MAX_LENGTH]; + + GTM_StoreGetSeqCreateInfo(handles[i], &create_info); + /* generate new sequence key name in dest database */ + newseqkey.gsk_keylen = strlen(create_info.seqkey) - strlen(src_seqkey.gsk_key) + strlen(dest_seqkey.gsk_key) + 1; + if (newseqkey.gsk_keylen > SEQ_KEY_MAX_LENGTH) + { + ereport(ERROR, + (errcode, + errmsg("sequence:%s is too long to copy to database %s", create_info.seqkey, dest_seqkey.gsk_key))); + } + snprintf(new_key, SEQ_KEY_MAX_LENGTH, "%s%s", dest_seqkey.gsk_key, create_info.seqkey + strlen(src_seqkey.gsk_key)); + newseqkey.gsk_key = new_key; + + errcode = GTM_SeqOpen(&newseqkey, create_info.increment_by, create_info.minval, create_info.maxval, create_info.startval, + create_info.cycle, gxid); + if (errcode) + { + ereport(ERROR, + (errcode, + errmsg("Failed to create new sequence:%s for:%s", newseqkey.gsk_key,strerror(errcode)))); + } + } + pfree(handles); + } + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + BeforeReplyToClientXLogTrigger(); + + /* Send a SUCCESS message back to the client */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_COPY_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, dest_seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, dest_seqkey.gsk_key, dest_seqkey.gsk_keylen); + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + pq_flush(myport); + } + +} + #endif diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 858c636f..5e0e5ea9 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3401,7 +3401,8 @@ int32 GTM_StoreDropAllSeqInDatabase(GTM_SequenceKey seq_database_key) { seq_info = GetSeqStore(bucket_handle); - if(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) != 0) + if(!(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) == 0 && + seq_info->gs_key.gsk_key[seq_database_key->gsk_keylen - 1] == '.')) { bucket_handle = seq_info->gs_next; continue; @@ -4387,3 +4388,20 @@ void GTM_StoreGetSeqKey(GTMStorageHandle handle, char *key) seq_info = GetSeqStore(handle); snprintf(key, SEQ_KEY_MAX_LENGTH, "%s", seq_info->gs_key.gsk_key); } + +/* + * get seq create info + */ +void GTM_StoreGetSeqCreateInfo(GTMStorageHandle handle, GTM_SeqCreateInfo *create_info) +{ + GTM_StoredSeqInfo *seq_info = NULL; + + seq_info = GetSeqStore(handle); + snprintf(create_info->seqkey, SEQ_KEY_MAX_LENGTH, "%s", seq_info->gs_key.gsk_key); + create_info->increment_by = seq_info->gs_increment_by; + create_info->minval = seq_info->gs_min_value; + create_info->maxval = seq_info->gs_max_value; + /* get gs_value as new sequence's startval */ + create_info->startval = seq_info->gs_value; + create_info->cycle = seq_info->gs_cycle; +} diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 43a9424a..9b6deb6a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -4035,6 +4035,7 @@ ProcessCommand(Port *myport, StringInfo input_message) #ifdef __TBASE__ case MSG_DB_SEQUENCE_RENAME: case MSG_BKUP_DB_SEQUENCE_RENAME: + case MSG_SEQUENCE_COPY: 
#endif ProcessSequenceCommand(myport, mtype, input_message); break; @@ -4797,6 +4798,9 @@ ProcessSequenceCommand(Port *myport, GTM_MessageType mtype, StringInfo message) ProcessSequenceCleanCommand(myport, message, false); break; + case MSG_SEQUENCE_COPY: + ProcessCopyDataBaseSequenceCommand(myport, message); + break; default: Assert(0); /* Shouldn't come here.. keep compiler quite */ } diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 5da0eb6a..4df9fc1e 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -170,6 +170,7 @@ extern int AlterSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence lastval, bool cycle, bool is_restart); extern int DropSequenceGTM(char *name, GTM_SequenceKeyType type); extern int RenameSequenceGTM(char *seqname, const char *newseqname); +extern int CopyDataBaseSequenceGTM(char *src_dbname, char *dest_dbname); extern void CleanGTMSeq(void); /* Barrier */ extern int ReportBarrierGTM(const char *barrier_id); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index ad0be27e..5abc747b 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -269,6 +269,16 @@ typedef struct GTM_StoredSeqInfo pg_crc32c gs_crc; /* crc check value */ }GTM_StoredSeqInfo; +typedef struct GTM_SeqCreateInfo +{ + char seqkey[SEQ_KEY_MAX_LENGTH]; + GTM_Sequence increment_by; + GTM_Sequence minval; + GTM_Sequence maxval; + GTM_Sequence startval; + bool cycle; +} GTM_SeqCreateInfo; + typedef struct GTM_StoredTransactionInfo { char gti_gid[GTM_MAX_SESSION_ID_LEN]; diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index f85e7d13..82effb81 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -443,6 +443,8 @@ int close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); +int copy_database_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, + GlobalTransactionId gxid); int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); int get_current(GTM_Conn *conn, GTM_SequenceKey key, diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index acedc926..7ed4a0ac 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -125,6 +125,7 @@ typedef enum GTM_MessageType MSG_GET_STATISTICS, MSG_GET_ERRORLOG, #endif + MSG_SEQUENCE_COPY, /* * Must be at the end @@ -212,6 +213,7 @@ typedef enum GTM_ResultType MSG_GET_GTM_STATISTICS_RESULT, MSG_GET_GTM_ERRORLOG_RESULT, #endif + SEQUENCE_COPY_RESULT, RESULT_TYPE_COUNT } GTM_ResultType; diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h index d9dd072d..a54e33b4 100644 --- a/src/include/gtm/gtm_seq.h +++ b/src/include/gtm/gtm_seq.h @@ -119,6 +119,7 @@ void ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backu void ProcessSequenceListCommand(Port *myport, StringInfo message); void ProcessSequenceCleanCommand(Port *myport, StringInfo message, bool is_backup); void ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup); +void ProcessCopyDataBaseSequenceCommand(Port *myport, StringInfo message); void decode_seq_key(char* value, GTM_SequenceKey seqkey); void GTM_SaveSeqInfo(FILE *ctlf); diff --git a/src/include/gtm/gtm_store.h b/src/include/gtm/gtm_store.h index 5dfe1cac..81205f8b 
100644 --- a/src/include/gtm/gtm_store.h +++ b/src/include/gtm/gtm_store.h @@ -175,4 +175,5 @@ extern bool GTM_StoreGetSysInfo(int64 *identifier, int64 *lsn, GlobalTimestamp * extern void GTM_PrintControlHeader(void); extern GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, int32 *number); extern void GTM_StoreGetSeqKey(GTMStorageHandle handle, char *key); +extern void GTM_StoreGetSeqCreateInfo(GTMStorageHandle handle, GTM_SeqCreateInfo *seq_info); #endif diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 2eae7bde..d510c4f0 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -851,4 +851,33 @@ DROP SEQUENCE my_seq; DROP SEQUENCE my_seq; CREATE SEQUENCE my_seq; DROP SEQUENCE my_seq; +-- Test sequece when drop database +\c db_seq1 +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +create database db_seq1_bak; +\c db_seq1_bak +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +drop database db_seq1; +\c db_seq1_bak +insert into t1(f2) values(4); +insert into t2(f2) values(5); +insert into t3(f2) values(6); +select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; + gsk_key +------------------------------ + db_seq1_bak.public.t1_f1_seq + db_seq1_bak.public.t2_f1_seq + db_seq1_bak.public.t3_f1_seq +(3 rows) + \q \ No newline at end of file diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index fda62262..a0f8180d 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -433,4 +433,36 @@ DROP SEQUENCE my_seq; DROP SEQUENCE my_seq; CREATE SEQUENCE my_seq; DROP SEQUENCE my_seq; + +-- Test sequece when drop database +\c db_seq1 +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +create database db_seq1_bak; + +\c db_seq1_bak +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +drop database db_seq1; + +\c db_seq1_bak +insert into t1(f2) values(4); +insert into t2(f2) values(5); +insert into t3(f2) values(6); +select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; \q +<<<<<<< HEAD +======= + + + + +>>>>>>> 85b5350be... fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) From 74d47f2d56af2fa7d23901dc3eb8e3155cbe7f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 25 Jan 2022 16:00:13 +0800 Subject: [PATCH 483/578] [BUGFIX] Kill node, fix abort hang problem.Check the return value is ok. 
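The core of this fix is that pgxc_node_flush_read() now reports whether the
socket drained cleanly (returns 0) or the peer connection is broken (returns
EOF), and abort processing checks that result instead of looping forever on a
dead datanode. A minimal sketch of the calling pattern introduced below
(abort_drain_handle() is a hypothetical wrapper added only for illustration;
the handle fields and helper names are the ones used in the diff):

    /* Sketch: drain one handle during abort, giving up on a broken link. */
    static void
    abort_drain_handle(PGXCNodeHandle *handle)
    {
        int read_status = pgxc_node_flush_read(handle);

        if (read_status == EOF || read_status < 0)
        {
            /* Link is broken: mark the connection fatal and stop using it. */
            handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
            add_error_message(handle, "unexpected EOF on datanode connection.");
            return;
        }

        /* Buffers drained; the handle can go back to the idle state. */
        handle->state = DN_CONNECTION_STATE_IDLE;
    }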
--- src/backend/pgxc/pool/execRemote.c | 25 +++++++++++++++++++++++-- src/backend/pgxc/pool/pgxcnode.c | 18 +++++++++++++++--- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 73cfbde5..1340649b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11625,6 +11625,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) int i = 0; bool need_loop_check = false; bool need_sync = true; + int read_status; if (all_handles) { @@ -11642,7 +11643,17 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) elog(DEBUG1, "pgxc_abort_connections node:%s not ready for query, status:%d", handle->nodename, handle->state); if (handle->sock != NO_SOCKET) { - pgxc_node_flush_read(handle); + read_status = pgxc_node_flush_read(handle); + if (read_status == EOF || read_status < 0) + { + /* Can not read - no more actions, just discard connection */ + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(handle, "unexpected EOF on datanode connection."); + elog(LOG, "unexpected EOF on node:%s pid:%d, read_status:%d, EOF:%d", + handle->nodename, handle->backend_pid, read_status, EOF); + return; + } + handle->state = DN_CONNECTION_STATE_IDLE; } /* Clear any previous error messages */ @@ -11679,7 +11690,17 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections node:%s not ready for query, status:%d", handle->nodename, handle->state); - pgxc_node_flush_read(handle); + read_status = pgxc_node_flush_read(handle); + if (read_status == EOF || read_status < 0) + { + /* Can not read - no more actions, just discard connection */ + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(handle, "unexpected EOF on datanode connection."); + elog(LOG, "unexpected EOF on node:%s pid:%d, read_status:%d, EOF:%d", + handle->nodename, handle->backend_pid, read_status, EOF); + return; + } + handle->state = DN_CONNECTION_STATE_IDLE; /* Clear any previous error messages */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index f006dc41..173e06eb 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2690,8 +2690,10 @@ pgxc_node_flush(PGXCNodeHandle *handle) /* * This method won't return until network buffer is empty or error occurs * To ensure all data in network buffers is read and wasted + * + * There are only two possible returns. Return 0 is ok, return is an EOF error when the link is broken. */ -void +int pgxc_node_flush_read(PGXCNodeHandle *handle) {// #lizard forgives bool is_ready= false; @@ -2700,7 +2702,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) if (handle == NULL) { - return; + return 0; } while(true) @@ -2726,11 +2728,19 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) /* break, only if the connection is broken. 
*/ read_result = pgxc_node_read_data(handle, true); - if (read_result <= 0) + + /* If no data can be received, the normal break returns success */ + if (read_result == 0) { elog(DEBUG1, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } + /* If the link breaks, an EOF error is returned */ + else if (read_result == EOF || read_result < 0) + { + elog(LOG, "pgxc_node_flush_read unexpected EOF on node:%s", handle->nodename); + return EOF; + } if (PGXC_CANCEL_DELAY > 0) { @@ -2747,6 +2757,8 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) } } } + + return 0; } /* diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 91db953a..71f8fa40 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -268,7 +268,7 @@ extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); extern int send_some(PGXCNodeHandle * handle, int len); extern int pgxc_node_flush(PGXCNodeHandle *handle); -extern void pgxc_node_flush_read(PGXCNodeHandle *handle); +extern int pgxc_node_flush_read(PGXCNodeHandle *handle); extern char get_message(PGXCNodeHandle *conn, int *len, char **msg); From dbec10999485ea5972ba644e04e836e06d1dfbbd Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 9 Feb 2022 16:10:21 +0800 Subject: [PATCH 484/578] fix parallel gather core http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096652841&url_cache_key=from_url_iteration_list_74a055bf3a26e3712c2da14069948f4c&action_entry_type=bugs --- src/backend/executor/nodeGather.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c index 55686429..52991767 100644 --- a/src/backend/executor/nodeGather.c +++ b/src/backend/executor/nodeGather.c @@ -592,6 +592,10 @@ ExecFinishGather(PlanState *pstate) TupleTableSlot *slot = NULL; GatherState *node = castNode(GatherState, pstate); + /* If there if no pei, no need to set status, no need to read data from workers. 
*/ + if (!node->pei) + return; + (*node->pei->executor_done) = true; if (g_DataPumpDebug) From ea9232db8dc670fcc82e2a33ba869b9815e5ae33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 31 Aug 2021 14:29:46 +0800 Subject: [PATCH 485/578] [BUGFIX] trigger support subtransaction --- src/backend/pgxc/pool/execRemote.c | 2 +- src/pl/plpgsql/src/pl_exec.c | 4 ++ src/test/regress/expected/triggers_1.out | 49 ++++++++++++++++++++++++ src/test/regress/sql/triggers.sql | 40 +++++++++++++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1340649b..9a0912e8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3644,7 +3644,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if ('T' != connections[i]->transaction_status) { elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong" - "transaction_status"); + "transaction_status[%c]", connections[i]->transaction_status); } } diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index dd6cffb0..3dfc4ffb 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -1313,6 +1313,7 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) */ stmt_mcontext = get_stmt_mcontext(estate); + SetEnterPlpgsqlFunc(); BeginInternalSubTransaction(NULL); /* Want to run statements inside function's memory context */ MemoryContextSwitchTo(oldcontext); @@ -1468,6 +1469,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* If no match found, re-throw the error */ if (e == NULL) ReThrowError(edata); + else + FreeErrorData(edata); /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); @@ -1477,6 +1480,7 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) } PG_END_TRY(); + SetExitPlpgsqlFunc(); Assert(save_cur_error == estate->cur_error); } else diff --git a/src/test/regress/expected/triggers_1.out b/src/test/regress/expected/triggers_1.out index ab6838dc..2eae1b48 100644 --- a/src/test/regress/expected/triggers_1.out +++ b/src/test/regress/expected/triggers_1.out @@ -2120,3 +2120,52 @@ drop table my_table; drop function dump_insert(); drop function dump_update(); drop function dump_delete(); +-- trigger support subtransaction +drop table if exists tb1 cascade; +NOTICE: table "tb1" does not exist, skipping +drop table if exists tb3 cascade; +NOTICE: table "tb3" does not exist, skipping +drop function if exists fun_fbjfyj(); +NOTICE: function fun_fbjfyj() does not exist, skipping +create table tb1(a int, b int, c1 varchar(50), c2 varchar(50) COLLATE "pg_catalog"."default", primary key(c1)); +create table tb3( + a int, + d1 varchar(18) COLLATE "pg_catalog"."default", + d2 varchar(600) COLLATE "pg_catalog"."default" +); +CREATE OR REPLACE FUNCTION fun_fbjfyj() + RETURNS trigger AS $BODY$ + DECLARE + TF integer :=0; + BEGIN + begin + select NVL2(MAX(a), '1', '0') INTO TF from tb1 where a = 7; + IF TF = '1' THEN RETURN new; END IF; + new.d1 := '11'; + new.d2 := '111'; + insert into tb1 values(12, 12, new.d1, new.d2); + end; + RETURN new; +exception + when others then + return new; + END +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; +create trigger tb3_insert after insert on tb3 +FOR EACH ROW +EXECUTE PROCEDURE fun_fbjfyj(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported +insert into tb3 
values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +select count(*) from tb3; + count +------- + 6 +(1 row) + +drop table tb1 cascade; +drop table tb3 cascade; +drop function fun_fbjfyj(); diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql index 95b11791..89b019f7 100644 --- a/src/test/regress/sql/triggers.sql +++ b/src/test/regress/sql/triggers.sql @@ -1781,3 +1781,43 @@ drop table my_table; drop function dump_insert(); drop function dump_update(); drop function dump_delete(); + +-- trigger support subtransaction +drop table if exists tb1 cascade; +drop table if exists tb3 cascade; +drop function if exists fun_fbjfyj(); +create table tb1(a int, b int, c1 varchar(50), c2 varchar(50) COLLATE "pg_catalog"."default", primary key(c1)); +create table tb3( + a int, + d1 varchar(18) COLLATE "pg_catalog"."default", + d2 varchar(600) COLLATE "pg_catalog"."default" +); +CREATE OR REPLACE FUNCTION fun_fbjfyj() + RETURNS trigger AS $BODY$ + DECLARE + TF integer :=0; + BEGIN + begin + select NVL2(MAX(a), '1', '0') INTO TF from tb1 where a = 7; + IF TF = '1' THEN RETURN new; END IF; + new.d1 := '11'; + new.d2 := '111'; + insert into tb1 values(12, 12, new.d1, new.d2); + end; + RETURN new; +exception + when others then + return new; + END +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; +create trigger tb3_insert after insert on tb3 +FOR EACH ROW +EXECUTE PROCEDURE fun_fbjfyj(); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +select count(*) from tb3; +drop table tb1 cascade; +drop table tb3 cascade; +drop function fun_fbjfyj(); From 4f92c74a90e0506a751446f7d721c200c00fc57d Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 25 Oct 2021 11:52:37 +0800 Subject: [PATCH 486/578] Two fixes of trigger with cursor and exception block 1. Fix the remote sub txn begin logic when remote conn does not begen main txn yet. 2. Portal of cursor have been dropped when sub-txn(begined for execption case) release/rollback, so we just skip the close statment in expetion block. 
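The second fix reduces to tolerating a missing portal while an exception
handler is running: RollbackAndReleaseCurrentSubTransaction() has already
dropped any portal that was opened inside the aborted block, so CLOSE should
become a no-op there instead of raising "cursor does not exist". A rough
sketch of the guard added to exec_stmt_close() (handle_exceptions is the
execution-state flag introduced by this patch; the SPI calls are the existing
ones in pl_exec.c):

    portal = SPI_cursor_find(curname);
    if (portal == NULL)
    {
        /*
         * During exception cleanup the sub-transaction rollback may already
         * have destroyed the portal; treat CLOSE as a no-op in that case.
         */
        if (estate->handle_exceptions)
            return PLPGSQL_RC_OK;

        ereport(ERROR,
                (errcode(ERRCODE_UNDEFINED_CURSOR),
                 errmsg("cursor \"%s\" does not exist", curname)));
    }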
http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696092502857 --- src/backend/pgxc/pool/execRemote.c | 3 +- src/pl/plpgsql/src/pl_exec.c | 20 +- src/pl/plpgsql/src/plpgsql.h | 1099 ++++++++++++++-------------- 3 files changed, 568 insertions(+), 554 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 9a0912e8..dcb98c13 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3603,8 +3603,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, need_send_begin = true; } - if (connections[i]->plpgsql_need_begin_txn && - connections[i]->plpgsql_need_begin_sub_txn && + if (connections[i]->plpgsql_need_begin_sub_txn && 'I' == connections[i]->transaction_status) { need_send_begin = true; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 3dfc4ffb..5c232317 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -1318,6 +1318,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* Want to run statements inside function's memory context */ MemoryContextSwitchTo(oldcontext); + estate->handle_exceptions = false; + PG_TRY(); { /* @@ -1374,7 +1376,6 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) { ErrorData *edata; ListCell *e; - SetEnterPlpgsqlFunc(); estate->err_text = gettext_noop("during exception cleanup"); @@ -1383,6 +1384,9 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) edata = CopyErrorData(); FlushErrorState(); + /* Mark handling exceptions */ + estate->handle_exceptions = true; + /* Abort the inner transaction */ RollbackAndReleaseCurrentSubTransaction(); MemoryContextSwitchTo(oldcontext); @@ -1455,6 +1459,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) rc = exec_stmts(estate, exception->action); + estate->handle_exceptions = false; + break; } } @@ -1468,15 +1474,16 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* If no match found, re-throw the error */ if (e == NULL) + { + SetExitPlpgsqlFunc(); ReThrowError(edata); + } else FreeErrorData(edata); /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); MemoryContextReset(stmt_mcontext); - - SetExitPlpgsqlFunc(); } PG_END_TRY(); @@ -3450,6 +3457,8 @@ plpgsql_estate_setup(PLpgSQL_execstate *estate, estate->cur_error = NULL; estate->tuple_store = NULL; + estate->handle_exceptions = false; + if (rsi) { estate->tuple_store_cxt = rsi->econtext->ecxt_per_query_memory; @@ -4382,9 +4391,14 @@ exec_stmt_close(PLpgSQL_execstate *estate, PLpgSQL_stmt_close *stmt) portal = SPI_cursor_find(curname); if (portal == NULL) + { + if (estate->handle_exceptions) + return PLPGSQL_RC_OK; + ereport(ERROR, (errcode(ERRCODE_UNDEFINED_CURSOR), errmsg("cursor \"%s\" does not exist", curname))); + } /* ---------- * And close it. 
diff --git a/src/pl/plpgsql/src/plpgsql.h b/src/pl/plpgsql/src/plpgsql.h index 3a810ca2..825a7d5e 100644 --- a/src/pl/plpgsql/src/plpgsql.h +++ b/src/pl/plpgsql/src/plpgsql.h @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * - * plpgsql.h - Definitions for the PL/pgSQL - * procedural language + * plpgsql.h - Definitions for the PL/pgSQL + * procedural language * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/pl/plpgsql/src/plpgsql.h + * src/pl/plpgsql/src/plpgsql.h * *------------------------------------------------------------------------- */ @@ -37,10 +37,10 @@ */ typedef enum PLpgSQL_nsitem_type { - PLPGSQL_NSTYPE_LABEL, - PLPGSQL_NSTYPE_VAR, - PLPGSQL_NSTYPE_ROW, - PLPGSQL_NSTYPE_REC + PLPGSQL_NSTYPE_LABEL, + PLPGSQL_NSTYPE_VAR, + PLPGSQL_NSTYPE_ROW, + PLPGSQL_NSTYPE_REC } PLpgSQL_nsitem_type; /* @@ -48,9 +48,9 @@ typedef enum PLpgSQL_nsitem_type */ typedef enum PLpgSQL_label_type { - PLPGSQL_LABEL_BLOCK, /* DECLARE/BEGIN block */ - PLPGSQL_LABEL_LOOP, /* looping construct */ - PLPGSQL_LABEL_OTHER /* anything else */ + PLPGSQL_LABEL_BLOCK, /* DECLARE/BEGIN block */ + PLPGSQL_LABEL_LOOP, /* looping construct */ + PLPGSQL_LABEL_OTHER /* anything else */ } PLpgSQL_label_type; /* @@ -58,12 +58,12 @@ typedef enum PLpgSQL_label_type */ typedef enum PLpgSQL_datum_type { - PLPGSQL_DTYPE_VAR, - PLPGSQL_DTYPE_ROW, - PLPGSQL_DTYPE_REC, - PLPGSQL_DTYPE_RECFIELD, - PLPGSQL_DTYPE_ARRAYELEM, - PLPGSQL_DTYPE_EXPR + PLPGSQL_DTYPE_VAR, + PLPGSQL_DTYPE_ROW, + PLPGSQL_DTYPE_REC, + PLPGSQL_DTYPE_RECFIELD, + PLPGSQL_DTYPE_ARRAYELEM, + PLPGSQL_DTYPE_EXPR } PLpgSQL_datum_type; /* @@ -71,10 +71,10 @@ typedef enum PLpgSQL_datum_type */ typedef enum PLpgSQL_type_type { - PLPGSQL_TTYPE_SCALAR, /* scalar types and domains */ - PLPGSQL_TTYPE_ROW, /* composite types */ - PLPGSQL_TTYPE_REC, /* RECORD pseudotype */ - PLPGSQL_TTYPE_PSEUDO /* other pseudotypes */ + PLPGSQL_TTYPE_SCALAR, /* scalar types and domains */ + PLPGSQL_TTYPE_ROW, /* composite types */ + PLPGSQL_TTYPE_REC, /* RECORD pseudotype */ + PLPGSQL_TTYPE_PSEUDO /* other pseudotypes */ } PLpgSQL_type_type; /* @@ -82,30 +82,30 @@ typedef enum PLpgSQL_type_type */ typedef enum PLpgSQL_stmt_type { - PLPGSQL_STMT_BLOCK, - PLPGSQL_STMT_ASSIGN, - PLPGSQL_STMT_IF, - PLPGSQL_STMT_CASE, - PLPGSQL_STMT_LOOP, - PLPGSQL_STMT_WHILE, - PLPGSQL_STMT_FORI, - PLPGSQL_STMT_FORS, - PLPGSQL_STMT_FORC, - PLPGSQL_STMT_FOREACH_A, - PLPGSQL_STMT_EXIT, - PLPGSQL_STMT_RETURN, - PLPGSQL_STMT_RETURN_NEXT, - PLPGSQL_STMT_RETURN_QUERY, - PLPGSQL_STMT_RAISE, - PLPGSQL_STMT_ASSERT, - PLPGSQL_STMT_EXECSQL, - PLPGSQL_STMT_DYNEXECUTE, - PLPGSQL_STMT_DYNFORS, - PLPGSQL_STMT_GETDIAG, - PLPGSQL_STMT_OPEN, - PLPGSQL_STMT_FETCH, - PLPGSQL_STMT_CLOSE, - PLPGSQL_STMT_PERFORM + PLPGSQL_STMT_BLOCK, + PLPGSQL_STMT_ASSIGN, + PLPGSQL_STMT_IF, + PLPGSQL_STMT_CASE, + PLPGSQL_STMT_LOOP, + PLPGSQL_STMT_WHILE, + PLPGSQL_STMT_FORI, + PLPGSQL_STMT_FORS, + PLPGSQL_STMT_FORC, + PLPGSQL_STMT_FOREACH_A, + PLPGSQL_STMT_EXIT, + PLPGSQL_STMT_RETURN, + PLPGSQL_STMT_RETURN_NEXT, + PLPGSQL_STMT_RETURN_QUERY, + PLPGSQL_STMT_RAISE, + PLPGSQL_STMT_ASSERT, + PLPGSQL_STMT_EXECSQL, + PLPGSQL_STMT_DYNEXECUTE, + PLPGSQL_STMT_DYNFORS, + PLPGSQL_STMT_GETDIAG, + PLPGSQL_STMT_OPEN, + PLPGSQL_STMT_FETCH, + PLPGSQL_STMT_CLOSE, + PLPGSQL_STMT_PERFORM } PLpgSQL_stmt_type; /* @@ -113,10 +113,10 @@ typedef enum PLpgSQL_stmt_type */ enum { - PLPGSQL_RC_OK, - 
PLPGSQL_RC_EXIT, - PLPGSQL_RC_RETURN, - PLPGSQL_RC_CONTINUE + PLPGSQL_RC_OK, + PLPGSQL_RC_EXIT, + PLPGSQL_RC_RETURN, + PLPGSQL_RC_CONTINUE }; /* @@ -124,19 +124,19 @@ enum */ typedef enum PLpgSQL_getdiag_kind { - PLPGSQL_GETDIAG_ROW_COUNT, - PLPGSQL_GETDIAG_RESULT_OID, - PLPGSQL_GETDIAG_CONTEXT, - PLPGSQL_GETDIAG_ERROR_CONTEXT, - PLPGSQL_GETDIAG_ERROR_DETAIL, - PLPGSQL_GETDIAG_ERROR_HINT, - PLPGSQL_GETDIAG_RETURNED_SQLSTATE, - PLPGSQL_GETDIAG_COLUMN_NAME, - PLPGSQL_GETDIAG_CONSTRAINT_NAME, - PLPGSQL_GETDIAG_DATATYPE_NAME, - PLPGSQL_GETDIAG_MESSAGE_TEXT, - PLPGSQL_GETDIAG_TABLE_NAME, - PLPGSQL_GETDIAG_SCHEMA_NAME + PLPGSQL_GETDIAG_ROW_COUNT, + PLPGSQL_GETDIAG_RESULT_OID, + PLPGSQL_GETDIAG_CONTEXT, + PLPGSQL_GETDIAG_ERROR_CONTEXT, + PLPGSQL_GETDIAG_ERROR_DETAIL, + PLPGSQL_GETDIAG_ERROR_HINT, + PLPGSQL_GETDIAG_RETURNED_SQLSTATE, + PLPGSQL_GETDIAG_COLUMN_NAME, + PLPGSQL_GETDIAG_CONSTRAINT_NAME, + PLPGSQL_GETDIAG_DATATYPE_NAME, + PLPGSQL_GETDIAG_MESSAGE_TEXT, + PLPGSQL_GETDIAG_TABLE_NAME, + PLPGSQL_GETDIAG_SCHEMA_NAME } PLpgSQL_getdiag_kind; /* @@ -144,15 +144,15 @@ typedef enum PLpgSQL_getdiag_kind */ typedef enum PLpgSQL_raise_option_type { - PLPGSQL_RAISEOPTION_ERRCODE, - PLPGSQL_RAISEOPTION_MESSAGE, - PLPGSQL_RAISEOPTION_DETAIL, - PLPGSQL_RAISEOPTION_HINT, - PLPGSQL_RAISEOPTION_COLUMN, - PLPGSQL_RAISEOPTION_CONSTRAINT, - PLPGSQL_RAISEOPTION_DATATYPE, - PLPGSQL_RAISEOPTION_TABLE, - PLPGSQL_RAISEOPTION_SCHEMA + PLPGSQL_RAISEOPTION_ERRCODE, + PLPGSQL_RAISEOPTION_MESSAGE, + PLPGSQL_RAISEOPTION_DETAIL, + PLPGSQL_RAISEOPTION_HINT, + PLPGSQL_RAISEOPTION_COLUMN, + PLPGSQL_RAISEOPTION_CONSTRAINT, + PLPGSQL_RAISEOPTION_DATATYPE, + PLPGSQL_RAISEOPTION_TABLE, + PLPGSQL_RAISEOPTION_SCHEMA } PLpgSQL_raise_option_type; /* @@ -160,9 +160,9 @@ typedef enum PLpgSQL_raise_option_type */ typedef enum PLpgSQL_resolve_option { - PLPGSQL_RESOLVE_ERROR, /* throw error if ambiguous */ - PLPGSQL_RESOLVE_VARIABLE, /* prefer plpgsql var to table column */ - PLPGSQL_RESOLVE_COLUMN /* prefer table column to plpgsql var */ + PLPGSQL_RESOLVE_ERROR, /* throw error if ambiguous */ + PLPGSQL_RESOLVE_VARIABLE, /* prefer plpgsql var to table column */ + PLPGSQL_RESOLVE_COLUMN /* prefer table column to plpgsql var */ } PLpgSQL_resolve_option; @@ -175,16 +175,16 @@ typedef enum PLpgSQL_resolve_option */ typedef struct PLpgSQL_type { - char *typname; /* (simple) name of the type */ - Oid typoid; /* OID of the data type */ - PLpgSQL_type_type ttype; /* PLPGSQL_TTYPE_ code */ - int16 typlen; /* stuff copied from its pg_type entry */ - bool typbyval; - char typtype; - Oid typrelid; - Oid collation; /* from pg_type, but can be overridden */ - bool typisarray; /* is "true" array, or domain over one */ - int32 atttypmod; /* typmod (taken from someplace else) */ + char *typname; /* (simple) name of the type */ + Oid typoid; /* OID of the data type */ + PLpgSQL_type_type ttype; /* PLPGSQL_TTYPE_ code */ + int16 typlen; /* stuff copied from its pg_type entry */ + bool typbyval; + char typtype; + Oid typrelid; + Oid collation; /* from pg_type, but can be overridden */ + bool typisarray; /* is "true" array, or domain over one */ + int32 atttypmod; /* typmod (taken from someplace else) */ } PLpgSQL_type; /* @@ -195,8 +195,8 @@ typedef struct PLpgSQL_type */ typedef struct PLpgSQL_datum { - PLpgSQL_datum_type dtype; - int dno; + PLpgSQL_datum_type dtype; + int dno; } PLpgSQL_datum; /* @@ -207,10 +207,10 @@ typedef struct PLpgSQL_datum */ typedef struct PLpgSQL_variable { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; + 
PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; } PLpgSQL_variable; /* @@ -218,34 +218,34 @@ typedef struct PLpgSQL_variable */ typedef struct PLpgSQL_expr { - PLpgSQL_datum_type dtype; - int dno; - char *query; - SPIPlanPtr plan; - Bitmapset *paramnos; /* all dnos referenced by this query */ - int rwparam; /* dno of read/write param, or -1 if none */ - - /* function containing this expr (not set until we first parse query) */ - struct PLpgSQL_function *func; - - /* namespace chain visible to this expr */ - struct PLpgSQL_nsitem *ns; - - /* fields for "simple expression" fast-path execution: */ - Expr *expr_simple_expr; /* NULL means not a simple expr */ - int expr_simple_generation; /* plancache generation we checked */ - Oid expr_simple_type; /* result type Oid, if simple */ - int32 expr_simple_typmod; /* result typmod, if simple */ - - /* - * if expr is simple AND prepared in current transaction, - * expr_simple_state and expr_simple_in_use are valid. Test validity by - * seeing if expr_simple_lxid matches current LXID. (If not, - * expr_simple_state probably points at garbage!) - */ - ExprState *expr_simple_state; /* eval tree for expr_simple_expr */ - bool expr_simple_in_use; /* true if eval tree is active */ - LocalTransactionId expr_simple_lxid; + PLpgSQL_datum_type dtype; + int dno; + char *query; + SPIPlanPtr plan; + Bitmapset *paramnos; /* all dnos referenced by this query */ + int rwparam; /* dno of read/write param, or -1 if none */ + + /* function containing this expr (not set until we first parse query) */ + struct PLpgSQL_function *func; + + /* namespace chain visible to this expr */ + struct PLpgSQL_nsitem *ns; + + /* fields for "simple expression" fast-path execution: */ + Expr *expr_simple_expr; /* NULL means not a simple expr */ + int expr_simple_generation; /* plancache generation we checked */ + Oid expr_simple_type; /* result type Oid, if simple */ + int32 expr_simple_typmod; /* result typmod, if simple */ + + /* + * if expr is simple AND prepared in current transaction, + * expr_simple_state and expr_simple_in_use are valid. Test validity by + * seeing if expr_simple_lxid matches current LXID. (If not, + * expr_simple_state probably points at garbage!) + */ + ExprState *expr_simple_state; /* eval tree for expr_simple_expr */ + bool expr_simple_in_use; /* true if eval tree is active */ + LocalTransactionId expr_simple_lxid; } PLpgSQL_expr; /* @@ -253,22 +253,22 @@ typedef struct PLpgSQL_expr */ typedef struct PLpgSQL_var { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - PLpgSQL_type *datatype; - int isconst; - int notnull; - PLpgSQL_expr *default_val; - PLpgSQL_expr *cursor_explicit_expr; - int cursor_explicit_argrow; - int cursor_options; - - Datum value; - bool isnull; - bool freeval; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + PLpgSQL_type *datatype; + int isconst; + int notnull; + PLpgSQL_expr *default_val; + PLpgSQL_expr *cursor_explicit_expr; + int cursor_explicit_argrow; + int cursor_options; + + Datum value; + bool isnull; + bool freeval; } PLpgSQL_var; /* @@ -276,22 +276,22 @@ typedef struct PLpgSQL_var */ typedef struct PLpgSQL_row { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - /* Note: TupleDesc is only set up for named rowtypes, else it is NULL. 
*/ - TupleDesc rowtupdesc; - - /* - * Note: if the underlying rowtype contains a dropped column, the - * corresponding fieldnames[] entry will be NULL, and there is no - * corresponding var (varnos[] will be -1). - */ - int nfields; - char **fieldnames; - int *varnos; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + /* Note: TupleDesc is only set up for named rowtypes, else it is NULL. */ + TupleDesc rowtupdesc; + + /* + * Note: if the underlying rowtype contains a dropped column, the + * corresponding fieldnames[] entry will be NULL, and there is no + * corresponding var (varnos[] will be -1). + */ + int nfields; + char **fieldnames; + int *varnos; } PLpgSQL_row; /* @@ -299,15 +299,15 @@ typedef struct PLpgSQL_row */ typedef struct PLpgSQL_rec { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - HeapTuple tup; - TupleDesc tupdesc; - bool freetup; - bool freetupdesc; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + HeapTuple tup; + TupleDesc tupdesc; + bool freetup; + bool freetupdesc; } PLpgSQL_rec; /* @@ -315,10 +315,10 @@ typedef struct PLpgSQL_rec */ typedef struct PLpgSQL_recfield { - PLpgSQL_datum_type dtype; - int dno; - char *fieldname; - int recparentno; /* dno of parent record */ + PLpgSQL_datum_type dtype; + int dno; + char *fieldname; + int recparentno; /* dno of parent record */ } PLpgSQL_recfield; /* @@ -326,21 +326,21 @@ typedef struct PLpgSQL_recfield */ typedef struct PLpgSQL_arrayelem { - PLpgSQL_datum_type dtype; - int dno; - PLpgSQL_expr *subscript; - int arrayparentno; /* dno of parent array variable */ - - /* Remaining fields are cached info about the array variable's type */ - Oid parenttypoid; /* type of array variable; 0 if not yet set */ - int32 parenttypmod; /* typmod of array variable */ - Oid arraytypoid; /* OID of actual array type */ - int32 arraytypmod; /* typmod of array (and its elements too) */ - int16 arraytyplen; /* typlen of array type */ - Oid elemtypoid; /* OID of array element type */ - int16 elemtyplen; /* typlen of element type */ - bool elemtypbyval; /* element type is pass-by-value? */ - char elemtypalign; /* typalign of element type */ + PLpgSQL_datum_type dtype; + int dno; + PLpgSQL_expr *subscript; + int arrayparentno; /* dno of parent array variable */ + + /* Remaining fields are cached info about the array variable's type */ + Oid parenttypoid; /* type of array variable; 0 if not yet set */ + int32 parenttypmod; /* typmod of array variable */ + Oid arraytypoid; /* OID of actual array type */ + int32 arraytypmod; /* typmod of array (and its elements too) */ + int16 arraytyplen; /* typlen of array type */ + Oid elemtypoid; /* OID of array element type */ + int16 elemtyplen; /* typlen of element type */ + bool elemtypbyval; /* element type is pass-by-value? */ + char elemtypalign; /* typalign of element type */ } PLpgSQL_arrayelem; /* @@ -348,15 +348,15 @@ typedef struct PLpgSQL_arrayelem */ typedef struct PLpgSQL_nsitem { - PLpgSQL_nsitem_type itemtype; - - /* - * For labels, itemno is a value of enum PLpgSQL_label_type. For other - * itemtypes, itemno is the associated PLpgSQL_datum's dno. - */ - int itemno; - struct PLpgSQL_nsitem *prev; - char name[FLEXIBLE_ARRAY_MEMBER]; /* nul-terminated string */ + PLpgSQL_nsitem_type itemtype; + + /* + * For labels, itemno is a value of enum PLpgSQL_label_type. For other + * itemtypes, itemno is the associated PLpgSQL_datum's dno. 
+ */ + int itemno; + struct PLpgSQL_nsitem *prev; + char name[FLEXIBLE_ARRAY_MEMBER]; /* nul-terminated string */ } PLpgSQL_nsitem; /* @@ -364,8 +364,8 @@ typedef struct PLpgSQL_nsitem */ typedef struct PLpgSQL_stmt { - PLpgSQL_stmt_type cmd_type; - int lineno; + PLpgSQL_stmt_type cmd_type; + int lineno; } PLpgSQL_stmt; /* @@ -373,9 +373,9 @@ typedef struct PLpgSQL_stmt */ typedef struct PLpgSQL_condition { - int sqlerrstate; /* SQLSTATE code */ - char *condname; /* condition name (for debugging) */ - struct PLpgSQL_condition *next; + int sqlerrstate; /* SQLSTATE code */ + char *condname; /* condition name (for debugging) */ + struct PLpgSQL_condition *next; } PLpgSQL_condition; /* @@ -383,9 +383,9 @@ typedef struct PLpgSQL_condition */ typedef struct PLpgSQL_exception_block { - int sqlstate_varno; - int sqlerrm_varno; - List *exc_list; /* List of WHEN clauses */ + int sqlstate_varno; + int sqlerrm_varno; + List *exc_list; /* List of WHEN clauses */ } PLpgSQL_exception_block; /* @@ -393,9 +393,9 @@ typedef struct PLpgSQL_exception_block */ typedef struct PLpgSQL_exception { - int lineno; - PLpgSQL_condition *conditions; - List *action; /* List of statements */ + int lineno; + PLpgSQL_condition *conditions; + List *action; /* List of statements */ } PLpgSQL_exception; /* @@ -403,13 +403,13 @@ typedef struct PLpgSQL_exception */ typedef struct PLpgSQL_stmt_block { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - List *body; /* List of statements */ - int n_initvars; - int *initvarnos; - PLpgSQL_exception_block *exceptions; + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + List *body; /* List of statements */ + int n_initvars; + int *initvarnos; + PLpgSQL_exception_block *exceptions; } PLpgSQL_stmt_block; /* @@ -417,10 +417,10 @@ typedef struct PLpgSQL_stmt_block */ typedef struct PLpgSQL_stmt_assign { - PLpgSQL_stmt_type cmd_type; - int lineno; - int varno; - PLpgSQL_expr *expr; + PLpgSQL_stmt_type cmd_type; + int lineno; + int varno; + PLpgSQL_expr *expr; } PLpgSQL_stmt_assign; /* @@ -428,9 +428,9 @@ typedef struct PLpgSQL_stmt_assign */ typedef struct PLpgSQL_stmt_perform { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; } PLpgSQL_stmt_perform; /* @@ -438,8 +438,8 @@ typedef struct PLpgSQL_stmt_perform */ typedef struct PLpgSQL_diag_item { - PLpgSQL_getdiag_kind kind; /* id for diagnostic value desired */ - int target; /* where to assign it */ + PLpgSQL_getdiag_kind kind; /* id for diagnostic value desired */ + int target; /* where to assign it */ } PLpgSQL_diag_item; /* @@ -447,10 +447,10 @@ typedef struct PLpgSQL_diag_item */ typedef struct PLpgSQL_stmt_getdiag { - PLpgSQL_stmt_type cmd_type; - int lineno; - bool is_stacked; /* STACKED or CURRENT diagnostics area? */ - List *diag_items; /* List of PLpgSQL_diag_item */ + PLpgSQL_stmt_type cmd_type; + int lineno; + bool is_stacked; /* STACKED or CURRENT diagnostics area? 
*/ + List *diag_items; /* List of PLpgSQL_diag_item */ } PLpgSQL_stmt_getdiag; /* @@ -458,12 +458,12 @@ typedef struct PLpgSQL_stmt_getdiag */ typedef struct PLpgSQL_stmt_if { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *cond; /* boolean expression for THEN */ - List *then_body; /* List of statements */ - List *elsif_list; /* List of PLpgSQL_if_elsif structs */ - List *else_body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *cond; /* boolean expression for THEN */ + List *then_body; /* List of statements */ + List *elsif_list; /* List of PLpgSQL_if_elsif structs */ + List *else_body; /* List of statements */ } PLpgSQL_stmt_if; /* @@ -471,9 +471,9 @@ typedef struct PLpgSQL_stmt_if */ typedef struct PLpgSQL_if_elsif { - int lineno; - PLpgSQL_expr *cond; /* boolean expression for this case */ - List *stmts; /* List of statements */ + int lineno; + PLpgSQL_expr *cond; /* boolean expression for this case */ + List *stmts; /* List of statements */ } PLpgSQL_if_elsif; /* @@ -481,13 +481,13 @@ typedef struct PLpgSQL_if_elsif */ typedef struct PLpgSQL_stmt_case { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *t_expr; /* test expression, or NULL if none */ - int t_varno; /* var to store test expression value into */ - List *case_when_list; /* List of PLpgSQL_case_when structs */ - bool have_else; /* flag needed because list could be empty */ - List *else_stmts; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *t_expr; /* test expression, or NULL if none */ + int t_varno; /* var to store test expression value into */ + List *case_when_list; /* List of PLpgSQL_case_when structs */ + bool have_else; /* flag needed because list could be empty */ + List *else_stmts; /* List of statements */ } PLpgSQL_stmt_case; /* @@ -495,9 +495,9 @@ typedef struct PLpgSQL_stmt_case */ typedef struct PLpgSQL_case_when { - int lineno; - PLpgSQL_expr *expr; /* boolean expression for this case */ - List *stmts; /* List of statements */ + int lineno; + PLpgSQL_expr *expr; /* boolean expression for this case */ + List *stmts; /* List of statements */ } PLpgSQL_case_when; /* @@ -505,10 +505,10 @@ typedef struct PLpgSQL_case_when */ typedef struct PLpgSQL_stmt_loop { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + List *body; /* List of statements */ } PLpgSQL_stmt_loop; /* @@ -516,11 +516,11 @@ typedef struct PLpgSQL_stmt_loop */ typedef struct PLpgSQL_stmt_while { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_expr *cond; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_expr *cond; + List *body; /* List of statements */ } PLpgSQL_stmt_while; /* @@ -528,15 +528,15 @@ typedef struct PLpgSQL_stmt_while */ typedef struct PLpgSQL_stmt_fori { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_var *var; - PLpgSQL_expr *lower; - PLpgSQL_expr *upper; - PLpgSQL_expr *step; /* NULL means default (ie, BY 1) */ - int reverse; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_var *var; + PLpgSQL_expr *lower; + PLpgSQL_expr *upper; + PLpgSQL_expr *step; /* NULL means default (ie, BY 1) */ + int reverse; + List *body; /* List of statements */ } PLpgSQL_stmt_fori; /* @@ -546,12 +546,12 @@ typedef struct PLpgSQL_stmt_fori */ typedef struct PLpgSQL_stmt_forq { - PLpgSQL_stmt_type 
cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ } PLpgSQL_stmt_forq; /* @@ -559,14 +559,14 @@ typedef struct PLpgSQL_stmt_forq */ typedef struct PLpgSQL_stmt_fors { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - PLpgSQL_expr *query; + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + PLpgSQL_expr *query; } PLpgSQL_stmt_fors; /* @@ -574,15 +574,15 @@ typedef struct PLpgSQL_stmt_fors */ typedef struct PLpgSQL_stmt_forc { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - int curvar; - PLpgSQL_expr *argquery; /* cursor arguments if any */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + int curvar; + PLpgSQL_expr *argquery; /* cursor arguments if any */ } PLpgSQL_stmt_forc; /* @@ -590,15 +590,15 @@ typedef struct PLpgSQL_stmt_forc */ typedef struct PLpgSQL_stmt_dynfors { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - PLpgSQL_expr *query; - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + PLpgSQL_expr *query; + List *params; /* USING expressions */ } PLpgSQL_stmt_dynfors; /* @@ -606,13 +606,13 @@ typedef struct PLpgSQL_stmt_dynfors */ typedef struct PLpgSQL_stmt_foreach_a { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - int varno; /* loop target variable */ - int slice; /* slice dimension, or 0 */ - PLpgSQL_expr *expr; /* array expression */ - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + int varno; /* loop target variable */ + int slice; /* slice dimension, or 0 */ + PLpgSQL_expr *expr; /* array expression */ + List *body; /* List of statements */ } PLpgSQL_stmt_foreach_a; /* @@ -620,15 +620,15 @@ typedef struct PLpgSQL_stmt_foreach_a */ typedef struct PLpgSQL_stmt_open { - PLpgSQL_stmt_type cmd_type; - int lineno; - int curvar; - int cursor_options; - PLpgSQL_row *returntype; - PLpgSQL_expr *argquery; - PLpgSQL_expr *query; - PLpgSQL_expr *dynquery; - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + int curvar; + int cursor_options; + PLpgSQL_row *returntype; + PLpgSQL_expr *argquery; + PLpgSQL_expr *query; + PLpgSQL_expr *dynquery; + List *params; /* USING expressions */ } PLpgSQL_stmt_open; /* @@ -636,16 +636,16 @@ typedef struct PLpgSQL_stmt_open */ typedef struct PLpgSQL_stmt_fetch { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_rec *rec; /* target, as record or row */ - PLpgSQL_row *row; - int curvar; /* cursor variable to fetch from */ - FetchDirection direction; /* fetch 
direction */ - long how_many; /* count, if constant (expr is NULL) */ - PLpgSQL_expr *expr; /* count, if expression */ - bool is_move; /* is this a fetch or move? */ - bool returns_multiple_rows; /* can return more than one row? */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_rec *rec; /* target, as record or row */ + PLpgSQL_row *row; + int curvar; /* cursor variable to fetch from */ + FetchDirection direction; /* fetch direction */ + long how_many; /* count, if constant (expr is NULL) */ + PLpgSQL_expr *expr; /* count, if expression */ + bool is_move; /* is this a fetch or move? */ + bool returns_multiple_rows; /* can return more than one row? */ } PLpgSQL_stmt_fetch; /* @@ -653,9 +653,9 @@ typedef struct PLpgSQL_stmt_fetch */ typedef struct PLpgSQL_stmt_close { - PLpgSQL_stmt_type cmd_type; - int lineno; - int curvar; + PLpgSQL_stmt_type cmd_type; + int lineno; + int curvar; } PLpgSQL_stmt_close; /* @@ -663,11 +663,11 @@ typedef struct PLpgSQL_stmt_close */ typedef struct PLpgSQL_stmt_exit { - PLpgSQL_stmt_type cmd_type; - int lineno; - bool is_exit; /* Is this an exit or a continue? */ - char *label; /* NULL if it's an unlabelled EXIT/CONTINUE */ - PLpgSQL_expr *cond; + PLpgSQL_stmt_type cmd_type; + int lineno; + bool is_exit; /* Is this an exit or a continue? */ + char *label; /* NULL if it's an unlabelled EXIT/CONTINUE */ + PLpgSQL_expr *cond; } PLpgSQL_stmt_exit; /* @@ -675,10 +675,10 @@ typedef struct PLpgSQL_stmt_exit */ typedef struct PLpgSQL_stmt_return { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; - int retvarno; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; + int retvarno; } PLpgSQL_stmt_return; /* @@ -686,10 +686,10 @@ typedef struct PLpgSQL_stmt_return */ typedef struct PLpgSQL_stmt_return_next { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; - int retvarno; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; + int retvarno; } PLpgSQL_stmt_return_next; /* @@ -697,11 +697,11 @@ typedef struct PLpgSQL_stmt_return_next */ typedef struct PLpgSQL_stmt_return_query { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *query; /* if static query */ - PLpgSQL_expr *dynquery; /* if dynamic query (RETURN QUERY EXECUTE) */ - List *params; /* USING arguments for dynamic query */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *query; /* if static query */ + PLpgSQL_expr *dynquery; /* if dynamic query (RETURN QUERY EXECUTE) */ + List *params; /* USING arguments for dynamic query */ } PLpgSQL_stmt_return_query; /* @@ -709,13 +709,13 @@ typedef struct PLpgSQL_stmt_return_query */ typedef struct PLpgSQL_stmt_raise { - PLpgSQL_stmt_type cmd_type; - int lineno; - int elog_level; - char *condname; /* condition name, SQLSTATE, or NULL */ - char *message; /* old-style message format literal, or NULL */ - List *params; /* list of expressions for old-style message */ - List *options; /* list of PLpgSQL_raise_option */ + PLpgSQL_stmt_type cmd_type; + int lineno; + int elog_level; + char *condname; /* condition name, SQLSTATE, or NULL */ + char *message; /* old-style message format literal, or NULL */ + List *params; /* list of expressions for old-style message */ + List *options; /* list of PLpgSQL_raise_option */ } PLpgSQL_stmt_raise; /* @@ -723,8 +723,8 @@ typedef struct PLpgSQL_stmt_raise */ typedef struct PLpgSQL_raise_option { - PLpgSQL_raise_option_type opt_type; - PLpgSQL_expr *expr; + PLpgSQL_raise_option_type opt_type; + PLpgSQL_expr *expr; } PLpgSQL_raise_option; /* @@ -732,10 +732,10 @@ 
typedef struct PLpgSQL_raise_option */ typedef struct PLpgSQL_stmt_assert { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *cond; - PLpgSQL_expr *message; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *cond; + PLpgSQL_expr *message; } PLpgSQL_stmt_assert; /* @@ -743,15 +743,15 @@ typedef struct PLpgSQL_stmt_assert */ typedef struct PLpgSQL_stmt_execsql { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *sqlstmt; - bool mod_stmt; /* is the stmt INSERT/UPDATE/DELETE? Note: - * mod_stmt is set when we plan the query */ - bool into; /* INTO supplied? */ - bool strict; /* INTO STRICT flag */ - PLpgSQL_rec *rec; /* INTO target, if record */ - PLpgSQL_row *row; /* INTO target, if row */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *sqlstmt; + bool mod_stmt; /* is the stmt INSERT/UPDATE/DELETE? Note: + * mod_stmt is set when we plan the query */ + bool into; /* INTO supplied? */ + bool strict; /* INTO STRICT flag */ + PLpgSQL_rec *rec; /* INTO target, if record */ + PLpgSQL_row *row; /* INTO target, if row */ } PLpgSQL_stmt_execsql; /* @@ -759,14 +759,14 @@ typedef struct PLpgSQL_stmt_execsql */ typedef struct PLpgSQL_stmt_dynexecute { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *query; /* string expression */ - bool into; /* INTO supplied? */ - bool strict; /* INTO STRICT flag */ - PLpgSQL_rec *rec; /* INTO target, if record */ - PLpgSQL_row *row; /* INTO target, if row */ - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *query; /* string expression */ + bool into; /* INTO supplied? */ + bool strict; /* INTO STRICT flag */ + PLpgSQL_rec *rec; /* INTO target, if record */ + PLpgSQL_row *row; /* INTO target, if row */ + List *params; /* USING expressions */ } PLpgSQL_stmt_dynexecute; /* @@ -774,32 +774,32 @@ typedef struct PLpgSQL_stmt_dynexecute */ typedef struct PLpgSQL_func_hashkey { - Oid funcOid; - - bool isTrigger; /* true if called as a trigger */ - - /* be careful that pad bytes in this struct get zeroed! */ - - /* - * For a trigger function, the OID of the trigger is part of the hash key - * --- we want to compile the trigger function separately for each trigger - * it is used with, in case the rowtype or transition table names are - * different. Zero if not called as a trigger. - */ - Oid trigOid; - - /* - * We must include the input collation as part of the hash key too, - * because we have to generate different plans (with different Param - * collations) for different collation settings. - */ - Oid inputCollation; - - /* - * We include actual argument types in the hash key to support polymorphic - * PLpgSQL functions. Be careful that extra positions are zeroed! - */ - Oid argtypes[FUNC_MAX_ARGS]; + Oid funcOid; + + bool isTrigger; /* true if called as a trigger */ + + /* be careful that pad bytes in this struct get zeroed! */ + + /* + * For a trigger function, the OID of the trigger is part of the hash key + * --- we want to compile the trigger function separately for each trigger + * it is used with, in case the rowtype or transition table names are + * different. Zero if not called as a trigger. + */ + Oid trigOid; + + /* + * We must include the input collation as part of the hash key too, + * because we have to generate different plans (with different Param + * collations) for different collation settings. + */ + Oid inputCollation; + + /* + * We include actual argument types in the hash key to support polymorphic + * PLpgSQL functions. 
Be careful that extra positions are zeroed! + */ + Oid argtypes[FUNC_MAX_ARGS]; } PLpgSQL_func_hashkey; /* @@ -807,9 +807,9 @@ typedef struct PLpgSQL_func_hashkey */ typedef enum PLpgSQL_trigtype { - PLPGSQL_DML_TRIGGER, - PLPGSQL_EVENT_TRIGGER, - PLPGSQL_NOT_TRIGGER + PLPGSQL_DML_TRIGGER, + PLPGSQL_EVENT_TRIGGER, + PLPGSQL_NOT_TRIGGER } PLpgSQL_trigtype; /* @@ -817,62 +817,62 @@ typedef enum PLpgSQL_trigtype */ typedef struct PLpgSQL_function { - char *fn_signature; - Oid fn_oid; - TransactionId fn_xmin; - ItemPointerData fn_tid; - PLpgSQL_trigtype fn_is_trigger; - Oid fn_input_collation; - PLpgSQL_func_hashkey *fn_hashkey; /* back-link to hashtable key */ - MemoryContext fn_cxt; - - Oid fn_rettype; - int fn_rettyplen; - bool fn_retbyval; - bool fn_retistuple; - bool fn_retset; - bool fn_readonly; - - int fn_nargs; - int fn_argvarnos[FUNC_MAX_ARGS]; - int out_param_varno; - int found_varno; - int new_varno; - int old_varno; - int tg_name_varno; - int tg_when_varno; - int tg_level_varno; - int tg_op_varno; - int tg_relid_varno; - int tg_relname_varno; - int tg_table_name_varno; - int tg_table_schema_varno; - int tg_nargs_varno; - int tg_argv_varno; - - /* for event triggers */ - int tg_event_varno; - int tg_tag_varno; - - PLpgSQL_resolve_option resolve_option; - - bool print_strict_params; - - /* extra checks */ - int extra_warnings; - int extra_errors; - - /* the datums representing the function's local variables */ - int ndatums; - PLpgSQL_datum **datums; - Bitmapset *resettable_datums; /* dnos of non-simple vars */ - - /* function body parsetree */ - PLpgSQL_stmt_block *action; - - /* these fields change when the function is used */ - struct PLpgSQL_execstate *cur_estate; - unsigned long use_count; + char *fn_signature; + Oid fn_oid; + TransactionId fn_xmin; + ItemPointerData fn_tid; + PLpgSQL_trigtype fn_is_trigger; + Oid fn_input_collation; + PLpgSQL_func_hashkey *fn_hashkey; /* back-link to hashtable key */ + MemoryContext fn_cxt; + + Oid fn_rettype; + int fn_rettyplen; + bool fn_retbyval; + bool fn_retistuple; + bool fn_retset; + bool fn_readonly; + + int fn_nargs; + int fn_argvarnos[FUNC_MAX_ARGS]; + int out_param_varno; + int found_varno; + int new_varno; + int old_varno; + int tg_name_varno; + int tg_when_varno; + int tg_level_varno; + int tg_op_varno; + int tg_relid_varno; + int tg_relname_varno; + int tg_table_name_varno; + int tg_table_schema_varno; + int tg_nargs_varno; + int tg_argv_varno; + + /* for event triggers */ + int tg_event_varno; + int tg_tag_varno; + + PLpgSQL_resolve_option resolve_option; + + bool print_strict_params; + + /* extra checks */ + int extra_warnings; + int extra_errors; + + /* the datums representing the function's local variables */ + int ndatums; + PLpgSQL_datum **datums; + Bitmapset *resettable_datums; /* dnos of non-simple vars */ + + /* function body parsetree */ + PLpgSQL_stmt_block *action; + + /* these fields change when the function is used */ + struct PLpgSQL_execstate *cur_estate; + unsigned long use_count; } PLpgSQL_function; /* @@ -880,59 +880,60 @@ typedef struct PLpgSQL_function */ typedef struct PLpgSQL_execstate { - PLpgSQL_function *func; /* function being executed */ + PLpgSQL_function *func; /* function being executed */ - Datum retval; - bool retisnull; - Oid rettype; /* type of current retval */ + Datum retval; + bool retisnull; + Oid rettype; /* type of current retval */ - Oid fn_rettype; /* info about declared function rettype */ - bool retistuple; - bool retisset; + Oid fn_rettype; /* info about declared function rettype */ + 
bool retistuple; + bool retisset; - bool readonly_func; + bool readonly_func; - TupleDesc rettupdesc; - char *exitlabel; /* the "target" label of the current EXIT or - * CONTINUE stmt, if any */ - ErrorData *cur_error; /* current exception handler's error */ + TupleDesc rettupdesc; + char *exitlabel; /* the "target" label of the current EXIT or + * CONTINUE stmt, if any */ + ErrorData *cur_error; /* current exception handler's error */ - Tuplestorestate *tuple_store; /* SRFs accumulate results here */ - MemoryContext tuple_store_cxt; - ResourceOwner tuple_store_owner; - ReturnSetInfo *rsi; + Tuplestorestate *tuple_store; /* SRFs accumulate results here */ + MemoryContext tuple_store_cxt; + ResourceOwner tuple_store_owner; + ReturnSetInfo *rsi; - /* the datums representing the function's local variables */ - int found_varno; - int ndatums; - PLpgSQL_datum **datums; + /* the datums representing the function's local variables */ + int found_varno; + int ndatums; + PLpgSQL_datum **datums; - /* we pass datums[i] to the executor, when needed, in paramLI->params[i] */ - ParamListInfo paramLI; - bool params_dirty; /* T if any resettable datum has been passed */ + /* we pass datums[i] to the executor, when needed, in paramLI->params[i] */ + ParamListInfo paramLI; + bool params_dirty; /* T if any resettable datum has been passed */ - /* EState to use for "simple" expression evaluation */ - EState *simple_eval_estate; + /* EState to use for "simple" expression evaluation */ + EState *simple_eval_estate; - /* lookup table to use for executing type casts */ - HTAB *cast_hash; - MemoryContext cast_hash_context; + /* lookup table to use for executing type casts */ + HTAB *cast_hash; + MemoryContext cast_hash_context; - /* memory context for statement-lifespan temporary values */ - MemoryContext stmt_mcontext; /* current stmt context, or NULL if none */ - MemoryContext stmt_mcontext_parent; /* parent of current context */ + /* memory context for statement-lifespan temporary values */ + MemoryContext stmt_mcontext; /* current stmt context, or NULL if none */ + MemoryContext stmt_mcontext_parent; /* parent of current context */ - /* temporary state for results from evaluation of query or expr */ - SPITupleTable *eval_tuptable; - uint64 eval_processed; - Oid eval_lastoid; - ExprContext *eval_econtext; /* for executing simple expressions */ + /* temporary state for results from evaluation of query or expr */ + SPITupleTable *eval_tuptable; + uint64 eval_processed; + Oid eval_lastoid; + ExprContext *eval_econtext; /* for executing simple expressions */ - /* status information for error context reporting */ - PLpgSQL_stmt *err_stmt; /* current stmt */ - const char *err_text; /* additional state info */ + /* status information for error context reporting */ + PLpgSQL_stmt *err_stmt; /* current stmt */ + const char *err_text; /* additional state info */ - void *plugin_info; /* reserved for use by optional plugin */ + void *plugin_info; /* reserved for use by optional plugin */ + bool handle_exceptions; } PLpgSQL_execstate; /* @@ -967,17 +968,17 @@ typedef struct PLpgSQL_execstate */ typedef struct PLpgSQL_plugin { - /* Function pointers set up by the plugin */ - void (*func_setup) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*func_beg) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*func_end) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*stmt_beg) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); - void (*stmt_end) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); - - 
/* Function pointers set by PL/pgSQL itself */ - void (*error_callback) (void *arg); - void (*assign_expr) (PLpgSQL_execstate *estate, PLpgSQL_datum *target, - PLpgSQL_expr *expr); + /* Function pointers set up by the plugin */ + void (*func_setup) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*func_beg) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*func_end) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*stmt_beg) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); + void (*stmt_end) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); + + /* Function pointers set by PL/pgSQL itself */ + void (*error_callback) (void *arg); + void (*assign_expr) (PLpgSQL_execstate *estate, PLpgSQL_datum *target, + PLpgSQL_expr *expr); } PLpgSQL_plugin; /* @@ -986,21 +987,21 @@ typedef struct PLpgSQL_plugin typedef struct PLword { - char *ident; /* palloc'd converted identifier */ - bool quoted; /* Was it double-quoted? */ + char *ident; /* palloc'd converted identifier */ + bool quoted; /* Was it double-quoted? */ } PLword; typedef struct PLcword { - List *idents; /* composite identifiers (list of String) */ + List *idents; /* composite identifiers (list of String) */ } PLcword; typedef struct PLwdatum { - PLpgSQL_datum *datum; /* referenced variable */ - char *ident; /* valid if simple name */ - bool quoted; - List *idents; /* valid if composite name */ + PLpgSQL_datum *datum; /* referenced variable */ + char *ident; /* valid if simple name */ + bool quoted; + List *idents; /* valid if composite name */ } PLwdatum; /********************************************************************** @@ -1009,33 +1010,33 @@ typedef struct PLwdatum typedef enum { - IDENTIFIER_LOOKUP_NORMAL, /* normal processing of var names */ - IDENTIFIER_LOOKUP_DECLARE, /* In DECLARE --- don't look up names */ - IDENTIFIER_LOOKUP_EXPR /* In SQL expression --- special case */ + IDENTIFIER_LOOKUP_NORMAL, /* normal processing of var names */ + IDENTIFIER_LOOKUP_DECLARE, /* In DECLARE --- don't look up names */ + IDENTIFIER_LOOKUP_EXPR /* In SQL expression --- special case */ } IdentifierLookup; extern IdentifierLookup plpgsql_IdentifierLookup; -extern int plpgsql_variable_conflict; +extern int plpgsql_variable_conflict; extern bool plpgsql_print_strict_params; extern bool plpgsql_check_asserts; /* extra compile-time checks */ -#define PLPGSQL_XCHECK_NONE 0 -#define PLPGSQL_XCHECK_SHADOWVAR 1 -#define PLPGSQL_XCHECK_ALL ((int) ~0) +#define PLPGSQL_XCHECK_NONE 0 +#define PLPGSQL_XCHECK_SHADOWVAR 1 +#define PLPGSQL_XCHECK_ALL ((int) ~0) -extern int plpgsql_extra_warnings; -extern int plpgsql_extra_errors; +extern int plpgsql_extra_warnings; +extern int plpgsql_extra_errors; extern bool plpgsql_check_syntax; extern bool plpgsql_DumpExecTree; extern PLpgSQL_stmt_block *plpgsql_parse_result; -extern int plpgsql_nDatums; +extern int plpgsql_nDatums; extern PLpgSQL_datum **plpgsql_Datums; extern char *plpgsql_error_funcname; @@ -1053,32 +1054,32 @@ extern PLpgSQL_plugin **plpgsql_plugin_ptr; * Functions in pl_comp.c */ extern PLpgSQL_function *plpgsql_compile(FunctionCallInfo fcinfo, - bool forValidator); + bool forValidator); extern PLpgSQL_function *plpgsql_compile_inline(char *proc_source); extern void plpgsql_parser_setup(struct ParseState *pstate, - PLpgSQL_expr *expr); + PLpgSQL_expr *expr); extern bool plpgsql_parse_word(char *word1, const char *yytxt, - PLwdatum *wdatum, PLword *word); + PLwdatum *wdatum, PLword *word); extern bool plpgsql_parse_dblword(char *word1, char *word2, - PLwdatum *wdatum, 
PLcword *cword); + PLwdatum *wdatum, PLcword *cword); extern bool plpgsql_parse_tripword(char *word1, char *word2, char *word3, - PLwdatum *wdatum, PLcword *cword); + PLwdatum *wdatum, PLcword *cword); extern PLpgSQL_type *plpgsql_parse_wordtype(char *ident); extern PLpgSQL_type *plpgsql_parse_cwordtype(List *idents); extern PLpgSQL_type *plpgsql_parse_wordrowtype(char *ident); extern PLpgSQL_type *plpgsql_parse_cwordrowtype(List *idents); extern PLpgSQL_type *plpgsql_build_datatype(Oid typeOid, int32 typmod, - Oid collation); + Oid collation); extern PLpgSQL_variable *plpgsql_build_variable(const char *refname, int lineno, - PLpgSQL_type *dtype, - bool add2namespace); + PLpgSQL_type *dtype, + bool add2namespace); extern PLpgSQL_rec *plpgsql_build_record(const char *refname, int lineno, - bool add2namespace); + bool add2namespace); extern int plpgsql_recognize_err_condition(const char *condname, - bool allow_sqlstate); + bool allow_sqlstate); extern PLpgSQL_condition *plpgsql_parse_err_condition(char *condname); extern void plpgsql_adddatum(PLpgSQL_datum *new); -extern int plpgsql_add_initdatums(int **varnos); +extern int plpgsql_add_initdatums(int **varnos); extern void plpgsql_HashTableInit(void); /* @@ -1090,35 +1091,35 @@ extern void _PG_init(void); * Functions in pl_exec.c */ extern Datum plpgsql_exec_function(PLpgSQL_function *func, - FunctionCallInfo fcinfo, - EState *simple_eval_estate); + FunctionCallInfo fcinfo, + EState *simple_eval_estate); extern HeapTuple plpgsql_exec_trigger(PLpgSQL_function *func, - TriggerData *trigdata); + TriggerData *trigdata); extern void plpgsql_exec_event_trigger(PLpgSQL_function *func, - EventTriggerData *trigdata); + EventTriggerData *trigdata); extern void plpgsql_xact_cb(XactEvent event, void *arg); extern void plpgsql_subxact_cb(SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg); + SubTransactionId parentSubid, void *arg); extern Oid plpgsql_exec_get_datum_type(PLpgSQL_execstate *estate, - PLpgSQL_datum *datum); + PLpgSQL_datum *datum); extern void plpgsql_exec_get_datum_type_info(PLpgSQL_execstate *estate, - PLpgSQL_datum *datum, - Oid *typeid, int32 *typmod, Oid *collation); + PLpgSQL_datum *datum, + Oid *typeid, int32 *typmod, Oid *collation); /* * Functions for namespace handling in pl_funcs.c */ extern void plpgsql_ns_init(void); extern void plpgsql_ns_push(const char *label, - PLpgSQL_label_type label_type); + PLpgSQL_label_type label_type); extern void plpgsql_ns_pop(void); extern PLpgSQL_nsitem *plpgsql_ns_top(void); extern void plpgsql_ns_additem(PLpgSQL_nsitem_type itemtype, int itemno, const char *name); extern PLpgSQL_nsitem *plpgsql_ns_lookup(PLpgSQL_nsitem *ns_cur, bool localmode, - const char *name1, const char *name2, - const char *name3, int *names_used); + const char *name1, const char *name2, + const char *name3, int *names_used); extern PLpgSQL_nsitem *plpgsql_ns_lookup_label(PLpgSQL_nsitem *ns_cur, - const char *name); + const char *name); extern PLpgSQL_nsitem *plpgsql_ns_find_nearest_loop(PLpgSQL_nsitem *ns_cur); /* @@ -1132,25 +1133,25 @@ extern void plpgsql_dumptree(PLpgSQL_function *func); /* * Scanner functions in pl_scanner.c */ -extern int plpgsql_base_yylex(void); -extern int plpgsql_yylex(void); +extern int plpgsql_base_yylex(void); +extern int plpgsql_yylex(void); extern void plpgsql_push_back_token(int token); extern bool plpgsql_token_is_unreserved_keyword(int token); extern void plpgsql_append_source_text(StringInfo buf, - int startlocation, int endlocation); -extern int 
plpgsql_peek(void); + int startlocation, int endlocation); +extern int plpgsql_peek(void); extern void plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, - int *tok2_loc); -extern int plpgsql_scanner_errposition(int location); + int *tok2_loc); +extern int plpgsql_scanner_errposition(int location); extern void plpgsql_yyerror(const char *message) pg_attribute_noreturn(); -extern int plpgsql_location_to_lineno(int location); -extern int plpgsql_latest_lineno(void); +extern int plpgsql_location_to_lineno(int location); +extern int plpgsql_latest_lineno(void); extern void plpgsql_scanner_init(const char *str); extern void plpgsql_scanner_finish(void); /* * Externs in gram.y */ -extern int plpgsql_yyparse(void); +extern int plpgsql_yyparse(void); -#endif /* PLPGSQL_H */ +#endif /* PLPGSQL_H */ From 61a02c6fc16088ffaf2ac65934a38e78b2d035f5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 18 Feb 2022 14:41:31 +0800 Subject: [PATCH 487/578] Split the role settings in pg_stat_cluster_activity to prevent the executor from overwriting the results tapd: http://tapd.woa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696096872133 --- .../pg_stat_cluster_activity.c | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 8518ae8a..2b36fe39 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -321,18 +321,27 @@ pgcs_entry_initialize(void) * * Report common fileds of cluster backend status activity, * called by pgcs_report_query_activity and pgcs_report_activity. - * report role, sqname, also if this backend become consumer, remove - * previous planstate and cursor. * ---------- */ static void -pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) +pgcs_report_common(PgClusterStatus *entry) { strncpy((char *) entry->sessionid, PGXCSessionId, NAMEDATALEN); entry->sqdone = false; entry->valid = true; - +} + +/* ---------- + * pgcs_report_role + * + * Report role, sqname, also if this backend become consumer, remove + * previous planstate and cursor. 
+ * ---------- + */ +static void +pgcs_report_role(PgClusterStatus *entry, QueryDesc *desc) +{ /* fields need queryDesc */ if (IS_PGXC_DATANODE) { @@ -391,7 +400,7 @@ pgcs_report_query_activity(BackendState state, const char *cmd_str) pgcs_entry_initialize(); entry = MyCSEntry; - pgcs_report_common((PgClusterStatus *) entry, NULL); + pgcs_report_common((PgClusterStatus *) entry); if (prev_pgstat_report_hook) prev_pgstat_report_hook(state, cmd_str); @@ -468,7 +477,8 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) if (cursors != NULL && cursors->len > 0) memcpy((char *) entry->cursors, cursors->data, Min(cursors->len + 1, NAMEDATALEN * 64)); - pgcs_report_common((PgClusterStatus *) entry, desc); + pgcs_report_common((PgClusterStatus *) entry); + pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); } @@ -501,7 +511,8 @@ pgcs_report_activity(Portal portal) increment_changecount_before(entry); strncpy((char *) entry->portal, portal->name, NAMEDATALEN); - pgcs_report_common((PgClusterStatus *) entry, desc); + pgcs_report_common((PgClusterStatus *) entry); + pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); } From 0384d593f766a171c0a5391ca362ab3beffdae7c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 16 Jul 2021 17:01:22 +0800 Subject: [PATCH 488/578] Fix connection amplification due to remote scan of replicate table. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131089932161 (merge request !491) --- src/backend/optimizer/util/pathnode.c | 34 +++++++++++++-------------- src/test/regress/sql/sequence.sql | 7 ------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 032253ed..5c1a3ca5 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1657,23 +1657,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd == NULL && outerd == NULL) return NIL; #ifdef __TBASE__ - /* - * DML may need to push down to datanodes, for example: - * DELETE FROM - * geocode_settings as gc - * USING geocode_settings_default AS gf - * WHERE - * gf.name = gc.name and gf.setting = gc.setting; - * prefer_olap means pulling query up to coordinator node, in case data - * re-distribute in TPC-C test case. - * - * TODO: We need to automatically determine whether we need to pull it up, - * but not using GUC. - */ - if(!prefer_olap && false == dml) - { - goto pull_up; - } /* * If outer or inner subpaths are distributed by shard and they do not exist @@ -1802,6 +1785,23 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } + /* + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. 
+ */ + if(!prefer_olap && false == dml) + { + goto pull_up; + } restrictClauses = list_copy(pathnode->joinrestrictinfo); restrictClauses = list_concat(restrictClauses, diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index a0f8180d..67c91ef8 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -459,10 +459,3 @@ insert into t2(f2) values(5); insert into t3(f2) values(6); select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; \q -<<<<<<< HEAD -======= - - - - ->>>>>>> 85b5350be... fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) From aac34ee921b8e61dc55c57e31aed1853673ea7a6 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:24:49 +0800 Subject: [PATCH 489/578] Revert "fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 (merge request !938) " This reverts commit eb35163c5928605e9520c6fd7cba1fca1923d51a. --- src/backend/pgxc/pool/execRemote.c | 18 ++-------- src/backend/pgxc/pool/pgxcnode.c | 55 ++++++++---------------------- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 17 insertions(+), 58 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index dcb98c13..297f66bf 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3734,15 +3734,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, { for (i = 0; i < new_count; i++) { - if (pgxc_node_set_query(new_connections[i], init_str)) - { - /* - * print log here and return eof indicates execution failure - */ - elog(LOG, "pgxc_node_begin send %s to node %s, pid:%d failed", init_str, - new_connections[i]->nodename, new_connections[i]->backend_pid); - return EOF; - } + pgxc_node_set_query(new_connections[i], init_str); elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, new_connections[i]->nodename, new_connections[i]->backend_pid); } @@ -7059,13 +7051,7 @@ LeaderCnExecRemoteUtility(RemoteQuery *node, char *init_str = PGXCNodeGetSessionParamStr(); if (init_str) { - if (pgxc_node_set_query(leader_cn_conn, init_str)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - leader_cn_conn->nodename, leader_cn_conn->backend_pid))); - } + pgxc_node_set_query(leader_cn_conn, init_str); } SetPlpgsqlTransactionBegin(leader_cn_conn); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 173e06eb..890f9715 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -149,7 +149,7 @@ static bool DoRefreshRemoteHandles(void); #ifdef XCP static void pgxc_node_init(PGXCNodeHandle *handle, int sock, - bool global_session, int pid, bool is_reset_handle); + bool global_session, int pid); #else static void pgxc_node_init(PGXCNodeHandle *handle, int sock); #endif @@ -667,7 +667,7 @@ pgxc_node_all_free(void) * Structure stores state info and I/O buffers */ static void -pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, bool is_reset_handle) +pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) { char *init_str; @@ -701,20 +701,9 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, b if 
(global_session) { init_str = PGXCNodeGetSessionParamStr(); - if (init_str && pgxc_node_set_query(handle, init_str)) - { - if (is_reset_handle) - { - /* if it is a reset handle, do not throw error, just set handle as error state */ - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - elog(WARNING, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - handle->nodename, handle->backend_pid); - } - else + if (init_str) { - elog(ERROR, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - handle->nodename, handle->backend_pid); - } + pgxc_node_set_query(handle, init_str); } } @@ -1557,7 +1546,6 @@ release_handles(bool force) /* * Reset all Datanode and Coordinator connections occupied memory. - * TODO: fix implicit transaction do not commit on dn and remove reset_handles */ void reset_handles(void) @@ -1582,7 +1570,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } @@ -1592,7 +1580,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } @@ -1605,16 +1593,10 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } } - - if (validate_handles()) - { - elog(LOG, "found bad remote node connections, force release handles now"); - release_handles(true); - } } /* @@ -3791,7 +3773,7 @@ get_any_handle(List *datanodelist) node_handle = &dn_handles[node]; - pgxc_node_init(node_handle, fds[0], true, pids[0], false); + pgxc_node_init(node_handle, fds[0], true, pids[0]); datanode_count++; elog(DEBUG1, "Established a connection with datanode \"%s\"," @@ -4067,7 +4049,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); dn_handles[node] = *node_handle; datanode_count++; @@ -4132,7 +4114,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); co_handles[node] = *node_handle; coord_count++; @@ -5158,18 +5140,14 @@ PGXCNodeGetTransactionParamStr(void) /* * Send down specified query, read and discard all responses until ReadyForQuery */ -int +void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { if (pgxc_node_send_query(handle, set_query) != 0) { - /* - * print log only and decide whether to throw an error at the place where it is called - */ - ereport(LOG, + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send query %s",set_query))); - return EOF; } /* * Now read responses until ReadyForQuery. 
@@ -5210,11 +5188,8 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { PGXCNodeHandleError(handle, msg, msglen); PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - /* - * print log only and decide whether to throw an error at the place where it is called - */ - elog(LOG,"pgxc_node_set_query: %s",handle->error); - return EOF; + elog(ERROR,"pgxc_node_set_query: %s",handle->error); + break; } if (msgtype == 'Z') /* ReadyForQuery */ @@ -5225,8 +5200,6 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) break; } } - - return 0; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 71f8fa40..402fb28c 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -281,7 +281,7 @@ extern void PGXCNodeSetParam(bool local, const char *name, const char *value, extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); -extern int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); +extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); From d2a76cfe88f2ee50cebfd9836eaea800e72023fc Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:25:02 +0800 Subject: [PATCH 490/578] Revert "[BUGFIX] Subtransaction commits should not reset session information" This reverts commit 98634f5bec4daf68503344f3b251ee30b5cd9bbf. --- src/backend/pgxc/pool/execRemote.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 297f66bf..74c414a4 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4850,12 +4850,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); /* do not cleanup remote session for subtrans */ - if (!temp_object_included && need_release_handle) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); + if (need_release_handle) + { if (PersistentConnections) { reset_handles(); @@ -4872,6 +4874,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } } + } clear_handles(); } @@ -5960,12 +5963,13 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) * certain issues for aborted transactions, we drop the connections. * Revisit and fix the issue */ - if (!temp_object_included && need_release_handle) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - + if (need_release_handle) + { if (HaveActiveDatanodeStatements()) { reset_handles(); @@ -5975,6 +5979,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(handles); From 406e57884537dc721cd89516cf4720b681addca4 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:25:13 +0800 Subject: [PATCH 491/578] Revert "bugfix: prepare regress failed (merge request !440)" This reverts commit 56cd97f98d95100917ff7468cf3a6dee83a9d30a. 
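Note: the reverts and re-fixes in this stretch of the series (patches 489 through 496) all turn on how remote session state is handled when a plpgsql EXCEPTION block opens and then aborts a subtransaction. A minimal sketch of the scenario, reusing the t_abort table and TimeZone setting from the regression test that patch 495 adds (the DO block itself is illustrative and not part of any patch):

    SET timezone TO 'PRC';
    DO $$
    BEGIN
        BEGIN
            INSERT INTO t_abort VALUES (1);
            RAISE EXCEPTION 'force the inner block to abort';
        EXCEPTION WHEN OTHERS THEN
            NULL;   -- the subtransaction is rolled back here
        END;
        -- Session-level settings such as timezone should still read 'PRC' on the
        -- datanodes at this point; resetting them during the sub-commit/sub-abort
        -- is the behaviour these patches go back and forth on.
    END $$;

Whether pgxc_node_remote_cleanup_all() sends RESET ALL / RESET global_session at that point, and whether reset_handles() or release_handles() is used afterwards, is exactly what the hunks in patches 489 through 496 rearrange.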
--- src/backend/pgxc/pool/execRemote.c | 41 +++++------------------------- src/backend/pgxc/pool/pgxcnode.c | 6 +++++ 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 74c414a4..e5dfad24 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3778,6 +3778,12 @@ pgxc_node_remote_cleanup_all(bool sub) return; } + /* Do not cleanup connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* * Send down snapshot followed by DISCARD ALL command. */ @@ -4785,16 +4791,9 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); @@ -4864,17 +4863,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } } - } clear_handles(); } @@ -5102,17 +5094,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } } - } clear_handles(); #endif @@ -5970,16 +5955,9 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(handles); @@ -8955,16 +8933,9 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); reset_transaction_handles(); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 890f9715..17e271dd 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1563,6 +1563,12 @@ reset_handles(void) return; } + /* Do not reset connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* Reset Datanodes handles occupied memory */ for (i = 0; i < NumDataNodes; i++) { From 9da75bb3a9e3b3cdba4a4599eac47bce8422ea68 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 2 Jul 2021 11:47:08 +0800 Subject: [PATCH 492/578] fix: implicit transaction do not commit on dn --- src/backend/pgxc/pool/execRemote.c | 104 ++++++++++++++++------------- src/backend/pgxc/pool/pgxcnode.c | 47 ++++++++++++- src/backend/tcop/postgres.c | 5 ++ src/include/pgxc/pgxcnode.h | 2 + 4 files changed, 111 insertions(+), 47 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e5dfad24..0d1fa08e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4780,20 +4780,12 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) else elog(ERROR, "failed to PREPARE transaction on one or more nodes"); - if (!temp_object_included) + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(false); - - if (PersistentConnections) - { - reset_handles(); - } - else - { release_handles(false); } - } clear_handles(); @@ -4848,23 +4840,24 @@ pgxc_node_remote_commit(TranscationType 
txn_type, bool need_release_handle) stat_transaction(conn_count); - /* do not cleanup remote session for subtrans */ - if (!temp_object_included) + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - - if (need_release_handle) - { - if (PersistentConnections) - { - reset_handles(); + release_handles(false); + } } else { - release_handles(false); - } + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); } } @@ -5081,21 +5074,23 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); if (need_release_handle) { - if (PersistentConnections) + if (!temp_object_included && !PersistentConnections) { - reset_handles(); + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } } else { - release_handles(false); - } + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); } } @@ -5940,25 +5935,26 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - /* - * Drop the connections to ensure aborts are handled properly. - * - * XXX We should really be consulting PersistentConnections parameter and - * keep the connections if its set. But as a short term measure, to address - * certain issues for aborted transactions, we drop the connections. - * Revisit and fix the issue - */ + if (need_release_handle) + { if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { release_handles(false); } } - + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); + } + } clear_handles(); pfree_pgxc_all_handles(handles); @@ -7959,6 +7955,29 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_abort(txn_type, need_release_handle); + /* + * Drop the connections to ensure aborts are handled properly. + * + * XXX We should really be consulting PersistentConnections parameter and + * keep the connections if its set. But as a short term measure, to address + * certain issues for aborted transactions, we drop the connections. 
+ * Revisit and fix the issue + */ + elog(DEBUG5, "temp_object_included %d", temp_object_included); + /* cleanup and release handles is already done in pgxc_node_remote_abort */ +#if 0 + if (release_handle) + { + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + } + + clear_handles(); +#endif pfree_pgxc_all_handles(all_handles); if (log_gtm_stats) @@ -8923,19 +8942,12 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } #endif - if (!temp_object_included) + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(false); - if (PersistentConnections) - { - reset_handles(); - } - else - { release_handles(false); } - } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); reset_transaction_handles(); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 17e271dd..445a27c1 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2571,6 +2571,51 @@ pgxc_node_send_sync(PGXCNodeHandle * handle) return pgxc_node_flush(handle); } + +/* + * Send SYNC message down to the Datanode + */ +int +pgxc_node_send_my_sync(PGXCNodeHandle * handle) +{ + /* size */ + int msgLen = 4; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'L'; + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + handle->in_extended_query = false; + handle->needSync = false; + + msgLen = 4; + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'H'; + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + handle->in_extended_query = true; + + return pgxc_node_flush(handle); +} + #ifdef __SUBSCRIPTION__ /* * Send logical apply message down to the Datanode @@ -2633,7 +2678,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (fetch_size >= 0) if (pgxc_node_send_execute(handle, portal, fetch_size)) return EOF; - if (pgxc_node_send_flush(handle)) + if (pgxc_node_send_my_sync(handle)) return EOF; return 0; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7735fd18..01332ba6 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -629,6 +629,7 @@ SocketBackend(StringInfo inBuf) errmsg("invalid frontend message type %d", qtype))); break; + case 'L': case 'S': /* sync */ /* stop any active skip-till-Sync */ ignore_till_sync = false; @@ -5803,6 +5804,10 @@ PostgresMain(int argc, char *argv[], send_ready_for_query = true; break; + case 'L': /* sync */ + pq_getmsgend(&input_message); + finish_xact_command(); + break; #ifdef __TBASE__ case 'N': { diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 402fb28c..adbc8f6e 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -213,6 +213,8 @@ extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, i extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, const char *name); extern int pgxc_node_send_sync(PGXCNodeHandle * handle); +extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); 
+ #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif From 68a21af8b90493a83559c498d3834931c9ba234a Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Mon, 28 Jun 2021 16:05:55 +0800 Subject: [PATCH 493/578] perf: add parse_snapshot to decrease gtm request --- src/backend/tcop/postgres.c | 10 ++++++---- src/backend/utils/cache/plancache.c | 4 ++-- src/backend/utils/misc/guc.c | 9 +++++++++ src/include/utils/guc.h | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 01332ba6..b314aa97 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -211,6 +211,8 @@ static char *remotePrepareGID = NULL; /* for error code contrib */ bool g_is_in_init_phase = false; +bool g_parse_snapshot = true; + bool IsNormalPostgres = false; bool explain_stmt = false; @@ -1443,7 +1445,7 @@ exec_simple_query(const char *query_string) /* * Set up a snapshot if parse analysis/planning will need one. */ - if (analyze_requires_snapshot(parsetree)) + if (analyze_requires_snapshot(parsetree) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -1918,7 +1920,7 @@ exec_parse_message(const char *query_string, /* string to execute */ /* * Set up a snapshot if parse analysis will need one. */ - if (analyze_requires_snapshot(raw_parse_tree)) + if (analyze_requires_snapshot(raw_parse_tree) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -2477,9 +2479,9 @@ exec_bind_message(StringInfo input_message) * snapshot active till we're done, so that plancache.c doesn't have to * take new ones. */ - if (numParams > 0 || + if ((numParams > 0 || (psrc->raw_parse_tree && - analyze_requires_snapshot(psrc->raw_parse_tree))) + analyze_requires_snapshot(psrc->raw_parse_tree))) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 240a4f9d..1ad79655 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -991,9 +991,9 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * for planning. But if it isn't, and we need one, install one. 
*/ snapshot_set = false; - if (!ActiveSnapshotSet() && + if ((!ActiveSnapshotSet() && plansource->raw_parse_tree && - analyze_requires_snapshot(plansource->raw_parse_tree)) + analyze_requires_snapshot(plansource->raw_parse_tree)) && g_parse_snapshot) { PushActiveSnapshot(GetTransactionSnapshot()); snapshot_set = true; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8b9a9fe9..7539eb21 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2359,6 +2359,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, #endif + { + {"parse_snapshot", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("allow forced ddl of inconsistent metadata"), + NULL + }, + &g_parse_snapshot, + true, + NULL, NULL, NULL + }, #ifdef _SHARDING_ { {"allow_dml_on_datanode", PGC_USERSET, CUSTOM_OPTIONS, diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 95342821..179abb61 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -310,6 +310,7 @@ extern int tcp_keepalives_count; #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; +extern bool g_parse_snapshot; extern bool trace_extent; #endif From 3bbe2f1ac440045b08fb348a8550f8c56e2124a8 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 21 Jul 2021 17:16:49 +0800 Subject: [PATCH 494/578] fix comments of mr --- src/backend/pgxc/pool/execRemote.c | 14 ++------------ src/backend/tcop/postgres.c | 8 ++++---- src/backend/utils/cache/plancache.c | 2 +- src/backend/utils/misc/guc.c | 6 +++--- src/include/utils/guc.h | 2 +- 5 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 0d1fa08e..9e306cef 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4840,24 +4840,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (need_release_handle) - { if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) + if (need_release_handle) { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + release_handles(false); } } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b314aa97..126bae58 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -211,7 +211,7 @@ static char *remotePrepareGID = NULL; /* for error code contrib */ bool g_is_in_init_phase = false; -bool g_parse_snapshot = true; +bool g_snapshot_for_analyze = true; bool IsNormalPostgres = false; @@ -1445,7 +1445,7 @@ exec_simple_query(const char *query_string) /* * Set up a snapshot if parse analysis/planning will need one. */ - if (analyze_requires_snapshot(parsetree) && g_parse_snapshot) + if (analyze_requires_snapshot(parsetree) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -1920,7 +1920,7 @@ exec_parse_message(const char *query_string, /* string to execute */ /* * Set up a snapshot if parse analysis will need one. 
*/ - if (analyze_requires_snapshot(raw_parse_tree) && g_parse_snapshot) + if (analyze_requires_snapshot(raw_parse_tree) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -2481,7 +2481,7 @@ exec_bind_message(StringInfo input_message) */ if ((numParams > 0 || (psrc->raw_parse_tree && - analyze_requires_snapshot(psrc->raw_parse_tree))) && g_parse_snapshot) + analyze_requires_snapshot(psrc->raw_parse_tree))) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 1ad79655..b69aa7cb 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -993,7 +993,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, snapshot_set = false; if ((!ActiveSnapshotSet() && plansource->raw_parse_tree && - analyze_requires_snapshot(plansource->raw_parse_tree)) && g_parse_snapshot) + analyze_requires_snapshot(plansource->raw_parse_tree)) && g_snapshot_for_analyze) { PushActiveSnapshot(GetTransactionSnapshot()); snapshot_set = true; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7539eb21..27901832 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2360,11 +2360,11 @@ static struct config_bool ConfigureNamesBool[] = }, #endif { - {"parse_snapshot", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("allow forced ddl of inconsistent metadata"), + {"snapshot_for_analyze", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("enable/disable get snapshot for analyze and rewrite"), NULL }, - &g_parse_snapshot, + &g_snapshot_for_analyze, true, NULL, NULL, NULL }, diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 179abb61..2634e983 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -310,7 +310,7 @@ extern int tcp_keepalives_count; #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; -extern bool g_parse_snapshot; +extern bool g_snapshot_for_analyze; extern bool trace_extent; #endif From 82aa7606f8a69564a37e5805743d481a62621879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Fri, 20 Aug 2021 17:05:53 +0800 Subject: [PATCH 495/578] [BUGFIX] Subtransaction commits should not reset session information --- src/backend/pgxc/pool/execRemote.c | 23 +---- .../regress/expected/xc_create_function.out | 92 +++++++++++++++++++ src/test/regress/sql/xc_create_function.sql | 66 +++++++++++++ 3 files changed, 163 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 9e306cef..a2e403ce 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4840,16 +4840,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (!temp_object_included && !PersistentConnections) + /* do not cleanup remote session for subtrans */ + if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { release_handles(false); } - } clear_handles(); } @@ -5925,26 +5923,15 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - if (need_release_handle) - { - if (!temp_object_included) + /* do not 
cleanup remote session for subtrans */ + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); release_handles(false); } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); - } - } + clear_handles(); pfree_pgxc_all_handles(handles); diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out index ff83c0a7..a3520bca 100644 --- a/src/test/regress/expected/xc_create_function.out +++ b/src/test/regress/expected/xc_create_function.out @@ -175,3 +175,95 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; +-- subtransaction guc check +drop table if exists t_abort; +NOTICE: table "t_abort" does not exist, skipping +create table t_abort(a int); +insert into t_abort select generate_series(1,20); +select count(*) from t_abort; + count +------- + 20 +(1 row) + +Reset TimeZone; +show TimeZone; + TimeZone +---------- + PST8PDT +(1 row) + +set TimeZone to 'PRC'; +create or replace procedure subtransaction_guc_check() +as +$$ +declare + names refcursor; + results1 refcursor; + results2 refcursor; + results3 refcursor; + guc_result varchar; + node_names varchar; + node varchar :=''; + cmd1 varchar; + cmd2 varchar; +BEGIN + open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; + fetch names into node_names; + cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; + BEGIN + raise notice '%',cmd1; + open results1 for EXECUTE cmd1; + fetch results1 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results1; + end; + + BEGIN + raise notice '%',cmd1; + open results2 for EXECUTE cmd1; + fetch results2 into guc_result; + raise notice 'TimeZone = %',guc_result; + cmd2 := 'select a from t_abort'; + EXECUTE cmd2; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results2; + Rollback; + end; + + -- check twice, shoud be same. 
+ raise notice '%',cmd1; + open results3 for EXECUTE cmd1; + fetch results3 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results3; + close names; +end; +$$ +language plpgsql; +call subtransaction_guc_check(); +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +Show TimeZone; + TimeZone +---------- + PRC +(1 row) + +Reset TimeZone; +Show TimeZone; + TimeZone +---------- + PST8PDT +(1 row) + +drop table t_abort; diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql index 02f750ea..272b035c 100644 --- a/src/test/regress/sql/xc_create_function.sql +++ b/src/test/regress/sql/xc_create_function.sql @@ -180,3 +180,69 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; + +-- subtransaction guc check +drop table if exists t_abort; +create table t_abort(a int); +insert into t_abort select generate_series(1,20); +select count(*) from t_abort; +Reset TimeZone; +show TimeZone; +set TimeZone to 'PRC'; +create or replace procedure subtransaction_guc_check() +as +$$ +declare + names refcursor; + results1 refcursor; + results2 refcursor; + results3 refcursor; + guc_result varchar; + node_names varchar; + node varchar :=''; + cmd1 varchar; + cmd2 varchar; +BEGIN + open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; + fetch names into node_names; + cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; + BEGIN + raise notice '%',cmd1; + open results1 for EXECUTE cmd1; + fetch results1 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results1; + end; + + BEGIN + raise notice '%',cmd1; + open results2 for EXECUTE cmd1; + fetch results2 into guc_result; + raise notice 'TimeZone = %',guc_result; + cmd2 := 'select a from t_abort'; + EXECUTE cmd2; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results2; + Rollback; + end; + + -- check twice, shoud be same. 
+ raise notice '%',cmd1; + open results3 for EXECUTE cmd1; + fetch results3 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results3; + close names; +end; +$$ +language plpgsql; +call subtransaction_guc_check(); +Show TimeZone; +Reset TimeZone; +Show TimeZone; +drop table t_abort; \ No newline at end of file From 316e806d6e887e41984524cb44eed25879e22431 Mon Sep 17 00:00:00 2001 From: ningxpeng Date: Fri, 20 Aug 2021 20:22:22 +0800 Subject: [PATCH 496/578] [revert] Not reset global session info when subtrans end --- src/backend/pgxc/pool/execRemote.c | 33 +++++++----------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index a2e403ce..71e4c53b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3750,17 +3750,14 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * specific stuff before releasing them to pool for reuse by other sessions. */ static void -pgxc_node_remote_cleanup_all(bool sub) +pgxc_node_remote_cleanup_all(void) { PGXCNodeAllHandles *handles = get_current_handles(); PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; int new_conn_count = 0; int i; /* if it's called by sub-commit or sub-abort, DO NOT reset global_session */ - char *resetcmd = sub ? "RESET ALL;" - "RESET SESSION AUTHORIZATION;" - "RESET transaction_isolation;" : - "RESET ALL;" + char *resetcmd = "RESET ALL;" "RESET SESSION AUTHORIZATION;" "RESET transaction_isolation;" "RESET global_session"; @@ -4783,7 +4780,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(false); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -4840,12 +4837,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - /* do not cleanup remote session for subtrans */ if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -5062,25 +5057,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); - - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); release_handles(false); } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. 
*/ - pgxc_node_remote_cleanup_all(); - } - } clear_handles(); #endif @@ -5927,8 +5909,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -8922,7 +8903,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(false); + pgxc_node_remote_cleanup_all(); release_handles(false); } clear_handles(); From a45701511c3bc21f32c8c424cf89c87d35aed719 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 15:24:29 +0800 Subject: [PATCH 497/578] fix regrerss: remove the function not supported by v2 --- .../regress/expected/xc_create_function.out | 92 ------------------- src/test/regress/sql/xc_create_function.sql | 68 +------------- 2 files changed, 1 insertion(+), 159 deletions(-) diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out index a3520bca..ff83c0a7 100644 --- a/src/test/regress/expected/xc_create_function.out +++ b/src/test/regress/expected/xc_create_function.out @@ -175,95 +175,3 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; --- subtransaction guc check -drop table if exists t_abort; -NOTICE: table "t_abort" does not exist, skipping -create table t_abort(a int); -insert into t_abort select generate_series(1,20); -select count(*) from t_abort; - count -------- - 20 -(1 row) - -Reset TimeZone; -show TimeZone; - TimeZone ----------- - PST8PDT -(1 row) - -set TimeZone to 'PRC'; -create or replace procedure subtransaction_guc_check() -as -$$ -declare - names refcursor; - results1 refcursor; - results2 refcursor; - results3 refcursor; - guc_result varchar; - node_names varchar; - node varchar :=''; - cmd1 varchar; - cmd2 varchar; -BEGIN - open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; - fetch names into node_names; - cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; - BEGIN - raise notice '%',cmd1; - open results1 for EXECUTE cmd1; - fetch results1 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results1; - end; - - BEGIN - raise notice '%',cmd1; - open results2 for EXECUTE cmd1; - fetch results2 into guc_result; - raise notice 'TimeZone = %',guc_result; - cmd2 := 'select a from t_abort'; - EXECUTE cmd2; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results2; - Rollback; - end; - - -- check twice, shoud be same. 
- raise notice '%',cmd1; - open results3 for EXECUTE cmd1; - fetch results3 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results3; - close names; -end; -$$ -language plpgsql; -call subtransaction_guc_check(); -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -Show TimeZone; - TimeZone ----------- - PRC -(1 row) - -Reset TimeZone; -Show TimeZone; - TimeZone ----------- - PST8PDT -(1 row) - -drop table t_abort; diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql index 272b035c..7000b6ba 100644 --- a/src/test/regress/sql/xc_create_function.sql +++ b/src/test/regress/sql/xc_create_function.sql @@ -179,70 +179,4 @@ BEGIN node_name = get_xc_node_name(nodenum); str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; -END $D$ language plpgsql; - --- subtransaction guc check -drop table if exists t_abort; -create table t_abort(a int); -insert into t_abort select generate_series(1,20); -select count(*) from t_abort; -Reset TimeZone; -show TimeZone; -set TimeZone to 'PRC'; -create or replace procedure subtransaction_guc_check() -as -$$ -declare - names refcursor; - results1 refcursor; - results2 refcursor; - results3 refcursor; - guc_result varchar; - node_names varchar; - node varchar :=''; - cmd1 varchar; - cmd2 varchar; -BEGIN - open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; - fetch names into node_names; - cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; - BEGIN - raise notice '%',cmd1; - open results1 for EXECUTE cmd1; - fetch results1 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results1; - end; - - BEGIN - raise notice '%',cmd1; - open results2 for EXECUTE cmd1; - fetch results2 into guc_result; - raise notice 'TimeZone = %',guc_result; - cmd2 := 'select a from t_abort'; - EXECUTE cmd2; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results2; - Rollback; - end; - - -- check twice, shoud be same. 
- raise notice '%',cmd1; - open results3 for EXECUTE cmd1; - fetch results3 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results3; - close names; -end; -$$ -language plpgsql; -call subtransaction_guc_check(); -Show TimeZone; -Reset TimeZone; -Show TimeZone; -drop table t_abort; \ No newline at end of file +END $D$ language plpgsql; \ No newline at end of file From 617790375b27f995ce185f0e75395469ae038567 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 15:34:23 +0800 Subject: [PATCH 498/578] delete function reset_handles not used --- src/backend/pgxc/pool/pgxcnode.c | 61 -------------------------------- src/include/pgxc/pgxcnode.h | 1 - 2 files changed, 62 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 445a27c1..e4767e9d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1544,67 +1544,6 @@ release_handles(bool force) slavedatanode_count = 0; } -/* - * Reset all Datanode and Coordinator connections occupied memory. - */ -void -reset_handles(void) -{ - int i; - - /* don't reset connection if holding a cluster lock */ - if (cluster_ex_lock_held) - { - return; - } - - if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) - { - return; - } - - /* Do not reset connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - - /* Reset Datanodes handles occupied memory */ - for (i = 0; i < NumDataNodes; i++) - { - PGXCNodeHandle *handle = &dn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - - for (i = 0; i < NumSlaveDataNodes; i++) - { - PGXCNodeHandle *handle = &sdn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - - if (IS_PGXC_COORDINATOR) - { - /* Collect Coordinator handles */ - for (i = 0; i < NumCoords; i++) - { - PGXCNodeHandle *handle = &co_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - } -} - /* * Check whether there bad connections to remote nodes when abort transactions. 
*/ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index adbc8f6e..f0e7c269 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -192,7 +192,6 @@ extern void register_transaction_handles(PGXCNodeHandle* handle); extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); extern void release_handles(bool force); -extern void reset_handles(void); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, From b3be6035e2d1f03ae0a15697556937e550d01241 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 7 Apr 2022 22:56:04 +0800 Subject: [PATCH 499/578] Increase LOG2_NUM_LOCK_PARTITIONS to avoid heavy lock contention --- src/include/storage/lwlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 25ee91a8..ae936afc 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -177,7 +177,7 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; #define NUM_CACHE_2PC_PARTITIONS 128 /* Number of partitions the shared lock tables are divided into */ -#define LOG2_NUM_LOCK_PARTITIONS 4 +#define LOG2_NUM_LOCK_PARTITIONS 8 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) /* Number of partitions the shared predicate lock tables are divided into */ From fd9cce4a3178a342470cda6591ef3273bde44d46 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 7 Dec 2020 21:30:45 +0800 Subject: [PATCH 500/578] compute_hash: support custom types, fix http://tapd.oa.com/my_worktable?source_user=1001433276&workspace_id=20421696&workitem_type=bug&workitem_id=1020421696083670743#&filter_close=true --- src/backend/access/hash/hashfunc.c | 36 +++++ .../expected/redistribute_custom_types.out | 132 ++++++++++++++++++ .../regress/sql/redistribute_custom_types.sql | 65 +++++++++ 3 files changed, 233 insertions(+) create mode 100644 src/test/regress/expected/redistribute_custom_types.out create mode 100644 src/test/regress/sql/redistribute_custom_types.sql diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index f4959255..16fcbbfa 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -37,6 +37,10 @@ #include "utils/nabstime.h" #endif +#ifdef __TBASE__ +#include "utils/lsyscache.h" +#endif + /* * Datatype-specific hash functions. * @@ -300,6 +304,34 @@ hashvarlena(PG_FUNCTION_ARGS) return result; } +#ifdef __TBASE__ +static Datum +hashcustomtype(PG_FUNCTION_ARGS) +{ + Oid type = PG_GETARG_OID(0); + Datum value = PG_GETARG_DATUM(1); + Oid typsend; + bool typisvarlena; + bytea *outputbytes; + Datum result; + + /* + * Convert the column value to binary + */ + getTypeBinaryOutputInfo(type, &typsend, &typisvarlena); + + outputbytes = OidSendFunctionCall(typsend, value); + + /* + * Compute hash + */ + result = hash_any((unsigned char *) VARDATA(outputbytes), + VARSIZE(outputbytes) - VARHDRSZ); + + pfree(outputbytes); + return result; +} +#endif Datum hashvarlenaextended(PG_FUNCTION_ARGS) @@ -1043,6 +1075,10 @@ compute_hash(Oid type, Datum value, char locator) return DirectFunctionCall1(jsonb_hash, value); #endif default: +#ifdef __TBASE__ + if (locator == LOCATOR_TYPE_SHARD) + return DirectFunctionCall2(hashcustomtype, type, value); +#endif ereport(ERROR,(errmsg("Unhandled datatype:%d for modulo or hash distribution in compute_hash", type))); } /* Control should not come here.
*/ diff --git a/src/test/regress/expected/redistribute_custom_types.out b/src/test/regress/expected/redistribute_custom_types.out new file mode 100644 index 00000000..2ae77a18 --- /dev/null +++ b/src/test/regress/expected/redistribute_custom_types.out @@ -0,0 +1,132 @@ +-- +-- redistribute custom types +-- +-- enum type +drop table if exists enum_test; +NOTICE: table "enum_test" does not exist, skipping +drop type if exists enumtype; +NOTICE: type "enumtype" does not exist, skipping +create type enumtype AS enum ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'); +create table enum_test(a int, b enumtype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into enum_test(a,b) values(1,'Mon'); +insert into enum_test(a,b) values(2,'Tue'); +insert into enum_test(a,b) values(3,'Wed'); +insert into enum_test(a,b) values(4,'Thu'); +insert into enum_test(a,b) values(5,'Fri'); +insert into enum_test(a,b) values(6,'Sat'); +insert into enum_test(a,b) values(7,'Sun'); +explain select count(*) from enum_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) + -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + Distribute results by S: b + -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + Group Key: b + -> Seq Scan on enum_test (cost=0.00..26.88 rows=450 width=4) + Filter: (a < 100) +(9 rows) + +select count(*) from enum_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(7 rows) + +-- composite type +drop table if exists comptype_test; +NOTICE: table "comptype_test" does not exist, skipping +drop type if exists comptype; +NOTICE: type "comptype" does not exist, skipping +create type comptype as (f1 int, f2 int); +create table comptype_test(a int, b comptype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into comptype_test(a,b) values(1,(1,2)); +insert into comptype_test(a,b) values(2,(2,3)); +insert into comptype_test(a,b) values(3,(3,4)); +insert into comptype_test(a,b) values(4,(4,5)); +insert into comptype_test(a,b) values(5,(5,6)); +insert into comptype_test(a,b) values(6,(6,7)); +explain select count(*) from comptype_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=145.27..147.81 rows=169 width=40) + -> Finalize GroupAggregate (cost=145.27..147.81 rows=169 width=40) + Group Key: b + -> Sort (cost=145.27..145.70 rows=169 width=0) + Sort Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=134.18..139.02 rows=169 width=0) + Distribute results by S: b + -> Partial GroupAggregate (cost=34.18..38.17 rows=169 width=40) + Group Key: b + -> Sort (cost=34.18..34.95 rows=307 width=32) + Sort Key: b + -> Seq Scan on comptype_test (cost=0.00..21.50 rows=307 width=32) + Filter: (a < 100) +(13 rows) + +select count(*) from comptype_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 +(6 rows) + +-- domain type +drop table if exists domaintype_test; +NOTICE: table "domaintype_test" does not exist, skipping +drop domain if exists domaintype; +NOTICE: type "domaintype" does not exist, skipping +create domain domaintype as int check(value < 100); +create table domaintype_test(a int, b domaintype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into domaintype_test(a,b) values(1,1); +insert into domaintype_test(a,b) values(2,2); +insert into domaintype_test(a,b) values(3,3); +insert into domaintype_test(a,b) values(4,4); +insert into domaintype_test(a,b) values(5,5); +insert into domaintype_test(a,b) values(6,6); +explain select count(*) from domaintype_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) + -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + Distribute results by S: b + -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + Group Key: b + -> Seq Scan on domaintype_test (cost=0.00..26.88 rows=450 width=4) + Filter: (a < 100) +(9 rows) + +select count(*) from domaintype_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 +(6 rows) + +drop table enum_test; +drop table comptype_test; +drop table domaintype_test; +drop type enumtype; +drop type comptype; +drop type domaintype; diff --git a/src/test/regress/sql/redistribute_custom_types.sql b/src/test/regress/sql/redistribute_custom_types.sql new file mode 100644 index 00000000..ca392b2f --- /dev/null +++ b/src/test/regress/sql/redistribute_custom_types.sql @@ -0,0 +1,65 @@ +-- +-- redistribute custom types +-- + +-- enum type +drop table if exists enum_test; +drop type if exists enumtype; + +create type enumtype AS enum ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'); +create table enum_test(a int, b enumtype) distribute by shard(a); + +insert into enum_test(a,b) values(1,'Mon'); +insert into enum_test(a,b) values(2,'Tue'); +insert into enum_test(a,b) values(3,'Wed'); +insert into enum_test(a,b) 
values(4,'Thu'); +insert into enum_test(a,b) values(5,'Fri'); +insert into enum_test(a,b) values(6,'Sat'); +insert into enum_test(a,b) values(7,'Sun'); + +explain select count(*) from enum_test where a < 100 group by b; +select count(*) from enum_test where a < 100 group by b; + + +-- composite type +drop table if exists comptype_test; +drop type if exists comptype; + +create type comptype as (f1 int, f2 int); +create table comptype_test(a int, b comptype) distribute by shard(a); + +insert into comptype_test(a,b) values(1,(1,2)); +insert into comptype_test(a,b) values(2,(2,3)); +insert into comptype_test(a,b) values(3,(3,4)); +insert into comptype_test(a,b) values(4,(4,5)); +insert into comptype_test(a,b) values(5,(5,6)); +insert into comptype_test(a,b) values(6,(6,7)); + +explain select count(*) from comptype_test where a < 100 group by b; +select count(*) from comptype_test where a < 100 group by b; + + +-- domain type +drop table if exists domaintype_test; +drop domain if exists domaintype; + +create domain domaintype as int check(value < 100); +create table domaintype_test(a int, b domaintype) distribute by shard(a); + +insert into domaintype_test(a,b) values(1,1); +insert into domaintype_test(a,b) values(2,2); +insert into domaintype_test(a,b) values(3,3); +insert into domaintype_test(a,b) values(4,4); +insert into domaintype_test(a,b) values(5,5); +insert into domaintype_test(a,b) values(6,6); + +explain select count(*) from domaintype_test where a < 100 group by b; +select count(*) from domaintype_test where a < 100 group by b; + +drop table enum_test; +drop table comptype_test; +drop table domaintype_test; + +drop type enumtype; +drop type comptype; +drop type domaintype; \ No newline at end of file From e1bb8b4181aca79e3f2c95460e5007b3c4031568 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 5 Feb 2021 17:08:07 +0800 Subject: [PATCH 501/578] add nestloop regress test --- .../regress/expected/nestloop_by_shard.out | 343 ++++++++++++++++++ src/test/regress/sql/nestloop_by_shard.sql | 191 ++++++++++ 2 files changed, 534 insertions(+) create mode 100644 src/test/regress/expected/nestloop_by_shard.out create mode 100644 src/test/regress/sql/nestloop_by_shard.sql diff --git a/src/test/regress/expected/nestloop_by_shard.out b/src/test/regress/expected/nestloop_by_shard.out new file mode 100644 index 00000000..da851318 --- /dev/null +++ b/src/test/regress/expected/nestloop_by_shard.out @@ -0,0 +1,343 @@ +-- test nestloop by shard +drop table if exists int8_tbl_s; +NOTICE: table "int8_tbl_s" does not exist, skipping +drop table if exists int4_tbl_s; +NOTICE: table "int4_tbl_s" does not exist, skipping +drop table if exists tenk1_s; +NOTICE: table "tenk1_s" does not exist, skipping +drop table if exists onek_s; +NOTICE: table "onek_s" does not exist, skipping +CREATE TABLE int8_tbl_s(q1 int8, q2 int8) distribute by shard(q1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +INSERT INTO int8_tbl_s VALUES('123','456'); +INSERT INTO int8_tbl_s VALUES('123','4567890123456789'); +INSERT INTO int8_tbl_s VALUES('4567890123456789','123'); +INSERT INTO int8_tbl_s VALUES(+4567890123456789,'4567890123456789'); +INSERT INTO int8_tbl_s VALUES('+4567890123456789','-4567890123456789'); +CREATE TABLE int4_tbl_s(f1 int4) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+INSERT INTO int4_tbl_s(f1) VALUES ('0'); +INSERT INTO int4_tbl_s(f1) VALUES ('123456'); +INSERT INTO int4_tbl_s(f1) VALUES ('-123456'); +CREATE TABLE tenk1_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE INDEX unique1_s ON tenk1_s USING btree(unique1 int4_ops); +CREATE INDEX unique2_s ON tenk1_s USING btree(unique2 int4_ops); +CREATE INDEX hundred_s ON tenk1_s USING btree(hundred int4_ops); +CREATE INDEX thous_tenthous_s ON tenk1_s (thousand, tenthous); +COPY tenk1_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/tenk.data'; +CREATE TABLE onek_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE INDEX onek_unique1_s ON onek_s USING btree(unique1 int4_ops); +CREATE INDEX onek_unique2_s ON onek_s USING btree(unique2 int4_ops); +CREATE INDEX onek_hundred_s ON onek_s USING btree(hundred int4_ops); +CREATE INDEX onek_stringu1_s ON onek_s USING btree(stringu1 name_ops); +COPY onek_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/onek.data'; +set enable_hashjoin=off; +set enable_mergejoin=off; +set enable_nestloop=on; +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Seq Scan on tenk1_s t1 + Filter: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Nested Loop + Join Filter: (t1.ten = t3.ten) + -> Remote Subquery Scan on all + Distribute results by S: thousand + -> Index Scan using hundred_s on tenk1_s t2 + Index Cond: (t1.hundred = hundred) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Index Scan using unique2_s on tenk1_s t3 + Index Cond: (unique2 = t2.thousand) +(20 rows) + +--select * from tenk1_s t1 left join +-- (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) +-- on t1.hundred = t2.hundred and t1.ten = t3.ten +--where t1.unique1 = 1; +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Seq Scan on tenk1_s t1 + Filter: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Nested Loop + Join Filter: ((t1.ten + t2.ten) = t3.ten) + -> Remote Subquery Scan on all + 
Distribute results by S: thousand + -> Index Scan using hundred_s on tenk1_s t2 + Index Cond: (t1.hundred = hundred) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Index Scan using unique2_s on tenk1_s t3 + Index Cond: (unique2 = t2.thousand) +(20 rows) + +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 1 | 2838 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | EFEAAA | OOOOxx | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +(1 row) + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (int8_tbl_s.q2 = tenk1_s.unique2) + Filter: ((COALESCE(tenk1_s.unique1, '-1'::integer) + int8_tbl_s.q1) = 122) + -> Seq Scan on int8_tbl_s + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on tenk1_s +(8 rows) + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + unique1 | q1 | fault +---------+-----+------- + | 123 | 122 +(1 row) + +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.q1 = b.unique2) + Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) + -> Seq Scan on int8_tbl_s a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: COALESCE(thousand, 123) + -> Seq Scan on tenk1_s b +(9 rows) + + +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + q1 | unique2 | thousand | hundred +----+---------+----------+--------- +(0 rows) + + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + QUERY 
PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.f1 = b.unique2) + Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) + -> Seq Scan on int4_tbl_s a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Seq Scan on tenk1_s b +(9 rows) + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + f1 | unique2 | case +----+---------+------ + 0 | 0 | 0 +(1 row) + + +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop Left Join + Output: "*VALUES*".column1, i1.f1, (666) + Join Filter: ("*VALUES*".column1 = i1.f1) + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize + Output: i1.f1, (666) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Nested Loop Left Join + Output: i1.f1, 666 + Join Filter: (i1.f1 = i2.unique2) + -> Seq Scan on public.int4_tbl_s i1 + Output: i1.f1 + -> Materialize + Output: i2.unique2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i2.unique2 + Distribute results by S: unique2 + -> Seq Scan on public.tenk1_s i2 + Output: i2.unique2 +(21 rows) + + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + foo1_id | foo3_id | bug_field +---------+---------+----------- + 0 | 0 | 666 + 1 | | +(2 rows) + +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek_s t1, tenk1_s t2 +where exists (select 1 from tenk1_s t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + Join Filter: (t3.tenthous = t2.hundred) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by S: hundred + -> Seq Scan on public.tenk1_s t2 + Output: t2.hundred + -> Materialize + Output: t1.unique1, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t3.tenthous + Distribute results by S: tenthous + -> Nested Loop + Output: t1.unique1, t3.tenthous + Join Filter: (t1.unique1 = t3.thousand) + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.unique1, 
t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Distribute results by S: thousand + -> HashAggregate + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Group Key: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1_s t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + -> Materialize + Output: t1.unique1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1 + -> Seq Scan on public.onek_s t1 + Output: t1.unique1 + Filter: (t1.unique1 < 1) +(36 rows) + +--select t1.unique1, t2.hundred +--from onek_s t1, tenk1_s t2 +--where exists (select 1 from tenk1_s t3 +-- where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) +-- and t1.unique1 < 1; + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; +drop table int8_tbl_s; +drop table int4_tbl_s; +drop table tenk1_s; +drop table onek_s; diff --git a/src/test/regress/sql/nestloop_by_shard.sql b/src/test/regress/sql/nestloop_by_shard.sql new file mode 100644 index 00000000..eb7868b8 --- /dev/null +++ b/src/test/regress/sql/nestloop_by_shard.sql @@ -0,0 +1,191 @@ + +-- test nestloop by shard + +drop table if exists int8_tbl_s; +drop table if exists int4_tbl_s; +drop table if exists tenk1_s; +drop table if exists onek_s; + +CREATE TABLE int8_tbl_s(q1 int8, q2 int8) distribute by shard(q1); +INSERT INTO int8_tbl_s VALUES('123','456'); +INSERT INTO int8_tbl_s VALUES('123','4567890123456789'); +INSERT INTO int8_tbl_s VALUES('4567890123456789','123'); +INSERT INTO int8_tbl_s VALUES(+4567890123456789,'4567890123456789'); +INSERT INTO int8_tbl_s VALUES('+4567890123456789','-4567890123456789'); + +CREATE TABLE int4_tbl_s(f1 int4) distribute by shard(f1); +INSERT INTO int4_tbl_s(f1) VALUES ('0'); +INSERT INTO int4_tbl_s(f1) VALUES ('123456'); +INSERT INTO int4_tbl_s(f1) VALUES ('-123456'); + + +CREATE TABLE tenk1_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); + +CREATE INDEX unique1_s ON tenk1_s USING btree(unique1 int4_ops); +CREATE INDEX unique2_s ON tenk1_s USING btree(unique2 int4_ops); +CREATE INDEX hundred_s ON tenk1_s USING btree(hundred int4_ops); +CREATE INDEX thous_tenthous_s ON tenk1_s (thousand, tenthous); + +COPY tenk1_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/tenk.data'; + +CREATE TABLE onek_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); + + +CREATE INDEX onek_unique1_s ON onek_s USING btree(unique1 int4_ops); +CREATE INDEX onek_unique2_s ON onek_s USING btree(unique2 int4_ops); +CREATE INDEX onek_hundred_s ON onek_s USING btree(hundred int4_ops); +CREATE INDEX onek_stringu1_s ON onek_s USING btree(stringu1 name_ops); + +COPY onek_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/onek.data'; + + +set 
enable_hashjoin=off; +set enable_mergejoin=off; +set enable_nestloop=on; + + +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + +--select * from tenk1_s t1 left join +-- (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) +-- on t1.hundred = t2.hundred and t1.ten = t3.ten +--where t1.unique1 = 1; + +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + + +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + + +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + + +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek_s t1, tenk1_s t2 +where exists (select 1 from tenk1_s t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + +--select t1.unique1, t2.hundred +--from onek_s t1, tenk1_s t2 +--where exists (select 1 from tenk1_s t3 +-- where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) +-- and t1.unique1 < 1; + + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; + +drop table int8_tbl_s; +drop table int4_tbl_s; +drop table tenk1_s; +drop table onek_s; From 7e47ee022aad2b981008826aa56b1138a0711563 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 
Jul 2022 17:38:41 +0800 Subject: [PATCH 502/578] fix regress error --- src/backend/access/transam/gtm.c | 32 ++++ src/backend/pgxc/pool/poolmgr.c | 60 ------ src/include/access/xlog.h | 1 - src/test/regress/expected/aggregates_1.out | 56 +++--- src/test/regress/expected/alter_generic.out | 4 +- src/test/regress/expected/alter_table.out | 19 ++ src/test/regress/expected/foreign_data.out | 162 ++++++++--------- src/test/regress/expected/insert.out | 37 ++++ src/test/regress/expected/join_3.out | 100 +++++----- .../regress/expected/partition_join_2.out | 172 ++++++++---------- src/test/regress/expected/sanity_check.out | 1 + src/test/regress/expected/sequence.out | 2 +- src/test/regress/expected/stats_ext_2.out | 32 ++-- src/test/regress/expected/sysviews.out | 2 +- src/test/regress/expected/tbase_explain.out | 10 +- 15 files changed, 350 insertions(+), 340 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index ded4fdfb..474610be 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -24,6 +24,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" +#include "postmaster/postmaster.h" #include "storage/backendid.h" #include "tcop/tcopprot.h" #include "utils/guc.h" @@ -1239,6 +1240,37 @@ ResetGTMConnection(void) InitGTM(); } +#ifdef HAVE_UNIX_SOCKETS +/* + * gtm_unix_socket_file_exists() + * + * Checks whether the gtm unix domain socket file exists. + */ +static bool +gtm_unix_socket_file_exists(void) +{ + char path[MAXGTMPATH]; + char lockfile[MAXPGPATH]; + int fd; + + UNIXSOCK_PATH(path, GtmPort, gtm_unix_socket_directory); + snprintf(lockfile, sizeof(lockfile), "%s.lock", path); + + if ((fd = open(lockfile, O_RDONLY, 0)) < 0) + { + /* ENOTDIR means we will throw a more useful error later */ + if (errno != ENOENT && errno != ENOTDIR) + elog(LOG, "could not open file \"%s\" for reading: %s\n", + lockfile, strerror(errno)); + + return false; + } + + close(fd); + return true; +} +#endif + void InitGTM(void) {// #lizard forgives diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 6e4f3283..756b2198 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -10927,66 +10927,6 @@ handle_session_command(PoolAgent * agent, StringInfo s) } } -static bool -remove_all_agent_references(Oid nodeoid) -{ - int i, j, index; - bool res = true; - - /* - * Identify if it's a coordinator or datanode first - * and get its index - */ - for (i = 0; i < agentCount; i++) - { - bool found = false; - PoolAgent *agent; - - index = agentIndexes[i]; - agent = poolAgents[index]; - - for (j = 0; j < agent->num_dn_connections; j++) - { - if (agent->dn_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->dn_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); - agent->dn_connections[j] = NULL; - } - else - { - for (j = 0; j < agent->num_coord_connections; j++) - { - if (agent->coord_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->coord_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); - agent->coord_connections[j] = NULL; - } - else - { - elog(LOG, "Node not found! 
(%u)", nodeoid); - res = false; - } - } - } - return res; -} - /* * refresh_database_pools * refresh information for all database pools diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 018cecd9..a0db442b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -244,7 +244,6 @@ extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); -extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index 2bfcbb7f..89f1e157 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -2000,31 +2000,31 @@ select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); (1 row) rollback; - -- test coverage for aggregate combine/serial/deserial functions - BEGIN ISOLATION LEVEL REPEATABLE READ; - SET parallel_setup_cost = 0; - SET parallel_tuple_cost = 0; - SET min_parallel_table_scan_size = 0; - SET max_parallel_workers_per_gather = 4; - SET enable_indexonlyscan = off; - -- variance(int4) covers numeric_poly_combine - -- sum(int8) covers int8_avg_combine - EXPLAIN (COSTS OFF) - SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; - QUERY PLAN - -------------------------------------------------------------------- - Parallel Finalize Aggregate - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 - -> Partial Aggregate - -> Parallel Seq Scan on tenk1 - (6 rows) - - SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; - variance | sum - ----------------------+---------- - 8334166.666666666667 | 49995000 - (1 row) - - ROLLBACK; \ No newline at end of file +-- test coverage for aggregate combine/serial/deserial functions +BEGIN ISOLATION LEVEL REPEATABLE READ; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 4; +SET enable_indexonlyscan = off; +-- variance(int4) covers numeric_poly_combine +-- sum(int8) covers int8_avg_combine +EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Seq Scan on tenk1 +(6 rows) + +SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + variance | sum +----------------------+---------- + 8334166.666666666667 | 49995000 +(1 row) + +ROLLBACK; diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 788c5964..767c09be 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -159,14 +159,14 @@ ALTER SERVER alt_fserv1 RENAME TO alt_fserv2; -- failed (name conflict) ERROR: server "alt_fserv2" already exists ALTER SERVER alt_fserv1 RENAME TO alt_fserv3; -- OK SELECT fdwname FROM pg_foreign_data_wrapper WHERE fdwname like 'alt_fdw%'; - fdwname + fdwname ---------- alt_fdw2 alt_fdw3 (2 rows) SELECT srvname FROM pg_foreign_server WHERE srvname like 'alt_fserv%'; - srvname + 
srvname ------------ alt_fserv2 alt_fserv3 diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 50f89f13..a1ef7dc2 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3812,3 +3812,22 @@ alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +NOTICE: this is a test +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 26e01f5f..9aa94459 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -18,7 +18,7 @@ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; -- At this point we should have 2 built-in wrappers and no servers. SELECT fdwname, fdwhandler::regproc, fdwvalidator::regproc, fdwoptions FROM pg_foreign_data_wrapper ORDER BY 1, 2, 3; - fdwname | fdwhandler | fdwvalidator | fdwoptions + fdwname | fdwhandler | fdwvalidator | fdwoptions ------------+------------+--------------------------+------------ dummy | - | - | postgresql | - | postgresql_fdw_validator | @@ -39,8 +39,8 @@ CREATE FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR ERROR: function bar(text[], oid) does not exist CREATE FOREIGN DATA WRAPPER foo; \dew - List of foreign-data wrappers - Name | Owner | Handler | Validator + List of foreign-data wrappers + Name | Owner | Handler | Validator ------------+---------------------------+---------+-------------------------- dummy | regress_foreign_data_user | - | - foo | regress_foreign_data_user | - | - @@ -52,8 +52,8 @@ ERROR: foreign-data wrapper "foo" already exists DROP FOREIGN DATA WRAPPER foo; CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+---------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (testing '1') | @@ -65,8 +65,8 @@ CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', testing '2'); -- ERROR ERROR: option "testing" provided more than once CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', another '2'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | 
Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (testing '1', another '2') | @@ -81,8 +81,8 @@ HINT: Must be superuser to create a foreign-data wrapper. RESET ROLE; CREATE FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -98,8 +98,8 @@ ALTER FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR ERROR: function bar(text[], oid) does not exist ALTER FOREIGN DATA WRAPPER foo NO VALIDATOR; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | | @@ -113,8 +113,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP c); -- ERROR ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (a '1', b '2') | @@ -123,8 +123,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP a, SET b '3', ADD c '4'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4') | @@ -135,8 +135,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '2'); ALTER FOREIGN DATA WRAPPER foo OPTIONS (b '4'); -- ERROR ERROR: option "b" provided more than once \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-----------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2') | 
@@ -150,8 +150,8 @@ HINT: Must be superuser to alter a foreign-data wrapper. SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2', d '5') | @@ -169,8 +169,8 @@ ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. RESET ROLE; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -179,8 +179,8 @@ RESET ROLE; ALTER FOREIGN DATA WRAPPER foo RENAME TO foo1; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo1 | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -194,8 +194,8 @@ ERROR: foreign-data wrapper "nonexistent" does not exist DROP FOREIGN DATA WRAPPER IF EXISTS nonexistent; NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -210,8 +210,8 @@ DROP FOREIGN DATA WRAPPER foo; RESET ROLE; DROP ROLE regress_test_role_super; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -226,8 +226,8 @@ ERROR: user mapping for "regress_foreign_data_user" already exists for server s CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping \dew+ - List of foreign-data wrappers - Name | Owner | 
Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | | @@ -235,15 +235,15 @@ NOTICE: user mapping for "regress_foreign_data_user" already exists for server (3 rows) \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+------+---------+-------------+---------------- s1 | regress_foreign_data_user | foo | | | | | foreign server (1 row) \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------- s1 | regress_foreign_data_user | (1 row) @@ -262,8 +262,8 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to server s1 drop cascades to user mapping for regress_foreign_data_user on server s1 \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -301,8 +301,8 @@ ERROR: invalid option "foo" HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (host 'localhost', dbname 's8db'); \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -323,8 +323,8 @@ SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; RESET ROLE; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -347,8 +347,8 @@ GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | 
Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -377,8 +377,8 @@ ALTER SERVER s3 OPTIONS ("tns name" 'orcl', port '1521'); GRANT USAGE ON FOREIGN SERVER s1 TO regress_test_role; GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role2 WITH GRANT OPTION; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 1.0 | (servername 's1') | | | | regress_test_role=U/regress_foreign_data_user | | | | @@ -428,8 +428,8 @@ ERROR: role "regress_test_indirect" cannot be dropped because some objects depe DETAIL: owner of server s1 privileges for foreign-data wrapper foo \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | @@ -446,8 +446,8 @@ privileges for foreign-data wrapper foo ALTER SERVER s8 RENAME to s8new; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | @@ -469,8 +469,8 @@ ERROR: server "nonexistent" does not exist DROP SERVER IF EXISTS nonexistent; NOTICE: server "nonexistent" does not exist, skipping \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s1 | regress_test_indirect | foo s2 | regress_foreign_data_user | foo @@ -490,8 +490,8 @@ ERROR: must be owner of foreign server s2 DROP SERVER s1; RESET ROLE; \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s2 | regress_foreign_data_user | foo s3 | 
regress_foreign_data_user | foo @@ -509,8 +509,8 @@ SET ROLE regress_test_role; DROP SERVER s2; RESET ROLE; \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s3 | regress_foreign_data_user | foo s4 | regress_foreign_data_user | foo @@ -524,8 +524,8 @@ RESET ROLE; CREATE USER MAPPING FOR current_user SERVER s3; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s3 | regress_foreign_data_user (1 row) @@ -537,8 +537,8 @@ HINT: Use DROP ... CASCADE to drop the dependent objects too. DROP SERVER s3 CASCADE; NOTICE: drop cascades to user mapping for regress_foreign_data_user on server s3 \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s4 | regress_foreign_data_user | foo s5 | regress_foreign_data_user | foo @@ -584,8 +584,8 @@ CREATE USER MAPPING FOR current_user SERVER t1 OPTIONS (username 'bob', password CREATE USER MAPPING FOR public SERVER t1; RESET ROLE; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s4 | public s4 | regress_foreign_data_user @@ -614,8 +614,8 @@ ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR public SERVER t1 OPTIONS (ADD modified '1'); RESET ROLE; \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+---------------------------------- s4 | public | ("this mapping" 'is public') s4 | regress_foreign_data_user | @@ -646,8 +646,8 @@ ERROR: must be owner of foreign server s8 RESET ROLE; DROP SERVER s7; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s4 | public s4 | regress_foreign_data_user @@ -721,8 +721,8 @@ Server: s0 FDW options: (delimiter ',', quote '"', "be quoted" 'value') \det+ - List of foreign tables - Schema | Table | Server | FDW options | Description + List of foreign tables + Schema | Table | Server | FDW options | Description --------+-------+--------+-------------------------------------------------+------------- public | ft1 | s0 | (delimiter ',', quote '"', "be quoted" 'value') | ft1 (1 row) @@ -877,7 +877,7 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME TO foreign_table_1; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping -- Information schema SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; - foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language + foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language ------------------------------+---------------------------+---------------------------+--------------+------------------------------- regression | dummy | regress_foreign_data_user | | c regression | foo | regress_foreign_data_user | | c @@ -885,13 +885,13 @@ SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; (3 rows) SELECT * FROM information_schema.foreign_data_wrapper_options ORDER BY 1, 2, 3; - foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value + 
foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value ------------------------------+---------------------------+--------------+-------------- regression | foo | test wrapper | true (1 row) SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; - foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier + foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier ------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+--------------------------- regression | s0 | regression | dummy | | | regress_foreign_data_user regression | s4 | regression | foo | oracle | | regress_foreign_data_user @@ -903,7 +903,7 @@ SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; (7 rows) SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; - foreign_server_catalog | foreign_server_name | option_name | option_value + foreign_server_catalog | foreign_server_name | option_name | option_value ------------------------+---------------------+-----------------+-------------- regression | s4 | dbname | b regression | s4 | host | a @@ -914,7 +914,7 @@ SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; (6 rows) SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_identifier), 2, 3; - authorization_identifier | foreign_server_catalog | foreign_server_name + authorization_identifier | foreign_server_catalog | foreign_server_name ---------------------------+------------------------+--------------------- PUBLIC | regression | s4 PUBLIC | regression | s8 @@ -927,7 +927,7 @@ SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_iden (8 rows) SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorization_identifier), 2, 3, 4; - authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value + authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value ---------------------------+------------------------+---------------------+--------------+-------------- PUBLIC | regression | s4 | this mapping | is public PUBLIC | regression | t1 | modified | 1 @@ -939,7 +939,7 @@ SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorizati (7 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO @@ -948,7 +948,7 @@ SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIG (4 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 
'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO @@ -982,7 +982,7 @@ SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; (5 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES @@ -990,7 +990,7 @@ SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIG (3 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES @@ -1208,8 +1208,8 @@ CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret'); CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secret'); -- owner of server can see some option fields \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------------- s10 | public | ("user" 'secret') s10 | regress_unprivileged_role | @@ -1225,8 +1225,8 @@ CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secr RESET ROLE; -- superuser can see all option fields \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+--------------------- s10 | public | ("user" 'secret') s10 | regress_unprivileged_role | ("user" 'secret') @@ -1242,8 +1242,8 @@ RESET ROLE; -- unprivileged user cannot see any option field SET ROLE regress_unprivileged_role; \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------- s10 | public | s10 | 
regress_unprivileged_role | diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 97d3c276..7aa37928 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -986,6 +986,43 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +NOTICE: table "t2" does not exist, skipping +drop table if exists t2_rep; +NOTICE: table "t2_rep" does not exist, skipping +drop table if exists t2_new; +NOTICE: table "t2_new" does not exist, skipping +create table t2(f1 int,f2 int); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; + count +------- + 2 +(1 row) + +select count(*) from t2; + count +------- + 2 +(1 row) + +create table t2_new as select * from t2_rep; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +select count(*) from t2_new; + count +------- + 2 +(1 row) + +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; -- test insert with returning in JDBC drop table if exists insertwithret; NOTICE: table "insertwithret" does not exist, skipping diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 16264c50..1225d2ce 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -4614,8 +4614,8 @@ select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; ------------------+-------------------+------------------- 123 | 456 | 456 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | -4567890123456789 (5 rows) @@ -4624,8 +4624,8 @@ select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from i ------------------+-------------------+------------------ 123 | 456 | 123 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | 4567890123456789 (5 rows) @@ -4929,13 +4929,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 
4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4946,13 +4946,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4965,11 +4965,11 @@ select x.* from 123 | 4567890123456789 123 | 4567890123456789 123 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 - 4567890123456789 | 123 4567890123456789 | -4567890123456789 (10 rows) @@ -5086,14 +5086,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 
+ 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5107,14 +5107,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5284,6 +5284,16 @@ select * from 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 @@ -5299,16 +5309,6 @@ select * from 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (42 rows) @@ -5523,8 +5523,8 @@ lateral (select * from int8_tbl t1, where t1.q1 = ss.q2) ss0; id | q1 | q2 | q1 | q2 ----+------------------+-------------------+------------------+------------------ - 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 4567890123456789 
| 4567890123456789 | 4567890123456789 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 (3 rows) @@ -5782,7 +5782,7 @@ select * from j1 inner join j2 on j1.id = j2.id; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure join is not unique when not an equi-join @@ -5823,7 +5823,7 @@ select * from j1 inner join j3 on j1.id = j3.id; -> Materialize Output: j1.id -> Seq Scan on public.j1 - Output: j1.id + Output: j1.id (14 rows) -- ensure left join is marked as unique @@ -5844,7 +5844,7 @@ select * from j1 left join j2 on j1.id = j2.id; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure right join is marked as unique @@ -5863,7 +5863,7 @@ select * from j1 right join j2 on j1.id = j2.id; -> Materialize Output: j1.id -> Seq Scan on public.j1 - Output: j1.id + Output: j1.id (12 rows) -- ensure full join is marked as unique @@ -5924,7 +5924,7 @@ select * from j1 natural join j2; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure a distinct clause allows the inner to become unique @@ -6645,18 +6645,18 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..402.39 rows=33 width=4) + Nested Loop (cost=200.16..371.39 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.69 rows=1 width=4) - -> Nested Loop (cost=0.16..180.68 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..268.69 rows=1 width=4) + -> Nested Loop (cost=0.16..168.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) - -> Materialize (cost=100.00..121.08 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..98.00 rows=5000 width=4) + -> Materialize (cost=100.00..102.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..101.95 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..1.50 rows=50 width=4) (12 rows) set enable_nestloop_suppression = true; @@ -6664,19 +6664,19 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..414.89 rows=33 width=4) + Nested Loop (cost=200.16..383.89 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) - -> Nested Loop (cost=0.16..193.19 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..281.19 rows=1 width=4) + -> Nested Loop (cost=0.16..181.19 rows=1 width=4) Join Filter: (t1.a 
= t2.a) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..98.00 rows=5000 width=4) -> Materialize (cost=0.16..8.19 rows=1 width=4) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..121.08 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) + -> Materialize (cost=100.00..102.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..101.95 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..1.50 rows=50 width=4) (13 rows) drop table nestloop_suppression1; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index 2ae2b8a2..d2435f12 100644 --- a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -396,8 +396,8 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL -> Hash Join Hash Cond: (t3.b = a) -> Append - -> Index Scan using iprt2_p1_b on prt2_p1 t3 - -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -743,48 +743,38 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Join - Merge Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1.a - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: b - -> HashAggregate - Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: b - -> Hash Join - Hash Cond: (((t2.a + t2.b) / 2) = b) - -> Append - -> Seq Scan on prt1_e_p1 t2 - -> Seq Scan on prt1_e_p2 t2_1 - -> Seq Scan on prt1_e_p3 t2_2 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) -(39 rows) + -> Nested Loop Semi Join + Join Filter: (t1.a = b) + -> Merge Append + Sort Key: t1.a + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = b) + -> Append + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 + -> 
Hash + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) +(29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -800,23 +790,19 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( QUERY PLAN ----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Semi Join - Merge Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1.a - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t1_3.b + -> Nested Loop Semi Join + Join Filter: (t1.a = b) + -> Merge Append + Sort Key: t1.a + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b -> Hash Semi Join Hash Cond: (t1_3.b = ((a + b) / 2)) -> Append @@ -832,7 +818,7 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( Filter: (c = 0) -> Seq Scan on prt1_e_p3 t1_8 Filter: (c = 0) -(33 rows) +(29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1165,26 +1151,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------ - Hash Join + QUERY PLAN +----------------------------------------------------------------------- + Hash Left Join Hash Cond: (b = a) - -> Hash Left Join - Hash Cond: (b = a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t2 - -> Seq Scan on prt2_p2 t2_1 - -> Seq Scan on prt2_p3 t2_2 - -> Hash - -> Result - One-Time Filter: false + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (a = t2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t3 - -> Seq Scan on prt1_p2 t3_1 - -> Seq Scan on prt1_p3 t3_2 + -> Result + One-Time Filter: false (18 rows) EXPLAIN (COSTS OFF) @@ -1764,27 +1750,27 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN 
+----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (c = (c)::text) + Hash Cond: (c = (t1.c)::text) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Append - -> Seq Scan on plt1_p1 t3 - -> Seq Scan on plt1_p2 t3_1 - -> Seq Scan on plt1_p3 t3_2 + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 -> Hash -> Hash Join - Hash Cond: (t2.c = (c)::text) - -> Append - -> Seq Scan on prt2_n_p1 t2 - -> Seq Scan on prt2_n_p2 t2_1 + Hash Cond: (c = (t1.c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_n_p1 t1 - -> Seq Scan on prt1_n_p2 t1_1 + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 (19 rows) -- partition-wise join can not be applied for a join between list and range diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 8bea6498..82b95aa8 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -60,6 +60,7 @@ inet_tbl|t inhf|f inhx|t insert_tbl|f +insertwithret|f int2_tbl|f int4_tbl|f int8_tbl|f diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index d510c4f0..c830a7f7 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -880,4 +880,4 @@ select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak. db_seq1_bak.public.t3_f1_seq (3 rows) -\q \ No newline at end of file +\q diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 16b06053..3a412ef4 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -659,10 +659,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=0) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) + Finalize Aggregate (cost=163.52..163.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.50..163.52 rows=1 width=0) + -> Partial Aggregate (cost=63.50..63.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -680,10 +680,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) - -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) + Finalize Aggregate (cost=163.64..163.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.62..163.64 rows=1 width=0) + -> Partial Aggregate (cost=63.62..63.63 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c 
= 1)) (5 rows) @@ -698,10 +698,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.53..177.54 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.51..177.53 rows=1 width=0) - -> Partial Aggregate (cost=77.51..77.52 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) + Finalize Aggregate (cost=163.53..163.54 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.51..163.53 rows=1 width=0) + -> Partial Aggregate (cost=63.51..63.52 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -722,10 +722,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) - -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) + Finalize Aggregate (cost=163.64..163.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.62..163.64 rows=1 width=0) + -> Partial Aggregate (cost=63.62..63.63 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 4c0e6f5c..d2150f63 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -143,7 +143,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(70 rows) +(71 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index d91ef65e..8b56b4d6 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -377,17 +377,13 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(31 rows) +(27 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); From e6a1711b773ca857646d84b1d24a6cd755f62840 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 Jul 2022 19:24:54 +0800 Subject: [PATCH 503/578] fix regress error 2 --- .../expected/redistribute_custom_types.out | 46 +++++++++---------- src/test/regress/parallel_schedule | 1 - 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/test/regress/expected/redistribute_custom_types.out b/src/test/regress/expected/redistribute_custom_types.out index 2ae77a18..24d9ece5 100644 --- a/src/test/regress/expected/redistribute_custom_types.out +++ b/src/test/regress/expected/redistribute_custom_types.out @@ -17,16 +17,16 @@ insert into enum_test(a,b) values(5,'Fri'); insert into enum_test(a,b) values(6,'Sat'); insert into enum_test(a,b) values(7,'Sun'); explain select count(*) from enum_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) - -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.44..117.38 rows=94 width=12) + -> Finalize HashAggregate (cost=116.44..117.38 rows=94 width=12) Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=114.56..115.97 rows=94 width=0) Distribute results by S: b - -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + -> Partial HashAggregate (cost=14.56..15.50 rows=94 width=12) Group Key: b - -> Seq Scan on enum_test (cost=0.00..26.88 rows=450 width=4) + -> Seq Scan on enum_test (cost=0.00..13.44 rows=225 width=4) Filter: (a < 100) (9 rows) @@ -57,20 +57,20 @@ insert into comptype_test(a,b) values(4,(4,5)); insert into comptype_test(a,b) values(5,(5,6)); insert into comptype_test(a,b) values(6,(6,7)); explain select count(*) from comptype_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=145.27..147.81 rows=169 width=40) - -> Finalize GroupAggregate (cost=145.27..147.81 rows=169 width=40) + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=121.50..122.77 rows=85 width=40) + -> Finalize GroupAggregate (cost=121.50..122.77 rows=85 width=40) Group Key: b - -> Sort (cost=145.27..145.70 rows=169 width=0) + -> Sort (cost=121.50..121.71 rows=85 width=0) Sort Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=134.18..139.02 rows=169 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.35..118.78 rows=85 width=0) Distribute results by S: b - -> Partial GroupAggregate (cost=34.18..38.17 rows=169 width=40) + -> Partial GroupAggregate (cost=16.35..18.35 rows=85 width=40) Group Key: b - -> Sort (cost=34.18..34.95 rows=307 width=32) + -> Sort (cost=16.35..16.73 rows=154 width=32) Sort Key: b - -> Seq Scan on comptype_test (cost=0.00..21.50 rows=307 width=32) + -> Seq Scan on comptype_test (cost=0.00..10.75 rows=154 width=32) Filter: (a < 100) (13 rows) @@ -100,16 +100,16 @@ insert into domaintype_test(a,b) values(4,4); insert into domaintype_test(a,b) values(5,5); insert into domaintype_test(a,b) values(6,6); explain select count(*) from domaintype_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) - -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.44..117.38 rows=94 width=12) + -> Finalize HashAggregate (cost=116.44..117.38 rows=94 width=12) Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=114.56..115.97 rows=94 width=0) Distribute results by S: b - -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + -> Partial HashAggregate (cost=14.56..15.50 rows=94 width=12) Group Key: b - -> Seq Scan on domaintype_test (cost=0.00..26.88 rows=450 width=4) + -> Seq Scan on domaintype_test (cost=0.00..13.44 rows=225 width=4) Filter: (a < 100) (9 rows) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 036a73c3..782b692b 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -166,4 +166,3 @@ test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table test: tbase_explain test: redistribute_custom_types pl_bugs -test: nestloop_by_shard From 1fc49a746f8845fdb884252fc1b60250c7b3eaf4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 Jul 2022 19:34:29 +0800 Subject: [PATCH 504/578] update TBASE_VERSION_STR to TBase_V2.4.0_release --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index b11ef63b..2cf52a05 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_master" +#define TBASE_VERSION_STR "TBase_V2.4.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From ad01198a1c8f5799928d2c42167bac02672bab19 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 5 Jul 2022 14:41:06 +0800 Subject: [PATCH 505/578] 
add v2.4.0-release note --- v2.4.0-release-note.txt | 56 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 v2.4.0-release-note.txt diff --git a/v2.4.0-release-note.txt b/v2.4.0-release-note.txt new file mode 100644 index 00000000..3e8ec6d4 --- /dev/null +++ b/v2.4.0-release-note.txt @@ -0,0 +1,56 @@ +V2.4.0-release 版本主要修改集中在: +1、分布式计算性能提升: +1)hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍 +2)FQS查询下推能力增强:分布key计算返回单条结果时进行sql下推,性能提升约20倍 +3)分布式进程ProcLock 分段锁优化,减少锁冲突,执行效率提升5倍左右 +4)GTM、CN、DN 同台机器部署时,通信使用unix domain socket,性能提升30% +5)执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划 +6)复制表下推DN策略调整:连接数过多时数据上拉到CN节点,连接数少时下推DN,节省网络资源 +7)优化全局 sequence 获取性能提升约20% + + +2、2PC事务优化: +1)死锁检查性能优化:a)批量获取gxid; b)遍历查找gxid修改为二分查找;c)增加try轮次限制 +2)创建内存hash表,减少2pc磁盘文件句柄的创建 +3)增加2PC cleaner进程自动清理功能 +4)隐式事务DN不自动提交,避免出现部分提交的现象 +5)2PC添加保护模式,杜绝部分提交的情况 +6)优化GTS获取流程,保证每次获取到最新的结果 +7)drop database 增加prepare过程,确保不会出现节点失败残留 + + +3、高可用能力加强: +1)业务正常运行时允许添加DN节点 +2)GTM 主备切换时,2PC事务可以正常进行 +3)GTM 主备切换能力增强,主备同时crash时,备机起来可以自主发起生主操作 +4)GTM高可用优化,在备机恢复时如果xlog与主机相差太大时直接采用重做备机的方式 +5)GTM备机高可用优化,添加备连接主超时时间 + + +4、易用性增强: +1)支持读写分离的读平面修改系统参数,加强了用户对系统的控制力 +2)增加 pooler 多线程日志功能,方便用户进行问题分析 +3)GTM 日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看 +4)支持 gtm_ctl -l 指定日志文件路径启动,方便用户对日志进行管理 +5)全局session视图优化 使用usename,datname,替换oid,展现信息更加直观,可读性强 + + +5、新特性支持: +1)dblink支持copy功能,批量query功能 +2)自研分区表剪枝,添加IN语法剪枝,提前加速剪枝 +3)解除存储过程中累计事务最大为64个的限制 +4)支持带有数据shuffle的并发更新能力 +5)数据shuffle支持用户自定义函数 +6)允许删除带有分布式外键约束的主表的列 +7)全局session安全加固,防止SQL注入 + + +6、已知问题修复: +1)存储过程/触发器中死锁、子事务回滚异常问题解决 +2)全局session活跃视图死锁问题解决 +3)分布式网络通信死锁问题解决 +4)扩展协议带有return 的insert语句不能返回数据问题修复 +5)物化视图并发刷新问题解决 +6)复杂sql并发更新报错问题解决 +7)存在多层并行gather算子时查询异常问题解决 +8)GTM 备机内存泄露问题解决 \ No newline at end of file From 2b65bb7760fb50dc877909737eb235fee6a381ac Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 5 Jul 2022 14:52:42 +0800 Subject: [PATCH 506/578] update TBASE_VERSION_STR --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 2cf52a05..b11ef63b 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.4.0_release" +#define TBASE_VERSION_STR "TBase_master" Datum pgsql_version(PG_FUNCTION_ARGS) From 4887e3a62a4d72ab7b04ad538cfcfb8bbf9cc9c7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 13 Jul 2022 15:22:28 +0800 Subject: [PATCH 507/578] add v2.4.0-release note --- v2.4.0-release-note.txt | 63 +++++++++-------------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) diff --git a/v2.4.0-release-note.txt b/v2.4.0-release-note.txt index 3e8ec6d4..b3df61df 100644 --- a/v2.4.0-release-note.txt +++ b/v2.4.0-release-note.txt @@ -1,56 +1,21 @@ V2.4.0-release 版本主要修改集中在: 1、分布式计算性能提升: -1)hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍 -2)FQS查询下推能力增强:分布key计算返回单条结果时进行sql下推,性能提升约20倍 -3)分布式进程ProcLock 分段锁优化,减少锁冲突,执行效率提升5倍左右 -4)GTM、CN、DN 同台机器部署时,通信使用unix domain socket,性能提升30% -5)执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划 -6)复制表下推DN策略调整:连接数过多时数据上拉到CN节点,连接数少时下推DN,节省网络资源 -7)优化全局 sequence 获取性能提升约20% - +● hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍。 +● 分布式进程ProcLock分段锁优化,减少锁冲突,执行效率提升约5倍。 +● 执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划。 2、2PC事务优化: -1)死锁检查性能优化:a)批量获取gxid; b)遍历查找gxid修改为二分查找;c)增加try轮次限制 -2)创建内存hash表,减少2pc磁盘文件句柄的创建 -3)增加2PC cleaner进程自动清理功能 -4)隐式事务DN不自动提交,避免出现部分提交的现象 -5)2PC添加保护模式,杜绝部分提交的情况 -6)优化GTS获取流程,保证每次获取到最新的结果 -7)drop 
database 增加prepare过程,确保不会出现节点失败残留 - - -3、高可用能力加强: -1)业务正常运行时允许添加DN节点 -2)GTM 主备切换时,2PC事务可以正常进行 -3)GTM 主备切换能力增强,主备同时crash时,备机起来可以自主发起生主操作 -4)GTM高可用优化,在备机恢复时如果xlog与主机相差太大时直接采用重做备机的方式 -5)GTM备机高可用优化,添加备连接主超时时间 - - -4、易用性增强: -1)支持读写分离的读平面修改系统参数,加强了用户对系统的控制力 -2)增加 pooler 多线程日志功能,方便用户进行问题分析 -3)GTM 日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看 -4)支持 gtm_ctl -l 指定日志文件路径启动,方便用户对日志进行管理 -5)全局session视图优化 使用usename,datname,替换oid,展现信息更加直观,可读性强 - +● 死锁检查性能优化:批量获取gxid;遍历查找gxid修改为二分查找;增加try轮次限制。 +● 2PC添加保护模式,杜绝部分提交的情况。 +● drop database 增加prepare过程,确保不会出现节点失败残留。 -5、新特性支持: -1)dblink支持copy功能,批量query功能 -2)自研分区表剪枝,添加IN语法剪枝,提前加速剪枝 -3)解除存储过程中累计事务最大为64个的限制 -4)支持带有数据shuffle的并发更新能力 -5)数据shuffle支持用户自定义函数 -6)允许删除带有分布式外键约束的主表的列 -7)全局session安全加固,防止SQL注入 +3、易用性增强: +● 增加pooler多线程日志功能,方便用户进行问题分析。 +● GTM日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看。 +● 支持gtm_ctl -l指定日志文件路径启动,方便用户对日志进行管理。 +4、新特性支持: +● 自研分区表剪枝,添加IN语法剪枝,提前加速剪枝。 +● 数据shuffle支持用户自定义函数。 +● 允许删除带有分布式外键约束的主表的列。 -6、已知问题修复: -1)存储过程/触发器中死锁、子事务回滚异常问题解决 -2)全局session活跃视图死锁问题解决 -3)分布式网络通信死锁问题解决 -4)扩展协议带有return 的insert语句不能返回数据问题修复 -5)物化视图并发刷新问题解决 -6)复杂sql并发更新报错问题解决 -7)存在多层并行gather算子时查询异常问题解决 -8)GTM 备机内存泄露问题解决 \ No newline at end of file From de97b2f2bba5b8d865140387349dcd3dbec9ef96 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 22 Mar 2018 15:47:29 -0400 Subject: [PATCH 508/578] Sync up our various ways of estimating pg_class.reltuples. VACUUM thought that reltuples represents the total number of tuples in the relation, while ANALYZE counted only live tuples. This can cause "flapping" in the value when background vacuums and analyzes happen separately. The planner's use of reltuples essentially assumes that it's the count of live (visible) tuples, so let's standardize on having it mean live tuples. Another issue is that the definition of "live tuple" isn't totally clear; what should be done with INSERT_IN_PROGRESS or DELETE_IN_PROGRESS tuples? ANALYZE's choices in this regard are made on the assumption that if the originating transaction commits at all, it will happen after ANALYZE finishes, so we should ignore the effects of the in-progress transaction --- unless it is our own transaction, and then we should count it. Let's propagate this definition into VACUUM, too. Likewise propagate this definition into CREATE INDEX, and into contrib/pgstattuple's pgstattuple_approx() function. 
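To make the counting rule above concrete, here is a minimal illustrative sketch (not code from this patch; the helper name is invented, though the enum values and header macros are the standard PostgreSQL ones) of how a tuple would be classified as "live" for reltuples purposes, assuming HeapTupleSatisfiesVacuum() has already been run on it:

/*
 * Illustrative sketch only -- not part of this patch.  Decide whether a
 * tuple counts toward reltuples under the rule described above:
 * in-progress inserts and deletes are ignored unless they were made by
 * our own transaction.
 */
static bool
tuple_counts_as_live(HTSV_Result state, HeapTuple tuple)
{
	switch (state)
	{
		case HEAPTUPLE_LIVE:
			return true;

		case HEAPTUPLE_DEAD:
		case HEAPTUPLE_RECENTLY_DEAD:
			return false;

		case HEAPTUPLE_INSERT_IN_PROGRESS:
			/* count the insert only if it is our own */
			return TransactionIdIsCurrentTransactionId(
				HeapTupleHeaderGetXmin(tuple->t_data));

		case HEAPTUPLE_DELETE_IN_PROGRESS:
			/* ignore the delete unless it is our own */
			return !TransactionIdIsCurrentTransactionId(
				HeapTupleHeaderGetUpdateXid(tuple->t_data));

		default:
			return false;
	}
}

VACUUM, ANALYZE, CREATE INDEX and pgstattuple_approx() each embed an equivalent of this decision inline rather than calling a shared helper.
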
Tomas Vondra, reviewed by Haribabu Kommi, some corrections by me Discussion: https://postgr.es/m/16db4468-edfa-830a-f921-39a50498e77e@2ndquadrant.com --- contrib/pgstattuple/pgstatapprox.c | 444 +++++++++++++++-------------- doc/src/sgml/catalogs.sgml | 4 +- src/backend/catalog/index.c | 52 +++- src/backend/commands/vacuum.c | 6 +- src/backend/commands/vacuumlazy.c | 84 ++++-- 5 files changed, 334 insertions(+), 256 deletions(-) diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index 13ce7f99..21bda0e7 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -1,12 +1,12 @@ /*------------------------------------------------------------------------- * * pgstatapprox.c - * Bloat estimation functions + * Bloat estimation functions * * Copyright (c) 2014-2017, PostgreSQL Global Development Group * * IDENTIFICATION - * contrib/pgstattuple/pgstatapprox.c + * contrib/pgstattuple/pgstatapprox.c * *------------------------------------------------------------------------- */ @@ -31,20 +31,20 @@ PG_FUNCTION_INFO_V1(pgstattuple_approx); PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5); -Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo); +Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo); typedef struct output_type { - uint64 table_len; - uint64 scanned_percent; - uint64 tuple_count; - uint64 tuple_len; - double tuple_percent; - uint64 dead_tuple_count; - uint64 dead_tuple_len; - double dead_tuple_percent; - uint64 free_space; - double free_percent; + uint64 table_len; + uint64 scanned_percent; + uint64 tuple_count; + uint64 tuple_len; + double tuple_percent; + uint64 dead_tuple_count; + uint64 dead_tuple_len; + double dead_tuple_percent; + uint64 free_space; + double free_percent; } output_type; #define NUM_OUTPUT_COLUMNS 10 @@ -62,147 +62,153 @@ typedef struct output_type static void statapprox_heap(Relation rel, output_type *stat) { - BlockNumber scanned, - nblocks, - blkno; - Buffer vmbuffer = InvalidBuffer; - BufferAccessStrategy bstrategy; - TransactionId OldestXmin; - uint64 misc_count = 0; - - OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); - bstrategy = GetAccessStrategy(BAS_BULKREAD); - - nblocks = RelationGetNumberOfBlocks(rel); - scanned = 0; - - for (blkno = 0; blkno < nblocks; blkno++) - { - Buffer buf; - Page page; - OffsetNumber offnum, - maxoff; - Size freespace; - - CHECK_FOR_INTERRUPTS(); - - /* - * If the page has only visible tuples, then we can find out the free - * space from the FSM and move on. - */ - if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) - { - freespace = GetRecordedFreeSpace(rel, blkno); - stat->tuple_len += BLCKSZ - freespace; - stat->free_space += freespace; - continue; - } - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, - RBM_NORMAL, bstrategy); - - LockBuffer(buf, BUFFER_LOCK_SHARE); - - page = BufferGetPage(buf); - - /* - * It's not safe to call PageGetHeapFreeSpace() on new pages, so we - * treat them as being free space for our purposes. - */ - if (!PageIsNew(page)) - stat->free_space += PageGetHeapFreeSpace(page); - else - stat->free_space += BLCKSZ - SizeOfPageHeaderData; - - if (PageIsNew(page) || PageIsEmpty(page)) - { - UnlockReleaseBuffer(buf); - continue; - } - - scanned++; - - /* - * Look at each tuple on the page and decide whether it's live or - * dead, then count it and its size. Unlike lazy_scan_heap, we can - * afford to ignore problems and special cases. 
- */ - maxoff = PageGetMaxOffsetNumber(page); - - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; - HeapTupleData tuple; - - itemid = PageGetItemId(page, offnum); - - if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || - ItemIdIsDead(itemid)) - { - continue; - } - - Assert(ItemIdIsNormal(itemid)); - - ItemPointerSet(&(tuple.t_self), blkno, offnum); - - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(rel); - - /* - * We count live and dead tuples, but we also need to add up - * others in order to feed vac_estimate_reltuples. - */ - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) - { - case HEAPTUPLE_RECENTLY_DEAD: - misc_count++; - /* Fall through */ - case HEAPTUPLE_DEAD: - stat->dead_tuple_len += tuple.t_len; - stat->dead_tuple_count++; - break; - case HEAPTUPLE_LIVE: - stat->tuple_len += tuple.t_len; - stat->tuple_count++; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - case HEAPTUPLE_DELETE_IN_PROGRESS: - misc_count++; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - } - - UnlockReleaseBuffer(buf); - } - - stat->table_len = (uint64) nblocks * BLCKSZ; - - stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, - stat->tuple_count + misc_count); - - /* - * Calculate percentages if the relation has one or more pages. - */ - if (nblocks != 0) - { - stat->scanned_percent = 100 * scanned / nblocks; - stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; - stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; - stat->free_percent = 100.0 * stat->free_space / stat->table_len; - } - - if (BufferIsValid(vmbuffer)) - { - ReleaseBuffer(vmbuffer); - vmbuffer = InvalidBuffer; - } + BlockNumber scanned, + nblocks, + blkno; + Buffer vmbuffer = InvalidBuffer; + BufferAccessStrategy bstrategy; + TransactionId OldestXmin; + + OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); + bstrategy = GetAccessStrategy(BAS_BULKREAD); + + nblocks = RelationGetNumberOfBlocks(rel); + scanned = 0; + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + Size freespace; + + CHECK_FOR_INTERRUPTS(); + + /* + * If the page has only visible tuples, then we can find out the free + * space from the FSM and move on. + */ + if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) + { + freespace = GetRecordedFreeSpace(rel, blkno); + stat->tuple_len += BLCKSZ - freespace; + stat->free_space += freespace; + continue; + } + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_NORMAL, bstrategy); + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + + /* + * It's not safe to call PageGetHeapFreeSpace() on new pages, so we + * treat them as being free space for our purposes. + */ + if (!PageIsNew(page)) + stat->free_space += PageGetHeapFreeSpace(page); + else + stat->free_space += BLCKSZ - SizeOfPageHeaderData; + + /* We may count the page as scanned even if it's new/empty */ + scanned++; + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buf); + continue; + } + + /* + * Look at each tuple on the page and decide whether it's live or + * dead, then count it and its size. Unlike lazy_scan_heap, we can + * afford to ignore problems and special cases. 
+ */ + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || + ItemIdIsDead(itemid)) + { + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + /* + * We follow VACUUM's lead in counting INSERT_IN_PROGRESS tuples + * as "dead" while DELETE_IN_PROGRESS tuples are "live". We don't + * bother distinguishing tuples inserted/deleted by our own + * transaction. + */ + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_LIVE: + case HEAPTUPLE_DELETE_IN_PROGRESS: + stat->tuple_len += tuple.t_len; + stat->tuple_count++; + break; + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + stat->dead_tuple_len += tuple.t_len; + stat->dead_tuple_count++; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } + + UnlockReleaseBuffer(buf); + } + + stat->table_len = (uint64) nblocks * BLCKSZ; + + /* + * We don't know how many tuples are in the pages we didn't scan, so + * extrapolate the live-tuple count to the whole table in the same way + * that VACUUM does. (Like VACUUM, we're not taking a random sample, so + * just extrapolating linearly seems unsafe.) There should be no dead + * tuples in all-visible pages, so no correction is needed for that, and + * we already accounted for the space in those pages, too. + */ + stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, + stat->tuple_count); + + /* + * Calculate percentages if the relation has one or more pages. 
+ */ + if (nblocks != 0) + { + stat->scanned_percent = 100 * scanned / nblocks; + stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; + stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; + stat->free_percent = 100.0 * stat->free_space / stat->table_len; + } + + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } } /* @@ -215,14 +221,14 @@ statapprox_heap(Relation rel, output_type *stat) Datum pgstattuple_approx(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); + Oid relid = PG_GETARG_OID(0); - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); + PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); } /* @@ -235,69 +241,69 @@ pgstattuple_approx(PG_FUNCTION_ARGS) Datum pgstattuple_approx_v1_5(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); + Oid relid = PG_GETARG_OID(0); - PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); + PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); } Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo) { - Relation rel; - output_type stat = {0}; - TupleDesc tupdesc; - bool nulls[NUM_OUTPUT_COLUMNS]; - Datum values[NUM_OUTPUT_COLUMNS]; - HeapTuple ret; - int i = 0; - - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - if (tupdesc->natts != NUM_OUTPUT_COLUMNS) - elog(ERROR, "incorrect number of output arguments"); - - rel = relation_open(relid, AccessShareLock); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - /* - * We support only ordinary relations and materialised views, because we - * depend on the visibility map and free space map for our estimates about - * unscanned pages. 
- */ - if (!(rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"%s\" is not a table or materialized view", - RelationGetRelationName(rel)))); - - statapprox_heap(rel, &stat); - - relation_close(rel, AccessShareLock); - - memset(nulls, 0, sizeof(nulls)); - - values[i++] = Int64GetDatum(stat.table_len); - values[i++] = Float8GetDatum(stat.scanned_percent); - values[i++] = Int64GetDatum(stat.tuple_count); - values[i++] = Int64GetDatum(stat.tuple_len); - values[i++] = Float8GetDatum(stat.tuple_percent); - values[i++] = Int64GetDatum(stat.dead_tuple_count); - values[i++] = Int64GetDatum(stat.dead_tuple_len); - values[i++] = Float8GetDatum(stat.dead_tuple_percent); - values[i++] = Int64GetDatum(stat.free_space); - values[i++] = Float8GetDatum(stat.free_percent); - - ret = heap_form_tuple(tupdesc, values, nulls); - return HeapTupleGetDatum(ret); + Relation rel; + output_type stat = {0}; + TupleDesc tupdesc; + bool nulls[NUM_OUTPUT_COLUMNS]; + Datum values[NUM_OUTPUT_COLUMNS]; + HeapTuple ret; + int i = 0; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (tupdesc->natts != NUM_OUTPUT_COLUMNS) + elog(ERROR, "incorrect number of output arguments"); + + rel = relation_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * We support only ordinary relations and materialised views, because we + * depend on the visibility map and free space map for our estimates about + * unscanned pages. + */ + if (!(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"%s\" is not a table or materialized view", + RelationGetRelationName(rel)))); + + statapprox_heap(rel, &stat); + + relation_close(rel, AccessShareLock); + + memset(nulls, 0, sizeof(nulls)); + + values[i++] = Int64GetDatum(stat.table_len); + values[i++] = Float8GetDatum(stat.scanned_percent); + values[i++] = Int64GetDatum(stat.tuple_count); + values[i++] = Int64GetDatum(stat.tuple_len); + values[i++] = Float8GetDatum(stat.tuple_percent); + values[i++] = Int64GetDatum(stat.dead_tuple_count); + values[i++] = Int64GetDatum(stat.dead_tuple_len); + values[i++] = Float8GetDatum(stat.dead_tuple_percent); + values[i++] = Int64GetDatum(stat.free_space); + values[i++] = Float8GetDatum(stat.free_percent); + + ret = heap_form_tuple(tupdesc, values, nulls); + return HeapTupleGetDatum(ret); } diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 399f8275..9f11a50f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1752,8 +1752,8 @@ SCRAM-SHA-256$<iteration count>:<salt>< float4 - Number of rows in the table. This is only an estimate used by the - planner. It is updated by VACUUM, + Number of live rows in the table. This is only an estimate used by + the planner. It is updated by VACUUM, ANALYZE, and a few DDL commands such as CREATE INDEX. 
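The estimation step that this patch makes consistent across VACUUM, ANALYZE, CREATE INDEX, and pgstattuple_approx() amounts to extrapolating the live-tuple density observed on the scanned pages over the pages that were not scanned. The standalone C sketch below only illustrates that idea; the names and rounding are simplified and it is not the actual vac_estimate_reltuples() implementation.

#include <math.h>
#include <stdio.h>

/*
 * Illustration only: extrapolate the live tuples counted on the scanned
 * pages to the whole table, using the previously recorded tuple density
 * (old_rel_tuples / old_rel_pages) for the unscanned part.
 */
static double
estimate_live_tuples(double old_rel_pages, double old_rel_tuples,
                     double total_pages, double scanned_pages,
                     double scanned_live_tuples)
{
    double old_density;
    double unscanned_pages;

    if (scanned_pages >= total_pages)
        return scanned_live_tuples;     /* whole table was scanned */

    old_density = (old_rel_pages > 0) ? old_rel_tuples / old_rel_pages : 0.0;
    unscanned_pages = total_pages - scanned_pages;

    return floor(old_density * unscanned_pages + scanned_live_tuples + 0.5);
}

int
main(void)
{
    /* 1000-page table, previously ~100 tuples/page, 200 pages scanned now */
    printf("%.0f\n", estimate_live_tuples(1000, 100000, 1000, 200, 19000));
    return 0;
}

With these inputs the estimate is 99000: the 19000 live tuples actually counted plus 800 unscanned pages at the old density of 100 tuples per page.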
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 89c9a1ea..b1c18dec 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2420,12 +2420,12 @@ index_build(Relation heapRelation, * things to add it to the new index. After we return, the AM's index * build procedure does whatever cleanup it needs. * - * The total count of heap tuples is returned. This is for updating pg_class - * statistics. (It's annoying not to be able to do that here, but we want - * to merge that update with others; see index_update_stats.) Note that the - * index AM itself must keep track of the number of index tuples; we don't do - * so here because the AM might reject some of the tuples for its own reasons, - * such as being unable to store NULLs. + * The total count of live heap tuples is returned. This is for updating + * pg_class statistics. (It's annoying not to be able to do that here, but we + * want to merge that update with others; see index_update_stats.) Note that + * the index AM itself must keep track of the number of index tuples; we don't + * do so here because the AM might reject some of the tuples for its own + * reasons, such as being unable to store NULLs. * * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect * any potentially broken HOT chains. Currently, we set this if there are @@ -2455,8 +2455,8 @@ IndexBuildHeapScan(Relation heapRelation, * to scan cannot be done when requesting syncscan. * * When "anyvisible" mode is requested, all tuples visible to any transaction - * are considered, including those inserted or deleted by transactions that are - * still in progress. + * are indexed and counted as live, including those inserted or deleted by + * transactions that are still in progress. */ double IndexBuildHeapRangeScan(Relation heapRelation, @@ -2628,6 +2628,12 @@ IndexBuildHeapRangeScan(Relation heapRelation, */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * CREATE INDEX and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + */ switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, scan->rs_cbuf)) { @@ -2640,6 +2646,8 @@ IndexBuildHeapRangeScan(Relation heapRelation, /* Normal case, index and unique-check it */ indexIt = true; tupleIsAlive = true; + /* Count it as live, too */ + reltuples += 1; break; case HEAPTUPLE_RECENTLY_DEAD: @@ -2653,6 +2661,9 @@ IndexBuildHeapRangeScan(Relation heapRelation, * the live tuple at the end of the HOT-chain. Since this * breaks semantics for pre-existing snapshots, mark the * index as unusable for them. + * + * We don't count recently-dead tuples in reltuples, even + * if we index them; see acquire_sample_rows(). */ if (HeapTupleIsHotUpdated(heapTuple)) { @@ -2675,6 +2686,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { indexIt = true; tupleIsAlive = true; + reltuples += 1; break; } @@ -2712,6 +2724,15 @@ IndexBuildHeapRangeScan(Relation heapRelation, goto recheck; } } + else + { + /* + * For consistency with acquire_sample_rows(), count + * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only + * when inserted by our own transaction. 
+ */ + reltuples += 1; + } /* * We must index such tuples, since if the index build @@ -2731,6 +2752,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { indexIt = true; tupleIsAlive = false; + reltuples += 1; break; } @@ -2774,6 +2796,14 @@ IndexBuildHeapRangeScan(Relation heapRelation, * the same as a RECENTLY_DEAD tuple. */ indexIt = true; + + /* + * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, + * if they were not deleted by the current + * transaction. That's what acquire_sample_rows() + * does, and we want the behavior to be consistent. + */ + reltuples += 1; } else if (HeapTupleIsHotUpdated(heapTuple)) { @@ -2791,8 +2821,8 @@ IndexBuildHeapRangeScan(Relation heapRelation, { /* * It's a regular tuple deleted by our own xact. Index - * it but don't check for uniqueness, the same as a - * RECENTLY_DEAD tuple. + * it, but don't check for uniqueness nor count in + * reltuples, the same as a RECENTLY_DEAD tuple. */ indexIt = true; } @@ -2816,8 +2846,6 @@ IndexBuildHeapRangeScan(Relation heapRelation, tupleIsAlive = true; } - reltuples += 1; - MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index efb5aade..3c337815 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -707,7 +707,8 @@ vacuum_set_xid_limits(Relation rel, * we take the old value of pg_class.reltuples as a measurement of the * tuple density in the unscanned pages. * - * This routine is shared by VACUUM and ANALYZE. + * Note: scanned_tuples should count only *live* tuples, since + * pg_class.reltuples is defined that way. */ double vac_estimate_reltuples(Relation relation, bool is_analyze, @@ -807,6 +808,9 @@ vac_estimate_reltuples(Relation relation, bool is_analyze, * transaction. This is OK since postponing the flag maintenance is * always allowable. * + * Note: num_tuples should count only *live* tuples, since + * pg_class.reltuples is defined that way. + * * This routine is shared by VACUUM and ANALYZE. 
*/ void diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4796152a..90dfe91f 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -123,9 +123,9 @@ typedef struct LVRelStats BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ BlockNumber tupcount_pages; /* pages whose tuples we counted */ - double scanned_tuples; /* counts only tuples on tupcount_pages */ - double old_rel_tuples; /* previous value of pg_class.reltuples */ + double old_live_tuples; /* previous value of pg_class.reltuples */ double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ double new_dead_tuples; /* new estimated total # of dead tuples */ BlockNumber pages_removed; double tuples_deleted; @@ -316,7 +316,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, TransactionId xidFullScanLimit; MultiXactId mxactFullScanLimit; BlockNumber new_rel_pages; - double new_rel_tuples; BlockNumber new_rel_allvisible; double new_live_tuples; TransactionId new_frozen_xid; @@ -374,7 +373,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; - vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; + vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; vacrelstats->pages_removed = 0; vacrelstats->lock_waiter_detected = false; @@ -451,11 +450,11 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, * since then we don't know for certain that all tuples have a newer xmin. 
*/ new_rel_pages = vacrelstats->rel_pages; - new_rel_tuples = vacrelstats->new_rel_tuples; + new_live_tuples = vacrelstats->new_live_tuples; if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0) { new_rel_pages = vacrelstats->old_rel_pages; - new_rel_tuples = vacrelstats->old_rel_tuples; + new_live_tuples = vacrelstats->old_live_tuples; } visibilitymap_count(onerel, &new_rel_allvisible, NULL); @@ -467,7 +466,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, vac_update_relstats(onerel, new_rel_pages, - new_rel_tuples, + new_live_tuples, new_rel_allvisible, vacrelstats->hasindex, new_frozen_xid, @@ -475,10 +474,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, false); /* report results to the stats collector, too */ - new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples; - if (new_live_tuples < 0) - new_live_tuples = 0; /* just in case */ - pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, new_live_tuples, @@ -604,10 +599,11 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, char *relname; BlockNumber empty_pages, vacuumed_pages; - double num_tuples, - tups_vacuumed, - nkeep, - nunused; + double num_tuples, /* total number of nonremovable tuples */ + live_tuples, /* live tuples (reltuples estimate) */ + tups_vacuumed, /* tuples cleaned up by vacuum */ + nkeep, /* dead-but-not-removable tuples */ + nunused; /* unused item pointers */ IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; @@ -632,7 +628,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, relname))); empty_pages = vacuumed_pages = 0; - num_tuples = tups_vacuumed = nkeep = nunused = 0; + num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); @@ -1131,6 +1127,17 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, tupgone = false; + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * VACUUM and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some + * cases impossible (e.g. in-progress insert from the same + * transaction). + */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_DEAD: @@ -1164,6 +1171,12 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); + /* + * Count it as live. Not only is this natural, but it's + * also what acquire_sample_rows() does. + */ + live_tuples += 1; + /* * Is the tuple definitely visible to all transactions? * @@ -1235,12 +1248,29 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ + + /* + * This is an expected case during concurrent vacuum. + * + * We do not count these rows as live, because we expect + * the inserting transaction to update the counters at + * commit, and we assume that will happen only after we + * report our results. This assumption is a bit shaky, + * but it is what acquire_sample_rows() does, so be + * consistent. 
+ */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; + + /* + * Count such rows as live. As above, we assume the + * deleting transaction will commit and update the + * counters after we report. + */ + live_tuples += 1; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); @@ -1448,15 +1478,18 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, pfree(frozen); /* save stats for use later */ - vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; vacrelstats->new_dead_tuples = nkeep; /* now we can compute the new value for pg_class.reltuples */ - vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, + vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->tupcount_pages, - num_tuples); + live_tuples); + + /* also compute total number of surviving heap entries */ + vacrelstats->new_rel_tuples = + vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples; /* * Release any remaining pin on visibility map page. @@ -1801,7 +1834,8 @@ lazy_vacuum_index(Relation indrel, ivinfo.analyze_only = false; ivinfo.estimated_count = true; ivinfo.message_level = elevel; - ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples; + /* We can only provide an approximate value of num_heap_tuples here */ + ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; ivinfo.strategy = vac_strategy; /* Do bulk deletion */ @@ -1832,6 +1866,12 @@ lazy_cleanup_index(Relation indrel, ivinfo.analyze_only = false; ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); ivinfo.message_level = elevel; + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; ivinfo.strategy = vac_strategy; @@ -2458,7 +2498,7 @@ truncate_extent_tuples(Relation onerel, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; - vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; + vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; vacrelstats->pages_removed = 0; vacrelstats->lock_waiter_detected = false; From 35a4988b243925baf13042a6fc374f66ef738a0d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 8 May 2018 00:20:19 -0400 Subject: [PATCH 509/578] Count heap tuples in non-SnapshotAny path in IndexBuildHeapRangeScan(). Brown-paper-bag bug in commit 7c91a0364: when we rearranged the placement of "reltuples += 1" statements, we missed including one in this code path. The net effect of that was that CREATE INDEX CONCURRENTLY would set the table's pg_class.reltuples to zero, as would index builds done during bootstrap mode. (It seems like parallel index builds ought to fail similarly, but they don't, perhaps because reltuples is computed in some other way. You certainly couldn't figure that out from the abysmally underdocumented parallelism code in this area.) I was led to this by wondering why initdb seemed to have slowed down as a result of 7c91a0364, as is evident in the buildfarm's timing history. The reason is that every system catalog with indexes had pg_class.reltuples = 0 after bootstrap, causing the planner to make some terrible choices for queries in the post-bootstrap steps. 
On my workstation, this fix causes the runtime of "initdb -N" to drop from ~2.0 sec to ~1.4 sec, which is almost though not quite back to where it was in v10. That's not much of a deal for production use perhaps, but it makes a noticeable difference for buildfarm and "make check-world" runs, which do a lot of initdbs. --- src/backend/catalog/index.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b1c18dec..d3cc3775 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2844,6 +2844,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { /* heap_getnext did the time qual check */ tupleIsAlive = true; + reltuples += 1; } MemoryContextReset(econtext->ecxt_per_tuple_memory); From fb97cdce3163324a261569538f198d61660ef301 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 12 Sep 2018 06:46:01 +0900 Subject: [PATCH 510/578] Parse more strictly integer parameters from connection strings in libpq The following parameters have been parsed in lossy ways when specified in a connection string processed by libpq: - connect_timeout - keepalives - keepalives_count - keepalives_idle - keepalives_interval - port Overflowing values or the presence of incorrect characters were not properly checked, leading to libpq trying to use such values and fail with unhelpful error messages. This commit hardens the parsing of those parameters so as it is possible to find easily incorrect values. Author: Fabien Coelho Reviewed-by: Peter Eisentraut, Michael Paquier Discussion: https://postgr.es/m/alpine.DEB.2.21.1808171206180.20841@lancre --- src/interfaces/libpq/fe-connect.c | 59 ++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 8c1ec04b..9bcefd3e 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -1597,6 +1597,34 @@ useKeepalives(PGconn *conn) return val != 0 ? 1 : 0; } +/* + * Parse and try to interpret "value" as an integer value, and if successful, + * store it in *result, complaining if there is any trailing garbage or an + * overflow. + */ +static bool +parse_int_param(const char *value, int *result, PGconn *conn, + const char *context) +{ + char *end; + long numval; + + *result = 0; + + errno = 0; + numval = strtol(value, &end, 10); + if (errno == 0 && *end == '\0' && numval == (int) numval) + { + *result = numval; + return true; + } + + appendPQExpBuffer(&conn->errorMessage, + libpq_gettext("invalid integer value \"%s\" for keyword \"%s\"\n"), + value, context); + return false; +} + #ifndef WIN32 /* * Set the keepalive idle timer. 
@@ -1609,7 +1637,9 @@ setKeepalivesIdle(PGconn *conn) if (conn->keepalives_idle == NULL) return 1; - idle = atoi(conn->keepalives_idle); + if (!parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; if (idle < 0) idle = 0; @@ -1641,7 +1671,9 @@ setKeepalivesInterval(PGconn *conn) if (conn->keepalives_interval == NULL) return 1; - interval = atoi(conn->keepalives_interval); + if (!parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; if (interval < 0) interval = 0; @@ -1674,7 +1706,9 @@ setKeepalivesCount(PGconn *conn) if (conn->keepalives_count == NULL) return 1; - count = atoi(conn->keepalives_count); + if (!parse_int_param(conn->keepalives_count, &count, conn, + "keepalives_count")) + return 0; if (count < 0) count = 0; @@ -1708,13 +1742,17 @@ setKeepalivesWin32(PGconn *conn) int idle = 0; int interval = 0; - if (conn->keepalives_idle) - idle = atoi(conn->keepalives_idle); + if (conn->keepalives_idle && + !parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; if (idle <= 0) idle = 2 * 60 * 60; /* 2 hours = default */ - if (conn->keepalives_interval) - interval = atoi(conn->keepalives_interval); + if (conn->keepalives_interval && + !parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; if (interval <= 0) interval = 1; /* 1 second = default */ @@ -1918,7 +1956,10 @@ connectDBComplete(PGconn *conn) */ if (conn->connect_timeout != NULL) { - timeout = atoi(conn->connect_timeout); + if (!parse_int_param(conn->connect_timeout, &timeout, conn, + "connect_timeout")) + return 0; + if (timeout > 0) { /* @@ -1929,6 +1970,8 @@ connectDBComplete(PGconn *conn) /* calculate the finish time based on start + timeout */ finish_time = time(NULL) + timeout; } + else /* negative means 0 */ + timeout = 0; } for (;;) From 75b7561403ab32b49463257b7895f323542ffabf Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Sat, 6 Apr 2019 15:23:37 +0900 Subject: [PATCH 511/578] Add support TCP user timeout in libpq and the backend server Similarly to the set of parameters for keepalive, a connection parameter for libpq is added as well as a backend GUC, called tcp_user_timeout. Increasing the TCP user timeout is useful to allow a connection to survive extended periods without end-to-end connection, and decreasing it allows application to fail faster. By default, the parameter is 0, which makes the connection use the system default, and follows a logic close to the keepalive parameters in its handling. When connecting through a Unix-socket domain, the parameters have no effect. 
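On the client side, tcp_user_timeout joins the existing keepalive options as an ordinary libpq connection parameter, so it can be supplied in the connection string (and, with the previous patch, malformed integer values are now rejected up front). A minimal sketch follows; the host and database names are placeholders and the timeout values are arbitrary examples.

#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
    /* Placeholder host/dbname; the timeout values are just examples. */
    const char *conninfo =
        "host=db.example.com dbname=postgres "
        "connect_timeout=5 "
        "keepalives=1 keepalives_idle=30 keepalives_interval=10 keepalives_count=3 "
        "tcp_user_timeout=10000";       /* milliseconds */

    PGconn *conn = PQconnectdb(conninfo);

    if (PQstatus(conn) != CONNECTION_OK)
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));

    PQfinish(conn);
    return 0;
}

As with the keepalive settings, tcp_user_timeout is ignored for Unix-domain socket connections and has no effect on platforms without TCP_USER_TIMEOUT support.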
Author: Ryohei Nagaura Reviewed-by: Fabien Coelho, Robert Haas, Kyotaro Horiguchi, Kirk Jamison, Mikalai Keida, Takayuki Tsunakawa, Andrei Yahorau Discussion: https://postgr.es/m/EDA4195584F5064680D8130B1CA91C45367328@G01JPEXMBYT04 --- .../postgres_fdw/expected/postgres_fdw.out | 1 + contrib/postgres_fdw/sql/postgres_fdw.sql | 1 + doc/src/sgml/config.sgml | 25 +++++++ doc/src/sgml/libpq.sgml | 14 ++++ src/backend/libpq/pqcomm.c | 72 +++++++++++++++++++ src/backend/utils/misc/guc.c | 31 ++++++++ src/backend/utils/misc/postgresql.conf.sample | 11 +++ src/include/libpq/libpq-be.h | 6 +- src/include/utils/guc.h | 2 + src/interfaces/libpq/fe-connect.c | 43 +++++++++++ src/interfaces/libpq/libpq-int.h | 1 + 11 files changed, 206 insertions(+), 1 deletion(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 09aee7c5..25f0967c 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -153,6 +153,7 @@ ALTER SERVER testserver1 OPTIONS ( keepalives 'value', keepalives_idle 'value', keepalives_interval 'value', + tcp_user_timeout 'value', -- requiressl 'value', sslcompression 'value', sslmode 'value', diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 471bceae..bf923c0e 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -166,6 +166,7 @@ ALTER SERVER testserver1 OPTIONS ( keepalives 'value', keepalives_idle 'value', keepalives_interval 'value', + tcp_user_timeout 'value', -- requiressl 'value', sslcompression 'value', sslmode 'value', diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 939ddd85..ed2368fa 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -935,6 +935,31 @@ include_dir 'conf.d' + + tcp_user_timeout (integer) + + tcp_user_timeout configuration parameter + + + + + Specifies the number of milliseconds that transmitted data may + remain unacknowledged before a connection is forcibly closed. + A value of 0 uses the system default. + This parameter is supported only on systems that support + TCP_USER_TIMEOUT; on other systems, it must be zero. + In sessions connected via a Unix-domain socket, this parameter is + ignored and always reads as zero. + + + + This parameter is not supported on Windows and on Linux version + 2.6.36 or older. + + + + + diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index ebee3afa..dfa8c5f8 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -1211,6 +1211,20 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname + + tcp_user_timeout + + + Controls the number of milliseconds that transmitted data may + remain unacknowledged before a connection is forcibly closed. + A value of zero uses the system default. This parameter is + ignored for connections made via a Unix-domain socket. + It is only supported on systems where TCP_USER_TIMEOUT + is available; on other systems, it has no effect. 
+ + + + tty diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index fb35a142..bd089ae0 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -908,6 +908,7 @@ StreamConnection(pgsocket server_fd, Port *port) (void) pq_setkeepalivesidle(tcp_keepalives_idle, port); (void) pq_setkeepalivesinterval(tcp_keepalives_interval, port); (void) pq_setkeepalivescount(tcp_keepalives_count, port); + (void) pq_settcpusertimeout(tcp_user_timeout, port); } return STATUS_OK; @@ -2070,4 +2071,75 @@ SetSockKeepAlive(int sock) { elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } + +int +pq_gettcpusertimeout(Port *port) +{ +#ifdef TCP_USER_TIMEOUT + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return 0; + + if (port->tcp_user_timeout != 0) + return port->tcp_user_timeout; + + if (port->default_tcp_user_timeout == 0) + { + ACCEPT_TYPE_ARG3 size = sizeof(port->default_tcp_user_timeout); + + if (getsockopt(port->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &port->default_tcp_user_timeout, + &size) < 0) + { + elog(LOG, "getsockopt(%s) failed: %m", "TCP_USER_TIMEOUT"); + port->default_tcp_user_timeout = -1; /* don't know */ + } + } + + return port->default_tcp_user_timeout; +#else + return 0; +#endif +} + +int +pq_settcpusertimeout(int timeout, Port *port) +{ + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return STATUS_OK; + +#ifdef TCP_USER_TIMEOUT + if (timeout == port->tcp_user_timeout) + return STATUS_OK; + + if (port->default_tcp_user_timeout <= 0) + { + if (pq_gettcpusertimeout(port) < 0) + { + if (timeout == 0) + return STATUS_OK; /* default is set but unknown */ + else + return STATUS_ERROR; + } + } + + if (timeout == 0) + timeout = port->default_tcp_user_timeout; + + if (setsockopt(port->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &timeout, sizeof(timeout)) < 0) + { + elog(LOG, "setsockopt(%s) failed: %m", "TCP_USER_TIMEOUT"); + return STATUS_ERROR; + } + + port->tcp_user_timeout = timeout; +#else + if (timeout != 0) + { + elog(LOG, "setsockopt(%s) not supported", "TCP_USER_TIMEOUT"); + return STATUS_ERROR; + } +#endif + + return STATUS_OK; } \ No newline at end of file diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 27901832..b53a3c57 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -260,9 +260,11 @@ static const char *show_archive_command(void); static void assign_tcp_keepalives_idle(int newval, void *extra); static void assign_tcp_keepalives_interval(int newval, void *extra); static void assign_tcp_keepalives_count(int newval, void *extra); +static void assign_tcp_user_timeout(int newval, void *extra); static const char *show_tcp_keepalives_idle(void); static const char *show_tcp_keepalives_interval(void); static const char *show_tcp_keepalives_count(void); +static const char *show_tcp_user_timeout(void); static bool check_maxconnections(int *newval, void **extra, GucSource source); static bool check_max_worker_processes(int *newval, void **extra, GucSource source); static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource source); @@ -672,6 +674,7 @@ char *nls_sort_locale = NULL; int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +int tcp_user_timeout; /* * SSL renegotiation was been removed in PostgreSQL 9.5, but we tolerate it @@ -4952,6 +4955,17 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + 
gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL @@ -13442,6 +13456,23 @@ show_tcp_keepalives_count(void) return nbuf; } +static void +assign_tcp_user_timeout(int newval, void *extra) +{ + /* See comments in assign_tcp_keepalives_idle */ + (void) pq_settcpusertimeout(newval, MyProcPort); +} + +static const char * +show_tcp_user_timeout(void) +{ + /* See comments in assign_tcp_keepalives_idle */ + static char nbuf[16]; + + snprintf(nbuf, sizeof(nbuf), "%d", pq_gettcpusertimeout(MyProcPort)); + return nbuf; +} + static bool check_maxconnections(int *newval, void **extra, GucSource source) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 5ef4e565..edc6af3f 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -75,6 +75,17 @@ # - Security and Authentication - +# - TCP settings - +# see "man 7 tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default #authentication_timeout = 1min # 1s-600s #ssl = off #ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 474d9690..00737906 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -215,7 +215,7 @@ typedef struct Port TimestampTz SessionStartTime; /* backend start time */ /* - * TCP keepalive settings. + * TCP keepalive and user timeout settings. * * default values are 0 if AF_UNIX or not yet known; current values are 0 * if AF_UNIX or using the default. 
Also, -1 in a default value means we @@ -224,9 +224,11 @@ typedef struct Port int default_keepalives_idle; int default_keepalives_interval; int default_keepalives_count; + int default_tcp_user_timeout; int keepalives_idle; int keepalives_interval; int keepalives_count; + int tcp_user_timeout; #if defined(ENABLE_GSS) || defined(ENABLE_SSPI) @@ -282,10 +284,12 @@ extern ProtocolVersion FrontendProtocol; extern int pq_getkeepalivesidle(Port *port); extern int pq_getkeepalivesinterval(Port *port); extern int pq_getkeepalivescount(Port *port); +extern int pq_gettcpusertimeout(Port *port); extern int pq_setkeepalivesidle(int idle, Port *port); extern int pq_setkeepalivesinterval(int interval, Port *port); extern int pq_setkeepalivescount(int count, Port *port); +extern int pq_settcpusertimeout(int timeout, Port *port); extern void SetSockKeepAlive(int sock); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 2634e983..c3353b72 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -307,6 +307,8 @@ extern char *nls_sort_locale; extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; +extern int tcp_user_timeout; + #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 9bcefd3e..5cf94b2f 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -322,6 +322,10 @@ static const internalPQconninfoOption PQconninfoOptions[] = { "TCP-Keepalives-Count", "", 10, /* strlen(INT32_MAX) == 10 */ offsetof(struct pg_conn, keepalives_count)}, + {"tcp_user_timeout", NULL, NULL, NULL, + "TCP-User-Timeout", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, pgtcp_user_timeout)}, + /* * ssl options are allowed even without client SSL support because the * client can still handle SSL modes "disable" and "allow". Other @@ -1781,6 +1785,41 @@ setKeepalivesWin32(PGconn *conn) #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ +/* + * Set the TCP user timeout. + */ +static int +setTCPUserTimeout(PGconn *conn) +{ + int timeout; + + if (conn->pgtcp_user_timeout == NULL) + return 1; + + if (!parse_int_param(conn->pgtcp_user_timeout, &timeout, conn, + "tcp_user_timeout")) + return 0; + + if (timeout < 0) + timeout = 0; + +#ifdef TCP_USER_TIMEOUT + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &timeout, sizeof(timeout)) < 0) + { + char sebuf[256]; + + appendPQExpBuffer(&conn->errorMessage, + libpq_gettext("setsockopt(%s) failed: %s\n"), + "TCP_USER_TIMEOUT", + SOCK_STRERROR(SOCK_ERRNO, sebuf, sizeof(sebuf))); + return 0; + } +#endif + + return 1; +} + /* ---------- * connectDBStart - * Begin the process of making a connection to the backend. 
@@ -2302,6 +2341,8 @@ PQconnectPoll(PGconn *conn) err = 1; #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ + else if (!setTCPUserTimeout(conn)) + err = 1; if (err) { @@ -3561,6 +3602,8 @@ freePGconn(PGconn *conn) free(conn->pgtty); if (conn->connect_timeout) free(conn->connect_timeout); + if (conn->pgtcp_user_timeout) + free(conn->pgtcp_user_timeout); if (conn->pgoptions) free(conn->pgoptions); if (conn->appname) diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index 4a3c071f..a51f3b7b 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -398,6 +398,7 @@ struct pg_conn char *pgtty; /* tty on which the backend messages is * displayed (OBSOLETE, NOT USED) */ char *connect_timeout; /* connection timeout (numeric string) */ + char *pgtcp_user_timeout; /* tcp user timeout (numeric string) */ char *client_encoding_initial; /* encoding to use */ char *pgoptions; /* options to start the backend with */ char *appname; /* application name */ From 6ecd3b1bca35b6ea546acd504adca27d7196f650 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 18 Jan 2022 14:02:43 -0500 Subject: [PATCH 512/578] Make PQcancel use the PGconn's tcp_user_timeout and keepalives settings. If connectivity to the server has been lost or become flaky, the user might well try to send a query cancel. It's highly annoying if PQcancel hangs up in such a case, but that's exactly what's likely to happen. To ameliorate this problem, apply the PGconn's tcp_user_timeout and keepalives settings to the TCP connection used to send the cancel. This should be safe on Unix machines, since POSIX specifies that setsockopt() is async-signal-safe. We are guessing that WSAIoctl(SIO_KEEPALIVE_VALS) is similarly safe on Windows. (Note that at least in psql and our other frontend programs, there's no safety issue involved anyway, since we run PQcancel in its own thread rather than in a signal handler.) Most of the value here comes from the expectation that tcp_user_timeout will be applied as a connection timeout. That appears to happen on Linux, even though its tcp(7) man page claims differently. The keepalive options probably won't help much, but as long as we can apply them for not much code, we might as well. Jelte Fennema, reviewed by Fujii Masao and myself Discussion: https://postgr.es/m/AM5PR83MB017870DE81FC84D5E21E9D1EF7AA9@AM5PR83MB0178.EURPRD83.prod.outlook.com --- doc/src/sgml/libpq.sgml | 4 +- src/interfaces/libpq/fe-connect.c | 251 +++++++++++++++++++++++------- src/interfaces/libpq/libpq-int.h | 7 + 3 files changed, 205 insertions(+), 57 deletions(-) diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index dfa8c5f8..1331c0d8 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -4965,8 +4965,8 @@ int PQrequestCancel(PGconn *conn); PGconn object, and in case of failure stores the error message in the PGconn object (whence it can be retrieved by PQerrorMessage). Although - the functionality is the same, this approach creates hazards for - multiple-thread programs and signal handlers, since it is possible + the functionality is the same, this approach is not safe within + multiple-thread programs or signal handlers, since it is possible that overwriting the PGconn's error message will mess up the operation currently in progress on the connection. 
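As the documentation change above notes, PQcancel() (unlike PQrequestCancel()) is safe to use from a signal handler or another thread. A minimal usage sketch under that model follows; the helper names are illustrative and not part of libpq. With this patch, the PGcancel object obtained from PQgetCancel() also carries the connection's keepalive and tcp_user_timeout settings, so a cancel sent over a dead link can time out instead of hanging, at least where tcp_user_timeout is honored.

#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <libpq-fe.h>

static PGcancel *volatile cancel_obj = NULL;

/* Call once the connection has been established (illustrative helper). */
void
prepare_cancel(PGconn *conn)
{
    /* The PGcancel inherits the connection's timeout-related settings. */
    cancel_obj = PQgetCancel(conn);
}

static void
handle_sigint(int signo)
{
    char errbuf[256];

    (void) signo;
    /* PQcancel() is documented as safe to call from a signal handler. */
    if (cancel_obj != NULL && !PQcancel(cancel_obj, errbuf, sizeof(errbuf)))
        (void) write(STDERR_FILENO, errbuf, strlen(errbuf));
}

void
install_cancel_handler(void)
{
    signal(SIGINT, handle_sigint);
}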
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 5cf94b2f..1b7e3fe7 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -1737,26 +1737,17 @@ setKeepalivesCount(PGconn *conn) /* * Enable keepalives and set the keepalive values on Win32, * where they are always set in one batch. + * + * CAUTION: This needs to be signal safe, since it's used by PQcancel. */ static int -setKeepalivesWin32(PGconn *conn) +setKeepalivesWin32(pgsocket sock, int idle, int interval) { struct tcp_keepalive ka; DWORD retsize; - int idle = 0; - int interval = 0; - if (conn->keepalives_idle && - !parse_int_param(conn->keepalives_idle, &idle, conn, - "keepalives_idle")) - return 0; if (idle <= 0) idle = 2 * 60 * 60; /* 2 hours = default */ - - if (conn->keepalives_interval && - !parse_int_param(conn->keepalives_interval, &interval, conn, - "keepalives_interval")) - return 0; if (interval <= 0) interval = 1; /* 1 second = default */ @@ -1764,7 +1755,7 @@ setKeepalivesWin32(PGconn *conn) ka.keepalivetime = idle * 1000; ka.keepaliveinterval = interval * 1000; - if (WSAIoctl(conn->sock, + if (WSAIoctl(sock, SIO_KEEPALIVE_VALS, (LPVOID) &ka, sizeof(ka), @@ -1774,6 +1765,26 @@ setKeepalivesWin32(PGconn *conn) NULL, NULL) != 0) + return 0; + return 1; +} + +static int +prepKeepalivesWin32(PGconn *conn) +{ + int idle = -1; + int interval = -1; + + if (conn->keepalives_idle && + !parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; + if (conn->keepalives_interval && + !parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; + + if (!setKeepalivesWin32(conn->sock, idle, interval)) { appendPQExpBuffer(&conn->errorMessage, libpq_gettext("WSAIoctl(SIO_KEEPALIVE_VALS) failed: %ui\n"), @@ -2337,7 +2348,7 @@ PQconnectPoll(PGconn *conn) err = 1; #else /* WIN32 */ #ifdef SIO_KEEPALIVE_VALS - else if (!setKeepalivesWin32(conn)) + else if (!prepKeepalivesWin32(conn)) err = 1; #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ @@ -3923,8 +3934,53 @@ PQgetCancel(PGconn *conn) memcpy(&cancel->raddr, &conn->raddr, sizeof(SockAddr)); cancel->be_pid = conn->be_pid; cancel->be_key = conn->be_key; + /* We use -1 to indicate an unset connection option */ + cancel->pgtcp_user_timeout = -1; + cancel->keepalives = -1; + cancel->keepalives_idle = -1; + cancel->keepalives_interval = -1; + cancel->keepalives_count = -1; + if (conn->pgtcp_user_timeout != NULL) + { + if (!parse_int_param(conn->pgtcp_user_timeout, + &cancel->pgtcp_user_timeout, + conn, "tcp_user_timeout")) + goto fail; + } + if (conn->keepalives != NULL) + { + if (!parse_int_param(conn->keepalives, + &cancel->keepalives, + conn, "keepalives")) + goto fail; + } + if (conn->keepalives_idle != NULL) + { + if (!parse_int_param(conn->keepalives_idle, + &cancel->keepalives_idle, + conn, "keepalives_idle")) + goto fail; + } + if (conn->keepalives_interval != NULL) + { + if (!parse_int_param(conn->keepalives_interval, + &cancel->keepalives_interval, + conn, "keepalives_interval")) + goto fail; + } + if (conn->keepalives_count != NULL) + { + if (!parse_int_param(conn->keepalives_count, + &cancel->keepalives_count, + conn, "keepalives_count")) + goto fail; + } return cancel; + +fail: + free(cancel); + return NULL; } /* PQfreeCancel: free a cancel structure */ @@ -3937,14 +3993,36 @@ PQfreeCancel(PGcancel *cancel) /* - * PQcancel and PQrequestCancel: attempt to request cancellation of the - * current operation. 
+ * Sets an integer socket option on a TCP socket, if the provided value is + * not negative. Returns false if setsockopt fails for some reason. + * + * CAUTION: This needs to be signal safe, since it's used by PQcancel. + */ +#if defined(TCP_USER_TIMEOUT) || !defined(WIN32) +static bool +optional_setsockopt(int fd, int protoid, int optid, int value) +{ + if (value < 0) + return true; + if (setsockopt(fd, protoid, optid, (char *) &value, sizeof(value)) < 0) + return false; + return true; +} +#endif + + +/* + * PQcancel: request query cancel * * The return value is TRUE if the cancel request was successfully * dispatched, FALSE if not (in which case an error message is available). * Note: successful dispatch is no guarantee that there will be any effect at * the backend. The application must read the operation result as usual. * + * On failure, an error message is stored in *errbuf, which must be of size + * errbufsize (recommended size is 256 bytes). *errbuf is not changed on + * success return. + * * CAUTION: we want this routine to be safely callable from a signal handler * (for example, an application might want to call it in a SIGINT handler). * This means we cannot use any C library routine that might be non-reentrant. @@ -3952,14 +4030,10 @@ PQfreeCancel(PGcancel *cancel) * just as dangerous. We avoid sprintf here for that reason. Building up * error messages with strcpy/strcat is tedious but should be quite safe. * We also save/restore errno in case the signal handler support doesn't. - * - * internal_cancel() is an internal helper function to make code-sharing - * between the two versions of the cancel function possible. */ -static int -internal_cancel(SockAddr *raddr, int be_pid, int be_key, - char *errbuf, int errbufsize) -{// #lizard forgives +int +PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) +{ int save_errno = SOCK_ERRNO; pgsocket tmpsock = PGINVALID_SOCKET; char sebuf[256]; @@ -3970,18 +4044,98 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key, CancelRequestPacket cp; } crp; + if (!cancel) + { + strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize); + /* strlcpy probably doesn't change errno, but be paranoid */ + SOCK_ERRNO_SET(save_errno); + return false; + } + /* * We need to open a temporary connection to the postmaster. Do this with * only kernel calls. */ - if ((tmpsock = socket(raddr->addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET) + if ((tmpsock = socket(cancel->raddr.addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET) { strlcpy(errbuf, "PQcancel() -- socket() failed: ", errbufsize); goto cancel_errReturn; } + + /* + * Since this connection will only be used to send a single packet of + * data, we don't need NODELAY. We also don't set the socket to + * nonblocking mode, because the API definition of PQcancel requires the + * cancel to be sent in a blocking way. + * + * We do set socket options related to keepalives and other TCP timeouts. + * This ensures that this function does not block indefinitely when + * reasonable keepalive and timeout settings have been provided. 
+ */ + if (!IS_AF_UNIX(cancel->raddr.addr.ss_family) && + cancel->keepalives != 0) + { +#ifndef WIN32 + if (!optional_setsockopt(tmpsock, SOL_SOCKET, SO_KEEPALIVE, 1)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(SO_KEEPALIVE) failed: ", errbufsize); + goto cancel_errReturn; + } + +#ifdef PG_TCP_KEEPALIVE_IDLE + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, PG_TCP_KEEPALIVE_IDLE, + cancel->keepalives_idle)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(" PG_TCP_KEEPALIVE_IDLE_STR ") failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#ifdef TCP_KEEPINTVL + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPINTVL, + cancel->keepalives_interval)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPINTVL) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#ifdef TCP_KEEPCNT + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPCNT, + cancel->keepalives_count)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPCNT) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#else /* WIN32 */ + +#ifdef SIO_KEEPALIVE_VALS + if (!setKeepalivesWin32(tmpsock, + cancel->keepalives_idle, + cancel->keepalives_interval)) + { + strlcpy(errbuf, "PQcancel() -- WSAIoctl(SIO_KEEPALIVE_VALS) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif /* SIO_KEEPALIVE_VALS */ +#endif /* WIN32 */ + + /* TCP_USER_TIMEOUT works the same way on Unix and Windows */ +#ifdef TCP_USER_TIMEOUT + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_USER_TIMEOUT, + cancel->pgtcp_user_timeout)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_USER_TIMEOUT) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + } + retry3: - if (connect(tmpsock, (struct sockaddr *) &raddr->addr, - raddr->salen) < 0) + if (connect(tmpsock, (struct sockaddr *) &cancel->raddr.addr, + cancel->raddr.salen) < 0) { if (SOCK_ERRNO == EINTR) /* Interrupted system call - we'll just try again */ @@ -3990,16 +4144,12 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key, goto cancel_errReturn; } - /* - * We needn't set nonblocking I/O or NODELAY options here. - */ - /* Create and send the cancel request packet. */ crp.packetlen = htonl((uint32) sizeof(crp)); crp.cp.cancelRequestCode = (MsgType) htonl(CANCEL_REQUEST_CODE); - crp.cp.backendPID = htonl(be_pid); - crp.cp.cancelAuthCode = htonl(be_key); + crp.cp.backendPID = htonl(cancel->be_pid); + crp.cp.cancelAuthCode = htonl(cancel->be_key); retry4: if (send(tmpsock, (char *) &crp, sizeof(crp), 0) != (int) sizeof(crp)) @@ -4149,27 +4299,6 @@ internal_end_query(SockAddr *raddr, int be_pid, int be_key, #endif -/* - * PQcancel: request query cancel - * - * Returns TRUE if able to send the cancel request, FALSE if not. - * - * On failure, an error message is stored in *errbuf, which must be of size - * errbufsize (recommended size is 256 bytes). *errbuf is not changed on - * success return. 
- */ -int -PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) -{ - if (!cancel) - { - strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize); - return FALSE; - } - - return internal_cancel(&cancel->raddr, cancel->be_pid, cancel->be_key, - errbuf, errbufsize); -} #ifdef __TBASE__ int @@ -4203,6 +4332,7 @@ int PQrequestCancel(PGconn *conn) { int r; + PGcancel *cancel; /* Check we have an open connection */ if (!conn) @@ -4218,8 +4348,19 @@ PQrequestCancel(PGconn *conn) return FALSE; } - r = internal_cancel(&conn->raddr, conn->be_pid, conn->be_key, - conn->errorMessage.data, conn->errorMessage.maxlen); + cancel = PQgetCancel(conn); + if (cancel) + { + r = PQcancel(cancel, conn->errorMessage.data, + conn->errorMessage.maxlen); + PQfreeCancel(cancel); + } + else + { + strlcpy(conn->errorMessage.data, "out of memory", + conn->errorMessage.maxlen); + r = false; + } if (!r) conn->errorMessage.len = strlen(conn->errorMessage.data); diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index a51f3b7b..ad150bf7 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -576,6 +576,13 @@ struct pg_cancel SockAddr raddr; /* Remote address */ int be_pid; /* PID of backend --- needed for cancels */ int be_key; /* key of backend --- needed for cancels */ + int pgtcp_user_timeout; /* tcp user timeout */ + int keepalives; /* use TCP keepalives? */ + int keepalives_idle; /* time between TCP keepalives */ + int keepalives_interval; /* time between TCP keepalive + * retransmits */ + int keepalives_count; /* maximum number of TCP keepalive + * retransmits */ }; From 54fb76aa54441fa425cec2ee622259e79d53226a Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Mar 2022 21:34:13 +0800 Subject: [PATCH 513/578] fix hang when pqcancel http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131097019641 --- src/backend/libpq/pqcomm.c | 1 + src/backend/pgxc/pool/poolmgr.c | 54 +++++++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 22 ++++++-------- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index bd089ae0..db3b1ea1 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2071,6 +2071,7 @@ SetSockKeepAlive(int sock) { elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } +} int pq_gettcpusertimeout(Port *port) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 756b2198..72d5786e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -7418,6 +7418,57 @@ connect_pools(void) } } +/* + * Set cancel socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. + * see SetSockKeepAlive + */ +static void +set_cancel_conn_keepalive(PGcancel *cancelConn) +{ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? 
+ 0 : tcp_keepalives_idle * (uint32) 1000; + + if (cancelConn == NULL) + { + return; + } + + /* + * If the connection did not use the connection option + * set the option here + * */ + if (cancelConn->keepalives == -1) + { + /* use TCP keepalives */ + cancelConn->keepalives = 1; + + if (tcp_keepalives_idle > 0) + { + /* time between TCP keepalives */ + cancelConn->keepalives_idle = tcp_keepalives_idle; + } + + if (tcp_keepalives_interval > 0) + { + /*time between TCP keepalive retransmits */ + cancelConn->keepalives_interval = tcp_keepalives_interval; + } + + if (tcp_keepalives_count > 0) + { + /* maximum number of TCP keepalive retransmits */ + cancelConn->keepalives_count = tcp_keepalives_count; + } + } + + if (cancelConn->pgtcp_user_timeout == -1 && user_timeout > 0) + { + /* tcp user timeout */ + cancelConn->pgtcp_user_timeout = user_timeout; + } +} + static bool preconnect_and_warm(DatabasePool *dbPool) {// #lizard forgives @@ -7521,6 +7572,7 @@ preconnect_and_warm(DatabasePool *dbPool) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); /* Increase count of pool size */ nodePool->slot[nodePool->freeSize] = slot; @@ -7628,6 +7680,7 @@ void *pooler_async_connection_management_thread(void *arg) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); slot->bwarmed = false; SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); } break; } @@ -7910,6 +7963,7 @@ void *pooler_sync_remote_operator_thread(void *arg) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); slot->bwarmed = false; SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); /* set the time flags */ slot->released = time(NULL); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b53a3c57..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4298,7 +4298,16 @@ static struct config_int ConfigureNamesInt[] = 0, 0, INT_MAX, NULL, assign_tcp_keepalives_count, show_tcp_keepalives_count }, - + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, { {"gin_fuzzy_search_limit", PGC_USERSET, CLIENT_CONN_OTHER, gettext_noop("Sets the maximum allowed result for exact search by GIN."), @@ -4955,17 +4964,6 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, - { - {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, - gettext_noop("TCP user timeout."), - gettext_noop("A value of 0 uses the system default."), - GUC_UNIT_MS - }, - &tcp_user_timeout, - 0, 0, INT_MAX, - NULL, assign_tcp_user_timeout, show_tcp_user_timeout - }, - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 8cb8cf654b937bef3768a8a48fd0c4a073707201 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 17 Mar 2022 19:14:19 +0800 Subject: [PATCH 514/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. 
The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 47 +++++-------------------------- src/backend/commands/vacuum.c | 17 +++++------ src/backend/utils/adt/ruleutils.c | 24 ++++++++++++++++ src/include/utils/ruleutils.h | 5 +++- 4 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index d342a1ac..edd33ef1 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -170,34 +170,6 @@ analyze_rel(Oid relid, RangeVar *relation, int options, int elevel; AcquireSampleRowsFunc acquirefunc = NULL; BlockNumber relpages = 0; -#ifdef __TBASE__ - List *childs = NULL; - Oid child; - ListCell *lc; - - if(!IsAutoVacuumWorkerProcess()) - { - onerel = relation_open(relid, NoLock); - - if(RELATION_IS_INTERVAL(onerel)) - { - childs = RelationGetAllPartitions(onerel); - foreach(lc, childs) - { - child = lfirst_oid(lc); - analyze_rel(child, relation, options, params, va_cols, in_outer_xact, - bstrategy); - } - if (childs) - pfree(childs); - childs = NULL; - CommandCounterIncrement(); - } - - relation_close(onerel, NoLock); - onerel = NULL; - } -#endif /* Select logging level */ if (options & VACOPT_VERBOSE) @@ -1549,7 +1521,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, */ if (RELATION_IS_INTERVAL(onerel)) { - tableOIDs = RelationGetAllPartitions(onerel); + tableOIDs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); } else { @@ -1562,8 +1534,9 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, * child but no longer does. In that case, we can clear the * relhassubclass field so as not to make the same mistake again later. * (This is safe because we hold ShareUpdateExclusiveLock.) 
+ * */ - if (list_length(tableOIDs) < 2) + if (list_length(tableOIDs) < 2 && !(list_length(tableOIDs) == 1 && RELATION_IS_INTERVAL(onerel))) { /* CCI because we already updated the pg_class row in this command */ CommandCounterIncrement(); @@ -1594,14 +1567,8 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, BlockNumber relpages = 0; /* We already got the needed lock */ - if (RELATION_IS_INTERVAL(onerel)) - { - childrel = heap_open(childOID, AccessShareLock); - } - else - { childrel = heap_open(childOID, NoLock); - } + /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) @@ -4878,12 +4845,12 @@ get_rel_pages_visiblepages(Relation onerel, if (RELATION_IS_INTERVAL(onerel)) { - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); } else { childs = - find_all_inheritors(RelationGetRelid(onerel), NoLock, NULL); + find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL); } *pages = 0; @@ -4896,7 +4863,7 @@ get_rel_pages_visiblepages(Relation onerel, BlockNumber visible; /* We already got the needed lock */ - childrel = heap_open(childOID, AccessShareLock); + childrel = heap_open(childOID, NoLock); /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 3c337815..7f6c4e18 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -456,6 +456,15 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if (include_parts) oid_list = list_concat(oid_list, find_all_inheritors(relid, NoLock, NULL)); + else if (!IsAutoVacuumWorkerProcess() && + classForm->relpartkind == RELPARTKIND_PARENT) + { + Relation p_rel; + p_rel = relation_open(relid, NoLock); + oid_list = lappend_oid(oid_list, relid); + oid_list = list_concat(oid_list, RelationGetAllPartitions(p_rel)); + relation_close(p_rel, NoLock); + } else oid_list = lappend_oid(oid_list, relid); MemoryContextSwitchTo(oldcontext); @@ -1266,14 +1275,6 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) Oid save_userid; int save_sec_context; int save_nestlevel; -#ifdef __TBASE__ - bool part_vacuum_result = true; - List *childs = NULL; - List *new_childs = NULL; - Oid child; - ListCell *lc; - MemoryContext oldmctx; -#endif Assert(params != NULL); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 6b2dd38b..80623c91 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -93,6 +93,7 @@ #include "postmaster/postmaster.h" #endif +#include "storage/lmgr.h" /* ---------- * Pretty formatting constants * ---------- @@ -12050,6 +12051,12 @@ RelationGetPartitionByValue(Relation rel, Const *value) List * RelationGetAllPartitions(Relation rel) +{ + return RelationGetAllPartitionsWithLock(rel, NoLock); +} + +List * +RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) { int nparts = 0; char *partname = NULL; @@ -12072,7 +12079,24 @@ RelationGetAllPartitions(Relation rel) { continue; } + if (lockmode != NoLock) + { + /* Get the lock to synchronize against concurrent drop */ + LockRelationOid(partoid, lockmode); + /* + * Now that we have the lock, double-check to see if the relation + * really exists or not. If not, assume it was dropped while we + * waited to acquire lock, and ignore it. 
+ */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(partoid))) + { + /* Release useless lock */ + UnlockRelationOid(partoid, lockmode); + /* And ignore this relation */ + continue; + } + } result = lappend_oid(result, partoid); } diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index 5dc0e217..db0106ea 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -76,7 +76,8 @@ #include "nodes/nodes.h" #include "nodes/parsenodes.h" #include "nodes/pg_list.h" - +#include "nodes/relation.h" +#include "storage/lockdefs.h" extern char *pg_get_indexdef_string(Oid indexrelid); extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty); @@ -101,6 +102,8 @@ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); extern int RelationGetPartitionIdxByValue(Relation rel, Datum value); extern List *RelationGetAllPartitions(Relation rel); +extern List *RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode); +extern int GetAllPartitionIntervalCount(Oid parent_oid); extern int GetAllPartitionIntervalCount(Oid parent_oid); From 03fe6cadb50c582b6e327cb3c12f7de2ac48f627 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 17 Mar 2022 19:59:36 +0800 Subject: [PATCH 515/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 33 ++++++++++++++++++++ src/backend/commands/vacuum.c | 55 ++++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index edd33ef1..679c521a 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -170,6 +170,39 @@ analyze_rel(Oid relid, RangeVar *relation, int options, int elevel; AcquireSampleRowsFunc acquirefunc = NULL; BlockNumber relpages = 0; +#ifdef __TBASE__ + List *childs = NULL; + Oid child; + ListCell *lc; + if (!IsAutoVacuumWorkerProcess()) + { + onerel = try_relation_open(relid, NoLock); + if(!onerel) + return; + + if (RELATION_IS_INTERVAL(onerel)) + { + childs = RelationGetAllPartitions(onerel); + foreach (lc, childs) + { + child = lfirst_oid(lc); + analyze_rel(child, + relation, + options, + params, + va_cols, + in_outer_xact, + bstrategy); + } + if (childs) + pfree(childs); + childs = NULL; + CommandCounterIncrement(); + } + relation_close(onerel, NoLock); + onerel = NULL; + } +#endif /* Select logging level */ if (options & VACOPT_VERBOSE) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7f6c4e18..f9c3f3dc 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -456,15 +456,6 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if (include_parts) oid_list = list_concat(oid_list, find_all_inheritors(relid, NoLock, NULL)); - else if (!IsAutoVacuumWorkerProcess() && - classForm->relpartkind == RELPARTKIND_PARENT) - { - Relation p_rel; - p_rel = relation_open(relid, NoLock); - oid_list = lappend_oid(oid_list, relid); - oid_list = list_concat(oid_list, RelationGetAllPartitions(p_rel)); - relation_close(p_rel, NoLock); - } else oid_list = lappend_oid(oid_list, relid); MemoryContextSwitchTo(oldcontext); @@ -500,6 +491,9 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if 
(classForm->relpartkind == RELPARTKIND_CHILD) continue; + if (classForm->relpartkind == RELPARTKIND_CHILD) + continue; + /* Make a relation list entry for this guy */ oldcontext = MemoryContextSwitchTo(vac_context); oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple)); @@ -1275,6 +1269,15 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) Oid save_userid; int save_sec_context; int save_nestlevel; +#ifdef __TBASE__ + bool part_vacuum_result = true; + List *childs = NULL; + List *new_childs = NULL; + Oid child; + ListCell *lc; + MemoryContext oldmctx; +#endif + Assert(params != NULL); @@ -1285,19 +1288,25 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetLocalTransactionSnapshot()); - onerel = relation_open(relid, NoLock); + onerel = try_relation_open(relid, NoLock); + if (!onerel) + { + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - if(RELATION_IS_INTERVAL(onerel)) + if (RELATION_IS_INTERVAL(onerel)) { - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitions(onerel); - oldmctx = MemoryContextSwitchTo(vac_context); + oldmctx = MemoryContextSwitchTo(vac_context); new_childs = list_copy(childs); MemoryContextSwitchTo(oldmctx); - if (childs) - pfree(childs); - childs = NULL; + if (childs) + pfree(childs); + childs = NULL; onerelid = onerel->rd_lockInfo.lockRelId; LockRelationIdForSession(&onerelid, RowExclusiveLock); } @@ -1307,23 +1316,23 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) PopActiveSnapshot(); CommitTransactionCommand(); - - if(new_childs) + + if (new_childs) { - foreach(lc, new_childs) + foreach (lc, new_childs) { - child = lfirst_oid(lc); + child = lfirst_oid(lc); part_vacuum_result = vacuum_rel(child, relation, options, params); } UnlockRelationIdForSession(&onerelid, RowExclusiveLock); pfree(new_childs); - if(!part_vacuum_result) + if (!part_vacuum_result) { return false; - } + } } - } + } #endif /* Begin a transaction for vacuuming this relation */ From f11a813eaf8cab1f4890a09d2b198a5f44630e25 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Fri, 18 Mar 2022 12:26:01 +0800 Subject: [PATCH 516/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. 
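
The pattern the fix relies on, sketched in isolation with a hypothetical helper (the actual changes to analyze_rel() and vacuum_rel() follow below): try_relation_open() returns NULL instead of raising an error when the relation no longer exists, so a child partition dropped after the OID list was built can simply be skipped.

#include "postgres.h"
#include "access/heapam.h"
#include "storage/lockdefs.h"

static void
process_child_if_still_there(Oid childOid)
{
	/* NULL means the child was dropped after the OID list was built */
	Relation	childrel = try_relation_open(childOid, AccessShareLock);

	if (childrel == NULL)
		return;

	/* ... analyze or vacuum the child here ... */

	relation_close(childrel, AccessShareLock);
}
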
--- src/backend/commands/analyze.c | 7 +++++-- src/backend/commands/vacuum.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 679c521a..ace0cc24 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -176,13 +176,15 @@ analyze_rel(Oid relid, RangeVar *relation, int options, ListCell *lc; if (!IsAutoVacuumWorkerProcess()) { - onerel = try_relation_open(relid, NoLock); + onerel = try_relation_open(relid, AccessShareLock); if(!onerel) return; if (RELATION_IS_INTERVAL(onerel)) { childs = RelationGetAllPartitions(onerel); + /* no need maintain parent lock,unlock and close */ + relation_close(onerel, AccessShareLock); foreach (lc, childs) { child = lfirst_oid(lc); @@ -199,7 +201,8 @@ analyze_rel(Oid relid, RangeVar *relation, int options, childs = NULL; CommandCounterIncrement(); } - relation_close(onerel, NoLock); + else + relation_close(onerel, AccessShareLock); onerel = NULL; } #endif diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f9c3f3dc..93c91f68 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1288,7 +1288,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetLocalTransactionSnapshot()); - onerel = try_relation_open(relid, NoLock); + onerel = try_relation_open(relid, AccessShareLock); if (!onerel) { PopActiveSnapshot(); @@ -1311,7 +1311,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) LockRelationIdForSession(&onerelid, RowExclusiveLock); } - relation_close(onerel, NoLock); + relation_close(onerel, AccessShareLock); onerel = NULL; PopActiveSnapshot(); From fc15164aaadd9e4e1d745ca06525994194852bb8 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Fri, 18 Mar 2022 15:34:53 +0800 Subject: [PATCH 517/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index ace0cc24..2549b543 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1570,12 +1570,18 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, * child but no longer does. In that case, we can clear the * relhassubclass field so as not to make the same mistake again later. * (This is safe because we hold ShareUpdateExclusiveLock.) - * + * No need to deal with the parent table of interval partitioned table, so tableOIDs + * only carry children table oids. 
*/ if (list_length(tableOIDs) < 2 && !(list_length(tableOIDs) == 1 && RELATION_IS_INTERVAL(onerel))) { /* CCI because we already updated the pg_class row in this command */ CommandCounterIncrement(); + /* + * the interval partitioned table has nothing to do with attribute named + * relhassubclass + */ + if(!RELATION_IS_INTERVAL(onerel)) SetRelationHasSubclass(RelationGetRelid(onerel), false); ereport(elevel, (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables", From c7bb8fd0cedc0573f6adaa6147ebf9e1461d661c Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 21 Mar 2022 17:19:29 +0800 Subject: [PATCH 518/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/vacuumlazy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 90dfe91f..70687960 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -213,7 +213,7 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) &multiXactCutoff, NULL); } - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); foreach (lc, childs) { @@ -222,7 +222,7 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) PgStat_StatTabEntry *tabentry; /* We already got the needed lock */ - childrel = heap_open(childOID, AccessShareLock); + childrel = heap_open(childOID, NoLock); /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) From 6846def21385aa794885a59ce81f4fbef7ea4046 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 15 Sep 2021 16:23:13 +0800 Subject: [PATCH 519/578] Fix plantree_walk_initplans bug that missing one input parm of walker (merge request !701) http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696092259197 --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index e9916037..9bec67b0 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3914,7 +3914,7 @@ plantree_walk_initplans(List *plans, Plan *splan = (Plan *) list_nth(subplans, (lfirst_node(SubPlan, lc))->plan_id - 1); - if (walker(splan, context)) + if (walker(splan, subplans, context)) return true; } From 4b9dcbd259c97729ae345a991668d03d8ffc32c6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 12 Apr 2022 11:55:03 +0800 Subject: [PATCH 520/578] Bugfix: report "prepared statement XXX does not exist" errors all the time after cn switch (merge request !1244), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098282911 (cherry picked from commit dbc2ef63) 774a3df1 bugfix: report prepared statement XXX does not exist errors all the time after cn switch, http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098282911 --- src/backend/pgxc/pool/pgxcnode.c | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index e4767e9d..c19325a9 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ 
b/src/backend/pgxc/pool/pgxcnode.c @@ -4033,27 +4033,6 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool node_handle = &dn_handles[node]; - if (be_pid == 0 && !raise_error) - { - PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); - continue; - } - - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); - dn_handles[node] = *node_handle; - datanode_count++; - - elog(DEBUG1, "Established a connection with datanode \"%s\"," - "remote backend PID %d, socket fd %d, global session %c", - node_handle->nodename, (int) be_pid, fdsock, - is_global_session ? 'T' : 'F'); -#ifdef _PG_REGRESS_ - elog(LOG, "Established a connection with datanode \"%s\"," - "remote backend PID %d, socket fd %d, global session %c", - node_handle->nodename, (int) be_pid, fdsock, - is_global_session ? 'T' : 'F'); -#endif - if (IS_PGXC_COORDINATOR) { char nodetype = PGXC_NODE_DATANODE; @@ -4078,6 +4057,27 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool "oid %d, type %c, max nodes %d", node_handle->nodename, nodeidx, node_handle->nodeoid, nodetype, NumDataNodes); } + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + dn_handles[node] = *node_handle; + datanode_count++; + + elog(DEBUG1, "Established a connection with datanode \"%s\"," + "remote backend PID %d, socket fd %d, global session %c", + node_handle->nodename, (int) be_pid, fdsock, + is_global_session ? 'T' : 'F'); +#ifdef _PG_REGRESS_ + elog(LOG, "Established a connection with datanode \"%s\"," + "remote backend PID %d, socket fd %d, global session %c", + node_handle->nodename, (int) be_pid, fdsock, + is_global_session ? 'T' : 'F'); +#endif } } /* Initialisation for Coordinators */ From fb5cdc4931b580fed244cd8e9a0bc0854d474b62 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 12 Apr 2022 14:20:20 +0800 Subject: [PATCH 521/578] fix: when use exetended protocol change pg_stat_activity stat to idle after the sql finished on dn --- src/backend/tcop/postgres.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 126bae58..c04d65ed 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4833,6 +4833,7 @@ PostgresMain(int argc, char *argv[], StringInfoData input_message; sigjmp_buf local_sigjmp_buf; volatile bool send_ready_for_query = true; + volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; #ifdef PGXC /* PGXC_DATANODE */ @@ -5410,7 +5411,7 @@ PostgresMain(int argc, char *argv[], * uncommitted updates (that confuses autovacuum). The notification * processor wants a call too, if we are not in a transaction block. 
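
For context, a small libpq sketch (not from this patch; the table and statement names are made up) of the extended-protocol flow this fix targets: PQprepare()/PQexecPrepared() drive Parse/Bind/Execute and end each cycle with a Sync, and with the change below the datanode reports itself idle in pg_stat_activity once that Sync has been processed.

#include "libpq-fe.h"

static void
run_prepared(PGconn *conn)
{
	const char *params[1] = {"42"};
	PGresult   *res;

	res = PQprepare(conn, "get_row", "SELECT * FROM t WHERE id = $1", 1, NULL);
	PQclear(res);

	res = PQexecPrepared(conn, "get_row", 1, params, NULL, NULL, 0);
	/* ... consume the rows ... */
	PQclear(res);

	/* after the Sync that ends this cycle, the datanode backend goes idle */
}
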
*/ - if (send_ready_for_query) + if (send_ready_for_query || need_report_activity) { if (IsAbortedTransactionBlockState()) { @@ -5447,6 +5448,7 @@ PostgresMain(int argc, char *argv[], pgstat_report_activity(STATE_IDLE, NULL); } + if(send_ready_for_query) ReadyForQuery(whereToSendOutput); #ifdef XCP @@ -5469,6 +5471,7 @@ PostgresMain(int argc, char *argv[], #endif send_ready_for_query = false; + need_report_activity = false; } /* @@ -5809,6 +5812,7 @@ PostgresMain(int argc, char *argv[], case 'L': /* sync */ pq_getmsgend(&input_message); finish_xact_command(); + need_report_activity = true; break; #ifdef __TBASE__ case 'N': From 4991a508465cb8d420e53f83c31f5ab6a88b0c0b Mon Sep 17 00:00:00 2001 From: sigmalin Date: Sat, 2 Apr 2022 16:11:07 +0800 Subject: [PATCH 522/578] fix core when create node http://tapd.woa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696098208891&jump_count=1 (merge request !1237) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'sigmalin002' into 'Tbase_v5.06.2' 问题:多个stmt的语句,在exec_simple_query进行了拆分。插件这里(pg_stat_log,pg_stat_statements中pgsl_store和pgss_store)还是按照原始的query_string去计算目标的query位置,可能会导致内存访问的问题,从而core。 修复:原来exec_simple_query中拆分sql是自己去匹配分号,修改为根据stmt中的query_location来进行拆分,拆分后的sql作为portal的sourceText,stmt里面的query_location置为0 TAPD: --bug=098208891 --- src/backend/tcop/postgres.c | 88 ++++++++++++------------------------- 1 file changed, 27 insertions(+), 61 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c04d65ed..37819150 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1159,70 +1159,43 @@ ch_is_space(char ch) * if the query string contain multi stmt */ static char* -get_myself_query_string(char* query_string, char** out_query_string) +get_myself_query_string(const char* query_string, RawStmt *parsetree) { - char *string_delimeter = NULL; - char *myself_query_string = NULL; - int myself_query_string_len = 0; - int pos = 0; - bool in_quotation = false; - int query_string_len = 0; - - if (query_string && query_string[0] != '\0') - { - /* skip space and redundant ';' */ - while (*query_string != '\0') + static StringInfo myself_query_string = NULL; + int query_location; + int query_len; + MemoryContext oldcontext; + + if (parsetree->stmt_location >= 0) { - if (ch_is_space(*query_string) || *query_string == ';') - { - query_string++; + Assert(parsetree->stmt_location <= strlen(query_string)); + query_location = parsetree->stmt_location; + /* Length of 0 (or -1) means "rest of string" */ + query_len = (parsetree->stmt_len <= 0) ? strlen(query_string) : parsetree->stmt_len; + /* update the location */ + parsetree->stmt_location = 0; } else { - break; - } - } - - if (*query_string == '\0') - { - *out_query_string = NULL; - return NULL; - } - - /* find ';' in query string, be careful of '\'' */ - query_string_len = strlen(query_string); - for (pos = 0; pos < query_string_len; pos++) - { - if (query_string[pos] == '\'') - { - in_quotation = (in_quotation) ? 
false : true; + /* If query location is unknown, distrust query_len as well */ + query_location = 0; + query_len = strlen(query_string); } - if (query_string[pos] == ';' && !in_quotation) + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + if (myself_query_string == NULL) { - string_delimeter = &query_string[pos]; - break; - } - } - - if (string_delimeter == NULL) - { - myself_query_string = query_string; - query_string = NULL; + myself_query_string = makeStringInfo(); } else { - myself_query_string_len = string_delimeter - query_string; - myself_query_string = palloc(myself_query_string_len + 1); - memcpy(myself_query_string, query_string, myself_query_string_len); - myself_query_string[myself_query_string_len] = '\0'; - - query_string = string_delimeter + 1; - } + resetStringInfo(myself_query_string); } - *out_query_string = myself_query_string; - return query_string; + appendBinaryStringInfo(myself_query_string, query_string + query_location, query_len); + MemoryContextSwitchTo(oldcontext); + + return myself_query_string->data; } /* @@ -1242,7 +1215,6 @@ exec_simple_query(const char *query_string) bool isTopLevel; char msec_str[32]; bool multiCommands = false; - char *query_string_tmp = NULL; /* * Report query to various monitoring facilities. @@ -1314,8 +1286,6 @@ exec_simple_query(const char *query_string) errmsg("COMMIT or ROLLBACK " "in multi-statement queries not allowed"))); } - - query_string_tmp = (char*) query_string; } /* @@ -1373,13 +1343,9 @@ exec_simple_query(const char *query_string) Portal portal; DestReceiver *receiver; int16 format; - char *myself_query_string = NULL; - - if (query_string_tmp && query_string_tmp[0] != '\0') - { /* get this portal's query when has multi parse tree */ - query_string_tmp = get_myself_query_string(query_string_tmp, &myself_query_string); - } + const char *myself_query_string = isTopLevel ? debug_query_string : + (const char *)get_myself_query_string(debug_query_string, parsetree); #ifdef PGXC @@ -1543,7 +1509,7 @@ exec_simple_query(const char *query_string) */ PortalDefineQuery(portal, NULL, - (myself_query_string) ? 
myself_query_string : query_string, + myself_query_string, commandTag, plantree_list, NULL); From 20e9ddda00b8ddaf82d74416ae2937db80fe7c55 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 7 Apr 2022 16:50:47 +0800 Subject: [PATCH 523/578] fix bug in PgxcNodeAlter --- src/backend/pgxc/nodemgr/nodemgr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 65fbcccd..a906e3dd 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -1461,8 +1461,6 @@ PgxcNodeAlter(AlterNodeStmt *stmt) /* Check that node exists */ if (!OidIsValid(nodeOid)) { - nodeOid = get_pgxc_nodeoid_extend(node_name, PGXCDefaultClusterName); - if (!OidIsValid(nodeOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("PGXC Node %s: object not defined", From 580a9ea2dc1a7063e346900db01412229cd4d355 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 25 Mar 2022 14:18:07 +0800 Subject: [PATCH 524/578] free planstate related memory in pg_stat_cluster_activity by a memory context tapd: http://tapd.woa.com/TEG_TBase/bugtrace/bugs/view/1020423208097794079 --- .../pg_stat_cluster_activity.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 2b36fe39..58c989f1 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -20,6 +20,7 @@ #include "storage/shmem.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -104,6 +105,8 @@ static ExecutorStart_hook_type prev_ExecutorStart = NULL; static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ +MemoryContext PGCSMemoryContext = NULL; + /* * Macros to load and store st_changecount with the memory barriers. * @@ -419,6 +422,8 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; StringInfo cursors = NULL; + ExplainState *es = NULL; + MemoryContext oldctx; if (prev_ExecutorStart) prev_ExecutorStart(desc, eflags); @@ -440,12 +445,14 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) return; } + oldctx = MemoryContextSwitchTo(PGCSMemoryContext); + if (desc->planstate != NULL) { /* make planstate text tree if enabled */ if (pgcs_enable_planstate) { - ExplainState *es = NewExplainState(); + es = NewExplainState(); es->costs = false; /* we don't want plan->targetlist been changed */ @@ -470,6 +477,9 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) cursorCollectWalker(desc->planstate, cursors); } + MemoryContextSwitchTo(oldctx); + MemoryContextResetAndDeleteChildren(PGCSMemoryContext); + increment_changecount_before(entry); if (planstate_str != NULL && planstate_str->len > 0) @@ -1140,6 +1150,9 @@ _PG_init(void) */ RequestAddinShmemSpace(pgcs_memsize()); + PGCSMemoryContext = AllocSetContextCreate(TopMemoryContext, + "pg_stat_cluster_activity planstate", + ALLOCSET_DEFAULT_SIZES); /* * Install hooks. 
*/ From ed6567e4cdefdad5903d97fd23b111e64c574715 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 14 Apr 2022 10:05:31 +0800 Subject: [PATCH 525/578] Reset PGCSMemoryContext properly in pg_stat_cluster_activity --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 58c989f1..4e721ead 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -477,9 +477,6 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) cursorCollectWalker(desc->planstate, cursors); } - MemoryContextSwitchTo(oldctx); - MemoryContextResetAndDeleteChildren(PGCSMemoryContext); - increment_changecount_before(entry); if (planstate_str != NULL && planstate_str->len > 0) @@ -491,6 +488,9 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); + + MemoryContextSwitchTo(oldctx); + MemoryContextResetAndDeleteChildren(PGCSMemoryContext); } /* ---------- From 6855a6dfe611a11229d5aae9780d9bc7139e61a1 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 14 Apr 2022 15:24:00 +0800 Subject: [PATCH 526/578] Squash merge branch 'andrelin/Tbase_v5.06.2' into 'Tbase_v5.06.2' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit revert了历史提交,用desc->estate->es_query_cxt来保证是每个query独立的context 测试了之前单进程跑的内存测试用例,带着插件跑tpcc都OK了 --- .../pg_stat_cluster_activity.c | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 4e721ead..4bd82758 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -20,7 +20,6 @@ #include "storage/shmem.h" #include "utils/builtins.h" #include "utils/guc.h" -#include "utils/memutils.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -105,8 +104,6 @@ static ExecutorStart_hook_type prev_ExecutorStart = NULL; static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ -MemoryContext PGCSMemoryContext = NULL; - /* * Macros to load and store st_changecount with the memory barriers. * @@ -422,8 +419,7 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; StringInfo cursors = NULL; - ExplainState *es = NULL; - MemoryContext oldctx; + MemoryContext oldcxt; if (prev_ExecutorStart) prev_ExecutorStart(desc, eflags); @@ -445,14 +441,18 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) return; } - oldctx = MemoryContextSwitchTo(PGCSMemoryContext); + /* + * Make sure we operate in the per-query context, so any cruft will be + * discarded later during ExecutorEnd. estate should be set by standard_ExecutorStart. 
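
As a point of reference, the general shape of this approach for an ExecutorStart hook, sketched independently of the extension's actual code: chain to the previous hook (or standard_ExecutorStart), then allocate all scratch data in the per-query context so that ExecutorEnd releases it automatically.

#include "postgres.h"
#include "executor/executor.h"

static ExecutorStart_hook_type prev_start_hook = NULL;

static void
my_executor_start(QueryDesc *queryDesc, int eflags)
{
	MemoryContext oldcxt;

	if (prev_start_hook)
		prev_start_hook(queryDesc, eflags);
	else
		standard_ExecutorStart(queryDesc, eflags);

	/* es_query_cxt exists once standard_ExecutorStart has run */
	oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);

	/* palloc() plan-state text, cursor lists, etc. here */

	MemoryContextSwitchTo(oldcxt);
}

/* in _PG_init():  prev_start_hook = ExecutorStart_hook;
 *                 ExecutorStart_hook = my_executor_start;           */
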
+ */ + oldcxt = MemoryContextSwitchTo(desc->estate->es_query_cxt); if (desc->planstate != NULL) { /* make planstate text tree if enabled */ if (pgcs_enable_planstate) { - es = NewExplainState(); + ExplainState *es = NewExplainState(); es->costs = false; /* we don't want plan->targetlist been changed */ @@ -489,8 +489,7 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) increment_changecount_after(entry); - MemoryContextSwitchTo(oldctx); - MemoryContextResetAndDeleteChildren(PGCSMemoryContext); + MemoryContextSwitchTo(oldcxt); } /* ---------- @@ -1150,9 +1149,6 @@ _PG_init(void) */ RequestAddinShmemSpace(pgcs_memsize()); - PGCSMemoryContext = AllocSetContextCreate(TopMemoryContext, - "pg_stat_cluster_activity planstate", - ALLOCSET_DEFAULT_SIZES); /* * Install hooks. */ From 20f6359700f6f9ed2be7a25e5bff1300a544aec7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 10 Feb 2022 11:23:25 +0800 Subject: [PATCH 527/578] fix size calculation error in SharedQueueShmemSize http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096562981 --- src/backend/pgxc/squeue/squeue.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 19387c66..61289161 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -772,15 +772,20 @@ SharedQueueShmemSize(void) { Size sqs_size; + /* Shared Queues Sync */ sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE); + /* Shared Queue Locks */ + sqs_size = add_size(sqs_size, mul_size((NUM_SQUEUES * (TBASE_MAX_DATANODE_NUMBER)), sizeof(LWLockPadded))); #ifdef __TBASE__ if (g_UseDataPump) { + /* Disconnect Consumers */ sqs_size = add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, sizeof(DisConsumer))); } #endif + /* Shared Queues */ if(g_UseDataPump) return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_HDR_SIZE(TBASE_MAX_DATANODE_NUMBER))); else From 77a12b5587b5ea8feb6d725114e647feea7db79d Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 7 Mar 2022 09:22:35 +0800 Subject: [PATCH 528/578] support sync statistic to other coordinator node when execute analyze with sync check pick commit: 7b23c7da --- src/backend/commands/analyze.c | 572 +++++++++++++++++++++++++++- src/backend/commands/vacuum.c | 16 +- src/backend/nodes/copyfuncs.c | 15 + src/backend/nodes/equalfuncs.c | 13 + src/backend/parser/gram.y | 43 ++- src/backend/postmaster/autovacuum.c | 3 +- src/backend/tcop/utility.c | 161 +++++++- src/include/commands/vacuum.h | 22 +- src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 9 + src/include/parser/kwlist.h | 1 + 11 files changed, 823 insertions(+), 33 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 2549b543..f0f92a5b 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -121,7 +121,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, int options, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, - bool inh, bool in_outer_xact, int elevel); + bool inh, bool in_outer_xact, int elevel, AnalyzeSyncOpt *syncOpt); static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple *rows, int numrows, @@ -139,6 +139,14 @@ static void update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats); static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum 
ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static void analyze_rel_sync(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + int nindexes, + Relation *indexes, + AnlIndexData *indexdata, + AnalyzeSyncOpt *syncOpt); #ifdef XCP static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, @@ -162,10 +170,15 @@ static int acquire_coordinator_sample_rows(Relation onerel, int elevel, * analyze_rel() -- analyze one relation */ void -analyze_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params, List *va_cols, bool in_outer_xact, - BufferAccessStrategy bstrategy) -{// #lizard forgives +analyze_rel(Oid relid, + RangeVar *relation, + int options, + VacuumParams *params, + List *va_cols, + bool in_outer_xact, + BufferAccessStrategy bstrategy, + AnalyzeSyncOpt *syncOpt) +{ Relation onerel; int elevel; AcquireSampleRowsFunc acquirefunc = NULL; @@ -194,7 +207,8 @@ analyze_rel(Oid relid, RangeVar *relation, int options, params, va_cols, in_outer_xact, - bstrategy); + bstrategy, + syncOpt); } if (childs) pfree(childs); @@ -381,14 +395,14 @@ analyze_rel(Oid relid, RangeVar *relation, int options, */ if (onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) do_analyze_rel(onerel, options, params, va_cols, acquirefunc, - relpages, false, in_outer_xact, elevel); + relpages, false, in_outer_xact, elevel, syncOpt); /* * If there are child tables, do recursive ANALYZE. */ if (onerel->rd_rel->relhassubclass) do_analyze_rel(onerel, options, params, va_cols, acquirefunc, relpages, - true, in_outer_xact, elevel); + true, in_outer_xact, elevel, syncOpt); /* * Close source relation now, but keep lock so that no one deletes it @@ -415,11 +429,17 @@ analyze_rel(Oid relid, RangeVar *relation, int options, * appropriate acquirefunc for each child table. */ static void -do_analyze_rel(Relation onerel, int options, VacuumParams *params, - List *va_cols, AcquireSampleRowsFunc acquirefunc, - BlockNumber relpages, bool inh, bool in_outer_xact, - int elevel) -{// #lizard forgives +do_analyze_rel(Relation onerel, + int options, + VacuumParams *params, + List *va_cols, + AcquireSampleRowsFunc acquirefunc, + BlockNumber relpages, + bool inh, + bool in_outer_xact, + int elevel, + AnalyzeSyncOpt *syncOpt) +{ int attr_cnt, tcnt, i, @@ -609,6 +629,24 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, onerel->rd_locator_info && !RELATION_IS_COORDINATOR_LOCAL(onerel)); + /* + * Sync statistics if this session is connected to other remote Coordinator. + * When receiving sync commands directly from the client, we also sync statistics. + */ + if (iscoordinator && IsConnFromCoord() && + (syncOpt != NULL && syncOpt->is_sync_from == true)) + { + elog(INFO, "SYNC statistic"); + analyze_rel_sync(onerel, + inh, + attr_cnt, + vacattrstats, + nindexes, + Irel, + indexdata, + syncOpt); + goto cleanup; + } #ifdef XCP #ifdef __TBASE__ if (!enable_sampling_analyze && iscoordinator) @@ -5310,3 +5348,511 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif + + +/* + * coord_collect_simple_stats + * Collect simple stats for a relation (pg_statistic contents). + * + * Collects statistics from the datanodes, and then keeps the one of the + * received statistics for each attribute (the first one we receive, but + * it's mostly random). + * + * XXX We do not try to build statistics covering data fro all the nodes, + * either by collecting fresh sample of rows or merging the statistics + * somehow. 
The current approach is very simple and cheap, but may have + * negative impact on estimate accuracy as the stats only covers data + * from a single node, and we may end up with stats from different node + * for each attribute. + */ +static void +coord_collect_stats(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats, AnalyzeSyncOpt *syncOpt) +{ + char *nspname; + char *relname; + /* Fields to run query to read statistics from data nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + /* Number of data nodes from which attribute statistics are received. */ + int *numnodes; + int reltuples; + int relpages; + int relallvisible; + bool relhasindex; + ListCell *lc; + int nodeIdx; + ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + /* Make up query string */ + initStringInfo(&query); + /* Generic statistic fields */ + appendStringInfoString(&query, + "SELECT s.staattnum, " + "c.reltuples, " + "c.relpages," + "c.relallvisible," + "c.relhasindex," + "s.stanullfrac, " + "s.stawidth, " + "s.stadistinct"); + /* Detailed statistic slots */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, ", s.stakind%d" + ", o%d.oprname" + ", no%d.nspname" + ", t%dl.typname" + ", nt%dl.nspname" + ", t%dr.typname" + ", nt%dr.nspname" + ", s.stanumbers%d" + ", s.stavalues%d", + i, i, i, i, i, i, i, i, i); + + /* Common part of FROM clause */ + appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c " + " ON s.starelid = c.oid " + "JOIN pg_namespace nc " + " ON c.relnamespace = nc.oid "); + /* Info about involved operations */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, "LEFT JOIN (pg_operator o%d " + " JOIN pg_namespace no%d " + " ON o%d.oprnamespace = no%d.oid " + " JOIN pg_type t%dl " + " ON o%d.oprleft = t%dl.oid " + " JOIN pg_namespace nt%dl " + " ON t%dl.typnamespace = nt%dl.oid " + " JOIN pg_type t%dr " + " ON o%d.oprright = t%dr.oid " + " JOIN pg_namespace nt%dr " + " ON t%dr.typnamespace = nt%dr.oid) " + " ON s.staop%d = o%d.oid ", + i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i); + appendStringInfo(&query, "WHERE nc.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + execnodes->accesstype = RELATION_ACCESS_READ; + execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + execnodes->en_expr = NULL; + execnodes->en_relid = InvalidOid; + execnodes->primarynodelist = NIL; + + foreach (lc, syncOpt->nodes) + { + char node_type = PGXC_NODE_COORDINATOR; + nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); + execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + } + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = execnodes; + step->sql_statement = query.data; + step->force_autocommit = true; + step->exec_type = EXEC_ON_COORDS; + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "staattnum")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "reltuples")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, 
"pg_class", "relpages")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stanullfrac")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stawidth")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stadistinct")); + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + /* 16 characters would be enough */ + char colname[16]; + + sprintf(colname, "stakind%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(OperatorRelationId, + "pg_operator", + "oprname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + + sprintf(colname, "stanumbers%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + sprintf(colname, "stavalues%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + } + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * Take a fresh snapshot so that we see the effects of the ANALYZE command + * on the datanode. 
That command is run in auto-commit mode hence just + * bumping up the command ID is not good enough + */ + /* PushActiveSnapshot(GetLocalTransactionSnapshot()); */ + estate->es_snapshot = GetActiveSnapshot(); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /* get ready to combine results */ + numnodes = (int *) palloc(attr_cnt * sizeof(int)); + for (i = 0; i < attr_cnt; i++) + numnodes[i] = 0; + + result = ExecRemoteQuery((PlanState *) node); + /* PopActiveSnapshot(); */ + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + int colnum = 1; + int16 attnum; + float4 nullfrac; + int32 width; + float4 distinct; + VacAttrStats *stats = NULL; + + + /* Process statistics from the data node */ + value = slot_getattr(result, colnum++, &isnull); /* staattnum */ + attnum = DatumGetInt16(value); + for (i = 0; i < attr_cnt; i++) + if (vacattrstats[i]->attr->attnum == attnum) + { + stats = vacattrstats[i]; + stats->stats_valid = true; + numnodes[i]++; + break; + } + + value = slot_getattr(result, colnum++, &isnull); /* reltuples */ + reltuples = DatumGetFloat4(value); + + value = slot_getattr(result, colnum++, &isnull); /* relpages */ + relpages = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ + relallvisible = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ + relhasindex = DatumGetBool(value); + + if (stats) + { + value = slot_getattr(result, colnum++, &isnull); /* stanullfrac */ + nullfrac = DatumGetFloat4(value); + stats->stanullfrac = nullfrac; + + value = slot_getattr(result, colnum++, &isnull); /* stawidth */ + width = DatumGetInt32(value); + stats->stawidth = width; + + value = slot_getattr(result, colnum++, &isnull); /* stadistinct */ + distinct = DatumGetFloat4(value); + stats->stadistinct = distinct; + + /* Detailed statistics */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + int16 kind; + float4 *numbers; + Datum *values; + int nnumbers, nvalues; + int k; + + value = slot_getattr(result, colnum++, &isnull); /* kind */ + kind = DatumGetInt16(value); + + if (kind == 0) + { + /* + * Empty slot - skip next 8 fields: 6 fields of the + * operation identifier and two data fields (numbers and + * values) + */ + colnum += 8; + continue; + } + else + { + Oid oprid; + + /* Get operator */ + value = slot_getattr(result, colnum++, &isnull); /* oprname */ + if (isnull) + { + /* + * Operator is not specified for that kind, skip remaining + * fields to lookup the operator + */ + oprid = InvalidOid; + colnum += 5; /* skip operation nsp and types */ + } + else + { + char *oprname; + char *oprnspname; + Oid ltypid, rtypid; + char *ltypname, + *rtypname; + char *ltypnspname, + *rtypnspname; + oprname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* oprnspname */ + oprnspname = DatumGetCString(value); + /* Get left operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + ltypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + ltypnspname = DatumGetCString(value); + ltypid = get_typname_typid(ltypname, + get_namespaceid(ltypnspname)); + /* Get right operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + rtypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + rtypnspname = DatumGetCString(value); + rtypid = get_typname_typid(rtypname, + 
get_namespaceid(rtypnspname)); + /* lookup operator */ + oprid = get_operid(oprname, ltypid, rtypid, + get_namespaceid(oprnspname)); + } + /* + * Look up a statistics slot. If there is an entry of the + * same kind already, leave it, assuming the statistics + * is approximately the same on all nodes, so values from + * one node are representing entire relation well. + * If empty slot is found store values here. If no more + * slots skip remaining values. + */ + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + if (stats->stakind[k] == 0 || + (stats->stakind[k] == kind && stats->staop[k] == oprid)) + break; + } + + if (k >= STATISTIC_NUM_SLOTS) + { + /* No empty slots */ + break; + } + + /* + * If it is an existing slot which has numbers or values + * continue to the next set. If slot exists but without + * numbers and values, try to acquire them now + */ + if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 || + stats->numvalues[k] > 0)) + { + colnum += 2; /* skip numbers and values */ + continue; + } + + /* + * Initialize slot + */ + stats->stakind[k] = kind; + stats->staop[k] = oprid; + stats->numnumbers[k] = 0; + stats->stanumbers[k] = NULL; + stats->numvalues[k] = 0; + stats->stavalues[k] = NULL; + stats->statypid[k] = InvalidOid; + stats->statyplen[k] = -1; + stats->statypalign[k] = 'i'; + stats->statypbyval[k] = true; + } + + + /* get numbers */ + value = slot_getattr(result, colnum++, &isnull); /* numbers */ + if (!isnull) + { + ArrayType *arry = DatumGetArrayTypeP(value); + + /* + * We expect the array to be a 1-D float4 array; verify that. We don't + * need to use deconstruct_array() since the array data is just going + * to look like a C array of float4 values. + */ + nnumbers = ARR_DIMS(arry)[0]; + if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || + ARR_HASNULL(arry) || + ARR_ELEMTYPE(arry) != FLOAT4OID) + elog(ERROR, "stanumbers is not a 1-D float4 array"); + numbers = (float4 *) palloc(nnumbers * sizeof(float4)); + memcpy(numbers, ARR_DATA_PTR(arry), + nnumbers * sizeof(float4)); + + /* + * Free arry if it's a detoasted copy. + */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numnumbers[k] = nnumbers; + stats->stanumbers[k] = numbers; + } + /* get values */ + value = slot_getattr(result, colnum++, &isnull); /* values */ + if (!isnull) + { + int j; + ArrayType *arry; + int16 elmlen; + bool elmbyval; + char elmalign; + arry = DatumGetArrayTypeP(value); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arry), + &elmlen, &elmbyval, &elmalign); + /* Deconstruct array into Datum elements; NULLs not expected */ + deconstruct_array(arry, + ARR_ELEMTYPE(arry), + elmlen, elmbyval, elmalign, + &values, NULL, &nvalues); + + /* + * If the element type is pass-by-reference, we now have a bunch of + * Datums that are pointers into the syscache value. Copy them to + * avoid problems if syscache decides to drop the entry. + */ + if (!elmbyval) + { + for (j = 0; j < nvalues; j++) + values[j] = datumCopy(values[j], elmbyval, elmlen); + } + + /* + * Free statarray if it's a detoasted copy. 
+ */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numvalues[k] = nvalues; + stats->stavalues[k] = values; + /* store details about values data type */ + stats->statypid[k] = ARR_ELEMTYPE(arry); + stats->statyplen[k] = elmlen; + stats->statypalign[k] = elmalign; + stats->statypbyval[k] = elmbyval; + } + } + } + + /* fetch next */ + result = ExecRemoteQuery((PlanState *) node); + } + ExecEndRemoteQuery(node); + + /* for (i = 0; i < attr_cnt; i++) */ + /* { */ + /* VacAttrStats *stats = vacattrstats[i]; */ + + /* if (numnodes[i] > 0) */ + /* { */ + /* stats->stanullfrac /= numnodes[i]; */ + /* stats->stawidth /= numnodes[i]; */ + /* stats->stadistinct /= numnodes[i]; */ + /* } */ + /* } */ + update_attstats(RelationGetRelid(onerel), + inh, + attr_cnt, + vacattrstats, + RelationGetRelPersistence(onerel)); + vac_update_relstats(onerel, + relpages, + reltuples, + relallvisible, + relhasindex, + InvalidTransactionId, + InvalidMultiXactId, + false); +} + +static void +analyze_rel_sync(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats, int nindexes, + Relation *indexes, AnlIndexData *indexdata, AnalyzeSyncOpt *syncOpt) +{ + + int i; + /* collect and fit simple statistics (pg_statistic) for the relation */ + coord_collect_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); + + /* collect and fit simple statistics (pg_statistic) for all indexes */ + for (i = 0; i < nindexes; i++) + coord_collect_stats(indexes[i], + false, + indexdata[i].attr_cnt, + indexdata[i].vacattrstats, syncOpt); + + /* extended statistics (pg_statistic) for the relation */ + /* coord_collect_extended_stats(onerel, attr_cnt); */ +} diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 93c91f68..f001e056 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -145,7 +145,7 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) /* Now go through the common routine */ vacuum(vacstmt->options, vacstmt->relation, InvalidOid, ¶ms, - vacstmt->va_cols, NULL, isTopLevel); + vacstmt->va_cols, NULL, isTopLevel, vacstmt->sync_option); } /* @@ -171,9 +171,15 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) * memory context that will not disappear at transaction commit. 
*/ void -vacuum(int options, RangeVar *relation, Oid relid, VacuumParams *params, - List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel) -{// #lizard forgives +vacuum(int options, + RangeVar *relation, + Oid relid, + VacuumParams *params, + List *va_cols, + BufferAccessStrategy bstrategy, + bool isTopLevel, + AnalyzeSyncOpt *syncOpt) +{ const char *stmttype; volatile bool in_outer_xact, use_own_xacts; @@ -344,7 +350,7 @@ vacuum(int options, RangeVar *relation, Oid relid, VacuumParams *params, } analyze_rel(relid, relation, options, params, - va_cols, in_outer_xact, vac_strategy); + va_cols, in_outer_xact, vac_strategy, syncOpt); if (use_own_xacts) { diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 702eec38..10a1d424 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4144,6 +4144,18 @@ _copyVacuumStmt(const VacuumStmt *from) COPY_SCALAR_FIELD(options); COPY_NODE_FIELD(relation); COPY_NODE_FIELD(va_cols); + COPY_NODE_FIELD(sync_option); + + return newnode; +} + +static AnalyzeSyncOpt * +_copyAnalyzeSyncOpt(const AnalyzeSyncOpt *from) +{ + AnalyzeSyncOpt *newnode = makeNode(AnalyzeSyncOpt); + + COPY_SCALAR_FIELD(is_sync_from); + COPY_NODE_FIELD(nodes); return newnode; } @@ -5914,6 +5926,9 @@ copyObjectImpl(const void *from) case T_VacuumStmt: retval = _copyVacuumStmt(from); break; + case T_AnalyzeSyncOpt: + retval = _copyAnalyzeSyncOpt(from); + break; #ifdef _SHARDING_ case T_VacuumShardStmt: retval = _copyVacuumShardStmt(from); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 7bbe8255..87934c2d 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1694,6 +1694,16 @@ _equalVacuumStmt(const VacuumStmt *a, const VacuumStmt *b) COMPARE_SCALAR_FIELD(options); COMPARE_NODE_FIELD(relation); COMPARE_NODE_FIELD(va_cols); + COMPARE_NODE_FIELD(sync_option); + + return true; +} + +static bool +_equalAnalyzeSyncOpt(const AnalyzeSyncOpt *a, const AnalyzeSyncOpt *b) +{ + COMPARE_SCALAR_FIELD(is_sync_from); + COMPARE_NODE_FIELD(nodes); return true; } @@ -3592,6 +3602,9 @@ equal(const void *a, const void *b) case T_VacuumStmt: retval = _equalVacuumStmt(a, b); break; + case T_AnalyzeSyncOpt: + retval = _equalAnalyzeSyncOpt(a, b); + break; #ifdef _SHARDING_ case T_VacuumShardStmt: retval = _equalVacuumShardStmt(a, b); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2f34a131..7efd70e5 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -263,6 +263,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); RoleSpec *rolespec; PartitionForExpr *partfor; PartitionBy *partby; + AnalyzeSyncOpt *analyze_sync_opt; } %type stmt schema_stmt @@ -634,6 +635,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type audit_stmt audit_obj_type opt_when_success_or_not success_or_not /* __AUDIT__ END */ +/* AYALYZE */ +%type analyze_sync_option /* * Non-keyword token types. These are hard-wired into the "flex" lexer. 
* They must be listed first so that their numeric codes do not depend on @@ -730,7 +733,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); SERIALIZABLE SERVER SESSION SESSION_USER SESSIONTIMEZONE SET SETS SETOF SHARDING SHARE SHOW SIMILAR SIMPLE SKIP SLOT SMALLINT SNAPSHOT SOME SQL_P STABLE STANDALONE_P START STATEMENT STATISTICS STDIN STDOUT STEP STORAGE STRICT_P STRIP_P - SUBSCRIPTION SUBSTRING SUCCESSFUL SYMMETRIC SYSDATE SYSID SYSTEM_P SYSTIMESTAMP + SUBSCRIPTION SUBSTRING SUCCESSFUL SYMMETRIC SYNC SYSDATE SYSID SYSTEM_P SYSTIMESTAMP TABLE TABLES TABLESAMPLE TABLESPACE TBASE_P TEMP TEMPLATE TEMPORARY TEXT_P THEN TIME TIMESTAMP TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P @@ -11060,7 +11063,7 @@ vacuum_option_elem: ; AnalyzeStmt: - analyze_keyword opt_verbose + analyze_keyword opt_verbose analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE; @@ -11068,9 +11071,10 @@ AnalyzeStmt: n->options |= VACOPT_VERBOSE; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $3; $$ = (Node *)n; } - | analyze_keyword opt_verbose qualified_name opt_name_list + | analyze_keyword opt_verbose qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE; @@ -11078,22 +11082,25 @@ AnalyzeStmt: n->options |= VACOPT_VERBOSE; n->relation = $3; n->va_cols = $4; + n->sync_option = $5; $$ = (Node *)n; } - | analyze_keyword '(' analyze_option_list ')' + | analyze_keyword '(' analyze_option_list ')' analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE | $3; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *)n; } - | analyze_keyword '(' analyze_option_list ')' qualified_name opt_name_list + | analyze_keyword '(' analyze_option_list ')' qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE | $3; n->relation = $5; n->va_cols = $6; + n->sync_option = $7; $$ = (Node *)n; } ; @@ -11103,6 +11110,31 @@ analyze_keyword: | ANALYSE /* British */ {} ; +analyze_sync_option : +/* SYNC + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = NIL; + $$ = (Node *)n; + } + |*/ SYNC TO pgxcnode_list + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = $3; + $$ = n; + } + | SYNC FROM pgxcnode_list + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = true; + n->nodes = $3; + $$ = n; + } + | /*EMPTY*/ { $$ = NULL; } + ; + opt_verbose: VERBOSE { $$ = TRUE; } | /*EMPTY*/ { $$ = FALSE; } @@ -16883,6 +16915,7 @@ unreserved_keyword: | STRICT_P | STRIP_P | SUBSCRIPTION + | SYNC | SYSID | SYSTEM_P | TABLES diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index f05013fa..f7f9904e 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -3203,8 +3203,9 @@ autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) /* Let pgstat know what we're doing */ autovac_report_activity(tab); + /* no need sync for auto vacuum and/or analyze*/ vacuum(tab->at_vacoptions, &rangevar, tab->at_relid, &tab->at_params, NIL, - bstrategy, true); + bstrategy, true, NULL); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 799b83a2..6521598b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -711,8 +711,47 @@ ProcessUtilityPre(PlannedStmt *pstmt, VacuumStmt *stmt = 
(VacuumStmt *) parsetree; /* we choose to allow this during "read only" transactions */ - PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? - "VACUUM" : "ANALYZE"); + PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? "VACUUM" + : "ANALYZE"); + /* When statement is emit by the coordinating node, the statement is not + * rewritten, we adapt it here */ + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && + (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + { + stmt->sync_option->is_sync_from = true; + list_free_deep(stmt->sync_option->nodes); + stmt->sync_option->nodes = NIL; + stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); + } + if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option && + stmt->sync_option->nodes != NIL) + { + const ListCell *cell; + char node_type = PGXC_NODE_COORDINATOR; + foreach (cell, stmt->sync_option->nodes) + { + if (0 == strcmp(strVal(lfirst(cell)), PGXCNodeName)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Can not sync to/from local!"))); + + PGXCNodeGetNodeIdFromName(strVal(lfirst(cell)), &node_type); + if (node_type == PGXC_NODE_NONE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not find coordinator %s!", + strVal(lfirst(cell))))); + } + if (node_type != PGXC_NODE_COORDINATOR) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("node %s is not coordinator!", + strVal(lfirst(cell))))); + } + } + } /* * We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes @@ -1275,6 +1314,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, bool auto_commit = false; bool add_context = false; RemoteQueryExecType exec_type = EXEC_ON_NONE; + ExecNodes *exec_nodes = NULL; /* * auto_commit and is_temp is initialised to false and changed if required. 
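
For illustration, the extended syntax wired up by the grammar and ProcessUtilityPre hunks above can be exercised as sketched below; coordinator names cn1/cn2 and table t1 are hypothetical, used only to show the intended flow.

-- On coordinator cn1 (hypothetical name): analyze t1 locally, then have the
-- resulting statistics propagated to coordinator cn2.
ANALYZE t1 SYNC TO cn2;

-- When cn2 receives the statement forwarded from cn1, the ProcessUtilityPre
-- hunk above rewrites sync_option into the pull form, i.e. as if the client
-- had issued on cn2:
ANALYZE t1 SYNC FROM cn1;
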
@@ -1313,7 +1353,122 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_NotifyStmt: case T_ListenStmt: case T_UnlistenStmt: + break; case T_VacuumStmt: + { + VacuumStmt *vstmt = (VacuumStmt *)parsetree; + if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && + (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) + { + exec_type = EXEC_ON_COORDS; + if (vstmt->sync_option->nodes) + { + ListCell *lc; + int nodeIdx; + exec_nodes = (ExecNodes *)makeNode(ExecNodes); + exec_nodes->accesstype = RELATION_ACCESS_INSERT; + exec_nodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + exec_nodes->en_expr = NULL; + exec_nodes->en_relid = InvalidOid; + exec_nodes->primarynodelist = NIL; + + foreach (lc, vstmt->sync_option->nodes) + { + char node_type = PGXC_NODE_COORDINATOR; + nodeIdx = + PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); + /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ + /* if(node_type != PGXC_NODE_COORDINATOR){ */ + /* ereport(ERROR, */ + /* (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), */ + /* errmsg("node %s is not coordinator!", + * strVal(lfirst(lc))))); */ + /* } */ + /* already check/rewrite in pre, just add it */ + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); + } + } + PopActiveSnapshot(); + CommitTransactionCommand(); + StartTransactionCommand(); + } + /* if (vstmt->options & VACOPT_ANALYZE && vstmt->sync_option != NULL && */ + /* vstmt->sync_option->is_sync_from != true) */ + /* { */ + /* StringInfo queryStr = makeStringInfo(); */ + /* appendStringInfo(queryStr, "ANALYZE (COORDINATOR"); */ + /* if (vstmt->options & VACOPT_VERBOSE) */ + /* { */ + /* appendStringInfoString(queryStr, " ,VERBOSE"); */ + /* } */ + /* appendStringInfoChar(queryStr, ')'); */ + /* if (vstmt->relation) */ + /* appendStringInfo(queryStr, " %s", RangeVarGetName(vstmt->relation)); + */ + /* if (vstmt->va_cols) */ + /* { */ + /* ListCell *lc; */ + /* bool comma = false; */ + /* appendStringInfoString(queryStr, " ("); */ + /* foreach (lc, vstmt->va_cols) */ + /* { */ + /* if (comma) */ + /* comma = true; */ + /* else */ + /* appendStringInfoChar(queryStr, ','); */ + /* appendStringInfoString(queryStr, strVal(lfirst(lc))); */ + /* } */ + /* appendStringInfoChar(queryStr, ')'); */ + /* } */ + + /* appendStringInfo(queryStr, " SYNC FROM %s", PGXCNodeName); */ + /* PopActiveSnapshot(); */ + /* CommitTransactionCommand(); */ + /* StartTransactionCommand(); */ + /* if (vstmt->sync_option->nodes) */ + /* { */ + /* ExecNodes *execnodes; */ + /* ListCell *lc; */ + /* int nodeIdx; */ + /* execnodes = (ExecNodes *)makeNode(ExecNodes); */ + /* execnodes->accesstype = RELATION_ACCESS_INSERT; */ + /* execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /\* not used *\/ */ + /* execnodes->en_expr = NULL; */ + /* execnodes->en_relid = InvalidOid; */ + /* execnodes->primarynodelist = NIL; */ + + /* foreach(lc, vstmt->sync_option->nodes){ */ + /* char node_type = PGXC_NODE_COORDINATOR; */ + /* nodeIdx = */ + /* PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); */ + /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ + /* execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + */ + /* } */ + /* ExecUtilityStmtOnNodes(parsetree, */ + /* queryStr->data, */ + /* execnodes, */ + /* sentToRemote, */ + /* false, */ + /* EXEC_ON_COORDS, */ + /* false, */ + /* false); */ + /* list_free(execnodes->nodeList); */ + /* } */ + /* else */ + /* ExecUtilityStmtOnNodes(parsetree, */ + /* queryStr->data, */ + /* NULL, */ + /* sentToRemote, */ + /* auto_commit, */ + /* EXEC_ON_COORDS, */ + /* 
false, */ + /* false); */ + /* pfree(queryStr->data); */ + /* pfree(queryStr); */ + /* } */ + break; + } #ifdef _SHARDING_ case T_VacuumShardStmt: #endif @@ -1797,7 +1952,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, if (IS_PGXC_LOCAL_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, + ExecUtilityStmtOnNodes(parsetree, queryString, exec_nodes, sentToRemote, auto_commit, exec_type, is_temp, add_context); if (IsA(parsetree, IndexStmt) && diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index cd79ba61..9da04880 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -288,9 +288,14 @@ typedef struct /* in commands/vacuum.c */ extern void ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel); -extern void vacuum(int options, RangeVar *relation, Oid relid, - VacuumParams *params, List *va_cols, - BufferAccessStrategy bstrategy, bool isTopLevel); +extern void vacuum(int options, + RangeVar *relation, + Oid relid, + VacuumParams *params, + List *va_cols, + BufferAccessStrategy bstrategy, + bool isTopLevel, + AnalyzeSyncOpt *syncOpt); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); @@ -338,9 +343,14 @@ extern void ExecVacuumShard(VacuumShardStmt *stmt); #endif /* in commands/analyze.c */ -extern void analyze_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params, List *va_cols, bool in_outer_xact, - BufferAccessStrategy bstrategy); +extern void analyze_rel(Oid relid, + RangeVar *relation, + int options, + VacuumParams *params, + List *va_cols, + bool in_outer_xact, + BufferAccessStrategy bstrategy, + AnalyzeSyncOpt *syncOpt); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 2f585807..227af23f 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -586,6 +586,7 @@ typedef enum NodeTag #ifdef _MLS_ ,T_SyncBufIdInfo /* in bufmgr.c*/ #endif + ,T_AnalyzeSyncOpt } NodeTag; /* diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 01ab8277..4dc323ab 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3301,12 +3301,21 @@ typedef enum VacuumOption */ } VacuumOption; +typedef struct AnalyzeSyncOpt +{ + NodeTag type; + bool is_sync_from; /* false: sync to other CN node; true: sync from node identified by + node_name */ + List *nodes; /* node list for sync to/from */ +} AnalyzeSyncOpt; + typedef struct VacuumStmt { NodeTag type; int options; /* OR of VacuumOption flags */ RangeVar *relation; /* single table to process, or NULL */ List *va_cols; /* list of column names, or NIL for all */ + AnalyzeSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ } VacuumStmt; #ifdef _SHARDING_ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index dc44c414..d77a2e68 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -477,6 +477,7 @@ PG_KEYWORD("substring", SUBSTRING, COL_NAME_KEYWORD) PG_KEYWORD("successful", SUCCESSFUL, RESERVED_KEYWORD) #endif PG_KEYWORD("symmetric", SYMMETRIC, RESERVED_KEYWORD) +PG_KEYWORD("sync", SYNC, UNRESERVED_KEYWORD) #ifdef _PG_ORCL_ PG_KEYWORD("sysdate", SYSDATE, RESERVED_KEYWORD) #endif From 69079e0bd7e1b902930776c460a824b33abfef57 Mon Sep 17 00:00:00 2001 From: 
aslanxli Date: Mon, 7 Mar 2022 16:29:55 +0800 Subject: [PATCH 529/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended cherry pick: 9510c58d e47d6f98 808f8a6b 8a5348ac 378af856 60882fef --- src/backend/commands/analyze.c | 586 ++++++++++++++++++++------------- src/backend/parser/gram.y | 10 +- src/backend/tcop/utility.c | 113 +------ 3 files changed, 371 insertions(+), 338 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index f0f92a5b..d558c5a3 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5349,64 +5349,174 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif +static RemoteQuery * +init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) +{ + RemoteQuery *step; + ListCell *lc; + int nodeIdx; + ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); + char node_type = PGXC_NODE_COORDINATOR; + execnodes->accesstype = RELATION_ACCESS_READ; + execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + execnodes->en_expr = NULL; + execnodes->en_relid = InvalidOid; + execnodes->primarynodelist = NIL; + + lc = list_head(syncOpt->nodes); + *cnname = strVal(lfirst(lc)); + nodeIdx = PGXCNodeGetNodeIdFromName(*cnname, &node_type); + Assert(node_type == PGXC_NODE_COORDINATOR); + execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = execnodes; + step->exec_type = EXEC_ON_COORDS; + return step; +} /* - * coord_collect_simple_stats - * Collect simple stats for a relation (pg_statistic contents). - * - * Collects statistics from the datanodes, and then keeps the one of the - * received statistics for each attribute (the first one we receive, but - * it's mostly random). - * - * XXX We do not try to build statistics covering data fro all the nodes, - * either by collecting fresh sample of rows or merging the statistics - * somehow. The current approach is very simple and cheap, but may have - * negative impact on estimate accuracy as the stats only covers data - * from a single node, and we may end up with stats from different node - * for each attribute. + * coord_sync_rel_stats + * sync relation stats from the coordinator node specified by syncOpt. */ static void -coord_collect_stats(Relation onerel, bool inh, int attr_cnt, - VacAttrStats **vacattrstats, AnalyzeSyncOpt *syncOpt) +coord_sync_rel_stats(Relation onerel, AnalyzeSyncOpt *syncOpt) { char *nspname; char *relname; - /* Fields to run query to read statistics from data nodes */ + char *cnname; + /* Fields to run query to read statistics from coordinator nodes */ StringInfoData query; EState *estate; MemoryContext oldcontext; RemoteQuery *step; RemoteQueryState *node; TupleTableSlot *result; - int i; - /* Number of data nodes from which attribute statistics are received. 
*/ - int *numnodes; int reltuples; int relpages; int relallvisible; bool relhasindex; - ListCell *lc; - int nodeIdx; - ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); /* Get the relation identifier */ relname = RelationGetRelationName(onerel); nspname = get_namespace_name(RelationGetNamespace(onerel)); /* Make up query string */ initStringInfo(&query); - /* Generic statistic fields */ - appendStringInfoString(&query, - "SELECT s.staattnum, " + appendStringInfo(&query, + "SELECT " "c.reltuples, " "c.relpages," "c.relallvisible," - "c.relhasindex," + "c.relhasindex" + " FROM pg_class c JOIN pg_namespace nc on c.relnamespace = " + "nc.oid WHERE nc.nspname = '%s' and c.relname = '%s'", + nspname, + relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + + /* Add targetlist entries */ + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "reltuples")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relpages")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *)node); + if (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + int colnum = 1; + + /* Process statistics */ + value = slot_getattr(result, colnum++, &isnull); /* reltuple */ + reltuples = DatumGetFloat4(value); + + value = slot_getattr(result, colnum++, &isnull); /* relpages */ + relpages = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ + relallvisible = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ + relhasindex = DatumGetBool(value); + + vac_update_relstats(onerel, + relpages, + reltuples, + relallvisible, + relhasindex, + InvalidTransactionId, + InvalidMultiXactId, + false); + } + else + { + ereport(WARNING, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("Relation \"%s\" does not exist in coordinator %s", + relname, + cnname))); + } + ExecEndRemoteQuery(node); +} + +/* + * coord_sync_col_stats + * sync column stats from the coordinator node specified by syncOpt. 
+ */ +static void +coord_sync_col_stats(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + AnalyzeSyncOpt *syncOpt) +{ + char *nspname; + char *relname; + char *cnname; + /* Fields to run query to read statistics from coordinator nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + /* Make up query string */ + initStringInfo(&query); + /* Generic statistic fields */ + appendStringInfoString(&query, + "SELECT s.staattnum, " "s.stanullfrac, " "s.stawidth, " "s.stadistinct"); /* Detailed statistic slots */ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) - appendStringInfo(&query, ", s.stakind%d" + appendStringInfo(&query, + ", s.stakind%d" ", o%d.oprname" ", no%d.nspname" ", t%dl.typname" @@ -5418,13 +5528,15 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, i, i, i, i, i, i, i, i, i); /* Common part of FROM clause */ - appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c " + appendStringInfoString(&query, + " FROM pg_statistic s JOIN pg_class c " " ON s.starelid = c.oid " "JOIN pg_namespace nc " " ON c.relnamespace = nc.oid "); /* Info about involved operations */ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) - appendStringInfo(&query, "LEFT JOIN (pg_operator o%d " + appendStringInfo(&query, + "LEFT JOIN (pg_operator o%d " " JOIN pg_namespace no%d " " ON o%d.oprnamespace = no%d.oid " " JOIN pg_type t%dl " @@ -5436,133 +5548,77 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, " JOIN pg_namespace nt%dr " " ON t%dr.typnamespace = nt%dr.oid) " " ON s.staop%d = o%d.oid ", - i, i, i, i, i, i, i, i, i, - i, i, i, i, i, i, i, i, i); - appendStringInfo(&query, "WHERE nc.nspname = '%s' " + i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i); + appendStringInfo(&query, + "WHERE nc.nspname = '%s' " "AND c.relname = '%s'", - nspname, relname); + nspname, + relname); /* Build up RemoteQuery */ - execnodes->accesstype = RELATION_ACCESS_READ; - execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ - execnodes->en_expr = NULL; - execnodes->en_relid = InvalidOid; - execnodes->primarynodelist = NIL; - - foreach (lc, syncOpt->nodes) - { - char node_type = PGXC_NODE_COORDINATOR; - nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); - execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); - } - step = makeNode(RemoteQuery); - step->combine_type = COMBINE_TYPE_NONE; - step->exec_nodes = execnodes; + step = init_sync_remotequery(syncOpt, &cnname); step->sql_statement = query.data; - step->force_autocommit = true; - step->exec_type = EXEC_ON_COORDS; /* Add targetlist entries */ - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "staattnum")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, - "pg_class", - "reltuples")); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relpages")); + make_relation_tle(StatisticRelationId, "pg_statistic", "staattnum")); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + make_relation_tle(StatisticRelationId, "pg_statistic", "stanullfrac")); 
step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stanullfrac")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stawidth")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stadistinct")); + make_relation_tle(StatisticRelationId, "pg_statistic", "stawidth")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", "stadistinct")); for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) { /* 16 characters would be enough */ char colname[16]; sprintf(colname, "stakind%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); - - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(OperatorRelationId, - "pg_operator", - "oprname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(TypeRelationId, - "pg_type", - "typname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(TypeRelationId, - "pg_type", - "typname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); + + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(OperatorRelationId, "pg_operator", "oprname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, "pg_type", "typname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, "pg_type", "typname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); sprintf(colname, "stanumbers%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); sprintf(colname, "stavalues%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); } /* Execute query on the data nodes */ estate = CreateExecutorState(); oldcontext = 
MemoryContextSwitchTo(estate->es_query_cxt); - - /* - * Take a fresh snapshot so that we see the effects of the ANALYZE command - * on the datanode. That command is run in auto-commit mode hence just - * bumping up the command ID is not good enough - */ - /* PushActiveSnapshot(GetLocalTransactionSnapshot()); */ - estate->es_snapshot = GetActiveSnapshot(); - node = ExecInitRemoteQuery(step, estate, 0); MemoryContextSwitchTo(oldcontext); - /* get ready to combine results */ - numnodes = (int *) palloc(attr_cnt * sizeof(int)); - for (i = 0; i < attr_cnt; i++) - numnodes[i] = 0; - result = ExecRemoteQuery((PlanState *) node); - /* PopActiveSnapshot(); */ while (result != NULL && !TupIsNull(result)) { Datum value; @@ -5574,7 +5630,6 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, float4 distinct; VacAttrStats *stats = NULL; - /* Process statistics from the data node */ value = slot_getattr(result, colnum++, &isnull); /* staattnum */ attnum = DatumGetInt16(value); @@ -5583,22 +5638,9 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, { stats = vacattrstats[i]; stats->stats_valid = true; - numnodes[i]++; break; } - value = slot_getattr(result, colnum++, &isnull); /* reltuples */ - reltuples = DatumGetFloat4(value); - - value = slot_getattr(result, colnum++, &isnull); /* relpages */ - relpages = DatumGetInt32(value); - - value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ - relallvisible = DatumGetInt32(value); - - value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ - relhasindex = DatumGetBool(value); - if (stats) { value = slot_getattr(result, colnum++, &isnull); /* stanullfrac */ @@ -5614,13 +5656,12 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, stats->stadistinct = distinct; /* Detailed statistics */ - for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) { int16 kind; float4 *numbers; Datum *values; int nnumbers, nvalues; - int k; value = slot_getattr(result, colnum++, &isnull); /* kind */ kind = DatumGetInt16(value); @@ -5655,10 +5696,8 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, char *oprname; char *oprnspname; Oid ltypid, rtypid; - char *ltypname, - *rtypname; - char *ltypnspname, - *rtypnspname; + char *ltypname, *rtypname; + char *ltypnspname, *rtypnspname; oprname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* oprnspname */ oprnspname = DatumGetCString(value); @@ -5667,68 +5706,37 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, ltypname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* typnspname */ ltypnspname = DatumGetCString(value); - ltypid = get_typname_typid(ltypname, - get_namespaceid(ltypnspname)); + ltypid = + get_typname_typid(ltypname, get_namespaceid(ltypnspname)); /* Get right operand data type */ value = slot_getattr(result, colnum++, &isnull); /* typname */ rtypname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* typnspname */ rtypnspname = DatumGetCString(value); - rtypid = get_typname_typid(rtypname, - get_namespaceid(rtypnspname)); + rtypid = + get_typname_typid(rtypname, get_namespaceid(rtypnspname)); /* lookup operator */ - oprid = get_operid(oprname, ltypid, rtypid, + oprid = get_operid(oprname, + ltypid, + rtypid, get_namespaceid(oprnspname)); } - /* - * Look up a statistics slot. 
If there is an entry of the - * same kind already, leave it, assuming the statistics - * is approximately the same on all nodes, so values from - * one node are representing entire relation well. - * If empty slot is found store values here. If no more - * slots skip remaining values. - */ - for (k = 0; k < STATISTIC_NUM_SLOTS; k++) - { - if (stats->stakind[k] == 0 || - (stats->stakind[k] == kind && stats->staop[k] == oprid)) - break; - } - - if (k >= STATISTIC_NUM_SLOTS) - { - /* No empty slots */ - break; - } - - /* - * If it is an existing slot which has numbers or values - * continue to the next set. If slot exists but without - * numbers and values, try to acquire them now - */ - if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 || - stats->numvalues[k] > 0)) - { - colnum += 2; /* skip numbers and values */ - continue; - } /* * Initialize slot */ - stats->stakind[k] = kind; - stats->staop[k] = oprid; - stats->numnumbers[k] = 0; - stats->stanumbers[k] = NULL; - stats->numvalues[k] = 0; - stats->stavalues[k] = NULL; - stats->statypid[k] = InvalidOid; - stats->statyplen[k] = -1; - stats->statypalign[k] = 'i'; - stats->statypbyval[k] = true; + stats->stakind[i] = kind; + stats->staop[i] = oprid; + stats->numnumbers[i] = 0; + stats->stanumbers[i] = NULL; + stats->numvalues[i] = 0; + stats->stavalues[i] = NULL; + stats->statypid[i] = InvalidOid; + stats->statyplen[i] = -1; + stats->statypalign[i] = 'i'; + stats->statypbyval[i] = true; } - /* get numbers */ value = slot_getattr(result, colnum++, &isnull); /* numbers */ if (!isnull) @@ -5741,13 +5749,11 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, * to look like a C array of float4 values. */ nnumbers = ARR_DIMS(arry)[0]; - if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || - ARR_HASNULL(arry) || + if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || ARR_HASNULL(arry) || ARR_ELEMTYPE(arry) != FLOAT4OID) elog(ERROR, "stanumbers is not a 1-D float4 array"); numbers = (float4 *) palloc(nnumbers * sizeof(float4)); - memcpy(numbers, ARR_DATA_PTR(arry), - nnumbers * sizeof(float4)); + memcpy(numbers, ARR_DATA_PTR(arry), nnumbers * sizeof(float4)); /* * Free arry if it's a detoasted copy. 
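
For orientation while reading the remaining hunks of coord_sync_col_stats(): the tuples decoded here come from a catalog query built against the source coordinator. A simplified, hand-written equivalent is sketched below, assuming a placeholder schema public and relation my_table; only the first statistics slot is spelled out, and the operand-type joins the real query adds for each staopN are omitted.

-- Simplified sketch of the pg_statistic query assembled by
-- coord_sync_col_stats(); the real statement repeats the slot columns for
-- every STATISTIC_NUM_SLOTS.
SELECT s.staattnum,
       s.stanullfrac,
       s.stawidth,
       s.stadistinct,
       s.stakind1,
       o1.oprname,
       no1.nspname AS oprnspname,
       s.stanumbers1,
       s.stavalues1
  FROM pg_statistic s
  JOIN pg_class c      ON s.starelid = c.oid
  JOIN pg_namespace nc ON c.relnamespace = nc.oid
  LEFT JOIN (pg_operator o1
             JOIN pg_namespace no1 ON o1.oprnamespace = no1.oid)
         ON s.staop1 = o1.oid
 WHERE nc.nspname = 'public'      -- placeholder schema
   AND c.relname = 'my_table';    -- placeholder relation
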
@@ -5755,8 +5761,8 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, if ((Pointer) arry != DatumGetPointer(value)) pfree(arry); - stats->numnumbers[k] = nnumbers; - stats->stanumbers[k] = numbers; + stats->numnumbers[i] = nnumbers; + stats->stanumbers[i] = numbers; } /* get values */ value = slot_getattr(result, colnum++, &isnull); /* values */ @@ -5770,12 +5776,18 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, arry = DatumGetArrayTypeP(value); /* We could cache this data, but not clear it's worth it */ get_typlenbyvalalign(ARR_ELEMTYPE(arry), - &elmlen, &elmbyval, &elmalign); + &elmlen, + &elmbyval, + &elmalign); /* Deconstruct array into Datum elements; NULLs not expected */ deconstruct_array(arry, ARR_ELEMTYPE(arry), - elmlen, elmbyval, elmalign, - &values, NULL, &nvalues); + elmlen, + elmbyval, + elmalign, + &values, + NULL, + &nvalues); /* * If the element type is pass-by-reference, we now have a bunch of @@ -5794,13 +5806,13 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, if ((Pointer) arry != DatumGetPointer(value)) pfree(arry); - stats->numvalues[k] = nvalues; - stats->stavalues[k] = values; + stats->numvalues[i] = nvalues; + stats->stavalues[i] = values; /* store details about values data type */ - stats->statypid[k] = ARR_ELEMTYPE(arry); - stats->statyplen[k] = elmlen; - stats->statypalign[k] = elmalign; - stats->statypbyval[k] = elmbyval; + stats->statypid[i] = ARR_ELEMTYPE(arry); + stats->statyplen[i] = elmlen; + stats->statypalign[i] = elmalign; + stats->statypbyval[i] = elmbyval; } } } @@ -5810,49 +5822,151 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, } ExecEndRemoteQuery(node); - /* for (i = 0; i < attr_cnt; i++) */ - /* { */ - /* VacAttrStats *stats = vacattrstats[i]; */ - - /* if (numnodes[i] > 0) */ - /* { */ - /* stats->stanullfrac /= numnodes[i]; */ - /* stats->stawidth /= numnodes[i]; */ - /* stats->stadistinct /= numnodes[i]; */ - /* } */ - /* } */ update_attstats(RelationGetRelid(onerel), inh, attr_cnt, vacattrstats, RelationGetRelPersistence(onerel)); - vac_update_relstats(onerel, - relpages, - reltuples, - relallvisible, - relhasindex, - InvalidTransactionId, - InvalidMultiXactId, - false); } +/* + * coord_collect_extended_stats + * sync extended stats for a relation (pg_statistic_ext contents). + * + * Sync statistics from the coordinator node specified by syncOpt. + * + */ static void -analyze_rel_sync(Relation onerel, bool inh, int attr_cnt, - VacAttrStats **vacattrstats, int nindexes, - Relation *indexes, AnlIndexData *indexdata, AnalyzeSyncOpt *syncOpt) +coord_sync_extended_stats(Relation onerel, int attr_cnt, AnalyzeSyncOpt *syncOpt) { + char *nspname; + char *relname; + char *cnname; + /* Fields to run query to read statistics from data nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + /* Number of data nodes from which attribute statistics are received. 
*/ + int *numnodes; + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + initStringInfo(&query); + + appendStringInfo(&query, + "SELECT ns.nspname, " + "stxname, " + "stxndistinct::bytea AS stxndistinct, " + "stxdependencies::bytea AS stxdependencies " + " FROM pg_statistic_ext s JOIN pg_class c " + " ON s.stxrelid = c.oid " + "JOIN pg_namespace nc " + " ON c.relnamespace = nc.oid " + "JOIN pg_namespace ns " + " ON s.stxnamespace = ns.oid " + "WHERE nc.nspname = '%s' AND c.relname = '%s'", + nspname, + relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + + /* Add targetlist entries */ + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxname")); + + step->scan.plan.targetlist = lappend( + step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxndistinct")); + + step->scan.plan.targetlist = lappend( + step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxdependencies")); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /* get ready to combine results */ + numnodes = (int *)palloc(attr_cnt * sizeof(int)); + for (i = 0; i < attr_cnt; i++) + numnodes[i] = 0; + + result = ExecRemoteQuery((PlanState *)node); + + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + Name nspname; + Name stxname; + bytea *stxndistinct = NULL; + bytea *stxdependencies = NULL; + + /* Process statistics from the data node */ + value = slot_getattr(result, 1, &isnull); /* nspname */ + nspname = DatumGetName(value); + + value = slot_getattr(result, 2, &isnull); /* stxname */ + stxname = DatumGetName(value); + + value = slot_getattr(result, 3, &isnull); /* stxndistinct */ + if (!isnull) + stxndistinct = DatumGetByteaP(value); + + value = slot_getattr(result, 4, &isnull); /* stxdependencies */ + if (!isnull) + stxdependencies = DatumGetByteaP(value); + + update_ext_stats(nspname, stxname, stxndistinct, stxdependencies); + + /* fetch stats from next node */ + result = ExecRemoteQuery((PlanState *)node); + } + ExecEndRemoteQuery(node); +} + +static void +analyze_rel_sync(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + int nindexes, + Relation *indexes, + AnlIndexData *indexdata, + AnalyzeSyncOpt *syncOpt) +{ int i; - /* collect and fit simple statistics (pg_statistic) for the relation */ - coord_collect_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); + /* sync statistics for the relation */ + coord_sync_rel_stats(onerel, syncOpt); + /* sync column statistics (pg_statistic) for the relation */ + coord_sync_col_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); - /* collect and fit simple statistics (pg_statistic) for all indexes */ + /* sync simple statistics (pg_statistic) for all indexes */ for (i = 0; i < nindexes; i++) - coord_collect_stats(indexes[i], + { + coord_sync_rel_stats(indexes[i], syncOpt); + coord_sync_col_stats(indexes[i], false, indexdata[i].attr_cnt, - indexdata[i].vacattrstats, syncOpt); + 
indexdata[i].vacattrstats, + syncOpt); + } /* extended statistics (pg_statistic) for the relation */ - /* coord_collect_extended_stats(onerel, attr_cnt); */ + coord_sync_extended_stats(onerel, attr_cnt, syncOpt); } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7efd70e5..45aef61f 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11111,25 +11111,25 @@ analyze_keyword: ; analyze_sync_option : -/* SYNC + SYNC ALL { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = false; n->nodes = NIL; - $$ = (Node *)n; + $$ = n; } - |*/ SYNC TO pgxcnode_list + | SYNC TO pgxcnode_list { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = false; n->nodes = $3; $$ = n; } - | SYNC FROM pgxcnode_list + | SYNC FROM pgxcnode_name { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = true; - n->nodes = $3; + n->nodes = list_make1(makeString($3)); $$ = n; } | /*EMPTY*/ { $$ = NULL; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 6521598b..8240f0d6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -713,16 +713,6 @@ ProcessUtilityPre(PlannedStmt *pstmt, /* we choose to allow this during "read only" transactions */ PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"); - /* When statement is emit by the coordinating node, the statement is not - * rewritten, we adapt it here */ - if (IsConnFromCoord() && IS_PGXC_COORDINATOR && - (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) - { - stmt->sync_option->is_sync_from = true; - list_free_deep(stmt->sync_option->nodes); - stmt->sync_option->nodes = NIL; - stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); - } if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option && stmt->sync_option->nodes != NIL) { @@ -740,10 +730,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not find coordinator %s!", + errmsg("Can not find coordinator %s!", strVal(lfirst(cell))))); } - if (node_type != PGXC_NODE_COORDINATOR) + else if (node_type != PGXC_NODE_COORDINATOR) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), @@ -752,11 +742,22 @@ ProcessUtilityPre(PlannedStmt *pstmt, } } } + + /* When statement is emit by the coordinating node, the statement is not + * rewritten, adapt it here */ + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && + (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + { + stmt->sync_option->is_sync_from = true; + list_free_deep(stmt->sync_option->nodes); + stmt->sync_option->nodes = NIL; + stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); + } /* - * We have to run the command on nodes before Coordinator because + * Not SYNC command, We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes */ - if (!(stmt->options & VACOPT_COORDINATOR)) + else if (!(stmt->options & VACOPT_COORDINATOR)) exec_type = EXEC_ON_DATANODES; auto_commit = true; } @@ -1357,6 +1358,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_VacuumStmt: { VacuumStmt *vstmt = (VacuumStmt *)parsetree; + /* Send synchronization statements to other coordinator nodes */ if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) { @@ -1377,14 +1379,6 @@ ProcessUtilityPost(PlannedStmt *pstmt, char node_type = PGXC_NODE_COORDINATOR; nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); - /* 
Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ - /* if(node_type != PGXC_NODE_COORDINATOR){ */ - /* ereport(ERROR, */ - /* (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), */ - /* errmsg("node %s is not coordinator!", - * strVal(lfirst(lc))))); */ - /* } */ - /* already check/rewrite in pre, just add it */ exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); } } @@ -1392,81 +1386,6 @@ ProcessUtilityPost(PlannedStmt *pstmt, CommitTransactionCommand(); StartTransactionCommand(); } - /* if (vstmt->options & VACOPT_ANALYZE && vstmt->sync_option != NULL && */ - /* vstmt->sync_option->is_sync_from != true) */ - /* { */ - /* StringInfo queryStr = makeStringInfo(); */ - /* appendStringInfo(queryStr, "ANALYZE (COORDINATOR"); */ - /* if (vstmt->options & VACOPT_VERBOSE) */ - /* { */ - /* appendStringInfoString(queryStr, " ,VERBOSE"); */ - /* } */ - /* appendStringInfoChar(queryStr, ')'); */ - /* if (vstmt->relation) */ - /* appendStringInfo(queryStr, " %s", RangeVarGetName(vstmt->relation)); - */ - /* if (vstmt->va_cols) */ - /* { */ - /* ListCell *lc; */ - /* bool comma = false; */ - /* appendStringInfoString(queryStr, " ("); */ - /* foreach (lc, vstmt->va_cols) */ - /* { */ - /* if (comma) */ - /* comma = true; */ - /* else */ - /* appendStringInfoChar(queryStr, ','); */ - /* appendStringInfoString(queryStr, strVal(lfirst(lc))); */ - /* } */ - /* appendStringInfoChar(queryStr, ')'); */ - /* } */ - - /* appendStringInfo(queryStr, " SYNC FROM %s", PGXCNodeName); */ - /* PopActiveSnapshot(); */ - /* CommitTransactionCommand(); */ - /* StartTransactionCommand(); */ - /* if (vstmt->sync_option->nodes) */ - /* { */ - /* ExecNodes *execnodes; */ - /* ListCell *lc; */ - /* int nodeIdx; */ - /* execnodes = (ExecNodes *)makeNode(ExecNodes); */ - /* execnodes->accesstype = RELATION_ACCESS_INSERT; */ - /* execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /\* not used *\/ */ - /* execnodes->en_expr = NULL; */ - /* execnodes->en_relid = InvalidOid; */ - /* execnodes->primarynodelist = NIL; */ - - /* foreach(lc, vstmt->sync_option->nodes){ */ - /* char node_type = PGXC_NODE_COORDINATOR; */ - /* nodeIdx = */ - /* PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); */ - /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ - /* execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); - */ - /* } */ - /* ExecUtilityStmtOnNodes(parsetree, */ - /* queryStr->data, */ - /* execnodes, */ - /* sentToRemote, */ - /* false, */ - /* EXEC_ON_COORDS, */ - /* false, */ - /* false); */ - /* list_free(execnodes->nodeList); */ - /* } */ - /* else */ - /* ExecUtilityStmtOnNodes(parsetree, */ - /* queryStr->data, */ - /* NULL, */ - /* sentToRemote, */ - /* auto_commit, */ - /* EXEC_ON_COORDS, */ - /* false, */ - /* false); */ - /* pfree(queryStr->data); */ - /* pfree(queryStr); */ - /* } */ break; } #ifdef _SHARDING_ From 1ddbbef3301e20505bb21ca30564be8a5ef2fcac Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 8 Mar 2022 08:53:48 +0800 Subject: [PATCH 530/578] format error info --- src/backend/tcop/utility.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 8240f0d6..b8d32cbb 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -721,24 +721,16 @@ ProcessUtilityPre(PlannedStmt *pstmt, foreach (cell, stmt->sync_option->nodes) { if (0 == strcmp(strVal(lfirst(cell)), PGXCNodeName)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Can not sync to/from local!"))); + elog(ERROR, 
"Can not sync to/from local!"); PGXCNodeGetNodeIdFromName(strVal(lfirst(cell)), &node_type); if (node_type == PGXC_NODE_NONE) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Can not find coordinator %s!", - strVal(lfirst(cell))))); + elog(ERROR, "Can not find coordinator %s!", strVal(lfirst(cell))); } else if (node_type != PGXC_NODE_COORDINATOR) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("node %s is not coordinator!", - strVal(lfirst(cell))))); + elog(ERROR, "node %s is not coordinator!", strVal(lfirst(cell))); } } } From 2895697a8aff288bda95b822ff458a63152fa4de Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 8 Mar 2022 09:52:58 +0800 Subject: [PATCH 531/578] "analyze" and "analyze sync" behave the same: sync statistics by default --- src/backend/parser/gram.y | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 45aef61f..15f0cc65 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11132,7 +11132,13 @@ analyze_sync_option : n->nodes = list_make1(makeString($3)); $$ = n; } - | /*EMPTY*/ { $$ = NULL; } + | /*EMPTY*/ + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = NIL; + $$ = n; + } ; opt_verbose: From 17339b16f9b83089b0f232a3e0be1840a65628cc Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 14 Mar 2022 09:01:22 +0800 Subject: [PATCH 532/578] format comment --- src/backend/tcop/utility.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index b8d32cbb..16e95deb 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -735,8 +735,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, } } - /* When statement is emit by the coordinating node, the statement is not - * rewritten, adapt it here */ + /* + * When statement is emit by the coordinating node, the statement is not + * rewritten, adapt it here + */ if (IsConnFromCoord() && IS_PGXC_COORDINATOR && (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) { From 761bbda286a7f69641ae5b3cf55d6e85b2e3794d Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 22 Mar 2022 17:00:20 +0800 Subject: [PATCH 533/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended support sync vacuum statistics for relation cherry-pick: 9510c58d e47d6f98 808f8a6b 8a5348ac 378af856 60882fef --- src/backend/commands/analyze.c | 24 +++-- src/backend/commands/vacuum.c | 156 +++++++++++++++++++++++++++++---- src/backend/nodes/copyfuncs.c | 10 +-- src/backend/nodes/equalfuncs.c | 6 +- src/backend/parser/gram.y | 20 +++-- src/backend/tcop/utility.c | 27 ++++-- src/backend/utils/adt/dbsize.c | 3 +- src/include/commands/vacuum.h | 9 +- src/include/nodes/parsenodes.h | 6 +- 9 files changed, 202 insertions(+), 59 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index d558c5a3..901b0a60 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -121,7 +121,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, int options, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, - bool inh, bool in_outer_xact, int elevel, AnalyzeSyncOpt *syncOpt); + bool inh, bool in_outer_xact, int elevel, StatSyncOpt *syncOpt); static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple 
*rows, int numrows, @@ -146,7 +146,7 @@ static void analyze_rel_sync(Relation onerel, int nindexes, Relation *indexes, AnlIndexData *indexdata, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); #ifdef XCP static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, @@ -177,7 +177,7 @@ analyze_rel(Oid relid, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { Relation onerel; int elevel; @@ -438,7 +438,7 @@ do_analyze_rel(Relation onerel, bool inh, bool in_outer_xact, int elevel, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { int attr_cnt, tcnt, @@ -633,10 +633,8 @@ do_analyze_rel(Relation onerel, * Sync statistics if this session is connected to other remote Coordinator. * When receiving sync commands directly from the client, we also sync statistics. */ - if (iscoordinator && IsConnFromCoord() && - (syncOpt != NULL && syncOpt->is_sync_from == true)) + if (iscoordinator && (syncOpt != NULL && syncOpt->is_sync_from == true)) { - elog(INFO, "SYNC statistic"); analyze_rel_sync(onerel, inh, attr_cnt, @@ -5349,8 +5347,8 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif -static RemoteQuery * -init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) +RemoteQuery * +init_sync_remotequery(StatSyncOpt *syncOpt, char **cnname) { RemoteQuery *step; ListCell *lc; @@ -5381,7 +5379,7 @@ init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) * sync relation stats from the coordinator node specified by syncOpt. */ static void -coord_sync_rel_stats(Relation onerel, AnalyzeSyncOpt *syncOpt) +coord_sync_rel_stats(Relation onerel, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5487,7 +5485,7 @@ coord_sync_col_stats(Relation onerel, bool inh, int attr_cnt, VacAttrStats **vacattrstats, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5837,7 +5835,7 @@ coord_sync_col_stats(Relation onerel, * */ static void -coord_sync_extended_stats(Relation onerel, int attr_cnt, AnalyzeSyncOpt *syncOpt) +coord_sync_extended_stats(Relation onerel, int attr_cnt, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5948,7 +5946,7 @@ analyze_rel_sync(Relation onerel, int nindexes, Relation *indexes, AnlIndexData *indexdata, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { int i; /* sync statistics for the relation */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f001e056..78cc13ef 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -98,7 +98,7 @@ static void vac_truncate_clog(TransactionId frozenXID, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); static bool vacuum_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params); + VacuumParams *params, StatSyncOpt *syncOpt); /* * Primary entry point for manual VACUUM and ANALYZE commands @@ -178,7 +178,7 @@ vacuum(int options, List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { const char *stmttype; volatile bool in_outer_xact, @@ -332,7 +332,7 @@ vacuum(int options, if (options & VACOPT_VACUUM) { - if (!vacuum_rel(relid, relation, options, params)) + if (!vacuum_rel(relid, relation, options, params, syncOpt)) continue; } @@ -1266,8 +1266,8 @@ vac_truncate_clog(TransactionId frozenXID, * At entry and exit, we are not inside a transaction. 
*/ static bool -vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) -{// #lizard forgives +vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params, StatSyncOpt *syncOpt) +{ LOCKMODE lmode; Relation onerel; LockRelId onerelid; @@ -1328,7 +1328,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) foreach (lc, new_childs) { child = lfirst_oid(lc); - part_vacuum_result = vacuum_rel(child, relation, options, params); + part_vacuum_result = vacuum_rel(child, relation, options, params, syncOpt); } UnlockRelationIdForSession(&onerelid, RowExclusiveLock); pfree(new_childs); @@ -1554,7 +1554,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) */ if (toast_relid != InvalidOid) { - vacuum_rel(toast_relid, relation, options, params); + vacuum_rel(toast_relid, relation, options, params, syncOpt); } /* @@ -1574,7 +1574,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) */ if (IS_PGXC_COORDINATOR && onerel->rd_locator_info) { - vacuum_rel_coordinator(onerel, true, params); + vacuum_rel_coordinator(onerel, true, params, syncOpt); } else #endif @@ -1618,7 +1618,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) * totally unimportant for toast relations. */ if (toast_relid != InvalidOid) - vacuum_rel(toast_relid, relation, options, params); + vacuum_rel(toast_relid, relation, options, params, syncOpt); /* * Now release the session-level lock on the master table. @@ -1928,13 +1928,112 @@ get_remote_relstat(char *nspname, char *relname, bool replicated, } +/* + * Get relation statistics from coordinator node specified by syncOpt + */ +static void +sync_remote_relstat(char *nspname, char *relname, bool replicated, + int32 *pages, int32 *allvisiblepages, + float4 *tuples, TransactionId *frozenXid, StatSyncOpt *syncOpt) +{ + char *cnname; + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + + /* Make up query string */ + initStringInfo(&query); + appendStringInfo(&query, "SELECT c.relpages, " + "c.reltuples, " + "c.relallvisible, " + "c.relfrozenxid " + "FROM pg_class c JOIN pg_namespace n " + "ON c.relnamespace = n.oid " + "WHERE n.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + step->force_autocommit = true; + + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relpages")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "reltuples")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relallvisible")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relfrozenxid")); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + /* get ready to combine results */ + *pages = 0; + *allvisiblepages = 0; + *tuples = 0.0; + *frozenXid = InvalidTransactionId; + + result = ExecRemoteQuery((PlanState *) node); + if (result != NULL && !TupIsNull(result)) 
+ { + Datum value; + bool isnull; + /* Process statistics from the data node */ + value = slot_getattr(result, 1, &isnull); /* relpages */ + if (!isnull) + { + *pages = DatumGetInt32(value); + } + value = slot_getattr(result, 2, &isnull); /* reltuples */ + if (!isnull) + { + *tuples = DatumGetFloat4(value); + } + value = slot_getattr(result, 3, &isnull); /* relallvisible */ + if (!isnull) + { + *allvisiblepages = DatumGetInt32(value); + } + value = slot_getattr(result, 4, &isnull); /* relfrozenxid */ + if (!isnull) + { + TransactionId xid = DatumGetTransactionId(value); + if (TransactionIdIsValid(xid)) + { + *frozenXid = xid; + } + } + } + ExecEndRemoteQuery(node); +} + + /* * Coordinator does not contain any data, so we never need to vacuum relations. * This function only updates optimizer statistics based on info from the * data nodes. */ void -vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) +vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -1945,7 +2044,8 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) TransactionId min_frozenxid; bool hasindex; bool replicated; - int rel_nodes; + int rel_nodes = 0; + bool isSync = false; #ifdef __TBASE__ TransactionId oldestXmin = InvalidTransactionId; TransactionId freezeLimit = InvalidTransactionId; @@ -1976,10 +2076,23 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) * Get stats from the remote nodes. Function returns the number of nodes * returning correct stats. */ + if (syncOpt != NULL && syncOpt->is_sync_from == true && + !RELATION_IS_COORDINATOR_LOCAL(onerel)) + { + sync_remote_relstat(nspname, + relname, + replicated, + &num_pages, + &num_allvisible_pages, + &num_tuples, + &min_frozenxid, + syncOpt); + isSync = true; + }else rel_nodes = get_remote_relstat(nspname, relname, replicated, &num_pages, &num_allvisible_pages, &num_tuples, &min_frozenxid); - if (rel_nodes > 0) + if (rel_nodes > 0 || isSync) { int nindexes; Relation *Irel; @@ -1998,22 +2111,33 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) int32 idx_pages, idx_allvisible_pages; float4 idx_tuples; TransactionId idx_frozenxid; - int idx_nodes; + int idx_nodes = 0; /* Get the index identifier */ relname = RelationGetRelationName(Irel[i]); nspname = get_namespace_name(RelationGetNamespace(Irel[i])); /* Index is replicated if parent relation is replicated */ + if(isSync) + { + sync_remote_relstat(nspname, + relname, + replicated, + &idx_pages, + &idx_allvisible_pages, + &idx_tuples, + &idx_frozenxid, + syncOpt); + }else idx_nodes = get_remote_relstat(nspname, relname, replicated, &idx_pages, &idx_allvisible_pages, &idx_tuples, &idx_frozenxid); - if (idx_nodes > 0) + if (idx_nodes > 0 || isSync) { /* * Do not update the frozenxid if information was not from * all the expected nodes. */ - if (idx_nodes < nodes) + if (idx_nodes < nodes && !isSync) { idx_frozenxid = InvalidTransactionId; } @@ -2038,7 +2162,7 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) * Do not update the frozenxid if information was not from all * the expected nodes. 
*/ - if (rel_nodes < nodes) + if (rel_nodes < nodes && !isSync) { min_frozenxid = InvalidTransactionId; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 10a1d424..876c407e 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4149,10 +4149,10 @@ _copyVacuumStmt(const VacuumStmt *from) return newnode; } -static AnalyzeSyncOpt * -_copyAnalyzeSyncOpt(const AnalyzeSyncOpt *from) +static StatSyncOpt * +_copyStatSyncOpt(const StatSyncOpt *from) { - AnalyzeSyncOpt *newnode = makeNode(AnalyzeSyncOpt); + StatSyncOpt *newnode = makeNode(StatSyncOpt); COPY_SCALAR_FIELD(is_sync_from); COPY_NODE_FIELD(nodes); @@ -5926,8 +5926,8 @@ copyObjectImpl(const void *from) case T_VacuumStmt: retval = _copyVacuumStmt(from); break; - case T_AnalyzeSyncOpt: - retval = _copyAnalyzeSyncOpt(from); + case T_StatSyncOpt: + retval = _copyStatSyncOpt(from); break; #ifdef _SHARDING_ case T_VacuumShardStmt: diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 87934c2d..f5f2bc77 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1700,7 +1700,7 @@ _equalVacuumStmt(const VacuumStmt *a, const VacuumStmt *b) } static bool -_equalAnalyzeSyncOpt(const AnalyzeSyncOpt *a, const AnalyzeSyncOpt *b) +_equalStatSyncOpt(const StatSyncOpt *a, const StatSyncOpt *b) { COMPARE_SCALAR_FIELD(is_sync_from); COMPARE_NODE_FIELD(nodes); @@ -3602,8 +3602,8 @@ equal(const void *a, const void *b) case T_VacuumStmt: retval = _equalVacuumStmt(a, b); break; - case T_AnalyzeSyncOpt: - retval = _equalAnalyzeSyncOpt(a, b); + case T_StatSyncOpt: + retval = _equalStatSyncOpt(a, b); break; #ifdef _SHARDING_ case T_VacuumShardStmt: diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 15f0cc65..dad866bf 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10969,7 +10969,7 @@ cluster_index_specification: * *****************************************************************************/ -VacuumStmt: VACUUM opt_full opt_freeze opt_verbose +VacuumStmt: VACUUM opt_full opt_freeze opt_verbose analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM; @@ -10981,9 +10981,10 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *)n; } - | VACUUM opt_full opt_freeze opt_verbose qualified_name + | VACUUM opt_full opt_freeze opt_verbose qualified_name analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM; @@ -10995,6 +10996,7 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; n->relation = $5; n->va_cols = NIL; + n->sync_option = $6; $$ = (Node *)n; } | VACUUM opt_full opt_freeze opt_verbose AnalyzeStmt @@ -11009,15 +11011,16 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; $$ = (Node *)n; } - | VACUUM '(' vacuum_option_list ')' + | VACUUM '(' vacuum_option_list ')' analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM | $3; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *) n; } - | VACUUM '(' vacuum_option_list ')' qualified_name opt_name_list + | VACUUM '(' vacuum_option_list ')' qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM | $3; @@ -11025,6 +11028,7 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->va_cols = $6; if (n->va_cols != NIL) /* implies 
analyze */ n->options |= VACOPT_ANALYZE; + n->sync_option = $7; $$ = (Node *) n; } /* _SHARDING_ BEGIN */ @@ -11113,28 +11117,28 @@ analyze_keyword: analyze_sync_option : SYNC ALL { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = NIL; $$ = n; } | SYNC TO pgxcnode_list { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = $3; $$ = n; } | SYNC FROM pgxcnode_name { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = true; n->nodes = list_make1(makeString($3)); $$ = n; } | /*EMPTY*/ { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = NIL; $$ = n; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 16e95deb..87be1ef4 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -739,8 +739,7 @@ ProcessUtilityPre(PlannedStmt *pstmt, * When statement is emit by the coordinating node, the statement is not * rewritten, adapt it here */ - if (IsConnFromCoord() && IS_PGXC_COORDINATOR && - (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option) { stmt->sync_option->is_sync_from = true; list_free_deep(stmt->sync_option->nodes); @@ -748,10 +747,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); } /* - * Not SYNC command, We have to run the command on nodes before Coordinator because + * If it is not a SYNC FROM command, We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes */ - else if (!(stmt->options & VACOPT_COORDINATOR)) + else if (!(stmt->options & VACOPT_COORDINATOR) && !(stmt->sync_option && stmt->sync_option->is_sync_from == true)) exec_type = EXEC_ON_DATANODES; auto_commit = true; } @@ -1352,9 +1351,21 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_VacuumStmt: { VacuumStmt *vstmt = (VacuumStmt *)parsetree; - /* Send synchronization statements to other coordinator nodes */ + if (vstmt->relation != NULL) + { + Relation rel = + relation_openrv_extended(vstmt->relation, NoLock, true, false); + if (rel && rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + { + relation_close(rel, NoLock); + break; + } + if (rel) + relation_close(rel, NoLock); + } if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && - (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) + !IsInTransactionChain(context == PROCESS_UTILITY_TOPLEVEL) && + vstmt->sync_option) { exec_type = EXEC_ON_COORDS; if (vstmt->sync_option->nodes) @@ -1376,10 +1387,14 @@ ProcessUtilityPost(PlannedStmt *pstmt, exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); } } + if (ActiveSnapshotSet()) + { PopActiveSnapshot(); + } CommitTransactionCommand(); StartTransactionCommand(); } + auto_commit = true; break; } #ifdef _SHARDING_ diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 17005175..eed6bb8b 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -567,9 +567,10 @@ pg_relation_size(PG_FUNCTION_ARGS) partoid = lfirst_oid(lc); child_rel = try_relation_open(partoid, AccessShareLock); + /* skip calculate size of child not exists */ if (NULL == child_rel) { - PG_RETURN_NULL(); + continue; } size += calculate_relation_size(&(child_rel->rd_node), child_rel->rd_backend, 
forkname_to_number(text_to_cstring(forkName)), NULL); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 9da04880..47859b0b 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -84,7 +84,7 @@ #include "storage/lock.h" #include "storage/relfilenode.h" #include "utils/relcache.h" - +#include "pgxc/planner.h" /*---------- * ANALYZE builds one of these structs for each attribute (column) that is @@ -295,7 +295,7 @@ extern void vacuum(int options, List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); @@ -323,7 +323,7 @@ extern void vacuum_set_xid_limits(Relation rel, extern void vac_update_datfrozenxid(void); extern void vacuum_delay_point(void); #ifdef XCP -extern void vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params); +extern void vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params, StatSyncOpt *syncOpt); TargetEntry *make_relation_tle(Oid reloid, const char *relname, const char *column); #endif @@ -350,13 +350,14 @@ extern void analyze_rel(Oid relid, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ extern double anl_random_fract(void); extern double anl_init_selection_state(int n); extern double anl_get_next_S(double t, int n, double *stateptr); +extern RemoteQuery *init_sync_remotequery(StatSyncOpt *syncOpt, char **cnname); #ifdef __TBASE__ extern Size QueryAnalyzeInfoShmemSize(void); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 4dc323ab..e8ac3d54 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3301,13 +3301,13 @@ typedef enum VacuumOption */ } VacuumOption; -typedef struct AnalyzeSyncOpt +typedef struct StatSyncOpt { NodeTag type; bool is_sync_from; /* false: sync to other CN node; true: sync from node identified by node_name */ List *nodes; /* node list for sync to/from */ -} AnalyzeSyncOpt; +} StatSyncOpt; typedef struct VacuumStmt { @@ -3315,7 +3315,7 @@ typedef struct VacuumStmt int options; /* OR of VacuumOption flags */ RangeVar *relation; /* single table to process, or NULL */ List *va_cols; /* list of column names, or NIL for all */ - AnalyzeSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ + StatSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ } VacuumStmt; #ifdef _SHARDING_ From 529c1a75c2ae89dbf8992bdc935bd168c1566d09 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Wed, 20 Apr 2022 17:36:42 +0800 Subject: [PATCH 534/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended free unused estate in coord_sync_rel_stats/coord_sync_col_stats/coord_sync_extended_stats/sync_remote_relstat --- src/backend/commands/analyze.c | 3 +++ src/backend/commands/vacuum.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 901b0a60..31b7cfbf 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5474,6 +5474,7 @@ coord_sync_rel_stats(Relation onerel, StatSyncOpt *syncOpt) cnname))); } 
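The gram.y changes above attach analyze_sync_option to the plain VACUUM productions as well, so statistics synchronization can be requested directly from SQL. A minimal usage sketch follows; the table name my_table and the coordinator name cn002 are illustrative, and the behaviour described in the comments is inferred from the ProcessUtilityPre/ProcessUtilityPost handling and the sync_remote_relstat() path in this patch series:

-- gather statistics locally, then forward the statement to every other coordinator
VACUUM (ANALYZE) my_table SYNC ALL;

-- forward only to the listed coordinator(s)
VACUUM (ANALYZE) my_table SYNC TO cn002;

-- do not recompute: copy relpages/reltuples/relallvisible/relfrozenxid
-- for my_table from coordinator cn002 (the SYNC FROM path)
VACUUM (ANALYZE) my_table SYNC FROM cn002;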
ExecEndRemoteQuery(node); + FreeExecutorState(estate); } /* @@ -5819,6 +5820,7 @@ coord_sync_col_stats(Relation onerel, result = ExecRemoteQuery((PlanState *) node); } ExecEndRemoteQuery(node); + FreeExecutorState(estate); update_attstats(RelationGetRelid(onerel), inh, @@ -5936,6 +5938,7 @@ coord_sync_extended_stats(Relation onerel, int attr_cnt, StatSyncOpt *syncOpt) result = ExecRemoteQuery((PlanState *)node); } ExecEndRemoteQuery(node); + FreeExecutorState(estate); } static void diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 78cc13ef..217b82ef 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -2024,9 +2024,9 @@ sync_remote_relstat(char *nspname, char *relname, bool replicated, } } ExecEndRemoteQuery(node); + FreeExecutorState(estate); } - /* * Coordinator does not contain any data, so we never need to vacuum relations. * This function only updates optimizer statistics based on info from the From d637e4e55ea6989b19d1efe8c7d2d3e49e69b154 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 21 Apr 2022 17:38:19 +0800 Subject: [PATCH 535/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/vacuum.c | 1 - src/backend/utils/adt/ruleutils.c | 143 ++++++++++++++---------------- src/include/utils/ruleutils.h | 2 - 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 217b82ef..5dfa41e0 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1340,7 +1340,6 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params, Sta } } #endif - /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 80623c91..5ff8c9e7 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -92,7 +92,6 @@ #ifdef __COLD_HOT__ #include "postmaster/postmaster.h" #endif - #include "storage/lmgr.h" /* ---------- * Pretty formatting constants @@ -361,7 +360,7 @@ static void decompile_column_index_array(Datum column_index_array, Oid relId, static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); static char *pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, bool inherits, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok); static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, @@ -1144,7 +1143,7 @@ pg_get_indexdef(PG_FUNCTION_ARGS) prettyFlags = PRETTYFLAG_INDENT; - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, prettyFlags, true); if (res == NULL) @@ -1165,7 +1164,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - false, prettyFlags, true); + false, prettyFlags, true); if (res == NULL) PG_RETURN_NULL(); @@ -1181,7 +1180,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) char * pg_get_indexdef_string(Oid indexrelid) { - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); } /* Internal version that just reports the column definitions */ @@ -1191,7 +1190,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) int prettyFlags; prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, prettyFlags, false); } @@ -1204,7 +1203,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) static char * pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, bool inherits, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok) {// #lizard forgives /* might want a separate isConstraint parameter later */ @@ -1320,11 +1319,11 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, if (!attrsOnly) { if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", idxrec->indisunique ? "UNIQUE " : "", quote_identifier(NameStr(idxrelrec->relname)), - idxrelrec->relkind == RELKIND_PARTITIONED_INDEX - && !inherits ? "ONLY " : "", + idxrelrec->relkind == RELKIND_PARTITIONED_INDEX + && !inherits ? "ONLY " : "", generate_relation_name(indrelid, NIL), quote_identifier(NameStr(amrec->amname))); else /* currently, must be EXCLUDE constraint */ @@ -1476,14 +1475,14 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, Oid tblspc; tblspc = get_rel_tablespace(indexrelid); - if (OidIsValid(tblspc)) - { + if (OidIsValid(tblspc)) + { if (isConstraint) appendStringInfoString(&buf, " USING INDEX"); appendStringInfo(&buf, " TABLESPACE %s", quote_identifier(get_tablespace_name(tblspc))); } - } + } /* * If it's a partial index, decompile and append the predicate @@ -1650,7 +1649,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) * * Returns the partition key specification, ie, the following: * - * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...]) + * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...]) */ Datum pg_get_partkeydef(PG_FUNCTION_ARGS) @@ -1754,10 +1753,10 @@ pg_get_partkeydef_worker(Oid relid, int prettyFlags, switch (form->partstrat) { - case PARTITION_STRATEGY_HASH: - if (!attrsOnly) - appendStringInfo(&buf, "HASH"); - break; + case PARTITION_STRATEGY_HASH: + if (!attrsOnly) + appendStringInfo(&buf, "HASH"); + break; case PARTITION_STRATEGY_LIST: if (!attrsOnly) appendStringInfo(&buf, "LIST"); @@ -1854,7 +1853,7 @@ pg_get_partition_constraintdef(PG_FUNCTION_ARGS) constr_expr = get_partition_qual_relid(relationId); - /* Quick exit if no partition constraint */ + /* Quick exit if no partition constraint */ if (constr_expr == NULL) PG_RETURN_NULL(); @@ -2131,12 +2130,12 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, pfree(options); } - /* - * Print the tablespace, unless it's the database default. 
- * This is to help ALTER TABLE usage of this facility, - * which needs this behavior to recreate exact catalog - * state. - */ + /* + * Print the tablespace, unless it's the database default. + * This is to help ALTER TABLE usage of this facility, + * which needs this behavior to recreate exact catalog + * state. + */ tblspc = get_rel_tablespace(indexId); if (OidIsValid(tblspc)) appendStringInfo(&buf, " USING INDEX TABLESPACE %s", @@ -2241,7 +2240,7 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, operators, false, false, - false, + false, prettyFlags, false)); break; @@ -9387,23 +9386,23 @@ get_rule_expr(Node *node, deparse_context *context, ListCell *cell; char *sep; - if (spec->is_default) - { - appendStringInfoString(buf, "DEFAULT"); - break; - } - + if (spec->is_default) + { + appendStringInfoString(buf, "DEFAULT"); + break; + } + switch (spec->strategy) { - case PARTITION_STRATEGY_HASH: - Assert(spec->modulus > 0 && spec->remainder >= 0); - Assert(spec->modulus > spec->remainder); - - appendStringInfoString(buf, "FOR VALUES"); - appendStringInfo(buf, " WITH (modulus %d, remainder %d)", - spec->modulus, spec->remainder); - break; - + case PARTITION_STRATEGY_HASH: + Assert(spec->modulus > 0 && spec->remainder >= 0); + Assert(spec->modulus > spec->remainder); + + appendStringInfoString(buf, "FOR VALUES"); + appendStringInfo(buf, " WITH (modulus %d, remainder %d)", + spec->modulus, spec->remainder); + break; + case PARTITION_STRATEGY_LIST: Assert(spec->listdatums != NIL); @@ -12063,18 +12062,14 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) Oid partoid = InvalidOid; int partidx = 0; List * result = NULL; - nparts = RelationGetNParts(rel); - for(partidx = 0; partidx < nparts; partidx++) { partname = GetPartitionName(RelationGetRelid(rel), partidx, false); partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partname) pfree(partname); partname = NULL; - if (InvalidOid == partoid) { continue; @@ -12083,7 +12078,6 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) { /* Get the lock to synchronize against concurrent drop */ LockRelationOid(partoid, lockmode); - /* * Now that we have the lock, double-check to see if the relation * really exists or not. 
If not, assume it was dropped while we @@ -12099,39 +12093,38 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) } result = lappend_oid(result, partoid); } - return result; } int -GetAllPartitionIntervalCount(Oid parent_oid) -{ - int count = 0; - List *children = NULL; - Relation rel = heap_open(parent_oid, NoLock); - - children = RelationGetAllPartitions(rel); - - if(children) - { - count = children->length; - list_free(children); - } - - heap_close(rel, NoLock); - - return count; -} - -Datum -partitions_number(PG_FUNCTION_ARGS) -{ - Oid parent_oid = PG_GETARG_OID(0); - int ret = GetAllPartitionIntervalCount(parent_oid); - PG_RETURN_INT32(ret); -} - -int +GetAllPartitionIntervalCount(Oid parent_oid) +{ + int count = 0; + List *children = NULL; + Relation rel = heap_open(parent_oid, NoLock); + + children = RelationGetAllPartitions(rel); + + if(children) + { + count = children->length; + list_free(children); + } + + heap_close(rel, NoLock); + + return count; +} + +Datum +partitions_number(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + int ret = GetAllPartitionIntervalCount(parent_oid); + PG_RETURN_INT32(ret); +} + +int RelationGetChildIndex(Relation rel, Oid childoid) { int nparts = 0; diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index db0106ea..fd7d7ae4 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -105,8 +105,6 @@ extern List *RelationGetAllPartitions(Relation rel); extern List *RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode); extern int GetAllPartitionIntervalCount(Oid parent_oid); -extern int GetAllPartitionIntervalCount(Oid parent_oid); - extern int RelationGetChildIndex(Relation rel, Oid childoid); extern Oid RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx); From 107470a78db5ae0050cc6d4197ec1dc521aecf41 Mon Sep 17 00:00:00 2001 From: jadenchi Date: Tue, 12 Apr 2022 16:36:20 +0800 Subject: [PATCH 536/578] fix: http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696097145423&url_cache_key=from_url_bug_query_list_ebb900eeeb806309840478207fdf43ae --- src/backend/parser/parse_utilcmd.c | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 08cb09e6..559aee3d 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3312,6 +3312,9 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, } +/* check the year is leak year or common year */ +#define is_leak_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) + /* * transformAlterTableStmt - * parse analysis for ALTER TABLE @@ -3472,6 +3475,10 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, int newnparts; Oid groupId; + struct pg_tm start_time; + fsec_t start_sec; + int gap = 0; + existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3480,6 +3487,36 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, elog(ERROR, "number of partitions to add cannot be negative or zero"); } + /* + * Self-developed partition table compatibility processing + */ + Form_pg_partition_interval routerinfo = NULL; + routerinfo = rel->rd_partitions_info; + + if (routerinfo->partdatatype == TIMESTAMPOID) + { + /* timestamp convert to posix struct */ + if(timestamp2tm(routerinfo->partstartvalue_ts, NULL, &start_time, &start_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + 
errmsg("timestamp out of range"))); + + if (routerinfo->partinterval_type == IntervalType_Day && + !is_leak_year(start_time.tm_year) && start_time.tm_mon <= 2 && start_time.tm_mday <= 28) + { + if (start_time.tm_mon < 2) + gap = (31 - start_time.tm_mday) + 28 + 1; + else + gap = 28 - start_time.tm_mday + 1; + + if (gap >= existnparts && gap <= newnparts + existnparts) + { + newnparts++; + ((AddDropPartitions*)cmd->def)->nparts = newnparts; + } + } + } + if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) { elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); From 4b3f78fb54308864fc09f1ded0293ba8ca926c5a Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 13 Apr 2022 11:35:36 +0800 Subject: [PATCH 537/578] fix: http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696097145423&url_cache_key=from_url_bug_query_list_ebb900eeeb806309840478207fdf43ae, modify some codes --- src/backend/parser/parse_utilcmd.c | 54 ++++++++++++++++++------------ src/backend/utils/adt/ruleutils.c | 33 ++++++++++++++++++ src/include/utils/ruleutils.h | 1 + 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 559aee3d..3c2edaa0 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3313,7 +3313,7 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, /* check the year is leak year or common year */ -#define is_leak_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) +#define is_leap_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) /* * transformAlterTableStmt - @@ -3472,12 +3472,16 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, { int existnparts; int partidx; + int realPartidx; int newnparts; + int realNewnparts; Oid groupId; struct pg_tm start_time; fsec_t start_sec; - int gap = 0; + int year; + int mon; + int day; existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3487,43 +3491,47 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, elog(ERROR, "number of partitions to add cannot be negative or zero"); } + if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) + { + elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); + } + /* * Self-developed partition table compatibility processing */ Form_pg_partition_interval routerinfo = NULL; routerinfo = rel->rd_partitions_info; - if (routerinfo->partdatatype == TIMESTAMPOID) - { /* timestamp convert to posix struct */ if(timestamp2tm(routerinfo->partstartvalue_ts, NULL, &start_time, &start_sec, NULL, NULL) != 0) ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); - if (routerinfo->partinterval_type == IntervalType_Day && - !is_leak_year(start_time.tm_year) && start_time.tm_mon <= 2 && start_time.tm_mday <= 28) - { - if (start_time.tm_mon < 2) - gap = (31 - start_time.tm_mday) + 28 + 1; - else - gap = 28 - start_time.tm_mday + 1; + year = start_time.tm_year; + mon = start_time.tm_mon; + day = start_time.tm_mday; - if (gap >= existnparts && gap <= newnparts + existnparts) + realPartidx = existnparts; + realNewnparts = newnparts; + + for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) { - newnparts++; - ((AddDropPartitions*)cmd->def)->nparts = newnparts; - } - } - } + /* + * for compatible with the calculation of the normal time of the self-developed partition table + */ + if 
(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) + { + calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); - if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) + if (mon == 2 && day == 28) { - elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); + partidx--; + realNewnparts++; + ((AddDropPartitions*)cmd->def)->nparts = realNewnparts; + } } - for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) - { TableLikeClause *likeclause = makeNode(TableLikeClause); CreateStmt * createpart = makeNode(CreateStmt); createpart->relation = copyObject((void *) stmt->relation); @@ -3535,7 +3543,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, createpart->tableElts = lappend(createpart->tableElts, likeclause); createpart->interval_child = true; - createpart->interval_child_idx = partidx; + createpart->interval_child_idx = realPartidx; createpart->interval_parentId = RelationGetRelid(rel); @@ -3620,6 +3628,8 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #else createlist = list_concat(createlist, transformCreateStmt(createpart, queryString, true)); #endif + + realPartidx++; } } else diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 5ff8c9e7..fa4fcd89 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -119,6 +119,7 @@ #ifdef __TBASE__ static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; +static int daysofmonth_common_year[13] = {0,31,28,31,30,31,30,31,31,30,31,30,31}; static struct pg_tm g_partition_base_time = { 0, 0, @@ -13254,4 +13255,36 @@ is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct return result; } + +/* + * base on a time, add step days + */ +void +calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) +{ + int monDays; + + if (!is_leap_year) + monDays = daysofmonth_common_year[*mon]; + else + monDays = daysofmonth[*year]; + + /* partition by one day */ + if (step == 1 && steptype == IntervalType_Day) + { + if (*day == monDays) + { + *day = 1; + if (*mon < 12) + (*mon)++; + else + { + *mon = 1; + (*year)++; + } + } + else + (*day)++; + } +} #endif diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index fd7d7ae4..db582a18 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -95,6 +95,7 @@ extern List *select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used); extern char *generate_collation_name(Oid collid); extern char *get_range_partbound_string(List *bound_datums); +extern void calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); #ifdef __TBASE__ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); From 1e42ab850e81fb9b55e870208e8b4fcd3148567f Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 13 Apr 2022 21:27:48 +0800 Subject: [PATCH 538/578] add regress for 'fix common year partition' --- src/test/regress/expected/partition.out | 14 ++++++++++++++ src/test/regress/sql/partition.sql | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index d63e6d2f..331f0ead 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -1016,3 +1016,17 @@ explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); 
reset enable_fast_query_shipping; drop table t_in_test; +-- for February of common year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-27 0:0:0') +step (interval '1 day') partitions (2) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index cc2e7dd0..564995be 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -457,3 +457,17 @@ set enable_fast_query_shipping to off; explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); reset enable_fast_query_shipping; drop table t_in_test; + +-- for February of common year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-27 0:0:0') +step (interval '1 day') partitions (2) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; From 16ae553cfb1adbe9eb420096e7a4d0cc526dce5d Mon Sep 17 00:00:00 2001 From: jadenchi Date: Fri, 15 Apr 2022 11:21:20 +0800 Subject: [PATCH 539/578] Modification some time calculation for AT_AddPartitions --- src/backend/parser/parse_utilcmd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 3c2edaa0..80f45171 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3512,6 +3512,12 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, mon = start_time.tm_mon; day = start_time.tm_mday; + if(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) + { + for(partidx = 1; partidx < existnparts; partidx++) + calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + } + realPartidx = existnparts; realNewnparts = newnparts; @@ -3524,7 +3530,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, { calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); - if (mon == 2 && day == 28) + if(mon == 3 && day == 1) { partidx--; realNewnparts++; From 51d57f24a6a8b40bbcf2c188237955f6121885bc Mon Sep 17 00:00:00 2001 From: jadenchi Date: Mon, 18 Apr 2022 16:35:14 +0800 Subject: [PATCH 540/578] add some regress for common/leap year partition --- src/test/regress/expected/partition.out | 36 ++++++++++++++++++++++-- src/test/regress/sql/partition.sql | 37 +++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index 331f0ead..af67ce24 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -1019,8 +1019,8 @@ drop table t_in_test; -- for February of common year timestamp 
partition, add sub table should be ok create table t_time_range (a int, b int, c timestamp) partition by range (c) begin -(timestamp without time zone '2022-02-27 0:0:0') -step (interval '1 day') partitions (2) +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (3) distribute by shard(a) to group default_group; NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. @@ -1030,3 +1030,35 @@ ERROR: value to inserted execeed range of partitioned table ALTER TABLE t_time_range ADD PARTITIONS 1; insert into t_time_range values(1, 1, '2022-03-1'); drop table t_time_range; +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (1) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_time_range values(1, 1, '2022-02-26'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; +-- for February of leap year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2020-02-26 0:0:0') +step (interval '1 day') partitions (3) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into t_time_range values(1, 1, '2020-02-26'); +insert into t_time_range values(1, 1, '2020-02-27'); +insert into t_time_range values(1, 1, '2020-02-28'); +insert into t_time_range values(1, 1, '2020-02-29'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2020-02-29'); +insert into t_time_range values(1, 1, '2020-03-01'); +drop table t_time_range; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index 564995be..43c95f97 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -461,8 +461,8 @@ drop table t_in_test; -- for February of common year timestamp partition, add sub table should be ok create table t_time_range (a int, b int, c timestamp) partition by range (c) begin -(timestamp without time zone '2022-02-27 0:0:0') -step (interval '1 day') partitions (2) +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (3) distribute by shard(a) to group default_group; @@ -471,3 +471,36 @@ insert into t_time_range values(1, 1, '2022-03-1'); ALTER TABLE t_time_range ADD PARTITIONS 1; insert into t_time_range values(1, 1, '2022-03-1'); drop table t_time_range; + +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (1) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2022-02-26'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; + +-- for February of leap year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2020-02-26 0:0:0') +step (interval '1 day') partitions (3) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2020-02-26'); +insert into t_time_range values(1, 1, '2020-02-27'); +insert into t_time_range values(1, 1, '2020-02-28'); +insert into t_time_range values(1, 1, '2020-02-29'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2020-02-29'); +insert into t_time_range values(1, 1, '2020-03-01'); +drop table t_time_range; + From c38ab6bcf3875c345617fb56262ec15dad6e4a2b Mon Sep 17 00:00:00 2001 From: jadenchi Date: Tue, 19 Apr 2022 15:54:26 +0800 Subject: [PATCH 541/578] modify function name calculate_time to add_day_calculation --- src/backend/parser/parse_utilcmd.c | 4 ++-- src/backend/utils/adt/ruleutils.c | 2 +- src/include/utils/ruleutils.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 80f45171..2be306b7 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3515,7 +3515,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, if(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) { for(partidx = 1; partidx < existnparts; partidx++) - calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + add_day_calculation(&year, &mon, &day, 1, IntervalType_Day, false); } realPartidx = existnparts; @@ -3528,7 +3528,7 
@@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, */ if (routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) { - calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + add_day_calculation(&year, &mon, &day, 1, IntervalType_Day, false); if(mon == 3 && day == 1) { diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index fa4fcd89..ef5eb6ca 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -13260,7 +13260,7 @@ is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct * base on a time, add step days */ void -calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) +add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) { int monDays; diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index db582a18..03d502f9 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -95,7 +95,7 @@ extern List *select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used); extern char *generate_collation_name(Oid collid); extern char *get_range_partbound_string(List *bound_datums); -extern void calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); +extern void add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); #ifdef __TBASE__ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); From 0bbd7846eb3447030f9393a8ea0921c5ac704d8b Mon Sep 17 00:00:00 2001 From: jadenchi Date: Fri, 22 Apr 2022 15:18:49 +0800 Subject: [PATCH 542/578] fix add_day_calculation get daysofmonth by mon --- src/backend/utils/adt/ruleutils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index ef5eb6ca..7d16c0f6 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -13267,7 +13267,7 @@ add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool if (!is_leap_year) monDays = daysofmonth_common_year[*mon]; else - monDays = daysofmonth[*year]; + monDays = daysofmonth[*mon]; /* partition by one day */ if (step == 1 && steptype == IntervalType_Day) From c96c7115f74309724d9fc728e8a1d80026339f64 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 15 Apr 2022 14:43:44 +0800 Subject: [PATCH 543/578] bugfix: server time different cause 2pc clean error (merge request 1170), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096815567 --- contrib/pg_clean/pg_clean--1.0.sql | 6 + .../pg_clean/pg_clean--unpackaged--1.0.sql | 1 + contrib/pg_clean/pg_clean.c | 934 ++++++++++++++---- src/backend/access/transam/twophase.c | 51 +- src/backend/access/transam/xlog.c | 42 +- src/backend/pgxc/pool/execRemote.c | 49 +- src/backend/postmaster/clean2pc.c | 45 +- src/backend/utils/misc/guc.c | 20 +- src/include/access/twophase.h | 4 + 9 files changed, 910 insertions(+), 242 deletions(-) diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql index e5bbc9ca..be8623f7 100644 --- a/contrib/pg_clean/pg_clean--1.0.sql +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -50,6 +50,11 @@ RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; +CREATE FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) RETURNS 
text AS 'MODULE_PATHNAME' @@ -96,6 +101,7 @@ GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql index a6a67659..d173a607 100644 --- a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -9,6 +9,7 @@ ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_prepare_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 68d916a5..4ee21911 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,8 +63,14 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120 -#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ + +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 1 /* should not clean twophase trans prepared in 1s */ #define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 10 /* should not clean twophase trans prepared in 10s */ +#define LEAST_CHECK_TIME_INTERVAL 3 /* should not check twophase trans prepared in 3s */ +#endif GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; @@ -72,19 +78,15 @@ PG_MODULE_MAGIC; #define MAX_GID 64 -#define CLEAN_CHECK_TIMES_DEFAULT 3 -#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 - -#define CLEAN_NODE_CHECK_TIMES 5 -#define CLEAN_NODE_CHECK_INTERVAL 500000 - #define MAX_DBNAME 64 #define GET_START_XID "startxid:" +#define GET_PREPARE_TIMESTAMP "global_prepare_timestamp:" #define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" #define GET_START_NODE "startnode:" #define GET_NODE "nodes:" #define GET_XID "\nxid:" #define GET_READONLY "readonly" +#define ROLLBACK_POSTFIX ".rollback" /* 2pc file postfix when the 2pc is rollbacked */ #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 #define STRING_BUFF_LEN 1024 @@ -190,6 +192,7 @@ typedef struct txn_info TXN_STATUS *txn_stat; /* Array for each nodes */ char *msg; /* Notice message for this txn. 
*/ GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + GlobalTimestamp global_prepare_timestamp; /* get global_prepare_timestamp from node once it is prepared*/ TXN_STATUS global_txn_stat; OPERATION op; @@ -262,8 +265,10 @@ database_info *last_database_info = NULL; bool execute = false; int total_twopc_txn = 0; -TimestampTz current_time; -GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +TimestampTz current_time = 0; +TimestampTz abnormal_time = 0; +GlobalTimestamp current_gts = InvalidGlobalTimestamp; /* use to save current gts */ +GlobalTimestamp abnormal_gts = InvalidGlobalTimestamp; /* use to save abnormal gts, clean 2PCs which prepare gts less than abnormal gts */ char *abnormal_nodename = NULL; Oid abnormal_nodeoid = InvalidOid; bool clear_2pc_belong_node = false; @@ -341,6 +346,14 @@ static void static void get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); +uint32 get_start_xid_from_gid(char *gid); +char *get_start_node_from_gid(char *gid); +Oid get_start_node_oid_from_gid(char *gid); + +bool is_xid_running_on_node(uint32 xid, Oid node_oid); +bool is_gid_start_xid_running(char *gid); +bool is_txn_start_xid_running(txn_info *txn); + Datum pg_clean_execute(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_clean_execute); Datum pg_clean_execute(PG_FUNCTION_ARGS) @@ -494,6 +507,7 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) char txn_status[100]; char txn_op[100]; char txn_op_issuccess[100]; + int64 time_gap = 0; Datum values[ACCESS_CONTROL_ATTR_NUM]; bool nulls[ACCESS_CONTROL_ATTR_NUM]; @@ -540,21 +554,36 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) execute = true; clear_2pc_belong_node = true; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pg_clean_execute_on_node: node name is empty"); + } abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); if (InvalidOid == abnormal_nodeoid) { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of " + "invalid nodename '%s'", abnormal_nodename); } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) + time_gap = current_time - abnormal_time; + if (time_gap < LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " - "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, + /*time gap less than LEAST_CLEAN_TIME_INTERVAL, can not clean*/ + elog(ERROR, "pg_clean_execute_on_node, least clean interval is %ds, " + "abnormal time: " INT64_FORMAT ", current time: " INT64_FORMAT, LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } + current_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + /*get invalid gts, can not clean*/ + elog(ERROR, "pg_clean_execute_on_node, get invalid gts"); + } + abnormal_gts = current_gts - time_gap; + /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -770,7 +799,9 @@ static void ResetGlobalVariables(void) head_database_info = last_database_info = NULL; current_time = 0; - abnormal_time = InvalidGlobalTimestamp; + abnormal_time = 0; + current_gts = InvalidGlobalTimestamp; + abnormal_gts = InvalidGlobalTimestamp; abnormal_nodename = NULL; abnormal_nodeoid = InvalidOid; clear_2pc_belong_node = false; @@ -922,7 +953,7 
@@ static void getDatabaseList(void) { int i; TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_database;"; + const char *query_db = "select datname::text from pg_catalog.pg_database"; /*add datname into tail of head_database_info*/ if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) { @@ -979,6 +1010,12 @@ static void getTxnInfoOnNodesAll(void) { int i; current_time = GetCurrentTimestamp(); + current_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + /*get invalid gts, get txn info error*/ + elog(ERROR, "getTxnInfoOnNodesAll, get invalid gts"); + } /*upload 2PC transaction from CN*/ for (i = 0; i < cn_nodes_num; i++) { @@ -1002,10 +1039,12 @@ void getTxnInfoOnNode(Oid node) TupleTableSlots result_txn; Datum execute_res; char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts;"; - const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts where database = '%s';"; + const char *query_txn_status = "select transaction::text, gid::text, " + "owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_catalog.pg_prepared_xacts"; + const char *query_txn_status_execute = "select transaction::text, gid::text, " + "owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_catalog.pg_prepared_xacts where database = '%s'"; snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); if (execute) @@ -1106,6 +1145,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) char *file_content = NULL; uint32 startxid = 0; char *str_startxid = NULL; + char *str_prepare_gts = NULL; char *str_timestamp = NULL; char *temp = NULL; Oid temp_nodeoid; @@ -1113,7 +1153,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) int temp_nodeidx; char stmt[1024]; static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + snprintf(stmt, 1024, STMT_FORM, txn->gid); if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) { @@ -1126,6 +1166,12 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) { file_content = TTSgetvalue(&result, 0, 0); + if (strlen(file_content) == 0) + { + elog(LOG, "gid: %s, 2pc file is not exist", txn->gid); + return TWOPHASE_FILE_NOT_EXISTS; + } + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) { txn->is_readonly = true; @@ -1135,6 +1181,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) } startnode = strstr(file_content, GET_START_NODE); str_startxid = strstr(file_content, GET_START_XID); + str_prepare_gts = strstr(file_content, GET_PREPARE_TIMESTAMP); partnodes = strstr(file_content, GET_NODE); temp = strstr(file_content, GET_COMMIT_TIMESTAMP); @@ -1146,6 +1193,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) temp = strstr(temp, GET_COMMIT_TIMESTAMP); } + /* get start node name */ if (startnode) { startnode += strlen(GET_START_NODE); @@ -1153,6 +1201,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->origcoord = get_pgxc_nodeoid(startnode); } + /* get start xid */ if (str_startxid) { str_startxid += strlen(GET_START_XID); @@ -1161,6 
+1210,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->startxid = startxid; } + /* get participated nodes */ if (partnodes) { partnodes += strlen(GET_NODE); @@ -1183,15 +1233,37 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) return res; } + /* get prepare gts */ + if (str_prepare_gts) + { + str_prepare_gts += strlen(GET_PREPARE_TIMESTAMP); + str_prepare_gts = strtok(str_prepare_gts, "\n"); + txn->global_prepare_timestamp = strtoull(str_prepare_gts, NULL, 10); + } + else + { + txn->global_prepare_timestamp = InvalidGlobalTimestamp; + } + + /* get commit gts */ if (str_timestamp) { str_timestamp += strlen(GET_COMMIT_TIMESTAMP); str_timestamp = strtok(str_timestamp, "\n"); txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); } + else + { + txn->global_commit_timestamp = InvalidGlobalTimestamp; + } + + elog(DEBUG1, "get 2pc txn: %s, partnodes in nodename: %s(nodeoid:%u), " + "partnodes: (%s), startnode: %s(startnodeoid: %u), startxid: %u, " + "global_prepare_timestamp: %ld, global_commit_timestamp: %ld", + txn->gid, get_pgxc_nodename(node_oid), node_oid, + partnodes, startnode, txn->origcoord, startxid, + txn->global_prepare_timestamp, txn->global_commit_timestamp); - elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", - txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); /* in explicit transaction startnode participate the transaction */ if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) { @@ -1457,7 +1529,8 @@ void getTxnInfoOnOtherNodes(txn_info *txn) node_oid = get_pgxc_nodeoid(ptr); status = GetTransactionPartNodes(txn, node_oid); } - else + + if (status == TWOPHASE_FILE_NOT_EXISTS) { for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) { @@ -1622,7 +1695,7 @@ void getTxnStatus(txn_info *txn, int node_idx) TupleTableSlots result; static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx]); node_oid = find_node_oid(node_idx); if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) @@ -1713,6 +1786,10 @@ char *get2PCInfo(const char *tid) return NULL; } +/* + * pgxc_get_2pc_file + * Get 2pc file content + */ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) @@ -1721,6 +1798,10 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) char *result = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1732,7 +1813,10 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - +/* + * pgxc_get_2pc_nodes + * Get 2pc participants + */ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) @@ -1742,6 +1826,10 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1756,10 +1844,13 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) return PointerGetDatum(t_result); } } - PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_startnode + * Get 2pc start node + */ Datum 
pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) @@ -1769,6 +1860,10 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1787,6 +1882,10 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_startxid + * Get 2pc start xid + */ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) @@ -1796,6 +1895,10 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) char *startxid = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1813,7 +1916,44 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_prepare_timestamp + * Get 2pc prepare timestamp + */ +Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_prepare_timestamp); +Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *prepare_timestamp = NULL; + text *t_result = NULL; + + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + prepare_timestamp = strstr(result, GET_PREPARE_TIMESTAMP); + if (NULL != prepare_timestamp) + { + prepare_timestamp += strlen(GET_PREPARE_TIMESTAMP); + prepare_timestamp = strtok(prepare_timestamp, "\n"); + t_result = cstring_to_text(prepare_timestamp); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} +/* + * pgxc_get_2pc_commit_timestamp + * Get 2pc commit timestamp + */ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) @@ -1823,6 +1963,10 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) char *commit_timestamp = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1840,17 +1984,23 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - - +/* + * pgxc_get_2pc_xid + * Get 2pc local xid + */ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { + GlobalTransactionId xid; char *tid = NULL; char *result = NULL; char *str_xid = NULL; - GlobalTransactionId xid; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1868,16 +2018,31 @@ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_remove_2pc_records + * Remove a 2pc file + */ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { - char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + char *tid = NULL; + + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + remove_2pc_records(tid, true); pfree(tid); 
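+	/* the 2pc record file has been removed above; report success to the caller */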
PG_RETURN_BOOL(true); } +/* + * pgxc_clear_2pc_records + * Clear all 2pc files which are not running + */ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) @@ -1901,6 +2066,8 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) elog(ERROR, "can only called on coordinator"); } + elog(LOG, "clear 2pc files"); + mycontext = AllocSetContextCreate(CurrentMemoryContext, "clean_check", ALLOCSET_DEFAULT_MINSIZE, @@ -1909,25 +2076,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(mycontext); ResetGlobalVariables(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while((ptr = readdir(dir)) != NULL) - { - if (count > 999) - break; - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - count++; - } - - closedir(dir); - } -#endif /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -1948,28 +2096,14 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) { (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } + /*get all database info*/ getDatabaseList(); /*get all info of 2PC transactions*/ getTxnInfoOnNodesAll(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while (i < count) - { - if (!find_txn(path[i])) - { - unlink(path[i]); - WriteClean2pcXlogRec(path[i]); - } - i++; - } - closedir(dir); - } -#endif - /*delete all rest 2pc file in each nodes*/ + /*delete all rest 2pc files in each cn*/ for (i = 0; i < cn_nodes_num; i++) { if (0 == result[i].slot_count) @@ -1977,24 +2111,54 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + { continue; + } + + /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - while(ptr) + for (;ptr != NULL; ptr = strtok(NULL, ",")) { if (count >= MAXIMUM_CLEAR_FILE) + { break; - if (!find_txn(ptr)) + } + + /*whether 2pc is running?*/ + if (find_txn(ptr)) + { + /*2pc is running, do not delete its file*/ + continue; + } + + /*whether 2pc is rollbacked?*/ + if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) { + /*2pc is not rollbacked*/ + + /*whether 2pc start xid transaction is running?*/ + if (is_gid_start_xid_running(ptr)) + { + /*2pc start xid transaction is running, do not delete its file*/ + elog(LOG, "2PC '%s' is running", ptr); + continue; + } + } + + /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + elog(LOG, "clear 2pc file: %s", ptr); + if (execute_query_on_single_node(cn_node_list[i], + clear_query, 1, &clear_result) == (Datum)0) + { res = false; + } DropTupleTableSlots(&clear_result); count++; } - ptr = strtok(NULL, ","); - } } + /*delete all rest 2pc files in each dn*/ for (i = 0; i < dn_nodes_num; i++) { if (0 == result[cn_nodes_num+i].slot_count) @@ -2002,22 +2166,51 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + { continue; + } + + /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - while(ptr) + for (;ptr != NULL; ptr = strtok(NULL, ",")) { if (count >= MAXIMUM_CLEAR_FILE) + { break; - if (!find_txn(ptr)) + } + + /*whether 2pc is running?*/ + if (find_txn(ptr)) + { + /*2pc is running, do not delete its file*/ + 
continue; + } + + /*whether 2pc is rollbacked?*/ + if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) + { + /*2pc is not rollbacked*/ + + /*whether 2pc start xid transaction is running?*/ + if (is_gid_start_xid_running(ptr)) { + /*2pc start xid transaction is running, do not delete its file*/ + elog(LOG, "2PC '%s' is running", ptr); + continue; + } + } + + /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + elog(LOG, "clear 2pc file: %s", ptr); + if (execute_query_on_single_node(dn_node_list[i], + clear_query, 1, &clear_result) == (Datum)0) + { res = false; + } DropTupleTableSlots(&clear_result); count++; } - ptr = strtok(NULL, ","); - } } for (i = 0; i < pgxc_clean_node_count; i++) @@ -2033,6 +2226,10 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } +/* + * pgxc_get_record_list + * Get 2pc files list + */ Datum pgxc_get_record_list(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_record_list); Datum pgxc_get_record_list(PG_FUNCTION_ARGS) @@ -2047,7 +2244,11 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) recordList = get_2pc_list_from_cache(&count); if (count >= MAXIMUM_OUTPUT_FILE) { - Assert(NULL != recordList); + if (NULL == recordList) + { + elog(PANIC, "recordList is NULL"); + } + t_recordList = cstring_to_text(recordList); return PointerGetDatum(t_recordList); } @@ -2130,8 +2331,18 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pgxc_commit_on_node: node name is empty"); + } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + + if (0 == PG_GETARG_DATUM(1)) + { + elog(ERROR, "pgxc_commit_on_node: gid is empty"); + } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2163,6 +2374,10 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) else { txn->global_commit_timestamp = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + elog(ERROR, "pgxc_commit_on_node, get invalid gts"); + } } } @@ -2236,8 +2451,18 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pgxc_abort_on_node: node name is empty"); + } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + + if (0 == PG_GETARG_DATUM(1)) + { + elog(ERROR, "pgxc_abort_on_node: gid is empty"); + } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2403,6 +2628,15 @@ bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const cha TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); } + if (InvalidGlobalTimestamp != txn->global_prepare_timestamp && + pgxc_node_send_prepare_timestamp(conn, txn->global_prepare_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send prepare timestamp for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + } + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) { ereport(ERROR, @@ -2428,7 +2662,13 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); - Assert(InvalidOid != abnormal_nodeoid); + + /* abnormal node oid must be valid here */ + if (InvalidOid == abnormal_nodeoid) + { + elog(PANIC, "abnormal_nodeoid is invalid"); + } + if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2448,9 +2688,7 @@ bool check_2pc_belong_node(txn_info * txn) if (InvalidOid == txn->origcoord) { - char *startnode = NULL; int node_oid = InvalidOid; - char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { @@ -2458,39 +2696,16 @@ bool check_2pc_belong_node(txn_info * txn) return true; } - Assert(IsXidImplicit(txn->gid)); - - /* get start node from gid */ - strcpy(gid, txn->gid); - startnode = strtok(gid, ":"); - if (NULL == startnode) - { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); - txn->belong_abnormal_node = false; - return false; - } - - startnode = strtok(NULL, ":"); - if (NULL == startnode) + /* Get start node oid from gid */ + node_oid = get_start_node_oid_from_gid(txn->gid); + if (node_oid == InvalidOid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); txn->belong_abnormal_node = false; return false; } - node_oid = get_pgxc_nodeoid(startnode); - if (NULL == startnode) - { - elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", - startnode, gid); - txn->belong_abnormal_node = false; - return false; - } - - elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", - node_oid, startnode, gid); + elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); if (abnormal_nodeoid == node_oid) { @@ -2518,22 +2733,13 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { - int i = 0; - bool check_ok = false; - int check_times = CLEAN_CHECK_TIMES_DEFAULT; - int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; + bool is_running = true; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; - if (clear_2pc_belong_node) - { - check_times = CLEAN_NODE_CHECK_TIMES; - check_interval = CLEAN_NODE_CHECK_INTERVAL; - } - #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2567,46 +2773,59 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - /* check whether all nodes can commit prepared */ - for (i = 0; i < check_times; i++) + + /* check whether the 2pc start xid is 0 */ + if (txn->startxid == 0 && IsXidImplicit(txn->gid)) + { + elog(WARNING, "Commit 2PC '%s' start xid is 0", txn->gid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc start xid is still running on start node */ + if (is_txn_start_xid_running(txn)) { - check_ok = true; + elog(WARNING, "Commit 2PC '%s' start xid %d is running", + txn->gid, txn->startxid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc is still running on participants */ + is_running = false; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, true, true)) { - check_ok = false; - elog(LOG, "check commit 2PC transaction %s failed", - txn->gid); + is_running = true; + elog(WARNING, "Commit 2PC '%s' check failed", txn->gid); } } PG_CATCH(); { + is_running 
= true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - check_ok = false; - elog(WARNING, "check commit 2PC transaction %s error: %s", + elog(WARNING, "Commit 2PC '%s' is running, error: %s", txn->gid, edata->message); } PG_END_TRY(); - if (!check_ok) + /* 2pc is still running, do not try to clean */ + if (is_running) { txn->op_issuccess = false; return; } - pg_usleep(check_interval); - } - /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; - elog(LOG, "commit 2PC transaction %s failed", txn->gid); + elog(WARNING, "Commit 2PC '%s' failed", txn->gid); return; } txn->op_issuccess = true; @@ -2616,46 +2835,57 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - /* check whether all nodes can rollback prepared */ - for (i = 0; i < check_times; i++) + + /* check whether the 2pc start xid is 0 */ + if (txn->startxid == 0 && IsXidImplicit(txn->gid)) { - check_ok = true; + elog(WARNING, "Rollback 2PC '%s' start xid is 0", txn->gid); + } + + /* check whether the 2pc start xid is still running on start node */ + if (is_txn_start_xid_running(txn)) + { + elog(WARNING, "Rollback 2PC '%s' start xid %d is running", + txn->gid, txn->startxid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc is still running on participants */ + is_running = false; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, false, true)) { - check_ok = false; - elog(LOG, "check rollback 2PC transaction %s failed", - txn->gid); + is_running = true; + elog(WARNING, "Rollback 2PC '%s' check failed", txn->gid); } } PG_CATCH(); { - check_ok = false; + is_running = true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "check rollback 2PC transaction %s error: %s", + elog(WARNING, "Rollback 2PC '%s' is running, error: %s", txn->gid, edata->message); } PG_END_TRY(); - if (!check_ok) + /* 2pc is still running, do not try to clean */ + if (is_running) { txn->op_issuccess = false; return; } - pg_usleep(check_interval); - } - /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; - elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + elog(WARNING, "Rollback 2PC '%s' failed", txn->gid); return; } txn->op_issuccess = true; @@ -2685,7 +2915,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #define TXN_INPROGRESS 0X0020 int ii; int check_flag = 0; - int node_idx = 0; TimestampTz prepared_time = 0; TimestampTz time_gap = clean_time_interval; @@ -2770,43 +2999,124 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } #endif - if (clear_2pc_belong_node) + + /* start xid is 0, maybe at the beginning of the 2pc */ + if (txn->startxid == 0) { - if (!check_2pc_belong_node(txn)) + /* prepare timestamp must be invalid */ + if (GlobalTimestampIsValid(txn->global_prepare_timestamp)) { - return TXN_STATUS_INPROGRESS; + elog(PANIC, "gid: %s, start xid is 0, global_prepare_timestamp: %ld", + txn->gid, txn->global_prepare_timestamp); } - if (!check_2pc_start_from_node(txn)) + elog(DEBUG2, "2PC '%s' start xid is 0", txn->gid); + + if (check_flag & TXN_INPROGRESS + || current_time - prepared_time <= time_gap) { + /* inprogress or less than time gap, do not clean it */ + elog(LOG, "2PC '%s' start xid is 0, inprogress, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, current_time, 
prepared_time, + time_gap, current_time - prepared_time); + return TXN_STATUS_INPROGRESS; } + else + { + /* otherwise, abort it */ + elog(WARNING, "2PC '%s' start xid is 0, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, current_time, prepared_time, + time_gap, current_time - prepared_time); + + return TXN_STATUS_ABORTED; + } + } - node_idx = find_node_index(abnormal_nodeoid); - if (node_idx >= 0) + /* use for upgrade from old version, no prepare timestamp in old version */ + if (!GlobalTimestampIsValid(txn->global_prepare_timestamp)) { - if (abnormal_time < txn->prepare_timestamp[node_idx]) + elog(WARNING, "gid: %s, start xid is %d, global_prepare_timestamp " + "is invalid", txn->gid, txn->startxid); + + if (check_flag & TXN_INPROGRESS + || current_time - prepared_time <= time_gap) { - elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT - ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, - abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); + /* inprogress or less than time gap, do not clean it */ + elog(WARNING, "gid: %s, start xid is %d, inprogress, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, txn->startxid, current_time, prepared_time, + time_gap, current_time - prepared_time); return TXN_STATUS_INPROGRESS; } + else + { + /* otherwise, set prepare timestamp */ + if (clear_2pc_belong_node) + { + txn->global_prepare_timestamp = abnormal_gts; } else { - elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + txn->global_prepare_timestamp = current_gts - time_gap; + } + + elog(WARNING, "gid: %s, start xid is %d, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld, " + "set global_prepare_timestamp: %ld", + txn->gid, txn->startxid, current_time, prepared_time, + time_gap, current_time - prepared_time, + txn->global_prepare_timestamp); + } + } + + if (clear_2pc_belong_node) + { + if (!check_2pc_belong_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + if (!check_2pc_start_from_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + /* abnormal gts must be valid */ + if (!GlobalTimestampIsValid(abnormal_gts)) + { + elog(PANIC, "gid: %s, abnormal_gts is invalid gts", txn->gid); } - if (abnormal_time < prepared_time) + /* abnormal gts less than prepare gts, do not clean it */ + if (abnormal_gts < txn->global_prepare_timestamp) { - elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT - ", prepared time: " INT64_FORMAT, txn->gid, - abnormal_time, prepared_time); + elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT + ", prepare gts: " INT64_FORMAT, txn->gid, + abnormal_gts, txn->global_prepare_timestamp); return TXN_STATUS_INPROGRESS; } + + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + /* abnormal gts less than commit gts, do not clean it */ + if (abnormal_gts < txn->global_commit_timestamp) + { + elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT + ", commit gts: " INT64_FORMAT, txn->gid, + abnormal_gts, txn->global_commit_timestamp); + + return TXN_STATUS_INPROGRESS; + } + } } else { @@ -2815,8 +3125,36 @@ TXN_STATUS check_txn_global_status(txn_info *txn) /* transaction inprogress */ return TXN_STATUS_INPROGRESS; } + + /* current gts must be valid */ + if (!GlobalTimestampIsValid(current_gts)) + { + elog(PANIC, "gid: %s, current_gts is invalid gts", txn->gid); + } + + /* 2pc prepare gts gap less than time gap, do not clean it */ + if (current_gts - txn->global_prepare_timestamp < time_gap) + { + elog(LOG, "gid: %s, current gts: " INT64_FORMAT 
+ ", prepare gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, + txn->gid, current_gts, txn->global_prepare_timestamp, time_gap); + + return TXN_STATUS_INPROGRESS; } + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + /* 2pc commit gts gap less than time gap, do not clean it */ + if (current_gts - txn->global_commit_timestamp <= time_gap) + { + elog(LOG, "gid: %s, current gts: " INT64_FORMAT + ", commit gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, + txn->gid, current_gts, txn->global_commit_timestamp, time_gap); + + return TXN_STATUS_INPROGRESS; + } + } + } if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) { @@ -2837,6 +3175,21 @@ TXN_STATUS check_txn_global_status(txn_info *txn) if (check_flag & TXN_COMMITTED) /* Some 2PC transactions are committed. Need to commit others. */ return TXN_STATUS_COMMITTED; + + /* If 2PC commit gts is valid, must commit it. */ + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + elog(LOG, "'%s' global_commit_timestamp: %ld", + txn->gid, txn->global_commit_timestamp); + + if (!(check_flag & TXN_PREPARED)) + { + elog(PANIC, "gid: %s, check_flag: %d", txn->gid, check_flag); + } + + return TXN_STATUS_COMMITTED; + } + /* All the transactions remain prepared. No need to recover. */ return TXN_STATUS_ABORTED; } @@ -2901,6 +3254,11 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -2934,6 +3292,11 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->coord_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -2961,7 +3324,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) } #endif } - } /* receive response */ @@ -3000,10 +3362,14 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) if (txn->origcoord != InvalidOid) { node_idx = find_node_index(txn->origcoord); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (txn->coordparts[node_idx] == 1) { /*send global timestamp to dn_node_list[ii]*/ - if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) { get_node_handles(&pgxc_handles, txn->origcoord); @@ -3072,7 +3438,8 @@ bool clean_2PC_files(txn_info * txn) } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", + txn->gid, get_pgxc_nodename(dn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3086,14 +3453,15 @@ bool clean_2PC_files(txn_info * txn) { if (TTSgetvalue(&result, 0, 0) == false) { - elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", txn->gid, get_pgxc_nodename(txn->coordparts[ii])); issuccess = false; } } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, 
get_pgxc_nodename(cn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", + txn->gid, get_pgxc_nodename(cn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3378,12 +3746,14 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } - bool check_2pc_start_from_node(txn_info *txn) { char node_type; - Assert(InvalidOid != abnormal_nodeoid); + if (InvalidOid == abnormal_nodeoid) + { + elog(PANIC, "gid: %s, abnormal_nodeoid is invalid", txn->gid); + } if (abnormal_nodeoid == txn->origcoord) { @@ -3398,51 +3768,239 @@ bool check_2pc_start_from_node(txn_info *txn) if (InvalidOid == txn->origcoord) { - char *startnode = NULL; int node_oid = InvalidOid; - char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { return true; } - Assert(IsXidImplicit(txn->gid)); - - /* get start node from gid */ - strcpy(gid, txn->gid); - startnode = strtok(gid, ":"); - if (NULL == startnode) + /* Get start node oid from gid */ + node_oid = get_start_node_oid_from_gid(txn->gid); + if (InvalidOid == node_oid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); return false; } - startnode = strtok(NULL, ":"); - if (NULL == startnode) + elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + + if (abnormal_nodeoid == node_oid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + return true; + } + } + return false; } - node_oid = get_pgxc_nodeoid(startnode); - if (NULL == startnode) +/* + * get_start_node_from_gid + * Get start node name from gid + * gid: 2pc gid + */ +char *get_start_node_from_gid(char *gid) { - elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", - startnode, gid); - return false; + char *str_start_node = NULL; + + if (!IsXidImplicit(gid)) + { + elog(WARNING, "2PC '%s' is not implicit", gid); + return NULL; + } + + /* Get start node name from gid */ + str_start_node = strtok(gid, ":"); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return NULL; } - elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", - node_oid, startnode, gid); + str_start_node = strtok(NULL, ":"); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return NULL; + } - if (abnormal_nodeoid == node_oid) + return str_start_node; +} + +/* + * get_start_node_oid_from_gid + * Get start node oid from gid + * gid: 2pc gid + */ +Oid get_start_node_oid_from_gid(char *gid) +{ + Oid start_node_oid = 0; + char *str_start_node = NULL; + char gid_buf[MAX_GID]; + + /* Get start node oid from gid */ + strcpy(gid_buf, gid); + str_start_node = get_start_node_from_gid(gid_buf); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return 0; + } + + elog(LOG, "Get start node(%s) from gid(%s)", str_start_node, gid); + + start_node_oid = get_pgxc_nodeoid(str_start_node); + if (start_node_oid == InvalidOid) + { + elog(WARNING, "Get invalid oid for start node(%s) from gid(%s)", + str_start_node, gid); + return 0; + } + + return start_node_oid; +} + +/* + * get_start_xid_from_gid + * Get start xid from gid + * gid: 2pc gid + */ +uint32 get_start_xid_from_gid(char *gid) +{ + uint32 start_xid = 0; + char *str_start_xid = NULL; + char gid_buf[MAX_GID]; + + if (!IsXidImplicit(gid)) + { + elog(WARNING, "2PC '%s' is not implicit", gid); 
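+        /* only implicit gids embed a start xid after XIDPREFIX, so there is nothing to parse */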
+ return 0; + } + + /* Get start xid from gid */ + strcpy(gid_buf, gid); + str_start_xid = gid_buf + strlen(XIDPREFIX); + str_start_xid = strtok(str_start_xid, ":"); + start_xid = strtoul(str_start_xid, NULL, 10); + if (start_xid == 0) { + elog(WARNING, "Get start xid from gid(%s) failed", gid); + return 0; + } + + return start_xid; +} + +/* + * is_xid_running_on_node + * Whether the transaction with the xid is still running on the node + * xid: transaction id + * node_oid: node oid + */ +bool is_xid_running_on_node(uint32 xid, Oid node_oid) +{ + bool is_running = true; + + Datum execute_res; + TupleTableSlots result; + char command[MAX_CMD_LENGTH]; + + if (xid == 0 || node_oid == InvalidOid) + { + elog(PANIC, "2PC xid: %d, node oid: %d", xid, node_oid); return true; } + + snprintf(command, MAX_CMD_LENGTH, "select pid::text, backend_xid::text " + "from pg_catalog.pg_stat_activity where backend_xid=%d", xid); + + execute_res = execute_query_on_single_node(node_oid, command, 2, &result); + if (execute_res == (Datum) 1) + { + if (result.slot_count == 0) + { + is_running = false; + } + else + { + is_running = true; + + if (result.slot_count != 1) + { + elog(PANIC, "Get %d resules for xid: %d", result.slot_count, xid); + } + } + } + else + { + elog(WARNING, "pg_clean: Faile to query xid %d on node %s", + xid, get_pgxc_nodename(node_oid)); + is_running = true; + } + DropTupleTableSlots(&result); + + return is_running; } +/* + * is_gid_start_xid_running + * Whether the transaction with the start xid is still running on start node + * gid: 2pc gid + */ +bool is_gid_start_xid_running(char *gid) +{ + uint32 start_xid = 0; + Oid start_node_oid = InvalidOid; + + if (!IsXidImplicit(gid)) + { + elog(LOG, "Explicit 2PC '%s'", gid); + return true; + } + + /* Get start xid from gid */ + start_xid = get_start_xid_from_gid(gid); + if (start_xid == 0) + { + elog(ERROR, "Get start xid from gid(%s) failed", gid); + return true; + } + + elog(LOG, "Get start xid(%d) from gid(%s)", start_xid, gid); + + /* Get start node oid from gid */ + start_node_oid = get_start_node_oid_from_gid(gid); + if (start_node_oid == InvalidOid) + { + elog(WARNING, "Get invalid start node oid from gid(%s)", gid); return false; } + + elog(LOG, "Get start node oid(%d) from gid(%s)", start_node_oid, gid); + + return is_xid_running_on_node(start_xid, start_node_oid); +} + +/* + * is_txn_start_xid_running + * Whether the transaction with the start xid is still running on start node + * txn: 2pc transaction info + */ +bool is_txn_start_xid_running(txn_info *txn) +{ + if (txn->startxid != 0) + { + Assert(txn->origcoord != InvalidOid); + return is_xid_running_on_node(txn->startxid, txn->origcoord); + } + + Assert(txn->origcoord == InvalidOid); + + if (!IsXidImplicit(txn->gid)) + { + elog(LOG, "Explicit 2PC '%s' start xid is %d", txn->gid, txn->startxid); + return false; + } + + return is_gid_start_xid_running(txn->gid); +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index e78f9c53..ea188961 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2392,12 +2392,11 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if (!save_and_remove_2pc_info(gxact->gid)) { - elog(LOG, "[%s] %s save to file failed", - __FUNCTION__, gxact->gid); + elog(DEBUG1, "checkpoint: %s save to file failed", gxact->gid); } else { - elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); + elog(LOG, "checkpoint: %s is saved to file", gxact->gid); } } #endif @@ -3741,10 +3740,12 @@ 
void record_2pc_involved_nodes_xid(const char * tid, File fd = 0; int ret = 0; int size = 0; + int pg_clean_check_size = 0; StringInfoData content; struct stat fst; char path[MAXPGPATH]; char *result = NULL; + GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; @@ -3755,6 +3756,18 @@ void record_2pc_involved_nodes_xid(const char * tid, return; } + prepare_gts = GetGlobalPrepareTimestamp(); + if (!GlobalTimestampIsValid(prepare_gts)) + { + elog(WARNING, "prepare gts is invalid"); + prepare_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(prepare_gts)) + { + elog(ERROR, "get gts for prepare is invalid"); + } + SetGlobalPrepareTimestamp(prepare_gts); + } + if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", @@ -3780,6 +3793,10 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "startxid:%u\n", startxid); appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); + pg_clean_check_size = content.len; + Assert(pg_clean_check_size == strlen(content.data)); + + appendStringInfo(&content, "global_prepare_timestamp:%ld\n", prepare_gts); size = content.len; Assert(size == strlen(content.data)); @@ -3798,11 +3815,10 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(strlen(info) < MAX_2PC_INFO_SIZE); check_2pc_file(tid, info, __FUNCTION__); - if (strncmp(info, content.data, size) != 0) + if (pg_strncasecmp(info, content.data, pg_clean_check_size) != 0) { - elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", __FUNCTION__, tid, - content.data, info); + elog(ERROR, "pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", tid, content.data, info); } resetStringInfo(&content); @@ -3836,11 +3852,10 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(NULL != result); - if (strncmp(result, content.data, size) != 0) + if (pg_strncasecmp(result, content.data, pg_clean_check_size) != 0) { - elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", - __FUNCTION__, tid, content.data, result); + elog(ERROR, "pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", tid, content.data, result); } pfree(result); @@ -3853,12 +3868,16 @@ void record_2pc_involved_nodes_xid(const char * tid, if (!RecoveryInProgress()) { + char *fmt_v2 = XLOG_FMT_2PC_V2; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)fmt_v2, strlen(fmt_v2) + 1); XLogRegisterData((char *)startnode, strlen(startnode) + 1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId)); XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId)); + XLogRegisterData((char *)&prepare_gts, sizeof(GlobalTimestamp)); + #ifdef __TWO_PHASE_TESTS__ xlogrec = #endif @@ -3973,7 +3992,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync 
XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -4178,7 +4197,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); XLogRegisterData((char *)type, strlen(type) + 1); - XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz)); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } @@ -4388,7 +4407,7 @@ char *get_2pc_list_from_cache(int *count) { recordList = (char *) repalloc(recordList, strlen(entry->key) + strlen(recordList) + 2); - sprintf(recordList, "%s,%s", recordList, entry->key); + sprintf(recordList + strlen(recordList), ",%s", entry->key); } if (++(*count) >= MAX_OUTPUT_FILE) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7044cd8b..99cc62f3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10177,10 +10177,10 @@ xlog_redo(XLogReaderState *record) TimestampTz timestamp = 0; gid = XLogRecGetData(record); type = gid + strlen(gid) + 1; - pos = type + strlen(type) + 1; - memcpy(×tamp, pos, sizeof(TimestampTz)); if (0 == strcmp(type, "rename")) { + pos = type + strlen(type) + 1; + memcpy(×tamp, pos, sizeof(TimestampTz)); rename_2pc_records(gid, timestamp); } else @@ -10192,11 +10192,13 @@ xlog_redo(XLogReaderState *record) { TransactionId xid; TransactionId startxid; + GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; + char *fmt_v2 = XLOG_FMT_2PC_V2; char *gid; char *startnode; char *nodestring; char *pos; - char *temp; + char *type; #ifdef __TWO_PHASE_TESTS__ TransactionId old_shem_nextxid = ShmemVariableCache->nextXid; #endif @@ -10204,27 +10206,48 @@ xlog_redo(XLogReaderState *record) gid = XLogRecGetData(record); pos = gid + strlen(gid) +1; /* if the transaction is readonly */ - temp = pos; - pos = pos + strlen(temp) +1; + type = pos; + pos = pos + strlen(type) + 1; - if (0 != strcmp(temp, "readonly")) + if (0 != strcmp(type, "readonly")) { - startnode = temp; + if (0 == strcmp(type, fmt_v2)) + { + startnode = pos; + pos = pos + strlen(startnode) + 1; + memcpy(&startxid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId); + nodestring = pos; + pos = pos + strlen(nodestring) + 1; + memcpy(&xid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId); + memcpy(&prepare_gts, pos, sizeof(GlobalTimestamp)); + pos = pos + sizeof(GlobalTimestamp); + } + else + { + /* compatible with old format */ + startnode = type; memcpy(&startxid, pos, sizeof(TransactionId)); pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId) + 1; + } + if (enable_distri_print) { elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " - "startxid: %u, nodestring: %s, xid: %u", - gid, startnode, startxid, nodestring, xid); + "startxid: %u, prepare_gts: %ld, nodestring: %s, xid: %u", + gid, startnode, startxid, prepare_gts, nodestring, xid); } + #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) { elog(LOG, "FILE_XLOG_EXISTED complish"); + SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } #endif @@ -10248,6 +10271,7 @@ xlog_redo(XLogReaderState *record) LWLockRelease(XidGenLock); } + SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } else diff --git a/src/backend/pgxc/pool/execRemote.c 
b/src/backend/pgxc/pool/execRemote.c index 71e4c53b..2cedb37c 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3974,11 +3974,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); + elog(LOG, "prepare remote transaction xid %d gid %s", + GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); @@ -3988,17 +3987,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) global_prepare_ts = 0; } #endif - if(!GlobalTimestampIsValid(global_prepare_ts)){ + + if (!GlobalTimestampIsValid(global_prepare_ts)) + { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("failed to get global timestamp for PREPARED command"))); } if(enable_distri_print) { - elog(LOG, "prepare phase get global prepare timestamp gid %s, time " INT64_FORMAT, prepareGID, global_prepare_ts); + elog(LOG, "prepare phase get global prepare timestamp gid %s, time " + INT64_FORMAT, prepareGID, global_prepare_ts); } SetGlobalPrepareTimestamp(global_prepare_ts); - } #endif #ifdef __TWO_PHASE_TRANS__ @@ -4093,19 +4094,18 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT, GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif /* Send down prepare command */ @@ -4139,11 +4139,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT, GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4157,8 +4156,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif @@ -4297,19 +4296,18 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (conn->read_only) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send 
global prepare committs for " + "PREPARED command"))); } #endif /* Send down prepare command */ @@ -4340,11 +4338,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4358,8 +4355,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index def81c95..c1e3a31f 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -17,6 +17,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -58,7 +59,7 @@ typedef enum bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 60; -int auto_clean_2pc_delay = 300; +int auto_clean_2pc_delay = 60; int auto_clean_2pc_timeout = 1200; int auto_clean_2pc_max_check_time = 1200; @@ -88,6 +89,8 @@ static void start_clean_worker(int count); static void do_query_2pc(TimestampTz clean_time); static void do_clean_2pc(TimestampTz clean_time); +static bool check_pg_clean_extension(void); + static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); @@ -432,6 +435,12 @@ do_query_2pc(TimestampTz clean_time) Assert(result_str != NULL); resetStringInfo(result_str); + if (!check_pg_clean_extension()) + { + elog(WARNING, "create extension pg_clean please"); + return; + } + check_time = (curr_time - clean_time)/USECS_PER_SEC; if (check_time < 0) @@ -686,6 +695,40 @@ do_clean_2pc(TimestampTz clean_time) } } +/* + * check if pg_clean_check_txn funciton exist + */ +static bool +check_pg_clean_extension(void) +{ + bool res = false; + List *names = NULL; + FuncCandidateList clist = NULL; + char *fuc_name = "pg_clean_check_txn"; + + StartTransactionCommand(); + + /* + * Parse the name into components and see if it matches any pg_proc + * entries in the current search path. 
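+	 * A single unambiguous candidate means the extension that provides
+	 * pg_clean_check_txn is installed; zero or multiple candidates are
+	 * treated as the extension being unavailable.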
+ */ + names = list_make1(makeString(fuc_name)); + clist = FuncnameGetCandidates(names, -1, NIL, false, false, true); + + if (clist == NULL || clist->next != NULL) + { + res = false; + } + else + { + res = true; + } + + CommitTransactionCommand(); + + return res; +} + /* SIGTERM: set flag to exit normally */ static void clean_2pc_sigterm_handler(SIGNAL_ARGS) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..dc1d39ed 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4887,7 +4887,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 60, 10, INT_MAX, +#ifdef __TWO_PHASE_TESTS__ + 60, 0, INT_MAX, +#else + 60, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4898,7 +4902,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 300, 3, INT_MAX, +#ifdef __TWO_PHASE_TESTS__ + 60, 0, INT_MAX, +#else + 60, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4909,7 +4917,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, +#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, +#else + 1200, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4920,7 +4932,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, +#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, +#else + 1200, 30, INT_MAX, +#endif NULL, NULL, NULL }, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 06f9685e..132f19d8 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -81,6 +81,10 @@ #include "gtm/gtm_c.h" #define GIDSIZE (200 + 24) + +/* 2pc xlog v2 add prepare timestamp */ +#define XLOG_FMT_2PC_V2 "fmt_v2" + /* * GlobalTransactionData is defined in twophase.c; other places have no * business knowing the internal definition. From db0d112324d582dec1d5f1a464b3d38f7cd173d4 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 22 Apr 2022 14:16:30 +0800 Subject: [PATCH 544/578] support wal sender proxy on cn (merge request 1183), http://tapd.woa.com/20421696/prong/stories/view/1020421696872688189 --- src/backend/access/common/printtup.c | 2 +- src/backend/pgxc/pool/execRemote.c | 323 ++++++++++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 66 +++++ src/backend/postmaster/pgstat.c | 8 + src/backend/postmaster/postmaster.c | 39 +++ src/backend/replication/walsender.c | 3 + src/backend/tcop/postgres.c | 279 ++++++++++++++++++ src/backend/utils/misc/guc.c | 8 + src/backend/utils/misc/ps_status.c | 23 ++ src/include/pgstat.h | 1 + src/include/pgxc/execRemote.h | 3 + src/include/pgxc/pgxc.h | 2 + src/include/pgxc/pgxcnode.h | 8 + src/include/postgres.h | 419 ++++++++++++++------------- src/include/replication/walsender.h | 43 +-- src/include/utils/ps_status.h | 6 +- 16 files changed, 1004 insertions(+), 229 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index dfd64707..3c12980a 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -228,7 +228,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats) * Send the type name from a Postgres-XC backend node. * This preserves from OID inconsistencies as architecture is shared nothing. 
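+	 * When the coordinator connection is only the wal sender proxy relaying
+	 * a raw datanode stream, the translation is skipped (see IsConnFromProxy).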
*/ - if (IsConnFromCoord()) + if (IsConnFromCoord() && !IsConnFromProxy()) { char *typename; typename = get_typenamespace_typename(atttypid); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 2cedb37c..7ad4a8be 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -30,6 +30,7 @@ #include "executor/executor.h" #include "gtm/gtm_c.h" #include "libpq/libpq.h" +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgxc/execRemote.h" #include "tcop/tcopprot.h" @@ -156,6 +157,7 @@ static void pgxc_connections_cleanup(ResponseCombiner *combiner); static bool determine_param_types(Plan *plan, struct find_params_context *context); +static int handle_reply_msg_on_proxy(PGXCNodeHandle *conn); #define REMOVE_CURR_CONN(combiner) \ if ((combiner)->current_conn < --((combiner)->conn_count)) \ @@ -3026,7 +3028,17 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, while (i < count) { int32 nbytes = 0; - int result = handle_response(to_receive[i], combiner); + int result = 0; + + if (am_proxy_for_dn) + { + result = handle_response_on_proxy(to_receive[i], combiner); + } + else + { + result = handle_response(to_receive[i], combiner); + } + #ifdef __TBASE__ #ifdef _PG_REGRESS_ elog(LOG, "Received response %d on connection to node %s", @@ -13090,4 +13102,313 @@ SetSnapshot(EState *state) return result; } + +/* + * Reveive dn message on proxy. + * Forward the dn message to client and forward the client reply message to dn. + */ +int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle) +{ + int result = 0; + ResponseCombiner combiner; + + struct timeval timeout; + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + MemSet(&combiner, 0, sizeof(ResponseCombiner)); + + InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); + + /* Receive responses */ + result = pgxc_node_receive_responses(1, &handle, &timeout, &combiner); + if (result != 0) + { + elog(LOG, "Proxy receive responses result is %d", result); + return result; + } + + CloseCombiner(&combiner); + return result; +} + +/* + * Handle reply message on proxy. + * Forward the client reply message to dn. + */ +int handle_reply_msg_on_proxy(PGXCNodeHandle *conn) +{ + int ret = 0; + unsigned char firstchar; + StringInfoData msg; + + Assert(IS_PGXC_COORDINATOR); + + initStringInfo(&msg); + + for (;;) + { + pq_startmsgread(); + ret = pq_getbyte_if_available(&firstchar); + if (ret < 0) + { + /* Unexpected error or EOF */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); + } + + if (ret == 0) + { + /* No data available without blocking */ + pq_endmsgread(); + break; + } + + /* Read the message contents */ + if (pq_getmessage(&msg, 0)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); + } + + elog(DEBUG2, "%s proxy firstchar is %c(%d), reply message length: %d", + proxy_for_dn, firstchar, firstchar, msg.len); + + ret = pgxc_node_send_on_proxy(conn, firstchar, &msg); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s proxy send reply message error: %d", + proxy_for_dn, ret))); + } + + /* Handle the very limited subset of commands expected in this phase */ + switch (firstchar) + { + /* + * 'd' means a client reply message. + */ + case 'd': + break; + + /* + * 'c' means the client requested to finish streaming. 
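+             * Once the stream is marked closed, the proxy keeps draining the
+             * datanode side but stops forwarding further 'd' (copy data) messages.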
+ */ + case 'c': + elog(LOG, "%s proxy: reply message type %c(%d), " + "the client requested to finish streaming", + proxy_for_dn, firstchar, firstchar); + + /* When replicate stream is closed, set stream_closed to true */ + conn->stream_closed = true; + + break; + + /* + * 'X' means the client is closing down the socket. + */ + case 'X': + elog(LOG, "%s proxy: reply message type %c(%d), " + "the client is closing down the socket", + proxy_for_dn, firstchar, firstchar); + + proc_exit(0); + + default: + elog(FATAL, "%s proxy: unexpected message type %c(%d), length: %d", + proxy_for_dn, firstchar, firstchar, msg.len); + break; + } + } + + return ret; +} + +/* + * Read next message from the connection and update + * connection state accordingly on the proxy + * If we are in an error state we just consume the messages, and do not proxy + * Long term, we should look into cancelling executing statements + * and closing the connections. + * It returns if states need to be handled + * Return values: + * RESPONSE_EOF - need to receive more data for the connection + * RESPONSE_READY - got ReadyForQuery + * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. + * Also this result is output in case of error + * RESPONSE_TUPLEDESC - got tuple description + * RESPONSE_DATAROW - got data row + */ +int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner) +{ + char *msg; + int msg_len; + char msg_type; + int ret = 0; + StringInfoData buf; + + /* proxy must be cn */ + Assert(IS_PGXC_COORDINATOR); + + /* proxy must be not in extended query */ + Assert(!conn->in_extended_query); + Assert(!combiner->extended_query); + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + { + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); + } + + /* + * Don't read from from the connection if there is a fatal error. + * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since + * Handling of RESPONSE_ERROR assumes sending SYNC message, but + * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is + * not usable. 
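+	 * Returning RESPONSE_COMPLETE lets the caller stop reading from this
+	 * connection without attempting to resynchronize it.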
+ */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + return RESPONSE_COMPLETE; + } + + ret = handle_reply_msg_on_proxy(conn); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Handle reply message on proxy for %s error: %d", + proxy_for_dn, ret))); + } + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return RESPONSE_EOF; + + Assert(conn->combiner == combiner || conn->combiner == NULL); + + msg_type = get_message(conn, &msg_len, &msg); + elog(DEBUG1, "handle_response_on_proxy - received message %c, node %s, " + "current_state %d", msg_type, conn->nodename, conn->state); + + /* + * Add some protection code when receiving a messy message, + * close the connection, and throw error + */ + if (msg_len < 0) + { + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); + + elog(LOG, "handle_response_on_proxy, fatal_conn=%p, " + "fatal_conn->nodename=%s, fatal_conn->sock=%d, " + "fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, " + "fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, " + "fatal_conn->error=%s", conn, conn->nodename, conn->sock, + conn->read_only, conn->transaction_status, + conn->sock_fatal_occurred, conn->backend_pid, conn->error); + + closesocket(conn->sock); + conn->sock = NO_SOCKET; + conn->sock_fatal_occurred = true; + + elog(LOG, "Received messy message from node:%s host:%s port:%d pid:%d, " + "inBuffer:%p inSize:%lu inStart:%lu inEnd:%lu inCursor:%lu " + "msg_len:%d, This probably means the remote node terminated " + "abnormally before or while processing the request.", + conn->nodename, conn->nodehost, conn->nodeport, conn->backend_pid, + conn->inBuffer, conn->inSize, conn->inStart, conn->inEnd, + conn->inCursor, msg_len); + + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy: handle_response_on_proxy - received message " + "length %d, type %c, node %s, current_state %d", + msg_len, msg_type, conn->nodename, conn->state))); + } + + if (msg_type == '\0') + { + /* Not enough data in the buffer */ + return RESPONSE_EOF; + } + + if (conn->stream_closed && msg_type == 'd') + { + /* When replicate stream is closed, skip 'd' message */ + elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " + "type %c, length %d, node %s, current_state %d, remote pid %d, skip", + msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); + continue;; + } + + conn->last_command = msg_type; + + elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " + "type %c, length %d, node %s, current_state %d, remote pid %d", + msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); + + /* Send message to client */ + pq_beginmessage(&buf, msg_type); + pq_sendbytes(&buf, msg, msg_len); + pq_endmessage(&buf); + pq_flush(); + + switch (msg_type) + { + case 'c': /* CopyToCommandComplete */ + break; + + case 'C': /* CommandComplete */ + conn->combiner = NULL; + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); + return RESPONSE_COMPLETE; + + case 'E': /* ErrorResponse */ + HandleError(combiner, msg, msg_len, conn); + add_error_message_from_combiner(conn, combiner); + + combiner->errorNode = conn->nodename; + combiner->backend_pid = conn->backend_pid; + return RESPONSE_ERROR; + + case 'Z': /* ReadyForQuery */ + conn->transaction_status = msg[0]; + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); + conn->combiner = NULL; + return RESPONSE_READY; + + case 'T': /* RowDescription */ + return RESPONSE_TUPDESC; + + case 'D': /* DataRow */ + return 
RESPONSE_DATAROW; + + case 'd': /* CopyOutDataRow */ + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT); + break; + + case 'W': /* CopyBothResponse */ + /* Get a CopyBothResponse message when start streaming */ + break; + + default: + elog(DEBUG1, "Proxy received message type: %c", msg_type); + break; + } + } + + /* Never happen, but keep compiler quiet */ + return RESPONSE_EOF; +} + #endif diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c19325a9..84259600 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2595,6 +2595,72 @@ pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_p } #endif +/* + * Send message to dn + */ +int +pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, StringInfo inBuf) +{ + /* size + len */ + int msgLen = 4 + inBuf->len; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + /* msg type */ + handle->outBuffer[handle->outEnd++] = firstchar; + + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + /* msg data */ + memcpy(handle->outBuffer + handle->outEnd, inBuf->data, inBuf->len); + handle->outEnd += inBuf->len; + + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + handle->in_extended_query = false; + + return pgxc_node_flush(handle); +} + +/* + * Send proxy configuration to dn + */ +int +pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag) +{ + /* size + flag */ + int msgLen = 4 + sizeof(int); + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + /* msg type */ + handle->outBuffer[handle->outEnd++] = 'w'; + + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + /* flag */ + flag = htonl(flag); + memcpy(handle->outBuffer + handle->outEnd, &flag, sizeof(int)); + handle->outEnd += sizeof(int); + + return pgxc_node_flush(handle); +} + /* * Send series of Extended Query protocol messages to the data node */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 1286cd1d..cfeea974 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -2913,6 +2913,11 @@ pgstat_bestart(void) /* Clean 2pc Worker */ beentry->st_backendType = B_CLEAN_2PC_WORKER; } + else if (am_proxy_for_dn) + { + /* Proxy for dn */ + beentry->st_backendType = B_PROXY_FOR_DN; + } else if (am_walsender) { /* Wal sender */ @@ -4208,6 +4213,9 @@ pgstat_get_backend_desc(BackendType backendType) case B_CLEAN_2PC_WORKER: backendDesc = "2pc clean worker"; break; + case B_PROXY_FOR_DN: + backendDesc = "proxy for dn"; + break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 10be77cd..7d6d230b 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -162,6 +162,8 @@ #include "audit/audit_fga.h" #endif +#define PS_DISPLAY_MAX_LENGTH 256 /* process display max length */ + /* * Possible types of a backend. 
Beyond being the possible bkend_type values in * struct bkend, these are OR-able request flag bits for SignalSomeChildren() @@ -2387,6 +2389,20 @@ ProcessStartupPacket(Port *port, bool SSLdone) valptr), errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); } + else if (strcmp(nameptr, "proxy_for_dn") == 0) + { + if (!IS_PGXC_COORDINATOR) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("parameter \"%s\" only support on cn", nameptr))); + } + + elog(LOG, "Proxy for dn: %s", valptr); + + am_proxy_for_dn = true; + proxy_for_dn = pstrdup(valptr); + } else { /* Assume it's a generic GUC option */ @@ -4940,12 +4956,35 @@ BackendInitialize(Port *port) * as dbname to init_ps_display(). XXX: should add a new variant of * init_ps_display() to avoid abusing the parameters like this. */ + if (am_proxy_for_dn) + { + char proxy_display[PS_DISPLAY_MAX_LENGTH]; if (am_walsender) + { + snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, + "wal sender proxy for %s", proxy_for_dn); + } + else + { + snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, + "proxy for %s", proxy_for_dn); + } + init_ps_display(proxy_display, port->user_name, remote_ps_data, + update_process_title ? "authentication" : ""); + } + else + { + if (am_walsender) + { init_ps_display("wal sender process", port->user_name, remote_ps_data, update_process_title ? "authentication" : ""); + } else + { init_ps_display(port->user_name, port->database_name, remote_ps_data, update_process_title ? "authentication" : ""); + } + } /* * Disable the timeout, and prevent SIGTERM/SIGQUIT again. diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4b46d9c8..464cfcd9 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3327,7 +3327,10 @@ WalSndSignals(void) pqsignal(SIGINT, StatementCancelHandler); /* query cancel */ pqsignal(SIGTERM, die); /* request shutdown */ pqsignal(SIGQUIT, quickdie); /* hard crash time */ + if (!IsConnFromProxy()) + { InitializeTimeouts(); /* establishes SIGALRM handler */ + } pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 37819150..b075a7e8 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -124,6 +124,10 @@ #include "replication/worker_internal.h" #endif +char *proxy_for_dn = NULL; /* Proxy for which dn? */ +bool am_proxy_for_dn = false; /* Am I a proxy for dn? */ +bool am_conn_from_proxy = false; /* Am I connected from proxy? */ + extern int optind; /* ---------------- @@ -250,6 +254,13 @@ static void replace_null_with_blank(char *src, int length); static bool NeedResourceOwner(const char *stmt_name); #endif +static PGXCNodeHandle * +get_handle_on_proxy(void); +static PGXCNodeHandle * +handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg); +void +set_flag_from_proxy(int flag, const char *username); + #ifdef __COLD_HOT__ /* * Release memory alloc in TopMemoryContext and only used in single Session. 
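Note on the proxy handshake introduced by this patch: pgxc_node_send_proxy_flag() (above) frames a single frontend-protocol 'w' message whose payload is one int32 of flag bits, and the 'w' case added to PostgresMain() further below reads that int32 and calls set_flag_from_proxy(). The following is a minimal encoding sketch of that framing, for reference only; it is not part of the patch, the helper name is invented, and it assumes the caller provides a buffer of at least nine bytes.

    /*
     * Sketch of the 'w' (proxy flag) message framing used by
     * pgxc_node_send_proxy_flag() and decoded by the new 'w' case in
     * PostgresMain().  Wire layout:
     *
     *   byte  1   message type, always 'w'
     *   int32 4   length in network byte order, counting itself: 4 + 4 = 8
     *   int32 4   flag bits (FLAG_AM_WALSENDER | FLAG_AM_DB_WALSENDER),
     *             network byte order
     */
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    static size_t
    encode_proxy_flag_msg(char *buf, int32_t flag)
    {
        int32_t len = htonl(4 + (int32_t) sizeof(int32_t)); /* self + payload */
        int32_t val = htonl(flag);

        buf[0] = 'w';
        memcpy(buf + 1, &len, sizeof(len));
        memcpy(buf + 1 + sizeof(len), &val, sizeof(val));
        return 1 + sizeof(len) + sizeof(val);   /* 9 bytes total */
    }
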
@@ -654,6 +665,7 @@ SocketBackend(StringInfo inBuf) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; + case 'w': /* Set connected by proxy */ #ifdef PGXC /* PGXC_DATANODE */ #ifdef __TBASE__ case 'N': @@ -4802,6 +4814,8 @@ PostgresMain(int argc, char *argv[], volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; + PGXCNodeHandle *proxy_conn = NULL; + #ifdef PGXC /* PGXC_DATANODE */ /* Snapshot info */ TransactionId xmin PG_USED_FOR_ASSERTS_ONLY; @@ -5513,6 +5527,12 @@ PostgresMain(int argc, char *argv[], } #endif /* XCP */ + if (am_proxy_for_dn) + { + proxy_conn = handle_request_msg_on_proxy(proxy_conn, firstchar, &input_message); + continue; + } + switch (firstchar) { case 'Q': /* simple query */ @@ -6127,6 +6147,18 @@ PostgresMain(int argc, char *argv[], } break; #endif + case 'w': /* Set connected by proxy */ + { + int flag = 0; + + Assert(input_message.len == 4); + + flag = pq_getmsgint(&input_message, 4); + pq_getmsgend(&input_message); + + set_flag_from_proxy(flag, username); + } + break; default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -6403,4 +6435,251 @@ IsExtendedQuery(void) { return doing_extended_query_message; } + +/* + * Get a dn connection on proxy + */ +PGXCNodeHandle * +get_handle_on_proxy(void) +{ + PGXCNodeHandle *conn = NULL; + char node_type = PGXC_NODE_DATANODE; + Oid node_oid = InvalidOid; + int node_id = -1; + int flag = 0; + PGXCNodeAllHandles *handles = NULL; + List *dnList = NIL; + int ret = 0; + + Assert(IS_PGXC_COORDINATOR); + + /* Get dn oid */ + StartTransactionCommand(); + InitMultinodeExecutor(false); + node_oid = get_pgxc_nodeoid(proxy_for_dn); + CommitTransactionCommand(); + + if (node_oid == InvalidOid) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Unknow dn: %s, oid is invalid", proxy_for_dn))); + } + + /* Get dn id */ + node_id = PGXCNodeGetNodeId(node_oid, &node_type); + if (node_id == -1) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Unknow dn: %s, oid: %d, id: -1", proxy_for_dn, node_oid))); + } + + elog(LOG, "Proxy for dn %s, node oid %d, node id %d", + proxy_for_dn, node_oid, node_id); + + /* Get dn connection */ + dnList = lappend_int(dnList, node_id); + Assert(list_length(dnList) == 1); + handles = get_handles(dnList, NIL, false, false, true); + if (handles == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Get connections failed for %s", proxy_for_dn))); + + } + if (handles->dn_conn_count == 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Get 0 connection for %s", proxy_for_dn))); + } + + Assert(handles->co_conn_count == 0); + Assert(handles->dn_conn_count == 1); + + conn = handles->datanode_handles[0]; + Assert(conn != NULL); + + pfree_pgxc_all_handles(handles); + handles = NULL; + + /* Set dn process */ + if (am_walsender) + { + flag |= FLAG_AM_WALSENDER; + if (am_db_walsender) + { + flag |= FLAG_AM_DB_WALSENDER; + } + } + ret = pgxc_node_send_proxy_flag(conn, flag); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy send flag to %s error: %d", proxy_for_dn, ret))); + } + + return conn; +} + +/* + * Forward client request command to dn and receive response + */ +PGXCNodeHandle * +handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg) +{ + int ret = 0; + + Assert(IS_PGXC_COORDINATOR); + + if (conn == NULL) + { + conn = get_handle_on_proxy(); + } + + Assert(conn != 
NULL); + + /* Before query, replicate stream is not closed, set stream_closed to false */ + conn->stream_closed = false; + + if (firstchar == 'Q') + { + const char *query_string = pq_getmsgstring(input_msg); + pq_getmsgend(input_msg); + debug_query_string = query_string; + } + + elog(DEBUG1, "Proxy: firstchar is %c(%d)", firstchar, firstchar); + + /* Send message */ + ret = pgxc_node_send_on_proxy(conn, firstchar, input_msg); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy send request to %s error: %d", proxy_for_dn, ret))); + } + + switch (firstchar) + { + /* + * 'X' means that the frontend is closing down the socket. EOF + * means unexpected loss of frontend connection. Either way, + * perform normal shutdown. + */ + case 'X': + case EOF: + /* + * Reset whereToSendOutput to prevent ereport from attempting + * to send any more messages to client. + */ + if (whereToSendOutput == DestRemote) + { + elog(LOG, "Set whereToSendOutput from %d to %d", + whereToSendOutput, DestNone); + whereToSendOutput = DestNone; + } + + /* Destroy the dn connection on proxy */ + PoolManagerDisconnect(); + + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + break; + } + + /* Receive message */ + ret = pgxc_node_receive_on_proxy(conn); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy receive from %s error: %d", proxy_for_dn, ret))); + } + + debug_query_string = NULL; + + return conn; +} + +/* + * Set flag from proxy + */ +void +set_flag_from_proxy(int flag, const char *username) +{ + if (am_conn_from_proxy) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("It is connected from proxy already"))); + } + + am_conn_from_proxy = true; + + elog(LOG, "It is connected from proxy"); + + if (am_walsender) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("It is a wal sender already"))); + } + + if (flag & FLAG_AM_WALSENDER) + { + am_walsender = true; + if (flag & FLAG_AM_DB_WALSENDER) + { + am_db_walsender = true; + } + } + + elog(LOG, "Set wal sender: am_walsender(%d), am_db_walsender(%d)", + am_walsender, am_db_walsender); + + if (am_walsender) + { + int fixed_len = 0; + const char *fixed = get_ps_display_fixed(&fixed_len); + char fixed_buf[fixed_len + 1]; + char *display = NULL; + + if (fixed_len != 0) + { + Assert (fixed != NULL); + + snprintf(fixed_buf, fixed_len, "%s", fixed); + fixed_buf[fixed_len] = '\0'; + + display = strstr(fixed_buf, username); + Assert (display != NULL); + + init_ps_display("wal sender used by proxy", display, "", ""); + } + else + { + elog(WARNING, "Get ps display fixed length is 0"); + + init_ps_display("wal sender used by proxy", "", "", ""); + } + + IsNormalPostgres = false; + + WalSndSignals(); + InitWalSender(); + } +} + #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dc1d39ed..3f2e046a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -7321,6 +7321,14 @@ ResetAllOptions(void) {// #lizard forgives int i; + if (am_walsender) + { + /* never be here */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("RESET ALL is forbidden on wal sender"))); + } + for (i = 0; i < num_guc_variables; i++) { struct config_generic *gconf = guc_variables[i]; diff --git 
a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index 06f6c857..51f668d1 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -417,3 +417,26 @@ get_ps_display(int *displen) return ps_buffer + ps_buffer_fixed_size; } + +/* + * Returns the fixed part in the ps display, in case someone needs + * it. Note that only the fixed part is returned. + * The string will not be null-terminated, so return the effective + * length into *fixlen. + */ +const char * +get_ps_display_fixed(int *fixlen) +{ +#ifdef PS_USE_CLOBBER_ARGV + /* If ps_buffer is a pointer, it might still be null */ + if (!ps_buffer) + { + *fixlen = 0; + return ""; + } +#endif + + *fixlen = (int) ps_buffer_fixed_size; + + return ps_buffer; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 7976c39b..6c4c5886 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -722,6 +722,7 @@ typedef enum BackendType B_PGXL_POOLER, B_CLEAN_2PC_LAUNCHER, B_CLEAN_2PC_WORKER, + B_PROXY_FOR_DN, } BackendType; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 7047d510..baa30f65 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -384,6 +384,7 @@ extern void ExecRemoteUtility(RemoteQuery *node); extern bool is_data_node_ready(PGXCNodeHandle * conn); extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner); +extern int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner); extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); @@ -476,6 +477,8 @@ extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** c extern bool validate_combiner(ResponseCombiner *combiner); #endif +extern int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle); + #ifdef __TWO_PHASE_TRANS__ extern char *get_nodelist(char * prepareGID, bool localNode, bool implicit); extern void InitLocalTwoPhaseState(void); diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 687be6c8..370882dd 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -134,6 +134,8 @@ extern Datum xc_lockForBackupKey2; #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) +#define IsConnFromProxy() (am_conn_from_proxy) + /* key pair to be used as object id while using advisory lock for backup */ #define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF #define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f0e7c269..d69aa7f1 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -123,6 +123,9 @@ struct pgxc_node_handle bool in_extended_query; bool needSync; /* set when error and extend query. */ + + bool stream_closed; /* Whether replicate stream is closed on proxy? */ + #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ char last_command; /*last command we processed. 
*/ @@ -217,6 +220,11 @@ extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif + +extern int pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag); +extern int pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, + StringInfo inBuf); + #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif diff --git a/src/include/postgres.h b/src/include/postgres.h index 2074389f..c35967b0 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * postgres.h - * Primary include file for PostgreSQL server .c files + * Primary include file for PostgreSQL server .c files * * This should be the first file included by PostgreSQL backend modules. * Client-side code should include postgres_fe.h instead. @@ -17,28 +17,28 @@ */ /* *---------------------------------------------------------------- - * TABLE OF CONTENTS + * TABLE OF CONTENTS * - * When adding stuff to this file, please try to put stuff - * into the relevant section, or add new sections as appropriate. + * When adding stuff to this file, please try to put stuff + * into the relevant section, or add new sections as appropriate. * - * section description - * ------- ------------------------------------------------ - * 1) variable-length datatypes (TOAST support) - * 2) datum type + support macros - * 3) exception handling backend support + * section description + * ------- ------------------------------------------------ + * 1) variable-length datatypes (TOAST support) + * 2) datum type + support macros + * 3) exception handling backend support * - * NOTES + * NOTES * - * In general, this file should contain declarations that are widely needed - * in the backend environment, but are of no interest outside the backend. + * In general, this file should contain declarations that are widely needed + * in the backend environment, but are of no interest outside the backend. * - * Simple type definitions live in c.h, where they are shared with - * postgres_fe.h. We do that since those type definitions are needed by - * frontend modules that want to deal with binary data transmission to or - * from the backend. Type definitions in this file should be for - * representations that never escape the backend, such as Datum or - * TOASTed varlena objects. + * Simple type definitions live in c.h, where they are shared with + * postgres_fe.h. We do that since those type definitions are needed by + * frontend modules that want to deal with binary data transmission to or + * from the backend. Type definitions in this file should be for + * representations that never escape the backend, such as Datum or + * TOASTed varlena objects. * *---------------------------------------------------------------- */ @@ -71,8 +71,13 @@ #define EXTENT_FIRST_BLOCKNUMBER(eid) ((eid)*PAGES_PER_EXTENTS) #endif +extern char *proxy_for_dn; /* Proxy for which dn? */ +extern bool am_proxy_for_dn; /* Am I a proxy for dn? */ +extern bool am_conn_from_proxy; /* Am I connected from proxy? 
*/ + + /* ---------------------------------------------------------------- - * Section 1: variable-length datatypes (TOAST support) + * Section 1: variable-length datatypes (TOAST support) * ---------------------------------------------------------------- */ @@ -90,11 +95,11 @@ */ typedef struct varatt_external { - int32 va_rawsize; /* Original data size (includes header) */ - int32 va_extsize; /* External saved size (doesn't) */ - Oid va_valueid; /* Unique ID of value within TOAST table */ - Oid va_toastrelid; /* RelID of TOAST table containing it */ -} varatt_external; + int32 va_rawsize; /* Original data size (includes header) */ + int32 va_extsize; /* External saved size (doesn't) */ + Oid va_valueid; /* Unique ID of value within TOAST table */ + Oid va_toastrelid; /* RelID of TOAST table containing it */ +} varatt_external; /* * struct varatt_indirect is a "TOAST pointer" representing an out-of-line @@ -107,8 +112,8 @@ typedef struct varatt_external */ typedef struct varatt_indirect { - struct varlena *pointer; /* Pointer to in-memory varlena */ -} varatt_indirect; + struct varlena *pointer; /* Pointer to in-memory varlena */ +} varatt_indirect; /* * struct varatt_expanded is a "TOAST pointer" representing an out-of-line @@ -124,7 +129,7 @@ typedef struct ExpandedObjectHeader ExpandedObjectHeader; typedef struct varatt_expanded { - ExpandedObjectHeader *eohptr; + ExpandedObjectHeader *eohptr; } varatt_expanded; /* @@ -134,21 +139,21 @@ typedef struct varatt_expanded */ typedef enum vartag_external { - VARTAG_INDIRECT = 1, - VARTAG_EXPANDED_RO = 2, - VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_INDIRECT = 1, + VARTAG_EXPANDED_RO = 2, + VARTAG_EXPANDED_RW = 3, + VARTAG_ONDISK = 18 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ - (((tag) & ~1) == VARTAG_EXPANDED_RO) + (((tag) & ~1) == VARTAG_EXPANDED_RO) #define VARTAG_SIZE(tag) \ - ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ - VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ - (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ - TrapMacro(true, "unrecognized TOAST vartag")) + ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ + VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ + (tag) == VARTAG_ONDISK ? 
sizeof(varatt_external) : \ + TrapMacro(true, "unrecognized TOAST vartag")) /* * These structs describe the header of a varlena object that may have been @@ -161,31 +166,31 @@ typedef enum vartag_external */ typedef union { - struct /* Normal varlena (4-byte length) */ - { - uint32 va_header; - char va_data[FLEXIBLE_ARRAY_MEMBER]; - } va_4byte; - struct /* Compressed-in-line format */ - { - uint32 va_header; - uint32 va_rawsize; /* Original data size (excludes header) */ - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ - } va_compressed; + struct /* Normal varlena (4-byte length) */ + { + uint32 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; + } va_4byte; + struct /* Compressed-in-line format */ + { + uint32 va_header; + uint32 va_rawsize; /* Original data size (excludes header) */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ + } va_compressed; } varattrib_4b; typedef struct { - uint8 va_header; - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Data begins here */ + uint8 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Data begins here */ } varattrib_1b; /* TOAST pointers are a subset of varattrib_1b with an identifying tag byte */ typedef struct { - uint8 va_header; /* Always 0x80 or 0x01 */ - uint8 va_tag; /* Type of datum */ - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Type-specific data */ + uint8 va_header; /* Always 0x80 or 0x01 */ + uint8 va_tag; /* Type of datum */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Type-specific data */ } varattrib_1b_e; /* @@ -226,86 +231,86 @@ typedef struct #ifdef WORDS_BIGENDIAN #define VARATT_IS_4B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x00) #define VARATT_IS_4B_U(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x00) #define VARATT_IS_4B_C(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x40) + ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x40) #define VARATT_IS_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x80) + ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x80) #define VARATT_IS_1B_E(PTR) \ - ((((varattrib_1b *) (PTR))->va_header) == 0x80) + ((((varattrib_1b *) (PTR))->va_header) == 0x80) #define VARATT_NOT_PAD_BYTE(PTR) \ - (*((uint8 *) (PTR)) != 0) + (*((uint8 *) (PTR)) != 0) /* VARSIZE_4B() should only be used on known-aligned data */ #define VARSIZE_4B(PTR) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF) + (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF) #define VARSIZE_1B(PTR) \ - (((varattrib_1b *) (PTR))->va_header & 0x7F) + (((varattrib_1b *) (PTR))->va_header & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + (((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF) #define SET_VARSIZE_4B_C(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000) + (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000) #define SET_VARSIZE_1B(PTR,len) \ - (((varattrib_1b *) (PTR))->va_header = (len) | 0x80) + (((varattrib_1b *) (PTR))->va_header = (len) | 0x80) #define SET_VARTAG_1B_E(PTR,tag) \ - (((varattrib_1b_e *) (PTR))->va_header = 0x80, \ - ((varattrib_1b_e *) (PTR))->va_tag = (tag)) -#else /* !WORDS_BIGENDIAN */ + (((varattrib_1b_e *) (PTR))->va_header = 0x80, \ + ((varattrib_1b_e *) 
(PTR))->va_tag = (tag)) +#else /* !WORDS_BIGENDIAN */ #define VARATT_IS_4B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00) #define VARATT_IS_4B_U(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00) #define VARATT_IS_4B_C(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02) + ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02) #define VARATT_IS_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01) + ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01) #define VARATT_IS_1B_E(PTR) \ - ((((varattrib_1b *) (PTR))->va_header) == 0x01) + ((((varattrib_1b *) (PTR))->va_header) == 0x01) #define VARATT_NOT_PAD_BYTE(PTR) \ - (*((uint8 *) (PTR)) != 0) + (*((uint8 *) (PTR)) != 0) /* VARSIZE_4B() should only be used on known-aligned data */ #define VARSIZE_4B(PTR) \ - ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF) + ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF) #define VARSIZE_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F) + ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + (((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2)) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2)) #define SET_VARSIZE_4B_C(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02) #define SET_VARSIZE_1B(PTR,len) \ - (((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01) + (((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01) #define SET_VARTAG_1B_E(PTR,tag) \ - (((varattrib_1b_e *) (PTR))->va_header = 0x01, \ - ((varattrib_1b_e *) (PTR))->va_tag = (tag)) -#endif /* WORDS_BIGENDIAN */ + (((varattrib_1b_e *) (PTR))->va_header = 0x01, \ + ((varattrib_1b_e *) (PTR))->va_tag = (tag)) +#endif /* WORDS_BIGENDIAN */ -#define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) -#define VARATT_SHORT_MAX 0x7F +#define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) +#define VARATT_SHORT_MAX 0x7F #define VARATT_CAN_MAKE_SHORT(PTR) \ - (VARATT_IS_4B_U(PTR) && \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) + (VARATT_IS_4B_U(PTR) && \ + (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) #define VARATT_CONVERTED_SHORT_SIZE(PTR) \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) + (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) -#define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) +#define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) -#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data) -#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data) -#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data) -#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data) +#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data) +#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data) +#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data) +#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data) #define VARRAWSIZE_4B_C(PTR) \ - (((varattrib_4b *) (PTR))->va_compressed.va_rawsize) + (((varattrib_4b *) (PTR))->va_compressed.va_rawsize) /* Externally visible macros */ @@ -323,66 +328,66 
@@ typedef struct * Other macros here should usually be used only by tuple assembly/disassembly * code and code that specifically wants to work with still-toasted Datums. */ -#define VARDATA(PTR) VARDATA_4B(PTR) -#define VARSIZE(PTR) VARSIZE_4B(PTR) +#define VARDATA(PTR) VARDATA_4B(PTR) +#define VARSIZE(PTR) VARSIZE_4B(PTR) -#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) -#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) +#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) +#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) -#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) -#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) +#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) -#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) -#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) +#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) +#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) #define VARATT_IS_EXTERNAL_EXPANDED_RO(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO) #define VARATT_IS_EXTERNAL_EXPANDED_RW(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW) #define VARATT_IS_EXTERNAL_EXPANDED(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) -#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) -#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) +#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) -#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) -#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) -#define SET_VARSIZE_COMPRESSED(PTR, len) SET_VARSIZE_4B_C(PTR, len) +#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) +#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) +#define SET_VARSIZE_COMPRESSED(PTR, len) SET_VARSIZE_4B_C(PTR, len) -#define SET_VARTAG_EXTERNAL(PTR, tag) SET_VARTAG_1B_E(PTR, tag) +#define SET_VARTAG_EXTERNAL(PTR, tag) SET_VARTAG_1B_E(PTR, tag) #define VARSIZE_ANY(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : \ - VARSIZE_4B(PTR))) + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : \ + (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : \ + VARSIZE_4B(PTR))) /* Size of a varlena data, excluding header */ #define VARSIZE_ANY_EXHDR(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \ - VARSIZE_4B(PTR)-VARHDRSZ)) + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \ + (VARATT_IS_1B(PTR) ? 
VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \ + VARSIZE_4B(PTR)-VARHDRSZ)) /* caution: this will not work on an external or compressed-in-line Datum */ /* caution: this will return a possibly unaligned pointer */ #define VARDATA_ANY(PTR) \ - (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR)) + (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR)) /* ---------------------------------------------------------------- - * Section 2: datum type + support macros + * Section 2: datum type + support macros * ---------------------------------------------------------------- */ /* * Port Notes: - * Postgres makes the following assumptions about datatype sizes: + * Postgres makes the following assumptions about datatype sizes: * - * sizeof(Datum) == sizeof(void *) == 4 or 8 - * sizeof(char) == 1 - * sizeof(short) == 2 + * sizeof(Datum) == sizeof(void *) == 4 or 8 + * sizeof(char) == 1 + * sizeof(short) == 2 * * When a type narrower than Datum is stored in a Datum, we place it in the * low-order bits and are careful that the DatumGetXXX macro for it discards @@ -398,26 +403,26 @@ typedef uintptr_t Datum; typedef Datum *DatumPtr; -#define GET_1_BYTE(datum) (((Datum) (datum)) & 0x000000ff) -#define GET_2_BYTES(datum) (((Datum) (datum)) & 0x0000ffff) -#define GET_4_BYTES(datum) (((Datum) (datum)) & 0xffffffff) +#define GET_1_BYTE(datum) (((Datum) (datum)) & 0x000000ff) +#define GET_2_BYTES(datum) (((Datum) (datum)) & 0x0000ffff) +#define GET_4_BYTES(datum) (((Datum) (datum)) & 0xffffffff) #if SIZEOF_DATUM == 8 -#define GET_8_BYTES(datum) ((Datum) (datum)) +#define GET_8_BYTES(datum) ((Datum) (datum)) #endif -#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ff) -#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffff) -#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffff) +#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ff) +#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffff) +#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffff) #if SIZEOF_DATUM == 8 -#define SET_8_BYTES(value) ((Datum) (value)) +#define SET_8_BYTES(value) ((Datum) (value)) #endif #ifdef XCP -#define CONTROL_INTERVAL 50000 +#define CONTROL_INTERVAL 50000 #endif /* * DatumGetBool - * Returns boolean value of a datum. + * Returns boolean value of a datum. * * Note: any nonzero value will be considered TRUE, but we ignore bits to * the left of the width of bool, per comment above. @@ -427,7 +432,7 @@ typedef Datum *DatumPtr; /* * BoolGetDatum - * Returns datum representation for a boolean. + * Returns datum representation for a boolean. * * Note: any nonzero value will be considered TRUE. */ @@ -436,161 +441,161 @@ typedef Datum *DatumPtr; /* * DatumGetChar - * Returns character value of a datum. + * Returns character value of a datum. */ #define DatumGetChar(X) ((char) GET_1_BYTE(X)) /* * CharGetDatum - * Returns datum representation for a character. + * Returns datum representation for a character. */ #define CharGetDatum(X) ((Datum) SET_1_BYTE(X)) /* * Int8GetDatum - * Returns datum representation for an 8-bit integer. + * Returns datum representation for an 8-bit integer. */ #define Int8GetDatum(X) ((Datum) SET_1_BYTE(X)) /* * DatumGetUInt8 - * Returns 8-bit unsigned integer value of a datum. + * Returns 8-bit unsigned integer value of a datum. */ #define DatumGetUInt8(X) ((uint8) GET_1_BYTE(X)) /* * UInt8GetDatum - * Returns datum representation for an 8-bit unsigned integer. + * Returns datum representation for an 8-bit unsigned integer. 
*/ #define UInt8GetDatum(X) ((Datum) SET_1_BYTE(X)) /* * DatumGetInt16 - * Returns 16-bit integer value of a datum. + * Returns 16-bit integer value of a datum. */ #define DatumGetInt16(X) ((int16) GET_2_BYTES(X)) /* * Int16GetDatum - * Returns datum representation for a 16-bit integer. + * Returns datum representation for a 16-bit integer. */ #define Int16GetDatum(X) ((Datum) SET_2_BYTES(X)) /* * DatumGetUInt16 - * Returns 16-bit unsigned integer value of a datum. + * Returns 16-bit unsigned integer value of a datum. */ #define DatumGetUInt16(X) ((uint16) GET_2_BYTES(X)) /* * UInt16GetDatum - * Returns datum representation for a 16-bit unsigned integer. + * Returns datum representation for a 16-bit unsigned integer. */ #define UInt16GetDatum(X) ((Datum) SET_2_BYTES(X)) /* * DatumGetInt32 - * Returns 32-bit integer value of a datum. + * Returns 32-bit integer value of a datum. */ #define DatumGetInt32(X) ((int32) GET_4_BYTES(X)) /* * Int32GetDatum - * Returns datum representation for a 32-bit integer. + * Returns datum representation for a 32-bit integer. */ #define Int32GetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetUInt32 - * Returns 32-bit unsigned integer value of a datum. + * Returns 32-bit unsigned integer value of a datum. */ #define DatumGetUInt32(X) ((uint32) GET_4_BYTES(X)) /* * UInt32GetDatum - * Returns datum representation for a 32-bit unsigned integer. + * Returns datum representation for a 32-bit unsigned integer. */ #define UInt32GetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetObjectId - * Returns object identifier value of a datum. + * Returns object identifier value of a datum. */ #define DatumGetObjectId(X) ((Oid) GET_4_BYTES(X)) /* * ObjectIdGetDatum - * Returns datum representation for an object identifier. + * Returns datum representation for an object identifier. */ #define ObjectIdGetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetTransactionId - * Returns transaction identifier value of a datum. + * Returns transaction identifier value of a datum. */ #define DatumGetTransactionId(X) ((TransactionId) GET_4_BYTES(X)) /* * TransactionIdGetDatum - * Returns datum representation for a transaction identifier. + * Returns datum representation for a transaction identifier. */ #define TransactionIdGetDatum(X) ((Datum) SET_4_BYTES((X))) /* * MultiXactIdGetDatum - * Returns datum representation for a multixact identifier. + * Returns datum representation for a multixact identifier. */ #define MultiXactIdGetDatum(X) ((Datum) SET_4_BYTES((X))) /* * DatumGetCommandId - * Returns command identifier value of a datum. + * Returns command identifier value of a datum. */ #define DatumGetCommandId(X) ((CommandId) GET_4_BYTES(X)) /* * CommandIdGetDatum - * Returns datum representation for a command identifier. + * Returns datum representation for a command identifier. */ #define CommandIdGetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetPointer - * Returns pointer value of a datum. + * Returns pointer value of a datum. */ #define DatumGetPointer(X) ((Pointer) (X)) /* * PointerGetDatum - * Returns datum representation for a pointer. + * Returns datum representation for a pointer. */ #define PointerGetDatum(X) ((Datum) (X)) /* * DatumGetCString - * Returns C string (null-terminated string) value of a datum. + * Returns C string (null-terminated string) value of a datum. * * Note: C string is not a full-fledged Postgres type at present, * but type input functions use this conversion for their inputs. 
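The comments above document the scalar Datum conversion macros. As a quick reference, a round trip through a pass-by-value type and a pointer type looks like the sketch below; it assumes a backend translation unit and the values and names are made up.

    /* Minimal illustration of the Datum conversion macros documented above.
     * Pass-by-value types are packed into the Datum itself; pointer types
     * store the pointer unchanged. */
    #include "postgres.h"

    static void
    datum_roundtrip_example(void)
    {
        Datum   d;
        int32   i;
        char   *s;

        d = Int32GetDatum(42);          /* pack an int32 into a Datum */
        i = DatumGetInt32(d);           /* ... and unpack it: i == 42 */

        d = CStringGetDatum("hello");   /* store the pointer as-is */
        s = DatumGetCString(d);         /* s points at the same "hello" */

        (void) i;
        (void) s;
    }
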
@@ -600,7 +605,7 @@ typedef Datum *DatumPtr; /* * CStringGetDatum - * Returns datum representation for a C string (null-terminated string). + * Returns datum representation for a C string (null-terminated string). * * Note: C string is not a full-fledged Postgres type at present, * but type output functions use this conversion for their outputs. @@ -612,14 +617,14 @@ typedef Datum *DatumPtr; /* * DatumGetName - * Returns name value of a datum. + * Returns name value of a datum. */ #define DatumGetName(X) ((Name) DatumGetPointer(X)) /* * NameGetDatum - * Returns datum representation for a name. + * Returns datum representation for a name. * * Note: Name is pass-by-reference; caller must ensure the pointed-to * value has adequate lifetime. @@ -629,7 +634,7 @@ typedef Datum *DatumPtr; /* * DatumGetInt64 - * Returns 64-bit integer value of a datum. + * Returns 64-bit integer value of a datum. * * Note: this macro hides whether int64 is pass by value or by reference. */ @@ -642,7 +647,7 @@ typedef Datum *DatumPtr; /* * Int64GetDatum - * Returns datum representation for a 64-bit integer. + * Returns datum representation for a 64-bit integer. * * Note: if int64 is pass by reference, this function returns a reference * to palloc'd space. @@ -656,7 +661,7 @@ extern Datum Int64GetDatum(int64 X); /* * DatumGetUInt64 - * Returns 64-bit unsigned integer value of a datum. + * Returns 64-bit unsigned integer value of a datum. * * Note: this macro hides whether int64 is pass by value or by reference. */ @@ -669,7 +674,7 @@ extern Datum Int64GetDatum(int64 X); /* * UInt64GetDatum - * Returns datum representation for a 64-bit unsigned integer. + * Returns datum representation for a 64-bit unsigned integer. * * Note: if int64 is pass by reference, this function returns a reference * to palloc'd space. @@ -691,7 +696,7 @@ extern Datum Int64GetDatum(int64 X); /* * DatumGetFloat4 - * Returns 4-byte floating point value of a datum. + * Returns 4-byte floating point value of a datum. * * Note: this macro hides whether float4 is pass by value or by reference. */ @@ -700,14 +705,14 @@ extern Datum Int64GetDatum(int64 X); static inline float4 DatumGetFloat4(Datum X) { - union - { - int32 value; - float4 retval; - } myunion; - - myunion.value = DatumGetInt32(X); - return myunion.retval; + union + { + int32 value; + float4 retval; + } myunion; + + myunion.value = DatumGetInt32(X); + return myunion.retval; } #else #define DatumGetFloat4(X) (* ((float4 *) DatumGetPointer(X))) @@ -715,7 +720,7 @@ DatumGetFloat4(Datum X) /* * Float4GetDatum - * Returns datum representation for a 4-byte floating point number. + * Returns datum representation for a 4-byte floating point number. * * Note: if float4 is pass by reference, this function returns a reference * to palloc'd space. @@ -724,14 +729,14 @@ DatumGetFloat4(Datum X) static inline Datum Float4GetDatum(float4 X) { - union - { - float4 value; - int32 retval; - } myunion; - - myunion.value = X; - return Int32GetDatum(myunion.retval); + union + { + float4 value; + int32 retval; + } myunion; + + myunion.value = X; + return Int32GetDatum(myunion.retval); } #else extern Datum Float4GetDatum(float4 X); @@ -739,7 +744,7 @@ extern Datum Float4GetDatum(float4 X); /* * DatumGetFloat8 - * Returns 8-byte floating point value of a datum. + * Returns 8-byte floating point value of a datum. * * Note: this macro hides whether float8 is pass by value or by reference. 
*/ @@ -748,14 +753,14 @@ extern Datum Float4GetDatum(float4 X); static inline float8 DatumGetFloat8(Datum X) { - union - { - int64 value; - float8 retval; - } myunion; - - myunion.value = DatumGetInt64(X); - return myunion.retval; + union + { + int64 value; + float8 retval; + } myunion; + + myunion.value = DatumGetInt64(X); + return myunion.retval; } #else #define DatumGetFloat8(X) (* ((float8 *) DatumGetPointer(X))) @@ -763,7 +768,7 @@ DatumGetFloat8(Datum X) /* * Float8GetDatum - * Returns datum representation for an 8-byte floating point number. + * Returns datum representation for an 8-byte floating point number. * * Note: if float8 is pass by reference, this function returns a reference * to palloc'd space. @@ -773,14 +778,14 @@ DatumGetFloat8(Datum X) static inline Datum Float8GetDatum(float8 X) { - union - { - float8 value; - int64 retval; - } myunion; - - myunion.value = X; - return Int64GetDatum(myunion.retval); + union + { + float8 value; + int64 retval; + } myunion; + + myunion.value = X; + return Int64GetDatum(myunion.retval); } #else extern Datum Float8GetDatum(float8 X); @@ -817,7 +822,7 @@ extern Datum Float8GetDatum(float8 X); /* ---------------------------------------------------------------- - * Section 3: exception handling backend support + * Section 3: exception handling backend support * ---------------------------------------------------------------- */ @@ -827,14 +832,14 @@ extern Datum Float8GetDatum(float8 X); * ExceptionalCondition must be present even when assertions are not enabled. */ extern void ExceptionalCondition(const char *conditionName, - const char *errorType, - const char *fileName, int lineNumber) pg_attribute_noreturn(); + const char *errorType, + const char *fileName, int lineNumber) pg_attribute_noreturn(); extern void ResetUsageCommon(struct rusage *save_r, struct timeval *save_t); extern void ResetUsage(void); extern void ShowUsageCommon(const char *title, struct rusage *save_r, struct - timeval *save_t); + timeval *save_t); #ifdef __TBASE__ #define CLEAR_BIT(data, bit) data = (~(1 << (bit)) & (data)) #define SET_BIT(data, bit) data = ((1 << (bit)) | (data)) @@ -843,4 +848,4 @@ extern void ShowUsageCommon(const char *title, struct rusage *save_r, struct /* for error code */ extern bool g_is_in_init_phase; #endif -#endif /* POSTGRES_H */ +#endif /* POSTGRES_H */ diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 7c88a772..a3b7876c 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * walsender.h - * Exports from replication/walsender.c. + * Exports from replication/walsender.c. * * Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group * @@ -16,14 +16,21 @@ #include "fmgr.h" +#define FLAG_AM_WALSENDER 0x01 /* Flag to set am_walsender(Am I a walsender process?) */ +#define FLAG_AM_DB_WALSENDER 0x02 /* Flag to set am_db_walsender(Am I a + walsender process and connected to + a database? + Yes: used for logical replicate. + No: used for physical replicate. */ + /* * What to do with a snapshot in create replication slot command. 
*/ typedef enum { - CRS_EXPORT_SNAPSHOT, - CRS_NOEXPORT_SNAPSHOT, - CRS_USE_SNAPSHOT + CRS_EXPORT_SNAPSHOT, + CRS_NOEXPORT_SNAPSHOT, + CRS_USE_SNAPSHOT } CRSSnapshotAction; /* global state */ @@ -33,8 +40,8 @@ extern bool am_db_walsender; extern bool wake_wal_senders; /* user-settable parameters */ -extern int max_wal_senders; -extern int wal_sender_timeout; +extern int max_wal_senders; +extern int wal_sender_timeout; extern bool log_replication_commands; extern void InitWalSender(void); @@ -56,20 +63,20 @@ extern void WalSndRqstFileReload(void); * while holding contended locks. */ #define WalSndWakeupRequest() \ - do { wake_wal_senders = true; } while (0) + do { wake_wal_senders = true; } while (0) /* * wakeup walsenders if there is work to be done */ -#define WalSndWakeupProcessRequests() \ - do \ - { \ - if (wake_wal_senders) \ - { \ - wake_wal_senders = false; \ - if (max_wal_senders > 0) \ - WalSndWakeup(); \ - } \ - } while (0) +#define WalSndWakeupProcessRequests() \ + do \ + { \ + if (wake_wal_senders) \ + { \ + wake_wal_senders = false; \ + if (max_wal_senders > 0) \ + WalSndWakeup(); \ + } \ + } while (0) -#endif /* _WALSENDER_H */ +#endif /* _WALSENDER_H */ diff --git a/src/include/utils/ps_status.h b/src/include/utils/ps_status.h index ea26cfab..097474c5 100644 --- a/src/include/utils/ps_status.h +++ b/src/include/utils/ps_status.h @@ -17,10 +17,12 @@ extern bool update_process_title; extern char **save_ps_display_args(int argc, char **argv); extern void init_ps_display(const char *username, const char *dbname, - const char *host_info, const char *initial_str); + const char *host_info, const char *initial_str); extern void set_ps_display(const char *activity, bool force); extern const char *get_ps_display(int *displen); -#endif /* PS_STATUS_H */ +extern const char *get_ps_display_fixed(int *displen); + +#endif /* PS_STATUS_H */ From 0b24a5adeb3f42f34e1c579877670dc756eea222 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 10:14:09 +0800 Subject: [PATCH 545/578] fix multi-values insert error: Failing row contains (null, null). 
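The failing case is a multi-row VALUES insert whose cells arrive as bind parameters through SPI on a coordinator; the "Failing row contains (null, null)" detail in the subject suggests the bound values were lost on the way to the insert-into-COPY rewrite. A rough reproduction sketch follows; table, column names and values are invented for illustration and SPI error checking is omitted.

    /* Rough reproduction sketch of the failing scenario: a parameterized
     * multi-row INSERT driven through SPI on a coordinator.  Table, column
     * names and values are invented. */
    #include "postgres.h"
    #include "executor/spi.h"
    #include "catalog/pg_type.h"
    #include "utils/builtins.h"

    static void
    multi_values_insert_example(void)
    {
        Oid     argtypes[4] = {INT4OID, TEXTOID, INT4OID, TEXTOID};
        Datum   values[4];

        values[0] = Int32GetDatum(1);
        values[1] = CStringGetTextDatum("a");
        values[2] = Int32GetDatum(2);
        values[3] = CStringGetTextDatum("b");

        SPI_connect();
        /* Two VALUES rows, every cell supplied as a parameter. */
        SPI_execute_with_args(
            "INSERT INTO t_demo(id, val) VALUES ($1, $2), ($3, $4)",
            4, argtypes, values, NULL, false, 0);
        SPI_finish();
    }

The fix below threads the literal VALUES data (data_list) into the cached plan source and, in GetCachedPlan(), renders bound parameters with their type output functions before handing the rows to the COPY FROM path.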
TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 --- src/backend/executor/spi.c | 37 +++++++++- src/backend/nodes/copyfuncs.c | 16 ++++- src/backend/parser/analyze.c | 16 ++++- src/backend/utils/cache/plancache.c | 101 ++++++++++++++++++++++++++-- 4 files changed, 159 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 808c75f8..fa81c2b1 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -34,7 +34,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/typcache.h" - +#include "parser/analyze.h" uint64 SPI_processed = 0; Oid SPI_lastoid = InvalidOid; @@ -1881,6 +1881,41 @@ _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan) _SPI_current->queryEnv); } + if (unlikely(IS_PGXC_COORDINATOR && list_length(stmt_list) == 1 + && IsA(parsetree->stmt, InsertStmt))) + { + Query *parse = (Query *)linitial(stmt_list); + /* + * set insert_into when we get multi-values insert, not + * often happen + */ + if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + { + MemoryContext old_ctx; + InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; + InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; + int colIdx = 0; + int rowIdx = 0; + + plansource->insert_into = true; + old_ctx = MemoryContextSwitchTo(plansource->context); + if (iStmt->data_list != NULL) + { + pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); + for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) + { + pStmt->data_list[rowIdx] = (char **)palloc( + sizeof(char *) * iStmt->ninsert_columns); + for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); + } + } + pStmt->ndatarows = iStmt->ndatarows; + pStmt->ninsert_columns = iStmt->ninsert_columns; + MemoryContextSwitchTo(old_ctx); + } + } + /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, stmt_list, diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 876c407e..5bc4e05c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3303,6 +3303,8 @@ static InsertStmt * _copyInsertStmt(const InsertStmt *from) { InsertStmt *newnode = makeNode(InsertStmt); + int colIdx = 0; + int rowIdx = 0; COPY_NODE_FIELD(relation); COPY_NODE_FIELD(cols); @@ -3313,8 +3315,20 @@ _copyInsertStmt(const InsertStmt *from) COPY_SCALAR_FIELD(override); #ifdef __TBASE__ COPY_SCALAR_FIELD(ninsert_columns); + if(from->data_list != NULL) + { + newnode->data_list = + (char ***)palloc(sizeof(char **) * from->ndatarows); + for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) { + newnode->data_list[rowIdx] = + (char **)palloc(sizeof(char *) * from->ninsert_columns); + for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) + newnode->data_list[rowIdx][colIdx] = + pstrdup(from->data_list[rowIdx][colIdx]); + } + } + COPY_SCALAR_FIELD(ndatarows); #endif - return newnode; } diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 539cd7c8..6e2cf055 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -875,6 +875,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) List *colcollations = NIL; int sublist_length = -1; bool lateral = false; + bool all_params = true; Assert(selectStmt->intoClause == NULL); @@ -1005,11 +1006,12 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) switch(nodeTag(v)) { case 
T_A_Const: + all_params = false; break; case T_TypeCast: { TypeCast *cast = (TypeCast *)v; - + all_params = false; if (IsA(cast->arg, A_Const)) { v = (A_Const *)cast->arg; @@ -1020,6 +1022,9 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } break; } + case T_ParamRef: + copy_from = all_params; + break; default: { copy_from = false; @@ -1034,7 +1039,9 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } index++; - + if(all_params){ + continue; + } /* A_Const */ switch(v->val.type) { @@ -1092,7 +1099,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (copy_from) { - if (ndatarows != column_index) + if (ndatarows != column_index && !all_params) { elog(ERROR, "datarow count mismatched, expected %d, result %d", ndatarows, column_index); @@ -1100,7 +1107,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->copy_filename = palloc(MAXPGPATH); snprintf(qry->copy_filename, MAXPGPATH, "%s", "Insert_into to Copy_from(Simple Protocl)"); stmt->ndatarows = ndatarows; + if(!all_params) stmt->data_list = data_list; + else + goto TRANSFORM_VALUELISTS; } else { diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index b69aa7cb..39915901 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -71,6 +71,7 @@ #include "utils/rls.h" #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "utils/lsyscache.h" #ifdef PGXC #include "commands/prepare.h" #include "pgxc/execRemote.h" @@ -956,7 +957,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, MemoryContext plan_context; MemoryContext oldcxt = CurrentMemoryContext; ListCell *lc; - + char ***data_list = NULL; /* * Normally the querytree should be valid already, but if it's not, * rebuild it. @@ -1004,6 +1005,18 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, */ plist = pg_plan_queries(qlist, plansource->cursor_options, boundParams); + /* + * When get the cached multi-values insert plan, we transform insert to copyfrom plan + */ + if (plansource->insert_into && plansource->raw_parse_tree != NULL + && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) + { + InsertStmt *iStmt = + (InsertStmt *)plansource->raw_parse_tree->stmt; + Query *query = (Query*) linitial(qlist); + bool suc = false; + plist=transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); + } /* Release snapshot if we got one */ if (snapshot_set) PopActiveSnapshot(); @@ -1024,12 +1037,17 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * Copy plan into the new context. 
*/ MemoryContextSwitchTo(plan_context); - + /* + * when we got a CopyStmt tansformed from multi values InsertStmt, + * no need copy data_list, we set later + */ plist = copyObject(plist); } else plan_context = CurrentMemoryContext; + + #ifdef PGXC /* * If this plansource belongs to a named prepared statement, store the stmt @@ -1355,7 +1373,78 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, } Assert(plan != NULL); + if (plansource->insert_into && plansource->raw_parse_tree != NULL && + IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { + MemoryContext old_top; + InsertStmt *iStmt = (InsertStmt *)plansource->raw_parse_tree->stmt; + char ***data_list = NULL; + PlannedStmt *planstmt = (PlannedStmt *)linitial(plan->stmt_list); + CopyStmt *copyStmt = (CopyStmt *)planstmt->utilityStmt; + /* + * we got parameters passed in, need trans them into data_list in + * InsertStmt, then trans the insertStmt to copyStmt + */ + if (boundParams != NULL) + { + int colCnt = iStmt->ninsert_columns; + int i = 0; + char *valStr = NULL; + int colIdx = 0; + int rowIdx = 0; + + if (colCnt == 0 || boundParams->numParams == 0 || + boundParams->numParams % colCnt != 0) + plansource->insert_into = false; + + old_top = MemoryContextSwitchTo(TopTransactionContext); + data_list = (char ***)palloc0(sizeof(char **) * + (boundParams->numParams / colCnt)); + for (i = 0; i < (boundParams->numParams / colCnt); i++) + { + data_list[i] = (char **)palloc0(sizeof(char *) * colCnt); + } + for (i = 0; i < boundParams->numParams; i++) + { + Oid typOutput; + bool typIsVarlena; + Datum value; + Oid ptype = boundParams->params[i].ptype; + getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); + + if(typIsVarlena) + { + value = PointerGetDatum(PG_DETOAST_DATUM(boundParams->params[i].value)); + } + else + { + value = boundParams->params[i].value; + } + + if (boundParams->params[i].isnull) + data_list[rowIdx][colIdx++] = NULL; + else { + valStr = OidOutputFunctionCall(typOutput, value); + data_list[rowIdx][colIdx++] = pstrdup(valStr); + } + if (colIdx >= colCnt) + { + colIdx = 0; + rowIdx++; + } + } + copyStmt->data_list = data_list; + copyStmt->ndatarows = rowIdx; + copyStmt->ncolumns = colCnt; + MemoryContextSwitchTo(old_top); + } + else if(iStmt->data_list != NULL) + { + copyStmt->data_list = iStmt->data_list; + copyStmt->ndatarows = iStmt->ndatarows; + copyStmt->ncolumns = iStmt->ninsert_columns; + } + } /* Flag the plan as in use by caller */ if (useResOwner) ResourceOwnerEnlargePlanCacheRefs(CurrentResourceOwner); @@ -1364,10 +1453,10 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, ResourceOwnerRememberPlanCacheRef(CurrentResourceOwner, plan); /* - * Saved plans should be under CacheMemoryContext so they will not go away - * until their reference count goes to zero. In the generic-plan cases we - * already took care of that, but for a custom plan, do it as soon as we - * have created a reference-counted link. + * Saved plans should be under CacheMemoryContext so they will not go + * away until their reference count goes to zero. In the generic-plan + * cases we already took care of that, but for a custom plan, do it as + * soon as we have created a reference-counted link. */ if (customplan && plansource->is_saved) { From 7a91cc7edd9747643999b1dd2e1a897ece0cf9a8 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 15:40:59 +0800 Subject: [PATCH 546/578] fix multi-values insert error: Failing row contains (null, null). 
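For reference, the data_list that this follow-up factors into _SPI_multi_insert_rewrite() is a per-row array of per-column text cells (ndatarows by ninsert_columns). A sketch of its shape, with invented values:

    /* Sketch of the InsertStmt data_list layout that _SPI_multi_insert_rewrite()
     * deep-copies into the plan source: ndatarows entries, each holding
     * ninsert_columns nul-terminated text cells.  Values are invented. */
    #include "postgres.h"

    static char ***
    build_demo_data_list(int *ndatarows, int *ninsert_columns)
    {
        char ***data_list;
        int     row;

        *ndatarows = 2;
        *ninsert_columns = 2;

        data_list = (char ***) palloc(sizeof(char **) * *ndatarows);
        for (row = 0; row < *ndatarows; row++)
            data_list[row] = (char **) palloc(sizeof(char *) * *ninsert_columns);

        data_list[0][0] = pstrdup("1");
        data_list[0][1] = pstrdup("a");
        data_list[1][0] = pstrdup("2");
        data_list[1][1] = pstrdup("b");

        return data_list;   /* [row][column] -> textual cell value */
    }

This patch also backs out the bind-parameter path added in the previous commit: when the VALUES rows contain ParamRefs the statement is left as a normal insert, as noted by the "TODO: now we don't support param" comment in _SPI_execute_plan() below.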
TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 --- src/backend/executor/spi.c | 104 ++++++++++++++++++---------- src/backend/parser/analyze.c | 15 +--- src/backend/utils/cache/plancache.c | 99 ++------------------------ 3 files changed, 77 insertions(+), 141 deletions(-) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index fa81c2b1..510f1fcb 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -53,6 +53,10 @@ static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, static void _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan); #endif +static void _SPI_multi_insert_rewrite( + CachedPlanSource *plansource, RawStmt *parsetree, + List *stmtList); + static void _SPI_prepare_plan(const char *src, SPIPlanPtr plan); static void _SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan); @@ -1881,40 +1885,7 @@ _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan) _SPI_current->queryEnv); } - if (unlikely(IS_PGXC_COORDINATOR && list_length(stmt_list) == 1 - && IsA(parsetree->stmt, InsertStmt))) - { - Query *parse = (Query *)linitial(stmt_list); - /* - * set insert_into when we get multi-values insert, not - * often happen - */ - if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) - { - MemoryContext old_ctx; - InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; - InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; - int colIdx = 0; - int rowIdx = 0; - - plansource->insert_into = true; - old_ctx = MemoryContextSwitchTo(plansource->context); - if (iStmt->data_list != NULL) - { - pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); - for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) - { - pStmt->data_list[rowIdx] = (char **)palloc( - sizeof(char *) * iStmt->ninsert_columns); - for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) - pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); - } - } - pStmt->ndatarows = iStmt->ndatarows; - pStmt->ninsert_columns = iStmt->ninsert_columns; - MemoryContextSwitchTo(old_ctx); - } - } + _SPI_multi_insert_rewrite(plansource, parsetree, stmt_list); /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, @@ -2114,7 +2085,7 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, plan->nargs, _SPI_current->queryEnv); } - + _SPI_multi_insert_rewrite(plansource, parsetree, stmt_list); /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, stmt_list, @@ -2132,6 +2103,25 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, * plan, the refcount must be backed by the CurrentResourceOwner. 
*/ cplan = GetCachedPlan(plansource, paramLI, plan->saved, _SPI_current->queryEnv); + /* + * TODO: now we don't support param, if multi values contains paramref, do not + * transform to CopyStmt, refactor later + */ + if (plansource->insert_into && plansource->raw_parse_tree != NULL && + IsA(plansource->raw_parse_tree->stmt, InsertStmt)) + { + bool suc; + InsertStmt *iStmt = (InsertStmt *) plansource->raw_parse_tree->stmt; + PlannedStmt *pStmt = (PlannedStmt *) linitial(cplan->stmt_list); + Query *query = (Query*) linitial(plansource->query_list); + if (!(pStmt->utilityStmt && IsA(pStmt->utilityStmt, CopyStmt)) && iStmt->data_list != NULL) + { + MemoryContext old_ctx; + old_ctx = MemoryContextSwitchTo(plansource->context); + cplan->stmt_list = transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); + MemoryContextSwitchTo(old_ctx); + } + } stmt_list = cplan->stmt_list; /* @@ -2889,3 +2879,47 @@ SPI_register_trigger_data(TriggerData *tdata) return SPI_OK_TD_REGISTER; } + +/* + * _SPI_multi_insert_rewrite + * If current stmt is a multi-line insert statement, copy the + * datalist to the raw_parse_tree in plansource and set plansource->insert_into + */ +static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, + RawStmt *parsetree, List *stmtList) +{ + if (IS_PGXC_COORDINATOR && list_length(stmtList) == 1 + && IsA(parsetree->stmt, InsertStmt)) + { + Query *parse = (Query *)linitial(stmtList); + /* + * set insert_into when we get multi-values insert, not + * often happen + */ + if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + { + MemoryContext old_ctx; + InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; + InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; + int colIdx = 0; + int rowIdx = 0; + + plansource->insert_into = true; + old_ctx = MemoryContextSwitchTo(plansource->context); + if (iStmt->data_list != NULL) + { + pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); + for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) + { + pStmt->data_list[rowIdx] = (char **)palloc( + sizeof(char *) * iStmt->ninsert_columns); + for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); + } + } + pStmt->ndatarows = iStmt->ndatarows; + pStmt->ninsert_columns = iStmt->ninsert_columns; + MemoryContextSwitchTo(old_ctx); + } + } +} diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 6e2cf055..f5a26ad3 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -648,6 +648,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->isSingleValues = false; qry->isMultiValues = false; stmt->ninsert_columns = 0; + qry->copy_filename = NULL; #endif /* process the WITH clause independently of all else */ @@ -875,7 +876,6 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) List *colcollations = NIL; int sublist_length = -1; bool lateral = false; - bool all_params = true; Assert(selectStmt->intoClause == NULL); @@ -1006,12 +1006,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) switch(nodeTag(v)) { case T_A_Const: - all_params = false; break; case T_TypeCast: { TypeCast *cast = (TypeCast *)v; - all_params = false; if (IsA(cast->arg, A_Const)) { v = (A_Const *)cast->arg; @@ -1022,9 +1020,6 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } break; } - case T_ParamRef: - copy_from = all_params; - break; default: { copy_from = false; @@ -1039,9 +1034,6 @@ 
transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } index++; - if(all_params){ - continue; - } /* A_Const */ switch(v->val.type) { @@ -1099,7 +1091,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (copy_from) { - if (ndatarows != column_index && !all_params) + if (ndatarows != column_index) { elog(ERROR, "datarow count mismatched, expected %d, result %d", ndatarows, column_index); @@ -1107,10 +1099,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->copy_filename = palloc(MAXPGPATH); snprintf(qry->copy_filename, MAXPGPATH, "%s", "Insert_into to Copy_from(Simple Protocl)"); stmt->ndatarows = ndatarows; - if(!all_params) stmt->data_list = data_list; - else - goto TRANSFORM_VALUELISTS; } else { diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 39915901..ac2037a4 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -71,7 +71,6 @@ #include "utils/rls.h" #include "utils/snapmgr.h" #include "utils/syscache.h" -#include "utils/lsyscache.h" #ifdef PGXC #include "commands/prepare.h" #include "pgxc/execRemote.h" @@ -957,7 +956,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, MemoryContext plan_context; MemoryContext oldcxt = CurrentMemoryContext; ListCell *lc; - char ***data_list = NULL; + /* * Normally the querytree should be valid already, but if it's not, * rebuild it. @@ -1005,18 +1004,6 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, */ plist = pg_plan_queries(qlist, plansource->cursor_options, boundParams); - /* - * When get the cached multi-values insert plan, we transform insert to copyfrom plan - */ - if (plansource->insert_into && plansource->raw_parse_tree != NULL - && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) - { - InsertStmt *iStmt = - (InsertStmt *)plansource->raw_parse_tree->stmt; - Query *query = (Query*) linitial(qlist); - bool suc = false; - plist=transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); - } /* Release snapshot if we got one */ if (snapshot_set) PopActiveSnapshot(); @@ -1037,10 +1024,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * Copy plan into the new context. 
*/ MemoryContextSwitchTo(plan_context); - /* - * when we got a CopyStmt tansformed from multi values InsertStmt, - * no need copy data_list, we set later - */ + plist = copyObject(plist); } else @@ -1373,78 +1357,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, } Assert(plan != NULL); - if (plansource->insert_into && plansource->raw_parse_tree != NULL && - IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { - MemoryContext old_top; - InsertStmt *iStmt = (InsertStmt *)plansource->raw_parse_tree->stmt; - char ***data_list = NULL; - PlannedStmt *planstmt = (PlannedStmt *)linitial(plan->stmt_list); - CopyStmt *copyStmt = (CopyStmt *)planstmt->utilityStmt; - /* - * we got parameters passed in, need trans them into data_list in - * InsertStmt, then trans the insertStmt to copyStmt - */ - if (boundParams != NULL) - { - int colCnt = iStmt->ninsert_columns; - int i = 0; - char *valStr = NULL; - int colIdx = 0; - int rowIdx = 0; - - if (colCnt == 0 || boundParams->numParams == 0 || - boundParams->numParams % colCnt != 0) - plansource->insert_into = false; - - old_top = MemoryContextSwitchTo(TopTransactionContext); - data_list = (char ***)palloc0(sizeof(char **) * - (boundParams->numParams / colCnt)); - for (i = 0; i < (boundParams->numParams / colCnt); i++) - { - data_list[i] = (char **)palloc0(sizeof(char *) * colCnt); - } - for (i = 0; i < boundParams->numParams; i++) - { - Oid typOutput; - bool typIsVarlena; - Datum value; - Oid ptype = boundParams->params[i].ptype; - getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); - - if(typIsVarlena) - { - value = PointerGetDatum(PG_DETOAST_DATUM(boundParams->params[i].value)); - } - else - { - value = boundParams->params[i].value; - } - - if (boundParams->params[i].isnull) - data_list[rowIdx][colIdx++] = NULL; - else { - valStr = OidOutputFunctionCall(typOutput, value); - data_list[rowIdx][colIdx++] = pstrdup(valStr); - } - if (colIdx >= colCnt) - { - colIdx = 0; - rowIdx++; - } - } - copyStmt->data_list = data_list; - copyStmt->ndatarows = rowIdx; - copyStmt->ncolumns = colCnt; - MemoryContextSwitchTo(old_top); - } - else if(iStmt->data_list != NULL) - { - copyStmt->data_list = iStmt->data_list; - copyStmt->ndatarows = iStmt->ndatarows; - copyStmt->ncolumns = iStmt->ninsert_columns; - } - } /* Flag the plan as in use by caller */ if (useResOwner) ResourceOwnerEnlargePlanCacheRefs(CurrentResourceOwner); @@ -1453,10 +1366,10 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, ResourceOwnerRememberPlanCacheRef(CurrentResourceOwner, plan); /* - * Saved plans should be under CacheMemoryContext so they will not go - * away until their reference count goes to zero. In the generic-plan - * cases we already took care of that, but for a custom plan, do it as - * soon as we have created a reference-counted link. + * Saved plans should be under CacheMemoryContext so they will not go away + * until their reference count goes to zero. In the generic-plan cases we + * already took care of that, but for a custom plan, do it as soon as we + * have created a reference-counted link. */ if (customplan && plansource->is_saved) { From fba17235e072054fb8f820760c8f04ee08f18005 Mon Sep 17 00:00:00 2001 From: whalesong Date: Sun, 24 Apr 2022 16:35:29 +0800 Subject: [PATCH 547/578] Revert "bugfix: server time different cause 2pc clean error (merge request 1170), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096815567" This reverts commit f442a37db1fe4f279b63b421a3a9f3305eb6c636. 
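Concretely, the revert drops the GTS-based bookkeeping that commit introduced (the global_prepare_timestamp field in the 2pc file and xlog record, pgxc_get_2pc_prepare_timestamp(), the start-xid liveness checks) and returns pg_clean to comparing server-local timestamps when deciding whether a prepared transaction may be cleaned. A minimal sketch of the restored check in check_txn_global_status(), condensed from the fields as they appear after the revert:

    /* restored behaviour: compare the caller-supplied abnormal time with the
     * node-local prepare timestamp recorded for this transaction */
    node_idx = find_node_index(abnormal_nodeoid);
    if (node_idx >= 0 &&
        abnormal_time < txn->prepare_timestamp[node_idx])
        return TXN_STATUS_INPROGRESS;   /* prepared after the failure point, do not clean */

Because abnormal_time and prepare_timestamp are wall-clock values taken on different servers, this comparison is again sensitive to clock skew between nodes, which is the condition the reverted commit had addressed.
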
--- contrib/pg_clean/pg_clean--1.0.sql | 6 - .../pg_clean/pg_clean--unpackaged--1.0.sql | 1 - contrib/pg_clean/pg_clean.c | 934 ++++-------------- src/backend/access/transam/twophase.c | 51 +- src/backend/access/transam/xlog.c | 42 +- src/backend/pgxc/pool/execRemote.c | 49 +- src/backend/postmaster/clean2pc.c | 45 +- src/backend/utils/misc/guc.c | 20 +- src/include/access/twophase.h | 4 - 9 files changed, 242 insertions(+), 910 deletions(-) diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql index be8623f7..e5bbc9ca 100644 --- a/contrib/pg_clean/pg_clean--1.0.sql +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -50,11 +50,6 @@ RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; -CREATE FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) -RETURNS text -AS 'MODULE_PATHNAME' -LANGUAGE C; - CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) RETURNS text AS 'MODULE_PATHNAME' @@ -101,7 +96,6 @@ GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; -GRANT ALL ON FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql index d173a607..a6a67659 100644 --- a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -9,7 +9,6 @@ ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); -ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_prepare_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 4ee21911..5a20456e 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,14 +63,8 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120 - -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 1 /* should not clean twophase trans prepared in 1s */ +#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ #define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 10 /* should not clean twophase trans prepared in 10s */ -#define LEAST_CHECK_TIME_INTERVAL 3 /* should not check twophase trans prepared in 3s */ -#endif GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; @@ -78,15 +72,19 @@ PG_MODULE_MAGIC; #define MAX_GID 64 +#define CLEAN_CHECK_TIMES_DEFAULT 3 +#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 + +#define CLEAN_NODE_CHECK_TIMES 5 +#define CLEAN_NODE_CHECK_INTERVAL 500000 + #define MAX_DBNAME 64 #define GET_START_XID "startxid:" -#define GET_PREPARE_TIMESTAMP "global_prepare_timestamp:" 
#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" #define GET_START_NODE "startnode:" #define GET_NODE "nodes:" #define GET_XID "\nxid:" #define GET_READONLY "readonly" -#define ROLLBACK_POSTFIX ".rollback" /* 2pc file postfix when the 2pc is rollbacked */ #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 #define STRING_BUFF_LEN 1024 @@ -192,7 +190,6 @@ typedef struct txn_info TXN_STATUS *txn_stat; /* Array for each nodes */ char *msg; /* Notice message for this txn. */ GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ - GlobalTimestamp global_prepare_timestamp; /* get global_prepare_timestamp from node once it is prepared*/ TXN_STATUS global_txn_stat; OPERATION op; @@ -265,10 +262,8 @@ database_info *last_database_info = NULL; bool execute = false; int total_twopc_txn = 0; -TimestampTz current_time = 0; -TimestampTz abnormal_time = 0; -GlobalTimestamp current_gts = InvalidGlobalTimestamp; /* use to save current gts */ -GlobalTimestamp abnormal_gts = InvalidGlobalTimestamp; /* use to save abnormal gts, clean 2PCs which prepare gts less than abnormal gts */ +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; char *abnormal_nodename = NULL; Oid abnormal_nodeoid = InvalidOid; bool clear_2pc_belong_node = false; @@ -346,14 +341,6 @@ static void static void get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); -uint32 get_start_xid_from_gid(char *gid); -char *get_start_node_from_gid(char *gid); -Oid get_start_node_oid_from_gid(char *gid); - -bool is_xid_running_on_node(uint32 xid, Oid node_oid); -bool is_gid_start_xid_running(char *gid); -bool is_txn_start_xid_running(txn_info *txn); - Datum pg_clean_execute(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_clean_execute); Datum pg_clean_execute(PG_FUNCTION_ARGS) @@ -507,7 +494,6 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) char txn_status[100]; char txn_op[100]; char txn_op_issuccess[100]; - int64 time_gap = 0; Datum values[ACCESS_CONTROL_ATTR_NUM]; bool nulls[ACCESS_CONTROL_ATTR_NUM]; @@ -554,36 +540,21 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) execute = true; clear_2pc_belong_node = true; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pg_clean_execute_on_node: node name is empty"); - } abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); if (InvalidOid == abnormal_nodeoid) { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of " - "invalid nodename '%s'", abnormal_nodename); + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - time_gap = current_time - abnormal_time; - if (time_gap < LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) + if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - /*time gap less than LEAST_CLEAN_TIME_INTERVAL, can not clean*/ - elog(ERROR, "pg_clean_execute_on_node, least clean interval is %ds, " - "abnormal time: " INT64_FORMAT ", current time: " INT64_FORMAT, + elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " + "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } - current_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - /*get invalid gts, can not clean*/ - elog(ERROR, "pg_clean_execute_on_node, get invalid gts"); - } - abnormal_gts = current_gts - 
time_gap; - /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -799,9 +770,7 @@ static void ResetGlobalVariables(void) head_database_info = last_database_info = NULL; current_time = 0; - abnormal_time = 0; - current_gts = InvalidGlobalTimestamp; - abnormal_gts = InvalidGlobalTimestamp; + abnormal_time = InvalidGlobalTimestamp; abnormal_nodename = NULL; abnormal_nodeoid = InvalidOid; clear_2pc_belong_node = false; @@ -953,7 +922,7 @@ static void getDatabaseList(void) { int i; TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_catalog.pg_database"; + const char *query_db = "select datname::text from pg_database;"; /*add datname into tail of head_database_info*/ if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) { @@ -1010,12 +979,6 @@ static void getTxnInfoOnNodesAll(void) { int i; current_time = GetCurrentTimestamp(); - current_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - /*get invalid gts, get txn info error*/ - elog(ERROR, "getTxnInfoOnNodesAll, get invalid gts"); - } /*upload 2PC transaction from CN*/ for (i = 0; i < cn_nodes_num; i++) { @@ -1039,12 +1002,10 @@ void getTxnInfoOnNode(Oid node) TupleTableSlots result_txn; Datum execute_res; char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, " - "owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_catalog.pg_prepared_xacts"; - const char *query_txn_status_execute = "select transaction::text, gid::text, " - "owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_catalog.pg_prepared_xacts where database = '%s'"; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); if (execute) @@ -1145,7 +1106,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) char *file_content = NULL; uint32 startxid = 0; char *str_startxid = NULL; - char *str_prepare_gts = NULL; char *str_timestamp = NULL; char *temp = NULL; Oid temp_nodeoid; @@ -1153,7 +1113,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) int temp_nodeidx; char stmt[1024]; static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid); + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) { @@ -1166,12 +1126,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) { file_content = TTSgetvalue(&result, 0, 0); - if (strlen(file_content) == 0) - { - elog(LOG, "gid: %s, 2pc file is not exist", txn->gid); - return TWOPHASE_FILE_NOT_EXISTS; - } - if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) { txn->is_readonly = true; @@ -1181,7 +1135,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) } startnode = strstr(file_content, GET_START_NODE); str_startxid = strstr(file_content, GET_START_XID); - str_prepare_gts = strstr(file_content, GET_PREPARE_TIMESTAMP); partnodes = strstr(file_content, GET_NODE); temp = 
strstr(file_content, GET_COMMIT_TIMESTAMP); @@ -1193,7 +1146,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) temp = strstr(temp, GET_COMMIT_TIMESTAMP); } - /* get start node name */ if (startnode) { startnode += strlen(GET_START_NODE); @@ -1201,7 +1153,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->origcoord = get_pgxc_nodeoid(startnode); } - /* get start xid */ if (str_startxid) { str_startxid += strlen(GET_START_XID); @@ -1210,7 +1161,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->startxid = startxid; } - /* get participated nodes */ if (partnodes) { partnodes += strlen(GET_NODE); @@ -1233,37 +1183,15 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) return res; } - /* get prepare gts */ - if (str_prepare_gts) - { - str_prepare_gts += strlen(GET_PREPARE_TIMESTAMP); - str_prepare_gts = strtok(str_prepare_gts, "\n"); - txn->global_prepare_timestamp = strtoull(str_prepare_gts, NULL, 10); - } - else - { - txn->global_prepare_timestamp = InvalidGlobalTimestamp; - } - - /* get commit gts */ if (str_timestamp) { str_timestamp += strlen(GET_COMMIT_TIMESTAMP); str_timestamp = strtok(str_timestamp, "\n"); txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); } - else - { - txn->global_commit_timestamp = InvalidGlobalTimestamp; - } - - elog(DEBUG1, "get 2pc txn: %s, partnodes in nodename: %s(nodeoid:%u), " - "partnodes: (%s), startnode: %s(startnodeoid: %u), startxid: %u, " - "global_prepare_timestamp: %ld, global_commit_timestamp: %ld", - txn->gid, get_pgxc_nodename(node_oid), node_oid, - partnodes, startnode, txn->origcoord, startxid, - txn->global_prepare_timestamp, txn->global_commit_timestamp); + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); /* in explicit transaction startnode participate the transaction */ if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) { @@ -1529,8 +1457,7 @@ void getTxnInfoOnOtherNodes(txn_info *txn) node_oid = get_pgxc_nodeoid(ptr); status = GetTransactionPartNodes(txn, node_oid); } - - if (status == TWOPHASE_FILE_NOT_EXISTS) + else { for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) { @@ -1695,7 +1622,7 @@ void getTxnStatus(txn_info *txn, int node_idx) TupleTableSlots result; static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx]); + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); node_oid = find_node_oid(node_idx); if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) @@ -1786,10 +1713,6 @@ char *get2PCInfo(const char *tid) return NULL; } -/* - * pgxc_get_2pc_file - * Get 2pc file content - */ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) @@ -1798,10 +1721,6 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) char *result = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1813,10 +1732,7 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_nodes - * Get 2pc participants - */ + Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum 
pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) @@ -1826,10 +1742,6 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1844,13 +1756,10 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) return PointerGetDatum(t_result); } } + PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_startnode - * Get 2pc start node - */ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) @@ -1860,10 +1769,6 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1882,10 +1787,6 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_startxid - * Get 2pc start xid - */ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) @@ -1895,10 +1796,6 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) char *startxid = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1916,44 +1813,7 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_prepare_timestamp - * Get 2pc prepare timestamp - */ -Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_prepare_timestamp); -Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *prepare_timestamp = NULL; - text *t_result = NULL; - - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - prepare_timestamp = strstr(result, GET_PREPARE_TIMESTAMP); - if (NULL != prepare_timestamp) - { - prepare_timestamp += strlen(GET_PREPARE_TIMESTAMP); - prepare_timestamp = strtok(prepare_timestamp, "\n"); - t_result = cstring_to_text(prepare_timestamp); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} -/* - * pgxc_get_2pc_commit_timestamp - * Get 2pc commit timestamp - */ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) @@ -1963,10 +1823,6 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) char *commit_timestamp = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1984,23 +1840,17 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_xid - * Get 2pc local xid - */ + + Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { - GlobalTransactionId xid; char *tid = NULL; char *result = NULL; char *str_xid = NULL; + GlobalTransactionId xid; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -2018,31 +1868,16 @@ Datum 
pgxc_get_2pc_xid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_remove_2pc_records - * Remove a 2pc file - */ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { - char *tid = NULL; - - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); remove_2pc_records(tid, true); pfree(tid); PG_RETURN_BOOL(true); } -/* - * pgxc_clear_2pc_records - * Clear all 2pc files which are not running - */ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) @@ -2066,8 +1901,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) elog(ERROR, "can only called on coordinator"); } - elog(LOG, "clear 2pc files"); - mycontext = AllocSetContextCreate(CurrentMemoryContext, "clean_check", ALLOCSET_DEFAULT_MINSIZE, @@ -2076,6 +1909,25 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(mycontext); ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -2096,14 +1948,28 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) { (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } - /*get all database info*/ getDatabaseList(); /*get all info of 2PC transactions*/ getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } - /*delete all rest 2pc files in each cn*/ + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ for (i = 0; i < cn_nodes_num; i++) { if (0 == result[i].slot_count) @@ -2111,54 +1977,24 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) - { continue; - } - - /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - for (;ptr != NULL; ptr = strtok(NULL, ",")) + while(ptr) { if (count >= MAXIMUM_CLEAR_FILE) - { break; - } - - /*whether 2pc is running?*/ - if (find_txn(ptr)) - { - /*2pc is running, do not delete its file*/ - continue; - } - - /*whether 2pc is rollbacked?*/ - if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) + if (!find_txn(ptr)) { - /*2pc is not rollbacked*/ - - /*whether 2pc start xid transaction is running?*/ - if (is_gid_start_xid_running(ptr)) - { - /*2pc start xid transaction is running, do not delete its file*/ - elog(LOG, "2PC '%s' is running", ptr); - continue; - } - } - - /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - elog(LOG, "clear 2pc file: %s", ptr); - if (execute_query_on_single_node(cn_node_list[i], - clear_query, 1, &clear_result) == (Datum)0) - { + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) res = false; - } DropTupleTableSlots(&clear_result); count++; } + ptr = strtok(NULL, ","); + } } - /*delete all rest 2pc files in each dn*/ for (i = 0; i < dn_nodes_num; i++) { if (0 == 
result[cn_nodes_num+i].slot_count) @@ -2166,51 +2002,22 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) - { continue; - } - - /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - for (;ptr != NULL; ptr = strtok(NULL, ",")) + while(ptr) { if (count >= MAXIMUM_CLEAR_FILE) - { break; - } - - /*whether 2pc is running?*/ - if (find_txn(ptr)) - { - /*2pc is running, do not delete its file*/ - continue; - } - - /*whether 2pc is rollbacked?*/ - if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) - { - /*2pc is not rollbacked*/ - - /*whether 2pc start xid transaction is running?*/ - if (is_gid_start_xid_running(ptr)) + if (!find_txn(ptr)) { - /*2pc start xid transaction is running, do not delete its file*/ - elog(LOG, "2PC '%s' is running", ptr); - continue; - } - } - - /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - elog(LOG, "clear 2pc file: %s", ptr); - if (execute_query_on_single_node(dn_node_list[i], - clear_query, 1, &clear_result) == (Datum)0) - { + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) res = false; - } DropTupleTableSlots(&clear_result); count++; } + ptr = strtok(NULL, ","); + } } for (i = 0; i < pgxc_clean_node_count; i++) @@ -2226,10 +2033,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } -/* - * pgxc_get_record_list - * Get 2pc files list - */ Datum pgxc_get_record_list(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_record_list); Datum pgxc_get_record_list(PG_FUNCTION_ARGS) @@ -2244,11 +2047,7 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) recordList = get_2pc_list_from_cache(&count); if (count >= MAXIMUM_OUTPUT_FILE) { - if (NULL == recordList) - { - elog(PANIC, "recordList is NULL"); - } - + Assert(NULL != recordList); t_recordList = cstring_to_text(recordList); return PointerGetDatum(t_recordList); } @@ -2331,18 +2130,8 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pgxc_commit_on_node: node name is empty"); - } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - - if (0 == PG_GETARG_DATUM(1)) - { - elog(ERROR, "pgxc_commit_on_node: gid is empty"); - } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2374,10 +2163,6 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) else { txn->global_commit_timestamp = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - elog(ERROR, "pgxc_commit_on_node, get invalid gts"); - } } } @@ -2451,18 +2236,8 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pgxc_abort_on_node: node name is empty"); - } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - - if (0 == PG_GETARG_DATUM(1)) - { - elog(ERROR, "pgxc_abort_on_node: gid is empty"); - } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2628,15 +2403,6 @@ bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const cha TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); } - if (InvalidGlobalTimestamp != txn->global_prepare_timestamp && - pgxc_node_send_prepare_timestamp(conn, txn->global_prepare_timestamp)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send prepare timestamp for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) { ereport(ERROR, @@ -2662,13 +2428,7 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); - - /* abnormal node oid must be valid here */ - if (InvalidOid == abnormal_nodeoid) - { - elog(PANIC, "abnormal_nodeoid is invalid"); - } - + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2688,7 +2448,9 @@ bool check_2pc_belong_node(txn_info * txn) if (InvalidOid == txn->origcoord) { + char *startnode = NULL; int node_oid = InvalidOid; + char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { @@ -2696,16 +2458,39 @@ bool check_2pc_belong_node(txn_info * txn) return true; } - /* Get start node oid from gid */ - node_oid = get_start_node_oid_from_gid(txn->gid); - if (node_oid == InvalidOid) + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) { - elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); txn->belong_abnormal_node = false; return false; } - elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); if (abnormal_nodeoid == node_oid) { @@ -2733,13 +2518,22 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { - bool is_running = true; + int i = 0; + bool check_ok = false; + int check_times = CLEAN_CHECK_TIMES_DEFAULT; + int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; + if (clear_2pc_belong_node) + { + check_times = CLEAN_NODE_CHECK_TIMES; + check_interval = CLEAN_NODE_CHECK_INTERVAL; + } + #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2773,59 +2567,46 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - - /* check whether the 2pc start xid is 0 */ - if (txn->startxid == 0 && IsXidImplicit(txn->gid)) - { - elog(WARNING, "Commit 2PC '%s' start xid is 0", txn->gid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc start xid is still running on start node */ - if (is_txn_start_xid_running(txn)) + /* check whether all nodes can commit prepared */ + for (i = 0; i < check_times; i++) { - elog(WARNING, "Commit 2PC '%s' start xid %d is running", - txn->gid, txn->startxid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc is still running 
on participants */ - is_running = false; + check_ok = true; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, true, true)) { - is_running = true; - elog(WARNING, "Commit 2PC '%s' check failed", txn->gid); + check_ok = false; + elog(LOG, "check commit 2PC transaction %s failed", + txn->gid); } } PG_CATCH(); { - is_running = true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "Commit 2PC '%s' is running, error: %s", + check_ok = false; + elog(WARNING, "check commit 2PC transaction %s error: %s", txn->gid, edata->message); } PG_END_TRY(); - /* 2pc is still running, do not try to clean */ - if (is_running) + if (!check_ok) { txn->op_issuccess = false; return; } + pg_usleep(check_interval); + } + /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; - elog(WARNING, "Commit 2PC '%s' failed", txn->gid); + elog(LOG, "commit 2PC transaction %s failed", txn->gid); return; } txn->op_issuccess = true; @@ -2835,57 +2616,46 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - - /* check whether the 2pc start xid is 0 */ - if (txn->startxid == 0 && IsXidImplicit(txn->gid)) + /* check whether all nodes can rollback prepared */ + for (i = 0; i < check_times; i++) { - elog(WARNING, "Rollback 2PC '%s' start xid is 0", txn->gid); - } - - /* check whether the 2pc start xid is still running on start node */ - if (is_txn_start_xid_running(txn)) - { - elog(WARNING, "Rollback 2PC '%s' start xid %d is running", - txn->gid, txn->startxid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc is still running on participants */ - is_running = false; + check_ok = true; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, false, true)) { - is_running = true; - elog(WARNING, "Rollback 2PC '%s' check failed", txn->gid); + check_ok = false; + elog(LOG, "check rollback 2PC transaction %s failed", + txn->gid); } } PG_CATCH(); { - is_running = true; + check_ok = false; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "Rollback 2PC '%s' is running, error: %s", + elog(WARNING, "check rollback 2PC transaction %s error: %s", txn->gid, edata->message); } PG_END_TRY(); - /* 2pc is still running, do not try to clean */ - if (is_running) + if (!check_ok) { txn->op_issuccess = false; return; } + pg_usleep(check_interval); + } + /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; - elog(WARNING, "Rollback 2PC '%s' failed", txn->gid); + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); return; } txn->op_issuccess = true; @@ -2915,6 +2685,7 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #define TXN_INPROGRESS 0X0020 int ii; int check_flag = 0; + int node_idx = 0; TimestampTz prepared_time = 0; TimestampTz time_gap = clean_time_interval; @@ -2999,84 +2770,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } #endif - - /* start xid is 0, maybe at the beginning of the 2pc */ - if (txn->startxid == 0) - { - /* prepare timestamp must be invalid */ - if (GlobalTimestampIsValid(txn->global_prepare_timestamp)) - { - elog(PANIC, "gid: %s, start xid is 0, global_prepare_timestamp: %ld", - txn->gid, txn->global_prepare_timestamp); - } - - elog(DEBUG2, "2PC '%s' start xid is 0", txn->gid); - - if (check_flag & TXN_INPROGRESS - || current_time - prepared_time <= 
time_gap) - { - /* inprogress or less than time gap, do not clean it */ - elog(LOG, "2PC '%s' start xid is 0, inprogress, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_INPROGRESS; - } - else - { - /* otherwise, abort it */ - elog(WARNING, "2PC '%s' start xid is 0, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_ABORTED; - } - } - - /* use for upgrade from old version, no prepare timestamp in old version */ - if (!GlobalTimestampIsValid(txn->global_prepare_timestamp)) - { - elog(WARNING, "gid: %s, start xid is %d, global_prepare_timestamp " - "is invalid", txn->gid, txn->startxid); - - if (check_flag & TXN_INPROGRESS - || current_time - prepared_time <= time_gap) - { - /* inprogress or less than time gap, do not clean it */ - elog(WARNING, "gid: %s, start xid is %d, inprogress, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, txn->startxid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_INPROGRESS; - } - else - { - /* otherwise, set prepare timestamp */ - if (clear_2pc_belong_node) - { - txn->global_prepare_timestamp = abnormal_gts; - } - else - { - txn->global_prepare_timestamp = current_gts - time_gap; - } - - elog(WARNING, "gid: %s, start xid is %d, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld, " - "set global_prepare_timestamp: %ld", - txn->gid, txn->startxid, current_time, prepared_time, - time_gap, current_time - prepared_time, - txn->global_prepare_timestamp); - } - } - if (clear_2pc_belong_node) { if (!check_2pc_belong_node(txn)) @@ -3089,35 +2782,32 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } - /* abnormal gts must be valid */ - if (!GlobalTimestampIsValid(abnormal_gts)) + node_idx = find_node_index(abnormal_nodeoid); + if (node_idx >= 0) { - elog(PANIC, "gid: %s, abnormal_gts is invalid gts", txn->gid); - } - - /* abnormal gts less than prepare gts, do not clean it */ - if (abnormal_gts < txn->global_prepare_timestamp) + if (abnormal_time < txn->prepare_timestamp[node_idx]) { - elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT - ", prepare gts: " INT64_FORMAT, txn->gid, - abnormal_gts, txn->global_prepare_timestamp); + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, + abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); return TXN_STATUS_INPROGRESS; } - - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + } + else { - /* abnormal gts less than commit gts, do not clean it */ - if (abnormal_gts < txn->global_commit_timestamp) + elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + } + + if (abnormal_time < prepared_time) { - elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT - ", commit gts: " INT64_FORMAT, txn->gid, - abnormal_gts, txn->global_commit_timestamp); + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepared time: " INT64_FORMAT, txn->gid, + abnormal_time, prepared_time); return TXN_STATUS_INPROGRESS; } } - } else { if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) @@ -3125,36 +2815,8 @@ TXN_STATUS check_txn_global_status(txn_info *txn) /* transaction inprogress */ return TXN_STATUS_INPROGRESS; } - - /* current gts 
must be valid */ - if (!GlobalTimestampIsValid(current_gts)) - { - elog(PANIC, "gid: %s, current_gts is invalid gts", txn->gid); - } - - /* 2pc prepare gts gap less than time gap, do not clean it */ - if (current_gts - txn->global_prepare_timestamp < time_gap) - { - elog(LOG, "gid: %s, current gts: " INT64_FORMAT - ", prepare gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, - txn->gid, current_gts, txn->global_prepare_timestamp, time_gap); - - return TXN_STATUS_INPROGRESS; } - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) - { - /* 2pc commit gts gap less than time gap, do not clean it */ - if (current_gts - txn->global_commit_timestamp <= time_gap) - { - elog(LOG, "gid: %s, current gts: " INT64_FORMAT - ", commit gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, - txn->gid, current_gts, txn->global_commit_timestamp, time_gap); - - return TXN_STATUS_INPROGRESS; - } - } - } if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) { @@ -3175,21 +2837,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) if (check_flag & TXN_COMMITTED) /* Some 2PC transactions are committed. Need to commit others. */ return TXN_STATUS_COMMITTED; - - /* If 2PC commit gts is valid, must commit it. */ - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) - { - elog(LOG, "'%s' global_commit_timestamp: %ld", - txn->gid, txn->global_commit_timestamp); - - if (!(check_flag & TXN_PREPARED)) - { - elog(PANIC, "gid: %s, check_flag: %d", txn->gid, check_flag); - } - - return TXN_STATUS_COMMITTED; - } - /* All the transactions remain prepared. No need to recover. */ return TXN_STATUS_ABORTED; } @@ -3254,11 +2901,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -3292,11 +2934,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->coord_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -3324,6 +2961,7 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) } #endif } + } /* receive response */ @@ -3362,14 +3000,10 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) if (txn->origcoord != InvalidOid) { node_idx = find_node_index(txn->origcoord); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (txn->coordparts[node_idx] == 1) { /*send global timestamp to dn_node_list[ii]*/ + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) { get_node_handles(&pgxc_handles, txn->origcoord); @@ -3438,8 +3072,7 @@ bool clean_2PC_files(txn_info * txn) } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", - txn->gid, get_pgxc_nodename(dn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3453,15 +3086,14 @@ bool clean_2PC_files(txn_info * txn) { if (TTSgetvalue(&result, 0, 0) == false) { - elog(LOG, 
"pg_clean: delete 2PC file failed of transaction %s on node %s", + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", txn->gid, get_pgxc_nodename(txn->coordparts[ii])); issuccess = false; } } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", - txn->gid, get_pgxc_nodename(cn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3746,14 +3378,12 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } + bool check_2pc_start_from_node(txn_info *txn) { char node_type; - if (InvalidOid == abnormal_nodeoid) - { - elog(PANIC, "gid: %s, abnormal_nodeoid is invalid", txn->gid); - } + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { @@ -3768,239 +3398,51 @@ bool check_2pc_start_from_node(txn_info *txn) if (InvalidOid == txn->origcoord) { + char *startnode = NULL; int node_oid = InvalidOid; + char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { return true; } - /* Get start node oid from gid */ - node_oid = get_start_node_oid_from_gid(txn->gid); - if (InvalidOid == node_oid) - { - elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); - return false; - } - - elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + Assert(IsXidImplicit(txn->gid)); - if (abnormal_nodeoid == node_oid) + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) { - return true; - } - } - + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); return false; } -/* - * get_start_node_from_gid - * Get start node name from gid - * gid: 2pc gid - */ -char *get_start_node_from_gid(char *gid) - { - char *str_start_node = NULL; - - if (!IsXidImplicit(gid)) - { - elog(WARNING, "2PC '%s' is not implicit", gid); - return NULL; - } - - /* Get start node name from gid */ - str_start_node = strtok(gid, ":"); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return NULL; - } - - str_start_node = strtok(NULL, ":"); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return NULL; - } - - return str_start_node; -} - -/* - * get_start_node_oid_from_gid - * Get start node oid from gid - * gid: 2pc gid - */ -Oid get_start_node_oid_from_gid(char *gid) -{ - Oid start_node_oid = 0; - char *str_start_node = NULL; - char gid_buf[MAX_GID]; - - /* Get start node oid from gid */ - strcpy(gid_buf, gid); - str_start_node = get_start_node_from_gid(gid_buf); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return 0; - } - - elog(LOG, "Get start node(%s) from gid(%s)", str_start_node, gid); - - start_node_oid = get_pgxc_nodeoid(str_start_node); - if (start_node_oid == InvalidOid) - { - elog(WARNING, "Get invalid oid for start node(%s) from gid(%s)", - str_start_node, gid); - return 0; - } - - return start_node_oid; -} - -/* - * get_start_xid_from_gid - * Get start xid from gid - * gid: 2pc gid - */ -uint32 get_start_xid_from_gid(char *gid) -{ - uint32 start_xid = 0; - char *str_start_xid = NULL; - char gid_buf[MAX_GID]; - - if (!IsXidImplicit(gid)) - { - elog(WARNING, "2PC '%s' is not implicit", gid); - return 0; - } - - /* Get start xid from gid */ - strcpy(gid_buf, gid); - str_start_xid = gid_buf + 
strlen(XIDPREFIX); - str_start_xid = strtok(str_start_xid, ":"); - start_xid = strtoul(str_start_xid, NULL, 10); - if (start_xid == 0) - { - elog(WARNING, "Get start xid from gid(%s) failed", gid); - return 0; - } - - return start_xid; -} - -/* - * is_xid_running_on_node - * Whether the transaction with the xid is still running on the node - * xid: transaction id - * node_oid: node oid - */ -bool is_xid_running_on_node(uint32 xid, Oid node_oid) + startnode = strtok(NULL, ":"); + if (NULL == startnode) { - bool is_running = true; - - Datum execute_res; - TupleTableSlots result; - char command[MAX_CMD_LENGTH]; - - if (xid == 0 || node_oid == InvalidOid) - { - elog(PANIC, "2PC xid: %d, node oid: %d", xid, node_oid); - return true; - } - - snprintf(command, MAX_CMD_LENGTH, "select pid::text, backend_xid::text " - "from pg_catalog.pg_stat_activity where backend_xid=%d", xid); - - execute_res = execute_query_on_single_node(node_oid, command, 2, &result); - if (execute_res == (Datum) 1) - { - if (result.slot_count == 0) - { - is_running = false; + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; } - else - { - is_running = true; - if (result.slot_count != 1) + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) { - elog(PANIC, "Get %d resules for xid: %d", result.slot_count, xid); - } - } - } - else - { - elog(WARNING, "pg_clean: Faile to query xid %d on node %s", - xid, get_pgxc_nodename(node_oid)); - is_running = true; + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + return false; } - DropTupleTableSlots(&result); - return is_running; - } + elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); -/* - * is_gid_start_xid_running - * Whether the transaction with the start xid is still running on start node - * gid: 2pc gid - */ -bool is_gid_start_xid_running(char *gid) -{ - uint32 start_xid = 0; - Oid start_node_oid = InvalidOid; - - if (!IsXidImplicit(gid)) + if (abnormal_nodeoid == node_oid) { - elog(LOG, "Explicit 2PC '%s'", gid); return true; } - - /* Get start xid from gid */ - start_xid = get_start_xid_from_gid(gid); - if (start_xid == 0) - { - elog(ERROR, "Get start xid from gid(%s) failed", gid); - return true; } - elog(LOG, "Get start xid(%d) from gid(%s)", start_xid, gid); - - /* Get start node oid from gid */ - start_node_oid = get_start_node_oid_from_gid(gid); - if (start_node_oid == InvalidOid) - { - elog(WARNING, "Get invalid start node oid from gid(%s)", gid); return false; } - - elog(LOG, "Get start node oid(%d) from gid(%s)", start_node_oid, gid); - - return is_xid_running_on_node(start_xid, start_node_oid); -} - -/* - * is_txn_start_xid_running - * Whether the transaction with the start xid is still running on start node - * txn: 2pc transaction info - */ -bool is_txn_start_xid_running(txn_info *txn) -{ - if (txn->startxid != 0) - { - Assert(txn->origcoord != InvalidOid); - return is_xid_running_on_node(txn->startxid, txn->origcoord); - } - - Assert(txn->origcoord == InvalidOid); - - if (!IsXidImplicit(txn->gid)) - { - elog(LOG, "Explicit 2PC '%s' start xid is %d", txn->gid, txn->startxid); - return false; - } - - return is_gid_start_xid_running(txn->gid); -} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index ea188961..e78f9c53 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2392,11 +2392,12 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if 
(!save_and_remove_2pc_info(gxact->gid)) { - elog(DEBUG1, "checkpoint: %s save to file failed", gxact->gid); + elog(LOG, "[%s] %s save to file failed", + __FUNCTION__, gxact->gid); } else { - elog(LOG, "checkpoint: %s is saved to file", gxact->gid); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); } } #endif @@ -3740,12 +3741,10 @@ void record_2pc_involved_nodes_xid(const char * tid, File fd = 0; int ret = 0; int size = 0; - int pg_clean_check_size = 0; StringInfoData content; struct stat fst; char path[MAXPGPATH]; char *result = NULL; - GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; @@ -3756,18 +3755,6 @@ void record_2pc_involved_nodes_xid(const char * tid, return; } - prepare_gts = GetGlobalPrepareTimestamp(); - if (!GlobalTimestampIsValid(prepare_gts)) - { - elog(WARNING, "prepare gts is invalid"); - prepare_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(prepare_gts)) - { - elog(ERROR, "get gts for prepare is invalid"); - } - SetGlobalPrepareTimestamp(prepare_gts); - } - if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", @@ -3793,10 +3780,6 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "startxid:%u\n", startxid); appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); - pg_clean_check_size = content.len; - Assert(pg_clean_check_size == strlen(content.data)); - - appendStringInfo(&content, "global_prepare_timestamp:%ld\n", prepare_gts); size = content.len; Assert(size == strlen(content.data)); @@ -3815,10 +3798,11 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(strlen(info) < MAX_2PC_INFO_SIZE); check_2pc_file(tid, info, __FUNCTION__); - if (pg_strncasecmp(info, content.data, pg_clean_check_size) != 0) + if (strncmp(info, content.data, size) != 0) { - elog(ERROR, "pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", tid, content.data, info); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", __FUNCTION__, tid, + content.data, info); } resetStringInfo(&content); @@ -3852,10 +3836,11 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(NULL != result); - if (pg_strncasecmp(result, content.data, pg_clean_check_size) != 0) + if (strncmp(result, content.data, size) != 0) { - elog(ERROR, "pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", tid, content.data, result); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + __FUNCTION__, tid, content.data, result); } pfree(result); @@ -3868,16 +3853,12 @@ void record_2pc_involved_nodes_xid(const char * tid, if (!RecoveryInProgress()) { - char *fmt_v2 = XLOG_FMT_2PC_V2; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)fmt_v2, strlen(fmt_v2) + 1); XLogRegisterData((char *)startnode, strlen(startnode) + 1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId)); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId)); - XLogRegisterData((char *)&prepare_gts, sizeof(GlobalTimestamp)); - + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); #ifdef __TWO_PHASE_TESTS__ xlogrec = #endif @@ -3992,7 +3973,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp 
commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -4197,7 +4178,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); XLogRegisterData((char *)type, strlen(type) + 1); - XLogRegisterData((char *)×tamp, sizeof(TimestampTz)); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } @@ -4407,7 +4388,7 @@ char *get_2pc_list_from_cache(int *count) { recordList = (char *) repalloc(recordList, strlen(entry->key) + strlen(recordList) + 2); - sprintf(recordList + strlen(recordList), ",%s", entry->key); + sprintf(recordList, "%s,%s", recordList, entry->key); } if (++(*count) >= MAX_OUTPUT_FILE) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 99cc62f3..bf528c0d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10177,10 +10177,10 @@ xlog_redo(XLogReaderState *record) TimestampTz timestamp = 0; gid = XLogRecGetData(record); type = gid + strlen(gid) + 1; - if (0 == strcmp(type, "rename")) - { pos = type + strlen(type) + 1; memcpy(×tamp, pos, sizeof(TimestampTz)); + if (0 == strcmp(type, "rename")) + { rename_2pc_records(gid, timestamp); } else @@ -10192,13 +10192,11 @@ xlog_redo(XLogReaderState *record) { TransactionId xid; TransactionId startxid; - GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; - char *fmt_v2 = XLOG_FMT_2PC_V2; char *gid; char *startnode; char *nodestring; char *pos; - char *type; + char *temp; #ifdef __TWO_PHASE_TESTS__ TransactionId old_shem_nextxid = ShmemVariableCache->nextXid; #endif @@ -10206,48 +10204,27 @@ xlog_redo(XLogReaderState *record) gid = XLogRecGetData(record); pos = gid + strlen(gid) +1; /* if the transaction is readonly */ - type = pos; - pos = pos + strlen(type) + 1; + temp = pos; + pos = pos + strlen(temp) + 1; - if (0 != strcmp(type, "readonly")) + if (0 != strcmp(temp, "readonly")) { - if (0 == strcmp(type, fmt_v2)) - { - startnode = pos; - pos = pos + strlen(startnode) + 1; - memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId); - nodestring = pos; - pos = pos + strlen(nodestring) + 1; - memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId); - memcpy(&prepare_gts, pos, sizeof(GlobalTimestamp)); - pos = pos + sizeof(GlobalTimestamp); - } - else - { - /* compatible with old format */ - startnode = type; + startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; - } - if (enable_distri_print) { elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " - "startxid: %u, prepare_gts: %ld, nodestring: %s, xid: %u", - gid, startnode, startxid, prepare_gts, nodestring, xid); + "startxid: %u, nodestring: %s, xid: %u", + gid, startnode, startxid, nodestring, xid); } - #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) { elog(LOG, "FILE_XLOG_EXISTED complish"); - SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, 
startxid, nodestring, xid); } #endif @@ -10271,7 +10248,6 @@ xlog_redo(XLogReaderState *record) LWLockRelease(XidGenLock); } - SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } else diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 7ad4a8be..c3ee221a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3986,10 +3986,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "prepare remote transaction xid %d gid %s", - GetTopTransactionIdIfAny(), prepareGID); + elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); @@ -3999,19 +4000,17 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) global_prepare_ts = 0; } #endif - - if (!GlobalTimestampIsValid(global_prepare_ts)) - { + if(!GlobalTimestampIsValid(global_prepare_ts)){ ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("failed to get global timestamp for PREPARED command"))); } if(enable_distri_print) { - elog(LOG, "prepare phase get global prepare timestamp gid %s, time " - INT64_FORMAT, prepareGID, global_prepare_ts); + elog(LOG, "prepare phase get global prepare timestamp gid %s, time " INT64_FORMAT, prepareGID, global_prepare_ts); } SetGlobalPrepareTimestamp(global_prepare_ts); + } #endif #ifdef __TWO_PHASE_TRANS__ @@ -4106,18 +4105,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT, GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif /* Send down prepare command */ @@ -4151,10 +4151,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT, GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4168,8 +4169,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif @@ -4308,18 +4309,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (conn->read_only) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " 
INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif /* Send down prepare command */ @@ -4350,10 +4352,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4367,8 +4370,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index c1e3a31f..def81c95 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -17,7 +17,6 @@ #include "postgres.h" #include "access/htup_details.h" -#include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -59,7 +58,7 @@ typedef enum bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 60; -int auto_clean_2pc_delay = 60; +int auto_clean_2pc_delay = 300; int auto_clean_2pc_timeout = 1200; int auto_clean_2pc_max_check_time = 1200; @@ -89,8 +88,6 @@ static void start_clean_worker(int count); static void do_query_2pc(TimestampTz clean_time); static void do_clean_2pc(TimestampTz clean_time); -static bool check_pg_clean_extension(void); - static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); @@ -435,12 +432,6 @@ do_query_2pc(TimestampTz clean_time) Assert(result_str != NULL); resetStringInfo(result_str); - if (!check_pg_clean_extension()) - { - elog(WARNING, "create extension pg_clean please"); - return; - } - check_time = (curr_time - clean_time)/USECS_PER_SEC; if (check_time < 0) @@ -695,40 +686,6 @@ do_clean_2pc(TimestampTz clean_time) } } -/* - * check if pg_clean_check_txn funciton exist - */ -static bool -check_pg_clean_extension(void) -{ - bool res = false; - List *names = NULL; - FuncCandidateList clist = NULL; - char *fuc_name = "pg_clean_check_txn"; - - StartTransactionCommand(); - - /* - * Parse the name into components and see if it matches any pg_proc - * entries in the current search path. 
- */ - names = list_make1(makeString(fuc_name)); - clist = FuncnameGetCandidates(names, -1, NIL, false, false, true); - - if (clist == NULL || clist->next != NULL) - { - res = false; - } - else - { - res = true; - } - - CommitTransactionCommand(); - - return res; -} - /* SIGTERM: set flag to exit normally */ static void clean_2pc_sigterm_handler(SIGNAL_ARGS) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3f2e046a..8b7af537 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4887,11 +4887,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, -#ifdef __TWO_PHASE_TESTS__ - 60, 0, INT_MAX, -#else - 60, 30, INT_MAX, -#endif + 60, 10, INT_MAX, NULL, NULL, NULL }, @@ -4902,11 +4898,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, -#ifdef __TWO_PHASE_TESTS__ - 60, 0, INT_MAX, -#else - 60, 30, INT_MAX, -#endif + 300, 3, INT_MAX, NULL, NULL, NULL }, @@ -4917,11 +4909,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, -#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, -#else - 1200, 30, INT_MAX, -#endif NULL, NULL, NULL }, @@ -4932,11 +4920,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, -#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, -#else - 1200, 30, INT_MAX, -#endif NULL, NULL, NULL }, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 132f19d8..06f9685e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -81,10 +81,6 @@ #include "gtm/gtm_c.h" #define GIDSIZE (200 + 24) - -/* 2pc xlog v2 add prepare timestamp */ -#define XLOG_FMT_2PC_V2 "fmt_v2" - /* * GlobalTransactionData is defined in twophase.c; other places have no * business knowing the internal definition. 
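The guc.c hunk above also retunes the 2PC auto-clean settings: auto_clean_2pc_delay now defaults to 300 s, and the lower bounds are relaxed (interval >= 10 s, delay >= 3 s, timeout and max_check_time >= 0). A minimal tuning sketch follows; the parameter names and bounds come from the patch, while the chosen values and the assumption that the settings are reloadable (not postmaster-only) are illustrative only:

-- Hedged sketch: adjust the auto-clean parameters changed in the guc.c hunk above.
-- Values are examples, not recommendations; we assume these GUCs accept reload.
ALTER SYSTEM SET auto_clean_2pc_interval = 60;        -- seconds between scans (minimum now 10)
ALTER SYSTEM SET auto_clean_2pc_delay = 300;          -- new default: wait 300 s before cleaning (minimum now 3)
ALTER SYSTEM SET auto_clean_2pc_timeout = 1200;
ALTER SYSTEM SET auto_clean_2pc_max_check_time = 1200;
SELECT pg_reload_conf();
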
From c3495ad823068e6ec6f8f39fa05f5c3948083b62 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 16:15:29 +0800 Subject: [PATCH 548/578] pull up to cn if has user defined functions of plpgsql code sync from 5.06.1.1 Author: arrowbowang --- src/backend/commands/functioncmds.c | 32 +++++++++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/optimizer/util/clauses.c | 39 +++++++++++++- src/backend/optimizer/util/pathnode.c | 38 +++++++------- src/backend/parser/analyze.c | 1 + src/backend/parser/gram.y | 11 +++- src/backend/parser/parse_func.c | 34 ++++++++++++ src/backend/utils/adt/ruleutils.c | 2 + src/backend/utils/cache/lsyscache.c | 36 +++++++++++++ src/bin/pg_dump/pg_dump.c | 15 ++++++ src/include/catalog/pg_proc.h | 2 +- src/include/nodes/parsenodes.h | 1 + src/include/nodes/relation.h | 2 + src/include/optimizer/clauses.h | 1 + src/include/parser/kwlist.h | 1 + src/include/parser/parse_func.h | 58 +++++++++++---------- src/include/parser/parse_node.h | 1 + src/include/utils/lsyscache.h | 2 + src/test/regress/expected/privileges.out | 2 +- src/test/regress/expected/rowsecurity_1.out | 2 +- src/test/regress/expected/select_views.out | 2 +- src/test/regress/expected/union_1.out | 34 ++++++++++-- src/test/regress/sql/privileges.sql | 2 +- src/test/regress/sql/rowsecurity.sql | 2 +- src/test/regress/sql/select_views.sql | 2 +- src/test/regress/sql/union.sql | 11 +++- 26 files changed, 273 insertions(+), 61 deletions(-) diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c index a9bba8e3..a0cd8d3d 100644 --- a/src/backend/commands/functioncmds.c +++ b/src/backend/commands/functioncmds.c @@ -456,6 +456,7 @@ compute_common_attribute(ParseState *pstate, DefElem **strict_item, DefElem **security_item, DefElem **leakproof_item, + DefElem **pushdow_item, List **set_items, DefElem **cost_item, DefElem **rows_item, @@ -489,6 +490,13 @@ compute_common_attribute(ParseState *pstate, *leakproof_item = defel; } + else if (strcmp(defel->defname, "pushdown") == 0) + { + if (*pushdow_item) + goto duplicate_error; + + *pushdow_item = defel; + } else if (strcmp(defel->defname, "set") == 0) { *set_items = lappend(*set_items, defel->arg); @@ -612,6 +620,7 @@ compute_attributes_sql_style(ParseState *pstate, bool *strict_p, bool *security_definer, bool *leakproof_p, + bool *pushable_p, ArrayType **proconfig, float4 *procost, float4 *prorows, @@ -626,6 +635,7 @@ compute_attributes_sql_style(ParseState *pstate, DefElem *strict_item = NULL; DefElem *security_item = NULL; DefElem *leakproof_item = NULL; + DefElem *pushdown_item = NULL; List *set_items = NIL; DefElem *cost_item = NULL; DefElem *rows_item = NULL; @@ -677,6 +687,7 @@ compute_attributes_sql_style(ParseState *pstate, &strict_item, &security_item, &leakproof_item, + &pushdown_item, &set_items, &cost_item, &rows_item, @@ -724,6 +735,8 @@ compute_attributes_sql_style(ParseState *pstate, *security_definer = intVal(security_item->arg); if (leakproof_item) *leakproof_p = intVal(leakproof_item->arg); + if (pushdown_item) + *pushable_p = intVal(pushdown_item->arg); if (set_items) *proconfig = update_proconfig_value(NULL, set_items); if (cost_item) @@ -883,7 +896,8 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) bool isWindowFunc, isStrict, security, - isLeakProof; + isLeakProof, + isPushdown; char volatility; ArrayType *proconfig; float4 procost; @@ -908,6 +922,7 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) isStrict = false; security = false; isLeakProof = false; + isPushdown = 
false; volatility = PROVOLATILE_VOLATILE; proconfig = NULL; procost = -1; /* indicates not set */ @@ -919,7 +934,7 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) stmt->options, &as_clause, &language, &transformDefElem, &isWindowFunc, &volatility, - &isStrict, &security, &isLeakProof, + &isStrict, &security, &isLeakProof, &isPushdown, &proconfig, &procost, &prorows, ¶llel); /* Look up the language and validate permissions */ @@ -1064,6 +1079,9 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) else procost = 100; } + if(isPushdown) + procost = -procost; + if (prorows < 0) { if (returnsSet) @@ -1174,6 +1192,7 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) DefElem *strict_item = NULL; DefElem *security_def_item = NULL; DefElem *leakproof_item = NULL; + DefElem *pushdown_item = NULL; List *set_items = NIL; DefElem *cost_item = NULL; DefElem *rows_item = NULL; @@ -1212,6 +1231,7 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) &strict_item, &security_def_item, &leakproof_item, + &pushdown_item, &set_items, &cost_item, &rows_item, @@ -1241,6 +1261,14 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("COST must be positive"))); } + if (pushdown_item) + { + bool pushdown = intVal(pushdown_item->arg); + if (pushdown && procForm->procost > 0) + procForm->procost = -procForm->procost; + if ((!pushdown) && procForm->procost < 0) + procForm->procost = -procForm->procost; + } if (rows_item) { procForm->prorows = defGetNumeric(rows_item); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 5bc4e05c..e2e6b7fc 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3260,6 +3260,7 @@ _copyQuery(const Query *from) COPY_SCALAR_FIELD(isSingleValues); COPY_SCALAR_FIELD(isMultiValues); COPY_SCALAR_FIELD(hasUnshippableTriggers); + COPY_SCALAR_FIELD(hasCoordFuncs); COPY_STRING_FIELD(copy_filename); #endif COPY_NODE_FIELD(cteList); diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index ef96602f..f2c9cc1e 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5367,4 +5367,41 @@ replace_eval_sql_value_function(Node *node) return expression_tree_mutator(node, replace_eval_sql_value_function, NULL); } -#endif \ No newline at end of file +#endif +/***************************************************************************** + * Check clauses for pull-up-ed user defined functions + *****************************************************************************/ + +static bool +contain_user_defined_functions_checker(Oid func_id, void *context) +{ + return func_is_pullup(func_id); +} + +static bool +contain_check_functions_walker(Node *node, bool (*checker)()) +{ + if (node == NULL) + return false; + + if (check_functions_in_node(node, checker, + NULL)) + return true; + + /* Recurse to check arguments */ + if (IsA(node, Query)) + { + /* Recurse into subselects */ + return query_tree_walker((Query *) node, + contain_check_functions_walker, + checker, 0); + } + return expression_tree_walker(node, contain_check_functions_walker, + checker); +} + +bool +contain_user_defined_functions(Node *clause) +{ + return contain_check_functions_walker(clause, &contain_user_defined_functions_checker); +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 5c1a3ca5..f3d1adb2 100644 --- a/src/backend/optimizer/util/pathnode.c +++ 
b/src/backend/optimizer/util/pathnode.c @@ -1657,6 +1657,26 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd == NULL && outerd == NULL) return NIL; #ifdef __TBASE__ + /* + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. + */ + if(!dml && + (!prefer_olap || + (root->parse && + root->parse->hasCoordFuncs))) + { + goto pull_up; + } /* * If outer or inner subpaths are distributed by shard and they do not exist @@ -1785,24 +1805,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } - /* - * DML may need to push down to datanodes, for example: - * DELETE FROM - * geocode_settings as gc - * USING geocode_settings_default AS gf - * WHERE - * gf.name = gc.name and gf.setting = gc.setting; - * prefer_olap means pulling query up to coordinator node, in case data - * re-distribute in TPC-C test case. - * - * TODO: We need to automatically determine whether we need to pull it up, - * but not using GUC. - */ - if(!prefer_olap && false == dml) - { - goto pull_up; - } - restrictClauses = list_copy(pathnode->joinrestrictinfo); restrictClauses = list_concat(restrictClauses, pathnode->movedrestrictinfo); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index f5a26ad3..c68d7b06 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -447,6 +447,7 @@ transformStmt(ParseState *pstate, Node *parseTree) /* Mark as original query until we learn differently */ result->querySource = QSRC_ORIGINAL; result->canSetTag = true; + result->hasCoordFuncs = pstate->p_hasCoordFuncs; return result; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index dad866bf..756b4bad 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -720,7 +720,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); PARALLEL PARSER PARTIAL PARTITION PARTITIONS PASSING PASSWORD PAUSE PLACING PLANS POLICY POSITION PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY - PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION + PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION PUSHDOWN QUOTE @@ -8287,6 +8287,14 @@ common_func_opt_item: { $$ = makeDefElem("leakproof", (Node *)makeInteger(FALSE), @1); } + | PUSHDOWN + { + $$ = makeDefElem("pushdown", (Node *)makeInteger(TRUE), @1); + } + | NOT PUSHDOWN + { + $$ = makeDefElem("pushdown", (Node *)makeInteger(FALSE), @1); + } | COST NumericOnly { $$ = makeDefElem("cost", (Node *)$2, @1); @@ -16853,6 +16861,7 @@ unreserved_keyword: | PROCEDURE | PROGRAM | PUBLICATION + | PUSHDOWN | QUOTE /* PGXC_BEGIN */ | RANDOMLY diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 8778a4b7..ac9fc9c0 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -18,10 +18,12 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "commands/proclang.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" #include "parser/parse_agg.h" #include "parser/parse_clause.h" #include 
"parser/parse_coerce.h" @@ -253,6 +255,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, cancel_parser_errposition_callback(&pcbstate); + pstate->p_hasCoordFuncs = func_is_pullup(funcid); + if (fdresult == FUNCDETAIL_COERCION) { /* @@ -2257,3 +2261,33 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) ParseExprKindName(pstate->p_expr_kind)), parser_errposition(pstate, location))); } + +bool +func_is_pullup(Oid func_id) +{ + char *name = NULL; + if (func_id >= FirstNormalObjectId) + { + Oid func_lang_oid; + Oid plpgsql_oid; + float cost; + + /* + * A set returning function is not supposed to be in targetlist + * so ignore it. + */ + if (get_func_retset(func_id)) + return false; + + /* A stable function surely can be pushed down to DN */ + if (func_volatile(func_id) == PROVOLATILE_STABLE) + return false; + + func_lang_oid = get_func_lang(func_id); + plpgsql_oid = get_language_oid("plpgsql", true); + cost = get_func_cost_with_sign(func_id); + if (func_lang_oid == plpgsql_oid && cost >= 0) + return true; + } + return false; +} diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 7d16c0f6..e7a6bdd9 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2613,6 +2613,8 @@ pg_get_functiondef(PG_FUNCTION_ARGS) appendStringInfoString(&buf, " SECURITY DEFINER"); if (proc->proleakproof) appendStringInfoString(&buf, " LEAKPROOF"); + if (proc->procost < 0) + appendStringInfoString(&buf, " PUSHDOWN"); /* This code for the default cost and rows should match functioncmds.c */ if (proc->prolang == INTERNALlanguageId || diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 9061c0ed..33005a7e 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1766,6 +1766,23 @@ get_func_cost(Oid funcid) elog(ERROR, "cache lookup failed for function %u", funcid); result = ((Form_pg_proc) GETSTRUCT(tp))->procost; + if (result < 0) + result = -result; + ReleaseSysCache(tp); + return result; +} + +float4 +get_func_cost_with_sign(Oid funcid) +{ + HeapTuple tp; + float4 result; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + + result = ((Form_pg_proc) GETSTRUCT(tp))->procost; ReleaseSysCache(tp); return result; } @@ -1789,6 +1806,25 @@ get_func_rows(Oid funcid) return result; } +Oid +get_func_lang(Oid funcid) +{ + HeapTuple tp; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_proc functup = (Form_pg_proc) GETSTRUCT(tp); + Oid result; + + result = functup->prolang; + ReleaseSysCache(tp); + return result; + } + else + return InvalidOid; +} + /* ---------- RELATION CACHE ---------- */ /* diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 89685c9f..f987ae1d 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -12020,6 +12020,21 @@ dumpFunc(Archive *fout, FuncInfo *finfo) * break backwards-compatibility of the dump without need. Keep this code * in sync with the defaults in functioncmds.c. 
*/ + if(procost[0] == '-') + { + char* temp; + int len; + + appendPQExpBufferStr(q, " PUSHDOWN"); + len = strlen(procost); + temp = pg_malloc(len + 1); + strcpy(temp, procost+1); + temp[len-1] = '\0'; + strcpy(procost, temp); + procost[len-1] = 0; + pg_free(temp); + temp = NULL; + } if (strcmp(procost, "0") != 0) { if (strcmp(lanname, "internal") == 0 || strcmp(lanname, "c") == 0) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8f79ca30..1eb1b97f 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -100,7 +100,7 @@ CATALOG(pg_proc,1255) BKI_BOOTSTRAP BKI_ROWTYPE_OID(81) BKI_SCHEMA_MACRO Oid pronamespace; /* OID of namespace containing this proc */ Oid proowner; /* procedure owner */ Oid prolang; /* OID of pg_language entry */ - float4 procost; /* estimated execution cost */ + float4 procost; /* estimated execution cost, the negtive number means the function can be pushed down*/ float4 prorows; /* estimated # of rows out (if proretset) */ Oid provariadic; /* element type of variadic array, or 0 */ regproc protransform; /* transforms calls to it during planning */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index e8ac3d54..f6e83887 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -162,6 +162,7 @@ typedef struct Query * only used for DML. Will be set at the plan phase * in shippability check. */ + bool hasCoordFuncs; char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 689a392f..0736ab79 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -225,6 +225,8 @@ typedef struct PlannerGlobal bool parallelModeNeeded; /* parallel mode actually required? 
*/ + bool hasCoordFuncs; + char maxParallelHazard; /* worst PROPARALLEL hazard level */ } PlannerGlobal; diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index e55c6033..52918fd6 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -95,4 +95,5 @@ extern Node *replace_distribkey_func(Node *node); extern Node *replace_eval_sql_value_function(Node *node); +extern bool contain_user_defined_functions(Node *clause); #endif /* CLAUSES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index d77a2e68..8d54cc60 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -382,6 +382,7 @@ PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD) PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD) PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD) PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD) +PG_KEYWORD("pushdown", PUSHDOWN, UNRESERVED_KEYWORD) PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD) #ifdef PGXC PG_KEYWORD("randomly", RANDOMLY, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_func.h b/src/include/parser/parse_func.h index 4b8697fe..c6199410 100644 --- a/src/include/parser/parse_func.h +++ b/src/include/parser/parse_func.h @@ -21,54 +21,56 @@ /* Result codes for func_get_detail */ typedef enum { - FUNCDETAIL_NOTFOUND, /* no matching function */ - FUNCDETAIL_MULTIPLE, /* too many matching functions */ - FUNCDETAIL_NORMAL, /* found a matching regular function */ - FUNCDETAIL_AGGREGATE, /* found a matching aggregate function */ - FUNCDETAIL_WINDOWFUNC, /* found a matching window function */ - FUNCDETAIL_COERCION /* it's a type coercion request */ + FUNCDETAIL_NOTFOUND, /* no matching function */ + FUNCDETAIL_MULTIPLE, /* too many matching functions */ + FUNCDETAIL_NORMAL, /* found a matching regular function */ + FUNCDETAIL_AGGREGATE, /* found a matching aggregate function */ + FUNCDETAIL_WINDOWFUNC, /* found a matching window function */ + FUNCDETAIL_COERCION /* it's a type coercion request */ } FuncDetailCode; extern Node *ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, - Node *last_srf, FuncCall *fn, int location); + Node *last_srf, FuncCall *fn, int location); extern FuncDetailCode func_get_detail(List *funcname, - List *fargs, List *fargnames, - int nargs, Oid *argtypes, - bool expand_variadic, bool expand_defaults, - Oid *funcid, Oid *rettype, - bool *retset, int *nvargs, Oid *vatype, - Oid **true_typeids, List **argdefaults); + List *fargs, List *fargnames, + int nargs, Oid *argtypes, + bool expand_variadic, bool expand_defaults, + Oid *funcid, Oid *rettype, + bool *retset, int *nvargs, Oid *vatype, + Oid **true_typeids, List **argdefaults); extern int func_match_argtypes(int nargs, - Oid *input_typeids, - FuncCandidateList raw_candidates, - FuncCandidateList *candidates); + Oid *input_typeids, + FuncCandidateList raw_candidates, + FuncCandidateList *candidates); extern FuncCandidateList func_select_candidate(int nargs, - Oid *input_typeids, - FuncCandidateList candidates); + Oid *input_typeids, + FuncCandidateList candidates); extern void make_fn_arguments(ParseState *pstate, - List *fargs, - Oid *actual_arg_types, - Oid *declared_arg_types); + List *fargs, + Oid *actual_arg_types, + Oid *declared_arg_types); extern const char *funcname_signature_string(const char *funcname, int nargs, - List *argnames, const Oid *argtypes); + List *argnames, const Oid *argtypes); extern const char *func_signature_string(List *funcname, int nargs, - List 
*argnames, const Oid *argtypes); + List *argnames, const Oid *argtypes); extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes, - bool noError); + bool noError); extern Oid LookupFuncWithArgs(ObjectWithArgs *func, - bool noError); + bool noError); extern Oid LookupAggWithArgs(ObjectWithArgs *agg, - bool noError); + bool noError); extern void check_srf_call_placement(ParseState *pstate, Node *last_srf, - int location); + int location); extern void check_pg_get_expr_args(ParseState *pstate, Oid fnoid, List *args); -#endif /* PARSE_FUNC_H */ + +extern bool func_is_pullup(Oid func_id); +#endif /* PARSE_FUNC_H */ diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 0f0490d6..5ae643ce 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -260,6 +260,7 @@ struct ParseState bool p_hasTargetSRFs; bool p_hasSubLinks; bool p_hasModifyingCTE; + bool p_hasCoordFuncs; Node *p_last_srf; /* most recent set-returning func/op found */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index e94c510b..6ad2a50f 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -132,7 +132,9 @@ extern char func_volatile(Oid funcid); extern char func_parallel(Oid funcid); extern bool get_func_leakproof(Oid funcid); extern float4 get_func_cost(Oid funcid); +extern float4 get_func_cost_with_sign(Oid funcid); extern float4 get_func_rows(Oid funcid); +extern Oid get_func_lang(Oid funcid); extern Oid get_relname_relid(const char *relname, Oid relnamespace); #ifdef PGXC extern int get_relnatts(Oid relid); diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index ccf6aba3..d7454603 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -194,7 +194,7 @@ CREATE INDEX ON atest12 (abs(a)); VACUUM ANALYZE atest12; CREATE FUNCTION leak(integer,integer) RETURNS boolean AS $$begin return $1 < $2; end$$ - LANGUAGE plpgsql immutable; + LANGUAGE plpgsql immutable pushdown; CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer, restrict = scalarltsel); -- view with leaky operator diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 770c320f..2370da75 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -28,7 +28,7 @@ GRANT ALL ON SCHEMA regress_rls_schema to public; SET search_path = regress_rls_schema; -- setup of malicious function CREATE OR REPLACE FUNCTION f_leak(text) RETURNS bool - COST 0.0000001 LANGUAGE plpgsql + COST 0.0000001 LANGUAGE plpgsql pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; GRANT EXECUTE ON FUNCTION f_leak(text) TO public; -- BASIC Row-Level Security Scenario diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 2406dabc..9abe04ed 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1252,7 +1252,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- CREATE ROLE regress_alice; CREATE FUNCTION f_leak (text) - RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 + RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; CREATE TABLE customer ( cid int primary key, diff --git a/src/test/regress/expected/union_1.out b/src/test/regress/expected/union_1.out index 41c0c7fa..08670b47 
100644 --- a/src/test/regress/expected/union_1.out +++ b/src/test/regress/expected/union_1.out @@ -822,15 +822,15 @@ ORDER BY x; -- Test proper handling of parameterized appendrel paths when the -- potential join qual is expensive create function expensivefunc(int) returns int -language plpgsql immutable strict cost 10000 +language plpgsql immutable strict pushdown cost 10000 as $$begin return $1; end$$; create temp table t3 as select generate_series(-1000,1000) as x; create index t3i on t3 (expensivefunc(x)); analyze t3; explain (num_nodes off, nodes off, costs off) select * from - (select * from t3 a union all select * from t3 b) ss - join int4_tbl on f1 = expensivefunc(x); + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); QUERY PLAN ------------------------------------------------------------------ Remote Subquery Scan on all @@ -843,6 +843,34 @@ select * from Index Cond: (expensivefunc(x) = int4_tbl.f1) (8 rows) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + x | f1 +---+---- + 0 | 0 + 0 | 0 +(2 rows) + +alter function expensivefunc not pushdown; +explain (num_nodes off, nodes off, costs off) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + QUERY PLAN +------------------------------------------------------------------------ + Nested Loop + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl + -> Materialize + -> Remote Subquery Scan on all + -> Append + -> Index Scan using t3i on t3 a + Index Cond: (expensivefunc(x) = int4_tbl.f1) + -> Index Scan using t3i on t3 b + Index Cond: (expensivefunc(x) = int4_tbl.f1) +(10 rows) + select * from (select * from t3 a union all select * from t3 b) ss join int4_tbl on f1 = expensivefunc(x); diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 09122394..a6b92c2f 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -140,7 +140,7 @@ VACUUM ANALYZE atest12; CREATE FUNCTION leak(integer,integer) RETURNS boolean AS $$begin return $1 < $2; end$$ - LANGUAGE plpgsql immutable; + LANGUAGE plpgsql immutable pushdown; CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer, restrict = scalarltsel); diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 4ed98e68..9e5609e9 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -37,7 +37,7 @@ SET search_path = regress_rls_schema; -- setup of malicious function CREATE OR REPLACE FUNCTION f_leak(text) RETURNS bool - COST 0.0000001 LANGUAGE plpgsql + COST 0.0000001 LANGUAGE plpgsql pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; GRANT EXECUTE ON FUNCTION f_leak(text) TO public; diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index 1b175469..a6820358 100644 --- a/src/test/regress/sql/select_views.sql +++ b/src/test/regress/sql/select_views.sql @@ -15,7 +15,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; CREATE ROLE regress_alice; CREATE FUNCTION f_leak (text) - RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 + RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; CREATE TABLE customer ( diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql index bf51c9a5..58bc43a7 100644 --- 
a/src/test/regress/sql/union.sql +++ b/src/test/regress/sql/union.sql @@ -341,13 +341,22 @@ ORDER BY x; -- Test proper handling of parameterized appendrel paths when the -- potential join qual is expensive create function expensivefunc(int) returns int -language plpgsql immutable strict cost 10000 +language plpgsql immutable strict pushdown cost 10000 as $$begin return $1; end$$; create temp table t3 as select generate_series(-1000,1000) as x; create index t3i on t3 (expensivefunc(x)); analyze t3; +explain (num_nodes off, nodes off, costs off) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + +alter function expensivefunc not pushdown; explain (num_nodes off, nodes off, costs off) select * from (select * from t3 a union all select * from t3 b) ss From 22758055448aed22b9189d14646930edbec74da5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 1 Dec 2021 12:03:54 +0800 Subject: [PATCH 549/578] Support for creating a Result node to do specific qualifications Before this, a Result node can only do simple qualification like "Select 1 < 2; --true" or acting as a "gating" node, see create_gating_plan. This commit add a QualPath and create_qual_path to eventually create a Result node that able to do like plan: Result Filter(a < b) -> Remote Subquery Scan on all: -> Seqscan on t output: a, b It seems useless that qualification can performed just after scanning a. But it comes helpful if we have something computed AFTER a remote subplan collecting tuples, such as "rownum" expr and UDF that processed on CN. --- src/backend/executor/nodeResult.c | 409 +++++++++++++----------- src/backend/nodes/outfuncs.c | 14 + src/backend/optimizer/plan/createplan.c | 56 +++- src/backend/optimizer/util/pathnode.c | 28 ++ src/include/nodes/execnodes.h | 1 + src/include/nodes/nodes.h | 1 + src/include/nodes/relation.h | 7 + src/include/optimizer/pathnode.h | 1 + 8 files changed, 308 insertions(+), 209 deletions(-) diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c index 905e4f1f..0269d6d3 100644 --- a/src/backend/executor/nodeResult.c +++ b/src/backend/executor/nodeResult.c @@ -1,44 +1,44 @@ /*------------------------------------------------------------------------- * * nodeResult.c - * support for constant nodes needing special code. + * support for constant nodes needing special code. * * DESCRIPTION * - * Result nodes are used in queries where no relations are scanned. - * Examples of such queries are: + * Result nodes are used in queries where no relations are scanned. + * Examples of such queries are: * - * select 1 * 2 + * select 1 * 2 * - * insert into emp values ('mike', 15000) + * insert into emp values ('mike', 15000) * - * (Remember that in an INSERT or UPDATE, we need a plan tree that - * generates the new rows.) + * (Remember that in an INSERT or UPDATE, we need a plan tree that + * generates the new rows.) 
* - * Result nodes are also used to optimise queries with constant - * qualifications (ie, quals that do not depend on the scanned data), - * such as: + * Result nodes are also used to optimise queries with constant + * qualifications (ie, quals that do not depend on the scanned data), + * such as: * - * select * from emp where 2 > 1 + * select * from emp where 2 > 1 * - * In this case, the plan generated is + * In this case, the plan generated is * - * Result (with 2 > 1 qual) - * / - * SeqScan (emp.*) + * Result (with 2 > 1 qual) + * / + * SeqScan (emp.*) * - * At runtime, the Result node evaluates the constant qual once, - * which is shown by EXPLAIN as a One-Time Filter. If it's - * false, we can return an empty result set without running the - * controlled plan at all. If it's true, we run the controlled - * plan normally and pass back the results. + * At runtime, the Result node evaluates the constant qual once, + * which is shown by EXPLAIN as a One-Time Filter. If it's + * false, we can return an empty result set without running the + * controlled plan at all. If it's true, we run the controlled + * plan normally and pass back the results. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/executor/nodeResult.c + * src/backend/executor/nodeResult.c * *------------------------------------------------------------------------- */ @@ -48,230 +48,249 @@ #include "executor/executor.h" #include "executor/nodeResult.h" #include "miscadmin.h" +#include "optimizer/clauses.h" #include "utils/memutils.h" /* ---------------------------------------------------------------- - * ExecResult(node) + * ExecResult(node) * - * returns the tuples from the outer plan which satisfy the - * qualification clause. Since result nodes with right - * subtrees are never planned, we ignore the right subtree - * entirely (for now).. -cim 10/7/89 + * returns the tuples from the outer plan which satisfy the + * qualification clause. Since result nodes with right + * subtrees are never planned, we ignore the right subtree + * entirely (for now).. -cim 10/7/89 * - * The qualification containing only constant clauses are - * checked first before any processing is done. It always returns - * 'nil' if the constant qualification is not satisfied. + * The qualification containing only constant clauses are + * checked first before any processing is done. It always returns + * 'nil' if the constant qualification is not satisfied. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecResult(PlanState *pstate) { - ResultState *node = castNode(ResultState, pstate); - TupleTableSlot *outerTupleSlot; - PlanState *outerPlan; - ExprContext *econtext; - - CHECK_FOR_INTERRUPTS(); - - econtext = node->ps.ps_ExprContext; - - /* - * check constant qualifications like (2 > 1), if not already done - */ - if (node->rs_checkqual) - { - bool qualResult = ExecQual(node->resconstantqual, econtext); - - node->rs_checkqual = false; - if (!qualResult) - { - node->rs_done = true; - return NULL; - } - } - - /* - * Reset per-tuple memory context to free any expression evaluation - * storage allocated in the previous tuple cycle. - */ - ResetExprContext(econtext); - - /* - * if rs_done is true then it means that we were asked to return a - * constant tuple and we already did the last time ExecResult() was - * called, OR that we failed the constant qual check. 
Either way, now we - * are through. - */ - while (!node->rs_done) - { - outerPlan = outerPlanState(node); - - if (outerPlan != NULL) - { - /* - * retrieve tuples from the outer plan until there are no more. - */ - outerTupleSlot = ExecProcNode(outerPlan); - - if (TupIsNull(outerTupleSlot)) - return NULL; - - /* - * prepare to compute projection expressions, which will expect to - * access the input tuples as varno OUTER. - */ - econtext->ecxt_outertuple = outerTupleSlot; - } - else - { - /* - * if we don't have an outer plan, then we are just generating the - * results from a constant target list. Do it only once. - */ - node->rs_done = true; - } - - /* form the result tuple using ExecProject(), and return it */ - return ExecProject(node->ps.ps_ProjInfo); - } - - return NULL; + ResultState *node = castNode(ResultState, pstate); + ExprState *qual = node->ps.qual; + TupleTableSlot *outerTupleSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * check constant qualifications like (2 > 1), if not already done + */ + if (node->rs_checkqual) + { + bool qualResult = ExecQual(node->resconstantqual, econtext); + + node->rs_checkqual = false; + if (!qualResult) + { + node->rs_done = true; + return NULL; + } + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * if rs_done is true then it means that we were asked to return a + * constant tuple and we already did the last time ExecResult() was + * called, OR that we failed the constant qual check. Either way, now we + * are through. + */ + while (!node->rs_done) + { + outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + { + /* + * retrieve tuples from the outer plan until there are no more. + */ + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + if (qual) + { + econtext->ecxt_outertuple = outerTupleSlot; + econtext->ecxt_scantuple = outerTupleSlot; + + if (!ExecQual(qual, econtext)) + { + if (node->rs_fail_return) + return NULL; + else + continue; + } + + ResetExprContext(econtext); + } + + /* + * prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + } + else + { + /* + * if we don't have an outer plan, then we are just generating the + * results from a constant target list. Do it only once. 
+ */ + node->rs_done = true; + } + + /* form the result tuple using ExecProject(), and return it */ + return ExecProject(node->ps.ps_ProjInfo); + } + + return NULL; } /* ---------------------------------------------------------------- - * ExecResultMarkPos + * ExecResultMarkPos * ---------------------------------------------------------------- */ void ExecResultMarkPos(ResultState *node) { - PlanState *outerPlan = outerPlanState(node); + PlanState *outerPlan = outerPlanState(node); - if (outerPlan != NULL) - ExecMarkPos(outerPlan); - else - elog(DEBUG2, "Result nodes do not support mark/restore"); + if (outerPlan != NULL) + ExecMarkPos(outerPlan); + else + elog(DEBUG2, "Result nodes do not support mark/restore"); } /* ---------------------------------------------------------------- - * ExecResultRestrPos + * ExecResultRestrPos * ---------------------------------------------------------------- */ void ExecResultRestrPos(ResultState *node) { - PlanState *outerPlan = outerPlanState(node); + PlanState *outerPlan = outerPlanState(node); - if (outerPlan != NULL) - ExecRestrPos(outerPlan); - else - elog(ERROR, "Result nodes do not support mark/restore"); + if (outerPlan != NULL) + ExecRestrPos(outerPlan); + else + elog(ERROR, "Result nodes do not support mark/restore"); } /* ---------------------------------------------------------------- - * ExecInitResult + * ExecInitResult * - * Creates the run-time state information for the result node - * produced by the planner and initializes outer relations - * (child nodes). + * Creates the run-time state information for the result node + * produced by the planner and initializes outer relations + * (child nodes). * ---------------------------------------------------------------- */ ResultState * ExecInitResult(Result *node, EState *estate, int eflags) { - ResultState *resstate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || - outerPlan(node) != NULL); - - /* - * create state structure - */ - resstate = makeNode(ResultState); - resstate->ps.plan = (Plan *) node; - resstate->ps.state = estate; - resstate->ps.ExecProcNode = ExecResult; - - resstate->rs_done = false; - resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &resstate->ps); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &resstate->ps); - - /* - * initialize child expressions - */ - resstate->ps.qual = - ExecInitQual(node->plan.qual, (PlanState *) resstate); - resstate->resconstantqual = - ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); - - /* - * initialize child nodes - */ - outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); - - /* - * we don't use inner plan - */ - Assert(innerPlan(node) == NULL); - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&resstate->ps); - ExecAssignProjectionInfo(&resstate->ps, NULL); - - return resstate; + ResultState *resstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || + outerPlan(node) != NULL); + + /* + * create state structure + */ + resstate = makeNode(ResultState); + resstate->ps.plan = (Plan *) node; + resstate->ps.state = estate; + resstate->ps.ExecProcNode = ExecResult; + + resstate->rs_done = false; + resstate->rs_checkqual = (node->resconstantqual == NULL) ? 
false : true; + resstate->rs_fail_return = contain_rownum_fetch((Node *) node->plan.qual); + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &resstate->ps); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &resstate->ps); + + /* + * initialize child expressions + */ + resstate->ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) resstate); + resstate->resconstantqual = + ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); + + /* + * initialize child nodes + */ + outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&resstate->ps); + ExecAssignProjectionInfo(&resstate->ps, NULL); + + return resstate; } /* ---------------------------------------------------------------- - * ExecEndResult + * ExecEndResult * - * frees up storage allocated through C routines + * frees up storage allocated through C routines * ---------------------------------------------------------------- */ void ExecEndResult(ResultState *node) { - /* - * Free the exprcontext - */ - ExecFreeExprContext(&node->ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ps.ps_ResultTupleSlot); - - /* - * shut down subplans - */ - ExecEndNode(outerPlanState(node)); + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); } void ExecReScanResult(ResultState *node) { - node->rs_done = false; - node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - - /* - * If chgParam of subnode is not null then plan will be re-scanned by - * first ExecProcNode. - */ - if (node->ps.lefttree && - node->ps.lefttree->chgParam == NULL) - ExecReScan(node->ps.lefttree); + node->rs_done = false; + node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
+ */ + if (node->ps.lefttree && + node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f6b2295d..5a4602f5 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -3255,6 +3255,17 @@ _outGatherPath(StringInfo str, const GatherPath *node) WRITE_INT_FIELD(num_workers); } +static void +_outQualPath(StringInfo str, const QualPath *node) +{ + WRITE_NODE_TYPE("QUALPATH"); + + _outPathInfo(str, (const Path *) node); + + WRITE_NODE_FIELD(subpath); + WRITE_NODE_FIELD(quals); +} + static void _outProjectionPath(StringInfo str, const ProjectionPath *node) { @@ -5502,6 +5513,9 @@ outNode(StringInfo str, const void *obj) case T_ResultPath: _outResultPath(str, obj); break; + case T_QualPath: + _outQualPath(str, obj); + break; case T_MaterialPath: _outMaterialPath(str, obj); break; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 367b6766..08273bf9 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -132,6 +132,7 @@ static RemoteSubplan *create_remotescan_plan(PlannerInfo *root, RemoteSubPath *best_path); //static char *get_internal_cursor(void); #endif +static Result *create_qual_plan(PlannerInfo *root, QualPath *best_path); static ProjectSet *create_project_set_plan(PlannerInfo *root, ProjectSetPath *best_path); static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path, int flags); @@ -325,7 +326,7 @@ static SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree, List *distinctList, AttrNumber flagColIdx, int firstFlag, long numGroups); static LockRows *make_lockrows(Plan *lefttree, List *rowMarks, int epqParam); -static Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan); +static Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan, List *qual); static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, @@ -479,6 +480,10 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) plan = (Plan *) create_minmaxagg_plan(root, (MinMaxAggPath *) best_path); } + else if (IsA(best_path, QualPath)) + { + plan = (Plan *) create_qual_plan(root, (QualPath *) best_path); + } else { Assert(IsA(best_path, ResultPath)); @@ -1048,7 +1053,7 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) /* if (need_projection) { - plan = (Plan *)make_result(outtlist, NULL, plan); + plan = (Plan *)make_result(outtlist, NULL, plan, NULL); plan->parallel_aware = best_path->parallel_aware; } */ @@ -1259,7 +1264,7 @@ create_gating_plan(PlannerInfo *root, Path *path, Plan *plan, */ gplan = (Plan *) make_result(build_path_tlist(root, path), (Node *) gating_quals, - plan); + plan, NULL); /* * Notice that we don't change cost or size estimates when doing gating. 
@@ -1374,7 +1379,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) plan = (Plan *) make_result(tlist, (Node *) list_make1(makeBoolConst(false, false)), - NULL); + NULL, NULL); copy_generic_path_info(plan, (Path *) best_path); @@ -1545,7 +1550,29 @@ create_result_plan(PlannerInfo *root, ResultPath *best_path) /* best_path->quals is just bare clauses */ quals = order_qual_clauses(root, best_path->quals); - plan = make_result(tlist, (Node *) quals, NULL); + plan = make_result(tlist, (Node *) quals, NULL, NULL); + + copy_generic_path_info(&plan->plan, (Path *) best_path); + + return plan; +} + +static Result * +create_qual_plan(PlannerInfo *root, QualPath *best_path) +{ + Result *plan; + Plan *subplan; + List *tlist; + List *quals; + + subplan = create_plan_recurse(root, best_path->subpath, 0); + + tlist = build_path_tlist(root, &best_path->path); + + /* best_path->quals is just bare clauses */ + quals = order_qual_clauses(root, best_path->quals); + + plan = make_result(tlist, NULL, subplan, quals); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2142,7 +2169,7 @@ create_projection_plan(PlannerInfo *root, ProjectionPath *best_path) else { /* We need a Result node */ - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_result(tlist, NULL, subplan, NULL); copy_generic_path_info(plan, (Path *) best_path); } @@ -2166,7 +2193,7 @@ inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe) { Plan *plan; - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_result(tlist, NULL, subplan, NULL); /* * In principle, we should charge tlist eval cost plus cpu_per_tuple per @@ -2626,7 +2653,7 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) /* Generate the output plan --- basically just a Result */ tlist = build_path_tlist(root, &best_path->path); - plan = make_result(tlist, (Node *) best_path->quals, NULL); + plan = make_result(tlist, (Node *) best_path->quals, NULL, NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6842,7 +6869,7 @@ make_remotesubplan(PlannerInfo *root, { List *newtlist = list_copy(leftchild->targetlist); newtlist = lappend(newtlist, newtle); - leftchild = (Plan *) make_result(newtlist, NULL, leftchild); + leftchild = (Plan *) make_result(newtlist, NULL, leftchild, NULL); lefttree->lefttree = leftchild; } } @@ -6853,7 +6880,7 @@ make_remotesubplan(PlannerInfo *root, /* Use Result node to calculate expression */ List *newtlist = list_copy(lefttree->targetlist); newtlist = lappend(newtlist, newtle); - lefttree = (Plan *) make_result(newtlist, NULL, lefttree); + lefttree = (Plan *) make_result(newtlist, NULL, lefttree, NULL); } node->distributionKey = newtle->resno; @@ -7071,7 +7098,7 @@ make_remotesubplan(PlannerInfo *root, { /* copy needed so we don't modify input's tlist below */ tlist = copyObject(tlist); - lefttree = (Plan *) make_result(tlist, NULL, lefttree); + lefttree = (Plan *) make_result(tlist, NULL, lefttree, NULL); } /* @@ -8416,13 +8443,14 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, static Result * make_result(List *tlist, Node *resconstantqual, - Plan *subplan) -{// #lizard forgives + Plan *subplan, + List *qual) +{ Result *node = makeNode(Result); Plan *plan = &node->plan; plan->targetlist = tlist; - plan->qual = NIL; + plan->qual = qual; plan->lefttree = subplan; plan->righttree = NULL; node->resconstantqual = resconstantqual; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 
f3d1adb2..9b03a3d9 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -4649,6 +4649,34 @@ create_merge_append_path(PlannerInfo *root, return pathnode; } +QualPath * +create_qual_path(PlannerInfo *root, Path *subpath, List *quals) +{ + QualPath *pathnode = makeNode(QualPath); + RelOptInfo *rel = subpath->parent; + QualCost qual_cost; + Cost run_cost; + + cost_qual_eval(&qual_cost, quals, root); + + pathnode->path.pathtype = T_Result; + pathnode->path.parent = rel; + pathnode->path.pathtarget = subpath->pathtarget; + pathnode->path.parallel_safe = rel->consider_parallel; + + pathnode->quals = quals; + pathnode->subpath = subpath; + + pathnode->path.rows = subpath->rows; + run_cost = subpath->total_cost - subpath->startup_cost; + run_cost += (cpu_operator_cost + qual_cost.per_tuple) * pathnode->path.rows; + + pathnode->path.startup_cost = subpath->startup_cost + qual_cost.startup; + pathnode->path.total_cost = subpath->total_cost + run_cost; + + return pathnode; +} + /* * create_result_path * Creates a path representing a Result-and-nothing-else plan. diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index ddb99ddf..b12d1e31 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1018,6 +1018,7 @@ typedef struct ResultState ExprState *resconstantqual; bool rs_done; /* are we done? */ bool rs_checkqual; /* do we need to check the qual? */ + bool rs_fail_return; /* should return after failing qual? */ } ResultState; /* ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 227af23f..854f36a4 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -304,6 +304,7 @@ typedef enum NodeTag T_AppendPath, T_MergeAppendPath, T_ResultPath, + T_QualPath, T_MaterialPath, T_UniquePath, T_GatherPath, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 0736ab79..a0f11da0 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1639,6 +1639,13 @@ typedef struct ProjectionPath bool dummypp; /* true if no separate Result is needed */ } ProjectionPath; +typedef struct QualPath +{ + Path path; + Path *subpath; + List *quals; +} QualPath; + /* * ProjectSetPath represents evaluation of a targetlist that includes * set-returning function(s), which will need to be implemented by a diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 505cb463..e1fe0a4f 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -133,6 +133,7 @@ extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, List *pathkeys, Relids required_outer, List *partitioned_rels); +extern QualPath *create_qual_path(PlannerInfo *root, Path *subpath, List *quals); extern ResultPath *create_result_path(PlannerInfo *root, RelOptInfo *rel, PathTarget *target, List *resconstantqual); extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath); From 76644d8823ce645f726fd49d5985bb3cc85f2d9b Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 16:48:17 +0800 Subject: [PATCH 550/578] Core changes of cn-udf implement Change the pathtarget by make_udf_input_target apply appropriate remote path, projection path and qual path before sort and grouping. 
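
As an illustration only (not one of the hunks below), the per-path handling this adds to grouping_planner() is roughly the following sketch. The helper names (create_remotesubplan_path, apply_projection_to_path, create_qual_path, cn_process_target, udf_quals) are the ones introduced or used by this series; cn_udf()/cn_udf2() further down stand for any hypothetical function that contain_user_defined_functions() classifies as coordinator-only:

    /*
     * Sketch: when parse->hasCoordFuncs is set, gather every distributed
     * path up to the coordinator, project the target that still contains
     * the CN-only UDF expressions, then filter with the collected
     * udf_quals through a QualPath (a Result node carrying plan->qual).
     */
    foreach(lc, current_rel->pathlist)
    {
        Path   *path = (Path *) lfirst(lc);

        /* must collect tuples on the CN before evaluating CN-only UDFs */
        if (path->distribution != NULL)
            path = create_remotesubplan_path(root, path, NULL);

        /* re-attach the UDF expressions stripped by make_udf_input_target */
        path = apply_projection_to_path(root, current_rel,
                                        path, cn_process_target);

        /* evaluate the CN-only quals last, on the coordinator */
        if (root->udf_quals != NIL)
            path = (Path *) create_qual_path(root, path, root->udf_quals);

        lfirst(lc) = path;
    }
    set_cheapest(current_rel);

For a query such as SELECT a, cn_udf(b) FROM t WHERE cn_udf2(c), make_udf_input_target strips the UDF expressions so the datanodes only ship a, b and c; cn_udf(b) and the qual cn_udf2(c) are then evaluated on the coordinator by the projection and QualPath steps shown above.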
--- src/backend/executor/nodeResult.c | 6 +- src/backend/optimizer/path/allpaths.c | 3 + src/backend/optimizer/plan/createplan.c | 16 +++ src/backend/optimizer/plan/initsplan.c | 20 ++++ src/backend/optimizer/plan/planner.c | 136 ++++++++++++++++++++++++ src/include/nodes/execnodes.h | 1 - src/include/nodes/relation.h | 1 + 7 files changed, 177 insertions(+), 6 deletions(-) diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c index 0269d6d3..82aaa846 100644 --- a/src/backend/executor/nodeResult.c +++ b/src/backend/executor/nodeResult.c @@ -126,10 +126,7 @@ ExecResult(PlanState *pstate) if (!ExecQual(qual, econtext)) { - if (node->rs_fail_return) - return NULL; - else - continue; + continue; } ResetExprContext(econtext); @@ -214,7 +211,6 @@ ExecInitResult(Result *node, EState *estate, int eflags) resstate->rs_done = false; resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - resstate->rs_fail_return = contain_rownum_fetch((Node *) node->plan.qual); /* * Miscellaneous initialization diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index b5ddbfcd..1b6b71bb 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2038,7 +2038,10 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, /* * The upper query might not use all the subquery's output columns; if * not, we can simplify. + * + * but if upper query have cn-udf, don't try it. */ + if (!root->udf_quals) remove_unused_subquery_outputs(subquery, rel); /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 08273bf9..f674826f 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1131,6 +1131,13 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags) return false; /* + * if we got cn-udf or rownum expr, return false to use + * pathtarget to generate tlist. + */ + if (root->parse && root->parse->hasCoordFuncs) + return false; + + /* * We can do this for real relation scans, subquery scans, function scans, * tablefunc scans, values scans, and CTE scans (but not for, eg, joins). */ @@ -8456,6 +8463,14 @@ make_result(List *tlist, node->resconstantqual = resconstantqual; #ifdef XCP + /* + * Do not consider pushing down node if this node is make to process any + * project or qual that contain rownum or cn-udf. + */ + if (contain_user_defined_functions((Node *) tlist) || + contain_user_defined_functions((Node *) qual)) + return node; + if (subplan) { /* @@ -8828,6 +8843,7 @@ is_projection_capable_path(Path *path) case T_ModifyTable: case T_MergeAppend: case T_RecursiveUnion: + case T_RemoteSubplan: return false; case T_Append: diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index ab0972d5..92ddfc0f 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -1740,6 +1740,8 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, Relids nullable_relids; RestrictInfo *restrictinfo; + bool contain_udf = contain_user_defined_functions((Node *) clause); + /* * Retrieve all relids mentioned within the clause. 
*/ @@ -2092,9 +2094,27 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, } } + if (root->parse && root->parse->commandType == CMD_SELECT && contain_udf) + { + List *quals_var; + + /* clause contain cn-udf, don't distribute it to rels, collect it */ + root->udf_quals = lappend(root->udf_quals, restrictinfo->clause); + + /* cn-udf quals will not distribute to rels, but vars must be added */ + quals_var = pull_var_clause((Node *) root->udf_quals, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + + add_vars_to_targetlist(root, quals_var, bms_make_singleton(0), false); + } + else + { /* No EC special case applies, so push it into the clause lists */ distribute_restrictinfo_to_rels(root, restrictinfo); } +} /* * check_outerjoin_delay diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index df9a5333..090b3588 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1758,6 +1758,72 @@ inheritance_planner(PlannerInfo *root) SS_assign_special_param(root))); } +/* + * Like make_rownum_input_target, exclude any udf expr from origin_target, + * only those udf that need to execute on CN will be considered, check + * function: contain_user_defined_functions. + */ +static PathTarget * +make_udf_input_target(PlannerInfo *root, PathTarget *origin_target) +{ + PathTarget *input_target = create_empty_pathtarget(); + Query *parse = root->parse; + List *udf_cols = NIL; + List *udf_vars = NIL; + int i; + ListCell *lc; + + i = 0; + foreach(lc, origin_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(origin_target, i); + + if (!contain_user_defined_functions((Node *) expr)) + { + add_column_to_pathtarget(input_target, expr, sgref); + } + else + { + /* + * Non-cn-udf column, so just remember the expression for later + * call to pull_var_clause. + */ + udf_cols = lappend(udf_cols, expr); + } + + i++; + } + + /* + * TODO: having cn-udf expr. + */ + if (parse->havingQual) + udf_cols = lappend(udf_cols, parse->havingQual); + + udf_cols = list_concat(udf_cols, list_copy(root->udf_quals)); + + /* + * Pull out all the Vars mentioned in non-cn-udf cols, and + * add them to the input target if not already present. Note this + * includes Vars used in resjunk items, so we are covering the needs of + * ORDER BY and window specifications. Vars used within Aggrefs and + * WindowFuncs will be pulled out here, too. + */ + udf_vars = pull_var_clause((Node *) udf_cols, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, udf_vars); + + /* clean up cruft */ + list_free(udf_vars); + list_free(udf_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); +} + /*-------------------- * grouping_planner * Perform planning steps related to grouping, aggregation, etc. 
@@ -1906,6 +1972,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, List *scanjoin_targets; List *scanjoin_targets_contain_srfs; bool scanjoin_target_parallel_safe; + PathTarget *cn_process_target; /* including rownum_target */ bool have_grouping; AggClauseCosts agg_costs; WindowFuncLists *wflists = NULL; @@ -2096,6 +2163,39 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * In postgresql, vars in qual didn't count into targetlist as junk, + * since they are evaluated just after scan happened, but a qual with + * rownum expr or cn-udf will be evaluated after collecting tuple + * to CN, so we need to pull out vars from them. + * + * This is a bit ugly doing things here, but root->rownum_quals and + * root->udf_quals are determined after query_planner, and targetlist + * is determined way before that. + */ + if (root->udf_quals) + { + List *quals_var = pull_var_clause((Node *) root->udf_quals, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + + /* copy to make other targets clean */ + if (scanjoin_target == grouping_target) + scanjoin_target = copy_pathtarget(scanjoin_target); + + foreach(lc, quals_var) + { + if (!list_member(scanjoin_target->exprs, lfirst_node(Var, lc))) + add_column_to_pathtarget(scanjoin_target, (Expr *) lfirst_node(Var, lc), 0); + } + } + + cn_process_target = scanjoin_target; + /* exclude cn-udf from scanjoin_target */ + if (parse->hasCoordFuncs) + scanjoin_target = make_udf_input_target(root, scanjoin_target); + + /* * If there are any SRFs in the targetlist, we must separate each of * these PathTargets into SRF-computing and SRF-free targets. Replace * each of the named targets with a SRF-free version, and remember the @@ -2235,6 +2335,42 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, root->upper_targets[UPPERREL_WINDOW] = sort_input_target; root->upper_targets[UPPERREL_GROUP_AGG] = grouping_target; + if (parse->hasCoordFuncs) + { + Path *path; + + foreach(lc, current_rel->pathlist) + { + path = (Path *) lfirst(lc); + + /* must collect tuple to cn for further processing */ + if (path->distribution != NULL) + path = create_remotesubplan_path(root, path, NULL); + + /* add other projection step, currently it's only cn-udf */ + path = apply_projection_to_path(root, current_rel, + path, cn_process_target); + + /* then evaluate other qual on CN, currently it's only cn-udf */ + if (root->udf_quals != NIL) + path = (Path *) create_qual_path(root, path, root->udf_quals); + + /* apply final target if no grouping and no post-pone projection */ + if (!have_grouping && final_target == sort_input_target && !activeWindows) + path = apply_projection_to_path(root, current_rel, + path, final_target); + + lfirst(lc) = path; + } + + set_cheapest(current_rel); + } + else if (root->udf_quals != NIL) + { + /* cn-quals found but no cn-target specified, should not happen but raise an error */ + elog(ERROR, "remote qualification must exist in target list"); + } + /* * If we have grouping and/or aggregation, consider ways to implement * that. We build a new upperrel representing the output of this diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index b12d1e31..ddb99ddf 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1018,7 +1018,6 @@ typedef struct ResultState ExprState *resconstantqual; bool rs_done; /* are we done? */ bool rs_checkqual; /* do we need to check the qual? */ - bool rs_fail_return; /* should return after failing qual? 
*/ } ResultState; /* ---------------- diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index a0f11da0..2be2556e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -433,6 +433,7 @@ typedef struct PlannerInfo bool haspart_tobe_modify; Index partrelindex; Bitmapset *partpruning; + List *udf_quals; /* quals that contain CN-udf */ #endif #endif } PlannerInfo; From 9e05e5e9c002681596e46d8ae9c2ecf9266d9682 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 18:37:48 +0800 Subject: [PATCH 551/578] Write and read Query struct in a smart way for upgrade --- src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index f5f2bc77..aff497b5 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -986,6 +986,7 @@ _equalQuery(const Query *a, const Query *b) COMPARE_SCALAR_FIELD(hasModifyingCTE); COMPARE_SCALAR_FIELD(hasForUpdate); COMPARE_SCALAR_FIELD(hasRowSecurity); + COMPARE_SCALAR_FIELD(hasCoordFuncs); COMPARE_NODE_FIELD(cteList); COMPARE_NODE_FIELD(rtable); COMPARE_NODE_FIELD(jointree); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 5a4602f5..c7892f52 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4310,6 +4310,7 @@ _outQuery(StringInfo str, const Query *node) WRITE_BOOL_FIELD(hasModifyingCTE); WRITE_BOOL_FIELD(hasForUpdate); WRITE_BOOL_FIELD(hasRowSecurity); + WRITE_BOOL_FIELD(hasCoordFuncs); WRITE_NODE_FIELD(cteList); WRITE_NODE_FIELD(rtable); WRITE_NODE_FIELD(jointree); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 96f4ca05..28cd7dbd 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -572,7 +572,18 @@ _readQuery(void) READ_BOOL_FIELD(hasModifyingCTE); READ_BOOL_FIELD(hasForUpdate); READ_BOOL_FIELD(hasRowSecurity); - READ_NODE_FIELD(cteList); + token = pg_strtok(&length); /* get :fldname hasRowSecurity or cteList */ + if (strncmp(nullable_string(token, length), ":hasCoordFuncs", length) == 0) + { + token = pg_strtok(&length); /* get field value */ + local_node->hasCoordFuncs = strtobool(token); + token = pg_strtok(&length); /* skip :fldname cteList */ + } + else + { + local_node->hasCoordFuncs = false; + } + local_node->cteList = nodeRead(NULL, 0); READ_NODE_FIELD(rtable); READ_NODE_FIELD(jointree); READ_NODE_FIELD(targetList); From d6f9378aa09232d7406eed85cd05caf033236d77 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 24 Apr 2022 15:19:09 +0800 Subject: [PATCH 552/578] Ban pull-up functinos in DML on DN --- src/backend/optimizer/path/allpaths.c | 11 +++++++++++ src/backend/parser/analyze.c | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 1b6b71bb..61769e22 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1951,6 +1951,17 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *sub_final_rel; ListCell *lc; + if (subquery->hasCoordFuncs && + (parse->commandType == CMD_UPDATE || + parse->commandType == CMD_INSERT || + parse->commandType == CMD_DELETE)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DML has a subquery contains a function runs on CN"), + errhint("You might need to push that function down 
to DN."))); + } + /* * Must copy the Query so that planning doesn't mess up the RTE contents * (really really need to fix the planner to not scribble on its input, diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index c68d7b06..a720c1fc 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -448,6 +448,16 @@ transformStmt(ParseState *pstate, Node *parseTree) result->querySource = QSRC_ORIGINAL; result->canSetTag = true; result->hasCoordFuncs = pstate->p_hasCoordFuncs; + if (result->hasCoordFuncs && + (result->commandType == CMD_UPDATE || + result->commandType == CMD_INSERT || + result->commandType == CMD_DELETE)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DML contains a function runs on CN which is not supported"), + errhint("You might need to push that function down to DN."))); + } return result; } From a8bd6e852c9805374188c7946550c6e90ef4bdbc Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 24 Apr 2022 17:04:18 +0800 Subject: [PATCH 553/578] sync regress --- src/test/regress/expected/plpgsql_1.out | 24 +++--- .../regress/expected/select_parallel_4.out | 22 +++--- src/test/regress/expected/subselect.out | 6 +- src/test/regress/expected/transactions.out | 6 +- .../regress/expected/updatable_views_1.out | 78 ++++++++++++++++--- src/test/regress/expected/xc_remote.out | 2 +- src/test/regress/sql/transactions.sql | 4 +- src/test/regress/sql/updatable_views.sql | 28 +++++++ src/test/regress/sql/xc_remote.sql | 2 +- 9 files changed, 134 insertions(+), 38 deletions(-) diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index 4efbac5f..20f49e11 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -1548,26 +1548,26 @@ update PSlot set slotlink = 'HS.base.hub1.1' where slotname = 'PS.base.b2'; -- -- PGXCTODO: This is failing due to issue 3522907, complicated SELECT queries in plpgsql functions select * from PField_v1 where pfname = 'PF0_1' order by slotname; - pfname | slotname | backside | patch ---------+----------------------+----------------------------+----------------- - PF0_1 | PS.base.a1 | WS.001.1a in room 001 -> - | PS.base.ta1 -> - PF0_1 | PS.base.a2 | | - - PF0_1 | PS.base.a3 | WS.001.2a in room 001 -> - | PS.base.ta2 -> + pfname | slotname | backside | patch +--------+----------------------+----------------------------+------------------ + PF0_1 | PS.base.a1 | WS.001.1a in room 001 -> - | PS.base.ta1 -> - + PF0_1 | PS.base.a2 | WS.001.1b in room 001 -> - | - + PF0_1 | PS.base.a3 | WS.001.2a in room 001 -> - | PS.base.ta2 -> - PF0_1 | PS.base.a4 | - | - PF0_1 | PS.base.a5 | - | - PF0_1 | PS.base.a6 | - | - - PF0_1 | PS.base.b1 | | PS.base.ta5 -> - PF0_1 | PS.base.b2 | | - PF0_1 | PS.base.b3 | | PS.base.tb2 -> - PF0_1 | PS.base.b4 | | - + PF0_1 | PS.base.b1 | WS.002.1a in room 002 -> - | PS.base.ta5 -> - + PF0_1 | PS.base.b2 | WS.002.1b in room 002 -> - | + PF0_1 | PS.base.b3 | WS.002.2a in room 002 -> - | PS.base.tb2 -> - + PF0_1 | PS.base.b4 | WS.002.2b in room 002 -> - | - PF0_1 | PS.base.b5 | WS.002.3a in room 002 -> - | - - PF0_1 | PS.base.b6 | | - + PF0_1 | PS.base.b6 | WS.002.3b in room 002 -> - | - PF0_1 | PS.base.c1 | WS.003.1a in room 003 -> - | - PF0_1 | PS.base.c2 | WS.003.1b in room 003 -> - | - PF0_1 | PS.base.c3 | WS.003.2a in room 003 -> - | - - PF0_1 | PS.base.c4 | | - + PF0_1 | PS.base.c4 | WS.003.2b in room 003 -> - | - PF0_1 | PS.base.c5 | WS.003.3a in room 003 -> - | - - PF0_1 | PS.base.c6 | | - 
+ PF0_1 | PS.base.c6 | WS.003.3b in room 003 -> - | - (18 rows) select * from PField_v1 where pfname = 'PF0_2' order by slotname; diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index f3b81ec8..b57f5248 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -43,20 +43,21 @@ explain (verbose, costs off) select parallel_restricted(unique1) from tenk1 where stringu1 = 'GRAAAA' order by 1; QUERY PLAN ---------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: parallel_restricted(unique1) - Sort Key: parallel_restricted(tenk1.unique1) - -> Sort +--------------------------------------------------------------------- + Sort Output: (parallel_restricted(unique1)) Sort Key: (parallel_restricted(tenk1.unique1)) - -> Gather + -> Result Output: parallel_restricted(unique1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: unique1 + -> Gather + Output: unique1 Workers Planned: 4 -> Parallel Seq Scan on public.tenk1 Output: unique1 Filter: (tenk1.stringu1 = 'GRAAAA'::name) -(12 rows) +(13 rows) -- test parallel plan when group by expression is in target list. explain (costs off) @@ -125,14 +126,15 @@ explain (costs off) select sum(parallel_restricted(unique1)) from tenk1 group by(parallel_restricted(unique1)); QUERY PLAN -------------------------------------------------------------------------- +----------------------------------------------------------------- HashAggregate Group Key: parallel_restricted(unique1) + -> Result -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather Workers Planned: 4 - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 -(6 rows) + -> Parallel Seq Scan on tenk1 +(7 rows) -- test parallel plans for queries containing un-correlated subplans. alter table tenk2 set (parallel_workers = 0); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 32ed8e4f..876fd5c8 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1081,9 +1081,9 @@ select * from where tattle(x, 8); QUERY PLAN ---------------------------------------------------------- - Subquery Scan on ss - Output: x, u - Filter: tattle(ss.x, 8) + Result + Output: (9), (unnest('{1,2,3,11,12,13}'::integer[])) + Filter: tattle((9), 8) -> ProjectSet Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) -> Result diff --git a/src/test/regress/expected/transactions.out b/src/test/regress/expected/transactions.out index 659d5a7c..4b5c54d5 100644 --- a/src/test/regress/expected/transactions.out +++ b/src/test/regress/expected/transactions.out @@ -582,6 +582,10 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); -- verify that cursors created during an aborted subtransaction are @@ -633,7 +637,7 @@ fetch from foo; abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text LANGUAGE plpgsql AS $$ diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index e13b4537..ae85f4e2 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -1829,6 +1829,8 @@ END; $$ LANGUAGE plpgsql STRICT IMMUTABLE LEAKPROOF; SELECT * FROM rw_view1 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry person -------- Tom @@ -1836,7 +1838,15 @@ SELECT * FROM rw_view1 WHERE snoop(person); (2 rows) UPDATE rw_view1 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. DELETE FROM rw_view1 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; ALTER VIEW rw_view1 SET (security_barrier = true); SELECT table_name, is_insertable_into FROM information_schema.tables @@ -1864,12 +1874,21 @@ SELECT table_name, column_name, is_updatable (1 row) SELECT * FROM rw_view1 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry person -------- Tom Harry (2 rows) +UPDATE rw_view1 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +DELETE FROM rw_view1 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view1 WHERE snoop(person); @@ -1900,6 +1919,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE NOT snoop(person); Filter: ((visibility = 'public'::text) AND (NOT snoop(person))) (4 rows) +ALTER FUNCTION snoop(anyelement) not pushdown; -- security barrier view on top of security barrier view CREATE VIEW rw_view2 WITH (security_barrier = true) AS SELECT * FROM rw_view1 WHERE snoop(person); @@ -1929,20 +1949,31 @@ SELECT table_name, column_name, is_updatable (1 row) SELECT * FROM rw_view2 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry +NOTICE: snooped value: Harry person -------- Tom Harry (2 rows) +UPDATE rw_view2 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+DELETE FROM rw_view2 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; UPDATE rw_view2 SET person=person WHERE snoop(person); DELETE FROM rw_view2 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view2 WHERE snoop(person); QUERY PLAN ----------------------------------------------------------- - Remote Subquery Scan on all - -> Subquery Scan on rw_view2 - Filter: snoop(rw_view2.person) + Subquery Scan on rw_view2 + Filter: snoop(rw_view2.person) + -> Remote Subquery Scan on all -> Subquery Scan on rw_view1 Filter: snoop(rw_view1.person) -> Seq Scan on base_tbl @@ -1967,6 +1998,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view2 WHERE NOT snoop(person); Filter: ((visibility = 'public'::text) AND snoop(person) AND (NOT snoop(person))) (4 rows) +ALTER FUNCTION snoop(anyelement) not pushdown; DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to view rw_view1 @@ -1989,6 +2021,13 @@ SELECT * FROM rw_view1; 1 | Row 1 (1 row) +EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); QUERY PLAN ------------------------------------------------------------------------- @@ -2003,6 +2042,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data) (8 rows) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) not pushdown; EXPLAIN (costs off, nodes off) INSERT INTO rw_view1 VALUES (2, 'New row 2'); QUERY PLAN ----------------------------------------------------------------------- @@ -2085,6 +2125,14 @@ SELECT * FROM v1 WHERE a=8; 8 | 8 | t111 | t11d (4 rows) +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION leakproof(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; QUERY PLAN @@ -2121,6 +2169,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; (29 rows) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 a | b | c | d ---+---+---+--- @@ -2131,6 +2180,15 @@ SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 ---+---+--- (0 rows) +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; QUERY PLAN @@ -2168,15 +2226,17 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; - a | b | c | d ----+---+------+------ - 9 | 8 | t1 | t11d - 9 | 8 | t11 | t11d - 9 | 8 | t12 | t11d - 9 | 8 | t111 | t11d + a | b | c | d +---+---+------+------- + 9 | 8 | t1 | t111d + 9 | 8 | t11 | t111d + 9 | 8 | t12 | t111d + 9 | 8 | t111 | t111d (4 rows) DELETE FROM v1 WHERE snoop(a) AND leakproof(a); -- should not delete everything, just where a>5 +ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; TABLE t1; -- verify all a<=5 are intact a | b | c ---+---+------ diff --git a/src/test/regress/expected/xc_remote.out b/src/test/regress/expected/xc_remote.out index 16b7075b..9e01a5e6 100644 --- a/src/test/regress/expected/xc_remote.out +++ b/src/test/regress/expected/xc_remote.out @@ -366,7 +366,7 @@ CREATE TABLE xcrem_employee (EMPNO CHAR(6) NOT NULL, FIRSTNAME VARCHAR(12) NOT N INSERT INTO xcrem_employee (EMPNO,FIRSTNAME,MIDINIT,LASTNAME,WORKDEPT,PHONENO,HIREDATE,JOB,EDLEVEL,SEX,BIRTHDATE,SALARY,BONUS,COMM) VALUES( '000180','MARILYN', 'S', 'SCOUTTEN', 'D11', '1682','1973-07-07','DESIGNER', 17, 'F', '1949-02-21', 21340.00,500,1707); create table xcrem_temptable as select * from xcrem_employee; -create or replace function volatile_func(id int) returns int as +create or replace function volatile_func(id int) returns int pushdown as $$begin return 3;end $$ language plpgsql; \set EXP 'explain (verbose true, costs false, nodes false)' \set SEL 'select empno, edlevel, lastname, salary, bonus from xcrem_employee order by empno' diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql index 80f235e0..b9f52f16 100644 --- a/src/test/regress/sql/transactions.sql +++ b/src/test/regress/sql/transactions.sql @@ -388,6 +388,8 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); @@ -431,7 +433,7 @@ abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index 6aa951ca..f0c092c8 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -886,6 +886,10 @@ LANGUAGE plpgsql STRICT IMMUTABLE LEAKPROOF; SELECT * FROM rw_view1 WHERE snoop(person); UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION 
snoop(anyelement) not pushdown; ALTER VIEW rw_view1 SET (security_barrier = true); @@ -905,10 +909,14 @@ SELECT table_name, column_name, is_updatable SELECT * FROM rw_view1 WHERE snoop(person); UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view1 WHERE snoop(person); EXPLAIN (costs off, nodes off) UPDATE rw_view1 SET person=person WHERE snoop(person); EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; -- security barrier view on top of security barrier view @@ -931,10 +939,14 @@ SELECT table_name, column_name, is_updatable SELECT * FROM rw_view2 WHERE snoop(person); UPDATE rw_view2 SET person=person WHERE snoop(person); DELETE FROM rw_view2 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view2 SET person=person WHERE snoop(person); +DELETE FROM rw_view2 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view2 WHERE snoop(person); EXPLAIN (costs off, nodes off) UPDATE rw_view2 SET person=person WHERE snoop(person); EXPLAIN (costs off, nodes off) DELETE FROM rw_view2 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; DROP TABLE base_tbl CASCADE; @@ -959,6 +971,10 @@ SELECT * FROM rw_view1; EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) pushdown; +EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) not pushdown; EXPLAIN (costs off, nodes off) INSERT INTO rw_view1 VALUES (2, 'New row 2'); INSERT INTO rw_view1 VALUES (2, 'New row 2'); @@ -1003,10 +1019,20 @@ SELECT * FROM v1 WHERE a=8; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) pushdown; +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; @@ -1014,6 +1040,8 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; DELETE FROM v1 WHERE snoop(a) AND leakproof(a); -- should not delete everything, just where a>5 +ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; TABLE t1; -- verify all a<=5 are intact diff --git a/src/test/regress/sql/xc_remote.sql b/src/test/regress/sql/xc_remote.sql index edd73ac6..39ef7ecf 100644 --- a/src/test/regress/sql/xc_remote.sql 
+++ b/src/test/regress/sql/xc_remote.sql @@ -185,7 +185,7 @@ CREATE TABLE xcrem_employee (EMPNO CHAR(6) NOT NULL, FIRSTNAME VARCHAR(12) NOT N create table xcrem_temptable as select * from xcrem_employee; -create or replace function volatile_func(id int) returns int as +create or replace function volatile_func(id int) returns int pushdown as $$begin return 3;end $$ language plpgsql; \set EXP 'explain (verbose true, costs false, nodes false)' From 86603cd8a9b781755a91add6af96b1cc4418ed96 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:06:43 +0800 Subject: [PATCH 554/578] Consider restricted node number in cost module --- src/backend/optimizer/util/pathnode.c | 8 +++++++- src/test/regress/expected/nestloop_by_shard.out | 8 ++++---- src/test/regress/expected/xc_FQS_2.out | 12 ++++++------ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9b03a3d9..ed05ea1e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7414,7 +7414,13 @@ path_count_datanodes(Path *path) (path->distribution->distributionType == LOCATOR_TYPE_SHARD || path->distribution->distributionType == LOCATOR_TYPE_HASH)) { - double nodes = bms_num_members(path->distribution->nodes); + double nodes; + + nodes = bms_num_members(path->distribution->restrictNodes); + if (nodes > 0) + return nodes; + + nodes = bms_num_members(path->distribution->nodes); if (nodes > 0) return nodes; } diff --git a/src/test/regress/expected/nestloop_by_shard.out b/src/test/regress/expected/nestloop_by_shard.out index da851318..47ebc061 100644 --- a/src/test/regress/expected/nestloop_by_shard.out +++ b/src/test/regress/expected/nestloop_by_shard.out @@ -81,8 +81,8 @@ where t1.unique1 = 1; -> Nested Loop Left Join -> Remote Subquery Scan on all Distribute results by S: hundred - -> Seq Scan on tenk1_s t1 - Filter: (unique1 = 1) + -> Index Scan using unique1_s on tenk1_s t1 + Index Cond: (unique1 = 1) -> Materialize -> Remote Subquery Scan on all Distribute results by S: hundred @@ -114,8 +114,8 @@ where t1.unique1 = 1; -> Nested Loop Left Join -> Remote Subquery Scan on all Distribute results by S: hundred - -> Seq Scan on tenk1_s t1 - Filter: (unique1 = 1) + -> Index Scan using unique1_s on tenk1_s t1 + Index Cond: (unique1 = 1) -> Materialize -> Remote Subquery Scan on all Distribute results by S: hundred diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 7f9570b4..1089bd60 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,14 +1641,14 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.16 rows=1 width=40) - -> Nested Loop Semi Join (cost=100.00..121.16 rows=1 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.55 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..142.55 rows=1 width=40) Join Filter: (t1.c = t2.c) - -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..110.55 rows=4 width=4) - -> Remote Subquery 
Scan on all (datanode_1) (cost=100.00..110.53 rows=4 width=4) - -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) + -> Materialize (cost=100.00..121.09 rows=8 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=8 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) Filter: (id = 1) (9 rows) From 2183c18380f0e2324c1368d9c1439485d71a40d2 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 17:43:58 +0800 Subject: [PATCH 555/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 add regress test case --- src/backend/commands/copy.c | 3 ++- src/backend/executor/spi.c | 4 ++-- src/pl/plpgsql/src/pl_exec.c | 6 ++++++ src/test/regress/expected/plpgsql_1.out | 25 +++++++++++++++++++++++++ src/test/regress/sql/plpgsql.sql | 18 ++++++++++++++++++ 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 8bf02419..5600bbf2 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -5039,7 +5039,8 @@ CopyReadLine(CopyState cstate) cstate->line_buf.data[cstate->line_buf.len] = '\0'; break; case EOL_UNKNOWN: - /* shouldn't get here */ + /* shouldn't get here except we are transform from insert */ + if (!cstate->internal_mode) Assert(false); break; } diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 510f1fcb..416f3d3a 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2107,7 +2107,7 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, * TODO: now we don't support param, if multi values contains paramref, do not * transform to CopyStmt, refactor later */ - if (plansource->insert_into && plansource->raw_parse_tree != NULL && + if (g_transform_insert_to_copy && plansource->insert_into && plansource->raw_parse_tree != NULL && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { bool suc; @@ -2896,7 +2896,7 @@ static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, * set insert_into when we get multi-values insert, not * often happen */ - if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + if (unlikely(g_transform_insert_to_copy && parse->isMultiValues && !parse->hasUnshippableTriggers)) { MemoryContext old_ctx; InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 5c232317..74eb8d76 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -29,6 +29,7 @@ #include "optimizer/planner.h" #include "parser/parse_coerce.h" #include "parser/scansup.h" +#include "parser/analyze.h" #include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/array.h" @@ -3689,6 +3690,11 @@ exec_stmt_execsql(PLpgSQL_execstate *estate, q->commandType == CMD_UPDATE || q->commandType == CMD_DELETE) stmt->mod_stmt = true; + + /* when transform insert to copy, reset mod_stmt */ + if (g_transform_insert_to_copy && q->commandType == CMD_INSERT && + q->isMultiValues && !q->hasUnshippableTriggers) + stmt->mod_stmt = false; /* PGXCTODO: Support a better parameter interface for XC with DMLs */ if #ifdef XCP diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index 20f49e11..a686a7e8 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -6184,3 +6184,28 @@ SELECT * FROM list_partitioned_table() AS t; 
2 (2 rows) +set transform_insert_to_copy to on; +create table multi_itb(f1 int,f2 int); +create or replace function insert_mul () returns text as +$$ +begin + insert into multi_itb values(1,1),(2,2); + return 'ok'; +end; +$$ +language plpgsql; +select insert_mul(); + insert_mul +------------ + ok +(1 row) + +select * from multi_itb order by f1; + f1 | f2 +----+---- + 1 | 1 + 2 | 2 +(2 rows) + +drop table multi_itb; +set transform_insert_to_copy to off; diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql index a614da36..4a7c4ab3 100644 --- a/src/test/regress/sql/plpgsql.sql +++ b/src/test/regress/sql/plpgsql.sql @@ -4921,3 +4921,21 @@ BEGIN END; $$ LANGUAGE plpgsql; SELECT * FROM list_partitioned_table() AS t; + +set transform_insert_to_copy to on; + +create table multi_itb(f1 int,f2 int); + +create or replace function insert_mul () returns text as +$$ +begin + insert into multi_itb values(1,1),(2,2); + return 'ok'; +end; +$$ +language plpgsql; + +select insert_mul(); +select * from multi_itb order by f1; +drop table multi_itb; +set transform_insert_to_copy to off; From 5533f0ae1d02d73bc88718fdf0f16f787aa5c964 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 20:30:29 +0800 Subject: [PATCH 556/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 reformat code --- src/backend/nodes/copyfuncs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e2e6b7fc..3cc64b1d 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3320,7 +3320,8 @@ _copyInsertStmt(const InsertStmt *from) { newnode->data_list = (char ***)palloc(sizeof(char **) * from->ndatarows); - for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) { + for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) + { newnode->data_list[rowIdx] = (char **)palloc(sizeof(char *) * from->ninsert_columns); for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) From 54b0b4741989bf0758bcdec1e80e961c22eef46f Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Apr 2022 11:46:47 +0800 Subject: [PATCH 557/578] fix 2 warnings --- src/backend/nodes/readfuncs.c | 3 ++- src/backend/parser/parse_func.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 28cd7dbd..2730b6f3 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -573,7 +573,8 @@ _readQuery(void) READ_BOOL_FIELD(hasForUpdate); READ_BOOL_FIELD(hasRowSecurity); token = pg_strtok(&length); /* get :fldname hasRowSecurity or cteList */ - if (strncmp(nullable_string(token, length), ":hasCoordFuncs", length) == 0) + Assert(length != 0); + if (strncmp(debackslash(token, length), ":hasCoordFuncs", length) == 0) { token = pg_strtok(&length); /* get field value */ local_node->hasCoordFuncs = strtobool(token); diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index ac9fc9c0..b5d44478 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -2265,7 +2265,6 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) bool func_is_pullup(Oid func_id) { - char *name = NULL; if (func_id >= FirstNormalObjectId) { Oid func_lang_oid; From ca57eab6c2e1468a741bf0c812e1e4a7766f235c Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 25 Apr 2022 12:00:28 +0800 Subject: [PATCH 
558/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 Check NULL in data_list, when copy it from one InsertStmt to another. And add test case for this scenario. --- src/backend/executor/spi.c | 5 +++++ src/backend/nodes/copyfuncs.c | 5 +++++ src/test/regress/expected/plpgsql_1.out | 22 ++++++++++++++++++++++ src/test/regress/sql/plpgsql.sql | 12 ++++++++++++ 4 files changed, 44 insertions(+) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 416f3d3a..dbad0299 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2914,9 +2914,14 @@ static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, pStmt->data_list[rowIdx] = (char **)palloc( sizeof(char *) * iStmt->ninsert_columns); for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + { + if (iStmt->data_list[rowIdx][colIdx] == NULL) + pStmt->data_list[rowIdx][colIdx] = NULL; + else pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); } } + } pStmt->ndatarows = iStmt->ndatarows; pStmt->ninsert_columns = iStmt->ninsert_columns; MemoryContextSwitchTo(old_ctx); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 3cc64b1d..475cb4ee 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3325,10 +3325,15 @@ _copyInsertStmt(const InsertStmt *from) newnode->data_list[rowIdx] = (char **)palloc(sizeof(char *) * from->ninsert_columns); for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) + { + if(from->data_list[rowIdx][colIdx] == NULL) + newnode->data_list[rowIdx][colIdx] = NULL; + else newnode->data_list[rowIdx][colIdx] = pstrdup(from->data_list[rowIdx][colIdx]); } } + } COPY_SCALAR_FIELD(ndatarows); #endif return newnode; diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index a686a7e8..3ae9212f 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -6194,6 +6194,14 @@ begin end; $$ language plpgsql; +create or replace function insert_mul_null () returns text as +$$ +begin + insert into multi_itb values(1,null),(2,null); + return 'ok'; +end; +$$ +language plpgsql; select insert_mul(); insert_mul ------------ @@ -6207,5 +6215,19 @@ select * from multi_itb order by f1; 2 | 2 (2 rows) +truncate multi_itb; +select insert_mul_null(); + insert_mul_null +----------------- + ok +(1 row) + +select * from multi_itb order by f1; + f1 | f2 +----+---- + 1 | + 2 | +(2 rows) + drop table multi_itb; set transform_insert_to_copy to off; diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql index 4a7c4ab3..3f0a9757 100644 --- a/src/test/regress/sql/plpgsql.sql +++ b/src/test/regress/sql/plpgsql.sql @@ -4935,7 +4935,19 @@ end; $$ language plpgsql; +create or replace function insert_mul_null () returns text as +$$ +begin + insert into multi_itb values(1,null),(2,null); + return 'ok'; +end; +$$ +language plpgsql; + select insert_mul(); select * from multi_itb order by f1; +truncate multi_itb; +select insert_mul_null(); +select * from multi_itb order by f1; drop table multi_itb; set transform_insert_to_copy to off; From 397be63b070ca7b849f18b3024f072dc8005e0d6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Apr 2022 12:49:24 +0800 Subject: [PATCH 559/578] Fix bug of setting pstate->p_hasCoordFuncs and sync regress --- src/backend/parser/parse_func.c | 2 +- src/test/regress/expected/pl_bugs.out | 2 
+- src/test/regress/expected/polymorphism.out | 5 +++++ src/test/regress/sql/pl_bugs.sql | 2 +- src/test/regress/sql/updatable_views.sql | 2 ++ 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index b5d44478..dd7e8a00 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -255,7 +255,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, cancel_parser_errposition_callback(&pcbstate); - pstate->p_hasCoordFuncs = func_is_pullup(funcid); + pstate->p_hasCoordFuncs = pstate->p_hasCoordFuncs ? true : func_is_pullup(funcid); if (fdresult == FUNCDETAIL_COERCION) { diff --git a/src/test/regress/expected/pl_bugs.out b/src/test/regress/expected/pl_bugs.out index 0930dd68..dea24fcd 100644 --- a/src/test/regress/expected/pl_bugs.out +++ b/src/test/regress/expected/pl_bugs.out @@ -4,7 +4,7 @@ set enable_oracle_compatible to on; -- -- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun -- -CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric pushdown LANGUAGE plpgsql AS $$ declare v_netvalue text; diff --git a/src/test/regress/expected/polymorphism.out b/src/test/regress/expected/polymorphism.out index b37872fa..d3a86fb7 100644 --- a/src/test/regress/expected/polymorphism.out +++ b/src/test/regress/expected/polymorphism.out @@ -554,6 +554,11 @@ select case when $1 then $2 else $3 end $$ language sql; -- Note this would fail with integer overflow, never mind wrong bleat() output, -- if the CASE expression were not successfully inlined select f1, sql_if(f1 > 0, bleat(f1), bleat(f1 + 1)) from (select * from int4_tbl order by f1) q order by 1, 2; +NOTICE: bleat -2147483646 +NOTICE: bleat -123455 +NOTICE: bleat 1 +NOTICE: bleat 123456 +NOTICE: bleat 2147483647 f1 | sql_if -------------+------------- -2147483647 | -2147483646 diff --git a/src/test/regress/sql/pl_bugs.sql b/src/test/regress/sql/pl_bugs.sql index 0059dc90..0ac392e0 100644 --- a/src/test/regress/sql/pl_bugs.sql +++ b/src/test/regress/sql/pl_bugs.sql @@ -7,7 +7,7 @@ set enable_oracle_compatible to on; -- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun -- -CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric pushdown LANGUAGE plpgsql AS $$ declare v_netvalue text; diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index f0c092c8..6c984268 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -1020,10 +1020,12 @@ EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 From 
fb04c62519acf4e42e7e8075d4d0d8397ab156f6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 26 Apr 2022 10:14:03 +0800 Subject: [PATCH 560/578] Revert "support wal sender proxy on cn (merge request 1183), http://tapd.woa.com/20421696/prong/stories/view/1020421696872688189" This reverts commit 39bf77a82a18fabcb6a5e3f4911c06ea4d9b3559. --- src/backend/access/common/printtup.c | 2 +- src/backend/pgxc/pool/execRemote.c | 323 +-------------------------- src/backend/pgxc/pool/pgxcnode.c | 66 ------ src/backend/postmaster/pgstat.c | 8 - src/backend/postmaster/postmaster.c | 39 ---- src/backend/replication/walsender.c | 3 - src/backend/tcop/postgres.c | 278 ----------------------- src/backend/utils/misc/guc.c | 8 - src/backend/utils/misc/ps_status.c | 23 -- src/include/pgstat.h | 1 - src/include/pgxc/execRemote.h | 3 - src/include/pgxc/pgxc.h | 2 - src/include/pgxc/pgxcnode.h | 8 - src/include/postgres.h | 5 - src/include/replication/walsender.h | 7 - src/include/utils/ps_status.h | 2 - 16 files changed, 2 insertions(+), 776 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 3c12980a..fa66df73 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -228,7 +228,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats) * Send the type name from a Postgres-XC backend node. * This preserves from OID inconsistencies as architecture is shared nothing. */ - if (IsConnFromCoord() && !IsConnFromProxy()) + if (IsConnFromCoord()) { char *typename; typename = get_typenamespace_typename(atttypid); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c3ee221a..c37ac46e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -30,7 +30,6 @@ #include "executor/executor.h" #include "gtm/gtm_c.h" #include "libpq/libpq.h" -#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgxc/execRemote.h" #include "tcop/tcopprot.h" @@ -157,7 +156,6 @@ static void pgxc_connections_cleanup(ResponseCombiner *combiner); static bool determine_param_types(Plan *plan, struct find_params_context *context); -static int handle_reply_msg_on_proxy(PGXCNodeHandle *conn); #define REMOVE_CURR_CONN(combiner) \ if ((combiner)->current_conn < --((combiner)->conn_count)) \ @@ -3028,17 +3026,7 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, while (i < count) { int32 nbytes = 0; - int result = 0; - - if (am_proxy_for_dn) - { - result = handle_response_on_proxy(to_receive[i], combiner); - } - else - { - result = handle_response(to_receive[i], combiner); - } - + int result = handle_response(to_receive[i], combiner); #ifdef __TBASE__ #ifdef _PG_REGRESS_ elog(LOG, "Received response %d on connection to node %s", @@ -13105,313 +13093,4 @@ SetSnapshot(EState *state) return result; } - -/* - * Reveive dn message on proxy. - * Forward the dn message to client and forward the client reply message to dn. 
- */ -int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle) -{ - int result = 0; - ResponseCombiner combiner; - - struct timeval timeout; - timeout.tv_sec = 1; - timeout.tv_usec = 0; - - MemSet(&combiner, 0, sizeof(ResponseCombiner)); - - InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); - - /* Receive responses */ - result = pgxc_node_receive_responses(1, &handle, &timeout, &combiner); - if (result != 0) - { - elog(LOG, "Proxy receive responses result is %d", result); - return result; - } - - CloseCombiner(&combiner); - return result; -} - -/* - * Handle reply message on proxy. - * Forward the client reply message to dn. - */ -int handle_reply_msg_on_proxy(PGXCNodeHandle *conn) -{ - int ret = 0; - unsigned char firstchar; - StringInfoData msg; - - Assert(IS_PGXC_COORDINATOR); - - initStringInfo(&msg); - - for (;;) - { - pq_startmsgread(); - ret = pq_getbyte_if_available(&firstchar); - if (ret < 0) - { - /* Unexpected error or EOF */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); - } - - if (ret == 0) - { - /* No data available without blocking */ - pq_endmsgread(); - break; - } - - /* Read the message contents */ - if (pq_getmessage(&msg, 0)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); - } - - elog(DEBUG2, "%s proxy firstchar is %c(%d), reply message length: %d", - proxy_for_dn, firstchar, firstchar, msg.len); - - ret = pgxc_node_send_on_proxy(conn, firstchar, &msg); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("%s proxy send reply message error: %d", - proxy_for_dn, ret))); - } - - /* Handle the very limited subset of commands expected in this phase */ - switch (firstchar) - { - /* - * 'd' means a client reply message. - */ - case 'd': - break; - - /* - * 'c' means the client requested to finish streaming. - */ - case 'c': - elog(LOG, "%s proxy: reply message type %c(%d), " - "the client requested to finish streaming", - proxy_for_dn, firstchar, firstchar); - - /* When replicate stream is closed, set stream_closed to true */ - conn->stream_closed = true; - - break; - - /* - * 'X' means the client is closing down the socket. - */ - case 'X': - elog(LOG, "%s proxy: reply message type %c(%d), " - "the client is closing down the socket", - proxy_for_dn, firstchar, firstchar); - - proc_exit(0); - - default: - elog(FATAL, "%s proxy: unexpected message type %c(%d), length: %d", - proxy_for_dn, firstchar, firstchar, msg.len); - break; - } - } - - return ret; -} - -/* - * Read next message from the connection and update - * connection state accordingly on the proxy - * If we are in an error state we just consume the messages, and do not proxy - * Long term, we should look into cancelling executing statements - * and closing the connections. - * It returns if states need to be handled - * Return values: - * RESPONSE_EOF - need to receive more data for the connection - * RESPONSE_READY - got ReadyForQuery - * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. 
- * Also this result is output in case of error - * RESPONSE_TUPLEDESC - got tuple description - * RESPONSE_DATAROW - got data row - */ -int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner) -{ - char *msg; - int msg_len; - char msg_type; - int ret = 0; - StringInfoData buf; - - /* proxy must be cn */ - Assert(IS_PGXC_COORDINATOR); - - /* proxy must be not in extended query */ - Assert(!conn->in_extended_query); - Assert(!combiner->extended_query); - - for (;;) - { - /* - * If we are in the process of shutting down, we - * may be rolling back, and the buffer may contain other messages. - * We want to avoid a procarray exception - * as well as an error stack overflow. - */ - if (proc_exit_inprogress) - { - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); - } - - /* - * Don't read from from the connection if there is a fatal error. - * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since - * Handling of RESPONSE_ERROR assumes sending SYNC message, but - * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is - * not usable. - */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - return RESPONSE_COMPLETE; - } - - ret = handle_reply_msg_on_proxy(conn); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Handle reply message on proxy for %s error: %d", - proxy_for_dn, ret))); - } - - /* No data available, exit */ - if (!HAS_MESSAGE_BUFFERED(conn)) - return RESPONSE_EOF; - - Assert(conn->combiner == combiner || conn->combiner == NULL); - - msg_type = get_message(conn, &msg_len, &msg); - elog(DEBUG1, "handle_response_on_proxy - received message %c, node %s, " - "current_state %d", msg_type, conn->nodename, conn->state); - - /* - * Add some protection code when receiving a messy message, - * close the connection, and throw error - */ - if (msg_len < 0) - { - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); - - elog(LOG, "handle_response_on_proxy, fatal_conn=%p, " - "fatal_conn->nodename=%s, fatal_conn->sock=%d, " - "fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, " - "fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, " - "fatal_conn->error=%s", conn, conn->nodename, conn->sock, - conn->read_only, conn->transaction_status, - conn->sock_fatal_occurred, conn->backend_pid, conn->error); - - closesocket(conn->sock); - conn->sock = NO_SOCKET; - conn->sock_fatal_occurred = true; - - elog(LOG, "Received messy message from node:%s host:%s port:%d pid:%d, " - "inBuffer:%p inSize:%lu inStart:%lu inEnd:%lu inCursor:%lu " - "msg_len:%d, This probably means the remote node terminated " - "abnormally before or while processing the request.", - conn->nodename, conn->nodehost, conn->nodeport, conn->backend_pid, - conn->inBuffer, conn->inSize, conn->inStart, conn->inEnd, - conn->inCursor, msg_len); - - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy: handle_response_on_proxy - received message " - "length %d, type %c, node %s, current_state %d", - msg_len, msg_type, conn->nodename, conn->state))); - } - - if (msg_type == '\0') - { - /* Not enough data in the buffer */ - return RESPONSE_EOF; - } - - if (conn->stream_closed && msg_type == 'd') - { - /* When replicate stream is closed, skip 'd' message */ - elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " - "type %c, length %d, node %s, current_state %d, remote pid %d, skip", - msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); - continue;; - } - - conn->last_command = msg_type; 
- - elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " - "type %c, length %d, node %s, current_state %d, remote pid %d", - msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); - - /* Send message to client */ - pq_beginmessage(&buf, msg_type); - pq_sendbytes(&buf, msg, msg_len); - pq_endmessage(&buf); - pq_flush(); - - switch (msg_type) - { - case 'c': /* CopyToCommandComplete */ - break; - - case 'C': /* CommandComplete */ - conn->combiner = NULL; - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); - return RESPONSE_COMPLETE; - - case 'E': /* ErrorResponse */ - HandleError(combiner, msg, msg_len, conn); - add_error_message_from_combiner(conn, combiner); - - combiner->errorNode = conn->nodename; - combiner->backend_pid = conn->backend_pid; - return RESPONSE_ERROR; - - case 'Z': /* ReadyForQuery */ - conn->transaction_status = msg[0]; - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); - conn->combiner = NULL; - return RESPONSE_READY; - - case 'T': /* RowDescription */ - return RESPONSE_TUPDESC; - - case 'D': /* DataRow */ - return RESPONSE_DATAROW; - - case 'd': /* CopyOutDataRow */ - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT); - break; - - case 'W': /* CopyBothResponse */ - /* Get a CopyBothResponse message when start streaming */ - break; - - default: - elog(DEBUG1, "Proxy received message type: %c", msg_type); - break; - } - } - - /* Never happen, but keep compiler quiet */ - return RESPONSE_EOF; -} - #endif diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 84259600..c19325a9 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2595,72 +2595,6 @@ pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_p } #endif -/* - * Send message to dn - */ -int -pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, StringInfo inBuf) -{ - /* size + len */ - int msgLen = 4 + inBuf->len; - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) - { - add_error_message(handle, "out of memory"); - return EOF; - } - - /* msg type */ - handle->outBuffer[handle->outEnd++] = firstchar; - - /* size */ - msgLen = htonl(msgLen); - memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); - handle->outEnd += 4; - - /* msg data */ - memcpy(handle->outBuffer + handle->outEnd, inBuf->data, inBuf->len); - handle->outEnd += inBuf->len; - - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - handle->in_extended_query = false; - - return pgxc_node_flush(handle); -} - -/* - * Send proxy configuration to dn - */ -int -pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag) -{ - /* size + flag */ - int msgLen = 4 + sizeof(int); - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) - { - add_error_message(handle, "out of memory"); - return EOF; - } - - /* msg type */ - handle->outBuffer[handle->outEnd++] = 'w'; - - /* size */ - msgLen = htonl(msgLen); - memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); - handle->outEnd += 4; - - /* flag */ - flag = htonl(flag); - memcpy(handle->outBuffer + handle->outEnd, &flag, sizeof(int)); - handle->outEnd += sizeof(int); - - return pgxc_node_flush(handle); -} - /* * Send series of Extended Query protocol messages to the data node */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index cfeea974..1286cd1d 100644 --- a/src/backend/postmaster/pgstat.c +++ 
b/src/backend/postmaster/pgstat.c @@ -2913,11 +2913,6 @@ pgstat_bestart(void) /* Clean 2pc Worker */ beentry->st_backendType = B_CLEAN_2PC_WORKER; } - else if (am_proxy_for_dn) - { - /* Proxy for dn */ - beentry->st_backendType = B_PROXY_FOR_DN; - } else if (am_walsender) { /* Wal sender */ @@ -4213,9 +4208,6 @@ pgstat_get_backend_desc(BackendType backendType) case B_CLEAN_2PC_WORKER: backendDesc = "2pc clean worker"; break; - case B_PROXY_FOR_DN: - backendDesc = "proxy for dn"; - break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7d6d230b..10be77cd 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -162,8 +162,6 @@ #include "audit/audit_fga.h" #endif -#define PS_DISPLAY_MAX_LENGTH 256 /* process display max length */ - /* * Possible types of a backend. Beyond being the possible bkend_type values in * struct bkend, these are OR-able request flag bits for SignalSomeChildren() @@ -2389,20 +2387,6 @@ ProcessStartupPacket(Port *port, bool SSLdone) valptr), errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); } - else if (strcmp(nameptr, "proxy_for_dn") == 0) - { - if (!IS_PGXC_COORDINATOR) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"%s\" only support on cn", nameptr))); - } - - elog(LOG, "Proxy for dn: %s", valptr); - - am_proxy_for_dn = true; - proxy_for_dn = pstrdup(valptr); - } else { /* Assume it's a generic GUC option */ @@ -4956,35 +4940,12 @@ BackendInitialize(Port *port) * as dbname to init_ps_display(). XXX: should add a new variant of * init_ps_display() to avoid abusing the parameters like this. */ - if (am_proxy_for_dn) - { - char proxy_display[PS_DISPLAY_MAX_LENGTH]; if (am_walsender) - { - snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, - "wal sender proxy for %s", proxy_for_dn); - } - else - { - snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, - "proxy for %s", proxy_for_dn); - } - init_ps_display(proxy_display, port->user_name, remote_ps_data, - update_process_title ? "authentication" : ""); - } - else - { - if (am_walsender) - { init_ps_display("wal sender process", port->user_name, remote_ps_data, update_process_title ? "authentication" : ""); - } else - { init_ps_display(port->user_name, port->database_name, remote_ps_data, update_process_title ? "authentication" : ""); - } - } /* * Disable the timeout, and prevent SIGTERM/SIGQUIT again. diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 464cfcd9..4b46d9c8 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3327,10 +3327,7 @@ WalSndSignals(void) pqsignal(SIGINT, StatementCancelHandler); /* query cancel */ pqsignal(SIGTERM, die); /* request shutdown */ pqsignal(SIGQUIT, quickdie); /* hard crash time */ - if (!IsConnFromProxy()) - { InitializeTimeouts(); /* establishes SIGALRM handler */ - } pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b075a7e8..abab8378 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -124,9 +124,6 @@ #include "replication/worker_internal.h" #endif -char *proxy_for_dn = NULL; /* Proxy for which dn? */ -bool am_proxy_for_dn = false; /* Am I a proxy for dn? */ -bool am_conn_from_proxy = false; /* Am I connected from proxy? 
*/ extern int optind; @@ -254,13 +251,6 @@ static void replace_null_with_blank(char *src, int length); static bool NeedResourceOwner(const char *stmt_name); #endif -static PGXCNodeHandle * -get_handle_on_proxy(void); -static PGXCNodeHandle * -handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg); -void -set_flag_from_proxy(int flag, const char *username); - #ifdef __COLD_HOT__ /* * Release memory alloc in TopMemoryContext and only used in single Session. @@ -665,7 +655,6 @@ SocketBackend(StringInfo inBuf) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; - case 'w': /* Set connected by proxy */ #ifdef PGXC /* PGXC_DATANODE */ #ifdef __TBASE__ case 'N': @@ -4814,8 +4803,6 @@ PostgresMain(int argc, char *argv[], volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; - PGXCNodeHandle *proxy_conn = NULL; - #ifdef PGXC /* PGXC_DATANODE */ /* Snapshot info */ TransactionId xmin PG_USED_FOR_ASSERTS_ONLY; @@ -5527,12 +5514,6 @@ PostgresMain(int argc, char *argv[], } #endif /* XCP */ - if (am_proxy_for_dn) - { - proxy_conn = handle_request_msg_on_proxy(proxy_conn, firstchar, &input_message); - continue; - } - switch (firstchar) { case 'Q': /* simple query */ @@ -6147,18 +6128,6 @@ PostgresMain(int argc, char *argv[], } break; #endif - case 'w': /* Set connected by proxy */ - { - int flag = 0; - - Assert(input_message.len == 4); - - flag = pq_getmsgint(&input_message, 4); - pq_getmsgend(&input_message); - - set_flag_from_proxy(flag, username); - } - break; default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -6435,251 +6404,4 @@ IsExtendedQuery(void) { return doing_extended_query_message; } - -/* - * Get a dn connection on proxy - */ -PGXCNodeHandle * -get_handle_on_proxy(void) -{ - PGXCNodeHandle *conn = NULL; - char node_type = PGXC_NODE_DATANODE; - Oid node_oid = InvalidOid; - int node_id = -1; - int flag = 0; - PGXCNodeAllHandles *handles = NULL; - List *dnList = NIL; - int ret = 0; - - Assert(IS_PGXC_COORDINATOR); - - /* Get dn oid */ - StartTransactionCommand(); - InitMultinodeExecutor(false); - node_oid = get_pgxc_nodeoid(proxy_for_dn); - CommitTransactionCommand(); - - if (node_oid == InvalidOid) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("Unknow dn: %s, oid is invalid", proxy_for_dn))); - } - - /* Get dn id */ - node_id = PGXCNodeGetNodeId(node_oid, &node_type); - if (node_id == -1) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("Unknow dn: %s, oid: %d, id: -1", proxy_for_dn, node_oid))); - } - - elog(LOG, "Proxy for dn %s, node oid %d, node id %d", - proxy_for_dn, node_oid, node_id); - - /* Get dn connection */ - dnList = lappend_int(dnList, node_id); - Assert(list_length(dnList) == 1); - handles = get_handles(dnList, NIL, false, false, true); - if (handles == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Get connections failed for %s", proxy_for_dn))); - - } - if (handles->dn_conn_count == 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Get 0 connection for %s", proxy_for_dn))); - } - - Assert(handles->co_conn_count == 0); - Assert(handles->dn_conn_count == 1); - - conn = handles->datanode_handles[0]; - Assert(conn != NULL); - - pfree_pgxc_all_handles(handles); - handles = NULL; - - /* Set dn process */ - if (am_walsender) - { - flag |= FLAG_AM_WALSENDER; - if (am_db_walsender) - { - flag |= FLAG_AM_DB_WALSENDER; - } - } - ret = 
pgxc_node_send_proxy_flag(conn, flag); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy send flag to %s error: %d", proxy_for_dn, ret))); - } - - return conn; -} - -/* - * Forward client request command to dn and receive response - */ -PGXCNodeHandle * -handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg) -{ - int ret = 0; - - Assert(IS_PGXC_COORDINATOR); - - if (conn == NULL) - { - conn = get_handle_on_proxy(); - } - - Assert(conn != NULL); - - /* Before query, replicate stream is not closed, set stream_closed to false */ - conn->stream_closed = false; - - if (firstchar == 'Q') - { - const char *query_string = pq_getmsgstring(input_msg); - pq_getmsgend(input_msg); - debug_query_string = query_string; - } - - elog(DEBUG1, "Proxy: firstchar is %c(%d)", firstchar, firstchar); - - /* Send message */ - ret = pgxc_node_send_on_proxy(conn, firstchar, input_msg); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy send request to %s error: %d", proxy_for_dn, ret))); - } - - switch (firstchar) - { - /* - * 'X' means that the frontend is closing down the socket. EOF - * means unexpected loss of frontend connection. Either way, - * perform normal shutdown. - */ - case 'X': - case EOF: - /* - * Reset whereToSendOutput to prevent ereport from attempting - * to send any more messages to client. - */ - if (whereToSendOutput == DestRemote) - { - elog(LOG, "Set whereToSendOutput from %d to %d", - whereToSendOutput, DestNone); - whereToSendOutput = DestNone; - } - - /* Destroy the dn connection on proxy */ - PoolManagerDisconnect(); - - /* - * NOTE: if you are tempted to add more code here, DON'T! - * Whatever you had in mind to do should be set up as an - * on_proc_exit or on_shmem_exit callback, instead. Otherwise - * it will fail to be called during other backend-shutdown - * scenarios. 
- */ - proc_exit(0); - - default: - break; - } - - /* Receive message */ - ret = pgxc_node_receive_on_proxy(conn); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy receive from %s error: %d", proxy_for_dn, ret))); - } - - debug_query_string = NULL; - - return conn; -} - -/* - * Set flag from proxy - */ -void -set_flag_from_proxy(int flag, const char *username) -{ - if (am_conn_from_proxy) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("It is connected from proxy already"))); - } - - am_conn_from_proxy = true; - - elog(LOG, "It is connected from proxy"); - - if (am_walsender) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("It is a wal sender already"))); - } - - if (flag & FLAG_AM_WALSENDER) - { - am_walsender = true; - if (flag & FLAG_AM_DB_WALSENDER) - { - am_db_walsender = true; - } - } - - elog(LOG, "Set wal sender: am_walsender(%d), am_db_walsender(%d)", - am_walsender, am_db_walsender); - - if (am_walsender) - { - int fixed_len = 0; - const char *fixed = get_ps_display_fixed(&fixed_len); - char fixed_buf[fixed_len + 1]; - char *display = NULL; - - if (fixed_len != 0) - { - Assert (fixed != NULL); - - snprintf(fixed_buf, fixed_len, "%s", fixed); - fixed_buf[fixed_len] = '\0'; - - display = strstr(fixed_buf, username); - Assert (display != NULL); - - init_ps_display("wal sender used by proxy", display, "", ""); - } - else - { - elog(WARNING, "Get ps display fixed length is 0"); - - init_ps_display("wal sender used by proxy", "", "", ""); - } - - IsNormalPostgres = false; - - WalSndSignals(); - InitWalSender(); - } -} - #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8b7af537..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -7305,14 +7305,6 @@ ResetAllOptions(void) {// #lizard forgives int i; - if (am_walsender) - { - /* never be here */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("RESET ALL is forbidden on wal sender"))); - } - for (i = 0; i < num_guc_variables; i++) { struct config_generic *gconf = guc_variables[i]; diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index 51f668d1..06f6c857 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -417,26 +417,3 @@ get_ps_display(int *displen) return ps_buffer + ps_buffer_fixed_size; } - -/* - * Returns the fixed part in the ps display, in case someone needs - * it. Note that only the fixed part is returned. - * The string will not be null-terminated, so return the effective - * length into *fixlen. 
- */ -const char * -get_ps_display_fixed(int *fixlen) -{ -#ifdef PS_USE_CLOBBER_ARGV - /* If ps_buffer is a pointer, it might still be null */ - if (!ps_buffer) - { - *fixlen = 0; - return ""; - } -#endif - - *fixlen = (int) ps_buffer_fixed_size; - - return ps_buffer; -} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 6c4c5886..7976c39b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -722,7 +722,6 @@ typedef enum BackendType B_PGXL_POOLER, B_CLEAN_2PC_LAUNCHER, B_CLEAN_2PC_WORKER, - B_PROXY_FOR_DN, } BackendType; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index baa30f65..7047d510 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -384,7 +384,6 @@ extern void ExecRemoteUtility(RemoteQuery *node); extern bool is_data_node_ready(PGXCNodeHandle * conn); extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner); -extern int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner); extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); @@ -477,8 +476,6 @@ extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** c extern bool validate_combiner(ResponseCombiner *combiner); #endif -extern int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle); - #ifdef __TWO_PHASE_TRANS__ extern char *get_nodelist(char * prepareGID, bool localNode, bool implicit); extern void InitLocalTwoPhaseState(void); diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 370882dd..687be6c8 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -134,8 +134,6 @@ extern Datum xc_lockForBackupKey2; #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) -#define IsConnFromProxy() (am_conn_from_proxy) - /* key pair to be used as object id while using advisory lock for backup */ #define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF #define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index d69aa7f1..f0e7c269 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -123,9 +123,6 @@ struct pgxc_node_handle bool in_extended_query; bool needSync; /* set when error and extend query. */ - - bool stream_closed; /* Whether replicate stream is closed on proxy? */ - #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ char last_command; /*last command we processed. */ @@ -220,11 +217,6 @@ extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif - -extern int pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag); -extern int pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, - StringInfo inBuf); - #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif diff --git a/src/include/postgres.h b/src/include/postgres.h index c35967b0..bee66144 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -71,11 +71,6 @@ #define EXTENT_FIRST_BLOCKNUMBER(eid) ((eid)*PAGES_PER_EXTENTS) #endif -extern char *proxy_for_dn; /* Proxy for which dn? */ -extern bool am_proxy_for_dn; /* Am I a proxy for dn? */ -extern bool am_conn_from_proxy; /* Am I connected from proxy? 
*/ - - /* ---------------------------------------------------------------- * Section 1: variable-length datatypes (TOAST support) * ---------------------------------------------------------------- diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index a3b7876c..1f20db82 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -16,13 +16,6 @@ #include "fmgr.h" -#define FLAG_AM_WALSENDER 0x01 /* Flag to set am_walsender(Am I a walsender process?) */ -#define FLAG_AM_DB_WALSENDER 0x02 /* Flag to set am_db_walsender(Am I a - walsender process and connected to - a database? - Yes: used for logical replicate. - No: used for physical replicate. */ - /* * What to do with a snapshot in create replication slot command. */ diff --git a/src/include/utils/ps_status.h b/src/include/utils/ps_status.h index 097474c5..2ba5a0ea 100644 --- a/src/include/utils/ps_status.h +++ b/src/include/utils/ps_status.h @@ -23,6 +23,4 @@ extern void set_ps_display(const char *activity, bool force); extern const char *get_ps_display(int *displen); -extern const char *get_ps_display_fixed(int *displen); - #endif /* PS_STATUS_H */ From a7e94a89f47fea1588d6a457d1dbdc27ec21d45b Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Sat, 6 Apr 2019 15:23:37 +0900 Subject: [PATCH 561/578] Add support TCP user timeout in libpq and the backend server Similarly to the set of parameters for keepalive, a connection parameter for libpq is added as well as a backend GUC, called tcp_user_timeout. Increasing the TCP user timeout is useful to allow a connection to survive extended periods without end-to-end connection, and decreasing it allows application to fail faster. By default, the parameter is 0, which makes the connection use the system default, and follows a logic close to the keepalive parameters in its handling. When connecting through a Unix-socket domain, the parameters have no effect. 
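As a usage sketch only (not part of the diff below, and assuming the new setting behaves like other userset parameters), the timeout can be exercised from SQL; the value is stored in milliseconds (GUC_UNIT_MS) and 0 keeps the operating system default:

    SET tcp_user_timeout = '10s';   -- accepts time units; stored as 10000 ms
    SHOW tcp_user_timeout;          -- reports the effective setting
    RESET tcp_user_timeout;         -- back to 0, i.e. the system default

On the libpq side the same keyword is accepted as a connection-string option (for example tcp_user_timeout=10000), per the description above; it has no effect over Unix-domain sockets.
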
Author: Ryohei Nagaura Reviewed-by: Fabien Coelho, Robert Haas, Kyotaro Horiguchi, Kirk Jamison, Mikalai Keida, Takayuki Tsunakawa, Andrei Yahorau Discussion: https://postgr.es/m/EDA4195584F5064680D8130B1CA91C45367328@G01JPEXMBYT04 --- src/backend/utils/misc/guc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..e38bcbba 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4964,6 +4964,17 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 29754107356f9d38197728e9f8a9368fbcb69e2d Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Mar 2022 21:34:13 +0800 Subject: [PATCH 562/578] fix hang when pqcancel http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131097019641 --- src/backend/libpq/pqcomm.c | 1 + src/backend/utils/misc/guc.c | 11 ----------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index db3b1ea1..132b85b2 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2072,6 +2072,7 @@ SetSockKeepAlive(int sock) elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } } +} int pq_gettcpusertimeout(Port *port) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e38bcbba..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4964,17 +4964,6 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, - { - {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, - gettext_noop("TCP user timeout."), - gettext_noop("A value of 0 uses the system default."), - GUC_UNIT_MS - }, - &tcp_user_timeout, - 0, 0, INT_MAX, - NULL, assign_tcp_user_timeout, show_tcp_user_timeout - }, - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 00f576f088e8d8286e4a1661d515e393f6431822 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 16 Mar 2022 16:40:15 +0800 Subject: [PATCH 563/578] fix pooler bug http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131099016045 --- src/backend/pgxc/pool/poolmgr.c | 127 ++++++++++--------- src/include/pgxc/poolmgr.h | 215 ++++++++++++++++---------------- 2 files changed, 179 insertions(+), 163 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 72d5786e..28b0d3c6 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -209,7 +209,7 @@ typedef struct Oid nodeoid; /* Node Oid related to this pool */ char *connstr; /* palloc memory, need free */ - time_t m_version; /* version of node pool */ + int64 m_version; /* version of node pool */ int32 size; /* total pool size */ int32 validSize; /* valid data element number */ bool failed; @@ -513,14 +513,14 @@ static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist); static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int signal); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, 
PGXCNodePool **pool,int32 nodeidx, Oid node, bool bCoord); -static void agent_release_connections(PoolAgent *agent, bool force_destroy); +static void agent_release_connections(PoolAgent *agent, bool force_destroy, bool sync); static void agent_return_connections(PoolAgent *agent); static bool agent_reset_session(PoolAgent *agent); static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, - int32 nodeidx, Oid node, bool force_destroy, bool bCoord); -static void destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int32 line); -#define destroy_slot(nodeidx, node, slot) destroy_slot_ex(nodeidx, node, slot, __FILE__, __LINE__) + int32 nodeidx, Oid node, bool force_destroy, bool bCoord, bool sync); +static void destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, bool sync, char *file, int32 line); +#define destroy_slot(nodeidx, node, slot, sync) destroy_slot_ex(nodeidx, node, slot, sync, __FILE__, __LINE__) static void close_slot(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot); @@ -572,7 +572,7 @@ static void *pooler_async_utility_thread(void *arg); static void *pooler_async_connection_management_thread(void *arg); static void *pooler_sync_remote_operator_thread(void *arg); -static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, +static bool pooler_async_build_connection(DatabasePool *pool, int64 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord); static BitmapMgr *BmpMgrCreate(uint32 objnum); static int BmpMgrAlloc(BitmapMgr *mgr); @@ -1623,7 +1623,7 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name, /* disconnect if we are still connected */ if (agent->pool) { - agent_release_connections(agent, false); + agent_release_connections(agent, false, false); } oldcontext = MemoryContextSwitchTo(agent->mcxt); @@ -1697,7 +1697,10 @@ agent_destroy(PoolAgent *agent) if (bsync) { - agent_release_connections(agent, true); + /* + * if temporary objects used for this pool session, release using synchronization + */ + agent_release_connections(agent, true, agent->is_temp); } } @@ -1758,7 +1761,7 @@ destroy_pend_agent(PoolAgent *agent) */ if (bsync) { - agent_release_connections(agent, true); + agent_release_connections(agent, true, false); } } @@ -2257,7 +2260,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) { elog(LOG, POOL_MGR_PREFIX"receive command %c from agent:%d. 
destory=%d", qtype, agent->pid, destroy); } - agent_release_connections(agent, destroy); + agent_release_connections(agent, destroy, false); } break; @@ -3968,8 +3971,8 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list, i * Release connections for Datanodes and Coordinators */ static void -agent_release_connections(PoolAgent *agent, bool force_destroy) -{// #lizard forgives +agent_release_connections(PoolAgent *agent, bool force_destroy, bool sync) +{ MemoryContext oldcontext; int i; @@ -4033,7 +4036,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) { elog(LOG, POOL_MGR_PREFIX"++++agent_release_connections pid:%d release slot_seq:%d nodename:%s backend_pid:%d++++", agent->pid, slot->seqnum, slot->node_name, slot->backend_pid); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], force_destroy, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], force_destroy, false, sync); } agent->dn_connections[i] = NULL; } @@ -4053,7 +4056,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) { elog(LOG, POOL_MGR_PREFIX"++++agent_release_connections pid:%d release slot_seq:%d nodename:%s backend_pid:%d++++", agent->pid, slot->seqnum, slot->node_name, slot->backend_pid); } - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], force_destroy, true); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], force_destroy, true, sync); } agent->coord_connections[i] = NULL; } @@ -4126,7 +4129,7 @@ agent_return_connections(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_return_connections pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false, false); } agent->dn_connections[i] = NULL; } @@ -4146,7 +4149,7 @@ agent_return_connections(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_return_connections pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, true); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, true, false); } agent->coord_connections[i] = NULL; } @@ -4216,7 +4219,7 @@ agent_reset_session(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_reset_session pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false, false); agent->dn_connections[i] = NULL; } @@ -4258,7 +4261,7 @@ agent_reset_session(PoolAgent *agent) elog(LOG, POOL_MGR_PREFIX"++++agent_reset_session pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } agent->coord_connections[i] = NULL; - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, false, false); } else @@ -4422,6 +4425,7 @@ create_database_pool(const char *database, const char *user_name, const char *pg databasePool->bneed_warm = false; databasePool->bneed_precreate = false; databasePool->bneed_pool = need_pool; + databasePool->version = 0; return databasePool; } @@ -4457,7 +4461,7 @@ reload_database_pools(PoolAgent *agent) * Release node connections if any held. 
It is not guaranteed client session * does the same so don't ever try to return them to pool and reuse */ - agent_release_connections(agent, true); + agent_release_connections(agent, true, false); /* before destory nodepool, just wait for all async task is done */ bsucceed = pooler_wait_for_async_task_done(); @@ -4518,10 +4522,10 @@ reload_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " "size:%d, freeSize:%d, reload_database_pools: nodePools " - "of node (%u, %s) has increased version %lu.", + "of node (%u, %s) has increased version "INT64_FORMAT, nodePool->connstr, nodePool->size, nodePool->freeSize, nodePool->nodeoid, nodePool->node_name, nodePool->m_version); @@ -4672,7 +4676,7 @@ acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid elog(WARNING, POOL_MGR_PREFIX"connection to node %u contains invalid fd:%d", node, fd); } } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, false); slot = NULL; /* Decrement current max pool size */ @@ -4721,8 +4725,8 @@ acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid */ static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, - int32 nodeidx, Oid node, bool force_destroy, bool bCoord) -{// #lizard forgives + int32 nodeidx, Oid node, bool force_destroy, bool bCoord, bool sync) +{ PGXCNodePool *nodePool; time_t now; @@ -4752,7 +4756,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); return; } @@ -4788,7 +4792,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, agentCount, nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); return; } @@ -4911,7 +4915,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); /* Decrease pool size */ DecreasePoolerSize(nodePool,__FILE__, __LINE__); @@ -4989,7 +4993,7 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); MemoryContextSwitchTo(oldcontext); - nodePool->m_version = time(NULL); + nodePool->m_version = dbPool->version++; elog(LOG, "grow_pool: nodePools of node (%u, %s) is created.", nodePool->nodeoid, nodePool->node_name); @@ -5038,8 +5042,8 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) * Destroy pool slot, including slot itself. 
*/ static void -destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int32 line) -{// #lizard forgives +destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, bool sync, char *file, int32 line) +{ int32 threadid = 0; uint64 pipeput_loops = 0; PGXCPoolConnectReq *connReq; @@ -5070,6 +5074,16 @@ destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int elog(LOG, POOL_MGR_PREFIX"destroy_slot invalid slot status, null pointer conn:%p xc_cancelConn:%p", slot->conn, slot->xc_cancelConn); } + if (sync) + { + /* release now if sync */ + PQfreeCancel((PGcancel *)slot->xc_cancelConn); + PGXCNodeClose(slot->conn); + slot->bdestoryed = true; + pfree(slot); + return; + } + /* if no free pipe line avaliable, just do it sync */ threadid = pooler_async_task_pick_thread(&g_PoolConnControl, nodeidx); if (-1 == threadid) @@ -5228,7 +5242,7 @@ destroy_node_pool(PGXCNodePool *node_pool) nodeidx = get_node_index_by_nodeoid(node_pool->nodeoid); for (i = 0; i < node_pool->freeSize; i++) { - destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); + destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i], false); } pfree(node_pool->slot); node_pool->size -= node_pool->freeSize; @@ -5252,7 +5266,7 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) if (PoolConnectDebugPrint) { elog(LOG, - POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:%lu " + POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:"INT64_FORMAT "agentCount is %d current size is %d, freeSize is %d, %d connections are in use", node_pool->connstr, node_pool->m_version, agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); @@ -5264,7 +5278,7 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) nodeidx = get_node_index_by_nodeoid(node_pool->nodeoid); for (i = 0; i < node_pool->freeSize; i++) { - destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); + destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i], false); node_pool->slot[i] = NULL; } node_pool->size -= node_pool->freeSize; @@ -5786,7 +5800,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name nodeidx = get_node_index_by_nodeoid(nodePool->nodeoid); for (i = 0; i < nodePool->freeSize; i++) { - destroy_slot(nodeidx, nodePool->nodeoid, nodePool->slot[i]); + destroy_slot(nodeidx, nodePool->nodeoid, nodePool->slot[i], false); nodePool->slot[i] = NULL; } } @@ -5965,7 +5979,7 @@ shrink_pool(DatabasePool *pool) nodePool->size, nodePool->freeSize); } /* connection is idle for long, close it */ - destroy_slot(nodeidx, nodePool->nodeoid, slot); + destroy_slot(nodeidx, nodePool->nodeoid, slot, false); /* reduce pool size and total number of connections */ DecreasePoolerFreesize(nodePool,__FILE__,__LINE__); @@ -6304,7 +6318,7 @@ static void pooler_handle_sync_response_queue(void) { if (connRsp->slot->conn) { - destroy_slot(connRsp->nodeindex, connRsp->nodepool->nodeoid, connRsp->slot); + destroy_slot(connRsp->nodeindex, connRsp->nodepool->nodeoid, connRsp->slot, false); } else { @@ -6335,7 +6349,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. 
*/ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord, false); } } pfree(connRsp); @@ -6388,7 +6402,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. */ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeOid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeOid, true, connRsp->bCoord, false); } else { @@ -6602,7 +6616,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. */ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord, false); } } else if (connRsp->error_flag) @@ -6709,7 +6723,7 @@ static void pooler_sync_connections_to_nodepool(void) } /* time to close the connection */ - destroy_slot(asyncInfo->nodeindex, asyncInfo->node, asyncInfo->slot); + destroy_slot(asyncInfo->nodeindex, asyncInfo->node, asyncInfo->slot, false); if (nodePool) { /* Decrement pool size */ @@ -6753,7 +6767,7 @@ static void pooler_sync_connections_to_nodepool(void) nodePool->coord = false; /* in this case, only datanode */ nodePool->nwarming = 0; nodePool->nquery = 0; - nodePool->m_version = time(NULL); + nodePool->m_version = asyncInfo->dbPool->version++; name_str = get_node_name_by_nodeoid(asyncInfo->node); if (NULL == name_str) @@ -6769,7 +6783,7 @@ static void pooler_sync_connections_to_nodepool(void) if (COMMAND_CONNECTION_WARM == asyncInfo->cmd && false == asyncInfo->slot->bwarmed) { nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot, false); /* Decrease pool size */ DecreasePoolerSize(nodePool,__FILE__, __LINE__); @@ -6803,13 +6817,15 @@ static void pooler_sync_connections_to_nodepool(void) { elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u " "nodeidx:%d nodepool size:%d freeSize:%d for unmatch " - "version, slot->m_version:%lu, nodePool->m_version:%lu", + "version, slot->m_version:"INT64_FORMAT", nodePool->m_version:"INT64_FORMAT, asyncInfo->node, nodeidx, nodePool->size, nodePool->freeSize, asyncInfo->slot->m_version, nodePool->m_version); } nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot, false); + break; } @@ -6922,8 +6938,7 @@ static void pooler_sync_connections_to_nodepool(void) errmsg(POOL_MGR_PREFIX"get node %u name failed", connRsp->nodeoid))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); - - nodePool->m_version = now; + nodePool->m_version = connRsp->dbPool->version++; elog(LOG, "pooler_sync_connections_to_nodepool: nodePools of " "node (%u, %s) is created.", @@ -6973,12 +6988,12 @@ static void pooler_sync_connections_to_nodepool(void) } else { - destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot); + destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot, false); if (PoolConnectDebugPrint) { elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, " "freeSize:%d, node:%u, MaxPoolSize:%d, " - "connRsp->m_version:%lu, nodePool->m_version:%lu", + "connRsp->m_version:"INT64_FORMAT", nodePool->m_version:"INT64_FORMAT, nodePool->size, nodePool->freeSize, 
nodePool->nodeoid, MaxPoolSize, connRsp->m_version, nodePool->m_version); @@ -7171,7 +7186,7 @@ static void pooler_async_ping_node(Oid node) /* async batch connection build */ -static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) +static bool pooler_async_build_connection(DatabasePool *pool, int64 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) { int32 threadid; uint64 pipeput_loops = 0; @@ -7533,7 +7548,7 @@ preconnect_and_warm(DatabasePool *dbPool) } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); - nodePool->m_version = time(NULL); + nodePool->m_version = dbPool->version++; elog(LOG, "preconnect_and_warm: nodePools of node (%u, %s) is created.", nodePool->nodeoid, nodePool->node_name); @@ -7564,7 +7579,7 @@ preconnect_and_warm(DatabasePool *dbPool) (errcode(ERRCODE_CONNECTION_FAILURE), errmsg(POOL_MGR_PREFIX"failed to connect to Datanode:[%s],errmsg[%s]", nodePool->connstr, PQerrorMessage((PGconn*)(slot->conn))))); nodeidx = get_node_index_by_nodeoid(nodePool->nodeoid); - destroy_slot(nodeidx, nodePool->nodeoid, slot); + destroy_slot(nodeidx, nodePool->nodeoid, slot, false); pfree((void*)dnOids); pfree((void*)success); return false; @@ -9097,7 +9112,7 @@ static inline bool dispatch_reset_request(PGXCASyncTaskCtl *taskControl, { elog(LOG, POOL_MGR_PREFIX"++++dispatch_reset_request pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, nodeindex, node, false, bCoord); + release_connection(agent->pool, slot, nodeindex, node, false, bCoord, false); } } return ret; @@ -10154,7 +10169,7 @@ static void print_pooler_slot(PGXCNodePoolSlot *slot) } else { - elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%lu pid=%d seqnum=%d " + elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version="INT64_FORMAT" pid=%d seqnum=%d " "bdestoryed=%d file=%s lineno=%d node_name=%s backend_pid=%d", slot, slot->bwarmed, slot->usecount, slot->refcount,slot->m_version,slot->pid,slot->seqnum, @@ -11071,10 +11086,10 @@ refresh_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; elog(LOG, "refresh_database_pools: Found an altered node (%u %s) " - "size %d freesize %d increased m_version %lu" + "size %d freesize %d increased m_version "INT64_FORMAT "connstr_chk=%s, nodePool->connstr=%s", nodePool->nodeoid, nodePool->node_name, nodePool->size, nodePool->freeSize, nodePool->m_version, @@ -11381,7 +11396,7 @@ handle_close_pooled_connections(PoolAgent * agent, StringInfo s) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; } } diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index e5cee0b6..43ad4d65 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -2,7 +2,7 @@ * * poolmgr.h * - * Definitions for the Datanode connection pool. + * Definitions for the Datanode connection pool. 
* * * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group @@ -56,16 +56,16 @@ */ typedef enum { - POOL_CMD_TEMP, /* Temporary object flag */ - POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */ - POOL_CMD_GLOBAL_SET /* Global SET flag */ + POOL_CMD_TEMP, /* Temporary object flag */ + POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */ + POOL_CMD_GLOBAL_SET /* Global SET flag */ } PoolCommandType; #ifdef __TBASE__ typedef enum { - SIGNAL_SIGINT = 0, - SIGNAL_SIGUSR2 = 1 + SIGNAL_SIGINT = 0, + SIGNAL_SIGUSR2 = 1 } SignalType; /* @@ -74,17 +74,17 @@ typedef enum */ typedef enum { - POOL_ERR_NONE, - POOL_ERR_GET_CONNECTIONS_POOLER_LOCKED, - POOL_ERR_GET_CONNECTIONS_TASK_NOT_DONE, - POOL_ERR_GET_CONNECTIONS_DISPATCH_FAILED, - POOL_ERR_GET_CONNECTIONS_INVALID_ARGUMENT, - POOL_ERR_GET_CONNECTIONS_OOM, - POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD, - POOL_ERR_CANCEL_TASK_NOT_DONE, - POOL_ERR_CANCEL_DISPATCH_FAILED, - POOL_ERR_CANCEL_SEND_FAILED, - NUMBER_POOL_ERRS + POOL_ERR_NONE, + POOL_ERR_GET_CONNECTIONS_POOLER_LOCKED, + POOL_ERR_GET_CONNECTIONS_TASK_NOT_DONE, + POOL_ERR_GET_CONNECTIONS_DISPATCH_FAILED, + POOL_ERR_GET_CONNECTIONS_INVALID_ARGUMENT, + POOL_ERR_GET_CONNECTIONS_OOM, + POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD, + POOL_ERR_CANCEL_TASK_NOT_DONE, + POOL_ERR_CANCEL_DISPATCH_FAILED, + POOL_ERR_CANCEL_SEND_FAILED, + NUMBER_POOL_ERRS } PoolErrorCode; #define PoolErrIsValid(err) ((bool) (err > POOL_ERR_NONE && err < NUMBER_POOL_ERRS)) @@ -93,90 +93,91 @@ typedef enum /* Connection pool entry */ typedef struct { - /* stamp elements */ - time_t released;/* timestamp when the connection last time release */ - time_t checked; /* timestamp when the connection last time check */ - time_t created; /* timestamp when the connection created */ - bool bwarmed; - - int32 usecount; - NODE_CONNECTION *conn; - NODE_CANCEL *xc_cancelConn; - - /* trace info */ - int32 refcount; /* reference count */ - time_t m_version; /* version of node slot */ - int32 pid; /* agent pid that contains the slot */ - int32 seqnum; /* slot seqnum for the slot, unique for one slot */ - bool bdestoryed; /* used to show whether we are destoryed */ - char *file; /* file where destroy the slot */ - int32 lineno; /* lineno where destroy the slot */ - char *node_name; /* connection node name , pointer to datanode_pool node_name, no memory allocated*/ - int32 backend_pid;/* backend pid of remote connection */ + /* stamp elements */ + time_t released;/* timestamp when the connection last time release */ + time_t checked; /* timestamp when the connection last time check */ + time_t created; /* timestamp when the connection created */ + bool bwarmed; + + int32 usecount; + NODE_CONNECTION *conn; + NODE_CANCEL *xc_cancelConn; + + /* trace info */ + int32 refcount; /* reference count */ + int64 m_version; /* version of node slot */ + int32 pid; /* agent pid that contains the slot */ + int32 seqnum; /* slot seqnum for the slot, unique for one slot */ + bool bdestoryed; /* used to show whether we are destoryed */ + char *file; /* file where destroy the slot */ + int32 lineno; /* lineno where destroy the slot */ + char *node_name; /* connection node name , pointer to datanode_pool node_name, no memory allocated*/ + int32 backend_pid;/* backend pid of remote connection */ } PGXCNodePoolSlot; /* Pool of connections to specified pgxc node */ typedef struct { - Oid nodeoid; /* Node Oid related to this pool */ - bool coord; /* whether am I coordinator */ - bool asyncInProgress;/* whether am in 
asyn building */ - char *connstr; - int nwarming; /* connection number warming in progress */ - int nquery; /* connection number query memory size in progress */ - int freeSize; /* available connections */ - int size; /* total pool size */ - - char node_name[NAMEDATALEN]; /* name of the node.*/ - time_t m_version; /* version of node pool */ - PGXCNodePoolSlot **slot; + Oid nodeoid; /* Node Oid related to this pool */ + bool coord; /* whether am I coordinator */ + bool asyncInProgress;/* whether am in asyn building */ + char *connstr; + int nwarming; /* connection number warming in progress */ + int nquery; /* connection number query memory size in progress */ + int freeSize; /* available connections */ + int size; /* total pool size */ + + char node_name[NAMEDATALEN]; /* name of the node.*/ + int64 m_version; /* version of node pool */ + PGXCNodePoolSlot **slot; } PGXCNodePool; /* All pools for specified database */ typedef struct databasepool { - char *database; - char *user_name; - char *pgoptions; /* Connection options */ - HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each - * Coordinator or DataNode */ - time_t oldest_idle; - bool bneed_warm; - bool bneed_precreate; - bool bneed_pool; /* check whether need connect pool */ - MemoryContext mcxt; - struct databasepool *next; /* Reference to next to organize linked list */ + char *database; + char *user_name; + char *pgoptions; /* Connection options */ + HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each + * Coordinator or DataNode */ + time_t oldest_idle; + bool bneed_warm; + bool bneed_precreate; + bool bneed_pool; /* check whether need connect pool */ + int64 version; /* used to generate node_pool's version */ + MemoryContext mcxt; + struct databasepool *next; /* Reference to next to organize linked list */ } DatabasePool; #define PGXC_POOL_ERROR_MSG_LEN 512 typedef struct PGXCASyncTaskCtl { - slock_t m_lock; /* common lock */ - int32 m_status; /* PoolAyncCtlStaus */ - int32 m_mumber_total; - int32 m_number_done; - - /* acquire connections */ - int32 *m_result; /* fd array */ - int32 *m_pidresult; /* pid array */ - List *m_datanodelist; - List *m_coordlist; - int32 m_number_succeed; - - /* set local command */ - int32 m_res; - - /* set command */ - char *m_command; - int32 m_total; - int32 m_succeed; - - /* last command for 'g' and 's' */ - CommandId m_max_command_id; - - /* errmsg and error status. */ + slock_t m_lock; /* common lock */ + int32 m_status; /* PoolAyncCtlStaus */ + int32 m_mumber_total; + int32 m_number_done; + + /* acquire connections */ + int32 *m_result; /* fd array */ + int32 *m_pidresult; /* pid array */ + List *m_datanodelist; + List *m_coordlist; + int32 m_number_succeed; + + /* set local command */ + int32 m_res; + + /* set command */ + char *m_command; + int32 m_total; + int32 m_succeed; + + /* last command for 'g' and 's' */ + CommandId m_max_command_id; + + /* errmsg and error status. 
*/ bool m_missing_ok; - int32 m_error_offset; - char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; + int32 m_error_offset; + char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; }PGXCASyncTaskCtl; @@ -222,8 +223,8 @@ typedef struct /* Handle to the pool manager (Session's side) */ typedef struct { - /* communication channel */ - PoolPort port; + /* communication channel */ + PoolPort port; } PoolHandle; typedef struct PoolerCmdStatistics @@ -245,39 +246,39 @@ typedef struct PoolerCmdStatistics #define POOLER_ERROR_MSG_LEN 256 -extern int MinPoolSize; -extern int MaxPoolSize; -extern int InitPoolSize; -extern int MinFreeSize; +extern int MinPoolSize; +extern int MaxPoolSize; +extern int InitPoolSize; +extern int MinFreeSize; -extern int PoolerPort; -extern int PoolConnKeepAlive; -extern int PoolMaintenanceTimeout; +extern int PoolerPort; +extern int PoolConnKeepAlive; +extern int PoolMaintenanceTimeout; extern bool PersistentConnections; extern char *g_PoolerWarmBufferInfo; extern char *g_unpooled_database; extern char *g_unpooled_user; -extern int PoolSizeCheckGap; -extern int PoolConnMaxLifetime; -extern int PoolMaxMemoryLimit; -extern int PoolConnectTimeOut; +extern int PoolSizeCheckGap; +extern int PoolConnMaxLifetime; +extern int PoolMaxMemoryLimit; +extern int PoolConnectTimeOut; extern int PoolScaleFactor; extern int PoolDNSetTimeout; extern int PoolCheckSlotTimeout; extern int PoolPrintStatTimeout; -extern bool PoolConnectDebugPrint; +extern bool PoolConnectDebugPrint; extern bool PoolSubThreadLogPrint; /* Status inquiry functions */ extern void PGXCPoolerProcessIam(void); extern bool IsPGXCPoolerProcess(void); /* Initialize internal structures */ -extern int PoolManagerInit(void); +extern int PoolManagerInit(void); /* Destroy internal structures */ -extern int PoolManagerDestroy(void); +extern int PoolManagerDestroy(void); /* * Get handle to pool manager. 
This function should be called just before @@ -308,8 +309,8 @@ extern char *session_options(void); * initialize respective connection pool */ extern void PoolManagerConnect(PoolHandle *handle, - const char *database, const char *user_name, - char *pgoptions); + const char *database, const char *user_name, + char *pgoptions); /* * Reconnect to pool manager @@ -327,7 +328,7 @@ extern void PoolManagerReconnect(void); #define POOL_SET_COMMAND_NONE 0 extern int PoolManagerSetCommand(PGXCNodeHandle **connections, int32 count, PoolCommandType command_type, - const char *set_command); + const char *set_command); /* Get pooled connections */ extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int **pids); @@ -342,7 +343,7 @@ extern bool PoolManagerCheckConnectionInfo(void); extern void PoolManagerReloadConnectionInfo(void); /* Send Abort signal to transactions being run */ -extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids); +extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids); /* Return connections back to the pool, for both Coordinator and Datanode connections */ extern void PoolManagerReleaseConnections(bool force); @@ -364,7 +365,7 @@ extern void PoolAsyncPingNodes(void); extern void PoolPingNodes(void); extern void PoolPingNodeRecheck(Oid nodeoid); extern bool check_persistent_connections(bool *newval, void **extra, - GucSource source); + GucSource source); /* Refresh connection data in pooler and drop connections of altered nodes in pooler */ extern int PoolManagerRefreshConnectionInfo(void); From 7b3dcdeb119bf29eb88bf29efc7308248e1c9002 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 26 Apr 2022 20:06:41 +0800 Subject: [PATCH 564/578] Fix bug of transparent decryption on a col tapd: http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098985873 --- src/backend/utils/misc/relcrypt.c | 7 +++-- src/test/regress/expected/mls_check.out | 31 +++++++++++++++++++ .../regress/expected/updatable_views_1.out | 2 ++ src/test/regress/sql/mls_check.sql | 13 ++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index 954ff6c6..ca65b3b4 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1605,17 +1605,20 @@ Datum trsprt_crypt_decrypt_one_col_value(TranspCrypt*transp_crypt, { Datum datum_ret; text * datum_text; + text * input_text; if (TRANSP_CRYPT_INVALID_ALGORITHM_ID != transp_crypt->algo_id) { - datum_text = decrypt_procedure(transp_crypt->algo_id, DatumGetTextP(inputval), INVALID_CONTEXT_LENGTH); + input_text = DatumGetTextP(inputval); + + datum_text = decrypt_procedure(transp_crypt->algo_id, input_text, INVALID_CONTEXT_LENGTH); if (datum_text) { datum_ret = transparent_crypt_text_get_datum(datum_text, attr); } else { - datum_ret = transparent_crypt_text_get_datum(DatumGetTextP(inputval), attr); + datum_ret = transparent_crypt_text_get_datum(input_text, attr); } return datum_ret; diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index fd8a30b5..129a101f 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -2781,6 +2781,37 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypt_schema_sm4'); \c - godlike alter schema crypt_schema_sm4 rename to crypt_schema_sm5; drop schema crypt_schema_sm5; +create table tbl_col_sm4(normala int, normalb int, encrypted varchar) 
distribute by shard(normala); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_BIND_TABLE('public', 'tbl_col_sm4', 'encrypted', 4); + mls_transparent_crypt_algorithm_bind_table +-------------------------------------------- + t +(1 row) + +\c - godlike +insert into tbl_col_sm4 values(1, 11, '1111dfa11'); +insert into tbl_col_sm4 values(2, 22, repeat('a', 16)); +insert into tbl_col_sm4 values(3, 33, 'dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199'); +select * from tbl_col_sm4 order by 1; + normala | normalb | encrypted +---------+---------+------------------------------------------------------ + 1 | 11 | 1111dfa11 + 2 | 22 | aaaaaaaaaaaaaaaa + 3 | 33 | dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199 +(3 rows) + +truncate tbl_col_sm4; +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('public', 'tbl_col_sm4', 'encrypted'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +\c - godlike +drop table tbl_col_sm4; --case rename tables in crypted schema \c - godlike create schema crypt_schema_sm66; diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index ae85f4e2..df12b4be 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -2133,6 +2133,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ERROR: DML contains a function runs on CN which is not supported HINT: You might need to push that function down to DN. ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; QUERY PLAN @@ -2170,6 +2171,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 a | b | c | d ---+---+---+--- diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 0b96a0c6..c1eaaeb9 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -973,6 +973,19 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypt_schema_sm4'); \c - godlike alter schema crypt_schema_sm4 rename to crypt_schema_sm5; drop schema crypt_schema_sm5; +create table tbl_col_sm4(normala int, normalb int, encrypted varchar) distribute by shard(normala); +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_BIND_TABLE('public', 'tbl_col_sm4', 'encrypted', 4); +\c - godlike +insert into tbl_col_sm4 values(1, 11, '1111dfa11'); +insert into tbl_col_sm4 values(2, 22, repeat('a', 16)); +insert into tbl_col_sm4 values(3, 33, 'dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199'); +select * from tbl_col_sm4 order by 1; +truncate tbl_col_sm4; +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('public', 'tbl_col_sm4', 'encrypted'); +\c - godlike +drop table tbl_col_sm4; --case rename tables in crypted schema \c - godlike From 15d669556eb36e4fa4d15dfaa1e62c39c888f207 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 9 May 2022 15:37:36 +0800 Subject: [PATCH 565/578] fix compile and warning --- src/backend/parser/parse_utilcmd.c | 
7 ++++--- src/backend/tcop/postgres.c | 13 ------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 2be306b7..87901b13 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3482,6 +3482,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, int year; int mon; int day; + Form_pg_partition_interval routerinfo = NULL; existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3499,7 +3500,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, /* * Self-developed partition table compatibility processing */ - Form_pg_partition_interval routerinfo = NULL; routerinfo = rel->rd_partitions_info; /* timestamp convert to posix struct */ @@ -3523,6 +3523,9 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) { + TableLikeClause *likeclause = makeNode(TableLikeClause); + CreateStmt * createpart = makeNode(CreateStmt); + /* * for compatible with the calculation of the normal time of the self-developed partition table */ @@ -3538,8 +3541,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, } } - TableLikeClause *likeclause = makeNode(TableLikeClause); - CreateStmt * createpart = makeNode(CreateStmt); createpart->relation = copyObject((void *) stmt->relation); createpart->relation->schemaname = get_namespace_name(RelationGetNamespace(rel)); //createpart->relation->relname = GetPartitionName(RelationGetRelid(rel), partidx, false); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index abab8378..77992099 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1142,19 +1142,6 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } -static bool -ch_is_space(char ch) -{ - if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f') - { - return true; - } - else - { - return false; - } -} - /* * get myself query string from original query string, * if the query string contain multi stmt From 63403784f1d23baba0b5845f052a920b4aadc449 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 8 Apr 2022 16:19:30 +0800 Subject: [PATCH 566/578] Ignore member of the extensions in function pg_get_publication_tables (merge request !1235) TAPD: --bug=098374847 http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098374847 --- src/backend/catalog/pg_publication.c | 153 ++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 438a2a74..4d21340b 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -93,6 +93,8 @@ #include "catalog/pg_type.h" #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_extension.h" #include "utils/array.h" #include "utils/builtins.h" @@ -110,6 +112,145 @@ #include "replication/logicalrelation.h" #endif +typedef struct +{ + Oid tableoid; + Oid oid; +} CatalogId; + +/* This is an array of object identities. */ +static CatalogId *extmembers; +static int numextmembers; + +#define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) ) + +/* + * qsort comparator for CatalogId. 
+ */
+static int
+CatalogIdCompare(const void *p1, const void *p2)
+{
+	const CatalogId *obj1 = (const CatalogId *) p1;
+	const CatalogId *obj2 = (const CatalogId *) p2;
+	int			cmpval;
+
+	/*
+	 * Compare OID first since it's usually unique, whereas there will only be
+	 * a few distinct values of tableoid.
+	 */
+	cmpval = oidcmp(obj1->oid, obj2->oid);
+	if (cmpval == 0)
+		cmpval = oidcmp(obj1->tableoid, obj2->tableoid);
+	return cmpval;
+}
+
+/*
+ * setExtensionMembership
+ *	  accept and save data about which objects belong to extensions
+ */
+static void
+setExtensionMembership(CatalogId *extmems, int nextmems)
+{
+	/* Sort array in preparation for binary searches */
+	if (nextmems > 1)
+		qsort((void *) extmems, nextmems, sizeof(CatalogId),
+			  CatalogIdCompare);
+	/* And save */
+	extmembers = extmems;
+	numextmembers = nextmems;
+}
+
+/*
+ * getExtensionMembership --- obtain extension membership data
+ *
+ * We need to identify objects that are extension members as soon as they're
+ * loaded, so that we can correctly determine whether they should be treated
+ * as publishable. Generally speaking, extension member objects are marked as
+ * *not* publishable.
+ */
+static void
+getExtensionMembership(void)
+{
+	CatalogId  *extmembers;
+	Relation	depRel;
+	SysScanDesc depScan;
+	HeapTuple	depTup;
+	int			maxObjs = 32;
+	int			nextmembers = 0;
+
+	extmembers = (CatalogId *) palloc0(maxObjs * sizeof(CatalogId));
+
+	depRel = heap_open(DependRelationId, AccessShareLock);
+	depScan = systable_beginscan(depRel, DependReferenceIndexId, true, NULL, 0, NULL);
+	while (HeapTupleIsValid(depTup = systable_getnext(depScan)))
+	{
+		/*
+		 * We scan pg_depend to find those relations (RelationRelationId)
+		 * that depend on the given extension type.
+		 * (We assume we can ignore refobjsubid for a type.)
+		 */
+		Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup);
+		if (pg_depend->refclassid != ExtensionRelationId
+			|| pg_depend->deptype != DEPENDENCY_EXTENSION
+			|| pg_depend->classid != RelationRelationId)
+			continue;
+
+		if (nextmembers >= maxObjs)
+		{
+			maxObjs *= 2;
+			extmembers = (CatalogId *) repalloc(extmembers, maxObjs * sizeof(CatalogId));
+		}
+		extmembers[nextmembers].tableoid = pg_depend->classid;
+		extmembers[nextmembers].oid = pg_depend->objid;
+		nextmembers++;
+	}
+
+	systable_endscan(depScan);
+	relation_close(depRel, AccessShareLock);
+
+	/* Remember the data for use later */
+	setExtensionMembership(extmembers, nextmembers);
+}
+
+/*
+ * IsCatalogIdExtensionMember
+ *	  return whether the specified catalog ID depends on some extension.
+ */
+static bool
+IsCatalogIdExtensionMember(CatalogId catalogId)
+{
+	CatalogId  *low;
+	CatalogId  *high;
+
+	/*
+	 * We could use bsearch() here, but the notational cruft of calling
+	 * bsearch is nearly as bad as doing it ourselves; and the generalized
+	 * bsearch function is noticeably slower as well.
+	 */
+	if (numextmembers <= 0)
+		return false;
+
+	low = extmembers;
+	high = extmembers + (numextmembers - 1);
+	while (low <= high)
+	{
+		CatalogId  *middle;
+		int			difference;
+
+		middle = low + (high - low) / 2;
+		/* comparison must match CatalogIdCompare, above */
+		difference = oidcmp(middle->oid, catalogId.oid);
+		if (difference == 0)
+			difference = oidcmp(middle->tableoid, catalogId.tableoid);
+		if (difference == 0)
+			return true;
+		else if (difference < 0)
+			low = middle + 1;
+		else
+			high = middle - 1;
+	}
+	return false;
+}
+
 /*
  * Check if relation can be in given publication and throws appropriate
  * error if not.
@@ -416,6 +557,8 @@ GetAllTablesPublicationRelations(void) HeapTuple tuple; List *result = NIL; + getExtensionMembership(); + classRel = heap_open(RelationRelationId, AccessShareLock); ScanKeyInit(&key[0], @@ -427,16 +570,24 @@ GetAllTablesPublicationRelations(void) while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { + CatalogId pub_rel; Oid relid = HeapTupleGetOid(tuple); Form_pg_class relForm = (Form_pg_class) GETSTRUCT(tuple); - if (is_publishable_class(relid, relForm)) + pub_rel.tableoid = RelationRelationId; + pub_rel.oid = relid; + + if (is_publishable_class(relid, relForm) + && !IsCatalogIdExtensionMember(pub_rel)) result = lappend_oid(result, relid); } heap_endscan(scan); heap_close(classRel, AccessShareLock); + if (extmembers) + pfree(extmembers); + return result; } From 4b94902b46804245eeb5fdf85c7d03d0ff9719a7 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 29 Mar 2022 14:08:57 +0800 Subject: [PATCH 567/578] Code sync from pg11: 372102b81dd0096764b712deffab00732f3c9d80 background: The MLS related code uses the "expand_tuple" function of PG11, and guesses it is to deal with encrypted columns. The EPQ context distribution also uses this logic to call "expand_tuple", but this function has been fixed in the PG follow-up: The t_self of the new tuple is also set. Without this fix, the t_self of the expanded tuple will be an illegal value, affecting subsequent use --- src/backend/access/common/heaptuple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index d858d75b..a8e14398 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -2236,7 +2236,7 @@ expand_tuple(HeapTuple *targetHeapTuple, = (HeapTupleHeader) ((char *) *targetHeapTuple + HEAPTUPLESIZE); (*targetHeapTuple)->t_len = len; (*targetHeapTuple)->t_tableOid = sourceTuple->t_tableOid; - ItemPointerSetInvalid(&((*targetHeapTuple)->t_self)); + (*targetHeapTuple)->t_self = sourceTuple->t_self; targetTHeader->t_infomask = sourceTHeader->t_infomask; targetTHeader->t_hoff = hoff; From 93c2a2a782381a7f802aa075b57421ef71aabab2 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:05:39 +0800 Subject: [PATCH 568/578] Should check oldrel's distribution type when ALTER since newrel would be a local temp table with no distribution --- src/backend/commands/tablecmds.c | 2 +- src/test/regress/expected/alter_table.out | 25 +++++++++++++++++++++++ src/test/regress/sql/alter_table.sql | 11 ++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 4a44ba49..6aaa9c30 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6409,7 +6409,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) * since the per-tuple memory context will be reset shortly. 
*/ #ifdef _SHARDING_ - if(newrel && RelationIsSharded(newrel)) + if (RelationIsSharded(oldrel)) tuple = heap_form_tuple_plain(newTupDesc, values,isnull, diskey, secdiskey, RelationGetRelid(newrel)); else #endif diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index a1ef7dc2..4daa2206 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3831,3 +3831,28 @@ update dropped_col_remote_dml set b = 2; NOTICE: this is a test drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_default_shardid values(1,1), (2,2), (3,3); +select shardid, a, b from t_default_shardid order by 1; + shardid | a | b +---------+---+--- + 105 | 3 | 3 + 2234 | 1 | 1 + 3318 | 2 | 2 +(3 rows) + +create sequence s_default_shardid; +alter table t_default_shardid add column c int default nextval('s_default_shardid'); +-- shardid should not change +select shardid, a, b from t_default_shardid order by 1; + shardid | a | b +---------+---+--- + 105 | 3 | 3 + 2234 | 1 | 1 + 3318 | 2 | 2 +(3 rows) + +drop table t_default_shardid; +drop sequence s_default_shardid; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 42a9bbfe..48dee67c 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2563,3 +2563,14 @@ alter table dropped_col_remote_dml drop column c; update dropped_col_remote_dml set b = 2; drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; + +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +insert into t_default_shardid values(1,1), (2,2), (3,3); +select shardid, a, b from t_default_shardid order by 1; +create sequence s_default_shardid; +alter table t_default_shardid add column c int default nextval('s_default_shardid'); +-- shardid should not change +select shardid, a, b from t_default_shardid order by 1; +drop table t_default_shardid; +drop sequence s_default_shardid; From b86a845b85a476736ed9eb747dd9851ae1325445 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:06:43 +0800 Subject: [PATCH 569/578] Consider restricted node number in cost module --- src/test/regress/expected/alter_table_3.out | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index d6a33ebf..0153fa37 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3703,3 +3703,28 @@ update dropped_col_remote_dml set b = 2; NOTICE: this is a test drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into t_default_shardid values(1,1), (2,2), (3,3);
+select shardid, a, b from t_default_shardid order by 1;
+ shardid | a | b 
+---------+---+---
+     105 | 3 | 3
+    2234 | 1 | 1
+    3318 | 2 | 2
+(3 rows)
+
+create sequence s_default_shardid;
+alter table t_default_shardid add column c int default nextval('s_default_shardid');
+-- shardid should not change
+select shardid, a, b from t_default_shardid order by 1;
+ shardid | a | b 
+---------+---+---
+     105 | 3 | 3
+    2234 | 1 | 1
+    3318 | 2 | 2
+(3 rows)
+
+drop table t_default_shardid;
+drop sequence s_default_shardid;

From 6d6ba4885d2dacefa1d18d07e1ddc132caa41751 Mon Sep 17 00:00:00 2001
From: aslanxli
Date: Wed, 11 May 2022 14:36:59 +0800
Subject: [PATCH 570/578] FIX
 http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131099359801&jump_count=1

To sync and update local statistics, the coordinator queries the
pg_statistic tables on datanodes and other coordinators, but these are
not selectable by PUBLIC. Previously we only disabled the SELECT
permission check when a query referring to pg_statistic was parsed on a
datanode; now we also disable the SELECT permission check when the
connection comes from another coordinator.
---
 src/backend/parser/parse_relation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index 10b20a9e..373c6c85 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -949,7 +949,7 @@ markRTEForSelectPriv(ParseState *pstate, RangeTblEntry *rte,
 	 * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only
 	 * available for superuser.
 	 */
-	if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId)
+	if ((IS_PGXC_DATANODE || IsConnFromCoord()) && rte->relid == StatisticRelationId)
 		rte->requiredPerms = 0;
 	else
 #endif

From e4137ffab2954f65bddeac70bdedfeac3d3db9d0 Mon Sep 17 00:00:00 2001
From: sigmalin
Date: Wed, 11 May 2022 19:41:36 +0800
Subject: [PATCH 571/578] fix gtm_ctl exit bug
 http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098965677 (merge
 request !1306)

Squash merge branch 'sigmalin005' into 'Tbase_v2.15.19.8'

fix gtm_ctl exit bug  http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098965677

TAPD: --bug=098965677
---
 src/gtm/gtm_ctl/gtm_ctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
index fa26bed5..8aea4576 100644
--- a/src/gtm/gtm_ctl/gtm_ctl.c
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -488,6 +488,9 @@ test_gtm_connection()
 			GTMPQfinish(conn);
 			print_msg(".");
 			sleep(1); /* 1 sec */
+			/* if the GTM process is not alive, exit directly */
+			if (!gtm_is_alive(get_pgpid()))
+				break;
 		}
 	}

From 7c5577548ae53022a4a852f39c5e65ca2006589a Mon Sep 17 00:00:00 2001
From: sigmalin
Date: Wed, 11 May 2022 16:33:34 +0800
Subject: [PATCH 572/578] fix bug when CreateSenderThread fails
 https://zhiyan.woa.com/requirement/1162/bug/4982#/bug?story_tab=info&wsn=164&wtype=bug

git cherry-pick 8a3aff89
---
 src/backend/pgxc/squeue/squeue.c | 10 +++++-----
 src/include/pgxc/squeue.h        |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
index 61289161..4d681ac9 100644
--- a/src/backend/pgxc/squeue/squeue.c
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -4101,9 +4101,6 @@ DataPumpSender BuildDataPumpSenderControl(SharedQueue sq)
 			end = sender_control->node_num;
 		}
 		InitDataPumpThreadControl(&sender_control->thread_control[i], sender_control->nodes, base, end, sender_control->node_num);
-
-		/* Set
running status for the thread. not running now */ - sender_control->thread_control[i].thread_running = false; } /* set sqname and max connection */ @@ -4832,6 +4829,7 @@ void DataPumpCleanThread(DataPumpSenderControl *sender) { int32 threadid = 0; DataPumpThreadControl *thread = NULL; + bool *send_quit = (bool *)palloc0(sizeof(bool) * sender->thread_num); for (threadid = 0; threadid < sender->thread_num; threadid ++) { @@ -4841,18 +4839,20 @@ void DataPumpCleanThread(DataPumpSenderControl *sender) { thread->thread_need_quit = true; ThreadSemaUp(&thread->send_sem); + send_quit[threadid] = true; } } for (threadid = 0; threadid < sender->thread_num; threadid ++) { + if (send_quit[threadid]) + { thread = &sender->thread_control[threadid]; /* Wait for sender to quit. */ - if (thread->thread_need_quit) - { ThreadSemaDown(&thread->quit_sem); } } + pfree(send_quit); ConvertDone(&sender->convert_control); } diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h index 3f0a6408..fc020c9f 100644 --- a/src/include/pgxc/squeue.h +++ b/src/include/pgxc/squeue.h @@ -114,6 +114,7 @@ typedef enum typedef enum { + ConvertInit, ConvertRunning, ConvertListenError, ConvertAcceptError, From be29047e38cfd8cabe921022d9a58865753f2ef4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 21 Apr 2021 19:32:33 +0800 Subject: [PATCH 573/578] Support join tables from different group on DN * sending shard route map to lower nodes * Add a guc to constrain group where to execute join op tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086892879 --- src/backend/commands/portalcmds.c | 3 + src/backend/optimizer/util/pathnode.c | 77 ++++++++++++------------ src/backend/parser/analyze.c | 31 ++-------- src/backend/pgxc/nodemgr/groupmgr.c | 3 +- src/backend/pgxc/pool/execRemote.c | 34 +++++++++-- src/backend/pgxc/pool/pgxcnode.c | 32 ++++++++-- src/backend/pgxc/shard/shardmap.c | 87 +++++++++++++++++++++++++++ src/backend/postmaster/postmaster.c | 2 + src/backend/tcop/postgres.c | 8 +++ src/backend/utils/misc/guc.c | 46 ++++++++++++++ src/include/optimizer/pathnode.h | 3 + src/include/pgxc/pgxc.h | 3 + src/include/pgxc/pgxcnode.h | 2 +- src/include/pgxc/shardmap.h | 5 ++ 14 files changed, 260 insertions(+), 76 deletions(-) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 4bea0943..cdd42fa5 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -362,6 +362,9 @@ PortalCleanup(Portal portal) /* If cleanup fails below prevent double cleanup */ portal->queryDesc = NULL; + /* invalidate remote shard map info no matter producer or consumer */ + InvalidRemoteShardmap(); + /* * If portal is producing it has an executor which should be * shut down diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ed05ea1e..5143ac3f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -50,6 +50,7 @@ #include "optimizer/pgxcship.h" #include "pgxc/groupmgr.h" #include "pgxc/pgxcnode.h" +#include "utils/memutils.h" #endif #ifdef _MIGRATE_ @@ -68,6 +69,11 @@ bool restrict_query = false; /* Support fast query shipping for subquery */ bool enable_subquery_shipping = false; +/* join will happen in these nodes forcibly */ +char *g_constrain_group; /* the GUC variable */ +static Bitmapset *constrainNodes = NULL; +#define BMS_EQUAL_CONSTRAINT(bms) (bms_is_empty(constrainNodes) || bms_equal(constrainNodes, (bms))) + #define REPLICATION_FACTOR 0.8 #endif @@ -1678,28 
+1684,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) goto pull_up; } - /* - * If outer or inner subpaths are distributed by shard and they do not exist - * in same node set, which means we may need to redistribute tuples to data - * nodes which use different router map to producer. - * We don't support that, so pull it up to CN to accomplish the join. - * - * TODO: - * 1. if the join is "REPLICATION join SHARD", and node set of SHARD table - * is subset of REPLICATION table, no need to pull up. - * 2. find out which side of this join needs to dispatch, and only decide - * whether to pull up by the distributionType of another side subpath. - * 3. pass target router map to another group maybe ? thus nothing need to - * pull up to CN. - */ - if (innerd && outerd && - (outerd->distributionType == LOCATOR_TYPE_SHARD || - (innerd->distributionType == LOCATOR_TYPE_SHARD)) && - !bms_equal(outerd->nodes, innerd->nodes)) - { - goto pull_up; - } - /* * the join of cold-hot tables must be pulled up to CN until we find a way * to determine whether this join occurs in a specific group. @@ -1818,7 +1802,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) innerd->distributionType == outerd->distributionType && innerd->distributionExpr && outerd->distributionExpr && - bms_equal(innerd->nodes, outerd->nodes)) + bms_equal(innerd->nodes, outerd->nodes) && + BMS_EQUAL_CONSTRAINT(innerd->nodes)) { ListCell *lc; @@ -2245,7 +2230,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) */ cost_qual_eval_node(&cost, (Node *) ri, root); - if (outerd->distributionExpr) + if (outerd->distributionExpr && BMS_EQUAL_CONSTRAINT(outerd->nodes)) { #ifdef __TBASE__ /* @@ -2294,7 +2279,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) continue; } } - if (innerd->distributionExpr) + if (innerd->distributionExpr && BMS_EQUAL_CONSTRAINT(innerd->nodes)) { #ifdef __TBASE__ /* For UPDATE/DELETE, make sure inner rel does not need to distribute */ @@ -2453,26 +2438,14 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) /* If we redistribute both parts do join on all nodes ... */ if (new_inner_key && new_outer_key) { + if (bms_is_empty(constrainNodes)) + { int i; for (i = 0; i < NumDataNodes; i++) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ - /* - * We end up here that we don't have replication table and whether - * 1. we have no shard table at both sides OR - * 2. we have shard table but spread in same node set - * so check distribution type and decide what's next. - */ - if (innerd->distributionType == LOCATOR_TYPE_SHARD || - outerd->distributionType == LOCATOR_TYPE_SHARD) - { - /* must be same node set, just copy */ - Assert(bms_equal(innerd->nodes, innerd->nodes)); - nodes = bms_copy(outerd->nodes); - } - /* check if we can distribute by shard */ - else if (OidIsValid(group)) + if (OidIsValid(group)) { int node_index; int32 dn_num; @@ -2527,6 +2500,13 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } #endif } + else + { + nodes = bms_copy(constrainNodes); + replicate_inner = false; + replicate_outer = false; + } + } /* * ... if we do only one of them redistribute it on the same nodes * as other. 
@@ -7427,4 +7407,21 @@ path_count_datanodes(Path *path) return 1; } + +void +assign_constrain_nodes(List *node_list) +{ + MemoryContext oldctx = MemoryContextSwitchTo(TopMemoryContext); + ListCell *lc; + + bms_free(constrainNodes); + constrainNodes = NULL; + + foreach(lc, node_list) + { + constrainNodes = bms_add_member(constrainNodes, lfirst_int(lc)); + } + + MemoryContextSwitchTo(oldctx); +} #endif diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a720c1fc..9c460dd4 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -753,36 +753,17 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ParseState *sub_pstate = make_parsestate(pstate); Query *selectQuery; -#ifdef __TBASE__ +#ifdef __COLD_HOT__ /* prevent insert into cold_hot table select ... */ if (pstate->p_target_relation) { - RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; - RelationLocInfo *from_rel_loc_info; - - if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) + RelationLocInfo *rel_loc_info = pstate->p_target_relation->rd_locator_info; + if (rel_loc_info) { - foreach(lc, selectStmt->fromClause) + if (AttributeNumberIsValid(rel_loc_info->secAttrNum) + || OidIsValid(rel_loc_info->coldGroupId)) { - Node *node = lfirst(lc); - if (IsA(node, RangeVar)) - { - Oid relid = RangeVarGetRelid((RangeVar *) node, NoLock, true); - - if (InvalidOid != relid) - { - Relation rel = heap_open(relid, AccessShareLock); - - from_rel_loc_info = rel->rd_locator_info; - if (!is_table_allowed_insert(from_rel_loc_info, target_rel_loc_info)) - { - elog(ERROR, - "shard table could not be inserted from any other tables in different group"); - } - - heap_close(rel, AccessShareLock); - } - } + elog(ERROR, "table in cold-hot group or key-value group could not join with other tables."); } } } diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c index be034bed..e5de7d81 100644 --- a/src/backend/pgxc/nodemgr/groupmgr.c +++ b/src/backend/pgxc/nodemgr/groupmgr.c @@ -678,7 +678,8 @@ GetMyGroupName(void) return NULL; } -char* GetGroupNameByNode(Oid nodeoid) +char * +GetGroupNameByNode(Oid nodeoid) { Relation relation; SysScanDesc scan; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c37ac46e..4986c4a7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -60,12 +60,13 @@ #include "catalog/pgxc_class.h" #ifdef __TBASE__ #include "commands/explain_dist.h" -#include "pgxc/squeue.h" #include "executor/execParallel.h" -#include "postmaster/postmaster.h" #include "executor/nodeModifyTable.h" -#include "utils/syscache.h" #include "nodes/print.h" +#include "optimizer/pathnode.h" +#include "pgxc/squeue.h" +#include "postmaster/postmaster.h" +#include "utils/syscache.h" #endif /* * We do not want it too long, when query is terminating abnormally we just @@ -10941,6 +10942,9 @@ ExecRemoteSubplan(PlanState *pstate) OidIsValid(primary_data_node) && combiner->conn_count > 1 && !g_UseDataPump); char cursor[NAMEDATALEN]; +#ifdef __TBASE__ + StringInfo shardmap = NULL; +#endif if (plan->cursor) { @@ -11000,6 +11004,26 @@ ExecRemoteSubplan(PlanState *pstate) if (estate->es_epqTuple != NULL) epqctxlen = encode_epqcontext(&combiner->ss.ps, &epqctxdata); +#ifdef __TBASE__ + /* + * consider whether to distribute shard map info + * we do that when: + * 1. this is a DN node + * 2. plan distribution is by shard + * 3. 
target of distribution is not in our group + */ + if (IS_PGXC_DATANODE && node->execNodes != NIL && + plan->distributionType == LOCATOR_TYPE_SHARD) + { + ListCell *cell; + + foreach(cell, node->execNodes) + { + if (!list_member_int(PGXCGroupNodeList, lfirst_int(cell))) + shardmap = SerializeShardmap(); + } + } +#endif /* * The subplan being rescanned, need to restore connections and * re-bind the portal @@ -11039,7 +11063,7 @@ ExecRemoteSubplan(PlanState *pstate) /* rebind */ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, - paramlen, paramdata, epqctxlen, epqctxdata); + paramlen, paramdata, epqctxlen, epqctxdata, shardmap); if (enable_statistic) { elog(LOG, "Bind Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", @@ -11128,7 +11152,7 @@ ExecRemoteSubplan(PlanState *pstate) /* bind */ pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata, - epqctxlen, epqctxdata); + epqctxlen, epqctxdata, shardmap); if (enable_statistic) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c19325a9..d755e5be 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -70,6 +70,7 @@ #include "catalog/pg_authid.h" #endif #ifdef __TBASE__ +#include "pgxc/groupmgr.h" #include "postmaster/postmaster.h" #endif @@ -398,6 +399,16 @@ InitMultinodeExecutor(bool is_force) slavedatanode_count = 0; PGXCNodeId = 0; + if (IS_PGXC_DATANODE) + { + if (PGXCGroupNodeList != NIL) + { + list_free(PGXCGroupNodeList); + PGXCGroupNodeList = NIL; + } + PGXCGroupNodeList = GetGroupNodeList(GetMyGroupOid()); + } + MemoryContextSwitchTo(oldcontext); PGXCSessionId[0] = '\0'; @@ -2231,7 +2242,7 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, const char *statement, int paramlen, const char *params, - int epqctxlen, const char *epqctx) + int epqctxlen, const char *epqctx, StringInfo shardmap) { int pnameLen; int stmtLen; @@ -2240,6 +2251,7 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, int paramOutLen; int epqCtxLen; int msgLen; + int shardMapLen; /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) @@ -2257,8 +2269,11 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, paramOutLen = 2; /* size of epq context, 2 if not epq */ epqCtxLen = epqctxlen ? epqctxlen : 2; - /* size + pnameLen + stmtLen + parameters */ - msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen + epqCtxLen; + /* size of shard map information */ + shardMapLen = shardmap ? 
shardmap->len + 1 : 1; + /* size + pnameLen + stmtLen + parameters + epqctx + shardmap */ + msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + + paramOutLen + epqCtxLen + shardMapLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2317,6 +2332,15 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, handle->outBuffer[handle->outEnd++] = 0; } + /* shard map info */ + if (shardmap && shardMapLen > 1) + { + memcpy(handle->outBuffer + handle->outEnd, shardmap->data, shardMapLen); + handle->outEnd += shardMapLen; + } + else + handle->outBuffer[handle->outEnd++] = '\0'; + handle->in_extended_query = true; return 0; } @@ -2609,7 +2633,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (query) if (pgxc_node_send_parse(handle, statement, query, num_params, param_types)) return EOF; - if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL)) + if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL, NULL)) return EOF; if (send_describe) if (pgxc_node_send_describe(handle, false, portal)) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 6583be1c..29f9a946 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -173,6 +173,10 @@ static HTAB *g_GroupHashTab = NULL; /*For DN*/ static ShardNodeGroupInfo_DN *g_GroupShardingMgr_DN = NULL; +/* For local DN received from parent node */ +static bool g_ShardMapValid = false; +static ShardMapItemDef g_ShardMap[SHARD_MAP_GROUP_NUM]; + /* used for datanodes */ Bitmapset *g_DatanodeShardgroupBitmap = NULL; @@ -1315,6 +1319,9 @@ int32 GetNodeIndexByHashValue(Oid group, long hashvalue) } shardIdx = abs(hashvalue) % (g_GroupShardingMgr_DN->members->shmemNumShards); + if (g_ShardMapValid) + shardgroup = &g_ShardMap[shardIdx]; + else shardgroup = &g_GroupShardingMgr_DN->members->shmemshardmap[shardIdx]; nodeIdx = shardgroup->nodeindex; } @@ -5718,4 +5725,84 @@ List* GetShardMapRangeList(Oid group, Oid coldgroup, Oid relation, Oid type, Dat return list; } +/* serialize shard map info for dispatching to lower DNs */ +StringInfo +SerializeShardmap(void) +{ + GroupShardInfo *info; + StringInfo data; + int i; + + if (!IS_PGXC_DATANODE) + elog(ERROR, "shouldn't try to serialize group shard info on CN"); + + info = g_GroupShardingMgr_DN->members; + data = makeStringInfo(); + + appendStringInfo(data, "%d", info->shmemNumShards); + for (i = 0; i < info->shmemNumShards; i++) + { + appendStringInfo(data, ",%d", + info->shmemshardmap[i].nodeindex); + } + + return data; +} + +/* + * Deserialize shard map info into g_ShardMap, these information + * comes from parent DN and will replace local info for distribution + * across multi groups. + */ +void +DeserializeShardmap(const char *data) +{ + char *tmp_head = (char *) data; + char *tmp_pos; + int num_shards, i; + + num_shards = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (num_shards != SHARD_MAP_SHARD_NUM) + { + /* + * for now num_shards should always be SHARD_MAP_GROUP_NUM + * since SHARD_MAP_SHARD_NUM == EXTENSION_SHARD_MAP_SHARD_NUM + * but maybe it will change someday, error out to avoid more + * critical error. 
+ */ + elog(ERROR, "deserializing invalid num of shard map, %d", num_shards); + } + + for (i = 0; i < num_shards; i++) + { + g_ShardMap[i].shardgroupid = i; + g_ShardMap[i].nodeindex = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + } + + /* enable remote shard map info */ + g_ShardMapValid = true; +} + +/* g_ShardMap is a static array, simply disable it by another static bool */ +void +InvalidRemoteShardmap(void) +{ + g_ShardMapValid = false; +} + +/* + * return group oid of this node in + * return invalid if it's not in a group or it's a CN. + */ +Oid +GetMyGroupOid(void) +{ + if (IS_PGXC_DATANODE) + return g_GroupShardingMgr_DN->members->group; + else + return InvalidOid; +} #endif diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 10be77cd..74c941db 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -470,6 +470,8 @@ bool IsPGXCMainCluster = false; int PGXCNodeId = 0; #ifdef __TBASE__ char PGXCSessionId[NAMEDATALEN]; +int PGXCLevelId = -1; +List *PGXCGroupNodeList = NIL; #endif /* * When a particular node starts up, store the node identifier in this variable diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 77992099..e6afe664 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2225,6 +2225,7 @@ exec_bind_message(StringInfo input_message) int column_index; int index; char ***data_list = NULL; + const char *shard_map; MemoryContext old_top; #endif @@ -2756,6 +2757,8 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } + InvalidRemoteShardmap(); + /* Get epq context, only datanodes need them */ if (IsConnFromCoord() || IsConnFromDatanode()) { @@ -2779,6 +2782,11 @@ exec_bind_message(StringInfo input_message) portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); } } + + /* Get shard map info */ + shard_map = pq_getmsgstring(input_message); + if (shard_map[0] != '\0') + DeserializeShardmap(shard_map); } pq_getmsgend(input_message); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..16fd75c7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -125,6 +125,8 @@ #include "tcop/pquery.h" #include "optimizer/plancat.h" #include "parser/analyze.h" +#include "pgxc/groupmgr.h" +#include "utils/lsyscache.h" #endif #ifdef __AUDIT__ @@ -294,6 +296,9 @@ static void strreplace_all(char *str, char *needle, char *replacement); #ifdef __TBASE__ static bool set_warm_shared_buffer(bool *newval, void **extra, GucSource source); static const char *show_total_memorysize(void); + +static bool check_constrain_group(char **newval, void **extra, GucSource source); +static void assign_constrain_group(const char *newval, void *extra); #endif #ifdef __COLD_HOT__ static void assign_cold_hot_partition_type(const char *newval, void *extra); @@ -5871,6 +5876,16 @@ static struct config_string ConfigureNamesString[] = "mls_admin", NULL, NULL, NULL }, + { + {"join_constrain_group", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("name of the group that join execute in, " + "any data that not in this group will be redistributed"), + NULL + }, + &g_constrain_group, + "", + check_constrain_group, assign_constrain_group, NULL + }, #endif #ifdef _PG_ORCL_ { @@ -13737,6 +13752,37 @@ show_total_memorysize(void) snprintf(buf, sizeof(buf), "%dM", size); return buf; } + +static bool +check_constrain_group(char **newval, void **extra, GucSource source) +{ + char *group_name = 
NULL; + if (!IsUnderPostmaster) + return true; + + if ((*newval)[0] == '\0') + return true; + + group_name = pstrdup(*newval); + return get_pgxc_groupoid(group_name) != InvalidOid; +} + +static void +assign_constrain_group(const char *newval, void *extra) +{ + char *group_name = NULL; + if (!IsUnderPostmaster) + return; + + if (newval[0] == '\0') + { + assign_constrain_nodes(NIL); + return; + } + + group_name = pstrdup(newval); + assign_constrain_nodes(GetGroupNodeList(get_pgxc_groupoid(group_name))); +} #endif #ifdef __COLD_HOT__ static void diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e1fe0a4f..59dddbe6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -372,10 +372,13 @@ extern Path *create_redistribute_distinct_agg_path(PlannerInfo *root, Aggref *agg); extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); +extern void assign_constrain_nodes(List *node_list); + extern int replication_level; extern bool restrict_query; extern bool enable_subquery_shipping; +extern char *g_constrain_group; #endif #endif /* PATHNODE_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 687be6c8..264bfbed 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -76,6 +76,7 @@ #define PGXC_H #include "postgres.h" +#include "nodes/pg_list.h" extern bool isPGXCCoordinator; extern bool isPGXCDataNode; @@ -107,6 +108,8 @@ extern char *PGXCMainClusterName; extern char *PGXCDefaultClusterName; #ifdef __TBASE__ extern char PGXCSessionId[NAMEDATALEN]; +extern int PGXCLevelId; +extern List *PGXCGroupNodeList; #endif diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f0e7c269..13757e15 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -222,7 +222,7 @@ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int #endif extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, const char *statement, int paramlen, const char *params, - int eqpctxlen, const char *epqctx); + int eqpctxlen, const char *epqctx, StringInfo shardmap); extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, const char *query, short num_params, Oid *param_types); extern int pgxc_node_send_flush(PGXCNodeHandle * handle); diff --git a/src/include/pgxc/shardmap.h b/src/include/pgxc/shardmap.h index c62e3144..0674185d 100644 --- a/src/include/pgxc/shardmap.h +++ b/src/include/pgxc/shardmap.h @@ -234,6 +234,11 @@ extern bool ScanNeedExecute(Relation rel); extern List* GetShardMapRangeList(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dvalue, AttrNumber secAttr, Oid secType, Datum minValue, Datum maxValue, bool equalMin, bool equalMax, RelationAccessType accessType); + +extern StringInfo SerializeShardmap(void); +extern void DeserializeShardmap(const char *data); +extern void InvalidRemoteShardmap(void); +extern Oid GetMyGroupOid(void); #endif #endif /*_SHARDMAP_H_*/ From 6024988daeb53b9a38c29add75b9947f13ee6a12 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Mar 2023 19:35:39 +0800 Subject: [PATCH 574/578] fix compile error --- src/backend/commands/analyze.c | 5 ++--- src/backend/libpq/pqcomm.c | 1 - src/backend/parser/gram.y | 2 +- src/backend/tcop/utility.c | 2 +- src/include/nodes/nodes.h | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 31b7cfbf..49c60295 100644 --- 
a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -655,7 +655,7 @@ do_analyze_rel(Relation onerel,
 	/*
 	 * Fetch relation statistics from remote nodes and update
 	 */
-	vacuum_rel_coordinator(onerel, in_outer_xact, params);
+	vacuum_rel_coordinator(onerel, in_outer_xact, params, NULL);
 
 	/*
 	 * Fetch attribute statistics from remote nodes.
@@ -5825,8 +5825,7 @@ coord_sync_col_stats(Relation onerel,
 		update_attstats(RelationGetRelid(onerel),
 						inh,
 						attr_cnt,
-						vacattrstats,
-						RelationGetRelPersistence(onerel));
+						vacattrstats);
 }
 
 /*
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 132b85b2..db3b1ea1 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -2072,7 +2072,6 @@ SetSockKeepAlive(int sock)
 			elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m");
 		}
 	}
-}
 
 int
 pq_gettcpusertimeout(Port *port)
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 756b4bad..401ffcd6 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -263,7 +263,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	RoleSpec   *rolespec;
 	PartitionForExpr *partfor;
 	PartitionBy *partby;
-	AnalyzeSyncOpt *analyze_sync_opt;
+	StatSyncOpt *analyze_sync_opt;
 }
 
 %type stmt schema_stmt
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 87be1ef4..c1293401 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1354,7 +1354,7 @@ ProcessUtilityPost(PlannedStmt *pstmt,
 		if (vstmt->relation != NULL)
 		{
 			Relation rel =
-				relation_openrv_extended(vstmt->relation, NoLock, true, false);
+				relation_openrv_extended(vstmt->relation, NoLock, true);
 			if (rel && rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
 			{
 				relation_close(rel, NoLock);
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 854f36a4..d4daa350 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -587,7 +587,7 @@ typedef enum NodeTag
 #ifdef _MLS_
 	,T_SyncBufIdInfo /* in bufmgr.c*/
 #endif
-	,T_AnalyzeSyncOpt
+	,T_StatSyncOpt
 } NodeTag;
 
 /*

From 0ff366d6825cc0a40fc29b5f0847dc4246a104a5 Mon Sep 17 00:00:00 2001
From: JennyJennyChen
Date: Thu, 9 Mar 2023 19:51:39 +0800
Subject: [PATCH 575/578] adjust regress expected

---
 src/test/regress/expected/join_3.out          | 75 ++++---
 src/test/regress/expected/limit.out           |  8 +-
 src/test/regress/expected/partition_index.out | 18 +-
 .../regress/expected/partition_join_2.out     | 188 +++++++++---------
 src/test/regress/expected/rules.out           |  3 +
 .../regress/expected/select_parallel_4.out    | 38 ++--
 src/test/regress/expected/subselect.out       |  2 +-
 src/test/regress/expected/transactions_2.out  |  6 +-
 8 files changed, 169 insertions(+), 169 deletions(-)

diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out
index 1225d2ce..94914044 100644
---
@@ -3336,7 +3336,7 @@ select b.unique1 from Join Filter: (b.unique1 = 42) -> Remote Subquery Scan on all Distribute results by H: 42 - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Seq Scan on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 @@ -3441,19 +3441,17 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Merge Right Join - Merge Cond: (b.unique2 = a.f1) + -> Hash Right Join + Hash Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - -> Materialize + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all Distribute results by H: f1 - -> Sort - Sort Key: a.f1 - -> Seq Scan on int4_tbl a -(13 rows) + -> Seq Scan on int4_tbl a +(11 rows) select f1, unique2, case when unique2 is null then f1 else 0 end from int4_tbl a left join tenk1 b on f1 = unique2 @@ -3512,32 +3510,28 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: "*VALUES*".column1, i1.f1, 666 -> Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Merge Right Join + -> Hash Right Join Output: i1.f1, 666 - Merge Cond: (i2.unique2 = i1.f1) + Hash Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 + -> Seq Scan on public.tenk1 i2 Output: i2.unique2 - -> Materialize + -> Hash Output: i1.f1 -> Remote Subquery Scan on all (datanode_1) Output: i1.f1 Distribute results by H: f1 - -> Sort + -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - Sort Key: i1.f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 -> Hash Output: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1) @@ -3545,7 +3539,7 @@ using (join_key); Distribute results by H: column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(31 rows) +(27 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -4702,18 +4696,19 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Merge Join - Merge Cond: (b.unique2 = a.unique1) + -> Hash Join + Hash Cond: (b.unique2 = a.unique1) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - -> Index Only Scan using tenk1_unique1 on tenk1 a -(9 rows) + -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on tenk1 a +(10 rows) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; @@ -4726,18 +4721,18 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 
a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) -> Nested Loop -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique1 on tenk1 a + -> Seq Scan on tenk1 a -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -6113,8 +6108,8 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6129,13 +6124,13 @@ where exists (select 1 from tenk1 t3 Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: t3.thousand, t3.tenthous + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 Distribute results by H: thousand -> HashAggregate - Output: t3.thousand, t3.tenthous + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 Group Key: t3.thousand, t3.tenthous - -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 - Output: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1 t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) diff --git a/src/test/regress/expected/limit.out b/src/test/regress/expected/limit.out index 61a3f53e..1da03844 100644 --- a/src/test/regress/expected/limit.out +++ b/src/test/regress/expected/limit.out @@ -503,8 +503,8 @@ order by s2 desc; explain (verbose, costs off) select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 from tenk1 group by thousand order by thousand limit 3; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() * '0'::double precision))), thousand -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -526,8 +526,8 @@ select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 -> Partial HashAggregate Output: thousand, PARTIAL sum(tenthous) Group Key: tenk1.thousand - -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 - Output: thousand, tenthous + -> 
Seq Scan on public.tenk1 + Output: unique1, unique2, two, four, ten, twenty, hundred, thousand, twothousand, fivethous, tenthous, odd, even, stringu1, stringu2, string4 (23 rows) select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 diff --git a/src/test/regress/expected/partition_index.out b/src/test/regress/expected/partition_index.out index afe95aab..bf476902 100644 --- a/src/test/regress/expected/partition_index.out +++ b/src/test/regress/expected/partition_index.out @@ -83,8 +83,8 @@ select c1,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-04' or 1 | Tue Sep 01 13:11:00 2015 2 | Wed Sep 02 13:11:00 2015 3 | Wed Sep 02 13:11:00 2015 - 7 | Thu Sep 03 13:11:00 2015 4 | Thu Sep 03 13:11:00 2015 + 7 | Thu Sep 03 13:11:00 2015 (5 rows) select c1,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c2 desc limit 5; @@ -103,8 +103,8 @@ select shardid,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-0 2234 | Tue Sep 01 13:11:00 2015 3318 | Wed Sep 02 13:11:00 2015 105 | Wed Sep 02 13:11:00 2015 - 1025 | Thu Sep 03 13:11:00 2015 213 | Thu Sep 03 13:11:00 2015 + 1025 | Thu Sep 03 13:11:00 2015 (5 rows) select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c2 limit 5; @@ -113,8 +113,8 @@ select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-0 2234 | 2 3318 | 3 105 | 4 - 1025 | 8 213 | 5 + 1025 | 8 (5 rows) select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c3,c2 limit 5; @@ -221,8 +221,8 @@ select c1,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-20' or 1 | Tue Sep 01 13:11:00 2015 2 | Tue Sep 08 13:11:00 2015 3 | Tue Sep 08 13:11:00 2015 - 7 | Tue Sep 15 13:11:00 2015 4 | Tue Sep 15 13:11:00 2015 + 7 | Tue Sep 15 13:11:00 2015 (5 rows) select c1,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c2 desc limit 5; @@ -241,8 +241,8 @@ select shardid,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-2 2234 | Tue Sep 01 13:11:00 2015 3318 | Tue Sep 08 13:11:00 2015 105 | Tue Sep 08 13:11:00 2015 - 1025 | Tue Sep 15 13:11:00 2015 213 | Tue Sep 15 13:11:00 2015 + 1025 | Tue Sep 15 13:11:00 2015 (5 rows) select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c2 limit 5; @@ -251,8 +251,8 @@ select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-0 2234 | 2 3318 | 3 105 | 4 - 1025 | 8 213 | 5 + 1025 | 8 (5 rows) select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c3,c2 limit 5; @@ -388,10 +388,10 @@ select c1,c2 from t_month_3 where c2 < timestamp without time zone '2016-02-01' select c1,c2 from t_month_3 where c2 < timestamp without time zone '2016-02-01' order by c2 desc limit 5; c1 | c2 ----+-------------------------- - 17 | Fri Jan 01 13:11:00 2016 - 19 | Fri Jan 01 13:11:00 2016 21 | Fri Jan 01 13:11:00 2016 23 | Fri Jan 01 13:11:00 2016 + 17 | Fri Jan 01 13:11:00 2016 + 19 | Fri Jan 01 13:11:00 2016 26 | Fri Jan 01 13:11:00 2016 (5 rows) @@ -421,8 +421,8 @@ select shardid,c1+c3 from t_month_3 where c2 < timestamp without time zone '2016 2234 | 2 3318 | 3 105 | 4 - 213 | 5 1025 | 8 + 213 | 5 (5 rows) select shardid,c1 from t_month_3 where c2 < timestamp without time zone '2016-02-01' and mod(c1,2) = 1 order by c1 desc limit 5; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index d2435f12..c8622909 100644 --- 
a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -148,33 +148,33 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHE -- full outer join, with placeholder vars EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b -> Hash Full Join - Hash Cond: (a = b) + Hash Cond: (b = a) Filter: (((50) = a) OR ((75) = b)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Seq Scan on prt1_p3 - Filter: (b = 0) + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_p3 - Filter: (a = 0) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) (25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; @@ -301,27 +301,11 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JO -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Join - Merge Cond: (b = t1.a) - -> Remote Subquery Scan on all (datanode_2) - -> Sort - Sort Key: b - -> HashAggregate - Group Key: b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b - -> Append - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) + -> Merge Semi Join + Merge Cond: (t1.a = b) -> Sort Sort Key: t1.a -> Append @@ -331,7 +315,17 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) Filter: (b = 0) -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) -(28 rows) + -> Remote Subquery Scan on all (datanode_2) + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(22 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -689,8 +683,8 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- make sure these go to null as expected EXPLAIN 
(COSTS OFF) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b, ((a + b)) @@ -700,26 +694,26 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a -> Hash Full Join - Hash Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Seq Scan on prt1_p3 - Filter: (b = 0) + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_p3 - Filter: (a = 0) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: ((a + b) / 2) @@ -743,8 +737,8 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Semi Join Join Filter: (t1.a = b) @@ -760,20 +754,20 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Hash Join - Hash Cond: (((t2.a + t2.b) / 2) = b) - -> Append - -> Seq Scan on prt1_e_p1 t2 - -> Seq Scan on prt1_e_p2 t2_1 - -> Seq Scan on prt1_e_p3 t2_2 + Hash Cond: (b = ((t2.a + t2.b) / 2)) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) + -> Append + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 (29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; @@ -1429,36 +1423,36 @@ SELECT t1.a, t1.c, 
t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b -- full join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b -> Hash Full Join - Hash Cond: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_l_p1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p2 - Filter: (b = 0) - -> Seq Scan on prt1_l_p3_p1 - Filter: (b = 0) + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_l_p1 - Filter: (a = 0) - -> Seq Scan on prt2_l_p2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_l_p2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_l_p3_p1 - Filter: (a = 0) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) (28 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index ba5666ef..3c4adea9 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2397,6 +2397,9 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; +zv1| SELECT zt1.f1, + 'dummy'::text AS junk + FROM pg_temp_31.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index b57f5248..990ac6ec 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -42,21 +42,21 @@ alter table tenk1 set (parallel_workers = 4); explain (verbose, costs off) select parallel_restricted(unique1) from tenk1 where stringu1 = 'GRAAAA' order by 1; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Sort - Output: (parallel_restricted(unique1)) - Sort Key: (parallel_restricted(tenk1.unique1)) + Output: (parallel_restricted(unique1)) + Sort Key: (parallel_restricted(tenk1.unique1)) -> Result - Output: parallel_restricted(unique1) + Output: parallel_restricted(unique1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: unique1 -> Gather Output: unique1 - Workers Planned: 4 - -> Parallel Seq Scan on public.tenk1 - Output: unique1 - Filter: (tenk1.stringu1 = 'GRAAAA'::name) + Workers Planned: 4 + -> Parallel Seq Scan on public.tenk1 + 
Output: unique1 + Filter: (tenk1.stringu1 = 'GRAAAA'::name) (13 rows) -- test parallel plan when group by expression is in target list. @@ -125,14 +125,14 @@ select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE explain (costs off) select sum(parallel_restricted(unique1)) from tenk1 group by(parallel_restricted(unique1)); - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------- HashAggregate Group Key: parallel_restricted(unique1) -> Result - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (7 rows) @@ -279,8 +279,8 @@ set enable_hashjoin to off; set enable_nestloop to off; explain (costs off) select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1; - QUERY PLAN -------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather @@ -288,9 +288,13 @@ explain (costs off) -> Partial Aggregate -> Parallel Merge Join Merge Cond: (tenk1.unique1 = tenk2.unique1) - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 - -> Index Only Scan using tenk2_unique1 on tenk2 -(9 rows) + -> Sort + Sort Key: tenk1.unique1 + -> Parallel Seq Scan on tenk1 + -> Sort + Sort Key: tenk2.unique1 + -> Seq Scan on tenk2 +(13 rows) select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1; count diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 876fd5c8..c841f7c9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -885,7 +885,7 @@ select * from int4_tbl where SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: a.unique1 - -> Index Only Scan using tenk1_unique1 on public.tenk1 a + -> Seq Scan on public.tenk1 a Output: a.unique1 (26 rows) diff --git a/src/test/regress/expected/transactions_2.out b/src/test/regress/expected/transactions_2.out index 30a34e63..12f40682 100644 --- a/src/test/regress/expected/transactions_2.out +++ b/src/test/regress/expected/transactions_2.out @@ -582,6 +582,10 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); -- verify that cursors created during an aborted subtransaction are @@ -633,7 +637,7 @@ fetch from foo; abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text LANGUAGE plpgsql AS $$ From 7cf7f8afbcab7290538ad5e65893561710be3dfa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Mar 2023 20:02:41 +0800 Subject: [PATCH 576/578] add v2.5.0-release-note --- v2.5.0-release-note.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 v2.5.0-release-note.txt diff --git a/v2.5.0-release-note.txt b/v2.5.0-release-note.txt new file mode 100644 index 00000000..e6ae92c6 --- /dev/null +++ b/v2.5.0-release-note.txt @@ -0,0 +1,15 @@ +The V2.5.0-release changes focus mainly on: +1. Performance optimization +When using the extended protocol or executing inside a function, multi-row inserts are rewritten as copy, improving write performance by tens of times +Improved cost estimation accuracy, raising execution plan performance by more than 2x +Optimized memory usage of pg_stat_cluster_activity + + +2. Feature enhancements +Support joining tables from different resource-isolated nodegroups +Support subscribing to DN WAL logs from a CN +Support synchronizing analyze statistics to other CNs, keeping statistics consistent across CNs +Added libpq TCP timeout settings +Improved error messages to make problem analysis easier for users + +3. Fixes for other known bugs From 0451d9d9e610297c86c091b405692c972492e1e1 Mon Sep 17 00:00:00 2001 From: runewrz <32592054+runewrz@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:22:01 +0800 Subject: [PATCH 577/578] fix strcmp issue (#151) --- src/backend/pgxc/pool/pgxcnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 src/backend/pgxc/pool/pgxcnode.c diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c old mode 100644 new mode 100755 index d755e5be..ace82b0e --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -497,7 +497,7 @@ PGXCNodeConnStr(char *host, int port, char *dbname, * remote type can be Coordinator, Datanode or application. */ #ifdef _MLS_ - if (strcmp(user, MLS_USER) == 0 || strcmp(user, AUDIT_USER)) + if (strcmp(user, MLS_USER) == 0 || strcmp(user, AUDIT_USER) == 0) { if (same_host) { From 128d47502476f84392a4ac54603efe007d063285 Mon Sep 17 00:00:00 2001 From: ryanrzwu Date: Mon, 7 Jul 2025 20:58:20 +0800 Subject: [PATCH 578/578] Fix company name.
--- COPYRIGHT | 4 ++-- LICENSE.txt | 6 ++---- contrib/btree_gin/btree_gin.c | 4 ++-- contrib/pg_visibility/pg_visibility.c | 4 ++-- contrib/pgxc_ctl/coord_cmd.h | 4 ++-- contrib/pgxc_ctl/varnames.h | 4 ++-- contrib/pgxc_monitor/pgxc_monitor.c | 4 ++-- src/backend/access/common/bufmask.c | 4 ++-- src/backend/access/common/reloptions.c | 4 ++-- src/backend/access/hash/hash_xlog.c | 4 ++-- src/backend/access/heap/visibilitymap.c | 4 ++-- src/backend/access/index/indexam.c | 4 ++-- src/backend/access/rmgrdesc/extentdesc.c | 6 ++---- src/backend/access/rmgrdesc/relcryptdesc.c | 5 ++--- src/backend/access/rmgrdesc/replslotdesc.c | 4 ++-- src/backend/access/rmgrdesc/smgrdesc.c | 4 ++-- src/backend/access/rmgrdesc/xactdesc.c | 4 ++-- src/backend/access/rmgrdesc/xlogdesc.c | 4 ++-- src/backend/access/transam/lru.c | 4 ++-- src/backend/access/transam/rmgr.c | 4 ++-- src/backend/audit/audit.c | 6 ++---- src/backend/audit/audit_fga.c | 10 ++++------ src/backend/bootstrap/bootparse.y | 4 ++-- src/backend/bootstrap/bootstrap.c | 4 ++-- src/backend/catalog/catalog.c | 4 ++-- src/backend/catalog/pg_partition_interval.c | 5 ++--- src/backend/catalog/pg_publication.c | 4 ++-- src/backend/catalog/pgxc_key_values.c | 5 ++--- src/backend/catalog/pgxc_shard_map.c | 5 ++--- src/backend/catalog/storage.c | 4 ++-- src/backend/catalog/toasting.c | 4 ++-- src/backend/commands/event_trigger.c | 4 ++-- src/backend/commands/opclasscmds.c | 4 ++-- src/backend/commands/portalcmds.c | 4 ++-- src/backend/commands/schemacmds.c | 5 ++--- src/backend/contrib/pgcrypto/blf.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-blowfish.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-des.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-gensalt.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-md5.c | 4 ++-- src/backend/contrib/pgcrypto/fortuna.c | 4 ++-- src/backend/contrib/pgcrypto/imath.c | 4 ++-- src/backend/contrib/pgcrypto/internal-sha2.c | 4 ++-- src/backend/contrib/pgcrypto/internal.c | 4 ++-- src/backend/contrib/pgcrypto/mbuf.c | 4 ++-- src/backend/contrib/pgcrypto/md5.c | 4 ++-- src/backend/contrib/pgcrypto/openssl.c | 4 ++-- src/backend/contrib/pgcrypto/pgcrypto.c | 6 ++---- src/backend/contrib/pgcrypto/pgp-armor.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-cfb.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-compress.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-decrypt.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-encrypt.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-info.c | 4 ++-- .../contrib/pgcrypto/pgp-mpi-internal.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-mpi-openssl.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-mpi.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pgsql.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubdec.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubenc.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubkey.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-s2k.c | 4 ++-- src/backend/contrib/pgcrypto/pgp.c | 4 ++-- src/backend/contrib/pgcrypto/px-crypt.c | 4 ++-- src/backend/contrib/pgcrypto/px-hmac.c | 4 ++-- src/backend/contrib/pgcrypto/px.c | 4 ++-- src/backend/contrib/pgcrypto/random.c | 4 ++-- src/backend/contrib/pgcrypto/rijndael.c | 4 ++-- src/backend/contrib/pgcrypto/sha1.c | 4 ++-- src/backend/contrib/pgcrypto/sha2.c | 4 ++-- src/backend/contrib/sm/sm4.c | 4 ++-- src/backend/executor/execAmi.c | 4 ++-- src/backend/executor/execIndexing.c | 4 ++-- src/backend/executor/execUtils.c | 4 ++-- src/backend/executor/nodeBitmapAnd.c | 4 ++-- src/backend/executor/nodeBitmapIndexscan.c | 4 ++-- src/backend/executor/nodeBitmapOr.c | 4 ++-- 
src/backend/executor/nodeSubplan.c | 4 ++-- src/backend/libpq/be-fsstubs.c | 4 ++-- src/backend/libpq/be-secure.c | 4 ++-- src/backend/libpq/hba.c | 4 ++-- src/backend/libpq/pqcomm.c | 6 +++--- src/backend/nodes/bitmapset.c | 4 ++-- src/backend/nodes/makefuncs.c | 4 ++-- src/backend/optimizer/plan/subselect.c | 4 ++-- src/backend/optimizer/prep/prepjointree.c | 4 ++-- src/backend/optimizer/util/tlist.c | 4 ++-- src/backend/oracle/charpad.c | 4 ++-- src/backend/oracle/convert.c | 5 ++--- src/backend/oracle/datefce.c | 4 ++-- src/backend/oracle/others.c | 5 ++--- src/backend/oracle/plvstr.c | 6 ++---- src/backend/parser/analyze.c | 4 ++-- src/backend/parser/parse_clause.c | 4 ++-- src/backend/parser/parse_expr.c | 4 ++-- src/backend/parser/parse_oper.c | 4 ++-- src/backend/pgxc/nodemgr/groupmgr.c | 5 ++--- src/backend/pgxc/shard/shard_vacuum.c | 7 +++---- src/backend/pgxc/shard/shardbarrier.c | 5 ++--- src/backend/pgxc/shard/shardmap.c | 4 ++-- src/backend/postmaster/auditlogger.c | 4 ++-- src/backend/postmaster/bgworker.c | 4 ++-- src/backend/replication/logical/decode.c | 4 ++-- src/backend/replication/logical/relation.c | 4 ++-- .../replication/logical/reorderbuffer.c | 4 ++-- src/backend/replication/repl_gram.y | 4 ++-- src/backend/replication/repl_scanner.l | 4 ++-- src/backend/replication/syncrep_scanner.l | 4 ++-- src/backend/replication/walreceiver.c | 4 ++-- src/backend/storage/buffer/freelist.c | 5 ++--- src/backend/storage/file/buffile.c | 5 ++--- src/backend/storage/file/fd.c | 4 ++-- src/backend/storage/freespace/emapage.c | 5 ++--- src/backend/storage/freespace/extent_xlog.c | 6 ++---- src/backend/storage/ipc/ipc.c | 4 ++-- src/backend/storage/ipc/ipci.c | 4 ++-- src/backend/storage/ipc/procsignal.c | 4 ++-- src/backend/storage/lmgr/lmgr.c | 4 ++-- src/backend/storage/lmgr/lock.c | 5 ++--- src/backend/storage/lmgr/nodelock.c | 4 ++-- src/backend/storage/smgr/md.c | 4 ++-- src/backend/tcop/dest.c | 4 ++-- src/backend/utils/adt/datetime.c | 4 ++-- src/backend/utils/adt/format_type.c | 4 ++-- src/backend/utils/adt/formatting.c | 4 ++-- src/backend/utils/adt/json.c | 4 ++-- src/backend/utils/adt/misc.c | 4 ++-- src/backend/utils/adt/oid.c | 4 ++-- src/backend/utils/adt/selfuncs.c | 4 ++-- src/backend/utils/adt/varchar.c | 5 ++--- src/backend/utils/adt/varlena.c | 4 ++-- src/backend/utils/adt/version.c | 5 ++--- src/backend/utils/adt/xml.c | 4 ++-- src/backend/utils/cache/inval.c | 5 ++--- src/backend/utils/cache/relcryptmap.c | 5 ++--- src/backend/utils/init/globals.c | 4 ++-- src/backend/utils/misc/cls.c | 4 ++-- src/backend/utils/misc/datamask.c | 5 ++--- src/backend/utils/misc/mls.c | 5 ++--- src/backend/utils/misc/relcrypt.c | 4 ++-- src/backend/utils/misc/timeout.c | 5 ++--- src/backend/utils/mmgr/aset.c | 4 ++-- src/backend/utils/mmgr/mcxt.c | 4 ++-- src/backend/utils/resowner/resowner.c | 6 ++---- src/backend/utils/sort/tuplestore.c | 4 ++-- src/bin/confmod/conf.c | 6 ++---- src/bin/confmod/conf.h | 4 ++-- src/bin/confmod/confmod.c | 5 ++--- src/bin/confmod/log.c | 4 ++-- src/bin/confmod/log.h | 4 ++-- src/bin/confmod/stree.c | 6 ++---- src/bin/confmod/stree.h | 4 ++-- src/bin/confmod/util.c | 4 ++-- src/bin/confmod/util.h | 4 ++-- src/bin/confmod/var.c | 5 ++--- src/bin/confmod/var.h | 4 ++-- src/bin/pg_basebackup/pg_basebackup.c | 4 ++-- src/bin/pg_controldata/pg_controldata.c | 4 ++-- src/bin/pg_ctl/pg_ctl.c | 4 ++-- src/bin/pg_dump/compress_io.h | 4 ++-- src/bin/pg_dump/pg_backup_archiver.h | 4 ++-- src/bin/pg_dump/pg_dump.h | 4 ++-- src/bin/pg_dump/pg_dump_security.c | 
6 ++---- src/bin/pg_upgrade/exec.c | 4 ++-- src/bin/psql/common.c | 5 ++--- src/common/relpath.c | 4 ++-- src/gtm/common/gtm_time.c | 4 ++-- src/gtm/common/heap.c | 6 ++---- src/gtm/main/gtm_backup.c | 4 ++-- src/gtm/main/gtm_store.c | 4 ++-- src/gtm/main/gtm_xlog.c | 5 ++--- src/gtm/main/replication.c | 4 ++-- src/gtm/path/path.c | 4 ++-- src/gtm/proxy/proxy_main.c | 4 ++-- src/gtm/xlog_test/xlog_reader.c | 8 ++++---- src/gtm/xlog_test/xlog_test.c | 13 ++++++++----- src/include/access/gtm.h | 4 ++-- src/include/access/heapam_xlog.h | 4 ++-- src/include/access/lru.h | 4 ++-- src/include/access/parallel.h | 4 ++-- src/include/access/printtup.h | 4 ++-- src/include/access/relcryptaccess.h | 4 ++-- src/include/access/relscan.h | 4 ++-- src/include/access/replslotdesc.h | 4 ++-- src/include/access/rmgrlist.h | 4 ++-- src/include/access/transam.h | 4 ++-- src/include/access/tupdesc_details.h | 4 ++-- src/include/access/twophase.h | 4 ++-- src/include/access/visibilitymap.h | 4 ++-- src/include/access/xlogreader.h | 4 ++-- src/include/access/xlogrecord.h | 4 ++-- src/include/audit/audit.h | 5 ++--- src/include/audit/audit_fga.h | 7 +++---- src/include/c.h | 4 ++-- src/include/catalog/audit/pg_audit_d.h | 6 ++---- src/include/catalog/audit/pg_audit_fga.h | 6 ++---- src/include/catalog/audit/pg_audit_o.h | 6 ++---- src/include/catalog/audit/pg_audit_s.h | 6 ++---- src/include/catalog/audit/pg_audit_u.h | 6 ++---- src/include/catalog/catalog.h | 4 ++-- src/include/catalog/dependency.h | 4 ++-- src/include/catalog/index.h | 4 ++-- src/include/catalog/mls/pg_cls_compartment.h | 4 ++-- src/include/catalog/mls/pg_cls_group.h | 4 ++-- src/include/catalog/mls/pg_cls_label.h | 4 ++-- src/include/catalog/mls/pg_cls_level.h | 4 ++-- src/include/catalog/mls/pg_cls_policy.h | 6 ++---- src/include/catalog/mls/pg_cls_table.h | 4 ++-- src/include/catalog/mls/pg_cls_user.h | 4 ++-- src/include/catalog/mls/pg_data_mask_map.h | 4 ++-- src/include/catalog/mls/pg_data_mask_user.h | 4 ++-- src/include/catalog/namespace.h | 4 ++-- src/include/catalog/objectaddress.h | 4 ++-- src/include/catalog/pg_audit.h | 4 ++-- src/include/catalog/pg_authid.h | 4 ++-- src/include/catalog/pg_cast.h | 4 ++-- src/include/catalog/pg_mls.h | 4 ++-- src/include/catalog/pg_namespace.h | 4 ++-- src/include/catalog/pg_operator.h | 4 ++-- src/include/catalog/pg_partition_interval.h | 4 ++-- src/include/catalog/pg_proc.h | 4 ++-- src/include/catalog/pg_publication.h | 4 ++-- src/include/catalog/pg_publication_shard.h | 5 ++--- src/include/catalog/pg_subscription_shard.h | 6 ++---- src/include/catalog/pg_subscription_table.h | 5 ++--- src/include/catalog/pg_type.h | 4 ++-- src/include/catalog/pgxc_key_values.h | 6 +++--- src/include/catalog/pgxc_shard_map.h | 4 ++-- src/include/catalog/storage_xlog.h | 4 ++-- src/include/commands/cluster.h | 4 ++-- src/include/commands/prepare.h | 4 ++-- src/include/commands/publicationcmds.h | 4 ++-- src/include/commands/relcryptcommand.h | 4 ++-- src/include/commands/schemacmds.h | 4 ++-- src/include/commands/sequence.h | 4 ++-- src/include/commands/subscriptioncmds.h | 4 ++-- src/include/commands/vacuum.h | 4 ++-- src/include/contrib/pgcrypto/blf.h | 4 ++-- src/include/contrib/pgcrypto/fortuna.h | 4 ++-- src/include/contrib/pgcrypto/imath.h | 4 ++-- src/include/contrib/pgcrypto/mbuf.h | 4 ++-- src/include/contrib/pgcrypto/md5.h | 4 ++-- src/include/contrib/pgcrypto/pgcrypto.h | 4 ++-- src/include/contrib/pgcrypto/pgp.h | 6 ++---- src/include/contrib/pgcrypto/px-crypt.h | 4 ++-- 
src/include/contrib/pgcrypto/px.h | 4 ++-- src/include/contrib/pgcrypto/rijndael.h | 4 ++-- src/include/contrib/pgcrypto/sha1.h | 4 ++-- src/include/contrib/pgcrypto/sha2.h | 4 ++-- src/include/contrib/sm/sm4.h | 4 ++-- src/include/executor/execdesc.h | 4 ++-- src/include/executor/hashjoin.h | 4 ++-- src/include/executor/tqueue.h | 4 ++-- src/include/gtm/elog.h | 4 ++-- src/include/gtm/gtm_checkpoint.h | 4 ++-- src/include/gtm/gtm_conn.h | 4 ++-- src/include/gtm/gtm_gxid.h | 4 ++-- src/include/gtm/gtm_lock.h | 4 ++-- src/include/gtm/gtm_store.h | 4 ++-- src/include/gtm/gtm_xlog.h | 5 ++--- src/include/gtm/gtm_xlog_internal.h | 4 ++-- src/include/gtm/heap.h | 5 ++--- src/include/gtm/libpq-be.h | 4 ++-- src/include/gtm/register.h | 4 ++-- src/include/libpq/auth.h | 4 ++-- src/include/libpq/libpq-be.h | 4 ++-- src/include/libpq/pqcomm.h | 4 ++-- src/include/miscadmin.h | 4 ++-- src/include/nodes/bitmapset.h | 4 ++-- src/include/nodes/makefuncs.h | 4 ++-- src/include/nodes/nodeFuncs.h | 4 ++-- src/include/nodes/relation.h | 4 ++-- src/include/optimizer/pathnode.h | 4 ++-- src/include/optimizer/plancat.h | 4 ++-- src/include/optimizer/planmain.h | 4 ++-- src/include/optimizer/planner.h | 4 ++-- src/include/optimizer/subselect.h | 4 ++-- src/include/optimizer/var.h | 4 ++-- src/include/oracle/oracle.h | 4 ++-- src/include/parser/analyze.h | 4 ++-- src/include/parser/parse_node.h | 4 ++-- src/include/parser/parse_utilcmd.h | 4 ++-- src/include/parser/parser.h | 4 ++-- src/include/pg_config_manual.h | 4 ++-- src/include/pgxc/groupmgr.h | 4 ++-- src/include/pgxc/pgxc.h | 4 ++-- src/include/pgxc/planner.h | 4 ++-- src/include/pgxc/shard_vacuum.h | 4 ++-- src/include/pgxc/shardmap.h | 4 ++-- src/include/postgres_ext.h | 4 ++-- src/include/postmaster/auditlogger.h | 4 ++-- src/include/postmaster/bgworker.h | 4 ++-- src/include/postmaster/bgwriter.h | 4 ++-- src/include/postmaster/pgarch.h | 4 ++-- src/include/replication/decode.h | 4 ++-- src/include/replication/logical_statistic.h | 5 ++--- src/include/replication/logicallauncher.h | 4 ++-- src/include/replication/logicalrelation.h | 4 ++-- src/include/replication/walreceiver.h | 4 ++-- src/include/replication/worker_internal.h | 4 ++-- src/include/storage/buf_internals.h | 4 ++-- src/include/storage/buffile.h | 4 ++-- src/include/storage/extent_xlog.h | 4 ++-- src/include/storage/extentmapping.h | 4 ++-- src/include/storage/lmgr.h | 4 ++-- src/include/storage/lwlock.h | 4 ++-- src/include/storage/nodelock.h | 4 ++-- src/include/storage/proc.h | 4 ++-- src/include/storage/procsignal.h | 4 ++-- src/include/storage/relcryptstorage.h | 4 ++-- src/include/storage/relfilenode.h | 4 ++-- src/include/tcop/dest.h | 4 ++-- src/include/tcop/pquery.h | 4 ++-- src/include/tcop/tcopprot.h | 4 ++-- src/include/utils/builtins.h | 4 ++-- src/include/utils/cls.h | 4 ++-- src/include/utils/datamask.h | 4 ++-- src/include/utils/elog.h | 4 ++-- src/include/utils/guc_tables.h | 4 ++-- src/include/utils/inval.h | 4 ++-- src/include/utils/memutils.h | 4 ++-- src/include/utils/mls.h | 4 ++-- src/include/utils/mls_extension.h | 4 ++-- src/include/utils/pg_locale.h | 4 ++-- src/include/utils/plancache.h | 4 ++-- src/include/utils/portal.h | 4 ++-- src/include/utils/relcrypt.h | 4 ++-- src/include/utils/relcryptcache.h | 4 ++-- src/include/utils/relcryptmap.h | 4 ++-- src/include/utils/relcryptmisc.h | 4 ++-- src/include/utils/resowner_private.h | 4 ++-- src/include/utils/ruleutils.h | 4 ++-- src/include/utils/snapshot.h | 4 ++-- src/include/utils/syscache.h | 4 ++-- 
src/include/utils/timeout.h | 4 ++-- src/include/utils/tqual.h | 4 ++-- src/include/utils/tuplestore.h | 4 ++-- src/interfaces/libpq/fe-connect.c | 4 ++-- src/interfaces/libpq/fe-protocol3.c | 4 ++-- src/interfaces/libpq/libpq-fe.h | 4 ++-- src/interfaces/libpq/libpq-int.h | 4 ++-- src/pl/plperl/plperl_helpers.h | 4 ++-- src/test/isolation/isolation_test.conf | 18 ++---------------- 343 files changed, 700 insertions(+), 783 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index 26cb400c..fc3cef0e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -2,7 +2,7 @@ TBase Cluster Database Management System Tencent is pleased to support the open source community by making TBase available. -Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +Copyright (C) 2019 Tencent. All rights reserved. TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. @@ -48,7 +48,7 @@ Redistribution and use in source and binary forms, with or without modification, 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without +3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/LICENSE.txt b/LICENSE.txt index d4589fbe..487e0176 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ Tencent is pleased to support the open source community by making TBase available. -Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +Copyright (C) 2019 Tencent. All rights reserved. TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. @@ -46,7 +46,7 @@ Redistribution and use in source and binary forms, with or without modification, 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without +3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -55,5 +55,3 @@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUEN GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - diff --git a/contrib/btree_gin/btree_gin.c b/contrib/btree_gin/btree_gin.c index 9b56b3c2..313df90d 100644 --- a/contrib/btree_gin/btree_gin.c +++ b/contrib/btree_gin/btree_gin.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index 8b9836e9..74af3424 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_ctl/coord_cmd.h b/contrib/pgxc_ctl/coord_cmd.h index 79a71dd9..f889c443 100644 --- a/contrib/pgxc_ctl/coord_cmd.h +++ b/contrib/pgxc_ctl/coord_cmd.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_ctl/varnames.h b/contrib/pgxc_ctl/varnames.h index 61fa33fe..601b11a0 100644 --- a/contrib/pgxc_ctl/varnames.h +++ b/contrib/pgxc_ctl/varnames.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_monitor/pgxc_monitor.c b/contrib/pgxc_monitor/pgxc_monitor.c index 83f831f5..6604da2d 100644 --- a/contrib/pgxc_monitor/pgxc_monitor.c +++ b/contrib/pgxc_monitor/pgxc_monitor.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index 20f3e4ba..28a3d264 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index f3602fb6..e422b27f 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index f8dec838..977d4b27 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index d0a83854..b66f2473 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 931f71cc..0675dad8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/extentdesc.c b/src/backend/access/rmgrdesc/extentdesc.c index 6071d8c5..329b3900 100644 --- a/src/backend/access/rmgrdesc/extentdesc.c +++ b/src/backend/access/rmgrdesc/extentdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -403,5 +403,3 @@ extent_identify(uint8 info) return "Extent ERROR"; } } - - diff --git a/src/backend/access/rmgrdesc/relcryptdesc.c b/src/backend/access/rmgrdesc/relcryptdesc.c index 2b05fe81..11f904a8 100644 --- a/src/backend/access/rmgrdesc/relcryptdesc.c +++ b/src/backend/access/rmgrdesc/relcryptdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -146,4 +146,3 @@ const char * rel_crypt_identify(uint8 info) return id; } - diff --git a/src/backend/access/rmgrdesc/replslotdesc.c b/src/backend/access/rmgrdesc/replslotdesc.c index 0d71eb8b..9b2e6169 100644 --- a/src/backend/access/rmgrdesc/replslotdesc.c +++ b/src/backend/access/rmgrdesc/replslotdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 3fc546ed..c03f4c78 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 450e2594..832d6a30 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 1246fdd5..0101d24c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/transam/lru.c b/src/backend/access/transam/lru.c index 0c772617..68009bd0 100644 --- a/src/backend/access/transam/lru.c +++ b/src/backend/access/transam/lru.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 5e1848d4..145d3e73 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/audit/audit.c b/src/backend/audit/audit.c index 7185796a..5ea922ee 100644 --- a/src/backend/audit/audit.c +++ b/src/backend/audit/audit.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -8975,5 +8975,3 @@ static void audit_hit_process_result_info(bool is_success) #ifdef Audit_004_For_Log #endif - - diff --git a/src/backend/audit/audit_fga.c b/src/backend/audit/audit_fga.c index 2327ac00..2c998b76 100644 --- a/src/backend/audit/audit_fga.c +++ b/src/backend/audit/audit_fga.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -431,7 +431,7 @@ exec_policy_funct_on_other_node(char *query_string) { cn_node_list = (Oid *) palloc0(cn_nodes_num * sizeof(Oid)); - PGXCGetCoordOidOthers(cn_node_list); + PGXCGetCoordOidOthers(cn_node_list); pgxc_execute_on_nodes(cn_nodes_num, cn_node_list, query_string); } } @@ -1822,7 +1822,7 @@ process_fga_trigger(bool timeout) else { elog(LOG, "AUDIT_FGA: cannot connect to db"); - PQfinish(conn); + PQfinish(conn); } } } @@ -1976,5 +1976,3 @@ write_trigger_handle_to_shmem(Oid func_oid) return ; } - - diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index e720c618..5800c275 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 5a1ace81..760cd82e 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 203f31c2..2ba9dee1 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/pg_partition_interval.c b/src/backend/catalog/pg_partition_interval.c index 8ca204e5..db6ccd90 100644 --- a/src/backend/catalog/pg_partition_interval.c +++ b/src/backend/catalog/pg_partition_interval.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -271,4 +271,3 @@ ModifyPartitionStartValue(Oid relid, int64 startval) ReleaseSysCache(tup); heap_close(rel,RowExclusiveLock); } - diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 4d21340b..ab9f7d37 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/pgxc_key_values.c b/src/backend/catalog/pgxc_key_values.c index 6af66d75..b78d05dc 100644 --- a/src/backend/catalog/pgxc_key_values.c +++ b/src/backend/catalog/pgxc_key_values.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -613,4 +613,3 @@ bool IsKeyValues(Oid db, Oid rel, char *value) CStringGetDatum(value), 0); } - diff --git a/src/backend/catalog/pgxc_shard_map.c b/src/backend/catalog/pgxc_shard_map.c index 64ef2f5c..2f79b4a6 100644 --- a/src/backend/catalog/pgxc_shard_map.c +++ b/src/backend/catalog/pgxc_shard_map.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -336,4 +336,3 @@ void DropShardMap_Node(Oid group) RegisterInvalidShmemShardMap(group, ShardOpType_drop); } - diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index b9136469..5e3d424c 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 95b0564c..336727e8 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 742e23d4..8ced270c 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 6e0f12b7..acb248ea 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index cdd42fa5..dfc941fa 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 3ef2e6a1..8b41446c 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -562,4 +562,3 @@ char * GetSchemaNameByOid(Oid schemaOid) } #endif - diff --git a/src/backend/contrib/pgcrypto/blf.c b/src/backend/contrib/pgcrypto/blf.c index 598b65f6..efac09b9 100644 --- a/src/backend/contrib/pgcrypto/blf.c +++ b/src/backend/contrib/pgcrypto/blf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-blowfish.c b/src/backend/contrib/pgcrypto/crypt-blowfish.c index f856f8b2..60a86848 100644 --- a/src/backend/contrib/pgcrypto/crypt-blowfish.c +++ b/src/backend/contrib/pgcrypto/crypt-blowfish.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-des.c b/src/backend/contrib/pgcrypto/crypt-des.c index 20d366b9..a9ae0b0b 100644 --- a/src/backend/contrib/pgcrypto/crypt-des.c +++ b/src/backend/contrib/pgcrypto/crypt-des.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-gensalt.c b/src/backend/contrib/pgcrypto/crypt-gensalt.c index 95916c47..e13c5e2e 100644 --- a/src/backend/contrib/pgcrypto/crypt-gensalt.c +++ b/src/backend/contrib/pgcrypto/crypt-gensalt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-md5.c b/src/backend/contrib/pgcrypto/crypt-md5.c index b4f58219..b6a6792d 100644 --- a/src/backend/contrib/pgcrypto/crypt-md5.c +++ b/src/backend/contrib/pgcrypto/crypt-md5.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/fortuna.c b/src/backend/contrib/pgcrypto/fortuna.c index eb6bf895..b237f1fd 100644 --- a/src/backend/contrib/pgcrypto/fortuna.c +++ b/src/backend/contrib/pgcrypto/fortuna.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/imath.c b/src/backend/contrib/pgcrypto/imath.c index 09c1f899..db31d9c3 100644 --- a/src/backend/contrib/pgcrypto/imath.c +++ b/src/backend/contrib/pgcrypto/imath.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/internal-sha2.c b/src/backend/contrib/pgcrypto/internal-sha2.c index 10a1e979..370b2a76 100644 --- a/src/backend/contrib/pgcrypto/internal-sha2.c +++ b/src/backend/contrib/pgcrypto/internal-sha2.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/internal.c b/src/backend/contrib/pgcrypto/internal.c index 63d1df30..d41c831e 100644 --- a/src/backend/contrib/pgcrypto/internal.c +++ b/src/backend/contrib/pgcrypto/internal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/mbuf.c b/src/backend/contrib/pgcrypto/mbuf.c index 985887a3..ad632f8c 100644 --- a/src/backend/contrib/pgcrypto/mbuf.c +++ b/src/backend/contrib/pgcrypto/mbuf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/md5.c b/src/backend/contrib/pgcrypto/md5.c index 965a480d..a0cdf5d9 100644 --- a/src/backend/contrib/pgcrypto/md5.c +++ b/src/backend/contrib/pgcrypto/md5.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/openssl.c b/src/backend/contrib/pgcrypto/openssl.c index 451f9e27..4482a5a0 100644 --- a/src/backend/contrib/pgcrypto/openssl.c +++ b/src/backend/contrib/pgcrypto/openssl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgcrypto.c b/src/backend/contrib/pgcrypto/pgcrypto.c index fa5d1962..c93f92bc 100644 --- a/src/backend/contrib/pgcrypto/pgcrypto.c +++ b/src/backend/contrib/pgcrypto/pgcrypto.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -576,5 +576,3 @@ void * crypt_memset(void *s, int c, size_t n) { return memset(s,c,n); } - - diff --git a/src/backend/contrib/pgcrypto/pgp-armor.c b/src/backend/contrib/pgcrypto/pgp-armor.c index c3203f88..827ae206 100644 --- a/src/backend/contrib/pgcrypto/pgp-armor.c +++ b/src/backend/contrib/pgcrypto/pgp-armor.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-cfb.c b/src/backend/contrib/pgcrypto/pgp-cfb.c index 7caa7697..7be68343 100644 --- a/src/backend/contrib/pgcrypto/pgp-cfb.c +++ b/src/backend/contrib/pgcrypto/pgp-cfb.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-compress.c b/src/backend/contrib/pgcrypto/pgp-compress.c index 5a9955f1..2568b948 100644 --- a/src/backend/contrib/pgcrypto/pgp-compress.c +++ b/src/backend/contrib/pgcrypto/pgp-compress.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-decrypt.c b/src/backend/contrib/pgcrypto/pgp-decrypt.c index 1bf73a89..af441918 100644 --- a/src/backend/contrib/pgcrypto/pgp-decrypt.c +++ b/src/backend/contrib/pgcrypto/pgp-decrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-encrypt.c b/src/backend/contrib/pgcrypto/pgp-encrypt.c index 88ccde0d..4f10306f 100644 --- a/src/backend/contrib/pgcrypto/pgp-encrypt.c +++ b/src/backend/contrib/pgcrypto/pgp-encrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-info.c b/src/backend/contrib/pgcrypto/pgp-info.c index 14128959..cd01c140 100644 --- a/src/backend/contrib/pgcrypto/pgp-info.c +++ b/src/backend/contrib/pgcrypto/pgp-info.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi-internal.c b/src/backend/contrib/pgcrypto/pgp-mpi-internal.c index a09b3d0f..91689436 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi-internal.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi-internal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c b/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c index f13f1254..6c1fda11 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi.c b/src/backend/contrib/pgcrypto/pgp-mpi.c index 69edf48b..2b2831fa 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pgsql.c b/src/backend/contrib/pgcrypto/pgp-pgsql.c index ef00e276..1180bc16 100644 --- a/src/backend/contrib/pgcrypto/pgp-pgsql.c +++ b/src/backend/contrib/pgcrypto/pgp-pgsql.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubdec.c b/src/backend/contrib/pgcrypto/pgp-pubdec.c index 94d2a200..cc4ac0cd 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubdec.c +++ b/src/backend/contrib/pgcrypto/pgp-pubdec.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubenc.c b/src/backend/contrib/pgcrypto/pgp-pubenc.c index 424fe7e3..273c5d9a 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubenc.c +++ b/src/backend/contrib/pgcrypto/pgp-pubenc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubkey.c b/src/backend/contrib/pgcrypto/pgp-pubkey.c index ef35d515..6b5a31a3 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubkey.c +++ b/src/backend/contrib/pgcrypto/pgp-pubkey.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-s2k.c b/src/backend/contrib/pgcrypto/pgp-s2k.c index d0ee95cf..7f9b5b70 100644 --- a/src/backend/contrib/pgcrypto/pgp-s2k.c +++ b/src/backend/contrib/pgcrypto/pgp-s2k.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp.c b/src/backend/contrib/pgcrypto/pgp.c index 9ab0aa30..f5a969f1 100644 --- a/src/backend/contrib/pgcrypto/pgp.c +++ b/src/backend/contrib/pgcrypto/pgp.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px-crypt.c b/src/backend/contrib/pgcrypto/px-crypt.c index 74fdc92d..4825f3f8 100644 --- a/src/backend/contrib/pgcrypto/px-crypt.c +++ b/src/backend/contrib/pgcrypto/px-crypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px-hmac.c b/src/backend/contrib/pgcrypto/px-hmac.c index 6bdc23f3..bd249290 100644 --- a/src/backend/contrib/pgcrypto/px-hmac.c +++ b/src/backend/contrib/pgcrypto/px-hmac.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px.c b/src/backend/contrib/pgcrypto/px.c index 45505eca..a29e4be3 100644 --- a/src/backend/contrib/pgcrypto/px.c +++ b/src/backend/contrib/pgcrypto/px.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/random.c b/src/backend/contrib/pgcrypto/random.c index c6fdc789..039e769f 100644 --- a/src/backend/contrib/pgcrypto/random.c +++ b/src/backend/contrib/pgcrypto/random.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/rijndael.c b/src/backend/contrib/pgcrypto/rijndael.c index 82dee497..653f1613 100644 --- a/src/backend/contrib/pgcrypto/rijndael.c +++ b/src/backend/contrib/pgcrypto/rijndael.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/sha1.c b/src/backend/contrib/pgcrypto/sha1.c index cb563ae7..18ed0a32 100644 --- a/src/backend/contrib/pgcrypto/sha1.c +++ b/src/backend/contrib/pgcrypto/sha1.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/sha2.c b/src/backend/contrib/pgcrypto/sha2.c index 2829714a..c10bfc95 100644 --- a/src/backend/contrib/pgcrypto/sha2.c +++ b/src/backend/contrib/pgcrypto/sha2.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/sm/sm4.c b/src/backend/contrib/sm/sm4.c index 857acd7d..2acb3c20 100644 --- a/src/backend/contrib/sm/sm4.c +++ b/src/backend/contrib/sm/sm4.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. 
* @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1e819521..44651753 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index ba54f4cf..983c0a78 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index c6401651..f7fca4aa 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c index ae2be3d2..d6882a88 100644 --- a/src/backend/executor/nodeBitmapAnd.c +++ b/src/backend/executor/nodeBitmapAnd.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 08549075..1ab9dcd9 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c index 6c662006..5bcc770b 100644 --- a/src/backend/executor/nodeBitmapOr.c +++ b/src/backend/executor/nodeBitmapOr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 6df349ab..98e37494 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c index 3f52bfa4..128bb410 100644 --- a/src/backend/libpq/be-fsstubs.c +++ b/src/backend/libpq/be-fsstubs.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index ea947f5b..59fb602c 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 35ad2dc1..7c0a4b24 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index db3b1ea1..9b62a647 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -2143,4 +2143,4 @@ pq_settcpusertimeout(int timeout, Port *port)
 #endif

 	return STATUS_OK;
-}
\ No newline at end of file
+}
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 61b30a35..66778548 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 30c49729..a5e24898 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 61647167..248040d0 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 45e03eb6..9be1cb2b 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index 496fd970..4c1c7bc8 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/charpad.c b/src/backend/oracle/charpad.c index 20c656e5..3d913f28 100644 --- a/src/backend/oracle/charpad.c +++ b/src/backend/oracle/charpad.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/convert.c b/src/backend/oracle/convert.c index 89602a4c..8fd69209 100644 --- a/src/backend/oracle/convert.c +++ b/src/backend/oracle/convert.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -658,4 +658,3 @@ orcl_to_single_byte(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(dst); } - diff --git a/src/backend/oracle/datefce.c b/src/backend/oracle/datefce.c index 034fcf3f..f394320c 100644 --- a/src/backend/oracle/datefce.c +++ b/src/backend/oracle/datefce.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/others.c b/src/backend/oracle/others.c index e27c9a95..6f1ebd33 100644 --- a/src/backend/oracle/others.c +++ b/src/backend/oracle/others.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -631,4 +631,3 @@ ORCL_DECODE_FOR(date); ORCL_DECODE_FOR(time); ORCL_DECODE_FOR(timestamp); ORCL_DECODE_FOR(timestamptz); - diff --git a/src/backend/oracle/plvstr.c b/src/backend/oracle/plvstr.c index 9f4b3596..b5999653 100644 --- a/src/backend/oracle/plvstr.c +++ b/src/backend/oracle/plvstr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -285,5 +285,3 @@ orcl_instr4(PG_FUNCTION_ARGS) PG_RETURN_INT32(orcl_instr(arg1, arg2, arg3, arg4)); } - - diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 9c460dd4..f4155215 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index e91f59dd..df8de7ca 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 8de92ed6..b1143fad 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/parser/parse_oper.c b/src/backend/parser/parse_oper.c
index 4f9f4949..2dbd35fd 100644
--- a/src/backend/parser/parse_oper.c
+++ b/src/backend/parser/parse_oper.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c
index e5de7d81..7a36c5cb 100644
--- a/src/backend/pgxc/nodemgr/groupmgr.c
+++ b/src/backend/pgxc/nodemgr/groupmgr.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -723,4 +723,3 @@ GetGroupNameByNode(Oid nodeoid)
 
 #endif
 
-
diff --git a/src/backend/pgxc/shard/shard_vacuum.c b/src/backend/pgxc/shard/shard_vacuum.c
index 2f933972..905c81c8 100644
--- a/src/backend/pgxc/shard/shard_vacuum.c
+++ b/src/backend/pgxc/shard/shard_vacuum.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -449,7 +449,7 @@ int64 vacuum_shard_internal(Relation rel, Bitmapset *to_vacuum, Snapshot vacuum_
         if(to_delete)
         {
             tuples++;
-            if(tuples > 2000)
+            if(tuples > 2000)
             {
                 tuples = 0;
                 pg_usleep(sleep_interval * 1000);
@@ -574,4 +574,3 @@ List * GetShardRelations_NoChild(bool is_contain_replic)
     return result;
 }
 
-
diff --git a/src/backend/pgxc/shard/shardbarrier.c b/src/backend/pgxc/shard/shardbarrier.c
index 3219e3e1..9c761bb4 100644
--- a/src/backend/pgxc/shard/shardbarrier.c
+++ b/src/backend/pgxc/shard/shardbarrier.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -382,4 +382,3 @@ Datum pg_stat_barrier_shards(PG_FUNCTION_ARGS)
     result = HeapTupleGetDatum(tuple);
     SRF_RETURN_NEXT(funcctx, result);
 }
-
diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c
index 29f9a946..655ed921 100644
--- a/src/backend/pgxc/shard/shardmap.c
+++ b/src/backend/pgxc/shard/shardmap.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3.
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/postmaster/auditlogger.c b/src/backend/postmaster/auditlogger.c index 5aea9c14..81151f67 100644 --- a/src/backend/postmaster/auditlogger.c +++ b/src/backend/postmaster/auditlogger.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 84682f1b..56b86ebe 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index fa3a078e..8c15abd9 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index ec8f6ac4..f7ceb93e 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 4fbc6120..ec5f82ef 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index e812c5dd..299786f1 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 93596914..7c8f30b4 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 64fe25ac..8a5bf473 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index ec3c962f..39a7434b 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 2d5b5b95..306fb027 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -771,4 +771,3 @@ GetAccessStrategy_part(int npart) return strategy; } #endif - diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 20190dc7..6e81a3b8 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -843,4 +843,3 @@ BufFile * BufFileOpen(char* fileName, int fileFlags, int fileMode, bool interXac } #endif - diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 67ae7984..26165089 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index ade86acd..ea1ca701 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -7437,4 +7437,3 @@ Datum pg_check_extent(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } - diff --git a/src/backend/storage/freespace/extent_xlog.c b/src/backend/storage/freespace/extent_xlog.c index 1f51a047..5fb5047a 100644 --- a/src/backend/storage/freespace/extent_xlog.c +++ b/src/backend/storage/freespace/extent_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -570,5 +570,3 @@ static void extend_heap(RelFileNode rnode, xl_extent_seteme *xlogrec) RelationExtendHeapForRedo(rnode, xlogrec->extentid, xlogrec->eme.shardid); } - - diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 2b3a74d8..eed31a12 100644 --- a/src/backend/storage/ipc/ipc.c +++ b/src/backend/storage/ipc/ipc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index ae1a9029..4c17851c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a709ead2..b39bc165 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 75cf2ab6..41a4f68b 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index f1570bb3..2a08199b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -4638,4 +4638,3 @@ void StillHoldlock(void) } } #endif - diff --git a/src/backend/storage/lmgr/nodelock.c b/src/backend/storage/lmgr/nodelock.c index 060e0a0b..08a16ba5 100644 --- a/src/backend/storage/lmgr/nodelock.c +++ b/src/backend/storage/lmgr/nodelock.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2269e8e9..3d265c9e 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c index 95e361a3..e5c20b27 100644 --- a/src/backend/tcop/dest.c +++ b/src/backend/tcop/dest.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 6661ab49..3a0dc618 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c index a202942a..4e6ee08f 100644 --- a/src/backend/utils/adt/format_type.c +++ b/src/backend/utils/adt/format_type.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5938bf2d..28bd464e 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index e0fed865..34ece726 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 3cb719ee..3c4b9d65 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index 8b28d653..598e23d5 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 80a4ce72..63c834bf 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index c60b452b..a32ef514 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1519,4 +1519,3 @@ nvarchar2(PG_FUNCTION_ARGS) } #endif - diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 51d53bab..9c5795f5 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index b11ef63b..5b6276a0 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -93,4 +93,3 @@ tbase_version(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(cstring_to_text(TBASE_VERSION_STR)); } #endif - diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 88552709..f94b8c70 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 59f529a8..ef5f194c 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1686,4 +1686,3 @@ void MlsRegisterRelcacheInvalidation(Oid dbId, Oid relId) } #endif - diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index 69fc1a06..63c43bfd 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -2230,4 +2230,3 @@ void StartupReachConsistentState(void) #endif - diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 2f466b9b..ab9558e8 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/cls.c b/src/backend/utils/misc/cls.c index 916968ad..28e5f7a2 100644 --- a/src/backend/utils/misc/cls.c +++ b/src/backend/utils/misc/cls.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/datamask.c b/src/backend/utils/misc/datamask.c index a0101b84..321d6186 100644 --- a/src/backend/utils/misc/datamask.c +++ b/src/backend/utils/misc/datamask.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1331,4 +1331,3 @@ bool datamask_check_column_in_expr(Node * node, void * context) #endif - diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 4d7ea96e..60f3441d 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1858,4 +1858,3 @@ void check_tbase_mls_extension(void) errmsg("This operation is not allowed until the extension \"%s\" is installed.", MLS_EXTENSION_NAME))); } - diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index ca65b3b4..3f3f7335 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c index 8fd4212a..dd51de22 100644 --- a/src/backend/utils/misc/timeout.c +++ b/src/backend/utils/misc/timeout.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -742,4 +742,3 @@ disable_timeout_safely(void) disable_timeout(STATEMENT_TIMEOUT, false); } #endif - diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 426191db..959a6505 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 34b4827b..a05ebd3f 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 05ef66d8..333442c8 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1505,5 +1505,3 @@ uint32 GetResourceArrayLastidx(void) #endif - - diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 02d0696d..595a34eb 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/conf.c b/src/bin/confmod/conf.c index 8b779b0e..29dab18c 100644 --- a/src/bin/confmod/conf.c +++ b/src/bin/confmod/conf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -167,5 +167,3 @@ read_vars(FILE *conf, const char * del) return lineno; } - - diff --git a/src/bin/confmod/conf.h b/src/bin/confmod/conf.h index 95bc5262..71bf56cc 100644 --- a/src/bin/confmod/conf.h +++ b/src/bin/confmod/conf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/confmod.c b/src/bin/confmod/confmod.c index d8a15eb2..3e0fc80e 100644 --- a/src/bin/confmod/confmod.c +++ b/src/bin/confmod/confmod.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -353,4 +353,3 @@ int main(int argc, char *argv[]) return 0; } - diff --git a/src/bin/confmod/log.c b/src/bin/confmod/log.c index 891b31b9..e04fbf6e 100644 --- a/src/bin/confmod/log.c +++ b/src/bin/confmod/log.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/log.h b/src/bin/confmod/log.h index ae095bb3..078c4fda 100644 --- a/src/bin/confmod/log.h +++ b/src/bin/confmod/log.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/stree.c b/src/bin/confmod/stree.c index cac9af32..c35b19ed 100644 --- a/src/bin/confmod/stree.c +++ b/src/bin/confmod/stree.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -124,5 +124,3 @@ stree_pre_traverse(stree * root, void (*traverse)(void *)) stree_pre_traverse(root->right, traverse); } } - - diff --git a/src/bin/confmod/stree.h b/src/bin/confmod/stree.h index caa5bdfb..0ee16a20 100644 --- a/src/bin/confmod/stree.h +++ b/src/bin/confmod/stree.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/util.c b/src/bin/confmod/util.c index dda4c4cb..d96a29a2 100644 --- a/src/bin/confmod/util.c +++ b/src/bin/confmod/util.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/util.h b/src/bin/confmod/util.h index 3588481c..5adae595 100644 --- a/src/bin/confmod/util.h +++ b/src/bin/confmod/util.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/var.c b/src/bin/confmod/var.c index 073c785b..6f827fda 100644 --- a/src/bin/confmod/var.c +++ b/src/bin/confmod/var.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -209,4 +209,3 @@ var_hash_2_stree() return root; } - diff --git a/src/bin/confmod/var.h b/src/bin/confmod/var.h index 053d0635..602fa44b 100644 --- a/src/bin/confmod/var.h +++ b/src/bin/confmod/var.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 91af5b93..5317584f 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 5c22ea00..1c00288c 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 3079adb4..170fa3d3 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/compress_io.h b/src/bin/pg_dump/compress_io.h index 3fee84ef..2dd5a86a 100644 --- a/src/bin/pg_dump/compress_io.h +++ b/src/bin/pg_dump/compress_io.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index d3139c75..cd1ac12f 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 133a66a9..c8da4a4e 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_dump_security.c b/src/bin/pg_dump/pg_dump_security.c index dd7d5024..a9431897 100644 --- a/src/bin/pg_dump/pg_dump_security.c +++ b/src/bin/pg_dump/pg_dump_security.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -834,5 +834,3 @@ static void dump_pg_transparent_crypt_policy_map(PGconn *conn) destroyPQExpBuffer(query); return; } - - diff --git a/src/bin/pg_upgrade/exec.c b/src/bin/pg_upgrade/exec.c index 9199369c..b3d67cd5 100644 --- a/src/bin/pg_upgrade/exec.c +++ b/src/bin/pg_upgrade/exec.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c index ff401aa4..9991898f 100644 --- a/src/bin/psql/common.c +++ b/src/bin/psql/common.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -2269,4 +2269,3 @@ bool is_mls_or_audit_user_front(void) return false; } #endif - diff --git a/src/common/relpath.c b/src/common/relpath.c index a9abdcd7..08ddedec 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/common/gtm_time.c b/src/gtm/common/gtm_time.c index ee451bbb..5b449b4e 100644 --- a/src/gtm/common/gtm_time.c +++ b/src/gtm/common/gtm_time.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/common/heap.c b/src/gtm/common/heap.c index f9168d38..1f6b2be8 100644 --- a/src/gtm/common/heap.c +++ b/src/gtm/common/heap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -456,5 +456,3 @@ void heap_foreach(heap* h, void (*func)(void*,void*)) { func(entry->key, entry->value); } } - - diff --git a/src/gtm/main/gtm_backup.c b/src/gtm/main/gtm_backup.c index 66d0ba87..50d60772 100644 --- a/src/gtm/main/gtm_backup.c +++ b/src/gtm/main/gtm_backup.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 5e0e5ea9..9ee1a245 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. 
* @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index 3d46942e..4a305a30 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -4462,4 +4462,3 @@ load_sync_structures(void) load_syncconfig(); load_xlogsync(); } - diff --git a/src/gtm/main/replication.c b/src/gtm/main/replication.c index 723d4f40..faba0bd7 100644 --- a/src/gtm/main/replication.c +++ b/src/gtm/main/replication.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/path/path.c b/src/gtm/path/path.c index 5444cbc3..38d82e2e 100644 --- a/src/gtm/path/path.c +++ b/src/gtm/path/path.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 279471d3..fc4e2e49 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/xlog_test/xlog_reader.c b/src/gtm/xlog_test/xlog_reader.c index 3e0d6520..4471425f 100644 --- a/src/gtm/xlog_test/xlog_reader.c +++ b/src/gtm/xlog_test/xlog_reader.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -3325,7 +3325,8 @@ void GTM_TimerRun(void) } } - void * + void +* GTM_TimerThread(void *argp) { GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; @@ -3598,4 +3599,3 @@ main(int argc, char *argv[]) Read_XLogRecovery(argv[1],seg,0); return 0; } - diff --git a/src/gtm/xlog_test/xlog_test.c b/src/gtm/xlog_test/xlog_test.c index 2972fd9d..6b756d38 100644 --- a/src/gtm/xlog_test/xlog_test.c +++ b/src/gtm/xlog_test/xlog_test.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1508,7 +1508,8 @@ void bind_service_threads(void) } /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeKeeper(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1621,7 +1622,8 @@ GTM_ThreadTimeKeeper(void *argp) /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeBackup(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -3925,7 +3927,8 @@ void GTM_TimerRun(void) } } - void * + void +* GTM_TimerThread(void *argp) { GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 4df9fc1e..3fda8250 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 0a76bb5a..8ae2326f 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/lru.h b/src/include/access/lru.h index 7d9980a3..4053e99b 100644 --- a/src/include/access/lru.h +++ b/src/include/access/lru.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 2e258ca4..b9f51485 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/printtup.h b/src/include/access/printtup.h index 0b395eb9..0af5c09d 100644 --- a/src/include/access/printtup.h +++ b/src/include/access/printtup.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/relcryptaccess.h b/src/include/access/relcryptaccess.h index 511f948d..7d31750d 100644 --- a/src/include/access/relcryptaccess.h +++ b/src/include/access/relcryptaccess.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 79e8cab7..af98fdf8 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/replslotdesc.h b/src/include/access/replslotdesc.h index 4f6ad1e7..9fe16de4 100644 --- a/src/include/access/replslotdesc.h +++ b/src/include/access/replslotdesc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3321c275..bd4f49a1 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/transam.h b/src/include/access/transam.h index d94c4d26..cbbaf4b3 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/tupdesc_details.h b/src/include/access/tupdesc_details.h index 84f6225a..e712d721 100644 --- a/src/include/access/tupdesc_details.h +++ b/src/include/access/tupdesc_details.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 06f9685e..9d7a06ba 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index dda3c7a4..7f916fa2 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 55656d15..88d47831 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 909518f7..55594331 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/audit/audit.h b/src/include/audit/audit.h index a3828441..a2eb5e80 100644 --- a/src/include/audit/audit.h +++ b/src/include/audit/audit.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -279,4 +279,3 @@ extern bool audituser(void); extern bool audituser_arg(Oid roleid); #endif - diff --git a/src/include/audit/audit_fga.h b/src/include/audit/audit_fga.h index c60c3549..f17336d0 100644 --- a/src/include/audit/audit_fga.h +++ b/src/include/audit/audit_fga.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -70,7 +70,7 @@ #define AUDIT_TRIGGER_FEEDBACK_LEN 256 extern bool enable_fga; -extern const char *g_commandTag; +extern const char *g_commandTag; /* simple list of strings */ @@ -142,4 +142,3 @@ extern void write_trigger_handle_to_shmem(Oid func); #endif /*AUDIT_FGA_H*/ - diff --git a/src/include/c.h b/src/include/c.h index 7a6ab8e2..9a4d2e4e 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/audit/pg_audit_d.h b/src/include/catalog/audit/pg_audit_d.h index e7d9bfe9..12488366 100644 --- a/src/include/catalog/audit/pg_audit_d.h +++ b/src/include/catalog/audit/pg_audit_d.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -90,5 +90,3 @@ typedef FormData_audit_obj_def_opts *Form_audit_obj_def_opts; /* DATA(insert OID = 5104 (10 4 n f)); */ #endif /* PGXC_AUDIT_DEFAULT_H */ - - diff --git a/src/include/catalog/audit/pg_audit_fga.h b/src/include/catalog/audit/pg_audit_fga.h index 23764839..f281fa42 100644 --- a/src/include/catalog/audit/pg_audit_fga.h +++ b/src/include/catalog/audit/pg_audit_fga.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -104,5 +104,3 @@ typedef FormData_audit_fga_conf *Form_audit_fga_conf; #define Anum_audit_fga_conf_audit_column_opts 13 #endif /* PGXC_AUDIT_FGA_H */ - - diff --git a/src/include/catalog/audit/pg_audit_o.h b/src/include/catalog/audit/pg_audit_o.h index 18b4b206..f10399cc 100644 --- a/src/include/catalog/audit/pg_audit_o.h +++ b/src/include/catalog/audit/pg_audit_o.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -98,5 +98,3 @@ typedef FormData_audit_obj_conf *Form_audit_obj_conf; /* DATA(insert OID = 5111 ( 4200 2617 95 0 6 n f)); */ #endif /* PGXC_AUDIT_OBJ_H */ - - diff --git a/src/include/catalog/audit/pg_audit_s.h b/src/include/catalog/audit/pg_audit_s.h index f5aafe30..ca476406 100644 --- a/src/include/catalog/audit/pg_audit_s.h +++ b/src/include/catalog/audit/pg_audit_s.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -93,5 +93,3 @@ typedef FormData_audit_stmt_conf *Form_audit_stmt_conf; /* DATA(insert OID = 5119 ( 4200 6 n f)); */ #endif /* PGXC_AUDIT_STMT_H */ - - diff --git a/src/include/catalog/audit/pg_audit_u.h b/src/include/catalog/audit/pg_audit_u.h index fc171782..164abea2 100644 --- a/src/include/catalog/audit/pg_audit_u.h +++ b/src/include/catalog/audit/pg_audit_u.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -98,5 +98,3 @@ typedef FormData_audit_user_conf *Form_audit_user_conf; /* DATA(insert OID = 5130 ( 4200 3377 6 n f)); */ #endif /* PGXC_AUDIT_USER_H */ - - diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 1394114e..542ca147 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 589bbaab..70fdc4e0 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index d37be02e..abf5041f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_compartment.h b/src/include/catalog/mls/pg_cls_compartment.h index 356c72f1..888b1d1c 100644 --- a/src/include/catalog/mls/pg_cls_compartment.h +++ b/src/include/catalog/mls/pg_cls_compartment.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_group.h b/src/include/catalog/mls/pg_cls_group.h index 136681ec..b0a776cc 100644 --- a/src/include/catalog/mls/pg_cls_group.h +++ b/src/include/catalog/mls/pg_cls_group.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_label.h b/src/include/catalog/mls/pg_cls_label.h index 7b788917..f07b41c8 100644 --- a/src/include/catalog/mls/pg_cls_label.h +++ b/src/include/catalog/mls/pg_cls_label.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_level.h b/src/include/catalog/mls/pg_cls_level.h index 5ddaf7d9..956e360a 100644 --- a/src/include/catalog/mls/pg_cls_level.h +++ b/src/include/catalog/mls/pg_cls_level.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_policy.h b/src/include/catalog/mls/pg_cls_policy.h index 189d6d0e..bf5a30d4 100644 --- a/src/include/catalog/mls/pg_cls_policy.h +++ b/src/include/catalog/mls/pg_cls_policy.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -83,5 +83,3 @@ typedef FormData_cls_policy * Form_pg_cls_policy; #define Anum_pg_cls_policy_reloption 4 #endif /* PG_CLS_POLICY_H */ - - diff --git a/src/include/catalog/mls/pg_cls_table.h b/src/include/catalog/mls/pg_cls_table.h index 46a5c4a5..7933869c 100644 --- a/src/include/catalog/mls/pg_cls_table.h +++ b/src/include/catalog/mls/pg_cls_table.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_user.h b/src/include/catalog/mls/pg_cls_user.h index dda745c1..144b2395 100644 --- a/src/include/catalog/mls/pg_cls_user.h +++ b/src/include/catalog/mls/pg_cls_user.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_data_mask_map.h b/src/include/catalog/mls/pg_data_mask_map.h index ec6132dc..1e49dc26 100644 --- a/src/include/catalog/mls/pg_data_mask_map.h +++ b/src/include/catalog/mls/pg_data_mask_map.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_data_mask_user.h b/src/include/catalog/mls/pg_data_mask_user.h index e703d58f..664979cf 100644 --- a/src/include/catalog/mls/pg_data_mask_user.h +++ b/src/include/catalog/mls/pg_data_mask_user.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h index 16ef82ae..ef656a77 100644 --- a/src/include/catalog/namespace.h +++ b/src/include/catalog/namespace.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index ec2cd56f..8fbafdb1 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_audit.h b/src/include/catalog/pg_audit.h index 54c58959..c1ac3d76 100644 --- a/src/include/catalog/pg_audit.h +++ b/src/include/catalog/pg_audit.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_authid.h b/src/include/catalog/pg_authid.h index 5d9a9b26..ad4ddad1 100644 --- a/src/include/catalog/pg_authid.h +++ b/src/include/catalog/pg_authid.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h index 2bcfc409..6495aec1 100644 --- a/src/include/catalog/pg_cast.h +++ b/src/include/catalog/pg_cast.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_mls.h b/src/include/catalog/pg_mls.h index b2c670fb..3af65945 100644 --- a/src/include/catalog/pg_mls.h +++ b/src/include/catalog/pg_mls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h index 91ef4b66..b4577e66 100644 --- a/src/include/catalog/pg_namespace.h +++ b/src/include/catalog/pg_namespace.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 2b3bf872..20316d65 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_partition_interval.h b/src/include/catalog/pg_partition_interval.h index 4b10f876..d65eb686 100644 --- a/src/include/catalog/pg_partition_interval.h +++ b/src/include/catalog/pg_partition_interval.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1eb1b97f..f5ebe168 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_publication.h b/src/include/catalog/pg_publication.h index 4e3a8241..fea62666 100644 --- a/src/include/catalog/pg_publication.h +++ b/src/include/catalog/pg_publication.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_publication_shard.h b/src/include/catalog/pg_publication_shard.h index b42e86d4..17ea8b51 100644 --- a/src/include/catalog/pg_publication_shard.h +++ b/src/include/catalog/pg_publication_shard.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -111,4 +111,3 @@ typedef FormData_pg_publication_shard *Form_pg_publication_shard; #define Anum_pg_publication_shard_prshardid 2 #endif /* PG_PUBLICATION_SHARD_H */ - diff --git a/src/include/catalog/pg_subscription_shard.h b/src/include/catalog/pg_subscription_shard.h index cb73b335..819ae03e 100644 --- a/src/include/catalog/pg_subscription_shard.h +++ b/src/include/catalog/pg_subscription_shard.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,5 +102,3 @@ typedef FormData_pg_subscription_shard *Form_pg_subscription_shard; #define Anum_pg_subscription_shard_pubname 3 #endif /* PG_SUBSCRIPTION_SHARD_H */ - - diff --git a/src/include/catalog/pg_subscription_table.h b/src/include/catalog/pg_subscription_table.h index c5a53621..80ae633d 100644 --- a/src/include/catalog/pg_subscription_table.h +++ b/src/include/catalog/pg_subscription_table.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,4 +102,3 @@ typedef FormData_pg_subscription_table *Form_pg_subscription_table; #define Anum_pg_subscription_table_pubname 3 #endif /* PG_SUBSCRIPTION_TABLE_H */ - diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 79cdcf48..97e2c573 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pgxc_key_values.h b/src/include/catalog/pgxc_key_values.h index 982507f0..01bff97e 100644 --- a/src/include/catalog/pgxc_key_values.h +++ b/src/include/catalog/pgxc_key_values.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,4 +102,4 @@ extern char *BuildRelationCheckoverlapsStr(DistributeBy *distributeby, PGXCSubCl extern Oid GetKeyValuesGroup(Oid db, Oid rel, char *value, Oid *coldgroup); -extern bool IsKeyValues(Oid db, Oid rel, char *value); \ No newline at end of file +extern bool IsKeyValues(Oid db, Oid rel, char *value); diff --git a/src/include/catalog/pgxc_shard_map.h b/src/include/catalog/pgxc_shard_map.h index 721c237c..8429b1fe 100644 --- a/src/include/catalog/pgxc_shard_map.h +++ b/src/include/catalog/pgxc_shard_map.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index eace4c4e..0ec10fa3 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index c3da4cec..f7e9e845 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index a5d6383e..a3433532 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/publicationcmds.h b/src/include/commands/publicationcmds.h index 16a0d0f9..f11c030c 100644 --- a/src/include/commands/publicationcmds.h +++ b/src/include/commands/publicationcmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/relcryptcommand.h b/src/include/commands/relcryptcommand.h index a34abdb5..bbdf6bc4 100644 --- a/src/include/commands/relcryptcommand.h +++ b/src/include/commands/relcryptcommand.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/schemacmds.h b/src/include/commands/schemacmds.h index 16eb00a5..5acc6fb2 100644 --- a/src/include/commands/schemacmds.h +++ b/src/include/commands/schemacmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index a9f5ddca..e0c0a554 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/subscriptioncmds.h b/src/include/commands/subscriptioncmds.h index b1f9ab20..a0ce73db 100644 --- a/src/include/commands/subscriptioncmds.h +++ b/src/include/commands/subscriptioncmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 47859b0b..b7af5023 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/blf.h b/src/include/contrib/pgcrypto/blf.h index 71345083..938fc4bf 100644 --- a/src/include/contrib/pgcrypto/blf.h +++ b/src/include/contrib/pgcrypto/blf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/fortuna.h b/src/include/contrib/pgcrypto/fortuna.h index 61a2ea6b..b1b812a5 100644 --- a/src/include/contrib/pgcrypto/fortuna.h +++ b/src/include/contrib/pgcrypto/fortuna.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/imath.h b/src/include/contrib/pgcrypto/imath.h index 0695eed8..3d3a1456 100644 --- a/src/include/contrib/pgcrypto/imath.h +++ b/src/include/contrib/pgcrypto/imath.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/mbuf.h b/src/include/contrib/pgcrypto/mbuf.h index ad0768f0..92eb8fb9 100644 --- a/src/include/contrib/pgcrypto/mbuf.h +++ b/src/include/contrib/pgcrypto/mbuf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/md5.h b/src/include/contrib/pgcrypto/md5.h index 9fbc95d7..aefa9725 100644 --- a/src/include/contrib/pgcrypto/md5.h +++ b/src/include/contrib/pgcrypto/md5.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/pgcrypto.h b/src/include/contrib/pgcrypto/pgcrypto.h index 68bb8ead..377b69b9 100644 --- a/src/include/contrib/pgcrypto/pgcrypto.h +++ b/src/include/contrib/pgcrypto/pgcrypto.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/pgp.h b/src/include/contrib/pgcrypto/pgp.h index afaffaa1..615f7f96 100644 --- a/src/include/contrib/pgcrypto/pgp.h +++ b/src/include/contrib/pgcrypto/pgp.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -393,5 +393,3 @@ decrypt_internal(int is_pubenc, int need_text, text *data, extern bytea * encrypt_internal(int is_pubenc, int is_text, text *data, text *key, text *args); - - diff --git a/src/include/contrib/pgcrypto/px-crypt.h b/src/include/contrib/pgcrypto/px-crypt.h index 401eeeec..a6a2919d 100644 --- a/src/include/contrib/pgcrypto/px-crypt.h +++ b/src/include/contrib/pgcrypto/px-crypt.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/px.h b/src/include/contrib/pgcrypto/px.h index afd7c44a..a6829357 100644 --- a/src/include/contrib/pgcrypto/px.h +++ b/src/include/contrib/pgcrypto/px.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/rijndael.h b/src/include/contrib/pgcrypto/rijndael.h index d876b1b2..df23fbd3 100644 --- a/src/include/contrib/pgcrypto/rijndael.h +++ b/src/include/contrib/pgcrypto/rijndael.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/sha1.h b/src/include/contrib/pgcrypto/sha1.h index c1f207e9..df33c32b 100644 --- a/src/include/contrib/pgcrypto/sha1.h +++ b/src/include/contrib/pgcrypto/sha1.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/sha2.h b/src/include/contrib/pgcrypto/sha2.h index 20d03790..25ccca2d 100644 --- a/src/include/contrib/pgcrypto/sha2.h +++ b/src/include/contrib/pgcrypto/sha2.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/sm/sm4.h b/src/include/contrib/sm/sm4.h index b9bb3693..37cc84d3 100644 --- a/src/include/contrib/sm/sm4.h +++ b/src/include/contrib/sm/sm4.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index 00e26823..b9929034 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index efafc8da..f65f8f3a 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/tqueue.h b/src/include/executor/tqueue.h index d4d12a8e..989e6914 100644 --- a/src/include/executor/tqueue.h +++ b/src/include/executor/tqueue.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index ffdc99c5..52545c27 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_checkpoint.h b/src/include/gtm/gtm_checkpoint.h index 5b37c8d6..7cf7992c 100644 --- a/src/include/gtm/gtm_checkpoint.h +++ b/src/include/gtm/gtm_checkpoint.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_conn.h b/src/include/gtm/gtm_conn.h index 6709b2b1..8e257db7 100644 --- a/src/include/gtm/gtm_conn.h +++ b/src/include/gtm/gtm_conn.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_gxid.h b/src/include/gtm/gtm_gxid.h index 978286e3..4461b0a3 100644 --- a/src/include/gtm/gtm_gxid.h +++ b/src/include/gtm/gtm_gxid.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_lock.h b/src/include/gtm/gtm_lock.h index a5aa93a4..15fd6194 100644 --- a/src/include/gtm/gtm_lock.h +++ b/src/include/gtm/gtm_lock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_store.h b/src/include/gtm/gtm_store.h index 81205f8b..2e84f1a0 100644 --- a/src/include/gtm/gtm_store.h +++ b/src/include/gtm/gtm_store.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_xlog.h b/src/include/gtm/gtm_xlog.h index 986f73c3..76ebb36e 100644 --- a/src/include/gtm/gtm_xlog.h +++ b/src/include/gtm/gtm_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -291,4 +291,3 @@ extern void RegisterNewSyncStandby(GTM_StandbyReplication *replication); extern void RemoveSyncStandby(GTM_StandbyReplication *replication); extern void load_sync_structures(void); #endif /* GTM_XLOG_H */ - diff --git a/src/include/gtm/gtm_xlog_internal.h b/src/include/gtm/gtm_xlog_internal.h index 78353939..e8ee1ef8 100644 --- a/src/include/gtm/gtm_xlog_internal.h +++ b/src/include/gtm/gtm_xlog_internal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/heap.h b/src/include/gtm/heap.h index d2370cca..340cd217 100644 --- a/src/include/gtm/heap.h +++ b/src/include/gtm/heap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -143,4 +143,3 @@ void heap_foreach(heap* h, void (*func)(void*,void*)); void heap_destroy(heap* h); #endif - diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h index e8243e34..a24c58cd 100644 --- a/src/include/gtm/libpq-be.h +++ b/src/include/gtm/libpq-be.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. 
All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h index 0100096d..57216a2f 100644 --- a/src/include/gtm/register.h +++ b/src/include/gtm/register.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/auth.h b/src/include/libpq/auth.h index bc6ef2f6..04cee3b6 100644 --- a/src/include/libpq/auth.h +++ b/src/include/libpq/auth.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 00737906..e40b9623 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/pqcomm.h b/src/include/libpq/pqcomm.h index c41c0bbf..914c8a66 100644 --- a/src/include/libpq/pqcomm.h +++ b/src/include/libpq/pqcomm.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 0d948295..65ca053e 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index fc101c8f..7e39d00d 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 6b1997ea..92cdb50a 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index cfb41c3c..36d4965d 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 2be2556e..73e6c150 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 59dddbe6..c74130c6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index dc0eb282..21fcc48d 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index dc4ca53e..dae7b638 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 2e47c1e3..6b5f4781 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index bfba85b3..ef3265f6 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 5cbb90d2..a762f1fa 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/oracle/oracle.h b/src/include/oracle/oracle.h index 207a6cba..2e7e37f9 100644 --- a/src/include/oracle/oracle.h +++ b/src/include/oracle/oracle.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h index 1efb963c..0bba2711 100644 --- a/src/include/parser/analyze.h +++ b/src/include/parser/analyze.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 5ae643ce..8bc4609c 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index b6a0be60..4c5ba439 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parser.h b/src/include/parser/parser.h index e4da6bbf..bcfa1ad3 100644 --- a/src/include/parser/parser.h +++ b/src/include/parser/parser.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 8d110b88..e0d7f306 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/groupmgr.h b/src/include/pgxc/groupmgr.h index d27d6c2b..a5be4a63 100644 --- a/src/include/pgxc/groupmgr.h +++ b/src/include/pgxc/groupmgr.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 264bfbed..0ae12298 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 2acef598..fbbac577 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/shard_vacuum.h b/src/include/pgxc/shard_vacuum.h index eac39965..7559b8d4 100644 --- a/src/include/pgxc/shard_vacuum.h +++ b/src/include/pgxc/shard_vacuum.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/shardmap.h b/src/include/pgxc/shardmap.h index 0674185d..8ad6b88e 100644 --- a/src/include/pgxc/shardmap.h +++ b/src/include/pgxc/shardmap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postgres_ext.h b/src/include/postgres_ext.h index d02f346c..4d3bed00 100644 --- a/src/include/postgres_ext.h +++ b/src/include/postgres_ext.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/auditlogger.h b/src/include/postmaster/auditlogger.h index 2ec37bc1..0087b854 100644 --- a/src/include/postmaster/auditlogger.h +++ b/src/include/postmaster/auditlogger.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index c862bec5..f54138eb 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index a9530074..69ed2fef 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h index 7bc5eec1..682efcc1 100644 --- a/src/include/postmaster/pgarch.h +++ b/src/include/postmaster/pgarch.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index e6a4211b..8e3d49b6 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/logical_statistic.h b/src/include/replication/logical_statistic.h index db94ca6d..8e63192a 100644 --- a/src/include/replication/logical_statistic.h +++ b/src/include/replication/logical_statistic.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -137,4 +137,3 @@ extern Datum tbase_remove_pubtable_stat(PG_FUNCTION_ARGS); extern Datum tbase_remove_sub_stat(PG_FUNCTION_ARGS); extern Datum tbase_remove_subtable_stat(PG_FUNCTION_ARGS); #endif /* PG_SUBSCRIPTION_STATISTIC_H */ - diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index 60a23a95..c885867f 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/logicalrelation.h b/src/include/replication/logicalrelation.h index 0c6af1cf..fdf5b629 100644 --- a/src/include/replication/logicalrelation.h +++ b/src/include/replication/logicalrelation.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index d50bc882..3935cf95 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 5673c572..0def2474 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 6eae2368..3979c675 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 05f6ec03..e7cdfe00 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/extent_xlog.h b/src/include/storage/extent_xlog.h index 93d6d8a6..c4e1b66c 100644 --- a/src/include/storage/extent_xlog.h +++ b/src/include/storage/extent_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/extentmapping.h b/src/include/storage/extentmapping.h index 13d4017c..f401957d 100644 --- a/src/include/storage/extentmapping.h +++ b/src/include/storage/extentmapping.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 1924df26..3cc900ca 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index ae936afc..55a3a7de 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/nodelock.h b/src/include/storage/nodelock.h index 6a87d592..cb66cc0c 100644 --- a/src/include/storage/nodelock.h +++ b/src/include/storage/nodelock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 32dea324..ee30fe95 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 96064c66..e9baa497 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/relcryptstorage.h b/src/include/storage/relcryptstorage.h index 19e4eae2..c310316e 100644 --- a/src/include/storage/relcryptstorage.h +++ b/src/include/storage/relcryptstorage.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index b1095952..799fd31a 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h index 4e72e4a2..2cc43d88 100644 --- a/src/include/tcop/dest.h +++ b/src/include/tcop/dest.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/pquery.h b/src/include/tcop/pquery.h index 1c920257..5401d969 100644 --- a/src/include/tcop/pquery.h +++ b/src/include/tcop/pquery.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 65bbd737..ccc140db 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 4218d9d2..228da956 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. 
All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/cls.h b/src/include/utils/cls.h index 2d753eec..75709956 100644 --- a/src/include/utils/cls.h +++ b/src/include/utils/cls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/datamask.h b/src/include/utils/datamask.h index 64a4a48c..ca19cfc9 100644 --- a/src/include/utils/datamask.h +++ b/src/include/utils/datamask.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index ef6b381f..7f170aca 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 2a890162..747ff219 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index b4ef58b1..45dbeadc 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 74404bf1..10d904ff 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/mls.h b/src/include/utils/mls.h index 67cd004b..c9b23731 100644 --- a/src/include/utils/mls.h +++ b/src/include/utils/mls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/mls_extension.h b/src/include/utils/mls_extension.h index d633b596..817adafa 100644 --- a/src/include/utils/mls_extension.h +++ b/src/include/utils/mls_extension.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index deffc0d2..37c37faf 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index e04b03d8..0b6e53fa 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index d662e3f6..ec184420 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcrypt.h b/src/include/utils/relcrypt.h index fc071bad..cd847b88 100644 --- a/src/include/utils/relcrypt.h +++ b/src/include/utils/relcrypt.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptcache.h b/src/include/utils/relcryptcache.h index 7b8ac22b..737f4791 100644 --- a/src/include/utils/relcryptcache.h +++ b/src/include/utils/relcryptcache.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptmap.h b/src/include/utils/relcryptmap.h index ff434bbe..6451659c 100644 --- a/src/include/utils/relcryptmap.h +++ b/src/include/utils/relcryptmap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptmisc.h b/src/include/utils/relcryptmisc.h index 354beb42..1ecafb08 100644 --- a/src/include/utils/relcryptmisc.h +++ b/src/include/utils/relcryptmisc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h index 34996bfa..8990610a 100644 --- a/src/include/utils/resowner_private.h +++ b/src/include/utils/resowner_private.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
index 03d502f9..85ad751c 100644
--- a/src/include/utils/ruleutils.h
+++ b/src/include/utils/ruleutils.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 4ba6f96b..5bfa289c 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 9ab96811..65a8c960 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ebf2d15f..5186eaf1 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h
index 70c67bb7..f4db7bda 100644
--- a/src/include/utils/tqual.h
+++ b/src/include/utils/tqual.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h
index ec59731a..d4782980 100644
--- a/src/include/utils/tuplestore.h
+++ b/src/include/utils/tuplestore.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 1b7e3fe7..1dd1c662 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c
index 10f873e0..62d1c323 100644
--- a/src/interfaces/libpq/fe-protocol3.c
+++ b/src/interfaces/libpq/fe-protocol3.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h
index e0cd17c0..ac477ed0 100644
--- a/src/interfaces/libpq/libpq-fe.h
+++ b/src/interfaces/libpq/libpq-fe.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index ad150bf7..a2f3532d 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/pl/plperl/plperl_helpers.h b/src/pl/plperl/plperl_helpers.h
index 020538f9..591c1c01 100644
--- a/src/pl/plperl/plperl_helpers.h
+++ b/src/pl/plperl/plperl_helpers.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/test/isolation/isolation_test.conf b/src/test/isolation/isolation_test.conf
index 49eaa259..89ea1281 100644
--- a/src/test/isolation/isolation_test.conf
+++ b/src/test/isolation/isolation_test.conf
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -64,17 +64,3 @@ diskey3=3
 connect:dbname=postgres hostaddr=172.0.0.1 port=30001 user=qguo
 connect:dbname=postgres hostaddr=172.0.0.1 port=30002 user=qguo
 connect:dbname=postgres hostaddr=172.0.0.1 port=30003 user=qguo
-
-
-
-
-
-
-
-
-
-
-
-
-
-